diff --git a/.github/labeler.yml b/.github/labeler.yml index 90cdda4d3ca..8506d38a048 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -12,7 +12,7 @@ cudf.polars: - 'python/cudf_polars/**' pylibcudf: - - 'python/cudf/pylibcudf/**' + - 'python/pylibcudf/**' libcudf: - 'cpp/**' diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index b515dbff9f3..af1538ad0c1 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -30,6 +30,7 @@ jobs: - wheel-tests-cudf - wheel-build-cudf-polars - wheel-tests-cudf-polars + - cudf-polars-polars-tests - wheel-build-dask-cudf - wheel-tests-dask-cudf - devcontainer @@ -244,6 +245,17 @@ jobs: # This always runs, but only fails if this PR touches code in # pylibcudf or cudf_polars script: "ci/test_wheel_cudf_polars.sh" + cudf-polars-polars-tests: + needs: wheel-build-cudf-polars + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 + with: + # This selects "ARCH=amd64 + the latest supported Python + CUDA". + matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) + build_type: pull-request + # This always runs, but only fails if this PR touches code in + # pylibcudf or cudf_polars + script: "ci/test_cudf_polars_polars_tests.sh" wheel-build-dask-cudf: needs: wheel-build-cudf secrets: inherit diff --git a/ci/run_cudf_polars_polars_tests.sh b/ci/run_cudf_polars_polars_tests.sh new file mode 100755 index 00000000000..52a827af94c --- /dev/null +++ b/ci/run_cudf_polars_polars_tests.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -euo pipefail + +# Support invoking run_cudf_polars_polars_tests.sh outside the script directory. +# Assumption: polars has been cloned into the root of the repo. +cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../polars/ + +DESELECTED_TESTS=( + "tests/unit/test_polars_import.py::test_polars_import" # relies on a polars built in place + "tests/unit/streaming/test_streaming_sort.py::test_streaming_sort[True]" # relies on polars built in debug mode + "tests/unit/test_cpu_check.py::test_check_cpu_flags_skipped_no_flags" # Mock library error + "tests/docs/test_user_guide.py" # No dot binary in CI image +) + +DESELECTED_TESTS=$(printf -- " --deselect %s" "${DESELECTED_TESTS[@]}") +python -m pytest \ + --import-mode=importlib \ + --cache-clear \ + -m "" \ + -p cudf_polars.testing.plugin \ + -v \ + --tb=short \ + ${DESELECTED_TESTS} \ + "$@" \ + py-polars/tests diff --git a/ci/test_cudf_polars_polars_tests.sh b/ci/test_cudf_polars_polars_tests.sh new file mode 100755 index 00000000000..6c728a9537f --- /dev/null +++ b/ci/test_cudf_polars_polars_tests.sh @@ -0,0 +1,69 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -eou pipefail + +# We will only fail these tests if the PR touches code in pylibcudf +# or cudf_polars itself. +# Note: the three-dot syntax diffs HEAD against the merge-base of +# upstream and HEAD. So this is asking, "does _this branch_ touch +# files in cudf_polars/pylibcudf", rather than "are there changes +# between upstream and this branch which touch cudf_polars/pylibcudf". +# TODO: is the target branch exposed anywhere in an environment variable?
+if [ -n "$(git diff --name-only origin/branch-24.10...HEAD -- python/cudf_polars/ python/pylibcudf/)" ]; then + HAS_CHANGES=1 + rapids-logger "PR has changes in cudf_polars/pylibcudf; test failures will be treated as CI failures" +else + HAS_CHANGES=0 + rapids-logger "PR does not have changes in cudf_polars/pylibcudf; test failures will NOT be treated as CI failures" +fi + +rapids-logger "Download wheels" + +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" +RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist + +# Download the pylibcudf built in the previous step +RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibcudf-dep + +rapids-logger "Install pylibcudf" +python -m pip install ./local-pylibcudf-dep/pylibcudf*.whl + +rapids-logger "Install cudf_polars" +python -m pip install "$(echo ./dist/cudf_polars*.whl)" + +# TAG=$(python -c 'import polars; print(f"py-{polars.__version__}")') +TAG="py-1.7.0" +rapids-logger "Clone polars to ${TAG}" +git clone https://github.com/pola-rs/polars.git --branch ${TAG} --depth 1 + +# Install requirements for running polars tests +rapids-logger "Install polars test requirements" +python -m pip install -r polars/py-polars/requirements-dev.txt -r polars/py-polars/requirements-ci.txt + +function set_exitcode() +{ + EXITCODE=$? +} +EXITCODE=0 +trap set_exitcode ERR +set +e + +rapids-logger "Run polars tests" +./ci/run_cudf_polars_polars_tests.sh + +trap ERR +set -e + +if [ ${EXITCODE} != 0 ]; then + rapids-logger "Running polars test suite FAILED: exitcode ${EXITCODE}" +else + rapids-logger "Running polars test suite PASSED" +fi + +if [ ${HAS_CHANGES} == 1 ]; then + exit ${EXITCODE} +else + exit 0 +fi diff --git a/ci/test_wheel_cudf_polars.sh b/ci/test_wheel_cudf_polars.sh index 9844090258a..b4509bba02e 100755 --- a/ci/test_wheel_cudf_polars.sh +++ b/ci/test_wheel_cudf_polars.sh @@ -13,10 +13,14 @@ set -eou pipefail if [ -n "$(git diff --name-only origin/branch-24.10...HEAD -- python/cudf_polars/ python/pylibcudf/)" ]; then HAS_CHANGES=1 + rapids-logger "PR has changes in cudf_polars/pylibcudf; test failures will be treated as CI failures" else HAS_CHANGES=0 + rapids-logger "PR does not have changes in cudf_polars/pylibcudf; test failures will NOT be treated as CI failures" fi +rapids-logger "Download wheels" + RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 python ./dist @@ -43,6 +47,9 @@ python -m pip install \ "$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \ "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" +rapids-logger "Temporarily pin polars to 1.7.0" +python -m pip install polars==1.7.0 + rapids-logger "Run cudf_polars tests" function set_exitcode() diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 7bc01e64441..26c086046a8 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -378,6 +378,7 @@ add_library( src/io/csv/reader_impl.cu src/io/csv/writer_impl.cu src/io/functions.cpp + src/io/json/host_tree_algorithms.cu src/io/json/json_column.cu src/io/json/json_normalization.cu src/io/json/json_tree.cu diff --git a/cpp/benchmarks/common/ndsh_data_generator/table_helpers.cpp b/cpp/benchmarks/common/ndsh_data_generator/table_helpers.cpp index d4368906702..54d177df401 100644 --- a/cpp/benchmarks/common/ndsh_data_generator/table_helpers.cpp +++
b/cpp/benchmarks/common/ndsh_data_generator/table_helpers.cpp @@ -85,7 +85,7 @@ std::unique_ptr perform_left_join(cudf::table_view const& left_inpu auto const left_selected = left_input.select(left_on); auto const right_selected = right_input.select(right_on); auto const [left_join_indices, right_join_indices] = - cudf::left_join(left_selected, right_selected, cudf::null_equality::EQUAL, mr); + cudf::left_join(left_selected, right_selected, cudf::null_equality::EQUAL, stream, mr); auto const left_indices_span = cudf::device_span{*left_join_indices}; auto const right_indices_span = cudf::device_span{*right_join_indices}; diff --git a/cpp/benchmarks/ndsh/utilities.cpp b/cpp/benchmarks/ndsh/utilities.cpp index 2d514764fc2..62116ddf661 100644 --- a/cpp/benchmarks/ndsh/utilities.cpp +++ b/cpp/benchmarks/ndsh/utilities.cpp @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -146,11 +147,15 @@ std::unique_ptr join_and_gather(cudf::table_view const& left_input, cudf::null_equality compare_nulls) { CUDF_FUNC_RANGE(); - constexpr auto oob_policy = cudf::out_of_bounds_policy::DONT_CHECK; - auto const left_selected = left_input.select(left_on); - auto const right_selected = right_input.select(right_on); - auto const [left_join_indices, right_join_indices] = cudf::inner_join( - left_selected, right_selected, compare_nulls, cudf::get_current_device_resource_ref()); + constexpr auto oob_policy = cudf::out_of_bounds_policy::DONT_CHECK; + auto const left_selected = left_input.select(left_on); + auto const right_selected = right_input.select(right_on); + auto const [left_join_indices, right_join_indices] = + cudf::inner_join(left_selected, + right_selected, + compare_nulls, + cudf::get_default_stream(), + cudf::get_current_device_resource_ref()); auto const left_indices_span = cudf::device_span{*left_join_indices}; auto const right_indices_span = cudf::device_span{*right_join_indices}; diff --git a/cpp/examples/parquet_io/parquet_io.cpp b/cpp/examples/parquet_io/parquet_io.cpp index 442731694fa..9cda22d0695 100644 --- a/cpp/examples/parquet_io/parquet_io.cpp +++ b/cpp/examples/parquet_io/parquet_io.cpp @@ -18,6 +18,8 @@ #include "../utilities/timer.hpp" +#include + /** * @file parquet_io.cpp * @brief Demonstrates usage of the libcudf APIs to read and write @@ -159,8 +161,11 @@ int main(int argc, char const** argv) // Left anti-join the original and transcoded tables // identical tables should not throw an exception and // return an empty indices vector - auto const indices = cudf::left_anti_join( - input->view(), transcoded_input->view(), cudf::null_equality::EQUAL, resource.get()); + auto const indices = cudf::left_anti_join(input->view(), + transcoded_input->view(), + cudf::null_equality::EQUAL, + cudf::get_default_stream(), + resource.get()); // No exception thrown, check indices auto const valid = indices->size() == 0; diff --git a/cpp/include/cudf/datetime.hpp b/cpp/include/cudf/datetime.hpp index c7523c80b2b..7359a0d5fde 100644 --- a/cpp/include/cudf/datetime.hpp +++ b/cpp/include/cudf/datetime.hpp @@ -17,9 +17,12 @@ #pragma once #include +#include #include #include +#include + #include /** @@ -40,6 +43,7 @@ namespace datetime { * cudf::column. 
* * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t years @@ -47,6 +51,7 @@ namespace datetime { */ std::unique_ptr extract_year( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -54,6 +59,7 @@ std::unique_ptr extract_year( * cudf::column. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t months @@ -61,6 +67,7 @@ std::unique_ptr extract_year( */ std::unique_ptr extract_month( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -68,6 +75,7 @@ std::unique_ptr extract_month( * cudf::column. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t days @@ -75,6 +83,7 @@ std::unique_ptr extract_month( */ std::unique_ptr extract_day( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -82,6 +91,7 @@ std::unique_ptr extract_day( * cudf::column. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t days @@ -89,6 +99,7 @@ std::unique_ptr extract_day( */ std::unique_ptr extract_weekday( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -96,6 +107,7 @@ std::unique_ptr extract_weekday( * cudf::column. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t hours @@ -103,6 +115,7 @@ std::unique_ptr extract_weekday( */ std::unique_ptr extract_hour( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -110,6 +123,7 @@ std::unique_ptr extract_hour( * cudf::column. 
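A minimal usage sketch for the stream-accepting extract overloads declared above (the function and variable names below are illustrative, not part of this change):

#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/datetime.hpp>

#include <rmm/cuda_stream_view.hpp>

#include <memory>

// Extract the year component on a caller-provided stream; the memory resource
// argument keeps its default (the current device resource).
std::unique_ptr<cudf::column> extract_years_on_stream(cudf::column_view const& timestamps,
                                                      rmm::cuda_stream_view stream)
{
  return cudf::datetime::extract_year(timestamps, stream);
}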
* * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t minutes @@ -117,6 +131,7 @@ std::unique_ptr extract_hour( */ std::unique_ptr extract_minute( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -124,6 +139,7 @@ std::unique_ptr extract_minute( * cudf::column. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t seconds @@ -131,6 +147,7 @@ std::unique_ptr extract_minute( */ std::unique_ptr extract_second( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -141,6 +158,7 @@ std::unique_ptr extract_second( * For example, the millisecond fraction of 1.234567890 seconds is 234. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t milliseconds @@ -148,6 +166,7 @@ std::unique_ptr extract_second( */ std::unique_ptr extract_millisecond_fraction( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -158,6 +177,7 @@ std::unique_ptr extract_millisecond_fraction( * For example, the microsecond fraction of 1.234567890 seconds is 567. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t microseconds @@ -165,6 +185,7 @@ std::unique_ptr extract_millisecond_fraction( */ std::unique_ptr extract_microsecond_fraction( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -175,6 +196,7 @@ std::unique_ptr extract_microsecond_fraction( * For example, the nanosecond fraction of 1.234567890 seconds is 890. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t nanoseconds @@ -182,6 +204,7 @@ std::unique_ptr extract_microsecond_fraction( */ std::unique_ptr extract_nanosecond_fraction( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @} */ // end of group @@ -196,6 +219,7 @@ std::unique_ptr extract_nanosecond_fraction( * cudf::column. 
* * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column containing last day of the month as TIMESTAMP_DAYS @@ -203,6 +227,7 @@ std::unique_ptr extract_nanosecond_fraction( */ std::unique_ptr last_day_of_month( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -210,6 +235,7 @@ std::unique_ptr last_day_of_month( * returns an int16_t cudf::column. The value is between [1, {365-366}] * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of datatype INT16 containing the day number since the start of the year @@ -217,6 +243,7 @@ std::unique_ptr last_day_of_month( */ std::unique_ptr day_of_year( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -245,6 +272,7 @@ std::unique_ptr day_of_year( * * @param timestamps cudf::column_view of timestamp type * @param months cudf::column_view of integer type containing the number of months to add + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of timestamp type containing the computed timestamps @@ -252,6 +280,7 @@ std::unique_ptr day_of_year( std::unique_ptr add_calendrical_months( cudf::column_view const& timestamps, cudf::column_view const& months, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -280,6 +309,7 @@ std::unique_ptr add_calendrical_months( * * @param timestamps cudf::column_view of timestamp type * @param months cudf::scalar of integer type containing the number of months to add + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @return cudf::column of timestamp type containing the computed timestamps @@ -287,6 +317,7 @@ std::unique_ptr add_calendrical_months( std::unique_ptr add_calendrical_months( cudf::column_view const& timestamps, cudf::scalar const& months, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -297,6 +328,7 @@ std::unique_ptr add_calendrical_months( * `output[i] is null` if `column[i]` is null * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of datatype BOOL8 truth value of the corresponding date @@ -304,6 +336,7 @@ std::unique_ptr add_calendrical_months( */ std::unique_ptr is_leap_year( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -315,11 +348,13 @@ 
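Likewise, a sketch for the calendrical-arithmetic overloads above (placeholder views, not part of this change):

#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/datetime.hpp>

#include <rmm/cuda_stream_view.hpp>

#include <memory>

// Shift each timestamp by the corresponding per-row month offset, on an explicit stream.
std::unique_ptr<cudf::column> shift_by_months(cudf::column_view const& timestamps,
                                              cudf::column_view const& months,
                                              rmm::cuda_stream_view stream)
{
  return cudf::datetime::add_calendrical_months(timestamps, months, stream);
}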
std::unique_ptr is_leap_year( * @throw cudf::logic_error if input column datatype is not a TIMESTAMP * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * @return cudf::column of datatype INT16 of days in month of the corresponding date */ std::unique_ptr days_in_month( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -331,11 +366,13 @@ std::unique_ptr days_in_month( * @throw cudf::logic_error if input column datatype is not a TIMESTAMP * * @param column The input column containing datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * @return A column of INT16 type indicating which quarter the date is in */ std::unique_ptr extract_quarter( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -357,6 +394,7 @@ enum class rounding_frequency : int32_t { * * @param column cudf::column_view of the input datetime values * @param freq rounding_frequency indicating the frequency to round up to + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @throw cudf::logic_error if input column datatype is not TIMESTAMP. @@ -365,6 +403,7 @@ enum class rounding_frequency : int32_t { std::unique_ptr ceil_datetimes( cudf::column_view const& column, rounding_frequency freq, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -372,6 +411,7 @@ std::unique_ptr ceil_datetimes( * * @param column cudf::column_view of the input datetime values * @param freq rounding_frequency indicating the frequency to round down to + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @throw cudf::logic_error if input column datatype is not TIMESTAMP. @@ -380,6 +420,7 @@ std::unique_ptr ceil_datetimes( std::unique_ptr floor_datetimes( cudf::column_view const& column, rounding_frequency freq, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -387,6 +428,7 @@ std::unique_ptr floor_datetimes( * * @param column cudf::column_view of the input datetime values * @param freq rounding_frequency indicating the frequency to round to + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @throw cudf::logic_error if input column datatype is not TIMESTAMP. 
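And a sketch for the rounding overloads (assumes the existing rounding_frequency::DAY enumerator; the column is a placeholder):

#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/datetime.hpp>

#include <rmm/cuda_stream_view.hpp>

#include <memory>

// Round timestamps down to day precision on a caller-provided stream.
std::unique_ptr<cudf::column> floor_to_day(cudf::column_view const& timestamps,
                                           rmm::cuda_stream_view stream)
{
  return cudf::datetime::floor_datetimes(
    timestamps, cudf::datetime::rounding_frequency::DAY, stream);
}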
@@ -395,6 +437,7 @@ std::unique_ptr floor_datetimes( std::unique_ptr round_datetimes( cudf::column_view const& column, rounding_frequency freq, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @} */ // end of group diff --git a/cpp/include/cudf/detail/datetime.hpp b/cpp/include/cudf/detail/datetime.hpp index 31782cbaf8a..9db7e48498f 100644 --- a/cpp/include/cudf/detail/datetime.hpp +++ b/cpp/include/cudf/detail/datetime.hpp @@ -26,111 +26,108 @@ namespace CUDF_EXPORT cudf { namespace datetime { namespace detail { /** - * @copydoc cudf::extract_year(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::extract_year(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_year(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_month(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::extract_month(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_month(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_day(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::extract_day(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_day(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_weekday(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::extract_weekday(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_weekday(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_hour(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::extract_hour(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_hour(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_minute(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::extract_minute(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_minute(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_second(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::extract_second(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ std::unique_ptr extract_second(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_millisecond_fraction(cudf::column_view const&, + * @copydoc cudf::extract_millisecond_fraction(cudf::column_view const&, rmm::cuda_stream_view, * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_millisecond_fraction(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_microsecond_fraction(cudf::column_view const&, + * @copydoc cudf::extract_microsecond_fraction(cudf::column_view const&, rmm::cuda_stream_view, * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_microsecond_fraction(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_nanosecond_fraction(cudf::column_view const&, + * @copydoc cudf::extract_nanosecond_fraction(cudf::column_view const&, rmm::cuda_stream_view, * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_nanosecond_fraction(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::last_day_of_month(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::last_day_of_month(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr last_day_of_month(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::day_of_year(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::day_of_year(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr day_of_year(cudf::column_view const& column, rmm::cuda_stream_view stream, @@ -138,9 +135,8 @@ std::unique_ptr day_of_year(cudf::column_view const& column, /** * @copydoc cudf::add_calendrical_months(cudf::column_view const&, cudf::column_view const&, - * rmm::device_async_resource_ref) + * rmm::cuda_stream_view, rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr add_calendrical_months(cudf::column_view const& timestamps, cudf::column_view const& months, @@ -149,9 +145,8 @@ std::unique_ptr add_calendrical_months(cudf::column_view const& ti /** * @copydoc cudf::add_calendrical_months(cudf::column_view const&, cudf::scalar const&, - * rmm::device_async_resource_ref) + * rmm::cuda_stream_view, rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ std::unique_ptr add_calendrical_months(cudf::column_view const& timestamps, cudf::scalar const& months, @@ -159,9 +154,9 @@ std::unique_ptr add_calendrical_months(cudf::column_view const& ti rmm::device_async_resource_ref mr); /** - * @copydoc cudf::is_leap_year(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::is_leap_year(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr is_leap_year(cudf::column_view const& column, rmm::cuda_stream_view stream, diff --git a/cpp/include/cudf/detail/timezone.hpp b/cpp/include/cudf/detail/timezone.hpp index 5738f9ec8e9..f51d1ba42b2 100644 --- a/cpp/include/cudf/detail/timezone.hpp +++ b/cpp/include/cudf/detail/timezone.hpp @@ -16,6 +16,7 @@ #pragma once #include +#include #include #include @@ -26,14 +27,13 @@ namespace detail { /** * @copydoc cudf::make_timezone_transition_table(std::optional, std::string_view, - * rmm::device_async_resource_ref) + * rmm::cuda_stream_view, rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr make_timezone_transition_table( std::optional tzif_dir, std::string_view timezone_name, - rmm::cuda_stream_view stream, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); } // namespace detail diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index cc8912cb022..a590eb27511 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -97,6 +97,7 @@ class distinct_hash_join; * @param[in] right_keys The right table * @param[in] compare_nulls controls whether null join-key values * should match or not. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -108,6 +109,7 @@ std::pair>, inner_join(cudf::table_view const& left_keys, cudf::table_view const& right_keys, null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -137,6 +139,7 @@ inner_join(cudf::table_view const& left_keys, * @param[in] right_keys The right table * @param[in] compare_nulls controls whether null join-key values * should match or not. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -148,6 +151,7 @@ std::pair>, left_join(cudf::table_view const& left_keys, cudf::table_view const& right_keys, null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -176,6 +180,7 @@ left_join(cudf::table_view const& left_keys, * @param[in] right_keys The right table * @param[in] compare_nulls controls whether null join-key values * should match or not. 
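A sketch of a call site for the stream-aware hash joins above, mirroring the benchmark updates earlier in this diff (the table views are placeholders):

#include <cudf/join.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/types.hpp>

#include <rmm/cuda_stream_view.hpp>

// Compute inner-join gather maps on an explicit stream; the result is the usual
// pair of device vectors of matching left/right row indices.
auto inner_join_on_stream(cudf::table_view const& left_keys,
                          cudf::table_view const& right_keys,
                          rmm::cuda_stream_view stream)
{
  return cudf::inner_join(left_keys, right_keys, cudf::null_equality::EQUAL, stream);
}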
+ * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -187,6 +192,7 @@ std::pair>, full_join(cudf::table_view const& left_keys, cudf::table_view const& right_keys, null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -205,6 +211,7 @@ full_join(cudf::table_view const& left_keys, * @param left_keys The left table * @param right_keys The right table * @param compare_nulls Controls whether null join-key values should match or not + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A vector `left_indices` that can be used to construct @@ -215,6 +222,7 @@ std::unique_ptr> left_semi_join( cudf::table_view const& left_keys, cudf::table_view const& right_keys, null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -236,6 +244,7 @@ std::unique_ptr> left_semi_join( * @param[in] right_keys The right table * @param[in] compare_nulls controls whether null join-key values * should match or not. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A column `left_indices` that can be used to construct @@ -246,6 +255,7 @@ std::unique_ptr> left_anti_join( cudf::table_view const& left_keys, cudf::table_view const& right_keys, null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -266,6 +276,7 @@ std::unique_ptr> left_anti_join( * * @param left The left table * @param right The right table + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory * * @return Result of cross joining `left` and `right` tables @@ -273,6 +284,7 @@ std::unique_ptr> left_anti_join( std::unique_ptr cross_join( cudf::table_view const& left, cudf::table_view const& right, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -567,6 +579,7 @@ class distinct_hash_join { * @param right The right table * @param binary_predicate The condition on which to join * @param output_size Optional value which allows users to specify the exact output size + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -578,6 +591,7 @@ conditional_inner_join(table_view const& left, table_view const& right, ast::expression const& binary_predicate, std::optional output_size = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** 
@@ -612,6 +626,7 @@ conditional_inner_join(table_view const& left, * @param right The right table * @param binary_predicate The condition on which to join * @param output_size Optional value which allows users to specify the exact output size + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -623,6 +638,7 @@ conditional_left_join(table_view const& left, table_view const& right, ast::expression const& binary_predicate, std::optional output_size = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -655,6 +671,7 @@ conditional_left_join(table_view const& left, * @param left The left table * @param right The right table * @param binary_predicate The condition on which to join + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -665,6 +682,7 @@ std::pair>, conditional_full_join(table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -693,6 +711,7 @@ conditional_full_join(table_view const& left, * @param right The right table * @param binary_predicate The condition on which to join * @param output_size Optional value which allows users to specify the exact output size + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A vector `left_indices` that can be used to construct the result of @@ -704,6 +723,7 @@ std::unique_ptr> conditional_left_semi_join( table_view const& right, ast::expression const& binary_predicate, std::optional output_size = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -732,6 +752,7 @@ std::unique_ptr> conditional_left_semi_join( * @param right The right table * @param binary_predicate The condition on which to join * @param output_size Optional value which allows users to specify the exact output size + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A vector `left_indices` that can be used to construct the result of @@ -743,6 +764,7 @@ std::unique_ptr> conditional_left_anti_join( table_view const& right, ast::expression const& binary_predicate, std::optional output_size = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -786,6 +808,7 @@ std::unique_ptr> conditional_left_anti_join( * @param output_size_data An optional pair of values indicating the exact output size and the * number of matches for each row in the larger of the two input tables, left or right (may be * precomputed using the corresponding mixed_inner_join_size API). 
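For the expression-based joins, a sketch of threading the new stream argument through alongside an AST predicate (the column index and the equality condition are illustrative only):

#include <cudf/ast/expressions.hpp>
#include <cudf/join.hpp>
#include <cudf/table/table_view.hpp>

#include <rmm/cuda_stream_view.hpp>

#include <optional>

// Conditional inner join on "left.col0 == right.col0", run on an explicit stream.
auto conditional_inner_join_on_stream(cudf::table_view const& left,
                                      cudf::table_view const& right,
                                      rmm::cuda_stream_view stream)
{
  auto left_ref  = cudf::ast::column_reference(0, cudf::ast::table_reference::LEFT);
  auto right_ref = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT);
  auto predicate = cudf::ast::operation(cudf::ast::ast_operator::EQUAL, left_ref, right_ref);
  return cudf::conditional_inner_join(left, right, predicate, std::nullopt, stream);
}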
+ * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -801,6 +824,7 @@ mixed_inner_join( ast::expression const& binary_predicate, null_equality compare_nulls = null_equality::EQUAL, std::optional>> output_size_data = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -846,6 +870,7 @@ mixed_inner_join( * @param output_size_data An optional pair of values indicating the exact output size and the * number of matches for each row in the larger of the two input tables, left or right (may be * precomputed using the corresponding mixed_left_join_size API). + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -861,6 +886,7 @@ mixed_left_join( ast::expression const& binary_predicate, null_equality compare_nulls = null_equality::EQUAL, std::optional>> output_size_data = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -906,6 +932,7 @@ mixed_left_join( * @param output_size_data An optional pair of values indicating the exact output size and the * number of matches for each row in the larger of the two input tables, left or right (may be * precomputed using the corresponding mixed_full_join_size API). + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -921,6 +948,7 @@ mixed_full_join( ast::expression const& binary_predicate, null_equality compare_nulls = null_equality::EQUAL, std::optional>> output_size_data = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -956,6 +984,7 @@ mixed_full_join( * @param right_conditional The right table used for the conditional join * @param binary_predicate The condition on which to join * @param compare_nulls Whether or not null values join to each other or not + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -968,6 +997,7 @@ std::unique_ptr> mixed_left_semi_join( table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -1004,6 +1034,7 @@ std::unique_ptr> mixed_left_semi_join( * @param right_conditional The right table used for the conditional join * @param binary_predicate The condition on which to join * @param compare_nulls Whether or not null values join to each other or not + * @param stream CUDA stream used for device memory operations and kernel launches * 
@param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -1016,6 +1047,7 @@ std::unique_ptr> mixed_left_anti_join( table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -1041,6 +1073,7 @@ std::unique_ptr> mixed_left_anti_join( * @param right_conditional The right table used for the conditional join * @param binary_predicate The condition on which to join * @param compare_nulls Whether or not null values join to each other or not + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair containing the size that would result from performing the @@ -1056,6 +1089,7 @@ std::pair>> mixed_in table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -1081,6 +1115,7 @@ std::pair>> mixed_in * @param right_conditional The right table used for the conditional join * @param binary_predicate The condition on which to join * @param compare_nulls Whether or not null values join to each other or not + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair containing the size that would result from performing the @@ -1096,6 +1131,7 @@ std::pair>> mixed_le table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -1111,6 +1147,7 @@ std::pair>> mixed_le * @param left The left table * @param right The right table * @param binary_predicate The condition on which to join + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return The size that would result from performing the requested join @@ -1119,6 +1156,7 @@ std::size_t conditional_inner_join_size( table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -1134,6 +1172,7 @@ std::size_t conditional_inner_join_size( * @param left The left table * @param right The right table * @param binary_predicate The condition on which to join + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return The size that would result from performing the requested join @@ -1142,6 +1181,7 @@ std::size_t conditional_left_join_size( table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream = cudf::get_default_stream(), 
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -1157,6 +1197,7 @@ std::size_t conditional_left_join_size( * @param left The left table * @param right The right table * @param binary_predicate The condition on which to join + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return The size that would result from performing the requested join @@ -1165,6 +1206,7 @@ std::size_t conditional_left_semi_join_size( table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -1180,6 +1222,7 @@ std::size_t conditional_left_semi_join_size( * @param left The left table * @param right The right table * @param binary_predicate The condition on which to join + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return The size that would result from performing the requested join @@ -1188,6 +1231,7 @@ std::size_t conditional_left_anti_join_size( table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @} */ // end of group } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/timezone.hpp b/cpp/include/cudf/timezone.hpp index aa903770e26..f6de1056c24 100644 --- a/cpp/include/cudf/timezone.hpp +++ b/cpp/include/cudf/timezone.hpp @@ -15,9 +15,12 @@ */ #pragma once +#include #include #include +#include + #include #include #include @@ -43,6 +46,7 @@ static constexpr uint32_t solar_cycle_entry_count = 2 * solar_cycle_years; * * @param tzif_dir The directory where the TZif files are located * @param timezone_name standard timezone name (for example, "America/Los_Angeles") + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory. * * @return The transition table for the given timezone @@ -50,6 +54,7 @@ static constexpr uint32_t solar_cycle_entry_count = 2 * solar_cycle_years; std::unique_ptr
make_timezone_transition_table( std::optional tzif_dir, std::string_view timezone_name, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); } // namespace CUDF_EXPORT cudf diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index fd9a6b8f5fe..ddb0dbcd96d 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -580,142 +580,167 @@ std::unique_ptr extract_quarter(column_view const& column, std::unique_ptr ceil_datetimes(column_view const& column, rounding_frequency freq, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::round_general( - detail::rounding_function::CEIL, freq, column, cudf::get_default_stream(), mr); + return detail::round_general(detail::rounding_function::CEIL, freq, column, stream, mr); } std::unique_ptr floor_datetimes(column_view const& column, rounding_frequency freq, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::round_general( - detail::rounding_function::FLOOR, freq, column, cudf::get_default_stream(), mr); + return detail::round_general(detail::rounding_function::FLOOR, freq, column, stream, mr); } std::unique_ptr round_datetimes(column_view const& column, rounding_frequency freq, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::round_general( - detail::rounding_function::ROUND, freq, column, cudf::get_default_stream(), mr); + return detail::round_general(detail::rounding_function::ROUND, freq, column, stream, mr); } -std::unique_ptr extract_year(column_view const& column, rmm::device_async_resource_ref mr) +std::unique_ptr extract_year(column_view const& column, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::extract_year(column, cudf::get_default_stream(), mr); + return detail::extract_year(column, stream, mr); } -std::unique_ptr extract_month(column_view const& column, rmm::device_async_resource_ref mr) +std::unique_ptr extract_month(column_view const& column, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::extract_month(column, cudf::get_default_stream(), mr); + return detail::extract_month(column, stream, mr); } -std::unique_ptr extract_day(column_view const& column, rmm::device_async_resource_ref mr) +std::unique_ptr extract_day(column_view const& column, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::extract_day(column, cudf::get_default_stream(), mr); + return detail::extract_day(column, stream, mr); } std::unique_ptr extract_weekday(column_view const& column, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::extract_weekday(column, cudf::get_default_stream(), mr); + return detail::extract_weekday(column, stream, mr); } -std::unique_ptr extract_hour(column_view const& column, rmm::device_async_resource_ref mr) +std::unique_ptr extract_hour(column_view const& column, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::extract_hour(column, cudf::get_default_stream(), mr); + return detail::extract_hour(column, stream, mr); } -std::unique_ptr extract_minute(column_view const& column, rmm::device_async_resource_ref mr) +std::unique_ptr extract_minute(column_view const& 
                                     column,
+                                    rmm::cuda_stream_view stream,
+                                    rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::extract_minute(column, cudf::get_default_stream(), mr);
+  return detail::extract_minute(column, stream, mr);
 }
 
-std::unique_ptr<column> extract_second(column_view const& column, rmm::device_async_resource_ref mr)
+std::unique_ptr<column> extract_second(column_view const& column,
+                                       rmm::cuda_stream_view stream,
+                                       rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::extract_second(column, cudf::get_default_stream(), mr);
+  return detail::extract_second(column, stream, mr);
 }
 
 std::unique_ptr<column> extract_millisecond_fraction(column_view const& column,
+                                                     rmm::cuda_stream_view stream,
                                                      rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::extract_millisecond_fraction(column, cudf::get_default_stream(), mr);
+  return detail::extract_millisecond_fraction(column, stream, mr);
 }
 
 std::unique_ptr<column> extract_microsecond_fraction(column_view const& column,
+                                                     rmm::cuda_stream_view stream,
                                                      rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::extract_microsecond_fraction(column, cudf::get_default_stream(), mr);
+  return detail::extract_microsecond_fraction(column, stream, mr);
 }
 
 std::unique_ptr<column> extract_nanosecond_fraction(column_view const& column,
+                                                    rmm::cuda_stream_view stream,
                                                     rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::extract_nanosecond_fraction(column, cudf::get_default_stream(), mr);
+  return detail::extract_nanosecond_fraction(column, stream, mr);
 }
 
 std::unique_ptr<column> last_day_of_month(column_view const& column,
+                                          rmm::cuda_stream_view stream,
                                           rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::last_day_of_month(column, cudf::get_default_stream(), mr);
+  return detail::last_day_of_month(column, stream, mr);
 }
 
-std::unique_ptr<column> day_of_year(column_view const& column, rmm::device_async_resource_ref mr)
+std::unique_ptr<column> day_of_year(column_view const& column,
+                                    rmm::cuda_stream_view stream,
+                                    rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::day_of_year(column, cudf::get_default_stream(), mr);
+  return detail::day_of_year(column, stream, mr);
 }
 
 std::unique_ptr<cudf::column> add_calendrical_months(cudf::column_view const& timestamp_column,
                                                      cudf::column_view const& months_column,
+                                                     rmm::cuda_stream_view stream,
                                                      rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::add_calendrical_months(
-    timestamp_column, months_column, cudf::get_default_stream(), mr);
+  return detail::add_calendrical_months(timestamp_column, months_column, stream, mr);
 }
 
 std::unique_ptr<cudf::column> add_calendrical_months(cudf::column_view const& timestamp_column,
                                                      cudf::scalar const& months,
+                                                     rmm::cuda_stream_view stream,
                                                      rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::add_calendrical_months(timestamp_column, months, cudf::get_default_stream(), mr);
+  return detail::add_calendrical_months(timestamp_column, months, stream, mr);
 }
 
-std::unique_ptr<column> is_leap_year(column_view const& column, rmm::device_async_resource_ref mr)
+std::unique_ptr<column> is_leap_year(column_view const& column,
+                                     rmm::cuda_stream_view stream,
+                                     rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::is_leap_year(column, cudf::get_default_stream(), mr);
+  return detail::is_leap_year(column, stream, mr);
 }
 
-std::unique_ptr<column> days_in_month(column_view const& column, rmm::device_async_resource_ref mr)
+std::unique_ptr<column> days_in_month(column_view const& column,
+                                      rmm::cuda_stream_view stream,
+                                      rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::days_in_month(column, cudf::get_default_stream(), mr);
+  return detail::days_in_month(column, stream, mr);
 }
 
 std::unique_ptr<column> extract_quarter(column_view const& column,
+                                        rmm::cuda_stream_view stream,
                                         rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::extract_quarter(column, cudf::get_default_stream(), mr);
+  return detail::extract_quarter(column, stream, mr);
 }
 
 }  // namespace datetime
diff --git a/cpp/src/datetime/timezone.cpp b/cpp/src/datetime/timezone.cpp
index 6498a5e6c55..cf239297255 100644
--- a/cpp/src/datetime/timezone.cpp
+++ b/cpp/src/datetime/timezone.cpp
@@ -380,11 +380,11 @@ static int64_t get_transition_time(dst_transition_s const& trans, int year)
 std::unique_ptr<table>
make_timezone_transition_table(std::optional tzif_dir, std::string_view timezone_name, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::make_timezone_transition_table( - tzif_dir, timezone_name, cudf::get_default_stream(), mr); + return detail::make_timezone_transition_table(tzif_dir, timezone_name, stream, mr); } namespace detail { diff --git a/cpp/src/io/json/host_tree_algorithms.cu b/cpp/src/io/json/host_tree_algorithms.cu new file mode 100644 index 00000000000..70d61132b42 --- /dev/null +++ b/cpp/src/io/json/host_tree_algorithms.cu @@ -0,0 +1,808 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "io/utilities/parsing_utils.cuh" +#include "io/utilities/string_parsing.hpp" +#include "nested_json.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cudf::io::json::detail { + +/** + * @brief Get the column indices for the values column for array of arrays rows + * + * @param row_array_children_level The level of the row array's children + * @param d_tree The tree metadata + * @param col_ids The column ids + * @param num_columns The number of columns + * @param stream The stream to use + * @return The value columns' indices + */ +rmm::device_uvector get_values_column_indices(TreeDepthT const row_array_children_level, + tree_meta_t const& d_tree, + device_span col_ids, + size_type const num_columns, + rmm::cuda_stream_view stream) +{ + CUDF_FUNC_RANGE(); + auto [level2_nodes, level2_indices] = get_array_children_indices( + row_array_children_level, d_tree.node_levels, d_tree.parent_node_ids, stream); + auto col_id_location = thrust::make_permutation_iterator(col_ids.begin(), level2_nodes.begin()); + rmm::device_uvector values_column_indices(num_columns, stream); + thrust::scatter(rmm::exec_policy(stream), + level2_indices.begin(), + level2_indices.end(), + col_id_location, + values_column_indices.begin()); + return values_column_indices; +} + +/** + * @brief Copies strings specified by pair of begin, end offsets to host vector of strings. 
+ * + * @param input String device buffer + * @param node_range_begin Begin offset of the strings + * @param node_range_end End offset of the strings + * @param stream CUDA stream + * @return Vector of strings + */ +std::vector copy_strings_to_host_sync( + device_span input, + device_span node_range_begin, + device_span node_range_end, + rmm::cuda_stream_view stream) +{ + CUDF_FUNC_RANGE(); + auto const num_strings = node_range_begin.size(); + rmm::device_uvector string_offsets(num_strings, stream); + rmm::device_uvector string_lengths(num_strings, stream); + auto d_offset_pairs = thrust::make_zip_iterator(node_range_begin.begin(), node_range_end.begin()); + thrust::transform(rmm::exec_policy(stream), + d_offset_pairs, + d_offset_pairs + num_strings, + thrust::make_zip_iterator(string_offsets.begin(), string_lengths.begin()), + [] __device__(auto const& offsets) { + // Note: first character for non-field columns + return thrust::make_tuple( + static_cast(thrust::get<0>(offsets)), + static_cast(thrust::get<1>(offsets) - thrust::get<0>(offsets))); + }); + + cudf::io::parse_options_view options_view{}; + options_view.quotechar = '\0'; // no quotes + options_view.keepquotes = true; + auto d_offset_length_it = + thrust::make_zip_iterator(string_offsets.begin(), string_lengths.begin()); + auto d_column_names = parse_data(input.data(), + d_offset_length_it, + num_strings, + data_type{type_id::STRING}, + rmm::device_buffer{}, + 0, + options_view, + stream, + cudf::get_current_device_resource_ref()); + auto to_host = [stream](auto const& col) { + if (col.is_empty()) return std::vector{}; + auto const scv = cudf::strings_column_view(col); + auto const h_chars = cudf::detail::make_host_vector_async( + cudf::device_span(scv.chars_begin(stream), scv.chars_size(stream)), stream); + auto const h_offsets = cudf::detail::make_host_vector_async( + cudf::device_span(scv.offsets().data() + scv.offset(), + scv.size() + 1), + stream); + stream.synchronize(); + + // build std::string vector from chars and offsets + std::vector host_data; + host_data.reserve(col.size()); + std::transform( + std::begin(h_offsets), + std::end(h_offsets) - 1, + std::begin(h_offsets) + 1, + std::back_inserter(host_data), + [&](auto start, auto end) { return std::string(h_chars.data() + start, end - start); }); + return host_data; + }; + return to_host(d_column_names->view()); +} + +/** + * @brief Checks if all strings in each string column in the tree are nulls. + * For non-string columns, it's set as true. If any of rows in a string column is false, it's set as + * false. + * + * @param input Input JSON string device data + * @param d_column_tree column tree representation of JSON string + * @param tree Node tree representation of the JSON string + * @param col_ids Column ids of the nodes in the tree + * @param options Parsing options specifying the parsing behaviour + * @param stream CUDA stream used for device memory operations and kernel launches + * @return Array of bytes where each byte indicate if it is all nulls string column. 
+ */ +rmm::device_uvector is_all_nulls_each_column(device_span input, + tree_meta_t const& d_column_tree, + tree_meta_t const& tree, + device_span col_ids, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream) +{ + auto const num_nodes = col_ids.size(); + auto const num_cols = d_column_tree.node_categories.size(); + rmm::device_uvector is_all_nulls(num_cols, stream); + thrust::fill(rmm::exec_policy(stream), is_all_nulls.begin(), is_all_nulls.end(), true); + + auto parse_opt = parsing_options(options, stream); + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::counting_iterator(0), + num_nodes, + [options = parse_opt.view(), + data = input.data(), + column_categories = d_column_tree.node_categories.begin(), + col_ids = col_ids.begin(), + range_begin = tree.node_range_begin.begin(), + range_end = tree.node_range_end.begin(), + is_all_nulls = is_all_nulls.begin()] __device__(size_type i) { + auto const node_category = column_categories[col_ids[i]]; + if (node_category == NC_STR or node_category == NC_VAL) { + auto const is_null_literal = serialized_trie_contains( + options.trie_na, + {data + range_begin[i], static_cast(range_end[i] - range_begin[i])}); + if (!is_null_literal) is_all_nulls[col_ids[i]] = false; + } + }); + return is_all_nulls; +} + +NodeIndexT get_row_array_parent_col_id(device_span col_ids, + bool is_enabled_lines, + rmm::cuda_stream_view stream) +{ + NodeIndexT value = parent_node_sentinel; + if (!col_ids.empty()) { + auto const list_node_index = is_enabled_lines ? 0 : 1; + CUDF_CUDA_TRY(cudaMemcpyAsync(&value, + col_ids.data() + list_node_index, + sizeof(NodeIndexT), + cudaMemcpyDefault, + stream.value())); + stream.synchronize(); + } + return value; +} +/** + * @brief Holds member data pointers of `d_json_column` + * + */ +struct json_column_data { + using row_offset_t = json_column::row_offset_t; + row_offset_t* string_offsets; + row_offset_t* string_lengths; + row_offset_t* child_offsets; + bitmask_type* validity; +}; + +std::pair, + std::unordered_map>> +build_tree(device_json_column& root, + std::vector const& is_str_column_all_nulls, + tree_meta_t& d_column_tree, + device_span d_unique_col_ids, + device_span d_max_row_offsets, + std::vector const& column_names, + NodeIndexT row_array_parent_col_id, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); +void scatter_offsets( + tree_meta_t& tree, + device_span col_ids, + device_span row_offsets, + device_span node_ids, + device_span sorted_col_ids, // Reuse this for parent_col_ids + tree_meta_t& d_column_tree, + host_span ignore_vals, + std::unordered_map>& columns, + rmm::cuda_stream_view stream); + +/** + * @brief Constructs `d_json_column` from node tree representation + * Newly constructed columns are insert into `root`'s children. + * `root` must be a list type. 
+ * + * @param input Input JSON string device data + * @param tree Node tree representation of the JSON string + * @param col_ids Column ids of the nodes in the tree + * @param row_offsets Row offsets of the nodes in the tree + * @param root Root node of the `d_json_column` tree + * @param is_array_of_arrays Whether the tree is an array of arrays + * @param options Parsing options specifying the parsing behaviour + * options affecting behaviour are + * is_enabled_lines: Whether the input is a line-delimited JSON + * is_enabled_mixed_types_as_string: Whether to enable reading mixed types as string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the device memory + * of child_offets and validity members of `d_json_column` + */ +void make_device_json_column(device_span input, + tree_meta_t& tree, + device_span col_ids, + device_span row_offsets, + device_json_column& root, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + + bool const is_enabled_lines = options.is_enabled_lines(); + bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string(); + auto const num_nodes = col_ids.size(); + rmm::device_uvector sorted_col_ids(col_ids.size(), stream); // make a copy + thrust::copy(rmm::exec_policy(stream), col_ids.begin(), col_ids.end(), sorted_col_ids.begin()); + + // sort by {col_id} on {node_ids} stable + rmm::device_uvector node_ids(col_ids.size(), stream); + thrust::sequence(rmm::exec_policy(stream), node_ids.begin(), node_ids.end()); + thrust::stable_sort_by_key( + rmm::exec_policy(stream), sorted_col_ids.begin(), sorted_col_ids.end(), node_ids.begin()); + + NodeIndexT const row_array_parent_col_id = + get_row_array_parent_col_id(col_ids, is_enabled_lines, stream); + + // 1. gather column information. + auto [d_column_tree, d_unique_col_ids, d_max_row_offsets] = + reduce_to_column_tree(tree, + col_ids, + sorted_col_ids, + node_ids, + row_offsets, + is_array_of_arrays, + row_array_parent_col_id, + stream); + auto num_columns = d_unique_col_ids.size(); + std::vector column_names = copy_strings_to_host_sync( + input, d_column_tree.node_range_begin, d_column_tree.node_range_end, stream); + // array of arrays column names + if (is_array_of_arrays) { + auto const unique_col_ids = cudf::detail::make_host_vector_async(d_unique_col_ids, stream); + auto const column_parent_ids = + cudf::detail::make_host_vector_async(d_column_tree.parent_node_ids, stream); + TreeDepthT const row_array_children_level = is_enabled_lines ? 1 : 2; + auto values_column_indices = + get_values_column_indices(row_array_children_level, tree, col_ids, num_columns, stream); + auto h_values_column_indices = + cudf::detail::make_host_vector_sync(values_column_indices, stream); + std::transform(unique_col_ids.begin(), + unique_col_ids.end(), + column_names.begin(), + column_names.begin(), + [&h_values_column_indices, &column_parent_ids, row_array_parent_col_id]( + auto col_id, auto name) mutable { + return column_parent_ids[col_id] == row_array_parent_col_id + ? 
std::to_string(h_values_column_indices[col_id]) + : name; + }); + } + + auto const is_str_column_all_nulls = [&, &column_tree = d_column_tree]() { + if (is_enabled_mixed_types_as_string) { + return cudf::detail::make_std_vector_sync( + is_all_nulls_each_column(input, column_tree, tree, col_ids, options, stream), stream); + } + return std::vector(); + }(); + auto [ignore_vals, columns] = build_tree(root, + is_str_column_all_nulls, + d_column_tree, + d_unique_col_ids, + d_max_row_offsets, + column_names, + row_array_parent_col_id, + is_array_of_arrays, + options, + stream, + mr); + + scatter_offsets(tree, + col_ids, + row_offsets, + node_ids, + sorted_col_ids, + d_column_tree, + ignore_vals, + columns, + stream); +} + +std::pair, + std::unordered_map>> +build_tree(device_json_column& root, + std::vector const& is_str_column_all_nulls, + tree_meta_t& d_column_tree, + device_span d_unique_col_ids, + device_span d_max_row_offsets, + std::vector const& column_names, + NodeIndexT row_array_parent_col_id, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string(); + auto unique_col_ids = cudf::detail::make_host_vector_async(d_unique_col_ids, stream); + auto column_categories = + cudf::detail::make_host_vector_async(d_column_tree.node_categories, stream); + auto const column_parent_ids = + cudf::detail::make_host_vector_async(d_column_tree.parent_node_ids, stream); + auto column_range_beg = + cudf::detail::make_host_vector_async(d_column_tree.node_range_begin, stream); + auto const max_row_offsets = cudf::detail::make_host_vector_async(d_max_row_offsets, stream); + auto num_columns = d_unique_col_ids.size(); + + auto to_json_col_type = [](auto category) { + switch (category) { + case NC_STRUCT: return json_col_t::StructColumn; + case NC_LIST: return json_col_t::ListColumn; + case NC_STR: [[fallthrough]]; + case NC_VAL: return json_col_t::StringColumn; + default: return json_col_t::Unknown; + } + }; + auto init_to_zero = [stream](auto& v) { + thrust::uninitialized_fill(rmm::exec_policy_nosync(stream), v.begin(), v.end(), 0); + }; + + auto initialize_json_columns = [&](auto i, auto& col, auto column_category) { + if (column_category == NC_ERR || column_category == NC_FN) { + return; + } else if (column_category == NC_VAL || column_category == NC_STR) { + col.string_offsets.resize(max_row_offsets[i] + 1, stream); + col.string_lengths.resize(max_row_offsets[i] + 1, stream); + init_to_zero(col.string_offsets); + init_to_zero(col.string_lengths); + } else if (column_category == NC_LIST) { + col.child_offsets.resize(max_row_offsets[i] + 2, stream); + init_to_zero(col.child_offsets); + } + col.num_rows = max_row_offsets[i] + 1; + col.validity = + cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr); + col.type = to_json_col_type(column_category); + }; + + auto reinitialize_as_string = [&](auto i, auto& col) { + col.string_offsets.resize(max_row_offsets[i] + 1, stream); + col.string_lengths.resize(max_row_offsets[i] + 1, stream); + init_to_zero(col.string_offsets); + init_to_zero(col.string_lengths); + col.num_rows = max_row_offsets[i] + 1; + col.validity = + cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr); + col.type = json_col_t::StringColumn; + // destroy references of all child columns after this step, by calling remove_child_columns + }; + + path_from_tree 
tree_path{column_categories, + column_parent_ids, + column_names, + is_array_of_arrays, + row_array_parent_col_id}; + + // 2. generate nested columns tree and its device_memory + // reorder unique_col_ids w.r.t. column_range_begin for order of column to be in field order. + auto h_range_col_id_it = + thrust::make_zip_iterator(column_range_beg.begin(), unique_col_ids.begin()); + std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { + return thrust::get<0>(a) < thrust::get<0>(b); + }); + + // use hash map because we may skip field name's col_ids + std::unordered_map> columns; + // map{parent_col_id, child_col_name}> = child_col_id, used for null value column tracking + std::map, NodeIndexT> mapped_columns; + // find column_ids which are values, but should be ignored in validity + auto ignore_vals = cudf::detail::make_host_vector(num_columns, stream); + std::vector is_mixed_type_column(num_columns, 0); + std::vector is_pruned(num_columns, 0); + // for columns that are not mixed type but have been forced as string + std::vector forced_as_string_column(num_columns); + columns.try_emplace(parent_node_sentinel, std::ref(root)); + + std::function remove_child_columns = + [&](NodeIndexT this_col_id, device_json_column& col) { + for (auto col_name : col.column_order) { + auto child_id = mapped_columns[{this_col_id, col_name}]; + is_mixed_type_column[child_id] = 1; + remove_child_columns(child_id, col.child_columns.at(col_name)); + mapped_columns.erase({this_col_id, col_name}); + columns.erase(child_id); + } + col.child_columns.clear(); // their references are deleted above. + col.column_order.clear(); + }; + + auto name_and_parent_index = [&is_array_of_arrays, + &row_array_parent_col_id, + &column_parent_ids, + &column_categories, + &column_names](auto this_col_id) { + std::string name = ""; + auto parent_col_id = column_parent_ids[this_col_id]; + if (parent_col_id == parent_node_sentinel || column_categories[parent_col_id] == NC_LIST) { + if (is_array_of_arrays && parent_col_id == row_array_parent_col_id) { + name = column_names[this_col_id]; + } else { + name = list_child_name; + } + } else if (column_categories[parent_col_id] == NC_FN) { + auto field_name_col_id = parent_col_id; + parent_col_id = column_parent_ids[parent_col_id]; + name = column_names[field_name_col_id]; + } else { + CUDF_FAIL("Unexpected parent column category"); + } + return std::pair{name, parent_col_id}; + }; + + // Prune columns that are not required to be parsed. + if (options.is_enabled_prune_columns()) { + for (auto const this_col_id : unique_col_ids) { + if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) { + continue; + } + // Struct, List, String, Value + auto [name, parent_col_id] = name_and_parent_index(this_col_id); + // get path of this column, and get its dtype if present in options + auto const nt = tree_path.get_path(this_col_id); + std::optional const user_dtype = get_path_data_type(nt, options); + if (!user_dtype.has_value() and parent_col_id != parent_node_sentinel) { + is_pruned[this_col_id] = 1; + continue; + } else { + // make sure all its parents are not pruned. + while (parent_col_id != parent_node_sentinel and is_pruned[parent_col_id] == 1) { + is_pruned[parent_col_id] = 0; + parent_col_id = column_parent_ids[parent_col_id]; + } + } + } + } + + // Build the column tree, also, handles mixed types. 
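// Illustration only (not part of this change): the loop that follows resolves the case
// where one column name maps to both a nested (list/struct) node and a value/string node.
// With mixed_types_as_string enabled, the pair is treated as a mixed-type column only if
// at least one of the value/string sides is not entirely null; an all-null side is ignored.
// A minimal host-side sketch of that decision rule, using hypothetical names instead of the
// column-tree metadata used by the real code:
enum class node_kind { value_or_string, list_or_struct };

bool treat_as_mixed_type(node_kind new_kind, bool new_is_all_null,
                         node_kind old_kind, bool old_is_all_null)
{
  // An all-null value/string side does not force the merged column to mixed type.
  if (new_kind == node_kind::value_or_string && new_is_all_null) { return false; }
  if (old_kind == node_kind::value_or_string && old_is_all_null) { return false; }
  return true;
}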
+ for (auto const this_col_id : unique_col_ids) { + if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) { + continue; + } + // Struct, List, String, Value + auto [name, parent_col_id] = name_and_parent_index(this_col_id); + + // if parent is mixed type column or this column is pruned or if parent + // has been forced as string, ignore this column. + if (parent_col_id != parent_node_sentinel && + (is_mixed_type_column[parent_col_id] || is_pruned[this_col_id]) || + forced_as_string_column[parent_col_id]) { + ignore_vals[this_col_id] = 1; + if (is_mixed_type_column[parent_col_id]) { is_mixed_type_column[this_col_id] = 1; } + if (forced_as_string_column[parent_col_id]) { forced_as_string_column[this_col_id] = true; } + continue; + } + + // If the child is already found, + // replace if this column is a nested column and the existing was a value column + // ignore this column if this column is a value column and the existing was a nested column + auto it = columns.find(parent_col_id); + CUDF_EXPECTS(it != columns.end(), "Parent column not found"); + auto& parent_col = it->second.get(); + bool replaced = false; + if (mapped_columns.count({parent_col_id, name}) > 0) { + auto const old_col_id = mapped_columns[{parent_col_id, name}]; + // If mixed type as string is enabled, make both of them strings and merge them. + // All child columns will be ignored when parsing. + if (is_enabled_mixed_types_as_string) { + bool const is_mixed_type = [&]() { + // If new or old is STR and they are all not null, make it mixed type, else ignore. + if (column_categories[this_col_id] == NC_VAL || + column_categories[this_col_id] == NC_STR) { + if (is_str_column_all_nulls[this_col_id]) return false; + } + if (column_categories[old_col_id] == NC_VAL || column_categories[old_col_id] == NC_STR) { + if (is_str_column_all_nulls[old_col_id]) return false; + } + return true; + }(); + if (is_mixed_type) { + is_mixed_type_column[this_col_id] = 1; + is_mixed_type_column[old_col_id] = 1; + // if old col type (not cat) is list or struct, replace with string. + auto& col = columns.at(old_col_id).get(); + if (col.type == json_col_t::ListColumn or col.type == json_col_t::StructColumn) { + reinitialize_as_string(old_col_id, col); + remove_child_columns(old_col_id, col); + // all its children (which are already inserted) are ignored later. 
+ } + col.forced_as_string_column = true; + columns.try_emplace(this_col_id, columns.at(old_col_id)); + continue; + } + } + + if (column_categories[this_col_id] == NC_VAL || column_categories[this_col_id] == NC_STR) { + ignore_vals[this_col_id] = 1; + continue; + } + if (column_categories[old_col_id] == NC_VAL || column_categories[old_col_id] == NC_STR) { + // remap + ignore_vals[old_col_id] = 1; + mapped_columns.erase({parent_col_id, name}); + columns.erase(old_col_id); + parent_col.child_columns.erase(name); + replaced = true; // to skip duplicate name in column_order + } else { + // If this is a nested column but we're trying to insert either (a) a list node into a + // struct column or (b) a struct node into a list column, we fail + CUDF_EXPECTS(not((column_categories[old_col_id] == NC_LIST and + column_categories[this_col_id] == NC_STRUCT) or + (column_categories[old_col_id] == NC_STRUCT and + column_categories[this_col_id] == NC_LIST)), + "A mix of lists and structs within the same column is not supported"); + } + } + + auto this_column_category = column_categories[this_col_id]; + // get path of this column, check if it is a struct/list forced as string, and enforce it + auto const nt = tree_path.get_path(this_col_id); + std::optional const user_dtype = get_path_data_type(nt, options); + if ((column_categories[this_col_id] == NC_STRUCT or + column_categories[this_col_id] == NC_LIST) and + user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) { + this_column_category = NC_STR; + } + + CUDF_EXPECTS(parent_col.child_columns.count(name) == 0, "duplicate column name: " + name); + // move into parent + device_json_column col(stream, mr); + initialize_json_columns(this_col_id, col, this_column_category); + if ((column_categories[this_col_id] == NC_STRUCT or + column_categories[this_col_id] == NC_LIST) and + user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) { + col.forced_as_string_column = true; + forced_as_string_column[this_col_id] = true; + } + + auto inserted = parent_col.child_columns.try_emplace(name, std::move(col)).second; + CUDF_EXPECTS(inserted, "child column insertion failed, duplicate column name in the parent"); + if (not replaced) parent_col.column_order.push_back(name); + columns.try_emplace(this_col_id, std::ref(parent_col.child_columns.at(name))); + mapped_columns.try_emplace(std::make_pair(parent_col_id, name), this_col_id); + } + + if (is_enabled_mixed_types_as_string) { + // ignore all children of mixed type columns + for (auto const this_col_id : unique_col_ids) { + auto parent_col_id = column_parent_ids[this_col_id]; + if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 1) { + is_mixed_type_column[this_col_id] = 1; + ignore_vals[this_col_id] = 1; + columns.erase(this_col_id); + } + // Convert only mixed type columns as string (so to copy), but not its children + if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 0 and + is_mixed_type_column[this_col_id] == 1) + column_categories[this_col_id] = NC_STR; + } + cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(), + column_categories.data(), + column_categories.size() * sizeof(column_categories[0]), + cudf::detail::host_memory_kind::PAGEABLE, + stream); + } + + // ignore all children of columns forced as string + for (auto const this_col_id : unique_col_ids) { + auto parent_col_id = column_parent_ids[this_col_id]; + if (parent_col_id != parent_node_sentinel and forced_as_string_column[parent_col_id]) { 
+ forced_as_string_column[this_col_id] = true; + ignore_vals[this_col_id] = 1; + } + // Convert only mixed type columns as string (so to copy), but not its children + if (parent_col_id != parent_node_sentinel and not forced_as_string_column[parent_col_id] and + forced_as_string_column[this_col_id]) + column_categories[this_col_id] = NC_STR; + } + cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(), + column_categories.data(), + column_categories.size() * sizeof(column_categories[0]), + cudf::detail::host_memory_kind::PAGEABLE, + stream); + + // restore unique_col_ids order + std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { + return thrust::get<1>(a) < thrust::get<1>(b); + }); + return {ignore_vals, columns}; +} + +void scatter_offsets( + tree_meta_t& tree, + device_span col_ids, + device_span row_offsets, + device_span node_ids, + device_span sorted_col_ids, // Reuse this for parent_col_ids + tree_meta_t& d_column_tree, + host_span ignore_vals, + std::unordered_map>& columns, + rmm::cuda_stream_view stream) +{ + auto const num_nodes = col_ids.size(); + auto const num_columns = d_column_tree.node_categories.size(); + // move columns data to device. + auto columns_data = cudf::detail::make_host_vector(num_columns, stream); + for (auto& [col_id, col_ref] : columns) { + if (col_id == parent_node_sentinel) continue; + auto& col = col_ref.get(); + columns_data[col_id] = json_column_data{col.string_offsets.data(), + col.string_lengths.data(), + col.child_offsets.data(), + static_cast(col.validity.data())}; + } + + auto d_ignore_vals = cudf::detail::make_device_uvector_async( + ignore_vals, stream, cudf::get_current_device_resource_ref()); + auto d_columns_data = cudf::detail::make_device_uvector_async( + columns_data, stream, cudf::get_current_device_resource_ref()); + + // 3. scatter string offsets to respective columns, set validity bits + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::counting_iterator(0), + num_nodes, + [column_categories = d_column_tree.node_categories.begin(), + col_ids = col_ids.begin(), + row_offsets = row_offsets.begin(), + range_begin = tree.node_range_begin.begin(), + range_end = tree.node_range_end.begin(), + d_ignore_vals = d_ignore_vals.begin(), + d_columns_data = d_columns_data.begin()] __device__(size_type i) { + if (d_ignore_vals[col_ids[i]]) return; + auto const node_category = column_categories[col_ids[i]]; + switch (node_category) { + case NC_STRUCT: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; + case NC_LIST: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; + case NC_STR: [[fallthrough]]; + case NC_VAL: + if (d_ignore_vals[col_ids[i]]) break; + set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); + d_columns_data[col_ids[i]].string_offsets[row_offsets[i]] = range_begin[i]; + d_columns_data[col_ids[i]].string_lengths[row_offsets[i]] = range_end[i] - range_begin[i]; + break; + default: break; + } + }); + + // 4. scatter List offset + // copy_if only node's whose parent is list, (node_id, parent_col_id) + // stable_sort by parent_col_id of {node_id}. + // For all unique parent_node_id of (i==0, i-1!=i), write start offset. + // (i==last, i+1!=i), write end offset. 
+ // unique_copy_by_key {parent_node_id} {row_offset} to + // col[parent_col_id].child_offsets[row_offset[parent_node_id]] + + auto& parent_col_ids = sorted_col_ids; // reuse sorted_col_ids + auto parent_col_id = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type( + [col_ids = col_ids.begin(), + parent_node_ids = tree.parent_node_ids.begin()] __device__(size_type node_id) { + return parent_node_ids[node_id] == parent_node_sentinel ? parent_node_sentinel + : col_ids[parent_node_ids[node_id]]; + })); + auto const list_children_end = thrust::copy_if( + rmm::exec_policy(stream), + thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id), + thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id) + + num_nodes, + thrust::make_counting_iterator(0), + thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()), + [d_ignore_vals = d_ignore_vals.begin(), + parent_node_ids = tree.parent_node_ids.begin(), + column_categories = d_column_tree.node_categories.begin(), + col_ids = col_ids.begin()] __device__(size_type node_id) { + auto parent_node_id = parent_node_ids[node_id]; + return parent_node_id != parent_node_sentinel and + column_categories[col_ids[parent_node_id]] == NC_LIST and + (!d_ignore_vals[col_ids[parent_node_id]]); + }); + + auto const num_list_children = + list_children_end - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()); + thrust::stable_sort_by_key(rmm::exec_policy(stream), + parent_col_ids.begin(), + parent_col_ids.begin() + num_list_children, + node_ids.begin()); + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + num_list_children, + [node_ids = node_ids.begin(), + parent_node_ids = tree.parent_node_ids.begin(), + parent_col_ids = parent_col_ids.begin(), + row_offsets = row_offsets.begin(), + d_columns_data = d_columns_data.begin(), + num_list_children] __device__(size_type i) { + auto const node_id = node_ids[i]; + auto const parent_node_id = parent_node_ids[node_id]; + // scatter to list_offset + if (i == 0 or parent_node_ids[node_ids[i - 1]] != parent_node_id) { + d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id]] = + row_offsets[node_id]; + } + // last value of list child_offset is its size. + if (i == num_list_children - 1 or parent_node_ids[node_ids[i + 1]] != parent_node_id) { + d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id] + 1] = + row_offsets[node_id] + 1; + } + }); + + // 5. scan on offsets. 
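// Illustration only (not part of this change): step 5 fixes up the offsets written by the
// scatter in step 4. Rows that received no offset still hold their initial 0, so an
// inclusive scan with a maximum operator carries the last written offset forward and the
// offsets become monotone, e.g. {0,2,0,0,5,0} -> {0,2,2,2,5,5}. A small host-side sketch
// in standard C++ (the loop below does the equivalent on device with
// thrust::inclusive_scan and thrust::maximum); the function name is illustrative:
#include <algorithm>
#include <numeric>
#include <vector>

std::vector<int> fill_offset_gaps(std::vector<int> scattered)
{
  // In-place inclusive max-scan: each element becomes the running maximum so far.
  std::inclusive_scan(scattered.begin(), scattered.end(), scattered.begin(),
                      [](int a, int b) { return std::max(a, b); });
  return scattered;  // {0,2,0,0,5,0} -> {0,2,2,2,5,5}
}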
+ for (auto& [id, col_ref] : columns) { + auto& col = col_ref.get(); + if (col.type == json_col_t::StringColumn) { + thrust::inclusive_scan(rmm::exec_policy_nosync(stream), + col.string_offsets.begin(), + col.string_offsets.end(), + col.string_offsets.begin(), + thrust::maximum{}); + } else if (col.type == json_col_t::ListColumn) { + thrust::inclusive_scan(rmm::exec_policy_nosync(stream), + col.child_offsets.begin(), + col.child_offsets.end(), + col.child_offsets.begin(), + thrust::maximum{}); + } + } + stream.synchronize(); +} + +} // namespace cudf::io::json::detail diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 756047d383a..b08fd139113 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -36,23 +35,16 @@ #include #include -#include #include #include #include -#include #include #include #include #include -#include -#include #include #include -#include -#include - namespace cudf::io::json::detail { // DEBUG prints @@ -297,678 +289,6 @@ reduce_to_column_tree(tree_meta_t& tree, std::move(max_row_offsets)}; } -/** - * @brief Get the column indices for the values column for array of arrays rows - * - * @param row_array_children_level The level of the row array's children - * @param d_tree The tree metadata - * @param col_ids The column ids - * @param num_columns The number of columns - * @param stream The stream to use - * @return The value columns' indices - */ -rmm::device_uvector get_values_column_indices(TreeDepthT const row_array_children_level, - tree_meta_t const& d_tree, - device_span col_ids, - size_type const num_columns, - rmm::cuda_stream_view stream) -{ - CUDF_FUNC_RANGE(); - auto [level2_nodes, level2_indices] = get_array_children_indices( - row_array_children_level, d_tree.node_levels, d_tree.parent_node_ids, stream); - auto col_id_location = thrust::make_permutation_iterator(col_ids.begin(), level2_nodes.begin()); - rmm::device_uvector values_column_indices(num_columns, stream); - thrust::scatter(rmm::exec_policy(stream), - level2_indices.begin(), - level2_indices.end(), - col_id_location, - values_column_indices.begin()); - return values_column_indices; -} - -/** - * @brief Copies strings specified by pair of begin, end offsets to host vector of strings. 
- * - * @param input String device buffer - * @param node_range_begin Begin offset of the strings - * @param node_range_end End offset of the strings - * @param stream CUDA stream - * @return Vector of strings - */ -std::vector copy_strings_to_host_sync( - device_span input, - device_span node_range_begin, - device_span node_range_end, - rmm::cuda_stream_view stream) -{ - CUDF_FUNC_RANGE(); - auto const num_strings = node_range_begin.size(); - rmm::device_uvector string_offsets(num_strings, stream); - rmm::device_uvector string_lengths(num_strings, stream); - auto d_offset_pairs = thrust::make_zip_iterator(node_range_begin.begin(), node_range_end.begin()); - thrust::transform(rmm::exec_policy(stream), - d_offset_pairs, - d_offset_pairs + num_strings, - thrust::make_zip_iterator(string_offsets.begin(), string_lengths.begin()), - [] __device__(auto const& offsets) { - // Note: first character for non-field columns - return thrust::make_tuple( - static_cast(thrust::get<0>(offsets)), - static_cast(thrust::get<1>(offsets) - thrust::get<0>(offsets))); - }); - - cudf::io::parse_options_view options_view{}; - options_view.quotechar = '\0'; // no quotes - options_view.keepquotes = true; - auto d_offset_length_it = - thrust::make_zip_iterator(string_offsets.begin(), string_lengths.begin()); - auto d_column_names = parse_data(input.data(), - d_offset_length_it, - num_strings, - data_type{type_id::STRING}, - rmm::device_buffer{}, - 0, - options_view, - stream, - cudf::get_current_device_resource_ref()); - auto to_host = [stream](auto const& col) { - if (col.is_empty()) return std::vector{}; - auto const scv = cudf::strings_column_view(col); - auto const h_chars = cudf::detail::make_host_vector_async( - cudf::device_span(scv.chars_begin(stream), scv.chars_size(stream)), stream); - auto const h_offsets = cudf::detail::make_host_vector_async( - cudf::device_span(scv.offsets().data() + scv.offset(), - scv.size() + 1), - stream); - stream.synchronize(); - - // build std::string vector from chars and offsets - std::vector host_data; - host_data.reserve(col.size()); - std::transform( - std::begin(h_offsets), - std::end(h_offsets) - 1, - std::begin(h_offsets) + 1, - std::back_inserter(host_data), - [&](auto start, auto end) { return std::string(h_chars.data() + start, end - start); }); - return host_data; - }; - return to_host(d_column_names->view()); -} - -/** - * @brief Checks if all strings in each string column in the tree are nulls. - * For non-string columns, it's set as true. If any of rows in a string column is false, it's set as - * false. - * - * @param input Input JSON string device data - * @param d_column_tree column tree representation of JSON string - * @param tree Node tree representation of the JSON string - * @param col_ids Column ids of the nodes in the tree - * @param options Parsing options specifying the parsing behaviour - * @param stream CUDA stream used for device memory operations and kernel launches - * @return Array of bytes where each byte indicate if it is all nulls string column. 
- */ -rmm::device_uvector is_all_nulls_each_column(device_span input, - tree_meta_t const& d_column_tree, - tree_meta_t const& tree, - device_span col_ids, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream) -{ - auto const num_nodes = col_ids.size(); - auto const num_cols = d_column_tree.node_categories.size(); - rmm::device_uvector is_all_nulls(num_cols, stream); - thrust::fill(rmm::exec_policy(stream), is_all_nulls.begin(), is_all_nulls.end(), true); - - auto parse_opt = parsing_options(options, stream); - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::counting_iterator(0), - num_nodes, - [options = parse_opt.view(), - data = input.data(), - column_categories = d_column_tree.node_categories.begin(), - col_ids = col_ids.begin(), - range_begin = tree.node_range_begin.begin(), - range_end = tree.node_range_end.begin(), - is_all_nulls = is_all_nulls.begin()] __device__(size_type i) { - auto const node_category = column_categories[col_ids[i]]; - if (node_category == NC_STR or node_category == NC_VAL) { - auto const is_null_literal = serialized_trie_contains( - options.trie_na, - {data + range_begin[i], static_cast(range_end[i] - range_begin[i])}); - if (!is_null_literal) is_all_nulls[col_ids[i]] = false; - } - }); - return is_all_nulls; -} - -/** - * @brief Holds member data pointers of `d_json_column` - * - */ -struct json_column_data { - using row_offset_t = json_column::row_offset_t; - row_offset_t* string_offsets; - row_offset_t* string_lengths; - row_offset_t* child_offsets; - bitmask_type* validity; -}; - -/** - * @brief Constructs `d_json_column` from node tree representation - * Newly constructed columns are insert into `root`'s children. - * `root` must be a list type. - * - * @param input Input JSON string device data - * @param tree Node tree representation of the JSON string - * @param col_ids Column ids of the nodes in the tree - * @param row_offsets Row offsets of the nodes in the tree - * @param root Root node of the `d_json_column` tree - * @param is_array_of_arrays Whether the tree is an array of arrays - * @param options Parsing options specifying the parsing behaviour - * options affecting behaviour are - * is_enabled_lines: Whether the input is a line-delimited JSON - * is_enabled_mixed_types_as_string: Whether to enable reading mixed types as string - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the device memory - * of child_offets and validity members of `d_json_column` - */ -void make_device_json_column(device_span input, - tree_meta_t& tree, - device_span col_ids, - device_span row_offsets, - device_json_column& root, - bool is_array_of_arrays, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_FUNC_RANGE(); - - bool const is_enabled_lines = options.is_enabled_lines(); - bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string(); - auto const num_nodes = col_ids.size(); - rmm::device_uvector sorted_col_ids(col_ids.size(), stream); // make a copy - thrust::copy(rmm::exec_policy(stream), col_ids.begin(), col_ids.end(), sorted_col_ids.begin()); - - // sort by {col_id} on {node_ids} stable - rmm::device_uvector node_ids(col_ids.size(), stream); - thrust::sequence(rmm::exec_policy(stream), node_ids.begin(), node_ids.end()); - thrust::stable_sort_by_key( - rmm::exec_policy(stream), sorted_col_ids.begin(), sorted_col_ids.end(), 
node_ids.begin()); - - NodeIndexT const row_array_parent_col_id = [&]() { - NodeIndexT value = parent_node_sentinel; - if (!col_ids.empty()) { - auto const list_node_index = is_enabled_lines ? 0 : 1; - CUDF_CUDA_TRY(cudaMemcpyAsync(&value, - col_ids.data() + list_node_index, - sizeof(NodeIndexT), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); - } - return value; - }(); - - // 1. gather column information. - auto [d_column_tree, d_unique_col_ids, d_max_row_offsets] = - reduce_to_column_tree(tree, - col_ids, - sorted_col_ids, - node_ids, - row_offsets, - is_array_of_arrays, - row_array_parent_col_id, - stream); - auto num_columns = d_unique_col_ids.size(); - auto unique_col_ids = cudf::detail::make_host_vector_async(d_unique_col_ids, stream); - auto column_categories = - cudf::detail::make_host_vector_async(d_column_tree.node_categories, stream); - auto const column_parent_ids = - cudf::detail::make_host_vector_async(d_column_tree.parent_node_ids, stream); - auto column_range_beg = - cudf::detail::make_host_vector_async(d_column_tree.node_range_begin, stream); - auto const max_row_offsets = cudf::detail::make_host_vector_async(d_max_row_offsets, stream); - std::vector column_names = copy_strings_to_host_sync( - input, d_column_tree.node_range_begin, d_column_tree.node_range_end, stream); - // array of arrays column names - if (is_array_of_arrays) { - TreeDepthT const row_array_children_level = is_enabled_lines ? 1 : 2; - auto values_column_indices = - get_values_column_indices(row_array_children_level, tree, col_ids, num_columns, stream); - auto h_values_column_indices = - cudf::detail::make_host_vector_sync(values_column_indices, stream); - std::transform(unique_col_ids.begin(), - unique_col_ids.end(), - column_names.begin(), - column_names.begin(), - [&h_values_column_indices, &column_parent_ids, row_array_parent_col_id]( - auto col_id, auto name) mutable { - return column_parent_ids[col_id] == row_array_parent_col_id - ? 
std::to_string(h_values_column_indices[col_id]) - : name; - }); - } - - auto to_json_col_type = [](auto category) { - switch (category) { - case NC_STRUCT: return json_col_t::StructColumn; - case NC_LIST: return json_col_t::ListColumn; - case NC_STR: [[fallthrough]]; - case NC_VAL: return json_col_t::StringColumn; - default: return json_col_t::Unknown; - } - }; - auto init_to_zero = [stream](auto& v) { - thrust::uninitialized_fill(rmm::exec_policy_nosync(stream), v.begin(), v.end(), 0); - }; - - auto initialize_json_columns = [&](auto i, auto& col, auto column_category) { - if (column_category == NC_ERR || column_category == NC_FN) { - return; - } else if (column_category == NC_VAL || column_category == NC_STR) { - col.string_offsets.resize(max_row_offsets[i] + 1, stream); - col.string_lengths.resize(max_row_offsets[i] + 1, stream); - init_to_zero(col.string_offsets); - init_to_zero(col.string_lengths); - } else if (column_category == NC_LIST) { - col.child_offsets.resize(max_row_offsets[i] + 2, stream); - init_to_zero(col.child_offsets); - } - col.num_rows = max_row_offsets[i] + 1; - col.validity = - cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr); - col.type = to_json_col_type(column_category); - }; - - auto reinitialize_as_string = [&](auto i, auto& col) { - col.string_offsets.resize(max_row_offsets[i] + 1, stream); - col.string_lengths.resize(max_row_offsets[i] + 1, stream); - init_to_zero(col.string_offsets); - init_to_zero(col.string_lengths); - col.num_rows = max_row_offsets[i] + 1; - col.validity = - cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr); - col.type = json_col_t::StringColumn; - // destroy references of all child columns after this step, by calling remove_child_columns - }; - - path_from_tree tree_path{column_categories, - column_parent_ids, - column_names, - is_array_of_arrays, - row_array_parent_col_id}; - - // 2. generate nested columns tree and its device_memory - // reorder unique_col_ids w.r.t. column_range_begin for order of column to be in field order. 
- auto h_range_col_id_it = - thrust::make_zip_iterator(column_range_beg.begin(), unique_col_ids.begin()); - std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { - return thrust::get<0>(a) < thrust::get<0>(b); - }); - - auto const is_str_column_all_nulls = [&, &column_tree = d_column_tree]() { - if (is_enabled_mixed_types_as_string) { - return cudf::detail::make_host_vector_sync( - is_all_nulls_each_column(input, column_tree, tree, col_ids, options, stream), stream); - } - return cudf::detail::make_empty_host_vector(0, stream); - }(); - - // use hash map because we may skip field name's col_ids - std::unordered_map> columns; - // map{parent_col_id, child_col_name}> = child_col_id, used for null value column tracking - std::map, NodeIndexT> mapped_columns; - // find column_ids which are values, but should be ignored in validity - auto ignore_vals = cudf::detail::make_host_vector(num_columns, stream); - std::vector is_mixed_type_column(num_columns, 0); - std::vector is_pruned(num_columns, 0); - // for columns that are not mixed type but have been forced as string - std::vector forced_as_string_column(num_columns); - columns.try_emplace(parent_node_sentinel, std::ref(root)); - - std::function remove_child_columns = - [&](NodeIndexT this_col_id, device_json_column& col) { - for (auto col_name : col.column_order) { - auto child_id = mapped_columns[{this_col_id, col_name}]; - is_mixed_type_column[child_id] = 1; - remove_child_columns(child_id, col.child_columns.at(col_name)); - mapped_columns.erase({this_col_id, col_name}); - columns.erase(child_id); - } - col.child_columns.clear(); // their references are deleted above. - col.column_order.clear(); - }; - - auto name_and_parent_index = [&is_array_of_arrays, - &row_array_parent_col_id, - &column_parent_ids, - &column_categories, - &column_names](auto this_col_id) { - std::string name = ""; - auto parent_col_id = column_parent_ids[this_col_id]; - if (parent_col_id == parent_node_sentinel || column_categories[parent_col_id] == NC_LIST) { - if (is_array_of_arrays && parent_col_id == row_array_parent_col_id) { - name = column_names[this_col_id]; - } else { - name = list_child_name; - } - } else if (column_categories[parent_col_id] == NC_FN) { - auto field_name_col_id = parent_col_id; - parent_col_id = column_parent_ids[parent_col_id]; - name = column_names[field_name_col_id]; - } else { - CUDF_FAIL("Unexpected parent column category"); - } - return std::pair{name, parent_col_id}; - }; - - // Prune columns that are not required to be parsed. - if (options.is_enabled_prune_columns()) { - for (auto const this_col_id : unique_col_ids) { - if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) { - continue; - } - // Struct, List, String, Value - auto [name, parent_col_id] = name_and_parent_index(this_col_id); - // get path of this column, and get its dtype if present in options - auto const nt = tree_path.get_path(this_col_id); - std::optional const user_dtype = get_path_data_type(nt, options); - if (!user_dtype.has_value() and parent_col_id != parent_node_sentinel) { - is_pruned[this_col_id] = 1; - continue; - } else { - // make sure all its parents are not pruned. - while (parent_col_id != parent_node_sentinel and is_pruned[parent_col_id] == 1) { - is_pruned[parent_col_id] = 0; - parent_col_id = column_parent_ids[parent_col_id]; - } - } - } - } - - // Build the column tree, also, handles mixed types. 
- for (auto const this_col_id : unique_col_ids) { - if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) { - continue; - } - // Struct, List, String, Value - auto [name, parent_col_id] = name_and_parent_index(this_col_id); - - // if parent is mixed type column or this column is pruned or if parent - // has been forced as string, ignore this column. - if (parent_col_id != parent_node_sentinel && - (is_mixed_type_column[parent_col_id] || is_pruned[this_col_id]) || - forced_as_string_column[parent_col_id]) { - ignore_vals[this_col_id] = 1; - if (is_mixed_type_column[parent_col_id]) { is_mixed_type_column[this_col_id] = 1; } - if (forced_as_string_column[parent_col_id]) { forced_as_string_column[this_col_id] = true; } - continue; - } - - // If the child is already found, - // replace if this column is a nested column and the existing was a value column - // ignore this column if this column is a value column and the existing was a nested column - auto it = columns.find(parent_col_id); - CUDF_EXPECTS(it != columns.end(), "Parent column not found"); - auto& parent_col = it->second.get(); - bool replaced = false; - if (mapped_columns.count({parent_col_id, name}) > 0) { - auto const old_col_id = mapped_columns[{parent_col_id, name}]; - // If mixed type as string is enabled, make both of them strings and merge them. - // All child columns will be ignored when parsing. - if (is_enabled_mixed_types_as_string) { - bool const is_mixed_type = [&]() { - // If new or old is STR and they are all not null, make it mixed type, else ignore. - if (column_categories[this_col_id] == NC_VAL || - column_categories[this_col_id] == NC_STR) { - if (is_str_column_all_nulls[this_col_id]) return false; - } - if (column_categories[old_col_id] == NC_VAL || column_categories[old_col_id] == NC_STR) { - if (is_str_column_all_nulls[old_col_id]) return false; - } - return true; - }(); - if (is_mixed_type) { - is_mixed_type_column[this_col_id] = 1; - is_mixed_type_column[old_col_id] = 1; - // if old col type (not cat) is list or struct, replace with string. - auto& col = columns.at(old_col_id).get(); - if (col.type == json_col_t::ListColumn or col.type == json_col_t::StructColumn) { - reinitialize_as_string(old_col_id, col); - remove_child_columns(old_col_id, col); - // all its children (which are already inserted) are ignored later. 
- } - col.forced_as_string_column = true; - columns.try_emplace(this_col_id, columns.at(old_col_id)); - continue; - } - } - - if (column_categories[this_col_id] == NC_VAL || column_categories[this_col_id] == NC_STR) { - ignore_vals[this_col_id] = 1; - continue; - } - if (column_categories[old_col_id] == NC_VAL || column_categories[old_col_id] == NC_STR) { - // remap - ignore_vals[old_col_id] = 1; - mapped_columns.erase({parent_col_id, name}); - columns.erase(old_col_id); - parent_col.child_columns.erase(name); - replaced = true; // to skip duplicate name in column_order - } else { - // If this is a nested column but we're trying to insert either (a) a list node into a - // struct column or (b) a struct node into a list column, we fail - CUDF_EXPECTS(not((column_categories[old_col_id] == NC_LIST and - column_categories[this_col_id] == NC_STRUCT) or - (column_categories[old_col_id] == NC_STRUCT and - column_categories[this_col_id] == NC_LIST)), - "A mix of lists and structs within the same column is not supported"); - } - } - - auto this_column_category = column_categories[this_col_id]; - // get path of this column, check if it is a struct/list forced as string, and enforce it - auto const nt = tree_path.get_path(this_col_id); - std::optional const user_dtype = get_path_data_type(nt, options); - if ((column_categories[this_col_id] == NC_STRUCT or - column_categories[this_col_id] == NC_LIST) and - user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) { - this_column_category = NC_STR; - } - - CUDF_EXPECTS(parent_col.child_columns.count(name) == 0, "duplicate column name: " + name); - // move into parent - device_json_column col(stream, mr); - initialize_json_columns(this_col_id, col, this_column_category); - if ((column_categories[this_col_id] == NC_STRUCT or - column_categories[this_col_id] == NC_LIST) and - user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) { - col.forced_as_string_column = true; - forced_as_string_column[this_col_id] = true; - } - - auto inserted = parent_col.child_columns.try_emplace(name, std::move(col)).second; - CUDF_EXPECTS(inserted, "child column insertion failed, duplicate column name in the parent"); - if (not replaced) parent_col.column_order.push_back(name); - columns.try_emplace(this_col_id, std::ref(parent_col.child_columns.at(name))); - mapped_columns.try_emplace(std::make_pair(parent_col_id, name), this_col_id); - } - - if (is_enabled_mixed_types_as_string) { - // ignore all children of mixed type columns - for (auto const this_col_id : unique_col_ids) { - auto parent_col_id = column_parent_ids[this_col_id]; - if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 1) { - is_mixed_type_column[this_col_id] = 1; - ignore_vals[this_col_id] = 1; - columns.erase(this_col_id); - } - // Convert only mixed type columns as string (so to copy), but not its children - if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 0 and - is_mixed_type_column[this_col_id] == 1) - column_categories[this_col_id] = NC_STR; - } - cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(), - column_categories.data(), - column_categories.size() * sizeof(column_categories[0]), - cudf::detail::host_memory_kind::PAGEABLE, - stream); - } - - // ignore all children of columns forced as string - for (auto const this_col_id : unique_col_ids) { - auto parent_col_id = column_parent_ids[this_col_id]; - if (parent_col_id != parent_node_sentinel and forced_as_string_column[parent_col_id]) { 
- forced_as_string_column[this_col_id] = true; - ignore_vals[this_col_id] = 1; - } - // Convert only mixed type columns as string (so to copy), but not its children - if (parent_col_id != parent_node_sentinel and not forced_as_string_column[parent_col_id] and - forced_as_string_column[this_col_id]) - column_categories[this_col_id] = NC_STR; - } - cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(), - column_categories.data(), - column_categories.size() * sizeof(column_categories[0]), - cudf::detail::host_memory_kind::PAGEABLE, - stream); - - // restore unique_col_ids order - std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { - return thrust::get<1>(a) < thrust::get<1>(b); - }); - // move columns data to device. - auto columns_data = cudf::detail::make_host_vector(num_columns, stream); - for (auto& [col_id, col_ref] : columns) { - if (col_id == parent_node_sentinel) continue; - auto& col = col_ref.get(); - columns_data[col_id] = json_column_data{col.string_offsets.data(), - col.string_lengths.data(), - col.child_offsets.data(), - static_cast(col.validity.data())}; - } - - auto d_ignore_vals = cudf::detail::make_device_uvector_async( - ignore_vals, stream, cudf::get_current_device_resource_ref()); - auto d_columns_data = cudf::detail::make_device_uvector_async( - columns_data, stream, cudf::get_current_device_resource_ref()); - - // 3. scatter string offsets to respective columns, set validity bits - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::counting_iterator(0), - num_nodes, - [column_categories = d_column_tree.node_categories.begin(), - col_ids = col_ids.begin(), - row_offsets = row_offsets.begin(), - range_begin = tree.node_range_begin.begin(), - range_end = tree.node_range_end.begin(), - d_ignore_vals = d_ignore_vals.begin(), - d_columns_data = d_columns_data.begin()] __device__(size_type i) { - if (d_ignore_vals[col_ids[i]]) return; - auto const node_category = column_categories[col_ids[i]]; - switch (node_category) { - case NC_STRUCT: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; - case NC_LIST: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; - case NC_STR: [[fallthrough]]; - case NC_VAL: - if (d_ignore_vals[col_ids[i]]) break; - set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); - d_columns_data[col_ids[i]].string_offsets[row_offsets[i]] = range_begin[i]; - d_columns_data[col_ids[i]].string_lengths[row_offsets[i]] = range_end[i] - range_begin[i]; - break; - default: break; - } - }); - - // 4. scatter List offset - // copy_if only node's whose parent is list, (node_id, parent_col_id) - // stable_sort by parent_col_id of {node_id}. - // For all unique parent_node_id of (i==0, i-1!=i), write start offset. - // (i==last, i+1!=i), write end offset. - // unique_copy_by_key {parent_node_id} {row_offset} to - // col[parent_col_id].child_offsets[row_offset[parent_node_id]] - - auto& parent_col_ids = sorted_col_ids; // reuse sorted_col_ids - auto parent_col_id = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - cuda::proclaim_return_type( - [col_ids = col_ids.begin(), - parent_node_ids = tree.parent_node_ids.begin()] __device__(size_type node_id) { - return parent_node_ids[node_id] == parent_node_sentinel ? 
parent_node_sentinel - : col_ids[parent_node_ids[node_id]]; - })); - auto const list_children_end = thrust::copy_if( - rmm::exec_policy(stream), - thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id), - thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id) + - num_nodes, - thrust::make_counting_iterator(0), - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()), - [d_ignore_vals = d_ignore_vals.begin(), - parent_node_ids = tree.parent_node_ids.begin(), - column_categories = d_column_tree.node_categories.begin(), - col_ids = col_ids.begin()] __device__(size_type node_id) { - auto parent_node_id = parent_node_ids[node_id]; - return parent_node_id != parent_node_sentinel and - column_categories[col_ids[parent_node_id]] == NC_LIST and - (!d_ignore_vals[col_ids[parent_node_id]]); - }); - - auto const num_list_children = - list_children_end - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()); - thrust::stable_sort_by_key(rmm::exec_policy(stream), - parent_col_ids.begin(), - parent_col_ids.begin() + num_list_children, - node_ids.begin()); - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - num_list_children, - [node_ids = node_ids.begin(), - parent_node_ids = tree.parent_node_ids.begin(), - parent_col_ids = parent_col_ids.begin(), - row_offsets = row_offsets.begin(), - d_columns_data = d_columns_data.begin(), - num_list_children] __device__(size_type i) { - auto const node_id = node_ids[i]; - auto const parent_node_id = parent_node_ids[node_id]; - // scatter to list_offset - if (i == 0 or parent_node_ids[node_ids[i - 1]] != parent_node_id) { - d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id]] = - row_offsets[node_id]; - } - // last value of list child_offset is its size. - if (i == num_list_children - 1 or parent_node_ids[node_ids[i + 1]] != parent_node_id) { - d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id] + 1] = - row_offsets[node_id] + 1; - } - }); - - // 5. scan on offsets. - for (auto& [id, col_ref] : columns) { - auto& col = col_ref.get(); - if (col.type == json_col_t::StringColumn) { - thrust::inclusive_scan(rmm::exec_policy_nosync(stream), - col.string_offsets.begin(), - col.string_offsets.end(), - col.string_offsets.begin(), - thrust::maximum{}); - } else if (col.type == json_col_t::ListColumn) { - thrust::inclusive_scan(rmm::exec_policy_nosync(stream), - col.child_offsets.begin(), - col.child_offsets.end(), - col.child_offsets.begin(), - thrust::maximum{}); - } - } - stream.synchronize(); -} - std::pair, std::vector> device_json_column_to_cudf_column( device_json_column& json_col, device_span d_input, diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 75639a0438f..83f71e657a7 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -299,22 +299,58 @@ get_array_children_indices(TreeDepthT row_array_children_level, device_span node_levels, device_span parent_node_ids, rmm::cuda_stream_view stream); + /** - * @brief Reduce node tree into column tree by aggregating each property of column. + * @brief Reduces node tree representation to column tree representation. 
 * - * @param tree json node tree to reduce (modified in-place, but restored to original state) - * @param col_ids column ids of each node (modified in-place, but restored to original state) - * @param row_offsets row offsets of each node (modified in-place, but restored to original state) - * @param stream The CUDA stream to which kernels are dispatched - * @return A tuple containing the column tree, identifier for each column and the maximum row index - * in each column + * @param tree Node tree representation of JSON string + * @param original_col_ids Column ids of nodes + * @param sorted_col_ids Sorted column ids of nodes + * @param ordered_node_ids Node ids of nodes sorted by column ids + * @param row_offsets Row offsets of nodes + * @param is_array_of_arrays Whether the tree is an array of arrays + * @param row_array_parent_col_id Column id of row array, if is_array_of_arrays is true + * @param stream CUDA stream used for device memory operations and kernel launches + * @return A tuple of column tree representation of JSON string, column ids of columns, and + * max row offsets of columns */ std::tuple, rmm::device_uvector> reduce_to_column_tree(tree_meta_t& tree, - device_span col_ids, + device_span original_col_ids, + device_span sorted_col_ids, + device_span ordered_node_ids, device_span row_offsets, + bool is_array_of_arrays, + NodeIndexT const row_array_parent_col_id, rmm::cuda_stream_view stream); - +/** + * @brief Constructs `d_json_column` from node tree representation + * Newly constructed columns are inserted into `root`'s children. + * `root` must be a list type. + * + * @param input Input JSON string device data + * @param tree Node tree representation of the JSON string + * @param col_ids Column ids of the nodes in the tree + * @param row_offsets Row offsets of the nodes in the tree + * @param root Root node of the `d_json_column` tree + * @param is_array_of_arrays Whether the tree is an array of arrays + * @param options Parsing options specifying the parsing behaviour + * options affecting behaviour are + * is_enabled_lines: Whether the input is a line-delimited JSON + * is_enabled_mixed_types_as_string: Whether to enable reading mixed types as string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the device memory + * of child_offsets and validity members of `d_json_column` + */ +void make_device_json_column(device_span input, + tree_meta_t& tree, + device_span col_ids, + device_span row_offsets, + device_json_column& root, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); /** * @brief Retrieves the parse_options to be used for type inference and type casting * diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu index 748691fb7d1..2ec23e0dc6d 100644 --- a/cpp/src/join/conditional_join.cu +++ b/cpp/src/join/conditional_join.cu @@ -27,7 +27,6 @@ #include #include #include -#include #include #include @@ -377,16 +376,12 @@ conditional_inner_join(table_view const& left, table_view const& right, ast::expression const& binary_predicate, std::optional output_size, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::conditional_join(left, - right, - binary_predicate, - detail::join_kind::INNER_JOIN, - output_size, - cudf::get_default_stream(), - mr); + return detail::conditional_join( + left, right, binary_predicate,
detail::join_kind::INNER_JOIN, output_size, stream, mr); } std::pair>, @@ -395,16 +390,12 @@ conditional_left_join(table_view const& left, table_view const& right, ast::expression const& binary_predicate, std::optional output_size, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::conditional_join(left, - right, - binary_predicate, - detail::join_kind::LEFT_JOIN, - output_size, - cudf::get_default_stream(), - mr); + return detail::conditional_join( + left, right, binary_predicate, detail::join_kind::LEFT_JOIN, output_size, stream, mr); } std::pair>, @@ -412,16 +403,12 @@ std::pair>, conditional_full_join(table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::conditional_join(left, - right, - binary_predicate, - detail::join_kind::FULL_JOIN, - {}, - cudf::get_default_stream(), - mr); + return detail::conditional_join( + left, right, binary_predicate, detail::join_kind::FULL_JOIN, {}, stream, mr); } std::unique_ptr> conditional_left_semi_join( @@ -429,16 +416,12 @@ std::unique_ptr> conditional_left_semi_join( table_view const& right, ast::expression const& binary_predicate, std::optional output_size, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::conditional_join_anti_semi(left, - right, - binary_predicate, - detail::join_kind::LEFT_SEMI_JOIN, - output_size, - cudf::get_default_stream(), - mr); + return detail::conditional_join_anti_semi( + left, right, binary_predicate, detail::join_kind::LEFT_SEMI_JOIN, output_size, stream, mr); } std::unique_ptr> conditional_left_anti_join( @@ -446,64 +429,56 @@ std::unique_ptr> conditional_left_anti_join( table_view const& right, ast::expression const& binary_predicate, std::optional output_size, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::conditional_join_anti_semi(left, - right, - binary_predicate, - detail::join_kind::LEFT_ANTI_JOIN, - output_size, - cudf::get_default_stream(), - mr); + return detail::conditional_join_anti_semi( + left, right, binary_predicate, detail::join_kind::LEFT_ANTI_JOIN, output_size, stream, mr); } std::size_t conditional_inner_join_size(table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::compute_conditional_join_output_size( - left, right, binary_predicate, detail::join_kind::INNER_JOIN, cudf::get_default_stream(), mr); + left, right, binary_predicate, detail::join_kind::INNER_JOIN, stream, mr); } std::size_t conditional_left_join_size(table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::compute_conditional_join_output_size( - left, right, binary_predicate, detail::join_kind::LEFT_JOIN, cudf::get_default_stream(), mr); + left, right, binary_predicate, detail::join_kind::LEFT_JOIN, stream, mr); } std::size_t conditional_left_semi_join_size(table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::compute_conditional_join_output_size(left, - right, - binary_predicate, - detail::join_kind::LEFT_SEMI_JOIN, - 
cudf::get_default_stream(), - mr); + return detail::compute_conditional_join_output_size( + left, right, binary_predicate, detail::join_kind::LEFT_SEMI_JOIN, stream, mr); } std::size_t conditional_left_anti_join_size(table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::compute_conditional_join_output_size(left, - right, - binary_predicate, - detail::join_kind::LEFT_ANTI_JOIN, - cudf::get_default_stream(), - mr); + return detail::compute_conditional_join_output_size( + left, right, binary_predicate, detail::join_kind::LEFT_ANTI_JOIN, stream, mr); } } // namespace cudf diff --git a/cpp/src/join/conditional_join.hpp b/cpp/src/join/conditional_join.hpp index 4f6a9484e8c..303442e79ef 100644 --- a/cpp/src/join/conditional_join.hpp +++ b/cpp/src/join/conditional_join.hpp @@ -19,7 +19,6 @@ #include #include -#include #include #include diff --git a/cpp/src/join/cross_join.cu b/cpp/src/join/cross_join.cu index eeb49736bac..15594fb60e3 100644 --- a/cpp/src/join/cross_join.cu +++ b/cpp/src/join/cross_join.cu @@ -25,7 +25,6 @@ #include #include #include -#include #include #include @@ -75,10 +74,11 @@ std::unique_ptr cross_join(cudf::table_view const& left, std::unique_ptr cross_join(cudf::table_view const& left, cudf::table_view const& right, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::cross_join(left, right, cudf::get_default_stream(), mr); + return detail::cross_join(left, right, stream, mr); } } // namespace cudf diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu index 0abff27667b..7b13c260364 100644 --- a/cpp/src/join/join.cu +++ b/cpp/src/join/join.cu @@ -20,7 +20,6 @@ #include #include #include -#include #include #include @@ -120,10 +119,11 @@ std::pair>, inner_join(table_view const& left, table_view const& right, null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::inner_join(left, right, compare_nulls, cudf::get_default_stream(), mr); + return detail::inner_join(left, right, compare_nulls, stream, mr); } std::pair>, @@ -131,10 +131,11 @@ std::pair>, left_join(table_view const& left, table_view const& right, null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::left_join(left, right, compare_nulls, cudf::get_default_stream(), mr); + return detail::left_join(left, right, compare_nulls, stream, mr); } std::pair>, @@ -142,10 +143,11 @@ std::pair>, full_join(table_view const& left, table_view const& right, null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::full_join(left, right, compare_nulls, cudf::get_default_stream(), mr); + return detail::full_join(left, right, compare_nulls, stream, mr); } } // namespace cudf diff --git a/cpp/src/join/mixed_join.cu b/cpp/src/join/mixed_join.cu index 8ff78dd47f4..820b81ee309 100644 --- a/cpp/src/join/mixed_join.cu +++ b/cpp/src/join/mixed_join.cu @@ -28,7 +28,6 @@ #include #include #include -#include #include #include @@ -484,6 +483,7 @@ mixed_inner_join( ast::expression const& binary_predicate, null_equality compare_nulls, std::optional>> const output_size_data, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -495,7 +495,7 @@ mixed_inner_join( compare_nulls, 
detail::join_kind::INNER_JOIN, output_size_data, - cudf::get_default_stream(), + stream, mr); } @@ -506,6 +506,7 @@ std::pair>> mixed_in table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -516,7 +517,7 @@ std::pair>> mixed_in binary_predicate, compare_nulls, detail::join_kind::INNER_JOIN, - cudf::get_default_stream(), + stream, mr); } @@ -530,6 +531,7 @@ mixed_left_join( ast::expression const& binary_predicate, null_equality compare_nulls, std::optional>> const output_size_data, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -541,7 +543,7 @@ mixed_left_join( compare_nulls, detail::join_kind::LEFT_JOIN, output_size_data, - cudf::get_default_stream(), + stream, mr); } @@ -552,6 +554,7 @@ std::pair>> mixed_le table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -562,7 +565,7 @@ std::pair>> mixed_le binary_predicate, compare_nulls, detail::join_kind::LEFT_JOIN, - cudf::get_default_stream(), + stream, mr); } @@ -576,6 +579,7 @@ mixed_full_join( ast::expression const& binary_predicate, null_equality compare_nulls, std::optional>> const output_size_data, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -587,7 +591,7 @@ mixed_full_join( compare_nulls, detail::join_kind::FULL_JOIN, output_size_data, - cudf::get_default_stream(), + stream, mr); } diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu index cfb785e242c..aa4fa281159 100644 --- a/cpp/src/join/mixed_join_semi.cu +++ b/cpp/src/join/mixed_join_semi.cu @@ -29,7 +29,6 @@ #include #include #include -#include #include #include @@ -267,6 +266,7 @@ std::unique_ptr> mixed_left_semi_join( table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -277,7 +277,7 @@ std::unique_ptr> mixed_left_semi_join( binary_predicate, compare_nulls, detail::join_kind::LEFT_SEMI_JOIN, - cudf::get_default_stream(), + stream, mr); } @@ -288,6 +288,7 @@ std::unique_ptr> mixed_left_anti_join( table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -298,7 +299,7 @@ std::unique_ptr> mixed_left_anti_join( binary_predicate, compare_nulls, detail::join_kind::LEFT_ANTI_JOIN, - cudf::get_default_stream(), + stream, mr); } diff --git a/cpp/src/join/semi_join.cu b/cpp/src/join/semi_join.cu index f69ded73e8d..d2ab2122c75 100644 --- a/cpp/src/join/semi_join.cu +++ b/cpp/src/join/semi_join.cu @@ -23,7 +23,6 @@ #include #include #include -#include #include #include @@ -98,22 +97,24 @@ std::unique_ptr> left_semi_join( cudf::table_view const& left, cudf::table_view const& right, null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::left_semi_anti_join( - detail::join_kind::LEFT_SEMI_JOIN, left, right, compare_nulls, cudf::get_default_stream(), mr); + detail::join_kind::LEFT_SEMI_JOIN, left, right, compare_nulls, stream, mr); } std::unique_ptr> left_anti_join( cudf::table_view const& left, cudf::table_view const& right, null_equality 
compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::left_semi_anti_join( - detail::join_kind::LEFT_ANTI_JOIN, left, right, compare_nulls, cudf::get_default_stream(), mr); + detail::join_kind::LEFT_ANTI_JOIN, left, right, compare_nulls, stream, mr); } } // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 1bedb344a01..288fa84a73d 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -687,10 +687,12 @@ ConfigureTest(STREAM_BINARYOP_TEST streams/binaryop_test.cpp STREAM_MODE testing ConfigureTest(STREAM_CONCATENATE_TEST streams/concatenate_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_COPYING_TEST streams/copying_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_CSVIO_TEST streams/io/csv_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_DATETIME_TEST streams/datetime_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_DICTIONARY_TEST streams/dictionary_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_GROUPBY_TEST streams/groupby_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_HASHING_TEST streams/hash_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_JOIN_TEST streams/join_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_JSONIO_TEST streams/io/json_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_LABELING_BINS_TEST streams/labeling_bins_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_LISTS_TEST streams/lists_test.cpp STREAM_MODE testing) diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index ab387a5c7f5..3431e941359 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -39,6 +39,8 @@ #include #include +#include + #include template @@ -60,6 +62,7 @@ template >, cudf::table_view const& left_keys, cudf::table_view const& right_keys, cudf::null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr), cudf::out_of_bounds_policy oob_policy = cudf::out_of_bounds_policy::DONT_CHECK> std::unique_ptr join_and_gather( @@ -68,12 +71,13 @@ std::unique_ptr join_and_gather( std::vector const& left_on, std::vector const& right_on, cudf::null_equality compare_nulls, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) { auto left_selected = left_input.select(left_on); auto right_selected = right_input.select(right_on); auto const [left_join_indices, right_join_indices] = - join_impl(left_selected, right_selected, compare_nulls, mr); + join_impl(left_selected, right_selected, compare_nulls, stream, mr); auto left_indices_span = cudf::device_span{*left_join_indices}; auto right_indices_span = cudf::device_span{*right_join_indices}; @@ -2027,7 +2031,11 @@ struct JoinTestLists : public cudf::test::BaseFixture { auto const probe_tv = cudf::table_view{{probe}}; auto const [left_result_map, right_result_map] = - join_func(build_tv, probe_tv, nulls_equal, cudf::get_current_device_resource_ref()); + join_func(build_tv, + probe_tv, + nulls_equal, + cudf::get_default_stream(), + cudf::get_current_device_resource_ref()); auto const left_result_table = sort_and_gather(build_tv, column_view_from_device_uvector(*left_result_map), oob_policy); diff --git a/cpp/tests/join/semi_anti_join_tests.cpp b/cpp/tests/join/semi_anti_join_tests.cpp index 3e279260b99..554d5754e39 100644 --- a/cpp/tests/join/semi_anti_join_tests.cpp +++ 
b/cpp/tests/join/semi_anti_join_tests.cpp @@ -28,8 +28,11 @@ #include #include #include +#include #include +#include + #include template @@ -51,6 +54,7 @@ template > (*join_impl)( cudf::table_view const& left_keys, cudf::table_view const& right_keys, cudf::null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)> std::unique_ptr join_and_gather( cudf::table_view const& left_input, @@ -58,11 +62,12 @@ std::unique_ptr join_and_gather( std::vector const& left_on, std::vector const& right_on, cudf::null_equality compare_nulls, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) { auto left_selected = left_input.select(left_on); auto right_selected = right_input.select(right_on); - auto const join_indices = join_impl(left_selected, right_selected, compare_nulls, mr); + auto const join_indices = join_impl(left_selected, right_selected, compare_nulls, stream, mr); auto left_indices_span = cudf::device_span{*join_indices}; auto left_indices_col = cudf::column_view{left_indices_span}; diff --git a/cpp/tests/streams/datetime_test.cpp b/cpp/tests/streams/datetime_test.cpp new file mode 100644 index 00000000000..82629156fa6 --- /dev/null +++ b/cpp/tests/streams/datetime_test.cpp @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include + +#include + +class DatetimeTest : public cudf::test::BaseFixture { + public: + cudf::test::fixed_width_column_wrapper timestamps{ + -23324234, // 1969-12-31 23:59:59.976675766 GMT + 23432424, // 1970-01-01 00:00:00.023432424 GMT + 987234623 // 1970-01-01 00:00:00.987234623 GMT + }; + cudf::test::fixed_width_column_wrapper months{{1, -1, 3}}; +}; + +TEST_F(DatetimeTest, ExtractYear) +{ + cudf::datetime::extract_year(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractMonth) +{ + cudf::datetime::extract_month(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractDay) +{ + cudf::datetime::extract_day(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractWeekday) +{ + cudf::datetime::extract_weekday(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractHour) +{ + cudf::datetime::extract_hour(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractMinute) +{ + cudf::datetime::extract_minute(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractSecond) +{ + cudf::datetime::extract_second(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractMillisecondFraction) +{ + cudf::datetime::extract_millisecond_fraction(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractMicrosecondFraction) +{ + cudf::datetime::extract_microsecond_fraction(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractNanosecondFraction) +{ + cudf::datetime::extract_nanosecond_fraction(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, LastDayOfMonth) +{ + cudf::datetime::last_day_of_month(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, DayOfYear) +{ + cudf::datetime::day_of_year(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, AddCalendricalMonths) +{ + cudf::datetime::add_calendrical_months(timestamps, months, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, AddCalendricalMonthsScalar) +{ + auto scalar = cudf::make_fixed_width_scalar(1, cudf::test::get_default_stream()); + + cudf::datetime::add_calendrical_months(timestamps, *scalar, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, IsLeapYear) +{ + cudf::datetime::is_leap_year(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, DaysInMonth) +{ + cudf::datetime::days_in_month(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractQuarter) +{ + cudf::datetime::extract_quarter(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, CeilDatetimes) +{ + cudf::datetime::ceil_datetimes( + timestamps, cudf::datetime::rounding_frequency::HOUR, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, FloorDatetimes) +{ + cudf::datetime::floor_datetimes( + timestamps, cudf::datetime::rounding_frequency::HOUR, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, RoundDatetimes) +{ + cudf::datetime::round_datetimes( + timestamps, cudf::datetime::rounding_frequency::HOUR, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/join_test.cpp b/cpp/tests/streams/join_test.cpp new file mode 100644 index 00000000000..2811bb676fa --- /dev/null +++ b/cpp/tests/streams/join_test.cpp @@ -0,0 +1,219 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +class JoinTest : public cudf::test::BaseFixture { + static inline cudf::table make_table() + { + cudf::test::fixed_width_column_wrapper col0{{3, 1, 2, 0, 3}}; + cudf::test::strings_column_wrapper col1{{"s0", "s1", "s2", "s4", "s1"}}; + cudf::test::fixed_width_column_wrapper col2{{0, 1, 2, 4, 1}}; + + std::vector> columns; + columns.push_back(col0.release()); + columns.push_back(col1.release()); + columns.push_back(col2.release()); + + return cudf::table{std::move(columns)}; + } + + public: + cudf::table table0{make_table()}; + cudf::table table1{make_table()}; + cudf::table conditional0{make_table()}; + cudf::table conditional1{make_table()}; + cudf::ast::column_reference col_ref_left_0{0}; + cudf::ast::column_reference col_ref_right_0{0, cudf::ast::table_reference::RIGHT}; + cudf::ast::operation left_zero_eq_right_zero{ + cudf::ast::ast_operator::EQUAL, col_ref_left_0, col_ref_right_0}; +}; + +TEST_F(JoinTest, InnerJoin) +{ + cudf::inner_join(table0, table1, cudf::null_equality::EQUAL, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, LeftJoin) +{ + cudf::left_join(table0, table1, cudf::null_equality::EQUAL, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, FullJoin) +{ + cudf::full_join(table0, table1, cudf::null_equality::EQUAL, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, LeftSemiJoin) +{ + cudf::left_semi_join( + table0, table1, cudf::null_equality::EQUAL, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, LeftAntiJoin) +{ + cudf::left_anti_join( + table0, table1, cudf::null_equality::EQUAL, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, CrossJoin) { cudf::cross_join(table0, table1, cudf::test::get_default_stream()); } + +TEST_F(JoinTest, ConditionalInnerJoin) +{ + cudf::conditional_inner_join( + table0, table1, left_zero_eq_right_zero, std::nullopt, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, ConditionalLeftJoin) +{ + cudf::conditional_left_join( + table0, table1, left_zero_eq_right_zero, std::nullopt, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, ConditionalFullJoin) +{ + cudf::conditional_full_join( + table0, table1, left_zero_eq_right_zero, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, ConditionalLeftSemiJoin) +{ + cudf::conditional_left_semi_join( + table0, table1, left_zero_eq_right_zero, std::nullopt, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, ConditionalLeftAntiJoin) +{ + cudf::conditional_left_anti_join( + table0, table1, left_zero_eq_right_zero, std::nullopt, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, MixedInnerJoin) +{ + cudf::mixed_inner_join(table0, + table1, + conditional0, + conditional1, + left_zero_eq_right_zero, + cudf::null_equality::EQUAL, + std::nullopt, + cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, MixedLeftJoin) +{ + cudf::mixed_left_join(table0, + 
table1, + conditional0, + conditional1, + left_zero_eq_right_zero, + cudf::null_equality::EQUAL, + std::nullopt, + cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, MixedFullJoin) +{ + cudf::mixed_full_join(table0, + table1, + conditional0, + conditional1, + left_zero_eq_right_zero, + cudf::null_equality::EQUAL, + std::nullopt, + cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, MixedLeftSemiJoin) +{ + cudf::mixed_left_semi_join(table0, + table1, + conditional0, + conditional1, + left_zero_eq_right_zero, + cudf::null_equality::EQUAL, + cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, MixedLeftAntiJoin) +{ + cudf::mixed_left_anti_join(table0, + table1, + conditional0, + conditional1, + left_zero_eq_right_zero, + cudf::null_equality::EQUAL, + cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, MixedInnerJoinSize) +{ + cudf::mixed_inner_join_size(table0, + table1, + conditional0, + conditional1, + left_zero_eq_right_zero, + cudf::null_equality::EQUAL, + cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, MixedLeftJoinSize) +{ + cudf::mixed_left_join_size(table0, + table1, + conditional0, + conditional1, + left_zero_eq_right_zero, + cudf::null_equality::EQUAL, + cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, ConditionalInnerJoinSize) +{ + cudf::conditional_inner_join_size( + table0, table1, left_zero_eq_right_zero, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, ConditionalLeftJoinSize) +{ + cudf::conditional_left_join_size( + table0, table1, left_zero_eq_right_zero, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, ConditionalLeftSemiJoinSize) +{ + cudf::conditional_left_semi_join_size( + table0, table1, left_zero_eq_right_zero, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, ConditionalLeftAntiJoinSize) +{ + cudf::conditional_left_anti_join_size( + table0, table1, left_zero_eq_right_zero, cudf::test::get_default_stream()); +} diff --git a/dependencies.yaml b/dependencies.yaml index 7a13043cc5f..2f2d7ba679e 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -650,7 +650,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - polars>=1.0,<1.3 + - polars>=1.6 run_dask_cudf: common: - output_types: [conda, requirements, pyproject] diff --git a/docs/cudf/source/_static/Polars_GPU_speedup_80GB.png b/docs/cudf/source/_static/Polars_GPU_speedup_80GB.png new file mode 100644 index 00000000000..e472cf66612 Binary files /dev/null and b/docs/cudf/source/_static/Polars_GPU_speedup_80GB.png differ diff --git a/docs/cudf/source/_static/compute_heavy_queries_polars.png b/docs/cudf/source/_static/compute_heavy_queries_polars.png new file mode 100644 index 00000000000..6854ed5a436 Binary files /dev/null and b/docs/cudf/source/_static/compute_heavy_queries_polars.png differ diff --git a/docs/cudf/source/_static/pds_benchmark_polars.png b/docs/cudf/source/_static/pds_benchmark_polars.png new file mode 100644 index 00000000000..d0b48ab2901 Binary files /dev/null and b/docs/cudf/source/_static/pds_benchmark_polars.png differ diff --git a/docs/cudf/source/cudf_polars/index.rst b/docs/cudf/source/cudf_polars/index.rst new file mode 100644 index 00000000000..cc7aabd124f --- /dev/null +++ b/docs/cudf/source/cudf_polars/index.rst @@ -0,0 +1,41 @@ +cuDF-based GPU backend for Polars [Open Beta] +============================================= + +cuDF supports an in-memory, GPU-accelerated execution engine for Python users of the Polars Lazy API. 
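+For example, a lazy query can opt in to the GPU engine when it is collected
+(a minimal sketch; the file and column names below are placeholders)::
+
+    import polars as pl
+
+    q = (
+        pl.scan_parquet("transactions.parquet")
+        .group_by("account")
+        .agg(pl.col("amount").sum())
+    )
+    result = q.collect(engine="gpu")  # unsupported plans fall back to the CPU engine
+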
+The engine supports most of the core expressions and data types as well as a growing set of more advanced dataframe manipulations +and data file formats. When using the GPU engine, Polars will convert expressions into an optimized query plan and determine +whether the plan is supported on the GPU. If it is not, the execution will transparently fall back to the standard Polars engine +and run on the CPU. + +Benchmark +--------- +We reproduced the `Polars Decision Support (PDS) `__ benchmark to compare Polars GPU engine with the default CPU settings across several dataset sizes. Here are the results: + +.. figure:: ../_static/pds_benchmark_polars.png + :width: 600px + + + +You can see up to 13x speedup using the GPU backend on the compute-heavy PDS queries involving complex aggregation and join operations. Below are the speedups for the top performing queries: + + +.. figure:: ../_static/compute_heavy_queries_polars.png + :width: 1000px + +:emphasis:`PDS-H benchmark | GPU: NVIDIA H100 PCIe | CPU: Intel Xeon W9-3495X (Sapphire Rapids) | Storage: Local NVMe` + +You can reproduce the results by visiting the `Polars Decision Support (PDS) GitHub repository `__. + +Learn More +---------- + +The GPU backend for Polars is now available in Open Beta and the engine is undergoing rapid development. To learn more, visit the `GPU Support page `__ on the Polars website. + +Launch on Google Colab +---------------------- + +.. figure:: ../_static/colab.png + :width: 200px + :target: https://colab.research.google.com/github/rapidsai-community/showcase/blob/main/accelerated_data_processing_examples/polars_gpu_engine_demo.ipynb + + Take the cuDF backend for Polars for a test-drive in a free GPU-enabled notebook environment using your Google account by `launching on Colab `__. diff --git a/docs/cudf/source/index.rst b/docs/cudf/source/index.rst index 3b8dfa5fe01..1b86cafeb48 100644 --- a/docs/cudf/source/index.rst +++ b/docs/cudf/source/index.rst @@ -29,5 +29,6 @@ other operations. user_guide/index cudf_pandas/index + cudf_polars/index libcudf_docs/index developer_guide/index diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst index 2518afc80a7..003e7c0c35e 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst @@ -14,3 +14,4 @@ strings repeat replace slice + strip diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/strip.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/strip.rst new file mode 100644 index 00000000000..a79774b8e67 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/strip.rst @@ -0,0 +1,6 @@ +===== +strip +===== + +.. automodule:: pylibcudf.strings.strip + :members: diff --git a/docs/dask_cudf/source/best_practices.rst b/docs/dask_cudf/source/best_practices.rst new file mode 100644 index 00000000000..142124163af --- /dev/null +++ b/docs/dask_cudf/source/best_practices.rst @@ -0,0 +1,320 @@ +.. _best-practices: + +Dask cuDF Best Practices +======================== + +This page outlines several important guidelines for using `Dask cuDF +`__ effectively. + +.. note:: + Since Dask cuDF is a backend extension for + `Dask DataFrame `__, + the guidelines discussed in the `Dask DataFrames Best Practices + `__ + documentation also apply to Dask cuDF (excluding any pandas-specific + details). 
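+
+As a rough sketch (not a tuned configuration; the path and sizes below are
+placeholders), several of the guidelines described on this page are usually
+combined into a single setup: a Dask-CUDA cluster with cuDF spilling and an
+RMM pool, plus the ``"cudf"`` dataframe backend::
+
+    import dask
+    import dask.dataframe as dd
+    from dask.distributed import Client
+    from dask_cuda import LocalCUDACluster
+
+    # Use cuDF-backed DataFrame collections by default
+    dask.config.set({"dataframe.backend": "cudf"})
+
+    # One worker per visible GPU, with cuDF spilling and an RMM pool enabled
+    cluster = LocalCUDACluster(enable_cudf_spill=True, rmm_pool_size=0.9)
+    client = Client(cluster)
+
+    # Partition size is controlled at creation time (see "Reading Data" below)
+    df = dd.read_parquet("dataset/", blocksize="1GiB")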
+ + +Deployment and Configuration +---------------------------- + +Use Dask-CUDA +~~~~~~~~~~~~~ + +To execute a Dask workflow on multiple GPUs, a Dask cluster must +be deployed with `Dask-CUDA `__ +and `Dask.distributed `__. + +When running on a single machine, the `LocalCUDACluster `__ +convenience function is strongly recommended. No matter how many GPUs are +available on the machine (even one!), using `Dask-CUDA has many advantages +`__ +over default (threaded) execution. Just to list a few: + +* Dask-CUDA makes it easy to pin workers to specific devices. +* Dask-CUDA makes it easy to configure memory-spilling options. +* The distributed scheduler collects useful diagnostic information that can be viewed on a dashboard in real time. + +Please see `Dask-CUDA's API `__ +and `Best Practices `__ +documentation for detailed information. Typical ``LocalCUDACluster`` usage +is also illustrated within the multi-GPU section of `Dask cuDF's +`__ documentation. + +.. note:: + When running on cloud infrastructure or HPC systems, it is usually best to + leverage system-specific deployment libraries like `Dask Operator + `__ and `Dask-Jobqueue + `__. + + Please see `the RAPIDS deployment documentation `__ + for further details and examples. + + +Use diagnostic tools +~~~~~~~~~~~~~~~~~~~~ + +The Dask ecosystem includes several diagnostic tools that you should absolutely use. +These tools include an intuitive `browser dashboard +`__ as well as a dedicated +`API for collecting performance profiles +`__. + +No matter the workflow, using the dashboard is strongly recommended. +It provides a visual representation of the worker resources and compute +progress. It also shows basic GPU memory and utilization metrics (under +the ``GPU`` tab). To visualize more detailed GPU metrics in JupyterLab, +use `NVDashboard `__. + + +Enable cuDF spilling +~~~~~~~~~~~~~~~~~~~~ + +When using Dask cuDF for classic ETL workloads, it is usually best +to enable `native spilling support in cuDF +`__. +When using :func:`LocalCUDACluster`, this is easily accomplished by +setting ``enable_cudf_spill=True``. + +When a Dask cuDF workflow includes conversion between DataFrame and Array +representations, native cuDF spilling may be insufficient. For these cases, +`JIT-unspill `__ +is likely to produce better protection from out-of-memory (OOM) errors. +Please see `Dask-CUDA's spilling documentation +`__ for further details +and guidance. + +Use RMM +~~~~~~~ + +Memory allocations in cuDF are significantly faster and more efficient when +the `RAPIDS Memory Manager (RMM) `__ +library is configured appropriately on worker processes. In most cases, the best way to manage +memory is by initializing an RMM pool on each worker before executing a +workflow. When using :func:`LocalCUDACluster`, this is easily accomplished +by setting ``rmm_pool_size`` to a large fraction (e.g. ``0.9``). + +See the `Dask-CUDA memory-management documentation +`__ +for more details. + +Use the Dask DataFrame API +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Although Dask cuDF provides a public ``dask_cudf`` Python module, we +strongly recommend that you use the CPU/GPU portable ``dask.dataframe`` +API instead. Simply `use the Dask configuration system +`__ +to set the ``"dataframe.backend"`` option to ``"cudf"``, and the +``dask_cudf`` module will be imported and used implicitly. + +Be sure to use the :func:`to_backend` method if you need to convert +between the different DataFrame backends.
For example:: + + df = df.to_backend("pandas") # This gives us a pandas-backed collection + +.. note:: + Although :func:`to_backend` makes it easy to move data between pandas + and cuDF, repetitive CPU-GPU data movement can degrade performance + significantly. For optimal results, keep your data on the GPU as much + as possible. + +Avoid eager execution +~~~~~~~~~~~~~~~~~~~~~ + +Although Dask DataFrame collections are lazy by default, there are several +notable methods that will result in the immediate execution of the +underlying task graph: + +:func:`compute`: Calling ``ddf.compute()`` will materialize the result of +``ddf`` and return a single cuDF object. This is done by executing the entire +task graph associated with ``ddf`` and concatenating its partitions in +local memory on the client process. + +.. note:: + Never call :func:`compute` on a large collection that cannot fit comfortably + in the memory of a single GPU! + +:func:`persist`: Like :func:`compute`, calling ``ddf.persist()`` will +execute the entire task graph associated with ``ddf``. The most important +difference is that the computed partitions will remain in distributed +worker memory instead of being concatenated together on the client process. +Another difference is that :func:`persist` will return immediately when +executing on a distributed cluster. If you need a blocking synchronization +point in your workflow, simply use the :func:`wait` function:: + + ddf = ddf.persist() + wait(ddf) + +.. note:: + Avoid calling :func:`persist` on a large collection that cannot fit comfortably + in global worker memory. If the total sum of the partition sizes is larger + than the sum of all GPU memory, calling persist will result in significant + spilling from device memory. If the individual partition sizes are large, this + is likely to produce an OOM error. + +:func:`len` / :func:`head` / :func:`tail`: Although these operations are used +often within pandas/cuDF code to quickly inspect data, it is best to avoid +them in Dask DataFrame. In most cases, these operations will execute some or all +of the underlying task graph to materialize the collection. + +:func:`sort_values` / :func:`set_index` : These operations both require Dask to +eagerly collect quantile information about the column(s) being targeted by the +global sort operation. See `Avoid Sorting`__ for notes on sorting considerations. + +.. note:: + When using :func:`set_index`, be sure to pass in ``sort=False`` whenever the + global collection does not **need** to be sorted by the new index. + +Avoid Sorting +~~~~~~~~~~~~~ + +`The design of Dask DataFrame `__ +makes it advantageous to work with data that is already sorted along its index at +creation time. For most other cases, it is best to avoid sorting unless the logic +of the workflow makes global ordering absolutely necessary. + +If the purpose of a :func:`sort_values` operation is to ensure that all unique +values in ``by`` will be moved to the same output partition, then `shuffle +`__ +is often the better option. + + +Reading Data +------------ + +Tune the partition size +~~~~~~~~~~~~~~~~~~~~~~~ + +The ideal partition size is usually between 1/32 and 1/8 the memory +capacity of a single GPU. Increasing the partition size will typically +reduce the number of tasks in your workflow and improve the GPU utilization +for each task. However, if the partitions are too large, the risk of OOM +errors can become significant. + +.. 
note:: + As a general rule of thumb, start with 1/32-1/16 for shuffle-intensive workflows + (e.g. large-scale sorting and joining), and 1/16-1/8 otherwise. For pathologically + skewed data distributions, it may be necessary to target 1/64 or smaller. + This rule of thumb comes from anecdotal optimization and OOM-debugging + experience. Since every workflow is different, choosing the best partition + size is both an art and a science. + +The easiest way to tune the partition size is when the DataFrame collection +is first created by a function like :func:`read_parquet`, :func:`read_csv`, +or :func:`from_map`. For example, both :func:`read_parquet` and :func:`read_csv` +expose a ``blocksize`` argument for adjusting the maximum partition size. + +If the partition size cannot be tuned effectively at creation time, the +`repartition `__ +method can be used as a last resort. + + +Use Parquet +~~~~~~~~~~~ + +`Parquet `__ is the recommended +file format for Dask cuDF. It provides efficient columnar storage and enables +Dask to perform valuable query optimizations like column projection and +predicate pushdown. + +The most important arguments to :func:`read_parquet` are ``blocksize`` and +``aggregate_files``: + +``blocksize``: Use this argument to specify the maximum partition size. +The default is `"256 MiB"`, but larger values are usually more performant +on GPUs with more than 8 GiB of memory. Dask will use the ``blocksize`` +value to map a discrete number of Parquet row-groups (or files) to each +output partition. This mapping will only account for the uncompressed +storage size of each row group, which is usually smaller than the +corresponding ``cudf.DataFrame``. + +``aggregate_files``: Use this argument to specify whether Dask should +map multiple files to the same DataFrame partition. The default is +``False``, but ``aggregate_files=True`` is usually more performant when +the dataset contains many files that are smaller than half of ``blocksize``. + +If you know that your files correspond to a reasonable partition size +before splitting or aggregation, set ``blocksize=None`` to disallow +file splitting. In the absence of column-projection pushdown, this will +result in a simple 1-to-1 mapping between files and output partitions. + +.. note:: + If your workflow requires a strict 1-to-1 mapping between files and + partitions, use :func:`from_map` to manually construct your partitions + with ``cudf.read_parquet``. When :func:`dd.read_parquet` is used, + query-planning optimizations may automatically aggregate distinct files + into the same partition (even when ``aggregate_files=False``). + +.. note:: + Metadata collection can be extremely slow when reading from remote + storage (e.g. S3 and GCS). When reading many remote files that all + correspond to a reasonable partition size, use ``blocksize=None`` + to avoid unnecessary metadata collection. + + +Use :func:`from_map` +~~~~~~~~~~~~~~~~~~~~ + +To implement custom DataFrame-creation logic that is not covered by +existing APIs (like :func:`read_parquet`), use :func:`dask.dataframe.from_map` +whenever possible. The :func:`from_map` API has several advantages +over :func:`from_delayed`: + +* It allows proper lazy execution of your custom logic +* It enables column projection (as long as the mapped function supports a ``columns`` keyword argument) + +See the `from_map API documentation `__ +for more details. + +.. note:: + Whenever possible, be sure to specify the ``meta`` argument to + :func:`from_map`.
If this argument is excluded, Dask will need to + materialize the first partition eagerly. If a large RMM pool is in + use on the first visible device, this eager execution on the client + may lead to an OOM error. + + +Sorting, Joining, and Grouping +------------------------------ + +Sorting, joining, and grouping operations all have the potential to +require the global shuffling of data between distinct partitions. +When the initial data fits comfortably in global GPU memory, these +"all-to-all" operations are typically bound by worker-to-worker +communication. When the data is larger than global GPU memory, the +bottleneck is typically device-to-host memory spilling. + +Although every workflow is different, the following guidelines +are often recommended: + +* `Use a distributed cluster with Dask-CUDA workers `_ +* `Use native cuDF spilling whenever possible `_ +* Avoid shuffling whenever possible + * Use ``split_out=1`` for low-cardinality groupby aggregations + * Use ``broadcast=True`` for joins when at least one collection comprises a small number of partitions (e.g. ``<=5``) +* `Use UCX `__ if communication is a bottleneck. + +.. note:: + UCX enables Dask-CUDA workers to communicate using high-performance + transport technologies like `NVLink `__ + and Infiniband. Without UCX, inter-process communication will rely + on TCP sockets. + + +User-defined functions +---------------------- + +Most real-world Dask DataFrame workflows use `map_partitions +`__ +to map user-defined functions across every partition of the underlying data. +This API is a fantastic way to apply custom operations in an intuitive and +scalable way. With that said, the :func:`map_partitions` method will produce +an opaque DataFrame expression that blocks the query-planning `optimizer +`__ from performing +useful optimizations (like projection and filter pushdown). + +Since column-projection pushdown is often the most effective optimization, +it is important to select the necessary columns both before and after calling +:func:`map_partitions`. You can also add explicit filter operations to further +mitigate the loss of filter pushdown. diff --git a/docs/dask_cudf/source/index.rst b/docs/dask_cudf/source/index.rst index 7fe6cbd45fa..23ca7e49753 100644 --- a/docs/dask_cudf/source/index.rst +++ b/docs/dask_cudf/source/index.rst @@ -15,7 +15,7 @@ as the ``"cudf"`` dataframe backend for .. note:: Neither Dask cuDF nor Dask DataFrame provide support for multi-GPU or multi-node execution on their own. You must also deploy a - `dask.distributed ` cluster + `dask.distributed `__ cluster to leverage multiple GPUs. We strongly recommend using `Dask-CUDA `__ to simplify the setup of the cluster, taking advantage of all features of the GPU @@ -29,6 +29,10 @@ minutes to Dask by `10 minutes to cuDF and Dask cuDF `__. +After reviewing the sections below, please see the +:ref:`Best Practices ` page for further guidance on +using Dask cuDF effectively. + Using Dask cuDF --------------- @@ -36,7 +40,7 @@ Using Dask cuDF The Dask DataFrame API (Recommended) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Simply use the `Dask configuration ` system to +Simply use the `Dask configuration `__ system to set the ``"dataframe.backend"`` option to ``"cudf"``. From Python, this can be achieved like so:: @@ -50,14 +54,14 @@ environment before running your code.
Once this is done, the public Dask DataFrame API will leverage ``cudf`` automatically when a new DataFrame collection is created from an on-disk format using any of the following ``dask.dataframe`` -functions:: +functions: -* :func:`dask.dataframe.read_parquet` -* :func:`dask.dataframe.read_json` -* :func:`dask.dataframe.read_csv` -* :func:`dask.dataframe.read_orc` -* :func:`dask.dataframe.read_hdf` -* :func:`dask.dataframe.from_dict` +* :func:`read_parquet` +* :func:`read_json` +* :func:`read_csv` +* :func:`read_orc` +* :func:`read_hdf` +* :func:`from_dict` For example:: @@ -112,8 +116,8 @@ performance benefit over the CPU/GPU-portable ``dask.dataframe`` API. Also, using some parts of the explicit API are incompatible with automatic query planning (see the next section). -The explicit Dask cuDF API -~~~~~~~~~~~~~~~~~~~~~~~~~~ +Query Planning +~~~~~~~~~~~~~~ Dask cuDF now provides automatic query planning by default (RAPIDS 24.06+). As long as the ``"dataframe.query-planning"`` configuration is set to diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index e27c595edda..99e4c21df8a 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -599,7 +599,6 @@ cdef class Column: children=tuple(children) ) - # TODO: Actually support exposed data pointers. @staticmethod def from_pylibcudf( col, bint data_ptr_exposed=False @@ -616,7 +615,7 @@ cdef class Column: col : pylibcudf.Column The object to copy. data_ptr_exposed : bool - This parameter is not yet supported + Whether the data buffer is exposed. Returns ------- @@ -639,16 +638,18 @@ cdef class Column: dtype = dtype_from_pylibcudf_column(col) return cudf.core.column.build_column( - data=as_buffer(col.data().obj) if col.data() is not None else None, + data=as_buffer( + col.data().obj, exposed=data_ptr_exposed + ) if col.data() is not None else None, dtype=dtype, size=col.size(), mask=as_buffer( - col.null_mask().obj + col.null_mask().obj, exposed=data_ptr_exposed ) if col.null_mask() is not None else None, offset=col.offset(), null_count=col.null_count(), children=tuple([ - Column.from_pylibcudf(child) + Column.from_pylibcudf(child, data_ptr_exposed=data_ptr_exposed) for child in col.children() ]) ) diff --git a/python/cudf/cudf/_lib/datetime.pyx b/python/cudf/cudf/_lib/datetime.pyx index 483250dd36f..bc5e085ec39 100644 --- a/python/cudf/cudf/_lib/datetime.pyx +++ b/python/cudf/cudf/_lib/datetime.pyx @@ -17,6 +17,8 @@ from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column from cudf._lib.scalar cimport DeviceScalar +import pylibcudf as plc + @acquire_spill_lock() def add_months(Column col, Column months): @@ -38,43 +40,9 @@ def add_months(Column col, Column months): @acquire_spill_lock() def extract_datetime_component(Column col, object field): - - cdef unique_ptr[column] c_result - cdef column_view col_view = col.view() - - with nogil: - if field == "year": - c_result = move(libcudf_datetime.extract_year(col_view)) - elif field == "month": - c_result = move(libcudf_datetime.extract_month(col_view)) - elif field == "day": - c_result = move(libcudf_datetime.extract_day(col_view)) - elif field == "weekday": - c_result = move(libcudf_datetime.extract_weekday(col_view)) - elif field == "hour": - c_result = move(libcudf_datetime.extract_hour(col_view)) - elif field == "minute": - c_result = move(libcudf_datetime.extract_minute(col_view)) - elif field == "second": - c_result = move(libcudf_datetime.extract_second(col_view)) - elif field == "millisecond": - 
c_result = move( - libcudf_datetime.extract_millisecond_fraction(col_view) - ) - elif field == "microsecond": - c_result = move( - libcudf_datetime.extract_microsecond_fraction(col_view) - ) - elif field == "nanosecond": - c_result = move( - libcudf_datetime.extract_nanosecond_fraction(col_view) - ) - elif field == "day_of_year": - c_result = move(libcudf_datetime.day_of_year(col_view)) - else: - raise ValueError(f"Invalid datetime field: '{field}'") - - result = Column.from_unique_ptr(move(c_result)) + result = Column.from_pylibcudf( + plc.datetime.extract_datetime_component(col.to_pylibcudf(mode="read"), field) + ) if field == "weekday": # Pandas counts Monday-Sunday as 0-6 diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx index 8d463829a19..60a6795a402 100644 --- a/python/cudf/cudf/_lib/string_casting.pyx +++ b/python/cudf/cudf/_lib/string_casting.pyx @@ -20,13 +20,7 @@ from pylibcudf.libcudf.strings.convert.convert_booleans cimport ( to_booleans as cpp_to_booleans, ) from pylibcudf.libcudf.strings.convert.convert_datetime cimport ( - from_timestamps as cpp_from_timestamps, is_timestamp as cpp_is_timestamp, - to_timestamps as cpp_to_timestamps, -) -from pylibcudf.libcudf.strings.convert.convert_durations cimport ( - from_durations as cpp_from_durations, - to_durations as cpp_to_durations, ) from pylibcudf.libcudf.strings.convert.convert_floats cimport ( from_floats as cpp_from_floats, @@ -48,8 +42,12 @@ from pylibcudf.libcudf.types cimport data_type, type_id from cudf._lib.types cimport underlying_type_t_type_id +import pylibcudf as plc + import cudf +from cudf._lib.types cimport dtype_to_pylibcudf_type + def floating_to_string(Column input_col): cdef column_view input_column_view = input_col.view() @@ -522,19 +520,14 @@ def int2timestamp( A Column with date-time represented in string format """ - cdef column_view input_column_view = input_col.view() cdef string c_timestamp_format = format.encode("UTF-8") - cdef column_view input_strings_names = names.view() - - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_from_timestamps( - input_column_view, - c_timestamp_format, - input_strings_names)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + plc.strings.convert.convert_datetime.from_timestamps( + input_col.to_pylibcudf(mode="read"), + c_timestamp_format, + names.to_pylibcudf(mode="read") + ) + ) def timestamp2int(Column input_col, dtype, format): @@ -551,23 +544,15 @@ def timestamp2int(Column input_col, dtype, format): A Column with string represented in date-time format """ - cdef column_view input_column_view = input_col.view() - cdef type_id tid = ( - ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[dtype] + dtype = dtype_to_pylibcudf_type(dtype) + cdef string c_timestamp_format = format.encode('UTF-8') + return Column.from_pylibcudf( + plc.strings.convert.convert_datetime.to_timestamps( + input_col.to_pylibcudf(mode="read"), + dtype, + c_timestamp_format ) ) - cdef data_type out_type = data_type(tid) - cdef string c_timestamp_format = format.encode('UTF-8') - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_to_timestamps( - input_column_view, - out_type, - c_timestamp_format)) - - return Column.from_unique_ptr(move(c_result)) def istimestamp(Column input_col, str format): @@ -613,23 +598,15 @@ def timedelta2int(Column input_col, dtype, format): A Column with string represented in TimeDelta format """ - cdef column_view input_column_view = input_col.view() - cdef type_id 
tid = ( - ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[dtype] + dtype = dtype_to_pylibcudf_type(dtype) + cdef string c_timestamp_format = format.encode('UTF-8') + return Column.from_pylibcudf( + plc.strings.convert.convert_durations.to_durations( + input_col.to_pylibcudf(mode="read"), + dtype, + c_timestamp_format ) ) - cdef data_type out_type = data_type(tid) - cdef string c_duration_format = format.encode('UTF-8') - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_to_durations( - input_column_view, - out_type, - c_duration_format)) - - return Column.from_unique_ptr(move(c_result)) def int2timedelta(Column input_col, str format): @@ -647,16 +624,13 @@ def int2timedelta(Column input_col, str format): """ - cdef column_view input_column_view = input_col.view() cdef string c_duration_format = format.encode('UTF-8') - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_from_durations( - input_column_view, - c_duration_format)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + plc.strings.convert.convert_durations.from_durations( + input_col.to_pylibcudf(mode="read"), + c_duration_format + ) + ) def int2ip(Column input_col): diff --git a/python/cudf/cudf/_lib/strings/strip.pyx b/python/cudf/cudf/_lib/strings/strip.pyx index acf52cb7b9f..38ecb21a94c 100644 --- a/python/cudf/cudf/_lib/strings/strip.pyx +++ b/python/cudf/cudf/_lib/strings/strip.pyx @@ -13,6 +13,7 @@ from pylibcudf.libcudf.strings.strip cimport strip as cpp_strip from cudf._lib.column cimport Column from cudf._lib.scalar cimport DeviceScalar +import pylibcudf as plc @acquire_spill_lock() @@ -25,23 +26,14 @@ def strip(Column source_strings, """ cdef DeviceScalar repl = py_repl.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef const string_scalar* scalar_str = ( - repl.get_raw_ptr() + return Column.from_pylibcudf( + plc.strings.strip.strip( + source_strings.to_pylibcudf(mode="read"), + plc.strings.SideType.BOTH, + repl.c_value + ) ) - with nogil: - c_result = move(cpp_strip( - source_view, - side_type.BOTH, - scalar_str[0] - )) - - return Column.from_unique_ptr(move(c_result)) - @acquire_spill_lock() def lstrip(Column source_strings, diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx index baa08a545ec..40d0c9eac3a 100644 --- a/python/cudf/cudf/_lib/transform.pyx +++ b/python/cudf/cudf/_lib/transform.pyx @@ -3,41 +3,26 @@ from numba.np import numpy_support import cudf -from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES from cudf.core._internals.expressions import parse_expression from cudf.core.buffer import acquire_spill_lock, as_buffer from cudf.utils import cudautils from cython.operator cimport dereference -from libc.stdint cimport uintptr_t from libcpp.memory cimport unique_ptr -from libcpp.pair cimport pair -from libcpp.string cimport string from libcpp.utility cimport move cimport pylibcudf.libcudf.transform as libcudf_transform from pylibcudf cimport transform as plc_transform from pylibcudf.expressions cimport Expression from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.expressions cimport expression -from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view -from pylibcudf.libcudf.types cimport ( - bitmask_type, - data_type, - size_type, - type_id, -) -from rmm._lib.device_buffer cimport DeviceBuffer, 
device_buffer +from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column -from cudf._lib.types cimport underlying_type_t_type_id -from cudf._lib.utils cimport ( - columns_from_unique_ptr, - data_from_table_view, - table_view_from_columns, -) +from cudf._lib.utils cimport table_view_from_columns + +import pylibcudf as plc @acquire_spill_lock() @@ -46,17 +31,8 @@ def bools_to_mask(Column col): Given an int8 (boolean) column, compress the data from booleans to bits and return a Buffer """ - cdef column_view col_view = col.view() - cdef pair[unique_ptr[device_buffer], size_type] cpp_out - cdef unique_ptr[device_buffer] up_db - - with nogil: - cpp_out = move(libcudf_transform.bools_to_mask(col_view)) - up_db = move(cpp_out.first) - - rmm_db = DeviceBuffer.c_from_unique_ptr(move(up_db)) - buf = as_buffer(rmm_db) - return buf + mask, _ = plc_transform.bools_to_mask(col.to_pylibcudf(mode="read")) + return as_buffer(mask) @acquire_spill_lock() @@ -68,22 +44,15 @@ def mask_to_bools(object mask_buffer, size_type begin_bit, size_type end_bit): if not isinstance(mask_buffer, cudf.core.buffer.Buffer): raise TypeError("mask_buffer is not an instance of " "cudf.core.buffer.Buffer") - cdef bitmask_type* bit_mask = ( - mask_buffer.get_ptr(mode="read") + plc_column = plc_transform.mask_to_bools( + mask_buffer.get_ptr(mode="read"), begin_bit, end_bit ) - - cdef unique_ptr[column] result - with nogil: - result = move( - libcudf_transform.mask_to_bools(bit_mask, begin_bit, end_bit) - ) - - return Column.from_unique_ptr(move(result)) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() def nans_to_nulls(Column input): - (mask, _) = plc_transform.nans_to_nulls( + mask, _ = plc_transform.nans_to_nulls( input.to_pylibcudf(mode="read") ) return as_buffer(mask) @@ -91,80 +60,45 @@ def nans_to_nulls(Column input): @acquire_spill_lock() def transform(Column input, op): - cdef column_view c_input = input.view() - cdef string c_str - cdef type_id c_tid - cdef data_type c_dtype - nb_type = numpy_support.from_dtype(input.dtype) nb_signature = (nb_type,) compiled_op = cudautils.compile_udf(op, nb_signature) - c_str = compiled_op[0].encode('UTF-8') np_dtype = cudf.dtype(compiled_op[1]) - try: - c_tid = ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[ - np_dtype - ] - ) - c_dtype = data_type(c_tid) - - except KeyError: - raise TypeError( - "Result of window function has unsupported dtype {}" - .format(np_dtype) - ) - - with nogil: - c_output = move(libcudf_transform.transform( - c_input, - c_str, - c_dtype, - True - )) - - return Column.from_unique_ptr(move(c_output)) + plc_column = plc_transform.transform( + input.to_pylibcudf(mode="read"), + compiled_op[0], + plc.column._datatype_from_dtype_desc(np_dtype.str[1:]), + True + ) + return Column.from_pylibcudf(plc_column) def table_encode(list source_columns): - cdef table_view c_input = table_view_from_columns(source_columns) - cdef pair[unique_ptr[table], unique_ptr[column]] c_result - - with nogil: - c_result = move(libcudf_transform.encode(c_input)) + plc_table, plc_column = plc_transform.encode( + plc.Table([col.to_pylibcudf(mode="read") for col in source_columns]) + ) return ( - columns_from_unique_ptr(move(c_result.first)), - Column.from_unique_ptr(move(c_result.second)) + [Column.from_pylibcudf(col) for col in plc_table.columns()], + Column.from_pylibcudf(plc_column) ) def one_hot_encode(Column input_column, Column categories): - cdef column_view c_view_input = input_column.view() - cdef column_view c_view_categories = categories.view() - cdef 
pair[unique_ptr[column], table_view] c_result - - with nogil: - c_result = move( - libcudf_transform.one_hot_encode(c_view_input, c_view_categories) - ) - - # Notice, the data pointer of `owner` has been exposed - # through `c_result.second` at this point. - owner = Column.from_unique_ptr( - move(c_result.first), data_ptr_exposed=True - ) - - pylist_categories = categories.to_arrow().to_pylist() - encodings, _ = data_from_table_view( - move(c_result.second), - owner=owner, - column_names=[ - x if x is not None else '' for x in pylist_categories - ] + plc_table = plc_transform.one_hot_encode( + input_column.to_pylibcudf(mode="read"), + categories.to_pylibcudf(mode="read"), ) - return encodings + result_columns = [ + Column.from_pylibcudf(col, data_ptr_exposed=True) + for col in plc_table.columns() + ] + result_labels = [ + x if x is not None else '' + for x in categories.to_arrow().to_pylist() + ] + return dict(zip(result_labels, result_columns)) @acquire_spill_lock() diff --git a/python/cudf_polars/cudf_polars/__init__.py b/python/cudf_polars/cudf_polars/__init__.py index 41d06f8631b..c1317e8f467 100644 --- a/python/cudf_polars/cudf_polars/__init__.py +++ b/python/cudf_polars/cudf_polars/__init__.py @@ -10,10 +10,14 @@ from __future__ import annotations +# Check we have a supported polars version +import cudf_polars.utils.versions as v from cudf_polars._version import __git_commit__, __version__ from cudf_polars.callback import execute_with_cudf from cudf_polars.dsl.translate import translate_ir +del v + __all__: list[str] = [ "execute_with_cudf", "translate_ir", diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py index f31193aa938..76816ee0a61 100644 --- a/python/cudf_polars/cudf_polars/callback.py +++ b/python/cudf_polars/cudf_polars/callback.py @@ -5,19 +5,26 @@ from __future__ import annotations +import contextlib import os import warnings -from functools import partial +from functools import cache, partial from typing import TYPE_CHECKING import nvtx -from polars.exceptions import PerformanceWarning +from polars.exceptions import ComputeError, PerformanceWarning + +import rmm +from rmm._cuda import gpu from cudf_polars.dsl.translate import translate_ir if TYPE_CHECKING: + from collections.abc import Generator + import polars as pl + from polars import GPUEngine from cudf_polars.dsl.ir import IR from cudf_polars.typing import NodeTraverser @@ -25,23 +32,126 @@ __all__: list[str] = ["execute_with_cudf"] +@cache +def default_memory_resource(device: int) -> rmm.mr.DeviceMemoryResource: + """ + Return the default memory resource for cudf-polars. + + Parameters + ---------- + device + Disambiguating device id when selecting the device. Must be + the active device when this function is called. + + Returns + ------- + rmm.mr.DeviceMemoryResource + The default memory resource that cudf-polars uses. Currently + an async pool resource. + """ + try: + return rmm.mr.CudaAsyncMemoryResource() + except RuntimeError as e: # pragma: no cover + msg, *_ = e.args + if ( + msg.startswith("RMM failure") + and msg.find("not supported with this CUDA driver/runtime version") > -1 + ): + raise ComputeError( + "GPU engine requested, but incorrect cudf-polars package installed. 
" + "If your system has a CUDA 11 driver, please uninstall `cudf-polars-cu12` " + "and install `cudf-polars-cu11`" + ) from None + else: + raise + + +@contextlib.contextmanager +def set_memory_resource( + mr: rmm.mr.DeviceMemoryResource | None, +) -> Generator[rmm.mr.DeviceMemoryResource, None, None]: + """ + Set the current memory resource for an execution block. + + Parameters + ---------- + mr + Memory resource to use. If `None`, calls :func:`default_memory_resource` + to obtain an mr on the currently active device. + + Returns + ------- + Memory resource used. + + Notes + ----- + At exit, the memory resource is restored to whatever was current + at entry. If a memory resource is provided, it must be valid to + use with the currently active device. + """ + if mr is None: + device: int = gpu.getDevice() + mr = default_memory_resource(device) + previous = rmm.mr.get_current_device_resource() + rmm.mr.set_current_device_resource(mr) + try: + yield mr + finally: + rmm.mr.set_current_device_resource(previous) + + +@contextlib.contextmanager +def set_device(device: int | None) -> Generator[int, None, None]: + """ + Set the device the query is executed on. + + Parameters + ---------- + device + Device to use. If `None`, uses the current device. + + Returns + ------- + Device active for the execution of the block. + + Notes + ----- + At exit, the device is restored to whatever was current at entry. + """ + previous: int = gpu.getDevice() + if device is not None: + gpu.setDevice(device) + try: + yield previous + finally: + gpu.setDevice(previous) + + def _callback( ir: IR, with_columns: list[str] | None, pyarrow_predicate: str | None, n_rows: int | None, + *, + device: int | None, + memory_resource: int | None, ) -> pl.DataFrame: assert with_columns is None assert pyarrow_predicate is None assert n_rows is None - with nvtx.annotate(message="ExecuteIR", domain="cudf_polars"): + with ( + nvtx.annotate(message="ExecuteIR", domain="cudf_polars"), + # Device must be set before memory resource is obtained. + set_device(device), + set_memory_resource(memory_resource), + ): return ir.evaluate(cache={}).to_polars() def execute_with_cudf( nt: NodeTraverser, *, - raise_on_fail: bool = False, + config: GPUEngine, exception: type[Exception] | tuple[type[Exception], ...] = Exception, ) -> None: """ @@ -52,9 +162,8 @@ def execute_with_cudf( nt NodeTraverser - raise_on_fail - Should conversion raise an exception rather than continuing - without setting a callback. + config + GPUEngine configuration object exception Optional exception, or tuple of exceptions, to catch during @@ -62,9 +171,23 @@ def execute_with_cudf( The NodeTraverser is mutated if the libcudf executor can handle the plan. 
""" + device = config.device + memory_resource = config.memory_resource + raise_on_fail = config.config.get("raise_on_fail", False) + if unsupported := (config.config.keys() - {"raise_on_fail"}): + raise ValueError( + f"Engine configuration contains unsupported settings {unsupported}" + ) try: with nvtx.annotate(message="ConvertIR", domain="cudf_polars"): - nt.set_udf(partial(_callback, translate_ir(nt))) + nt.set_udf( + partial( + _callback, + translate_ir(nt), + device=device, + memory_resource=memory_resource, + ) + ) except exception as e: if bool(int(os.environ.get("POLARS_VERBOSE", 0))): warnings.warn( diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py index dd3b771e305..3fe3e5557cb 100644 --- a/python/cudf_polars/cudf_polars/containers/column.py +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -84,6 +84,34 @@ def sorted_like(self, like: Column, /) -> Self: is_sorted=like.is_sorted, order=like.order, null_order=like.null_order ) + # TODO: Return Column once #16272 is fixed. + def astype(self, dtype: plc.DataType) -> plc.Column: + """ + Return the backing column as the requested dtype. + + Parameters + ---------- + dtype + Datatype to cast to. + + Returns + ------- + Column of requested type. + + Raises + ------ + RuntimeError + If the cast is unsupported. + + Notes + ----- + This only produces a copy if the requested dtype doesn't match + the current one. + """ + if self.obj.type() != dtype: + return plc.unary.cast(self.obj, dtype) + return self.obj + def copy_metadata(self, from_: pl.Series, /) -> Self: """ Copy metadata from a host series onto self. diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index a5c99e2bc11..f3e3862d0cc 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -7,7 +7,7 @@ import itertools from functools import cached_property -from typing import TYPE_CHECKING, cast +from typing import TYPE_CHECKING import pyarrow as pa import pylibcudf as plc @@ -45,11 +45,19 @@ def copy(self) -> Self: def to_polars(self) -> pl.DataFrame: """Convert to a polars DataFrame.""" + # If the arrow table has empty names, from_arrow produces + # column_$i. But here we know there is only one such column + # (by construction) and it should have an empty name. + # https://github.com/pola-rs/polars/issues/11632 + # To guarantee we produce correct names, we therefore + # serialise with names we control and rename with that map. 
+ name_map = {f"column_{i}": c.name for i, c in enumerate(self.columns)} table: pa.Table = plc.interop.to_arrow( self.table, - [plc.interop.ColumnMetadata(name=c.name) for c in self.columns], + [plc.interop.ColumnMetadata(name=name) for name in name_map], ) - return cast(pl.DataFrame, pl.from_arrow(table)).with_columns( + df: pl.DataFrame = pl.from_arrow(table) + return df.rename(name_map).with_columns( *( pl.col(c.name).set_sorted( descending=c.order == plc.types.Order.DESCENDING diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index e1b4d30b76b..c401e5a2f17 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -21,8 +21,10 @@ from typing import TYPE_CHECKING, Any, ClassVar, NamedTuple import pyarrow as pa +import pyarrow.compute as pc import pylibcudf as plc +from polars.exceptions import InvalidOperationError from polars.polars import _expr_nodes as pl_expr from cudf_polars.containers import Column, NamedColumn @@ -477,12 +479,6 @@ def __init__( self.options = options self.name = name self.children = children - if ( - self.name in (pl_expr.BooleanFunction.Any, pl_expr.BooleanFunction.All) - and not self.options[0] - ): - # With ignore_nulls == False, polars uses Kleene logic - raise NotImplementedError(f"Kleene logic for {self.name}") if self.name == pl_expr.BooleanFunction.IsIn and not all( c.dtype == self.children[0].dtype for c in self.children ): @@ -577,20 +573,31 @@ def do_evaluate( child.evaluate(df, context=context, mapping=mapping) for child in self.children ] - if self.name == pl_expr.BooleanFunction.Any: + # Kleene logic for Any (OR) and All (AND) if ignore_nulls is + # False + if self.name in (pl_expr.BooleanFunction.Any, pl_expr.BooleanFunction.All): + (ignore_nulls,) = self.options (column,) = columns - return Column( - plc.Column.from_scalar( - plc.reduce.reduce(column.obj, plc.aggregation.any(), self.dtype), 1 - ) - ) - elif self.name == pl_expr.BooleanFunction.All: - (column,) = columns - return Column( - plc.Column.from_scalar( - plc.reduce.reduce(column.obj, plc.aggregation.all(), self.dtype), 1 - ) - ) + is_any = self.name == pl_expr.BooleanFunction.Any + agg = plc.aggregation.any() if is_any else plc.aggregation.all() + result = plc.reduce.reduce(column.obj, agg, self.dtype) + if not ignore_nulls and column.obj.null_count() > 0: + # Truth tables + # Any All + # | F U T | F U T + # --+------ --+------ + # F | F U T F | F F F + # U | U U T U | F U U + # T | T T T T | F U T + # + # If the input null count was non-zero, we must + # post-process the result to insert the correct value. 
+ h_result = plc.interop.to_arrow(result).as_py() + if is_any and not h_result or not is_any and h_result: + # Any All + # False || Null => Null True && Null => Null + return Column(plc.Column.all_null_like(column.obj, 1)) + return Column(plc.Column.from_scalar(result, 1)) if self.name == pl_expr.BooleanFunction.IsNull: (column,) = columns return Column(plc.unary.is_null(column.obj)) @@ -598,13 +605,19 @@ def do_evaluate( (column,) = columns return Column(plc.unary.is_valid(column.obj)) elif self.name == pl_expr.BooleanFunction.IsNan: - # TODO: copy over null mask since is_nan(null) => null in polars (column,) = columns - return Column(plc.unary.is_nan(column.obj)) + return Column( + plc.unary.is_nan(column.obj).with_mask( + column.obj.null_mask(), column.obj.null_count() + ) + ) elif self.name == pl_expr.BooleanFunction.IsNotNan: - # TODO: copy over null mask since is_not_nan(null) => null in polars (column,) = columns - return Column(plc.unary.is_not_nan(column.obj)) + return Column( + plc.unary.is_not_nan(column.obj).with_mask( + column.obj.null_mask(), column.obj.null_count() + ) + ) elif self.name == pl_expr.BooleanFunction.IsFirstDistinct: (column,) = columns return self._distinct( @@ -654,26 +667,22 @@ def do_evaluate( ), ) elif self.name == pl_expr.BooleanFunction.AllHorizontal: - if any(c.obj.null_count() > 0 for c in columns): - raise NotImplementedError("Kleene logic for all_horizontal") return Column( reduce( partial( plc.binaryop.binary_operation, - op=plc.binaryop.BinaryOperator.BITWISE_AND, + op=plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, output_type=self.dtype, ), (c.obj for c in columns), ) ) elif self.name == pl_expr.BooleanFunction.AnyHorizontal: - if any(c.obj.null_count() > 0 for c in columns): - raise NotImplementedError("Kleene logic for any_horizontal") return Column( reduce( partial( plc.binaryop.binary_operation, - op=plc.binaryop.BinaryOperator.BITWISE_OR, + op=plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, output_type=self.dtype, ), (c.obj for c in columns), @@ -694,7 +703,7 @@ def do_evaluate( class StringFunction(Expr): - __slots__ = ("name", "options", "children") + __slots__ = ("name", "options", "children", "_regex_program") _non_child = ("dtype", "name", "options") children: tuple[Expr, ...] @@ -713,12 +722,18 @@ def __init__( def _validate_input(self): if self.name not in ( - pl_expr.StringFunction.Lowercase, - pl_expr.StringFunction.Uppercase, - pl_expr.StringFunction.EndsWith, - pl_expr.StringFunction.StartsWith, pl_expr.StringFunction.Contains, + pl_expr.StringFunction.EndsWith, + pl_expr.StringFunction.Lowercase, + pl_expr.StringFunction.Replace, + pl_expr.StringFunction.ReplaceMany, pl_expr.StringFunction.Slice, + pl_expr.StringFunction.Strptime, + pl_expr.StringFunction.StartsWith, + pl_expr.StringFunction.StripChars, + pl_expr.StringFunction.StripCharsStart, + pl_expr.StringFunction.StripCharsEnd, + pl_expr.StringFunction.Uppercase, ): raise NotImplementedError(f"String function {self.name}") if self.name == pl_expr.StringFunction.Contains: @@ -732,11 +747,65 @@ def _validate_input(self): raise NotImplementedError( "Regex contains only supports a scalar pattern" ) + pattern = self.children[1].value.as_py() + try: + self._regex_program = plc.strings.regex_program.RegexProgram.create( + pattern, + flags=plc.strings.regex_flags.RegexFlags.DEFAULT, + ) + except RuntimeError as e: + raise NotImplementedError( + f"Unsupported regex {pattern} for GPU engine." 
+ ) from e + elif self.name == pl_expr.StringFunction.Replace: + _, literal = self.options + if not literal: + raise NotImplementedError("literal=False is not supported for replace") + if not all(isinstance(expr, Literal) for expr in self.children[1:]): + raise NotImplementedError("replace only supports scalar target") + target = self.children[1] + if target.value == pa.scalar("", type=pa.string()): + raise NotImplementedError( + "libcudf replace does not support empty strings" + ) + elif self.name == pl_expr.StringFunction.ReplaceMany: + (ascii_case_insensitive,) = self.options + if ascii_case_insensitive: + raise NotImplementedError( + "ascii_case_insensitive not implemented for replace_many" + ) + if not all( + isinstance(expr, (LiteralColumn, Literal)) for expr in self.children[1:] + ): + raise NotImplementedError("replace_many only supports literal inputs") + target = self.children[1] + if pc.any(pc.equal(target.value, "")).as_py(): + raise NotImplementedError( + "libcudf replace_many is implemented differently from polars " + "for empty strings" + ) elif self.name == pl_expr.StringFunction.Slice: if not all(isinstance(child, Literal) for child in self.children[1:]): raise NotImplementedError( "Slice only supports literal start and stop values" ) + elif self.name == pl_expr.StringFunction.Strptime: + format, _, exact, cache = self.options + if cache: + raise NotImplementedError("Strptime cache is a CPU feature") + if format is None: + raise NotImplementedError("Strptime format is required") + if not exact: + raise NotImplementedError("Strptime does not support exact=False") + elif self.name in { + pl_expr.StringFunction.StripChars, + pl_expr.StringFunction.StripCharsStart, + pl_expr.StringFunction.StripCharsEnd, + }: + if not isinstance(self.children[1], Literal): + raise NotImplementedError( + "strip operations only support scalar patterns" + ) def do_evaluate( self, @@ -759,12 +828,10 @@ def do_evaluate( else pat.obj ) return Column(plc.strings.find.contains(column.obj, pattern)) - assert isinstance(arg, Literal) - prog = plc.strings.regex_program.RegexProgram.create( - arg.value.as_py(), - flags=plc.strings.regex_flags.RegexFlags.DEFAULT, - ) - return Column(plc.strings.contains.contains_re(column.obj, prog)) + else: + return Column( + plc.strings.contains.contains_re(column.obj, self._regex_program) + ) elif self.name == pl_expr.StringFunction.Slice: child, expr_offset, expr_length = self.children assert isinstance(expr_offset, Literal) @@ -795,6 +862,22 @@ def do_evaluate( plc.interop.from_arrow(pa.scalar(stop, type=pa.int32())), ) ) + elif self.name in { + pl_expr.StringFunction.StripChars, + pl_expr.StringFunction.StripCharsStart, + pl_expr.StringFunction.StripCharsEnd, + }: + column, chars = ( + c.evaluate(df, context=context, mapping=mapping) for c in self.children + ) + if self.name == pl_expr.StringFunction.StripCharsStart: + side = plc.strings.SideType.LEFT + elif self.name == pl_expr.StringFunction.StripCharsEnd: + side = plc.strings.SideType.RIGHT + else: + side = plc.strings.SideType.BOTH + return Column(plc.strings.strip.strip(column.obj, side, chars.obj_scalar)) + columns = [ child.evaluate(df, context=context, mapping=mapping) for child in self.children @@ -825,6 +908,51 @@ def do_evaluate( else prefix.obj, ) ) + elif self.name == pl_expr.StringFunction.Strptime: + # TODO: ignores ambiguous + format, strict, exact, cache = self.options + col = self.children[0].evaluate(df, context=context, mapping=mapping) + + is_timestamps = 
plc.strings.convert.convert_datetime.is_timestamp( + col.obj, format.encode() + ) + + if strict: + if not plc.interop.to_arrow( + plc.reduce.reduce( + is_timestamps, + plc.aggregation.all(), + plc.DataType(plc.TypeId.BOOL8), + ) + ).as_py(): + raise InvalidOperationError("conversion from `str` failed.") + else: + not_timestamps = plc.unary.unary_operation( + is_timestamps, plc.unary.UnaryOperator.NOT + ) + + null = plc.interop.from_arrow(pa.scalar(None, type=pa.string())) + res = plc.copying.boolean_mask_scatter( + [null], plc.Table([col.obj]), not_timestamps + ) + return Column( + plc.strings.convert.convert_datetime.to_timestamps( + res.columns()[0], self.dtype, format.encode() + ) + ) + elif self.name == pl_expr.StringFunction.Replace: + column, target, repl = columns + n, _ = self.options + return Column( + plc.strings.replace.replace( + column.obj, target.obj_scalar, repl.obj_scalar, maxrepl=n + ) + ) + elif self.name == pl_expr.StringFunction.ReplaceMany: + column, target, repl = columns + return Column( + plc.strings.replace.replace_multiple(column.obj, target.obj, repl.obj) + ) raise NotImplementedError( f"StringFunction {self.name}" ) # pragma: no cover; handled by init raising @@ -832,6 +960,18 @@ def do_evaluate( class TemporalFunction(Expr): __slots__ = ("name", "options", "children") + _COMPONENT_MAP: ClassVar[dict[pl_expr.TemporalFunction, str]] = { + pl_expr.TemporalFunction.Year: "year", + pl_expr.TemporalFunction.Month: "month", + pl_expr.TemporalFunction.Day: "day", + pl_expr.TemporalFunction.WeekDay: "weekday", + pl_expr.TemporalFunction.Hour: "hour", + pl_expr.TemporalFunction.Minute: "minute", + pl_expr.TemporalFunction.Second: "second", + pl_expr.TemporalFunction.Millisecond: "millisecond", + pl_expr.TemporalFunction.Microsecond: "microsecond", + pl_expr.TemporalFunction.Nanosecond: "nanosecond", + } _non_child = ("dtype", "name", "options") children: tuple[Expr, ...] 
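# --- Illustrative aside (not part of this diff): the polars-level behaviour
# that the Strptime branch above reproduces. strict=True raises on unparseable
# input (the reduce-over-is_timestamp check), strict=False nulls the bad rows
# (the boolean_mask_scatter path). Hedged sketch using the CPU engine for
# reference semantics; the example strings are arbitrary.
import polars as pl

s = pl.Series(["2024-01-01", "not a date"])
lenient = s.str.strptime(pl.Datetime("us"), format="%Y-%m-%d", strict=False)
# lenient[1] is null; with strict=True the same call raises InvalidOperationError.
# --- end aside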
@@ -846,8 +986,8 @@ def __init__( self.options = options self.name = name self.children = children - if self.name != pl_expr.TemporalFunction.Year: - raise NotImplementedError(f"String function {self.name}") + if self.name not in self._COMPONENT_MAP: + raise NotImplementedError(f"Temporal function {self.name}") def do_evaluate( self, @@ -861,12 +1001,59 @@ def do_evaluate( child.evaluate(df, context=context, mapping=mapping) for child in self.children ] - if self.name == pl_expr.TemporalFunction.Year: - (column,) = columns - return Column(plc.datetime.extract_year(column.obj)) - raise NotImplementedError( - f"TemporalFunction {self.name}" - ) # pragma: no cover; init trips first + (column,) = columns + if self.name == pl_expr.TemporalFunction.Microsecond: + millis = plc.datetime.extract_datetime_component(column.obj, "millisecond") + micros = plc.datetime.extract_datetime_component(column.obj, "microsecond") + millis_as_micros = plc.binaryop.binary_operation( + millis, + plc.interop.from_arrow(pa.scalar(1_000, type=pa.int32())), + plc.binaryop.BinaryOperator.MUL, + plc.DataType(plc.TypeId.INT32), + ) + total_micros = plc.binaryop.binary_operation( + micros, + millis_as_micros, + plc.binaryop.BinaryOperator.ADD, + plc.types.DataType(plc.types.TypeId.INT32), + ) + return Column(total_micros) + elif self.name == pl_expr.TemporalFunction.Nanosecond: + millis = plc.datetime.extract_datetime_component(column.obj, "millisecond") + micros = plc.datetime.extract_datetime_component(column.obj, "microsecond") + nanos = plc.datetime.extract_datetime_component(column.obj, "nanosecond") + millis_as_nanos = plc.binaryop.binary_operation( + millis, + plc.interop.from_arrow(pa.scalar(1_000_000, type=pa.int32())), + plc.binaryop.BinaryOperator.MUL, + plc.types.DataType(plc.types.TypeId.INT32), + ) + micros_as_nanos = plc.binaryop.binary_operation( + micros, + plc.interop.from_arrow(pa.scalar(1_000, type=pa.int32())), + plc.binaryop.BinaryOperator.MUL, + plc.types.DataType(plc.types.TypeId.INT32), + ) + total_nanos = plc.binaryop.binary_operation( + nanos, + millis_as_nanos, + plc.binaryop.BinaryOperator.ADD, + plc.types.DataType(plc.types.TypeId.INT32), + ) + total_nanos = plc.binaryop.binary_operation( + total_nanos, + micros_as_nanos, + plc.binaryop.BinaryOperator.ADD, + plc.types.DataType(plc.types.TypeId.INT32), + ) + return Column(total_nanos) + + return Column( + plc.datetime.extract_datetime_component( + column.obj, + self._COMPONENT_MAP[self.name], + ) + ) class UnaryFunction(Expr): @@ -874,6 +1061,51 @@ class UnaryFunction(Expr): _non_child = ("dtype", "name", "options") children: tuple[Expr, ...] 
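# --- Illustrative aside (not part of this diff): the arithmetic behind the
# Microsecond/Nanosecond composition above. polars' dt.microsecond() returns
# the whole sub-second value in microseconds, while libcudf exposes separate
# millisecond/microsecond/nanosecond fractions, so the fractions are recombined:
#   total_micros = millis * 1_000 + micros
#   total_nanos  = millis * 1_000_000 + micros * 1_000 + nanos
# e.g. for 12:00:00.123456789 -> millis=123, micros=456, nanos=789.
millis, micros, nanos = 123, 456, 789
assert millis * 1_000 + micros == 123_456
assert millis * 1_000_000 + micros * 1_000 + nanos == 123_456_789
# --- end aside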
+ # Note: log, and pow are handled via translation to binops + _OP_MAPPING: ClassVar[dict[str, plc.unary.UnaryOperator]] = { + "sin": plc.unary.UnaryOperator.SIN, + "cos": plc.unary.UnaryOperator.COS, + "tan": plc.unary.UnaryOperator.TAN, + "arcsin": plc.unary.UnaryOperator.ARCSIN, + "arccos": plc.unary.UnaryOperator.ARCCOS, + "arctan": plc.unary.UnaryOperator.ARCTAN, + "sinh": plc.unary.UnaryOperator.SINH, + "cosh": plc.unary.UnaryOperator.COSH, + "tanh": plc.unary.UnaryOperator.TANH, + "arcsinh": plc.unary.UnaryOperator.ARCSINH, + "arccosh": plc.unary.UnaryOperator.ARCCOSH, + "arctanh": plc.unary.UnaryOperator.ARCTANH, + "exp": plc.unary.UnaryOperator.EXP, + "sqrt": plc.unary.UnaryOperator.SQRT, + "cbrt": plc.unary.UnaryOperator.CBRT, + "ceil": plc.unary.UnaryOperator.CEIL, + "floor": plc.unary.UnaryOperator.FLOOR, + "abs": plc.unary.UnaryOperator.ABS, + "bit_invert": plc.unary.UnaryOperator.BIT_INVERT, + "not": plc.unary.UnaryOperator.NOT, + } + _supported_misc_fns = frozenset( + { + "drop_nulls", + "fill_null", + "mask_nans", + "round", + "set_sorted", + "unique", + } + ) + _supported_cum_aggs = frozenset( + { + "cum_min", + "cum_max", + "cum_prod", + "cum_sum", + } + ) + _supported_fns = frozenset().union( + _supported_misc_fns, _supported_cum_aggs, _OP_MAPPING.keys() + ) + def __init__( self, dtype: plc.DataType, name: str, options: tuple[Any, ...], *children: Expr ) -> None: @@ -881,15 +1113,15 @@ def __init__( self.name = name self.options = options self.children = children - if self.name not in ( - "mask_nans", - "round", - "setsorted", - "unique", - "dropnull", - "fill_null", - ): + + if self.name not in UnaryFunction._supported_fns: raise NotImplementedError(f"Unary function {name=}") + if self.name in UnaryFunction._supported_cum_aggs: + (reverse,) = self.options + if reverse: + raise NotImplementedError( + "reverse=True is not supported for cumulative aggregations" + ) def do_evaluate( self, @@ -947,7 +1179,7 @@ def do_evaluate( if maintain_order: return Column(column).sorted_like(values) return Column(column) - elif self.name == "setsorted": + elif self.name == "set_sorted": (column,) = ( child.evaluate(df, context=context, mapping=mapping) for child in self.children @@ -974,7 +1206,7 @@ def do_evaluate( order=order, null_order=null_order, ) - elif self.name == "dropnull": + elif self.name == "drop_nulls": (column,) = ( child.evaluate(df, context=context, mapping=mapping) for child in self.children @@ -994,13 +1226,65 @@ def do_evaluate( ) arg = evaluated.obj_scalar if evaluated.is_scalar else evaluated.obj return Column(plc.replace.replace_nulls(column.obj, arg)) - + elif self.name in self._OP_MAPPING: + column = self.children[0].evaluate(df, context=context, mapping=mapping) + if column.obj.type().id() != self.dtype.id(): + arg = plc.unary.cast(column.obj, self.dtype) + else: + arg = column.obj + return Column(plc.unary.unary_operation(arg, self._OP_MAPPING[self.name])) + elif self.name in UnaryFunction._supported_cum_aggs: + column = self.children[0].evaluate(df, context=context, mapping=mapping) + plc_col = column.obj + col_type = column.obj.type() + # cum_sum casts + # Int8, UInt8, Int16, UInt16 -> Int64 for overflow prevention + # Bool -> UInt32 + # cum_prod casts integer dtypes < int64 and bool to int64 + # See: + # https://github.com/pola-rs/polars/blob/main/crates/polars-ops/src/series/ops/cum_agg.rs + if ( + self.name == "cum_sum" + and col_type.id() + in { + plc.types.TypeId.INT8, + plc.types.TypeId.UINT8, + plc.types.TypeId.INT16, + plc.types.TypeId.UINT16, + } + 
) or ( + self.name == "cum_prod" + and plc.traits.is_integral(col_type) + and plc.types.size_of(col_type) <= 4 + ): + plc_col = plc.unary.cast( + plc_col, plc.types.DataType(plc.types.TypeId.INT64) + ) + elif ( + self.name == "cum_sum" + and column.obj.type().id() == plc.types.TypeId.BOOL8 + ): + plc_col = plc.unary.cast( + plc_col, plc.types.DataType(plc.types.TypeId.UINT32) + ) + if self.name == "cum_sum": + agg = plc.aggregation.sum() + elif self.name == "cum_prod": + agg = plc.aggregation.product() + elif self.name == "cum_min": + agg = plc.aggregation.min() + elif self.name == "cum_max": + agg = plc.aggregation.max() + + return Column(plc.reduce.scan(plc_col, agg, plc.reduce.ScanType.INCLUSIVE)) raise NotImplementedError( f"Unimplemented unary function {self.name=}" ) # pragma: no cover; init trips first def collect_agg(self, *, depth: int) -> AggInfo: """Collect information about aggregations in groupbys.""" + if self.name in {"unique", "drop_nulls"} | self._supported_cum_aggs: + raise NotImplementedError(f"{self.name} in groupby") if depth == 1: # inside aggregation, need to pre-evaluate, groupby # construction has checked that we don't have nested aggs, @@ -1187,11 +1471,7 @@ class Cast(Expr): def __init__(self, dtype: plc.DataType, value: Expr) -> None: super().__init__(dtype) self.children = (value,) - if not ( - plc.traits.is_fixed_width(self.dtype) - and plc.traits.is_fixed_width(value.dtype) - and plc.unary.is_supported_cast(value.dtype, self.dtype) - ): + if not dtypes.can_cast(value.dtype, self.dtype): raise NotImplementedError( f"Can't cast {self.dtype.id().name} to {value.dtype.id().name}" ) @@ -1255,6 +1535,13 @@ def __init__( req = plc.aggregation.variance(ddof=options) elif name == "count": req = plc.aggregation.count(null_handling=plc.types.NullPolicy.EXCLUDE) + elif name == "quantile": + _, quantile = self.children + if not isinstance(quantile, Literal): + raise NotImplementedError("Only support literal quantile values") + req = plc.aggregation.quantile( + quantiles=[quantile.value.as_py()], interp=Agg.interp_mapping[options] + ) else: raise NotImplementedError( f"Unreachable, {name=} is incorrectly listed in _SUPPORTED" @@ -1286,9 +1573,18 @@ def __init__( "count", "std", "var", + "quantile", ] ) + interp_mapping: ClassVar[dict[str, plc.types.Interpolation]] = { + "nearest": plc.types.Interpolation.NEAREST, + "higher": plc.types.Interpolation.HIGHER, + "lower": plc.types.Interpolation.LOWER, + "midpoint": plc.types.Interpolation.MIDPOINT, + "linear": plc.types.Interpolation.LINEAR, + } + def collect_agg(self, *, depth: int) -> AggInfo: """Collect information about aggregations in groupbys.""" if depth >= 1: @@ -1299,7 +1595,19 @@ def collect_agg(self, *, depth: int) -> AggInfo: raise NotImplementedError("Nan propagation in groupby for min/max") (child,) = self.children ((expr, _, _),) = child.collect_agg(depth=depth + 1).requests - if self.request is None: + request = self.request + # These are handled specially here because we don't set up the + # request for the whole-frame agg because we can avoid a + # reduce for these. 
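# --- Illustrative aside (not part of this diff): the groupby shortcut noted
# above, shown as standalone pylibcudf requests. "first"/"last" map onto
# libcudf's nth_element aggregation at index 0 / -1 with nulls included, so no
# whole-frame reduction request is needed. Hedged sketch; requires pylibcudf.
import pylibcudf as plc

first_req = plc.aggregation.nth_element(0, null_handling=plc.types.NullPolicy.INCLUDE)
last_req = plc.aggregation.nth_element(-1, null_handling=plc.types.NullPolicy.INCLUDE)
# These are handed to plc.groupby.GroupByRequest alongside the grouped column.
# --- end aside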
+ if self.name == "first": + request = plc.aggregation.nth_element( + 0, null_handling=plc.types.NullPolicy.INCLUDE + ) + elif self.name == "last": + request = plc.aggregation.nth_element( + -1, null_handling=plc.types.NullPolicy.INCLUDE + ) + if request is None: raise NotImplementedError( f"Aggregation {self.name} in groupby" ) # pragma: no cover; __init__ trips first @@ -1308,7 +1616,7 @@ def collect_agg(self, *, depth: int) -> AggInfo: # Ignore nans in these groupby aggs, do this by masking # nans in the input expr = UnaryFunction(self.dtype, "mask_nans", (), expr) - return AggInfo([(expr, self.request, self)]) + return AggInfo([(expr, request, self)]) def _reduce( self, column: Column, *, request: plc.aggregation.Aggregation @@ -1380,7 +1688,10 @@ def do_evaluate( raise NotImplementedError( f"Agg in context {context}" ) # pragma: no cover; unreachable - (child,) = self.children + + # Aggregations like quantiles may have additional children that were + # preprocessed into pylibcudf requests. + child = self.children[0] return self.op(child.evaluate(df, context=context, mapping=mapping)) @@ -1425,6 +1736,11 @@ def __init__( right: Expr, ) -> None: super().__init__(dtype) + if plc.traits.is_boolean(self.dtype): + # For boolean output types, bitand and bitor implement + # boolean logic, so translate. bitxor also does, but the + # default behaviour is correct. + op = BinOp._BOOL_KLEENE_MAPPING.get(op, op) self.op = op self.children = (left, right) if not plc.binaryop.is_supported_operation( @@ -1436,6 +1752,15 @@ def __init__( f"with output type {self.dtype.id().name}" ) + _BOOL_KLEENE_MAPPING: ClassVar[ + dict[plc.binaryop.BinaryOperator, plc.binaryop.BinaryOperator] + ] = { + plc.binaryop.BinaryOperator.BITWISE_AND: plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, + plc.binaryop.BinaryOperator.BITWISE_OR: plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, + plc.binaryop.BinaryOperator.LOGICAL_AND: plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, + plc.binaryop.BinaryOperator.LOGICAL_OR: plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, + } + _MAPPING: ClassVar[dict[pl_expr.Operator, plc.binaryop.BinaryOperator]] = { pl_expr.Operator.Eq: plc.binaryop.BinaryOperator.EQUAL, pl_expr.Operator.EqValidity: plc.binaryop.BinaryOperator.NULL_EQUALS, diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index e334e6f5cc5..8cd56c8ee3a 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -15,7 +15,6 @@ import dataclasses import itertools -import types from functools import cache from pathlib import Path from typing import TYPE_CHECKING, Any, ClassVar @@ -28,7 +27,7 @@ import cudf_polars.dsl.expr as expr from cudf_polars.containers import DataFrame, NamedColumn -from cudf_polars.utils import sorting +from cudf_polars.utils import dtypes, sorting if TYPE_CHECKING: from collections.abc import Callable, MutableMapping @@ -133,8 +132,7 @@ class IR: def __post_init__(self): """Validate preconditions.""" - if any(dtype.id() == plc.TypeId.EMPTY for dtype in self.schema.values()): - raise NotImplementedError("Cannot make empty columns.") + pass # noqa: PIE790 def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """ @@ -189,32 +187,42 @@ class Scan(IR): """Cloud-related authentication options, currently ignored.""" paths: list[str] """List of paths to read from.""" - file_options: Any - """Options for reading the file. - - Attributes are: - - ``with_columns: list[str]`` of projected columns to return. 
- - ``n_rows: int``: Number of rows to read. - - ``row_index: tuple[name, offset] | None``: Add an integer index - column with given name. - """ + with_columns: list[str] + """Projected columns to return.""" + skip_rows: int + """Rows to skip at the start when reading.""" + n_rows: int + """Number of rows to read after skipping.""" + row_index: tuple[str, int] | None + """If not None add an integer index column of the given name.""" predicate: expr.NamedExpr | None """Mask to apply to the read dataframe.""" def __post_init__(self) -> None: """Validate preconditions.""" + super().__post_init__() if self.typ not in ("csv", "parquet", "ndjson"): # pragma: no cover # This line is unhittable ATM since IPC/Anonymous scan raise # on the polars side raise NotImplementedError(f"Unhandled scan type: {self.typ}") - if self.typ == "ndjson" and self.file_options.n_rows is not None: - raise NotImplementedError("row limit in scan") + if self.typ == "ndjson" and (self.n_rows != -1 or self.skip_rows != 0): + raise NotImplementedError("row limit in scan for json reader") + if self.skip_rows < 0: + # TODO: polars has this implemented for parquet, + # maybe we can do this too? + raise NotImplementedError("slice pushdown for negative slices") + if self.typ == "csv" and self.skip_rows != 0: # pragma: no cover + # This comes from slice pushdown, but that + # optimization doesn't happen right now + raise NotImplementedError("skipping rows in CSV reader") if self.cloud_options is not None and any( self.cloud_options.get(k) is not None for k in ("aws", "azure", "gcp") ): raise NotImplementedError( "Read from cloud storage" ) # pragma: no cover; no test yet + if any(p.startswith("https://") for p in self.paths): + raise NotImplementedError("Read from https") if self.typ == "csv": if self.reader_options["skip_rows_after_header"] != 0: raise NotImplementedError("Skipping rows after header in CSV reader") @@ -242,13 +250,21 @@ def __post_init__(self) -> None: raise NotImplementedError( "ignore_errors is not supported in the JSON reader" ) + elif ( + self.typ == "parquet" + and self.row_index is not None + and self.with_columns is not None + and len(self.with_columns) == 0 + ): + raise NotImplementedError( + "Reading only parquet metadata to produce row index." 
+ ) def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" - options = self.file_options - with_columns = options.with_columns - row_index = options.row_index - nrows = self.file_options.n_rows if self.file_options.n_rows is not None else -1 + with_columns = self.with_columns + row_index = self.row_index + n_rows = self.n_rows if self.typ == "csv": parse_options = self.reader_options["parse_options"] sep = chr(parse_options["separator"]) @@ -256,7 +272,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: eol = chr(parse_options["eol_char"]) if self.reader_options["schema"] is not None: # Reader schema provides names - column_names = list(self.reader_options["schema"]["inner"].keys()) + column_names = list(self.reader_options["schema"]["fields"].keys()) else: # file provides column names column_names = None @@ -282,6 +298,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: # polars skips blank lines at the beginning of the file pieces = [] + read_partial = n_rows != -1 for p in self.paths: skiprows = self.reader_options["skip_rows"] path = Path(p) @@ -303,9 +320,13 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: comment=comment, decimal=decimal, dtypes=self.schema, - nrows=nrows, + nrows=n_rows, ) pieces.append(tbl_w_meta) + if read_partial: + n_rows -= tbl_w_meta.tbl.num_rows() + if n_rows <= 0: + break tables, colnames = zip( *( (piece.tbl, piece.column_names(include_children=False)) @@ -321,7 +342,8 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: tbl_w_meta = plc.io.parquet.read_parquet( plc.io.SourceInfo(self.paths), columns=with_columns, - nrows=nrows, + nrows=n_rows, + skip_rows=self.skip_rows, ) df = DataFrame.from_table( tbl_w_meta.tbl, @@ -354,12 +376,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: raise NotImplementedError( f"Unhandled scan type: {self.typ}" ) # pragma: no cover; post init trips first - if ( - row_index is not None - # TODO: remove condition when dropping support for polars 1.0 - # https://github.com/pola-rs/polars/pull/17363 - and row_index[0] in self.schema - ): + if row_index is not None: name, offset = row_index dtype = self.schema[name] step = plc.interop.from_arrow( @@ -481,36 +498,6 @@ def evaluate( return DataFrame(columns) -def placeholder_column(n: int) -> plc.Column: - """ - Produce a placeholder pylibcudf column with NO BACKING DATA. - - Parameters - ---------- - n - Number of rows the column will advertise - - Returns - ------- - pylibcudf Column that is almost unusable. DO NOT ACCESS THE DATA BUFFER. - - Notes - ----- - This is used to avoid allocating data for count aggregations. 
- """ - return plc.Column( - plc.DataType(plc.TypeId.INT8), - n, - plc.gpumemoryview( - types.SimpleNamespace(__cuda_array_interface__={"data": (1, True)}) - ), - None, - 0, - 0, - [], - ) - - @dataclasses.dataclass class GroupBy(IR): """Perform a groupby.""" @@ -557,8 +544,7 @@ def check_agg(agg: expr.Expr) -> int: def __post_init__(self) -> None: """Check whether all the aggregations are implemented.""" - if self.options.rolling is None and self.maintain_order: - raise NotImplementedError("Maintaining order in groupby") + super().__post_init__() if self.options.rolling: raise NotImplementedError( "rolling window/groupby" @@ -566,6 +552,8 @@ def __post_init__(self) -> None: if any(GroupBy.check_agg(a.value) > 1 for a in self.agg_requests): raise NotImplementedError("Nested aggregations in groupby") self.agg_infos = [req.collect_agg(depth=0) for req in self.agg_requests] + if len(self.keys) == 0: + raise NotImplementedError("dynamic groupby") def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" @@ -591,7 +579,10 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: for info in self.agg_infos: for pre_eval, req, rep in info.requests: if pre_eval is None: - col = placeholder_column(df.num_rows) + # A count aggregation, doesn't touch the column, + # but we need to have one. Rather than evaluating + # one, just use one of the key columns. + col = keys[0].obj else: col = pre_eval.evaluate(df).obj requests.append(plc.groupby.GroupByRequest(col, [req])) @@ -611,7 +602,34 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: results = [ req.evaluate(result_subs, mapping=mapping) for req in self.agg_requests ] - return DataFrame(broadcast(*result_keys, *results)).slice(self.options.slice) + broadcasted = broadcast(*result_keys, *results) + result_keys = broadcasted[: len(result_keys)] + results = broadcasted[len(result_keys) :] + # Handle order preservation of groups + # like cudf classic does + # https://github.com/rapidsai/cudf/blob/5780c4d8fb5afac2e04988a2ff5531f94c22d3a3/python/cudf/cudf/core/groupby/groupby.py#L723-L743 + if self.maintain_order and not sorted: + left = plc.stream_compaction.stable_distinct( + plc.Table([k.obj for k in keys]), + list(range(group_keys.num_columns())), + plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST, + plc.types.NullEquality.EQUAL, + plc.types.NanEquality.ALL_EQUAL, + ) + right = plc.Table([key.obj for key in result_keys]) + _, indices = plc.join.left_join(left, right, plc.types.NullEquality.EQUAL) + ordered_table = plc.copying.gather( + plc.Table([col.obj for col in broadcasted]), + indices, + plc.copying.OutOfBoundsPolicy.DONT_CHECK, + ) + broadcasted = [ + NamedColumn(reordered, b.name) + for reordered, b in zip( + ordered_table.columns(), broadcasted, strict=True + ) + ] + return DataFrame(broadcasted).slice(self.options.slice) @dataclasses.dataclass @@ -627,7 +645,7 @@ class Join(IR): right_on: list[expr.NamedExpr] """List of expressions used as keys in the right frame.""" options: tuple[ - Literal["inner", "left", "full", "leftsemi", "leftanti", "cross"], + Literal["inner", "left", "right", "full", "leftsemi", "leftanti", "cross"], bool, tuple[int, int] | None, str | None, @@ -644,6 +662,7 @@ class Join(IR): def __post_init__(self) -> None: """Validate preconditions.""" + super().__post_init__() if any( isinstance(e.value, expr.Literal) for e in itertools.chain(self.left_on, self.right_on) @@ -653,7 +672,7 @@ def __post_init__(self) -> None: 
@staticmethod @cache def _joiners( - how: Literal["inner", "left", "full", "leftsemi", "leftanti"], + how: Literal["inner", "left", "right", "full", "leftsemi", "leftanti"], ) -> tuple[ Callable, plc.copying.OutOfBoundsPolicy, plc.copying.OutOfBoundsPolicy | None ]: @@ -663,7 +682,7 @@ def _joiners( plc.copying.OutOfBoundsPolicy.DONT_CHECK, plc.copying.OutOfBoundsPolicy.DONT_CHECK, ) - elif how == "left": + elif how == "left" or how == "right": return ( plc.join.left_join, plc.copying.OutOfBoundsPolicy.DONT_CHECK, @@ -687,8 +706,7 @@ def _joiners( plc.copying.OutOfBoundsPolicy.DONT_CHECK, None, ) - else: - assert_never(how) + assert_never(how) def _reorder_maps( self, @@ -786,8 +804,12 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: table = plc.copying.gather(left.table, lg, left_policy) result = DataFrame.from_table(table, left.column_names) else: + if how == "right": + # Right join is a left join with the tables swapped + left, right = right, left + left_on, right_on = right_on, left_on lg, rg = join_fn(left_on.table, right_on.table, null_equality) - if how == "left": + if how == "left" or how == "right": # Order of left table is preserved lg, rg = self._reorder_maps( left.num_rows, lg, left_policy, right.num_rows, rg, right_policy @@ -815,6 +837,9 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: ) ) right = right.discard_columns(right_on.column_names_set) + if how == "right": + # Undo the swap for right join before gluing together. + left, right = right, left right = right.rename_columns( { name: f"{name}{suffix}" @@ -1065,11 +1090,13 @@ class MapFunction(IR): # "merge_sorted", "rename", "explode", + "unpivot", ] ) def __post_init__(self) -> None: """Validate preconditions.""" + super().__post_init__() if self.name not in MapFunction._NAMES: raise NotImplementedError(f"Unhandled map function {self.name}") if self.name == "explode": @@ -1086,6 +1113,22 @@ def __post_init__(self) -> None: set(new) & (set(self.df.schema.keys() - set(old))) ): raise NotImplementedError("Duplicate new names in rename.") + elif self.name == "unpivot": + indices, pivotees, variable_name, value_name = self.options + value_name = "value" if value_name is None else value_name + variable_name = "variable" if variable_name is None else variable_name + if len(pivotees) == 0: + index = frozenset(indices) + pivotees = [name for name in self.df.schema if name not in index] + if not all( + dtypes.can_cast(self.df.schema[p], self.schema[value_name]) + for p in pivotees + ): + raise NotImplementedError( + "Unpivot cannot cast all input columns to " + f"{self.schema[value_name].id()}" + ) + self.options = (indices, pivotees, variable_name, value_name) def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" @@ -1107,6 +1150,41 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return DataFrame.from_table( plc.lists.explode_outer(df.table, index), df.column_names ).sorted_like(df, subset=subset) + elif self.name == "unpivot": + indices, pivotees, variable_name, value_name = self.options + npiv = len(pivotees) + df = self.df.evaluate(cache=cache) + index_columns = [ + NamedColumn(col, name) + for col, name in zip( + plc.reshape.tile(df.select(indices).table, npiv).columns(), + indices, + strict=True, + ) + ] + (variable_column,) = plc.filling.repeat( + plc.Table( + [ + plc.interop.from_arrow( + pa.array( + pivotees, + type=plc.interop.to_arrow(self.schema[variable_name]), + ), + ) + ] 
+ ), + df.num_rows, + ).columns() + value_column = plc.concatenate.concatenate( + [c.astype(self.schema[value_name]) for c in df.select(pivotees).columns] + ) + return DataFrame( + [ + *index_columns, + NamedColumn(variable_column, variable_name), + NamedColumn(value_column, value_name), + ] + ) else: raise AssertionError("Should never be reached") # pragma: no cover @@ -1122,6 +1200,7 @@ class Union(IR): def __post_init__(self) -> None: """Validate preconditions.""" + super().__post_init__() schema = self.dfs[0].schema if not all(s.schema == schema for s in self.dfs[1:]): raise NotImplementedError("Schema mismatch") diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 6dc97c7cb51..45881afe0c8 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -75,13 +75,12 @@ def _translate_ir( def _( node: pl_ir.PythonScan, visitor: NodeTraverser, schema: dict[str, plc.DataType] ) -> ir.IR: - return ir.PythonScan( - schema, - node.options, - translate_named_expr(visitor, n=node.predicate) - if node.predicate is not None - else None, + scan_fn, with_columns, source_type, predicate, nrows = node.options + options = (scan_fn, with_columns, source_type, nrows) + predicate = ( + translate_named_expr(visitor, n=predicate) if predicate is not None else None ) + return ir.PythonScan(schema, options, predicate) @_translate_ir.register @@ -94,13 +93,35 @@ def _( cloud_options = None else: reader_options, cloud_options = map(json.loads, options) + if ( + typ == "csv" + and visitor.version()[0] == 1 + and reader_options["schema"] is not None + ): + reader_options["schema"] = { + "fields": reader_options["schema"]["inner"] + } # pragma: no cover; CI tests 1.7 + file_options = node.file_options + with_columns = file_options.with_columns + n_rows = file_options.n_rows + if n_rows is None: + n_rows = -1 # All rows + skip_rows = 0 # Don't skip + else: + # TODO: with versioning, rename on the rust side + skip_rows, n_rows = n_rows + + row_index = file_options.row_index return ir.Scan( schema, typ, reader_options, cloud_options, node.paths, - node.file_options, + with_columns, + skip_rows, + n_rows, + row_index, translate_named_expr(visitor, n=node.predicate) if node.predicate is not None else None, @@ -293,10 +314,28 @@ def translate_ir(visitor: NodeTraverser, *, n: int | None = None) -> ir.IR: ctx: AbstractContextManager[None] = ( set_node(visitor, n) if n is not None else noop_context ) + # IR is versioned with major.minor, minor is bumped for backwards + # compatible changes (e.g. adding new nodes), major is bumped for + # incompatible changes (e.g. renaming nodes). + # Polars 1.7 changes definition of the CSV reader options schema name. + if (version := visitor.version()) >= (3, 0): + raise NotImplementedError( + f"No support for polars IR {version=}" + ) # pragma: no cover; no such version for now. + with ctx: + polars_schema = visitor.get_schema() node = visitor.view_current_node() - schema = {k: dtypes.from_polars(v) for k, v in visitor.get_schema().items()} - return _translate_ir(node, visitor, schema) + schema = {k: dtypes.from_polars(v) for k, v in polars_schema.items()} + result = _translate_ir(node, visitor, schema) + if any( + isinstance(dtype, pl.Null) + for dtype in pl.datatypes.unpack_dtypes(*polars_schema.values()) + ): + raise NotImplementedError( + f"No GPU support for {result} with Null column dtype." 
+ ) + return result def translate_named_expr( @@ -345,6 +384,24 @@ def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> ex name, *options = node.function_data options = tuple(options) if isinstance(name, pl_expr.StringFunction): + if name in { + pl_expr.StringFunction.StripChars, + pl_expr.StringFunction.StripCharsStart, + pl_expr.StringFunction.StripCharsEnd, + }: + column, chars = (translate_expr(visitor, n=n) for n in node.input) + if isinstance(chars, expr.Literal): + if chars.value == pa.scalar(""): + # No-op in polars, but libcudf uses empty string + # as signifier to remove whitespace. + return column + elif chars.value == pa.scalar(None): + # Polars uses None to mean "strip all whitespace" + chars = expr.Literal( + column.dtype, + pa.scalar("", type=plc.interop.to_arrow(column.dtype)), + ) + return expr.StringFunction(dtype, name, options, column, chars) return expr.StringFunction( dtype, name, @@ -369,19 +426,43 @@ def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> ex *(translate_expr(visitor, n=n) for n in node.input), ) elif isinstance(name, pl_expr.TemporalFunction): - return expr.TemporalFunction( + # functions for which evaluation of the expression may not return + # the same dtype as polars, either due to libcudf returning a different + # dtype, or due to our internal processing affecting what libcudf returns + needs_cast = { + pl_expr.TemporalFunction.Year, + pl_expr.TemporalFunction.Month, + pl_expr.TemporalFunction.Day, + pl_expr.TemporalFunction.WeekDay, + pl_expr.TemporalFunction.Hour, + pl_expr.TemporalFunction.Minute, + pl_expr.TemporalFunction.Second, + pl_expr.TemporalFunction.Millisecond, + } + result_expr = expr.TemporalFunction( dtype, name, options, *(translate_expr(visitor, n=n) for n in node.input), ) + if name in needs_cast: + return expr.Cast(dtype, result_expr) + return result_expr + elif isinstance(name, str): - return expr.UnaryFunction( - dtype, - name, - options, - *(translate_expr(visitor, n=n) for n in node.input), - ) + children = (translate_expr(visitor, n=n) for n in node.input) + if name == "log": + (base,) = options + (child,) = children + return expr.BinOp( + dtype, + plc.binaryop.BinaryOperator.LOG_BASE, + child, + expr.Literal(dtype, pa.scalar(base, type=plc.interop.to_arrow(dtype))), + ) + elif name == "pow": + return expr.BinOp(dtype, plc.binaryop.BinaryOperator.POW, *children) + return expr.UnaryFunction(dtype, name, options, *children) raise NotImplementedError( f"No handler for Expr function node with {name=}" ) # pragma: no cover; polars raises on the rust side for now diff --git a/python/cudf_polars/cudf_polars/testing/asserts.py b/python/cudf_polars/cudf_polars/testing/asserts.py index d37c96a15de..a79d45899cd 100644 --- a/python/cudf_polars/cudf_polars/testing/asserts.py +++ b/python/cudf_polars/cudf_polars/testing/asserts.py @@ -5,12 +5,11 @@ from __future__ import annotations -from functools import partial from typing import TYPE_CHECKING +from polars import GPUEngine from polars.testing.asserts import assert_frame_equal -from cudf_polars.callback import execute_with_cudf from cudf_polars.dsl.translate import translate_ir if TYPE_CHECKING: @@ -77,21 +76,13 @@ def assert_gpu_result_equal( NotImplementedError If GPU collection failed in some way. 
""" - if collect_kwargs is None: - collect_kwargs = {} - final_polars_collect_kwargs = collect_kwargs.copy() - final_cudf_collect_kwargs = collect_kwargs.copy() - if polars_collect_kwargs is not None: - final_polars_collect_kwargs.update(polars_collect_kwargs) - if cudf_collect_kwargs is not None: # pragma: no cover - # exclude from coverage since not used ATM - # but this is probably still useful - final_cudf_collect_kwargs.update(cudf_collect_kwargs) - expect = lazydf.collect(**final_polars_collect_kwargs) - got = lazydf.collect( - **final_cudf_collect_kwargs, - post_opt_callback=partial(execute_with_cudf, raise_on_fail=True), + final_polars_collect_kwargs, final_cudf_collect_kwargs = _process_kwargs( + collect_kwargs, polars_collect_kwargs, cudf_collect_kwargs ) + + expect = lazydf.collect(**final_polars_collect_kwargs) + engine = GPUEngine(raise_on_fail=True) + got = lazydf.collect(**final_cudf_collect_kwargs, engine=engine) assert_frame_equal( expect, got, @@ -134,3 +125,94 @@ def assert_ir_translation_raises(q: pl.LazyFrame, *exceptions: type[Exception]) raise AssertionError(f"Translation DID NOT RAISE {exceptions}") from e else: raise AssertionError(f"Translation DID NOT RAISE {exceptions}") + + +def _process_kwargs( + collect_kwargs: dict[OptimizationArgs, bool] | None, + polars_collect_kwargs: dict[OptimizationArgs, bool] | None, + cudf_collect_kwargs: dict[OptimizationArgs, bool] | None, +) -> tuple[dict[OptimizationArgs, bool], dict[OptimizationArgs, bool]]: + if collect_kwargs is None: + collect_kwargs = {} + final_polars_collect_kwargs = collect_kwargs.copy() + final_cudf_collect_kwargs = collect_kwargs.copy() + if polars_collect_kwargs is not None: # pragma: no cover; not currently used + final_polars_collect_kwargs.update(polars_collect_kwargs) + if cudf_collect_kwargs is not None: # pragma: no cover; not currently used + final_cudf_collect_kwargs.update(cudf_collect_kwargs) + return final_polars_collect_kwargs, final_cudf_collect_kwargs + + +def assert_collect_raises( + lazydf: pl.LazyFrame, + *, + polars_except: type[Exception] | tuple[type[Exception], ...], + cudf_except: type[Exception] | tuple[type[Exception], ...], + collect_kwargs: dict[OptimizationArgs, bool] | None = None, + polars_collect_kwargs: dict[OptimizationArgs, bool] | None = None, + cudf_collect_kwargs: dict[OptimizationArgs, bool] | None = None, +): + """ + Assert that collecting the result of a query raises the expected exceptions. + + Parameters + ---------- + lazydf + frame to collect. + collect_kwargs + Common keyword arguments to pass to collect for both polars CPU and + cudf-polars. + Useful for controlling optimization settings. + polars_except + Exception or exceptions polars CPU is expected to raise. + cudf_except + Exception or exceptions polars GPU is expected to raise. + collect_kwargs + Common keyword arguments to pass to collect for both polars CPU and + cudf-polars. + Useful for controlling optimization settings. + polars_collect_kwargs + Keyword arguments to pass to collect for execution on polars CPU. + Overrides kwargs in collect_kwargs. + Useful for controlling optimization settings. + cudf_collect_kwargs + Keyword arguments to pass to collect for execution on cudf-polars. + Overrides kwargs in collect_kwargs. + Useful for controlling optimization settings. + + Returns + ------- + None + If both sides raise the expected exceptions. + + Raises + ------ + AssertionError + If either side did not raise the expected exceptions. 
+    """
+    final_polars_collect_kwargs, final_cudf_collect_kwargs = _process_kwargs(
+        collect_kwargs, polars_collect_kwargs, cudf_collect_kwargs
+    )
+
+    try:
+        lazydf.collect(**final_polars_collect_kwargs)
+    except polars_except:
+        pass
+    except Exception as e:
+        raise AssertionError(
+            f"CPU execution RAISED {type(e)}, EXPECTED {polars_except}"
+        ) from e
+    else:
+        raise AssertionError(f"CPU execution DID NOT RAISE {polars_except}")
+
+    engine = GPUEngine(raise_on_fail=True)
+    try:
+        lazydf.collect(**final_cudf_collect_kwargs, engine=engine)
+    except cudf_except:
+        pass
+    except Exception as e:
+        raise AssertionError(
+            f"GPU execution RAISED {type(e)}, EXPECTED {cudf_except}"
+        ) from e
+    else:
+        raise AssertionError(f"GPU execution DID NOT RAISE {cudf_except}")
diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py
new file mode 100644
index 00000000000..c40d59e6d33
--- /dev/null
+++ b/python/cudf_polars/cudf_polars/testing/plugin.py
@@ -0,0 +1,154 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Plugin for running polars test suite setting GPU engine as default."""
+
+from __future__ import annotations
+
+from functools import partialmethod
+from typing import TYPE_CHECKING
+
+import pytest
+
+import polars
+
+if TYPE_CHECKING:
+    from collections.abc import Mapping
+
+
+def pytest_addoption(parser: pytest.Parser):
+    """Add plugin-specific options."""
+    group = parser.getgroup(
+        "cudf-polars", "Plugin to set GPU as default engine for polars tests"
+    )
+    group.addoption(
+        "--cudf-polars-no-fallback",
+        action="store_true",
+        help="Turn off fallback to CPU when running tests (default: use fallback)",
+    )
+
+
+def pytest_configure(config: pytest.Config):
+    """Enable use of this module as a pytest plugin to enable GPU collection."""
+    no_fallback = config.getoption("--cudf-polars-no-fallback")
+    collect = polars.LazyFrame.collect
+    engine = polars.GPUEngine(raise_on_fail=no_fallback)
+    polars.LazyFrame.collect = partialmethod(collect, engine=engine)
+    config.addinivalue_line(
+        "filterwarnings",
+        "ignore:.*GPU engine does not support streaming or background collection",
+    )
+    config.addinivalue_line(
+        "filterwarnings",
+        "ignore:.*Query execution with GPU not supported",
+    )
+
+
+EXPECTED_FAILURES: Mapping[str, str] = {
+    "tests/unit/io/test_csv.py::test_compressed_csv": "Need to determine if file is compressed",
+    "tests/unit/io/test_csv.py::test_read_csv_only_loads_selected_columns": "Memory usage won't be correct due to GPU",
+    "tests/unit/io/test_lazy_count_star.py::test_count_compressed_csv_18057": "Need to determine if file is compressed",
+    "tests/unit/io/test_lazy_csv.py::test_scan_csv_slice_offset_zero": "Integer overflow in sliced read",
+    "tests/unit/io/test_lazy_parquet.py::test_parquet_is_in_statistics": "Debug output on stderr doesn't match",
+    "tests/unit/io/test_lazy_parquet.py::test_parquet_statistics": "Debug output on stderr doesn't match",
+    "tests/unit/io/test_lazy_parquet.py::test_parquet_different_schema[False]": "Needs cudf#16394",
+    "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_mismatch_panic_17067[False]": "Needs cudf#16394",
+    "tests/unit/io/test_lazy_parquet.py::test_parquet_slice_pushdown_non_zero_offset[False]": "Thrift data not handled correctly/slice pushdown wrong?",
+    "tests/unit/io/test_parquet.py::test_read_parquet_only_loads_selected_columns_15098": "Memory usage won't be correct due to GPU",
+
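For orientation, the collect override installed by `pytest_configure` above can be sketched as a standalone snippet. This is an illustrative reconstruction, not an extra part of the plugin, and it assumes cudf-polars is installed so that GPU collection is actually available:

```python
# Sketch of the plugin's approach: bind a default engine onto LazyFrame.collect
# so that every collect() call in the polars test suite uses the GPU engine.
from functools import partialmethod

import polars


def enable_gpu_collection(raise_on_fail: bool = False) -> None:
    original_collect = polars.LazyFrame.collect
    engine = polars.GPUEngine(raise_on_fail=raise_on_fail)
    # Keyword arguments supplied at call time override those bound here, so
    # individual tests can still pass engine=... explicitly.
    polars.LazyFrame.collect = partialmethod(original_collect, engine=engine)


enable_gpu_collection()
q = polars.LazyFrame({"a": [1, 2, 3]}).select(polars.col("a") * 2)
print(q.collect())  # collected with the GPU engine, falling back to CPU on failure
```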
"tests/unit/io/test_scan.py::test_scan[single-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_limit[single-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_filter[single-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_filter_and_limit[single-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_limit_and_filter[single-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_and_limit[single-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_and_filter[single-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_limit_and_filter[single-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan[glob-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_limit[glob-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_filter[glob-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_filter_and_limit[glob-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_limit_and_filter[glob-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_and_limit[glob-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_and_filter[glob-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_limit_and_filter[glob-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_limit[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_filter[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_filter_and_limit[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_limit_and_filter[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_and_limit[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_and_filter[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_limit_and_filter[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_projected_out[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_filter_and_limit[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_limit[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_filter[single-parquet-async]": "Debug output on stderr doesn't match", + 
"tests/unit/io/test_scan.py::test_scan_with_filter_and_limit[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_limit_and_filter[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_and_limit[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_and_filter[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_limit_and_filter[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_projected_out[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_filter_and_limit[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_parquet-write_parquet]": "Need to add include_file_path to IR", + "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_csv-write_csv]": "Need to add include_file_path to IR", + "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_ndjson-write_ndjson]": "Need to add include_file_path to IR", + "tests/unit/lazyframe/test_engine_selection.py::test_engine_import_error_raises[gpu]": "Expect this to pass because cudf-polars is installed", + "tests/unit/lazyframe/test_engine_selection.py::test_engine_import_error_raises[engine1]": "Expect this to pass because cudf-polars is installed", + "tests/unit/lazyframe/test_lazyframe.py::test_round[dtype1-123.55-1-123.6]": "Rounding midpoints is handled incorrectly", + "tests/unit/lazyframe/test_lazyframe.py::test_cast_frame": "Casting that raises not supported on GPU", + "tests/unit/lazyframe/test_lazyframe.py::test_lazy_cache_hit": "Debug output on stderr doesn't match", + "tests/unit/operations/aggregation/test_aggregations.py::test_duration_function_literal": "Broadcasting inside groupby-agg not supported", + "tests/unit/operations/aggregation/test_aggregations.py::test_sum_empty_and_null_set": "libcudf sums column of all nulls to null, not zero", + "tests/unit/operations/aggregation/test_aggregations.py::test_binary_op_agg_context_no_simplify_expr_12423": "groupby-agg of just literals should not produce collect_list", + "tests/unit/operations/aggregation/test_aggregations.py::test_nan_inf_aggregation": "treatment of nans and nulls together is different in libcudf and polars in groupby-agg context", + "tests/unit/operations/test_abs.py::test_abs_duration": "Need to raise for unsupported uops on timelike values", + "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input7-expected7-Float32-Float32]": "Mismatching dtypes, needs cudf#15852", + "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input10-expected10-Date-output_dtype10]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input11-expected11-input_dtype11-output_dtype11]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input12-expected12-input_dtype12-output_dtype12]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input13-expected13-input_dtype13-output_dtype13]": "Unsupported groupby-agg for a particular dtype", + 
"tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input7-expected7-Float32-Float32]": "Mismatching dtypes, needs cudf#15852", + "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input10-expected10-Date-output_dtype10]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input11-expected11-input_dtype11-output_dtype11]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input12-expected12-input_dtype12-output_dtype12]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input13-expected13-input_dtype13-output_dtype13]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input14-expected14-input_dtype14-output_dtype14]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input15-expected15-input_dtype15-output_dtype15]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input16-expected16-input_dtype16-output_dtype16]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_binary_agg_with_literal": "Incorrect broadcasting of literals in groupby-agg", + "tests/unit/operations/test_group_by.py::test_aggregated_scalar_elementwise_15602": "Unsupported boolean function/dtype combination in groupby-agg", + "tests/unit/operations/test_group_by.py::test_schemas[data1-expr1-expected_select1-expected_gb1]": "Mismatching dtypes, needs cudf#15852", + "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_by_monday_and_offset_5444": "IR needs to expose groupby-dynamic information", + "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_label[left-expected0]": "IR needs to expose groupby-dynamic information", + "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_label[right-expected1]": "IR needs to expose groupby-dynamic information", + "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_label[datapoint-expected2]": "IR needs to expose groupby-dynamic information", + "tests/unit/operations/test_group_by_dynamic.py::test_rolling_dynamic_sortedness_check": "IR needs to expose groupby-dynamic information", + "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_validation": "IR needs to expose groupby-dynamic information", + "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_15225": "IR needs to expose groupby-dynamic information", + "tests/unit/operations/test_join.py::test_cross_join_slice_pushdown": "Need to implement slice pushdown for cross joins", + "tests/unit/sql/test_cast.py::test_cast_errors[values0-values::uint8-conversion from `f64` to `u64` failed]": "Casting that raises not supported on GPU", + "tests/unit/sql/test_cast.py::test_cast_errors[values1-values::uint4-conversion from `i64` to `u32` failed]": "Casting that raises not supported on GPU", + "tests/unit/sql/test_cast.py::test_cast_errors[values2-values::int1-conversion from `i64` to `i8` failed]": "Casting that raises not supported on GPU", + "tests/unit/sql/test_miscellaneous.py::test_read_csv": "Incorrect handling of missing_is_null in read_csv", + 
"tests/unit/sql/test_wildcard_opts.py::test_select_wildcard_errors": "Raises correctly but with different exception", + "tests/unit/streaming/test_streaming_io.py::test_parquet_eq_statistics": "Debug output on stderr doesn't match", + "tests/unit/test_cse.py::test_cse_predicate_self_join": "Debug output on stderr doesn't match", + "tests/unit/test_empty.py::test_empty_9137": "Mismatching dtypes, needs cudf#15852", + # Maybe flaky, order-dependent? + "tests/unit/test_projections.py::test_schema_full_outer_join_projection_pd_13287": "Order-specific result check, query is correct but in different order", + "tests/unit/test_queries.py::test_group_by_agg_equals_zero_3535": "libcudf sums all nulls to null, not zero", +} + + +def pytest_collection_modifyitems( + session: pytest.Session, config: pytest.Config, items: list[pytest.Item] +): + """Mark known failing tests.""" + if config.getoption("--cudf-polars-no-fallback"): + # Don't xfail tests if running without fallback + return + for item in items: + if item.nodeid in EXPECTED_FAILURES: + item.add_marker(pytest.mark.xfail(reason=EXPECTED_FAILURES[item.nodeid])) diff --git a/python/cudf_polars/cudf_polars/typing/__init__.py b/python/cudf_polars/cudf_polars/typing/__init__.py index adab10bdded..240b11bdf59 100644 --- a/python/cudf_polars/cudf_polars/typing/__init__.py +++ b/python/cudf_polars/cudf_polars/typing/__init__.py @@ -84,6 +84,10 @@ def view_expression(self, n: int) -> Expr: """Convert the given expression to python rep.""" ... + def version(self) -> tuple[int, int]: + """The IR version as `(major, minor)`.""" + ... + def set_udf( self, callback: Callable[[list[str] | None, str | None, int | None], pl.DataFrame], diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index 7f6ea1edfd9..4154a404e98 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -13,7 +13,7 @@ import polars as pl -__all__ = ["from_polars", "downcast_arrow_lists"] +__all__ = ["from_polars", "downcast_arrow_lists", "can_cast"] def downcast_arrow_lists(typ: pa.DataType) -> pa.DataType: @@ -45,6 +45,28 @@ def downcast_arrow_lists(typ: pa.DataType) -> pa.DataType: return typ +def can_cast(from_: plc.DataType, to: plc.DataType) -> bool: + """ + Can we cast (via :func:`~.pylibcudf.unary.cast`) between two datatypes. 
+ + Parameters + ---------- + from_ + Source datatype + to + Target datatype + + Returns + ------- + True if casting is supported, False otherwise + """ + return ( + plc.traits.is_fixed_width(to) + and plc.traits.is_fixed_width(from_) + and plc.unary.is_supported_cast(from_, to) + ) + + @cache def from_polars(dtype: pl.DataType) -> plc.DataType: """ diff --git a/python/cudf_polars/cudf_polars/utils/versions.py b/python/cudf_polars/cudf_polars/utils/versions.py index 9807cffb384..2e6efde968c 100644 --- a/python/cudf_polars/cudf_polars/utils/versions.py +++ b/python/cudf_polars/cudf_polars/utils/versions.py @@ -12,18 +12,11 @@ POLARS_VERSION = parse(__version__) -POLARS_VERSION_GE_10 = POLARS_VERSION >= parse("1.0") -POLARS_VERSION_GE_11 = POLARS_VERSION >= parse("1.1") -POLARS_VERSION_GE_12 = POLARS_VERSION >= parse("1.2") -POLARS_VERSION_GE_121 = POLARS_VERSION >= parse("1.2.1") -POLARS_VERSION_GT_10 = POLARS_VERSION > parse("1.0") -POLARS_VERSION_GT_11 = POLARS_VERSION > parse("1.1") -POLARS_VERSION_GT_12 = POLARS_VERSION > parse("1.2") - -POLARS_VERSION_LE_12 = POLARS_VERSION <= parse("1.2") -POLARS_VERSION_LE_11 = POLARS_VERSION <= parse("1.1") -POLARS_VERSION_LT_12 = POLARS_VERSION < parse("1.2") -POLARS_VERSION_LT_11 = POLARS_VERSION < parse("1.1") - -if POLARS_VERSION < parse("1.0"): # pragma: no cover - raise ImportError("cudf_polars requires py-polars v1.0 or greater.") +POLARS_VERSION_GE_16 = POLARS_VERSION >= parse("1.6") +POLARS_VERSION_GT_16 = POLARS_VERSION > parse("1.6") +POLARS_VERSION_LT_16 = POLARS_VERSION < parse("1.6") + +if POLARS_VERSION_LT_16: + raise ImportError( + "cudf_polars requires py-polars v1.6 or greater." + ) # pragma: no cover diff --git a/python/cudf_polars/docs/overview.md b/python/cudf_polars/docs/overview.md index 6cd36136bf8..103ac1a674e 100644 --- a/python/cudf_polars/docs/overview.md +++ b/python/cudf_polars/docs/overview.md @@ -15,8 +15,10 @@ You will need: ## Installing polars -We will need to build polars from source. Until things settle down, -live at `HEAD`. +`cudf-polars` works with polars >= 1.3, as long as the internal IR +version doesn't get a major version bump. So `pip install polars>=1.3` +should work. For development, if we're adding things to the polars +side of things, we will need to build polars from source: ```sh git clone https://github.com/pola-rs/polars @@ -59,7 +61,7 @@ The executor for the polars logical plan lives in the cudf repo, in ```sh cd cudf/python/cudf_polars -uv pip install --no-build-isolation --no-deps -e . +pip install --no-build-isolation --no-deps -e . ``` You should now be able to run the tests in the `cudf_polars` package: @@ -69,16 +71,18 @@ pytest -v tests # Executor design -The polars `LazyFrame.collect` functionality offers a -"post-optimization" callback that may be used by a third party library -to replace a node (or more, though we only replace a single node) in the -optimized logical plan with a Python callback that is to deliver the -result of evaluating the plan. This splits the execution of the plan -into two phases. First, a symbolic phase which translates to our -internal representation (IR). Second, an execution phase which executes -using our IR. - -The translation phase receives the a low-level Rust `NodeTraverse` +The polars `LazyFrame.collect` functionality offers configuration of +the engine to use for collection through the `engine` argument. 
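For reference, a quick sketch of how the `can_cast` helper added to `cudf_polars/utils/dtypes.py` above behaves; the `plc.DataType`/`plc.TypeId` spellings are assumed from how pylibcudf is used elsewhere in cudf-polars:

```python
import pylibcudf as plc

from cudf_polars.utils.dtypes import can_cast

int32 = plc.DataType(plc.TypeId.INT32)  # assumed constructor spelling
float64 = plc.DataType(plc.TypeId.FLOAT64)
string = plc.DataType(plc.TypeId.STRING)

# Both types are fixed-width and libcudf supports the cast: allowed.
assert can_cast(int32, float64)
# Strings are not fixed-width, so can_cast is conservative and refuses.
assert not can_cast(string, int32)
```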
At a
+low level, this provides for configuration of a "post-optimization"
+callback that may be used by a third party library to replace a node
+(or more, though we only replace a single node) in the optimized
+logical plan with a Python callback that is to deliver the result of
+evaluating the plan. This splits the execution of the plan into two
+phases. First, a symbolic phase which translates to our internal
+representation (IR). Second, an execution phase which executes using
+our IR.
+
+The translation phase receives a low-level Rust `NodeTraverser`
 object which delivers Python representations of the plan nodes (and
 expressions) one at a time. During translation, we endeavour to raise
 `NotImplementedError` for any unsupported functionality. This way, if
@@ -86,33 +90,60 @@ we can't execute something, we just don't modify the logical plan at
 all: if we can translate the IR, it is assumed that evaluation will
 later succeed.
 
-The usage of the cudf-based executor is therefore, at present:
+The cudf-based executor is therefore selected by requesting the
+GPU engine:
 
 ```python
-from cudf_polars.callback import execute_with_cudf
+import polars as pl
 
-result = q.collect(post_opt_callback=execute_with_cudf)
+result = q.collect(engine="gpu")
 ```
 
 This should either transparently run on the GPU and deliver a polars
 dataframe, or else fail (but be handled) and just run the normal CPU
-execution.
+execution. If `POLARS_VERBOSE` is true, then fallback is logged with a
+`PerformanceWarning`.
 
-If you want to fail during translation, set the keyword argument
-`raise_on_fail` to `True`:
+As well as a string argument, the engine can also be specified with a
+polars `GPUEngine` object. This allows passing more configuration in.
+Currently, the public properties are `device`, to select the device,
+and `memory_resource`, to select the RMM memory resource used for
+allocations during the collection phase.
+For example:
 
 ```python
-from functools import partial
-from cudf_polars.callback import execute_with_cudf
+import polars as pl
 
-result = q.collect(
-    post_opt_callback=partial(execute_with_cudf, raise_on_fail=True)
-)
+result = q.collect(engine=pl.GPUEngine(device=1, memory_resource=mr))
+```
+
+This uses device 1 and the given memory resource. Note that the memory
+resource provided _must_ be valid for allocations on the specified
+device; no checking is performed.
+
+For debugging purposes, we can also pass undocumented keyword
+arguments. At the moment, `raise_on_fail` is supported, which raises
+during translation rather than falling back:
+
+```python
+
+result = q.collect(engine=pl.GPUEngine(raise_on_fail=True))
 ```
 
 This is mostly useful when writing tests, since in that case we want
 any failures to propagate, rather than falling back to the CPU mode.
 
+## IR versioning
+
+On the polars side, the `NodeTraverser` object advertises an internal
+version (via `NodeTraverser.version()` as a `(major, minor)` tuple).
+`minor` version bumps are for backwards compatible changes (e.g.
+exposing new nodes), whereas `major` bumps are for incompatible
+changes. We can therefore attempt to detect the IR version
+(independently of the polars version) and dispatch, or error
+appropriately. This should be done during IR translation in
+`translate.py`.
+
 ## Adding a handler for a new plan node
 
 Plan node definitions live in `cudf_polars/dsl/ir.py`, these are
@@ -175,7 +206,7 @@ around their pylibcudf counterparts. We have four (in
 1. `Scalar` (a wrapper around a pylibcudf `Scalar`)
 2.
`Column` (a wrapper around a pylibcudf `Column`) -3. `NamedColumn` a `Column` with an additional name +3. `NamedColumn` (a `Column` with an additional name) 4. `DataFrame` (a wrapper around a pylibcudf `Table`) The interfaces offered by these are somewhat in flux, but broadly diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index 984b5487b98..857a8c14b2f 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -19,7 +19,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ - "polars>=1.0,<1.3", + "polars>=1.6", "pylibcudf==24.10.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ @@ -58,6 +58,9 @@ exclude_also = [ "class .*\\bProtocol\\):", "assert_never\\(" ] +# The cudf_polars test suite doesn't exercise the plugin, so we omit +# it from coverage checks. +omit = ["cudf_polars/testing/plugin.py"] [tool.ruff] line-length = 88 diff --git a/python/cudf_polars/tests/containers/test_dataframe.py b/python/cudf_polars/tests/containers/test_dataframe.py index 6b470268084..39fb44d55a5 100644 --- a/python/cudf_polars/tests/containers/test_dataframe.py +++ b/python/cudf_polars/tests/containers/test_dataframe.py @@ -9,6 +9,7 @@ import polars as pl from cudf_polars.containers import DataFrame, NamedColumn +from cudf_polars.testing.asserts import assert_gpu_result_equal def test_select_missing_raises(): @@ -140,3 +141,13 @@ def test_sorted_flags_preserved(with_nulls, nulls_last): assert b.null_order == b_null_order assert c.is_sorted == plc.types.Sorted.NO assert df.flags == gf.to_polars().flags + + +def test_empty_name_roundtrips_overlap(): + df = pl.LazyFrame({"": [1, 2, 3], "column_0": [4, 5, 6]}) + assert_gpu_result_equal(df) + + +def test_empty_name_roundtrips_no_overlap(): + df = pl.LazyFrame({"": [1, 2, 3], "b": [4, 5, 6]}) + assert_gpu_result_equal(df) diff --git a/python/cudf_polars/tests/expressions/test_agg.py b/python/cudf_polars/tests/expressions/test_agg.py index 245bde3acab..56055f4c6c2 100644 --- a/python/cudf_polars/tests/expressions/test_agg.py +++ b/python/cudf_polars/tests/expressions/test_agg.py @@ -7,15 +7,38 @@ import polars as pl from cudf_polars.dsl import expr -from cudf_polars.testing.asserts import assert_gpu_result_equal +from cudf_polars.testing.asserts import ( + assert_gpu_result_equal, + assert_ir_translation_raises, +) -@pytest.fixture(params=sorted(expr.Agg._SUPPORTED)) +@pytest.fixture( + params=[ + # regular aggs from Agg + "min", + "max", + "median", + "n_unique", + "first", + "last", + "mean", + "sum", + "count", + "std", + "var", + # scan aggs from UnaryFunction + "cum_min", + "cum_max", + "cum_prod", + "cum_sum", + ] +) def agg(request): return request.param -@pytest.fixture(params=[pl.Int32, pl.Float32, pl.Int16]) +@pytest.fixture(params=[pl.Int32, pl.Float32, pl.Int16, pl.Int8, pl.UInt16]) def dtype(request): return request.param @@ -34,6 +57,11 @@ def df(dtype, with_nulls, is_sorted): if is_sorted: values = sorted(values, key=lambda x: -1000 if x is None else x) + if dtype.is_unsigned_integer(): + values = pl.Series(values).abs() + if is_sorted: + values = values.sort() + df = pl.LazyFrame({"a": values}, schema={"a": dtype}) if is_sorted: return df.set_sorted("a") @@ -52,6 +80,51 @@ def test_agg(df, agg): assert_gpu_result_equal(q, check_dtypes=check_dtypes, check_exact=False) +def test_bool_agg(agg, request): + if agg 
== "cum_min" or agg == "cum_max": + pytest.skip("Does not apply") + request.applymarker( + pytest.mark.xfail( + condition=agg == "n_unique", + reason="Wrong dtype we get Int32, polars gets UInt32", + ) + ) + df = pl.LazyFrame({"a": [True, False, None, True]}) + expr = getattr(pl.col("a"), agg)() + q = df.select(expr) + + assert_gpu_result_equal(q) + + +@pytest.mark.parametrize("cum_agg", expr.UnaryFunction._supported_cum_aggs) +def test_cum_agg_reverse_unsupported(cum_agg): + df = pl.LazyFrame({"a": [1, 2, 3]}) + expr = getattr(pl.col("a"), cum_agg)(reverse=True) + q = df.select(expr) + + assert_ir_translation_raises(q, NotImplementedError) + + +@pytest.mark.parametrize("q", [0.5, pl.lit(0.5)]) +@pytest.mark.parametrize("interp", ["nearest", "higher", "lower", "midpoint", "linear"]) +def test_quantile(df, q, interp): + expr = pl.col("a").quantile(q, interp) + q = df.select(expr) + + # https://github.com/rapidsai/cudf/issues/15852 + check_dtypes = q.collect_schema()["a"] == pl.Float64 + if not check_dtypes: + with pytest.raises(AssertionError): + assert_gpu_result_equal(q) + assert_gpu_result_equal(q, check_dtypes=check_dtypes, check_exact=False) + + +def test_quantile_invalid_q(df): + expr = pl.col("a").quantile(pl.col("a")) + q = df.select(expr) + assert_ir_translation_raises(q, NotImplementedError) + + @pytest.mark.parametrize( "op", [pl.Expr.min, pl.Expr.nan_min, pl.Expr.max, pl.Expr.nan_max] ) diff --git a/python/cudf_polars/tests/expressions/test_booleanfunction.py b/python/cudf_polars/tests/expressions/test_booleanfunction.py index 97421008669..2347021c40e 100644 --- a/python/cudf_polars/tests/expressions/test_booleanfunction.py +++ b/python/cudf_polars/tests/expressions/test_booleanfunction.py @@ -17,15 +17,11 @@ def has_nulls(request): return request.param -@pytest.mark.parametrize( - "ignore_nulls", - [ - pytest.param( - False, marks=pytest.mark.xfail(reason="No support for Kleene logic") - ), - True, - ], -) +@pytest.fixture(params=[False, True], ids=["include_nulls", "ignore_nulls"]) +def ignore_nulls(request): + return request.param + + def test_booleanfunction_reduction(ignore_nulls): ldf = pl.LazyFrame( { @@ -43,6 +39,25 @@ def test_booleanfunction_reduction(ignore_nulls): assert_gpu_result_equal(query) +@pytest.mark.parametrize("expr", [pl.Expr.any, pl.Expr.all]) +def test_booleanfunction_all_any_kleene(expr, ignore_nulls): + ldf = pl.LazyFrame( + { + "a": [False, None], + "b": [False, False], + "c": [False, True], + "d": [None, False], + "e": pl.Series([None, None], dtype=pl.Boolean()), + "f": [None, True], + "g": [True, False], + "h": [True, None], + "i": [True, True], + } + ) + q = ldf.select(expr(pl.col("*"), ignore_nulls=ignore_nulls)) + assert_gpu_result_equal(q) + + @pytest.mark.parametrize( "expr", [ @@ -54,14 +69,7 @@ def test_booleanfunction_reduction(ignore_nulls): ids=lambda f: f"{f.__name__}()", ) @pytest.mark.parametrize("has_nans", [False, True], ids=["no_nans", "nans"]) -def test_boolean_function_unary(request, expr, has_nans, has_nulls): - if has_nulls and expr in (pl.Expr.is_nan, pl.Expr.is_not_nan): - request.applymarker( - pytest.mark.xfail( - reason="Need to copy null mask since is_{not_}nan(null) => null" - ) - ) - +def test_boolean_function_unary(expr, has_nans, has_nulls): values: list[float | None] = [1, 2, 3, 4, 5] if has_nans: values[3] = float("nan") @@ -119,9 +127,7 @@ def test_boolean_isbetween(closed, bounds): "expr", [pl.any_horizontal("*"), pl.all_horizontal("*")], ids=["any", "all"] ) @pytest.mark.parametrize("wide", [False, True], 
ids=["narrow", "wide"]) -def test_boolean_horizontal(request, expr, has_nulls, wide): - if has_nulls: - request.applymarker(pytest.mark.xfail(reason="No support for Kleene logic")) +def test_boolean_horizontal(expr, has_nulls, wide): ldf = pl.LazyFrame( { "a": [False, False, False, False, False, True], @@ -164,6 +170,18 @@ def test_boolean_is_in(expr): assert_gpu_result_equal(q) +@pytest.mark.parametrize("expr", [pl.Expr.and_, pl.Expr.or_, pl.Expr.xor]) +def test_boolean_kleene_logic(expr): + ldf = pl.LazyFrame( + { + "a": [False, False, False, None, None, None, True, True, True], + "b": [False, None, True, False, None, True, False, None, True], + } + ) + q = ldf.select(expr(pl.col("a"), pl.col("b"))) + assert_gpu_result_equal(q) + + def test_boolean_is_in_raises_unsupported(): ldf = pl.LazyFrame({"a": pl.Series([1, 2, 3], dtype=pl.Int64)}) q = ldf.select(pl.col("a").is_in(pl.lit(1, dtype=pl.Int32()))) diff --git a/python/cudf_polars/tests/expressions/test_datetime_basic.py b/python/cudf_polars/tests/expressions/test_datetime_basic.py index 218101bf87c..c6ea29ddd38 100644 --- a/python/cudf_polars/tests/expressions/test_datetime_basic.py +++ b/python/cudf_polars/tests/expressions/test_datetime_basic.py @@ -9,7 +9,11 @@ import polars as pl -from cudf_polars.testing.asserts import assert_gpu_result_equal +from cudf_polars.dsl.expr import TemporalFunction +from cudf_polars.testing.asserts import ( + assert_gpu_result_equal, + assert_ir_translation_raises, +) @pytest.mark.parametrize( @@ -37,26 +41,97 @@ def test_datetime_dataframe_scan(dtype): assert_gpu_result_equal(query) +datetime_extract_fields = [ + "year", + "month", + "day", + "weekday", + "hour", + "minute", + "second", + "millisecond", + "microsecond", + "nanosecond", +] + + +@pytest.fixture( + ids=datetime_extract_fields, + params=[methodcaller(f) for f in datetime_extract_fields], +) +def field(request): + return request.param + + +def test_datetime_extract(field): + ldf = pl.LazyFrame( + { + "datetimes": pl.datetime_range( + datetime.datetime(2020, 1, 1), + datetime.datetime(2021, 12, 30), + "3mo14h15s11ms33us999ns", + eager=True, + ) + } + ) + + q = ldf.select(field(pl.col("datetimes").dt)) + + assert_gpu_result_equal(q) + + +def test_datetime_extra_unsupported(monkeypatch): + ldf = pl.LazyFrame( + { + "datetimes": pl.datetime_range( + datetime.datetime(2020, 1, 1), + datetime.datetime(2021, 12, 30), + "3mo14h15s11ms33us999ns", + eager=True, + ) + } + ) + + def unsupported_name_setter(self, value): + pass + + def unsupported_name_getter(self): + return "unsupported" + + monkeypatch.setattr( + TemporalFunction, + "name", + property(unsupported_name_getter, unsupported_name_setter), + ) + + q = ldf.select(pl.col("datetimes").dt.nanosecond()) + + assert_ir_translation_raises(q, NotImplementedError) + + @pytest.mark.parametrize( "field", [ methodcaller("year"), - pytest.param( - methodcaller("day"), - marks=pytest.mark.xfail(reason="day extraction not implemented"), - ), + methodcaller("month"), + methodcaller("day"), + methodcaller("weekday"), ], ) -def test_datetime_extract(field): +def test_date_extract(field): + ldf = pl.LazyFrame( + { + "dates": [ + datetime.date(2024, 1, 1), + datetime.date(2024, 10, 11), + ] + } + ) + ldf = pl.LazyFrame( {"dates": [datetime.date(2024, 1, 1), datetime.date(2024, 10, 11)]} ) - q = ldf.select(field(pl.col("dates").dt)) - with pytest.raises(AssertionError): - # polars produces int32, libcudf produces int16 for the year extraction - # libcudf can lose data here. 
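The datetime tests above exercise the newly supported `dt` field extraction (with the cast inserted during translation so result dtypes match polars); a small end-to-end sketch with arbitrary dates:

```python
import datetime

import polars as pl

q = pl.LazyFrame(
    {"dates": [datetime.date(2024, 1, 1), datetime.date(2024, 10, 11)]}
).select(
    year=pl.col("dates").dt.year(),
    month=pl.col("dates").dt.month(),
    weekday=pl.col("dates").dt.weekday(),
)
# The GPU engine should agree with the CPU engine, including output dtypes,
# because translation wraps these extractions in a cast where needed.
print(q.collect(engine="gpu"))
```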
- # https://github.com/rapidsai/cudf/issues/16196 - assert_gpu_result_equal(q) + q = ldf.select(field(pl.col("dates").dt)) - assert_gpu_result_equal(q, check_dtypes=False) + assert_gpu_result_equal(q) diff --git a/python/cudf_polars/tests/expressions/test_gather.py b/python/cudf_polars/tests/expressions/test_gather.py index 6bffa3e252c..f7c5d1bf2cd 100644 --- a/python/cudf_polars/tests/expressions/test_gather.py +++ b/python/cudf_polars/tests/expressions/test_gather.py @@ -6,7 +6,6 @@ import polars as pl -from cudf_polars import execute_with_cudf from cudf_polars.testing.asserts import assert_gpu_result_equal @@ -47,4 +46,4 @@ def test_gather_out_of_bounds(negative): query = ldf.select(pl.col("a").gather(pl.col("b"))) with pytest.raises(pl.exceptions.ComputeError): - query.collect(post_opt_callback=execute_with_cudf) + query.collect(engine="gpu") diff --git a/python/cudf_polars/tests/expressions/test_numeric_unaryops.py b/python/cudf_polars/tests/expressions/test_numeric_unaryops.py new file mode 100644 index 00000000000..ac3aecf88e6 --- /dev/null +++ b/python/cudf_polars/tests/expressions/test_numeric_unaryops.py @@ -0,0 +1,91 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import numpy as np +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.fixture( + params=[ + "sin", + "cos", + "tan", + "arcsin", + "arccos", + "arctan", + "sinh", + "cosh", + "tanh", + "arcsinh", + "arccosh", + "arctanh", + "exp", + "sqrt", + "cbrt", + "ceil", + "floor", + "abs", + ] +) +def op(request): + return request.param + + +@pytest.fixture(params=[pl.Int32, pl.Float32]) +def dtype(request): + return request.param + + +@pytest.fixture +def ldf(with_nulls, dtype): + values = [1, 2, 4, 5, -2, -4, 0] + if with_nulls: + values.append(None) + if dtype == pl.Float32: + values.append(-float("inf")) + values.append(float("nan")) + values.append(float("inf")) + elif dtype == pl.Int32: + iinfo = np.iinfo("int32") + values.append(iinfo.min) + values.append(iinfo.max) + return pl.LazyFrame( + { + "a": pl.Series(values, dtype=dtype), + "b": pl.Series([i - 4 for i in range(len(values))], dtype=pl.Float32), + } + ) + + +def test_unary(ldf, op): + expr = getattr(pl.col("a"), op)() + q = ldf.select(expr) + assert_gpu_result_equal(q, check_exact=False) + + +@pytest.mark.parametrize("base_literal", [False, True]) +@pytest.mark.parametrize("exponent_literal", [False, True]) +def test_pow(ldf, base_literal, exponent_literal): + base = pl.lit(2) if base_literal else pl.col("a") + exponent = pl.lit(-3, dtype=pl.Float32) if exponent_literal else pl.col("b") + + q = ldf.select(base.pow(exponent)) + + assert_gpu_result_equal(q, check_exact=False) + + +@pytest.mark.parametrize("natural", [True, False]) +def test_log(ldf, natural): + if natural: + expr = pl.col("a").log() + else: + expr = pl.col("a").log(10) + + q = ldf.select(expr) + + assert_gpu_result_equal(q, check_exact=False) diff --git a/python/cudf_polars/tests/expressions/test_stringfunction.py b/python/cudf_polars/tests/expressions/test_stringfunction.py index df08e15baa4..4f6850ac977 100644 --- a/python/cudf_polars/tests/expressions/test_stringfunction.py +++ b/python/cudf_polars/tests/expressions/test_stringfunction.py @@ -10,6 +10,7 @@ from cudf_polars import execute_with_cudf from cudf_polars.testing.asserts import ( + assert_collect_raises, assert_gpu_result_equal, assert_ir_translation_raises, ) @@ 
-152,3 +153,187 @@ def test_slice_column(slice_column_data): else: query = slice_column_data.select(pl.col("a").str.slice(pl.col("start"))) assert_ir_translation_raises(query, NotImplementedError) + + +@pytest.fixture +def to_datetime_data(): + return pl.LazyFrame( + { + "a": [ + "2021-01-01", + "2021-01-02", + "abcd", + ] + } + ) + + +@pytest.mark.parametrize("cache", [True, False], ids=lambda cache: f"{cache=}") +@pytest.mark.parametrize("strict", [True, False], ids=lambda strict: f"{strict=}") +@pytest.mark.parametrize("exact", [True, False], ids=lambda exact: f"{exact=}") +@pytest.mark.parametrize("format", ["%Y-%m-%d", None], ids=lambda format: f"{format=}") +def test_to_datetime(to_datetime_data, cache, strict, format, exact): + query = to_datetime_data.select( + pl.col("a").str.strptime( + pl.Datetime("ns"), format=format, cache=cache, strict=strict, exact=exact + ) + ) + if cache or format is None or not exact: + assert_ir_translation_raises(query, NotImplementedError) + elif strict: + assert_collect_raises( + query, + polars_except=pl.exceptions.InvalidOperationError, + cudf_except=pl.exceptions.ComputeError, + ) + else: + assert_gpu_result_equal(query) + + +@pytest.mark.parametrize( + "target, repl", + [("a", "a"), ("Wı", "☺"), ("FG", ""), ("doesnotexist", "blahblah")], # noqa: RUF001 +) +@pytest.mark.parametrize("n", [0, 3, -1]) +def test_replace_literal(ldf, target, repl, n): + query = ldf.select(pl.col("a").str.replace(target, repl, literal=True, n=n)) + assert_gpu_result_equal(query) + + +@pytest.mark.parametrize("target, repl", [("", ""), ("a", pl.col("a"))]) +def test_replace_literal_unsupported(ldf, target, repl): + query = ldf.select(pl.col("a").str.replace(target, repl, literal=True)) + assert_ir_translation_raises(query, NotImplementedError) + + +def test_replace_re(ldf): + query = ldf.select(pl.col("a").str.replace("A", "a", literal=False)) + assert_ir_translation_raises(query, NotImplementedError) + + +@pytest.mark.parametrize( + "target,repl", + [ + (["A", "de", "kLm", "awef"], "a"), + (["A", "de", "kLm", "awef"], ""), + (["A", "de", "kLm", "awef"], ["a", "b", "c", "d"]), + (["A", "de", "kLm", "awef"], ["a", "b", "c", ""]), + ( + pl.lit(pl.Series(["A", "de", "kLm", "awef"])), + pl.lit(pl.Series(["a", "b", "c", "d"])), + ), + ], +) +def test_replace_many(ldf, target, repl): + query = ldf.select(pl.col("a").str.replace_many(target, repl)) + + assert_gpu_result_equal(query) + + +@pytest.mark.parametrize( + "target,repl", + [(["A", ""], ["a", "b"]), (pl.col("a").drop_nulls(), pl.col("a").drop_nulls())], +) +def test_replace_many_notimplemented(ldf, target, repl): + query = ldf.select(pl.col("a").str.replace_many(target, repl)) + assert_ir_translation_raises(query, NotImplementedError) + + +def test_replace_many_ascii_case(ldf): + query = ldf.select( + pl.col("a").str.replace_many(["a", "b", "c"], "a", ascii_case_insensitive=True) + ) + + assert_ir_translation_raises(query, NotImplementedError) + + +_strip_data = [ + "AbC", + "123abc", + "", + " ", + None, + "aAaaaAAaa", + " ab c ", + "abc123", + " ", + "\tabc\t", + "\nabc\n", + "\r\nabc\r\n", + "\t\n abc \n\t", + "!@#$%^&*()", + " abc!!! ", + " abc\t\n!!! 
", + "__abc__", + "abc\n\n", + "123abc456", + "abcxyzabc", +] + +strip_chars = [ + "a", + "", + " ", + "\t", + "\n", + "\r\n", + "!", + "@#", + "123", + "xyz", + "abc", + "__", + " \t\n", + "abc123", + None, +] + + +@pytest.fixture +def strip_ldf(): + return pl.DataFrame({"a": _strip_data}).lazy() + + +@pytest.fixture(params=strip_chars) +def to_strip(request): + return request.param + + +def test_strip_chars(strip_ldf, to_strip): + q = strip_ldf.select(pl.col("a").str.strip_chars(to_strip)) + assert_gpu_result_equal(q) + + +def test_strip_chars_start(strip_ldf, to_strip): + q = strip_ldf.select(pl.col("a").str.strip_chars_start(to_strip)) + assert_gpu_result_equal(q) + + +def test_strip_chars_end(strip_ldf, to_strip): + q = strip_ldf.select(pl.col("a").str.strip_chars_end(to_strip)) + assert_gpu_result_equal(q) + + +def test_strip_chars_column(strip_ldf): + q = strip_ldf.select(pl.col("a").str.strip_chars(pl.col("a"))) + assert_ir_translation_raises(q, NotImplementedError) + + +def test_invalid_regex_raises(): + df = pl.LazyFrame({"a": ["abc"]}) + + q = df.select(pl.col("a").str.contains(r"ab)", strict=True)) + + assert_collect_raises( + q, + polars_except=pl.exceptions.ComputeError, + cudf_except=pl.exceptions.ComputeError, + ) + + +@pytest.mark.parametrize("pattern", ["a{1000}", "a(?i:B)"]) +def test_unsupported_regex_raises(pattern): + df = pl.LazyFrame({"a": ["abc"]}) + + q = df.select(pl.col("a").str.contains(pattern, strict=True)) + assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/test_config.py b/python/cudf_polars/tests/test_config.py index 5b4bba55552..3c3986be19b 100644 --- a/python/cudf_polars/tests/test_config.py +++ b/python/cudf_polars/tests/test_config.py @@ -6,6 +6,9 @@ import pytest import polars as pl +from polars.testing.asserts import assert_frame_equal + +import rmm from cudf_polars.dsl.ir import IR from cudf_polars.testing.asserts import ( @@ -32,3 +35,48 @@ def raise_unimplemented(self): ): # And ensure that collecting issues the correct warning. 
assert_gpu_result_equal(q) + + +def test_unsupported_config_raises(): + q = pl.LazyFrame({}) + + with pytest.raises(pl.exceptions.ComputeError): + q.collect(engine=pl.GPUEngine(unknown_key=True)) + + +@pytest.mark.parametrize("device", [-1, "foo"]) +def test_invalid_device_raises(device): + q = pl.LazyFrame({}) + with pytest.raises(pl.exceptions.ComputeError): + q.collect(engine=pl.GPUEngine(device=device)) + + +@pytest.mark.parametrize("mr", [1, object()]) +def test_invalid_memory_resource_raises(mr): + q = pl.LazyFrame({}) + with pytest.raises(pl.exceptions.ComputeError): + q.collect(engine=pl.GPUEngine(memory_resource=mr)) + + +def test_explicit_device_zero(): + q = pl.LazyFrame({"a": [1, 2, 3]}) + + result = q.collect(engine=pl.GPUEngine(device=0)) + assert_frame_equal(q.collect(), result) + + +def test_explicit_memory_resource(): + upstream = rmm.mr.CudaMemoryResource() + n_allocations = 0 + + def allocate(bytes, stream): + nonlocal n_allocations + n_allocations += 1 + return upstream.allocate(bytes, stream) + + mr = rmm.mr.CallbackMemoryResource(allocate, upstream.deallocate) + + q = pl.LazyFrame({"a": [1, 2, 3]}) + result = q.collect(engine=pl.GPUEngine(memory_resource=mr)) + assert_frame_equal(q.collect(), result) + assert n_allocations > 0 diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py index a75825ef3d3..6f996e0e0ec 100644 --- a/python/cudf_polars/tests/test_groupby.py +++ b/python/cudf_polars/tests/test_groupby.py @@ -12,7 +12,6 @@ assert_gpu_result_equal, assert_ir_translation_raises, ) -from cudf_polars.utils import versions @pytest.fixture @@ -31,6 +30,7 @@ def df(): params=[ [pl.col("key1")], [pl.col("key2")], + [pl.col("key1"), pl.lit(1)], [pl.col("key1") * pl.col("key2")], [pl.col("key1"), pl.col("key2")], [pl.col("key1") == pl.col("key2")], @@ -52,6 +52,7 @@ def keys(request): [(pl.col("float") - pl.lit(2)).max()], [pl.col("float").sum().round(decimals=1)], [pl.col("float").round(decimals=1).sum()], + [pl.col("int").first(), pl.col("float").last()], ], ids=lambda aggs: "-".join(map(str, aggs)), ) @@ -60,15 +61,7 @@ def exprs(request): @pytest.fixture( - params=[ - False, - pytest.param( - True, - marks=pytest.mark.xfail( - reason="Maintaining order in groupby not implemented" - ), - ), - ], + params=[False, True], ids=["no_maintain_order", "maintain_order"], ) def maintain_order(request): @@ -98,15 +91,10 @@ def test_groupby_sorted_keys(df: pl.LazyFrame, keys, exprs): # Multiple keys don't do sorting qsorted = q.sort(*sort_keys) if len(keys) > 1: - with pytest.raises(AssertionError): - # https://github.com/pola-rs/polars/issues/17556 - assert_gpu_result_equal(q, check_exact=False) - if versions.POLARS_VERSION_LT_12 and schema[sort_keys[1]] == pl.Boolean(): - # https://github.com/pola-rs/polars/issues/17557 - with pytest.raises(AssertionError): - assert_gpu_result_equal(qsorted, check_exact=False) - else: - assert_gpu_result_equal(qsorted, check_exact=False) + # https://github.com/pola-rs/polars/issues/17556 + # Can't assert that the query without post-sorting fails, + # since it _might_ pass. 
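Group-by with `maintain_order=True` is no longer marked as unimplemented in the fixtures above; a minimal sketch of the behaviour now covered, with arbitrary data:

```python
import polars as pl

df = pl.LazyFrame({"key": [2, 1, 2, None, 1], "value": [10, 20, 30, 40, 50]})

# maintain_order=True keeps groups in order of first appearance, so the GPU
# result can be compared directly against the CPU result without a post-sort.
q = df.group_by("key", maintain_order=True).agg(pl.col("value").sum())
print(q.collect(engine="gpu"))
```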
+ assert_gpu_result_equal(qsorted, check_exact=False) elif schema[sort_keys[0]] == pl.Boolean(): # Boolean keys don't do sorting, so we get random order assert_gpu_result_equal(qsorted, check_exact=False) @@ -133,6 +121,21 @@ def test_groupby_unsupported(df, expr): assert_ir_translation_raises(q, NotImplementedError) +def test_groupby_null_keys(maintain_order): + df = pl.LazyFrame( + { + "key": pl.Series([1, float("nan"), 2, None, 2, None], dtype=pl.Float64()), + "value": [-1, 2, 1, 2, 3, 4], + } + ) + + q = df.group_by("key", maintain_order=maintain_order).agg(pl.col("value").min()) + if not maintain_order: + q = q.sort("key") + + assert_gpu_result_equal(q) + + @pytest.mark.xfail(reason="https://github.com/pola-rs/polars/issues/17513") def test_groupby_minmax_with_nan(): df = pl.LazyFrame( @@ -159,15 +162,7 @@ def test_groupby_nan_minmax_raises(op): @pytest.mark.parametrize( "key", - [ - pytest.param( - 1, - marks=pytest.mark.xfail( - versions.POLARS_VERSION_GE_121, reason="polars 1.2.1 disallows this" - ), - ), - pl.col("key1"), - ], + [1, pl.col("key1")], ) @pytest.mark.parametrize( "expr", @@ -183,3 +178,12 @@ def test_groupby_literal_in_agg(df, key, expr): # so just sort by the group key q = df.group_by(key).agg(expr).sort(key, maintain_order=True) assert_gpu_result_equal(q) + + +@pytest.mark.parametrize( + "expr", + [pl.col("int").unique(), pl.col("int").drop_nulls(), pl.col("int").cum_max()], +) +def test_groupby_unary_non_pointwise_raises(df, expr): + q = df.group_by("key1").agg(expr) + assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/test_groupby_dynamic.py b/python/cudf_polars/tests/test_groupby_dynamic.py new file mode 100644 index 00000000000..38b3ce74ac5 --- /dev/null +++ b/python/cudf_polars/tests/test_groupby_dynamic.py @@ -0,0 +1,29 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +from datetime import datetime + +import polars as pl + +from cudf_polars.testing.asserts import assert_ir_translation_raises + + +def test_groupby_dynamic_raises(): + df = pl.LazyFrame( + { + "dt": [ + datetime(2021, 12, 31, 0, 0, 0), + datetime(2022, 1, 1, 0, 0, 1), + datetime(2022, 3, 31, 0, 0, 1), + datetime(2022, 4, 1, 0, 0, 1), + ] + } + ) + + q = ( + df.sort("dt") + .group_by_dynamic("dt", every="1q") + .agg(pl.col("dt").count().alias("num_values")) + ) + assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py index 1e880cdc6de..7d9ec98db97 100644 --- a/python/cudf_polars/tests/test_join.py +++ b/python/cudf_polars/tests/test_join.py @@ -17,7 +17,7 @@ def join_nulls(request): return request.param -@pytest.fixture(params=["inner", "left", "semi", "anti", "full"]) +@pytest.fixture(params=["inner", "left", "right", "semi", "anti", "full"]) def how(request): return request.param diff --git a/python/cudf_polars/tests/test_mapfunction.py b/python/cudf_polars/tests/test_mapfunction.py index 77032108e6f..e895f27f637 100644 --- a/python/cudf_polars/tests/test_mapfunction.py +++ b/python/cudf_polars/tests/test_mapfunction.py @@ -61,3 +61,48 @@ def test_rename_columns(mapping): q = df.rename(mapping) assert_gpu_result_equal(q) + + +@pytest.mark.parametrize("index", [None, ["a"], ["d", "a"]]) +@pytest.mark.parametrize("variable_name", [None, "names"]) +@pytest.mark.parametrize("value_name", [None, "unpivoted"]) +def test_unpivot(index, variable_name, value_name): + df = pl.LazyFrame( + { + "a": ["x", "y", "z"], + "b": pl.Series([1, 3, 5], dtype=pl.Int16), + "c": pl.Series([2, 4, 6], dtype=pl.Float32), + "d": ["a", "b", "c"], + } + ) + q = df.unpivot( + ["c", "b"], index=index, variable_name=variable_name, value_name=value_name + ) + + assert_gpu_result_equal(q) + + +def test_unpivot_defaults(): + df = pl.LazyFrame( + { + "a": pl.Series([11, 12, 13], dtype=pl.UInt16), + "b": pl.Series([1, 3, 5], dtype=pl.Int16), + "c": pl.Series([2, 4, 6], dtype=pl.Float32), + "d": ["a", "b", "c"], + } + ) + q = df.unpivot(index="d") + assert_gpu_result_equal(q) + + +def test_unpivot_unsupported_cast_raises(): + df = pl.LazyFrame( + { + "a": ["x", "y", "z"], + "b": pl.Series([1, 3, 5], dtype=pl.Int16), + } + ) + + q = df.unpivot(["a", "b"]) + + assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/test_python_scan.py b/python/cudf_polars/tests/test_python_scan.py index fd8453b77c4..0cda89474a8 100644 --- a/python/cudf_polars/tests/test_python_scan.py +++ b/python/cudf_polars/tests/test_python_scan.py @@ -8,7 +8,9 @@ def test_python_scan(): - def source(with_columns, predicate, nrows): + def source(with_columns, predicate, nrows, *batch_size): + # PythonScan interface changes between 1.3 and 1.4 to add an + # extra batch_size argument return pl.DataFrame({"a": pl.Series([1, 2, 3], dtype=pl.Int8())}) q = pl.LazyFrame._scan_python_function({"a": pl.Int8}, source, pyarrow=False) diff --git a/python/cudf_polars/tests/test_scan.py b/python/cudf_polars/tests/test_scan.py index 64acbb076ed..792b136acd8 100644 --- a/python/cudf_polars/tests/test_scan.py +++ b/python/cudf_polars/tests/test_scan.py @@ -12,7 +12,6 @@ assert_gpu_result_equal, assert_ir_translation_raises, ) -from cudf_polars.utils import versions @pytest.fixture( @@ -58,6 +57,22 @@ def mask(request): return request.param +@pytest.fixture( + params=[ + None, + 
(1, 1), + ], + ids=[ + "no-slice", + "slice-second", + ], +) +def slice(request): + # For use in testing that we handle + # polars slice pushdown correctly + return request.param + + def make_source(df, path, format): """ Writes the passed polars df to a file of @@ -79,7 +94,9 @@ def make_source(df, path, format): ("parquet", pl.scan_parquet), ], ) -def test_scan(tmp_path, df, format, scan_fn, row_index, n_rows, columns, mask, request): +def test_scan( + tmp_path, df, format, scan_fn, row_index, n_rows, columns, mask, slice, request +): name, offset = row_index make_source(df, tmp_path / "file", format) request.applymarker( @@ -94,21 +111,23 @@ def test_scan(tmp_path, df, format, scan_fn, row_index, n_rows, columns, mask, r row_index_offset=offset, n_rows=n_rows, ) + if slice is not None: + q = q.slice(*slice) if mask is not None: q = q.filter(mask) if columns is not None: q = q.select(*columns) - polars_collect_kwargs = {} - if versions.POLARS_VERSION_LT_12: - # https://github.com/pola-rs/polars/issues/17553 - polars_collect_kwargs = {"projection_pushdown": False} - assert_gpu_result_equal( - q, - polars_collect_kwargs=polars_collect_kwargs, - # This doesn't work in polars < 1.2 since the row-index - # is in the wrong order in previous polars releases - check_column_order=versions.POLARS_VERSION_LT_12, - ) + assert_gpu_result_equal(q) + + +def test_negative_slice_pushdown_raises(tmp_path): + df = pl.DataFrame({"a": [1, 2, 3]}) + + df.write_parquet(tmp_path / "df.parquet") + q = pl.scan_parquet(tmp_path / "df.parquet") + # Take the last row + q = q.slice(-1, 1) + assert_ir_translation_raises(q, NotImplementedError) def test_scan_unsupported_raises(tmp_path): @@ -127,10 +146,6 @@ def test_scan_ndjson_nrows_notimplemented(tmp_path, df): assert_ir_translation_raises(q, NotImplementedError) -@pytest.mark.xfail( - versions.POLARS_VERSION_LT_11, - reason="https://github.com/pola-rs/polars/issues/15730", -) def test_scan_row_index_projected_out(tmp_path): df = pl.DataFrame({"a": [1, 2, 3]}) @@ -169,15 +184,25 @@ def test_scan_csv_column_renames_projection_schema(tmp_path): ("test*.csv", False), ], ) -def test_scan_csv_multi(tmp_path, filename, glob): +@pytest.mark.parametrize( + "nrows_skiprows", + [ + (None, 0), + (1, 1), + (3, 0), + (4, 2), + ], +) +def test_scan_csv_multi(tmp_path, filename, glob, nrows_skiprows): + n_rows, skiprows = nrows_skiprows with (tmp_path / "test1.csv").open("w") as f: - f.write("""foo,bar,baz\n1,2\n3,4,5""") + f.write("""foo,bar,baz\n1,2,3\n3,4,5""") with (tmp_path / "test2.csv").open("w") as f: - f.write("""foo,bar,baz\n1,2\n3,4,5""") + f.write("""foo,bar,baz\n1,2,3\n3,4,5""") with (tmp_path / "test*.csv").open("w") as f: - f.write("""foo,bar,baz\n1,2\n3,4,5""") + f.write("""foo,bar,baz\n1,2,3\n3,4,5""") os.chdir(tmp_path) - q = pl.scan_csv(filename, glob=glob) + q = pl.scan_csv(filename, glob=glob, n_rows=n_rows, skip_rows=skiprows) assert_gpu_result_equal(q) @@ -280,3 +305,24 @@ def test_scan_ndjson_unsupported(df, tmp_path): make_source(df, tmp_path / "file", "ndjson") q = pl.scan_ndjson(tmp_path / "file", ignore_errors=True) assert_ir_translation_raises(q, NotImplementedError) + + +def test_scan_parquet_nested_null_raises(tmp_path): + df = pl.DataFrame({"a": pl.Series([None], dtype=pl.List(pl.Null))}) + + df.write_parquet(tmp_path / "file.pq") + + q = pl.scan_parquet(tmp_path / "file.pq") + + assert_ir_translation_raises(q, NotImplementedError) + + +def test_scan_parquet_only_row_index_raises(df, tmp_path): + make_source(df, tmp_path / "file", "parquet") + q = 
pl.scan_parquet(tmp_path / "file", row_index_name="index").select("index") + assert_ir_translation_raises(q, NotImplementedError) + + +def test_scan_hf_url_raises(): + q = pl.scan_csv("hf://datasets/scikit-learn/iris/Iris.csv") + assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/test_sort.py b/python/cudf_polars/tests/test_sort.py index ecc02efd967..cfa8e5ff9b9 100644 --- a/python/cudf_polars/tests/test_sort.py +++ b/python/cudf_polars/tests/test_sort.py @@ -13,10 +13,7 @@ "sort_keys", [ (pl.col("a"),), - pytest.param( - (pl.col("d").abs(),), - marks=pytest.mark.xfail(reason="abs not yet implemented"), - ), + (pl.col("d").abs(),), (pl.col("a"), pl.col("d")), (pl.col("b"),), ], diff --git a/python/cudf_polars/tests/testing/test_asserts.py b/python/cudf_polars/tests/testing/test_asserts.py index 5bc2fe1efb7..8e7f1a09d9b 100644 --- a/python/cudf_polars/tests/testing/test_asserts.py +++ b/python/cudf_polars/tests/testing/test_asserts.py @@ -7,7 +7,10 @@ import polars as pl +from cudf_polars.containers import DataFrame +from cudf_polars.dsl.ir import Select from cudf_polars.testing.asserts import ( + assert_collect_raises, assert_gpu_result_equal, assert_ir_translation_raises, ) @@ -26,10 +29,62 @@ def test_translation_assert_raises(): class E(Exception): pass - unsupported = df.group_by("a").agg(pl.col("a").cum_max().alias("b")) + unsupported = df.group_by("a").agg(pl.col("a").upper_bound().alias("b")) # Unsupported query should raise NotImplementedError assert_ir_translation_raises(unsupported, NotImplementedError) with pytest.raises(AssertionError): # This should fail, because we can't translate this query, but it doesn't raise E. assert_ir_translation_raises(unsupported, E) + + +def test_collect_assert_raises(monkeypatch): + df = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) + + with pytest.raises(AssertionError): + # This should raise, because polars CPU can run this query + assert_collect_raises( + df, + polars_except=pl.exceptions.InvalidOperationError, + cudf_except=pl.exceptions.InvalidOperationError, + ) + + # Here's an invalid query that gets caught at IR optimisation time. + q = df.select(pl.col("a") * pl.col("b")) + + # This exception is raised in preprocessing, so is the same for + # both CPU and GPU engines. + assert_collect_raises( + q, + polars_except=pl.exceptions.InvalidOperationError, + cudf_except=pl.exceptions.InvalidOperationError, + ) + + with pytest.raises(AssertionError): + # This should raise because the expected GPU error is wrong + assert_collect_raises( + q, + polars_except=pl.exceptions.InvalidOperationError, + cudf_except=NotImplementedError, + ) + + with pytest.raises(AssertionError): + # This should raise because the expected CPU error is wrong + assert_collect_raises( + q, + polars_except=NotImplementedError, + cudf_except=pl.exceptions.InvalidOperationError, + ) + + with monkeypatch.context() as m: + m.setattr(Select, "evaluate", lambda self, cache: DataFrame([])) + # This query should fail, but we monkeypatch a bad + # implementation of Select which "succeeds" to check that our + # assertion notices this case. 
+ q = df.select(pl.col("a") + pl.Series([1, 2])) + with pytest.raises(AssertionError): + assert_collect_raises( + q, + polars_except=pl.exceptions.ComputeError, + cudf_except=pl.exceptions.ComputeError, + ) diff --git a/python/dask_cudf/README.md b/python/dask_cudf/README.md index 4655d2165f0..69e1524be39 100644 --- a/python/dask_cudf/README.md +++ b/python/dask_cudf/README.md @@ -16,6 +16,7 @@ See the [RAPIDS install page](https://docs.rapids.ai/install) for the most up-to ## Resources - [Dask cuDF documentation](https://docs.rapids.ai/api/dask-cudf/stable/) +- [Best practices](https://docs.rapids.ai/api/dask-cudf/stable/best_practices/) - [cuDF documentation](https://docs.rapids.ai/api/cudf/stable/) - [10 Minutes to cuDF and Dask cuDF](https://docs.rapids.ai/api/cudf/stable/user_guide/10min/) - [Dask-CUDA documentation](https://docs.rapids.ai/api/dask-cuda/stable/) diff --git a/python/pylibcudf/pylibcudf/datetime.pyx b/python/pylibcudf/pylibcudf/datetime.pyx index 0ddc68bcb9d..e8e0caaf42d 100644 --- a/python/pylibcudf/pylibcudf/datetime.pyx +++ b/python/pylibcudf/pylibcudf/datetime.pyx @@ -2,7 +2,19 @@ from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.datetime cimport extract_year as cpp_extract_year +from pylibcudf.libcudf.datetime cimport ( + day_of_year as cpp_day_of_year, + extract_day as cpp_extract_day, + extract_hour as cpp_extract_hour, + extract_microsecond_fraction as cpp_extract_microsecond_fraction, + extract_millisecond_fraction as cpp_extract_millisecond_fraction, + extract_minute as cpp_extract_minute, + extract_month as cpp_extract_month, + extract_nanosecond_fraction as cpp_extract_nanosecond_fraction, + extract_second as cpp_extract_second, + extract_weekday as cpp_extract_weekday, + extract_year as cpp_extract_year, +) from .column cimport Column @@ -28,3 +40,42 @@ cpdef Column extract_year( with nogil: result = move(cpp_extract_year(values.view())) return Column.from_libcudf(move(result)) + + +def extract_datetime_component(Column col, str field): + + cdef unique_ptr[column] c_result + + with nogil: + if field == "year": + c_result = move(cpp_extract_year(col.view())) + elif field == "month": + c_result = move(cpp_extract_month(col.view())) + elif field == "day": + c_result = move(cpp_extract_day(col.view())) + elif field == "weekday": + c_result = move(cpp_extract_weekday(col.view())) + elif field == "hour": + c_result = move(cpp_extract_hour(col.view())) + elif field == "minute": + c_result = move(cpp_extract_minute(col.view())) + elif field == "second": + c_result = move(cpp_extract_second(col.view())) + elif field == "millisecond": + c_result = move( + cpp_extract_millisecond_fraction(col.view()) + ) + elif field == "microsecond": + c_result = move( + cpp_extract_microsecond_fraction(col.view()) + ) + elif field == "nanosecond": + c_result = move( + cpp_extract_nanosecond_fraction(col.view()) + ) + elif field == "day_of_year": + c_result = move(cpp_day_of_year(col.view())) + else: + raise ValueError(f"Invalid datetime field: '{field}'") + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt index bd6e2e0af02..abf4357f862 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. 
# ============================================================================= -set(cython_sources char_types.pyx regex_flags.pyx) +set(cython_sources char_types.pyx regex_flags.pyx side_type.pyx) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd index 3a89299f11a..019ff3f17ba 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd @@ -1,10 +1,10 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. from libc.stdint cimport int32_t cdef extern from "cudf/strings/side_type.hpp" namespace "cudf::strings" nogil: - ctypedef enum side_type: + cpdef enum class side_type(int32_t): LEFT 'cudf::strings::side_type::LEFT' RIGHT 'cudf::strings::side_type::RIGHT' BOTH 'cudf::strings::side_type::BOTH' diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pyx b/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt index d3065cf8667..8b4fbb1932f 100644 --- a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt @@ -12,8 +12,9 @@ # the License. # ============================================================================= -set(cython_sources capitalize.pyx case.pyx char_types.pyx contains.pyx extract.pyx find.pyx - regex_flags.pyx regex_program.pyx repeat.pyx replace.pyx slice.pyx +set(cython_sources + capitalize.pyx case.pyx char_types.pyx contains.pyx extract.pyx find.pyx regex_flags.pyx + regex_program.pyx repeat.pyx replace.pyx side_type.pyx slice.pyx strip.pyx ) set(linked_libraries cudf::cudf) @@ -22,3 +23,5 @@ rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_strings_ ASSOCIATED_TARGETS cudf ) + +add_subdirectory(convert) diff --git a/python/pylibcudf/pylibcudf/strings/__init__.pxd b/python/pylibcudf/pylibcudf/strings/__init__.pxd index 6848c8e6e86..4867d944dc7 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/__init__.pxd @@ -5,10 +5,13 @@ from . cimport ( case, char_types, contains, + convert, extract, find, regex_flags, regex_program, replace, slice, + strip, ) +from .side_type cimport side_type diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py index bba86e818cc..a3bef64d19f 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/__init__.py @@ -5,6 +5,7 @@ case, char_types, contains, + convert, extract, find, regex_flags, @@ -12,4 +13,6 @@ repeat, replace, slice, + strip, ) +from .side_type import SideType diff --git a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt new file mode 100644 index 00000000000..175c9b3738e --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt @@ -0,0 +1,22 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +set(cython_sources convert_durations.pyx convert_datetime.pyx) + +set(linked_libraries cudf::cudf) +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_strings_ ASSOCIATED_TARGETS cudf +) diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd new file mode 100644 index 00000000000..05324cb49df --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd @@ -0,0 +1,2 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from . cimport convert_datetime, convert_durations diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.py b/python/pylibcudf/pylibcudf/strings/convert/__init__.py new file mode 100644 index 00000000000..d803399d53c --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from . import convert_datetime, convert_durations diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd new file mode 100644 index 00000000000..07c84d263d6 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd @@ -0,0 +1,18 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.string cimport string +from pylibcudf.column cimport Column +from pylibcudf.types cimport DataType + + +cpdef Column to_timestamps( + Column input, + DataType timestamp_type, + const string& format +) + +cpdef Column from_timestamps( + Column input, + const string& format, + Column input_strings_names +) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx new file mode 100644 index 00000000000..fcacb096f87 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx @@ -0,0 +1,56 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from libcpp.memory cimport unique_ptr +from libcpp.string cimport string +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings.convert cimport ( + convert_datetime as cpp_convert_datetime, +) + +from pylibcudf.types import DataType + + +cpdef Column to_timestamps( + Column input, + DataType timestamp_type, + const string& format +): + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_convert_datetime.to_timestamps( + input.view(), + timestamp_type.c_obj, + format + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column from_timestamps( + Column input, + const string& format, + Column input_strings_names +): + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_convert_datetime.from_timestamps( + input.view(), + format, + input_strings_names.view() + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column is_timestamp( + Column input, + const string& format +): + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_convert_datetime.is_timestamp( + input.view(), + format + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd new file mode 100644 index 00000000000..ac11b8959ed --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd @@ -0,0 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.string cimport string +from pylibcudf.column cimport Column +from pylibcudf.types cimport DataType + + +cpdef Column to_durations( + Column input, + DataType duration_type, + const string& format +) + +cpdef Column from_durations( + Column input, + const string& format +) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx new file mode 100644 index 00000000000..f3e0b7c9c8e --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx @@ -0,0 +1,41 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.string cimport string +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings.convert cimport ( + convert_durations as cpp_convert_durations, +) + +from pylibcudf.types import DataType + + +cpdef Column to_durations( + Column input, + DataType duration_type, + const string& format +): + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_convert_durations.to_durations( + input.view(), + duration_type.c_obj, + format + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column from_durations( + Column input, + const string& format +): + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_convert_durations.from_durations( + input.view(), + format + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/side_type.pxd b/python/pylibcudf/pylibcudf/strings/side_type.pxd new file mode 100644 index 00000000000..34b7a580380 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/side_type.pxd @@ -0,0 +1,3 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+
+from pylibcudf.libcudf.strings.side_type cimport side_type
diff --git a/python/pylibcudf/pylibcudf/strings/side_type.pyx b/python/pylibcudf/pylibcudf/strings/side_type.pyx
new file mode 100644
index 00000000000..acdc7d6ff1f
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/side_type.pyx
@@ -0,0 +1,4 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.libcudf.strings.side_type import \
+    side_type as SideType  # no-cython-lint
diff --git a/python/pylibcudf/pylibcudf/strings/strip.pxd b/python/pylibcudf/pylibcudf/strings/strip.pxd
new file mode 100644
index 00000000000..8bbe4753edd
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/strip.pxd
@@ -0,0 +1,12 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column cimport Column
+from pylibcudf.scalar cimport Scalar
+from pylibcudf.strings.side_type cimport side_type
+
+
+cpdef Column strip(
+    Column input,
+    side_type side=*,
+    Scalar to_strip=*
+)
diff --git a/python/pylibcudf/pylibcudf/strings/strip.pyx b/python/pylibcudf/pylibcudf/strings/strip.pyx
new file mode 100644
index 00000000000..429a23c3cdf
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/strip.pyx
@@ -0,0 +1,60 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from cython.operator cimport dereference
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.scalar.scalar_factories cimport (
+    make_string_scalar as cpp_make_string_scalar,
+)
+from pylibcudf.libcudf.strings cimport strip as cpp_strip
+from pylibcudf.scalar cimport Scalar
+from pylibcudf.strings.side_type cimport side_type
+
+
+cpdef Column strip(
+    Column input,
+    side_type side=side_type.BOTH,
+    Scalar to_strip=None
+):
+    """Removes the specified characters from the beginning
+    or end (or both) of each string.
+
+    For details, see :cpp:func:`cudf::strings::strip`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings column for this operation
+    side : SideType, default SideType.BOTH
+        Indicates whether characters are stripped from the beginning,
+        the end, or both ends of each string; default is both
+    to_strip : Scalar
+        UTF-8 encoded characters to strip from each string;
+        the default (an empty string) strips whitespace characters
+
+    Returns
+    -------
+    pylibcudf.Column
+        New strings column.
+    """
+
+    if to_strip is None:
+        to_strip = Scalar.from_libcudf(
+            cpp_make_string_scalar("".encode())
+        )
+
+    cdef unique_ptr[column] c_result
+    cdef string_scalar* cpp_to_strip
+    cpp_to_strip = <string_scalar *>(to_strip.c_obj.get())
+
+    with nogil:
+        c_result = cpp_strip.strip(
+            input.view(),
+            side,
+            dereference(cpp_to_strip)
+        )
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/tests/test_datetime.py b/python/pylibcudf/pylibcudf/tests/test_datetime.py
index d3aa6101e2d..89c96829e71 100644
--- a/python/pylibcudf/pylibcudf/tests/test_datetime.py
+++ b/python/pylibcudf/pylibcudf/tests/test_datetime.py
@@ -1,6 +1,7 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
import datetime +import functools import pyarrow as pa import pyarrow.compute as pc @@ -10,7 +11,7 @@ @pytest.fixture -def column(has_nulls): +def date_column(has_nulls): values = [ datetime.date(1999, 1, 1), datetime.date(2024, 10, 12), @@ -22,9 +23,41 @@ def column(has_nulls): return plc.interop.from_arrow(pa.array(values, type=pa.date32())) -def test_extract_year(column): - got = plc.datetime.extract_year(column) +@pytest.fixture(scope="module", params=["s", "ms", "us", "ns"]) +def datetime_column(has_nulls, request): + values = [ + datetime.datetime(1999, 1, 1), + datetime.datetime(2024, 10, 12), + datetime.datetime(1970, 1, 1), + datetime.datetime(2260, 1, 1), + datetime.datetime(2024, 2, 29, 3, 14, 15), + datetime.datetime(2024, 2, 29, 3, 14, 15, 999), + ] + if has_nulls: + values[2] = None + return plc.interop.from_arrow( + pa.array(values, type=pa.timestamp(request.param)) + ) + + +@pytest.mark.parametrize( + "component, pc_fun", + [ + ("year", pc.year), + ("month", pc.month), + ("day", pc.day), + ("weekday", functools.partial(pc.day_of_week, count_from_zero=False)), + ("hour", pc.hour), + ("minute", pc.minute), + ("second", pc.second), + ("millisecond", pc.millisecond), + ("microsecond", pc.microsecond), + ("nanosecond", pc.nanosecond), + ], +) +def test_extraction(datetime_column, component, pc_fun): + got = plc.datetime.extract_datetime_component(datetime_column, component) # libcudf produces an int16, arrow produces an int64 - expect = pc.year(plc.interop.to_arrow(column)).cast(pa.int16()) + expect = pc_fun(plc.interop.to_arrow(datetime_column)).cast(pa.int16()) assert_column_eq(expect, got) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert.py b/python/pylibcudf/pylibcudf/tests/test_string_convert.py new file mode 100644 index 00000000000..e9e95459d0e --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert.py @@ -0,0 +1,85 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from datetime import datetime + +import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import assert_column_eq + + +@pytest.fixture( + scope="module", + params=[ + pa.timestamp("ns"), + pa.timestamp("us"), + pa.timestamp("ms"), + pa.timestamp("s"), + ], +) +def timestamp_type(request): + return request.param + + +@pytest.fixture( + scope="module", + params=[ + pa.duration("ns"), + pa.duration("us"), + pa.duration("ms"), + pa.duration("s"), + ], +) +def duration_type(request): + return request.param + + +@pytest.fixture(scope="module") +def pa_timestamp_col(): + return pa.array(["2011-01-01", "2011-01-02", "2011-01-03"]) + + +@pytest.fixture(scope="module") +def pa_duration_col(): + return pa.array(["05:20:25"]) + + +@pytest.fixture(scope="module") +def plc_timestamp_col(pa_timestamp_col): + return plc.interop.from_arrow(pa_timestamp_col) + + +@pytest.fixture(scope="module") +def plc_duration_col(pa_duration_col): + return plc.interop.from_arrow(pa_duration_col) + + +@pytest.mark.parametrize("format", ["%Y-%m-%d"]) +def test_to_datetime( + pa_timestamp_col, plc_timestamp_col, timestamp_type, format +): + expect = pa.compute.strptime(pa_timestamp_col, format, timestamp_type.unit) + got = plc.strings.convert.convert_datetime.to_timestamps( + plc_timestamp_col, + plc.interop.from_arrow(timestamp_type), + format.encode(), + ) + assert_column_eq(expect, got) + + +@pytest.mark.parametrize("format", ["%H:%M:%S"]) +def test_to_duration(pa_duration_col, plc_duration_col, duration_type, format): + def to_timedelta(duration_str): + date = datetime.strptime(duration_str, format) + return date - datetime(1900, 1, 1) # "%H:%M:%S" zero date + + expect = pa.array([to_timedelta(d.as_py()) for d in pa_duration_col]).cast( + duration_type + ) + + got = plc.strings.convert.convert_durations.to_durations( + plc_duration_col, + plc.interop.from_arrow(duration_type), + format.encode(), + ) + assert_column_eq(expect, got) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_strip.py b/python/pylibcudf/pylibcudf/tests/test_string_strip.py new file mode 100644 index 00000000000..005e5e4a405 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_strip.py @@ -0,0 +1,122 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import assert_column_eq + +data_strings = [ + "AbC", + "123abc", + "", + " ", + None, + "aAaaaAAaa", + " ab c ", + "abc123", + " ", + "\tabc\t", + "\nabc\n", + "\r\nabc\r\n", + "\t\n abc \n\t", + "!@#$%^&*()", + " abc!!! ", + " abc\t\n!!! 
", + "__abc__", + "abc\n\n", + "123abc456", + "abcxyzabc", +] + +strip_chars = [ + "a", + "", + " ", + "\t", + "\n", + "\r\n", + "!", + "@#", + "123", + "xyz", + "abc", + "__", + " \t\n", + "abc123", +] + + +@pytest.fixture +def pa_col(): + return pa.array(data_strings, type=pa.string()) + + +@pytest.fixture +def plc_col(pa_col): + return plc.interop.from_arrow(pa_col) + + +@pytest.fixture(params=strip_chars) +def pa_char(request): + return pa.scalar(request.param, type=pa.string()) + + +@pytest.fixture +def plc_char(pa_char): + return plc.interop.from_arrow(pa_char) + + +def test_strip(pa_col, plc_col, pa_char, plc_char): + def strip_string(st, char): + if st is None: + return None + + elif char == "": + return st.strip() + return st.strip(char) + + expected = pa.array( + [strip_string(x, pa_char.as_py()) for x in pa_col.to_pylist()], + type=pa.string(), + ) + + got = plc.strings.strip.strip(plc_col, plc.strings.SideType.BOTH, plc_char) + assert_column_eq(expected, got) + + +def test_strip_right(pa_col, plc_col, pa_char, plc_char): + def strip_string(st, char): + if st is None: + return None + + elif char == "": + return st.rstrip() + return st.rstrip(char) + + expected = pa.array( + [strip_string(x, pa_char.as_py()) for x in pa_col.to_pylist()], + type=pa.string(), + ) + + got = plc.strings.strip.strip( + plc_col, plc.strings.SideType.RIGHT, plc_char + ) + assert_column_eq(expected, got) + + +def test_strip_left(pa_col, plc_col, pa_char, plc_char): + def strip_string(st, char): + if st is None: + return None + + elif char == "": + return st.lstrip() + return st.lstrip(char) + + expected = pa.array( + [strip_string(x, pa_char.as_py()) for x in pa_col.to_pylist()], + type=pa.string(), + ) + + got = plc.strings.strip.strip(plc_col, plc.strings.SideType.LEFT, plc_char) + assert_column_eq(expected, got) diff --git a/python/pylibcudf/pylibcudf/tests/test_transform.py b/python/pylibcudf/pylibcudf/tests/test_transform.py index 06fc35d8835..d5c618f07e4 100644 --- a/python/pylibcudf/pylibcudf/tests/test_transform.py +++ b/python/pylibcudf/pylibcudf/tests/test_transform.py @@ -29,3 +29,54 @@ def test_nans_to_nulls(has_nans): got = input.with_mask(mask, null_count) assert_column_eq(expect, got) + + +def test_bools_to_mask_roundtrip(): + pa_array = pa.array([True, None, False]) + plc_input = plc.interop.from_arrow(pa_array) + mask, result_null_count = plc.transform.bools_to_mask(plc_input) + + assert result_null_count == 2 + result = plc_input.with_mask(mask, result_null_count) + assert_column_eq(pa.array([True, None, None]), result) + + plc_output = plc.transform.mask_to_bools(mask.ptr, 0, len(pa_array)) + result_pa = plc.interop.to_arrow(plc_output) + expected_pa = pa.chunked_array([[True, False, False]]) + assert result_pa.equals(expected_pa) + + +def test_encode(): + pa_table = pa.table({"a": [1, 3, 4], "b": [1, 2, 4]}) + plc_input = plc.interop.from_arrow(pa_table) + result_table, result_column = plc.transform.encode(plc_input) + pa_table_result = plc.interop.to_arrow(result_table) + pa_column_result = plc.interop.to_arrow(result_column) + + pa_table_expected = pa.table( + [[1, 3, 4], [1, 2, 4]], + schema=pa.schema( + [ + pa.field("", pa.int64(), nullable=False), + pa.field("", pa.int64(), nullable=False), + ] + ), + ) + assert pa_table_result.equals(pa_table_expected) + + pa_column_expected = pa.chunked_array([[0, 1, 2]], type=pa.int32()) + assert pa_column_result.equals(pa_column_expected) + + +def test_one_hot_encode(): + pa_column = pa.array([1, 2, 3]) + pa_categories = pa.array([0, 0, 0]) + 
plc_input = plc.interop.from_arrow(pa_column) + plc_categories = plc.interop.from_arrow(pa_categories) + plc_table = plc.transform.one_hot_encode(plc_input, plc_categories) + result = plc.interop.to_arrow(plc_table) + expected = pa.table( + [[False] * 3] * 3, + schema=pa.schema([pa.field("", pa.bool_(), nullable=False)] * 3), + ) + assert result.equals(expected) diff --git a/python/pylibcudf/pylibcudf/transform.pxd b/python/pylibcudf/pylibcudf/transform.pxd index 4b21feffe25..b530f433c97 100644 --- a/python/pylibcudf/pylibcudf/transform.pxd +++ b/python/pylibcudf/pylibcudf/transform.pxd @@ -1,7 +1,21 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp cimport bool +from pylibcudf.libcudf.types cimport bitmask_type, data_type from .column cimport Column from .gpumemoryview cimport gpumemoryview +from .table cimport Table +from .types cimport DataType cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input) + +cpdef tuple[gpumemoryview, int] bools_to_mask(Column input) + +cpdef Column mask_to_bools(Py_ssize_t bitmask, int begin_bit, int end_bit) + +cpdef Column transform(Column input, str unary_udf, DataType output_type, bool is_ptx) + +cpdef tuple[Table, Column] encode(Table input) + +cpdef Table one_hot_encode(Column input_column, Column categories) diff --git a/python/pylibcudf/pylibcudf/transform.pyx b/python/pylibcudf/pylibcudf/transform.pyx index 100ccb580ce..bcd6185521a 100644 --- a/python/pylibcudf/pylibcudf/transform.pyx +++ b/python/pylibcudf/pylibcudf/transform.pyx @@ -1,14 +1,20 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr +from libcpp.string cimport string from libcpp.utility cimport move, pair from pylibcudf.libcudf cimport transform as cpp_transform -from pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.table.table_view cimport table_view +from pylibcudf.libcudf.types cimport bitmask_type, size_type from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer from .column cimport Column from .gpumemoryview cimport gpumemoryview +from .types cimport DataType +from .utils cimport int_to_bitmask_ptr cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input): @@ -32,3 +38,141 @@ cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input): gpumemoryview(DeviceBuffer.c_from_unique_ptr(move(c_result.first))), c_result.second ) + + +cpdef tuple[gpumemoryview, int] bools_to_mask(Column input): + """Create a bitmask from a column of boolean elements + + Parameters + ---------- + input : Column + Column to produce new mask from. + + Returns + ------- + tuple[gpumemoryview, int] + Two-tuple of a gpumemoryview wrapping the bitmask and the null count. + """ + cdef pair[unique_ptr[device_buffer], size_type] c_result + + with nogil: + c_result = move(cpp_transform.bools_to_mask(input.view())) + + return ( + gpumemoryview(DeviceBuffer.c_from_unique_ptr(move(c_result.first))), + c_result.second + ) + + +cpdef Column mask_to_bools(Py_ssize_t bitmask, int begin_bit, int end_bit): + """Creates a boolean column from given bitmask. 
+
+    Parameters
+    ----------
+    bitmask : int
+        Pointer to the bitmask which needs to be converted
+    begin_bit : int
+        Position of the bit from which the conversion should start
+    end_bit : int
+        Position of the bit before which the conversion should stop
+
+    Returns
+    -------
+    Column
+        Boolean column for the bits in the half-open range [begin_bit, end_bit)
+    """
+    cdef unique_ptr[column] c_result
+    cdef bitmask_type * bitmask_ptr = int_to_bitmask_ptr(bitmask)
+
+    with nogil:
+        c_result = move(cpp_transform.mask_to_bools(bitmask_ptr, begin_bit, end_bit))
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column transform(Column input, str unary_udf, DataType output_type, bool is_ptx):
+    """Create a new column by applying a unary function to every
+    element of an input column.
+
+    Parameters
+    ----------
+    input : Column
+        Column to transform.
+    unary_udf : str
+        The PTX/CUDA string of the unary function to apply.
+    output_type : DataType
+        The output type that is compatible with the output type in the unary_udf.
+    is_ptx : bool
+        If `True`, the UDF is treated as PTX code.
+        If `False`, the UDF is treated as CUDA code.
+
+    Returns
+    -------
+    Column
+        The transformed column having the UDF applied to each element.
+    """
+    cdef unique_ptr[column] c_result
+    cdef string c_unary_udf = unary_udf.encode()
+    cdef bool c_is_ptx = is_ptx
+
+    with nogil:
+        c_result = move(
+            cpp_transform.transform(
+                input.view(), c_unary_udf, output_type.c_obj, c_is_ptx
+            )
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+cpdef tuple[Table, Column] encode(Table input):
+    """Encode the rows of the given table as integers.
+
+    Parameters
+    ----------
+    input : Table
+        Table containing values to be encoded
+
+    Returns
+    -------
+    tuple[Table, Column]
+        The distinct rows of the input table in sorted order,
+        and a column of integer indices representing the encoded rows.
+    """
+    cdef pair[unique_ptr[table], unique_ptr[column]] c_result
+
+    with nogil:
+        c_result = move(cpp_transform.encode(input.view()))
+
+    return (
+        Table.from_libcudf(move(c_result.first)),
+        Column.from_libcudf(move(c_result.second))
+    )
+
+cpdef Table one_hot_encode(Column input, Column categories):
+    """Encodes `input` by generating a new column
+    for each value in `categories` indicating the presence
+    of that value in `input`.
+
+    Parameters
+    ----------
+    input : Column
+        Column containing values to be encoded.
+    categories : Column
+        Column containing categories.
+
+    Returns
+    -------
+    Table
+        A table of the encoded values.
+    """
+    cdef pair[unique_ptr[column], table_view] c_result
+    cdef Table owner_table
+
+    with nogil:
+        c_result = move(cpp_transform.one_hot_encode(input.view(), categories.view()))
+
+    owner_table = Table(
+        [Column.from_libcudf(move(c_result.first))] * c_result.second.num_columns()
+    )
+
+    return Table.from_table_view(c_result.second, owner_table)
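
The new pylibcudf entry points above (datetime component extraction, string stripping via the SideType enum, and the transform bindings) are each exercised in isolation by the tests in this change. As a rough composite sketch, assuming only an installed pylibcudf wheel plus pyarrow and reusing the call patterns from those tests, they can be driven directly like this:

import datetime

import pyarrow as pa
import pylibcudf as plc

# Datetime component extraction via the new extract_datetime_component.
ts = plc.interop.from_arrow(
    pa.array(
        [datetime.datetime(2024, 2, 29, 3, 14, 15), None],
        type=pa.timestamp("us"),
    )
)
years = plc.datetime.extract_datetime_component(ts, "year")

# Whitespace stripping with the new SideType enum and strip binding
# (to_strip defaults to the empty string, i.e. strip whitespace).
strings = plc.interop.from_arrow(pa.array(["  abc  ", None]))
stripped = plc.strings.strip.strip(strings, plc.strings.SideType.BOTH)

# One-hot encoding through the new transform binding (returns a Table).
column = plc.interop.from_arrow(pa.array([1, 2, 3]))
categories = plc.interop.from_arrow(pa.array([1, 2]))
one_hot = plc.transform.one_hot_encode(column, categories)

print(plc.interop.to_arrow(years))
print(plc.interop.to_arrow(stripped))
print(plc.interop.to_arrow(one_hot))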