diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 8976a4b14cb..2c79cbb6b6c 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -40,7 +40,7 @@ dependencies: - hypothesis - identify>=2.5.20 - ipython -- libarrow==12.0.1.* +- libarrow==12.0.0.* - libcufile-dev=1.4.0.31 - libcufile=1.4.0.31 - libcurand-dev=10.3.0.86 @@ -69,7 +69,7 @@ dependencies: - pre-commit - protobuf>=4.21,<5 - ptxcompiler -- pyarrow==12.0.1.* +- pyarrow==12.0.0.* - pydata-sphinx-theme - pyorc - pytest diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index f54d78593c3..c96b7428882 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -17,6 +17,7 @@ dependencies: - cachetools - cmake>=3.26.4 - cuda-cudart-dev +- cuda-gdb - cuda-nvcc - cuda-nvrtc-dev - cuda-nvtx-dev @@ -41,7 +42,7 @@ dependencies: - hypothesis - identify>=2.5.20 - ipython -- libarrow==12.0.1.* +- libarrow==12.0.0.* - libcufile-dev - libcurand-dev - libkvikio==23.12.* @@ -66,7 +67,7 @@ dependencies: - pip - pre-commit - protobuf>=4.21,<5 -- pyarrow==12.0.1.* +- pyarrow==12.0.0.* - pydata-sphinx-theme - pyorc - pytest diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 54b687faa69..16b064a262e 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -61,7 +61,7 @@ requirements: - scikit-build >=0.13.1 - setuptools - dlpack >=0.5,<0.6.0a0 - - pyarrow =12 + - pyarrow =12.0.0 - libcudf ={{ version }} - rmm ={{ minor_version }} {% if cuda_major == "11" %} diff --git a/conda/recipes/cudf_kafka/build.sh b/conda/recipes/cudf_kafka/build.sh index 5d8720f1c98..f4bb6e1bc91 100644 --- a/conda/recipes/cudf_kafka/build.sh +++ b/conda/recipes/cudf_kafka/build.sh @@ -1,4 +1,16 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. # This assumes the script is executed from the root of the repo directory +# Need to set CUDA_HOME inside conda environments because the hacked together +# setup.py for cudf-kafka searches that way. +# TODO: Remove after https://github.com/rapidsai/cudf/pull/14292 updates +# cudf_kafka to use scikit-build +CUDA_MAJOR=${RAPIDS_CUDA_VERSION%%.*} +if [[ ${CUDA_MAJOR} == "12" ]]; then + target_name="x86_64-linux" + if [[ ! 
$(arch) == "x86_64" ]]; then + target_name="sbsa-linux" + fi + export CUDA_HOME="${PREFIX}/targets/${target_name}/" +fi ./build.sh -v cudf_kafka diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml index ec0cc402511..a79c23b7d98 100644 --- a/conda/recipes/cudf_kafka/meta.yaml +++ b/conda/recipes/cudf_kafka/meta.yaml @@ -33,6 +33,9 @@ build: - SCCACHE_S3_KEY_PREFIX=cudf-kafka-linux64 # [linux64] - SCCACHE_S3_USE_SSL - SCCACHE_S3_NO_CREDENTIALS + # TODO: Remove after https://github.com/rapidsai/cudf/pull/14292 updates + # cudf_kafka to use scikit-build + - RAPIDS_CUDA_VERSION requirements: build: @@ -41,6 +44,11 @@ requirements: - {{ compiler('cxx') }} - ninja - sysroot_{{ target_platform }} {{ sysroot_version }} + # TODO: Remove after https://github.com/rapidsai/cudf/pull/14292 updates + # cudf_kafka to use scikit-build + {% if cuda_major == "12" %} + - cuda-gdb + {% endif %} host: - python - cython >=3.0.0 @@ -48,6 +56,9 @@ requirements: - cudf ={{ version }} - libcudf_kafka ={{ version }} - setuptools + {% if cuda_major == "12" %} + - cuda-cudart-dev + {% endif %} run: - python - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml index b1f5b083e06..4d33bb89220 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -25,8 +25,8 @@ gtest_version: aws_sdk_cpp_version: - "<1.11" -libarrow_version: - - "=12" +libarrow: + - "==12.0.0" dlpack_version: - ">=0.5,<0.6.0a0" diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 28357f0d96d..b9aff2a9c82 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -65,7 +65,7 @@ requirements: {% endif %} - cuda-version ={{ cuda_version }} - nvcomp {{ nvcomp_version }} - - libarrow {{ libarrow_version }} + - libarrow {{ libarrow }} - dlpack {{ dlpack_version }} - librdkafka {{ librdkafka_version }} - fmt {{ fmt_version }} @@ -104,7 +104,7 @@ outputs: - nvcomp {{ nvcomp_version }} - librmm ={{ minor_version }} - libkvikio ={{ minor_version }} - - libarrow {{ libarrow_version }} + - libarrow {{ libarrow }} - dlpack {{ dlpack_version }} - gtest {{ gtest_version }} - gmock {{ gtest_version }} diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index 10d3145a36f..c2d5cfbaf78 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -411,7 +411,7 @@ if(NOT DEFINED CUDF_VERSION_Arrow) set(CUDF_VERSION_Arrow # This version must be kept in sync with the libarrow version pinned for builds in # dependencies.yaml. - 12.0.1 + 12.0.0 CACHE STRING "The version of Arrow to find (or build)" ) endif() diff --git a/cpp/include/cudf/lists/detail/scatter.cuh b/cpp/include/cudf/lists/detail/scatter.cuh index f04b2fda2bf..ff148c59a23 100644 --- a/cpp/include/cudf/lists/detail/scatter.cuh +++ b/cpp/include/cudf/lists/detail/scatter.cuh @@ -20,9 +20,9 @@ #include #include #include +#include #include #include -#include #include #include #include @@ -130,8 +130,8 @@ std::unique_ptr scatter_impl(rmm::device_uvector cons std::vector> children; children.emplace_back(std::move(offsets_column)); children.emplace_back(std::move(child_column)); - auto null_mask = - target.has_nulls() ? copy_bitmask(target, stream, mr) : rmm::device_buffer{0, stream, mr}; + auto null_mask = target.has_nulls() ? 
cudf::detail::copy_bitmask(target, stream, mr) + : rmm::device_buffer{0, stream, mr}; // The output column from this function only has null masks copied from the target columns. // That is still not a correct final null mask for the scatter result. diff --git a/cpp/include/cudf/null_mask.hpp b/cpp/include/cudf/null_mask.hpp index 672f479ad53..524296e60ca 100644 --- a/cpp/include/cudf/null_mask.hpp +++ b/cpp/include/cudf/null_mask.hpp @@ -16,6 +16,7 @@ #pragma once #include +#include #include #include @@ -80,6 +81,7 @@ size_type num_bitmask_words(size_type number_of_bits); * * @param size The number of elements to be represented by the mask * @param state The desired state of the mask + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned device_buffer * @return A `device_buffer` for use as a null bitmask * satisfying the desired size and state @@ -87,6 +89,7 @@ size_type num_bitmask_words(size_type number_of_bits); rmm::device_buffer create_null_mask( size_type size, mask_state state, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -100,8 +103,13 @@ rmm::device_buffer create_null_mask( * @param begin_bit Index of the first bit to set (inclusive) * @param end_bit Index of the last bit to set (exclusive) * @param valid If true set all entries to valid; otherwise, set all to null + * @param stream CUDA stream used for device memory operations and kernel launches */ -void set_null_mask(bitmask_type* bitmask, size_type begin_bit, size_type end_bit, bool valid); +void set_null_mask(bitmask_type* bitmask, + size_type begin_bit, + size_type end_bit, + bool valid, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Creates a `device_buffer` from a slice of bitmask defined by a range @@ -115,6 +123,7 @@ void set_null_mask(bitmask_type* bitmask, size_type begin_bit, size_type end_bit * @param mask Bitmask residing in device memory whose bits will be copied * @param begin_bit Index of the first bit to be copied (inclusive) * @param end_bit Index of the last bit to be copied (exclusive) + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned device_buffer * @return A `device_buffer` containing the bits * `[begin_bit, end_bit)` from `mask`. @@ -123,6 +132,7 @@ rmm::device_buffer copy_bitmask( bitmask_type const* mask, size_type begin_bit, size_type end_bit, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -132,12 +142,14 @@ rmm::device_buffer copy_bitmask( * Returns empty `device_buffer` if the column is not nullable * * @param view Column view whose bitmask needs to be copied + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned device_buffer * @return A `device_buffer` containing the bits * `[view.offset(), view.offset() + view.size())` from `view`'s bitmask. */ rmm::device_buffer copy_bitmask( column_view const& view, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -148,11 +160,13 @@ rmm::device_buffer copy_bitmask( * If no column in the table is nullable, an empty bitmask is returned. 
* * @param view The table of columns + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned device_buffer * @return A pair of resulting bitmask and count of unset bits */ std::pair bitmask_and( table_view const& view, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -163,11 +177,13 @@ std::pair bitmask_and( * If no column in the table is nullable, an empty bitmask is returned. * * @param view The table of columns + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned device_buffer * @return A pair of resulting bitmask and count of unset bits */ std::pair bitmask_or( table_view const& view, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -183,8 +199,12 @@ std::pair bitmask_or( * @param bitmask Validity bitmask residing in device memory. * @param start Index of the first bit to count (inclusive). * @param stop Index of the last bit to count (exclusive). + * @param stream CUDA stream used for device memory operations and kernel launches * @return The number of null elements in the specified range. */ -cudf::size_type null_count(bitmask_type const* bitmask, size_type start, size_type stop); +cudf::size_type null_count(bitmask_type const* bitmask, + size_type start, + size_type stop, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** @} */ // end of group } // namespace cudf diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp index 6b413ab2be4..53b04c4ca80 100644 --- a/cpp/src/binaryop/binaryop.cpp +++ b/cpp/src/binaryop/binaryop.cpp @@ -366,7 +366,7 @@ std::unique_ptr binary_operation(column_view const& lhs, CUDF_EXPECTS((lhs.size() == rhs.size()), "Column sizes don't match"); - auto [new_mask, null_count] = bitmask_and(table_view({lhs, rhs}), stream, mr); + auto [new_mask, null_count] = cudf::detail::bitmask_and(table_view({lhs, rhs}), stream, mr); auto out = make_fixed_width_column(output_type, lhs.size(), std::move(new_mask), null_count, stream, mr); diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu index 5a0d3e4f120..3ff56eabe1e 100644 --- a/cpp/src/bitmask/null_mask.cu +++ b/cpp/src/bitmask/null_mask.cu @@ -157,16 +157,21 @@ void set_null_mask(bitmask_type* bitmask, // Create a device_buffer for a null mask rmm::device_buffer create_null_mask(size_type size, mask_state state, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return detail::create_null_mask(size, state, cudf::get_default_stream(), mr); + return detail::create_null_mask(size, state, stream, mr); } // Set pre-allocated null mask of given bit range [begin_bit, end_bit) to valid, if valid==true, // or null, otherwise; -void set_null_mask(bitmask_type* bitmask, size_type begin_bit, size_type end_bit, bool valid) +void set_null_mask(bitmask_type* bitmask, + size_type begin_bit, + size_type end_bit, + bool valid, + rmm::cuda_stream_view stream) { - return detail::set_null_mask(bitmask, begin_bit, end_bit, valid, cudf::get_default_stream()); + return detail::set_null_mask(bitmask, begin_bit, end_bit, valid, stream); } namespace detail { @@ -511,33 +516,46 @@ std::pair bitmask_or(table_view const& view, rmm::device_buffer copy_bitmask(bitmask_type const* mask, size_type 
begin_bit, size_type end_bit, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return detail::copy_bitmask(mask, begin_bit, end_bit, cudf::get_default_stream(), mr); + CUDF_FUNC_RANGE(); + return detail::copy_bitmask(mask, begin_bit, end_bit, stream, mr); } // Create a bitmask from a column view -rmm::device_buffer copy_bitmask(column_view const& view, rmm::mr::device_memory_resource* mr) +rmm::device_buffer copy_bitmask(column_view const& view, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - return detail::copy_bitmask(view, cudf::get_default_stream(), mr); + CUDF_FUNC_RANGE(); + return detail::copy_bitmask(view, stream, mr); } std::pair bitmask_and(table_view const& view, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return detail::bitmask_and(view, cudf::get_default_stream(), mr); + CUDF_FUNC_RANGE(); + return detail::bitmask_and(view, stream, mr); } std::pair bitmask_or(table_view const& view, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return detail::bitmask_or(view, cudf::get_default_stream(), mr); + CUDF_FUNC_RANGE(); + return detail::bitmask_or(view, stream, mr); } // Count non-zero bits in the specified range -cudf::size_type null_count(bitmask_type const* bitmask, size_type start, size_type stop) +cudf::size_type null_count(bitmask_type const* bitmask, + size_type start, + size_type stop, + rmm::cuda_stream_view stream) { - return detail::null_count(bitmask, start, stop, cudf::get_default_stream()); + CUDF_FUNC_RANGE(); + return detail::null_count(bitmask, start, stop, stream); } } // namespace cudf diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index d08c3025553..9b9e780965a 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -563,7 +563,7 @@ rmm::device_buffer concatenate_masks(host_span views, }); rmm::device_buffer null_mask = - create_null_mask(total_element_count, mask_state::UNINITIALIZED, mr); + cudf::detail::create_null_mask(total_element_count, mask_state::UNINITIALIZED, stream, mr); detail::concatenate_masks(views, static_cast(null_mask.data()), stream); diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu index 11c27fc86e3..879ddb5048e 100644 --- a/cpp/src/copying/scatter.cu +++ b/cpp/src/copying/scatter.cu @@ -268,8 +268,9 @@ struct column_scalar_scatterer_impl { // Compute null mask rmm::device_buffer null_mask = - target.nullable() ? copy_bitmask(target, stream, mr) - : create_null_mask(target.size(), mask_state::UNALLOCATED, stream, mr); + target.nullable() + ? detail::copy_bitmask(target, stream, mr) + : detail::create_null_mask(target.size(), mask_state::UNALLOCATED, stream, mr); column null_mask_stub(data_type{type_id::STRUCT}, target.size(), rmm::device_buffer{}, diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 506832881a9..195c8924c9a 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -410,7 +410,8 @@ void sparse_to_dense_results(table_view const& keys, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto row_bitmask = bitmask_and(keys, stream, rmm::mr::get_current_device_resource()).first; + auto row_bitmask = + cudf::detail::bitmask_and(keys, stream, rmm::mr::get_current_device_resource()).first; bool skip_key_rows_with_nulls = keys_have_nulls and include_null_keys == null_policy::EXCLUDE; bitmask_type const* row_bitmask_ptr = skip_key_rows_with_nulls ? 
static_cast(row_bitmask.data()) : nullptr; diff --git a/cpp/src/io/fst/lookup_tables.cuh b/cpp/src/io/fst/lookup_tables.cuh index 37c99453361..42036b79751 100644 --- a/cpp/src/io/fst/lookup_tables.cuh +++ b/cpp/src/io/fst/lookup_tables.cuh @@ -753,7 +753,7 @@ class TranslationOp { RelativeOffsetT const relative_offset, SymbolT const read_symbol) const { - return translation_op(*this, state_id, match_id, relative_offset, read_symbol); + return translation_op(state_id, match_id, relative_offset, read_symbol); } template @@ -761,7 +761,7 @@ class TranslationOp { SymbolIndexT const match_id, SymbolT const read_symbol) const { - return translation_op(*this, state_id, match_id, read_symbol); + return translation_op(state_id, match_id, read_symbol); } }; diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index c9107357239..3702d94fd2b 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -91,6 +91,98 @@ void check_input_size(std::size_t input_size) namespace cudf::io::json { +// FST to help fix the stack context of characters that follow the first record on each JSON line +namespace fix_stack_of_excess_chars { + +// Type used to represent the target state in the transition table +using StateT = char; + +// Type used to represent a symbol group id +using SymbolGroupT = uint8_t; + +/** + * @brief Definition of the DFA's states. + */ +enum class dfa_states : StateT { + // Before the first record on the JSON line + BEFORE, + // Within the first record on the JSON line + WITHIN, + // Excess data that follows the first record on the JSON line + EXCESS, + // Total number of states + NUM_STATES +}; + +/** + * @brief Definition of the symbol groups + */ +enum class dfa_symbol_group_id : SymbolGroupT { + ROOT, ///< Symbol for root stack context + DELIMITER, ///< Line delimiter symbol group + OTHER, ///< Symbol group that implicitly matches all other tokens + NUM_SYMBOL_GROUPS ///< Total number of symbol groups +}; + +constexpr auto TT_NUM_STATES = static_cast(dfa_states::NUM_STATES); +constexpr auto NUM_SYMBOL_GROUPS = static_cast(dfa_symbol_group_id::NUM_SYMBOL_GROUPS); + +/** + * @brief Function object to map (input_symbol,stack_context) tuples to a symbol group. + */ +struct SymbolPairToSymbolGroupId { + CUDF_HOST_DEVICE SymbolGroupT operator()(thrust::tuple symbol) const + { + auto const input_symbol = thrust::get<0>(symbol); + auto const stack_symbol = thrust::get<1>(symbol); + return static_cast( + input_symbol == '\n' + ? dfa_symbol_group_id::DELIMITER + : (stack_symbol == '_' ? dfa_symbol_group_id::ROOT : dfa_symbol_group_id::OTHER)); + } +}; + +/** + * @brief Translation function object that fixes the stack context of excess data that follows + * the first JSON record on each line.
+ */ +struct TransduceInputOp { + template + constexpr CUDF_HOST_DEVICE StackSymbolT operator()(StateT const state_id, + SymbolGroupT const match_id, + RelativeOffsetT const relative_offset, + SymbolT const read_symbol) const + { + if (state_id == static_cast(dfa_states::EXCESS)) { return '_'; } + return thrust::get<1>(read_symbol); + } + + template + constexpr CUDF_HOST_DEVICE int32_t operator()(StateT const state_id, + SymbolGroupT const match_id, + SymbolT const read_symbol) const + { + constexpr int32_t single_output_item = 1; + return single_output_item; + } +}; + +// Aliases for readability of the transition table +constexpr auto TT_BEFORE = dfa_states::BEFORE; +constexpr auto TT_INSIDE = dfa_states::WITHIN; +constexpr auto TT_EXCESS = dfa_states::EXCESS; + +// Transition table +std::array, TT_NUM_STATES> constexpr transition_table{ + {/* IN_STATE ROOT NEWLINE OTHER */ + /* TT_BEFORE */ {{TT_BEFORE, TT_BEFORE, TT_INSIDE}}, + /* TT_INSIDE */ {{TT_EXCESS, TT_BEFORE, TT_INSIDE}}, + /* TT_EXCESS */ {{TT_EXCESS, TT_BEFORE, TT_EXCESS}}}}; + +// The DFA's starting state +constexpr auto start_state = static_cast(dfa_states::BEFORE); +} // namespace fix_stack_of_excess_chars + // FST to prune tokens of invalid lines for recovering JSON lines format namespace token_filter { @@ -146,9 +238,8 @@ struct UnwrapTokenFromSymbolOp { * invalid lines. */ struct TransduceToken { - template - constexpr CUDF_HOST_DEVICE SymbolT operator()(TransducerTableT const&, - StateT const state_id, + template + constexpr CUDF_HOST_DEVICE SymbolT operator()(StateT const state_id, SymbolGroupT const match_id, RelativeOffsetT const relative_offset, SymbolT const read_symbol) const @@ -165,9 +256,8 @@ struct TransduceToken { } } - template - constexpr CUDF_HOST_DEVICE int32_t operator()(TransducerTableT const&, - StateT const state_id, + template + constexpr CUDF_HOST_DEVICE int32_t operator()(StateT const state_id, SymbolGroupT const match_id, SymbolT const read_symbol) const { @@ -643,6 +733,11 @@ auto get_transition_table(json_format_cfg_t format) // PD_ANL describes the target state after a new line after encountering error state auto const PD_ANL = (format == json_format_cfg_t::JSON_LINES_RECOVER) ? PD_BOV : PD_ERR; + // Target state after having parsed the first JSON value on a JSON line + // Spark has the special need to ignore everything that comes after the first JSON object + // on a JSON line instead of marking those as invalid + auto const PD_AFS = (format == json_format_cfg_t::JSON_LINES_RECOVER) ? 
PD_PVL : PD_ERR; + // First row: empty stack ("root" level of the JSON) // Second row: '[' on top of stack (we're parsing a list value) // Third row: '{' on top of stack (we're parsing a struct value) @@ -668,7 +763,7 @@ auto get_transition_table(json_format_cfg_t format) PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_BOV, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_BOV, PD_STR}; pda_tt[static_cast(pda_state_t::PD_PVL)] = { - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_BOV, PD_ERR, + PD_AFS, PD_AFS, PD_AFS, PD_AFS, PD_AFS, PD_AFS, PD_AFS, PD_AFS, PD_PVL, PD_BOV, PD_AFS, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_PVL, PD_BOV, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_ERR, PD_PVL, PD_BOV, PD_ERR}; pda_tt[static_cast(pda_state_t::PD_BFN)] = { @@ -733,6 +828,18 @@ auto get_translation_table(bool recover_from_error) return regular_tokens; }; + /** + * @brief Helper function that returns `recovering_tokens` if `recover_from_error` is true and + * returns `regular_tokens` otherwise. This is used to ignore excess characters after the first + * value in the case of JSON lines that recover from invalid lines, as Spark ignores any excess + * characters that follow the first record on a JSON line. + */ + auto alt_tokens = [recover_from_error](std::vector regular_tokens, + std::vector recovering_tokens) { + if (recover_from_error) { return recovering_tokens; } + return regular_tokens; + }; + std::array, NUM_PDA_SGIDS>, PD_NUM_STATES> pda_tlt; pda_tlt[static_cast(pda_state_t::PD_BOV)] = {{ /*ROOT*/ {StructBegin}, // OPENING_BRACE @@ -920,18 +1027,18 @@ auto get_translation_table(bool recover_from_error) {}}}; // OTHER pda_tlt[static_cast(pda_state_t::PD_PVL)] = { - { /*ROOT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {}, // WHITE_SPACE - nl_tokens({}, {}), // LINE_BREAK - {ErrorBegin}, // OTHER + { /*ROOT*/ + {alt_tokens({ErrorBegin}, {})}, // OPENING_BRACE + {alt_tokens({ErrorBegin}, {})}, // OPENING_BRACKET + {alt_tokens({ErrorBegin}, {})}, // CLOSING_BRACE + {alt_tokens({ErrorBegin}, {})}, // CLOSING_BRACKET + {alt_tokens({ErrorBegin}, {})}, // QUOTE + {alt_tokens({ErrorBegin}, {})}, // ESCAPE + {alt_tokens({ErrorBegin}, {})}, // COMMA + {alt_tokens({ErrorBegin}, {})}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {}), // LINE_BREAK + {alt_tokens({ErrorBegin}, {})}, // OTHER /*LIST*/ {ErrorBegin}, // OPENING_BRACE {ErrorBegin}, // OPENING_BRACKET @@ -1446,6 +1553,26 @@ std::pair, rmm::device_uvector> ge // character. auto zip_in = thrust::make_zip_iterator(json_in.data(), stack_symbols.data()); + // Spark, as the main stakeholder in the `recover_from_error` option, has the specific need to + // ignore any characters that follow the first value on each JSON line. This is an FST that + // fixes the stack context for those excess characters. 
That is, all those excess characters + // will be interpreted in the root stack context + if (recover_from_error) { + auto fix_stack_of_excess_chars = fst::detail::make_fst( + fst::detail::make_symbol_group_lookup_op( + fix_stack_of_excess_chars::SymbolPairToSymbolGroupId{}), + fst::detail::make_transition_table(fix_stack_of_excess_chars::transition_table), + fst::detail::make_translation_functor(fix_stack_of_excess_chars::TransduceInputOp{}), + stream); + fix_stack_of_excess_chars.Transduce(zip_in, + static_cast(json_in.size()), + stack_symbols.data(), + thrust::make_discard_iterator(), + thrust::make_discard_iterator(), + fix_stack_of_excess_chars::start_state, + stream); + } + constexpr auto max_translation_table_size = tokenizer_pda::NUM_PDA_SGIDS * static_cast(tokenizer_pda::pda_state_t::PD_NUM_STATES); diff --git a/cpp/src/io/parquet/delta_binary.cuh b/cpp/src/io/parquet/delta_binary.cuh index a513e6674b4..e3b23f4c0a0 100644 --- a/cpp/src/io/parquet/delta_binary.cuh +++ b/cpp/src/io/parquet/delta_binary.cuh @@ -46,12 +46,6 @@ namespace cudf::io::parquet::detail { // encoded with DELTA_LENGTH_BYTE_ARRAY encoding, which is a DELTA_BINARY_PACKED list of suffix // lengths, followed by the concatenated suffix data. -// TODO: The delta encodings use ULEB128 integers, but for now we're only -// using max 64 bits. Need to see what the performance impact is of using -// __int128_t rather than int64_t. -using uleb128_t = uint64_t; -using zigzag128_t = int64_t; - // we decode one mini-block at a time. max mini-block size seen is 64. constexpr int delta_rolling_buf_size = 128; diff --git a/cpp/src/io/parquet/delta_enc.cuh b/cpp/src/io/parquet/delta_enc.cuh new file mode 100644 index 00000000000..28f8cdfe2c1 --- /dev/null +++ b/cpp/src/io/parquet/delta_enc.cuh @@ -0,0 +1,290 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "parquet_gpu.hpp" + +#include +#include + +#include + +namespace cudf::io::parquet::detail { + +namespace delta { + +inline __device__ void put_uleb128(uint8_t*& p, uleb128_t v) +{ + while (v > 0x7f) { + *(p++) = v | 0x80; + v >>= 7; + } + *(p++) = v; +} + +inline __device__ void put_zz128(uint8_t*& p, zigzag128_t v) +{ + zigzag128_t s = (v < 0); + put_uleb128(p, (v ^ -s) * 2 + s); +} + +// A block size of 128, with 4 mini-blocks of 32 values each fits nicely without consuming +// too much shared memory. +// The parquet spec requires block_size to be a multiple of 128, and values_per_mini_block +// to be a multiple of 32. +constexpr int block_size = 128; +constexpr int num_mini_blocks = 4; +constexpr int values_per_mini_block = block_size / num_mini_blocks; +constexpr int buffer_size = 2 * block_size; + +// Extra sanity checks to enforce compliance with the parquet specification.
+static_assert(block_size % 128 == 0); +static_assert(values_per_mini_block % 32 == 0); + +using block_reduce = cub::BlockReduce; +using warp_reduce = cub::WarpReduce; +using index_scan = cub::BlockScan; + +constexpr int rolling_idx(int index) { return rolling_index(index); } + +// Version of bit packer that can handle up to 64 bits values. +// T is the type to use for processing. if nbits <= 32 use uint32_t, otherwise unsigned long long +// (not uint64_t because of atomicOr's typing). allowing this to be selectable since there's a +// measurable impact to using the wider types. +template +inline __device__ void bitpack_mini_block( + uint8_t* dst, uleb128_t val, uint32_t count, uint8_t nbits, void* temp_space) +{ + using wide_type = + std::conditional_t, __uint128_t, uint64_t>; + using cudf::detail::warp_size; + scratch_type constexpr mask = sizeof(scratch_type) * 8 - 1; + auto constexpr div = sizeof(scratch_type) * 8; + + auto const lane_id = threadIdx.x % warp_size; + auto const warp_id = threadIdx.x / warp_size; + + auto const scratch = reinterpret_cast(temp_space) + warp_id * warp_size; + + // zero out scratch + scratch[lane_id] = 0; + __syncwarp(); + + // TODO: see if there is any savings using special packing for easy bitwidths (1,2,4,8,16...) + // like what's done for the RLE encoder. + if (nbits == div) { + if (lane_id < count) { + for (int i = 0; i < sizeof(scratch_type); i++) { + dst[lane_id * sizeof(scratch_type) + i] = val & 0xff; + val >>= 8; + } + } + return; + } + + if (lane_id <= count) { + // Shift symbol left by up to mask bits. + wide_type v2 = val; + v2 <<= (lane_id * nbits) & mask; + + // Copy N bit word into two N/2 bit words while following C++ strict aliasing rules. + scratch_type v1[2]; + memcpy(&v1, &v2, sizeof(wide_type)); + + // Atomically write result to scratch. + if (v1[0]) { atomicOr(scratch + ((lane_id * nbits) / div), v1[0]); } + if (v1[1]) { atomicOr(scratch + ((lane_id * nbits) / div) + 1, v1[1]); } + } + __syncwarp(); + + // Copy scratch data to final destination. + auto const available_bytes = util::div_rounding_up_safe(count * nbits, 8U); + auto const scratch_bytes = reinterpret_cast(scratch); + + for (uint32_t i = lane_id; i < available_bytes; i += warp_size) { + dst[i] = scratch_bytes[i]; + } + __syncwarp(); +} + +} // namespace delta + +// Object used to turn a stream of integers into a DELTA_BINARY_PACKED stream. This takes as input +// 128 values with validity at a time, saving them until there are enough values for a block +// to be written. +// T is the input data type (either zigzag128_t or uleb128_t). +template +class delta_binary_packer { + private: + uint8_t* _dst; // sink to dump encoded values to + T* _buffer; // buffer to store values to be encoded + size_type _current_idx; // index of first value in buffer + uint32_t _num_values; // total number of values to encode + size_type _values_in_buffer; // current number of values stored in _buffer + uint8_t _mb_bits[delta::num_mini_blocks]; // bitwidth for each mini-block + + // pointers to shared scratch memory for the warp and block scans/reduces + delta::index_scan::TempStorage* _scan_tmp; + delta::warp_reduce::TempStorage* _warp_tmp; + delta::block_reduce::TempStorage* _block_tmp; + + void* _bitpack_tmp; // pointer to shared scratch memory used in bitpacking + + // Write the delta binary header. Only call from thread 0. 
+ inline __device__ void write_header() + { + delta::put_uleb128(_dst, delta::block_size); + delta::put_uleb128(_dst, delta::num_mini_blocks); + delta::put_uleb128(_dst, _num_values); + delta::put_zz128(_dst, _buffer[0]); + } + + // Write the block header. Only call from thread 0. + inline __device__ void write_block_header(zigzag128_t block_min) + { + delta::put_zz128(_dst, block_min); + memcpy(_dst, _mb_bits, 4); + _dst += 4; + } + + // Signed subtraction with defined wrapping behavior. + inline __device__ zigzag128_t subtract(zigzag128_t a, zigzag128_t b) + { + return static_cast(static_cast(a) - static_cast(b)); + } + + public: + inline __device__ auto num_values() const { return _num_values; } + + // Initialize the object. Only call from thread 0. + inline __device__ void init(uint8_t* dest, uint32_t num_values, T* buffer, void* temp_storage) + { + _dst = dest; + _num_values = num_values; + _buffer = buffer; + _scan_tmp = reinterpret_cast(temp_storage); + _warp_tmp = reinterpret_cast(temp_storage); + _block_tmp = reinterpret_cast(temp_storage); + _bitpack_tmp = _buffer + delta::buffer_size; + _current_idx = 0; + _values_in_buffer = 0; + } + + // Each thread calls this to add its current value. + inline __device__ void add_value(T value, bool is_valid) + { + // Figure out the correct position for the given value. + size_type const valid = is_valid; + size_type pos; + size_type num_valid; + delta::index_scan(*_scan_tmp).ExclusiveSum(valid, pos, num_valid); + + if (is_valid) { _buffer[delta::rolling_idx(pos + _current_idx + _values_in_buffer)] = value; } + __syncthreads(); + + if (threadIdx.x == 0) { + _values_in_buffer += num_valid; + // if first pass write header + if (_current_idx == 0) { + write_header(); + _current_idx = 1; + _values_in_buffer -= 1; + } + } + __syncthreads(); + + if (_values_in_buffer >= delta::block_size) { flush(); } + } + + // Called by each thread to flush data to the sink. + inline __device__ uint8_t const* flush() + { + using cudf::detail::warp_size; + __shared__ zigzag128_t block_min; + + int const t = threadIdx.x; + int const warp_id = t / warp_size; + int const lane_id = t % warp_size; + + if (_values_in_buffer <= 0) { return _dst; } + + // Calculate delta for this thread. + size_type const idx = _current_idx + t; + zigzag128_t const delta = idx < _num_values ? subtract(_buffer[delta::rolling_idx(idx)], + _buffer[delta::rolling_idx(idx - 1)]) + : std::numeric_limits::max(); + + // Find min delta for the block. + auto const min_delta = delta::block_reduce(*_block_tmp).Reduce(delta, cub::Min()); + + if (t == 0) { block_min = min_delta; } + __syncthreads(); + + // Compute frame of reference for the block. + uleb128_t const norm_delta = idx < _num_values ? subtract(delta, block_min) : 0; + + // Get max normalized delta for each warp, and use that to determine how many bits to use + // for the bitpacking of this warp. + zigzag128_t const warp_max = + delta::warp_reduce(_warp_tmp[warp_id]).Reduce(norm_delta, cub::Max()); + __syncwarp(); + + if (lane_id == 0) { _mb_bits[warp_id] = sizeof(zigzag128_t) * 8 - __clzll(warp_max); } + __syncthreads(); + + // write block header + if (t == 0) { write_block_header(block_min); } + __syncthreads(); + + // Now each warp encodes its data...can calculate starting offset with _mb_bits. + // NOTE: using a switch here rather than a loop because the compiler produces code that + // uses fewer registers. 
+ int cumulative_bits = 0; + switch (warp_id) { + case 3: cumulative_bits += _mb_bits[2]; [[fallthrough]]; + case 2: cumulative_bits += _mb_bits[1]; [[fallthrough]]; + case 1: cumulative_bits += _mb_bits[0]; + } + uint8_t* const mb_ptr = _dst + cumulative_bits * delta::values_per_mini_block / 8; + + // encoding happens here + auto const warp_idx = _current_idx + warp_id * delta::values_per_mini_block; + if (warp_idx < _num_values) { + auto const num_enc = min(delta::values_per_mini_block, _num_values - warp_idx); + if (_mb_bits[warp_id] > 32) { + delta::bitpack_mini_block( + mb_ptr, norm_delta, num_enc, _mb_bits[warp_id], _bitpack_tmp); + } else { + delta::bitpack_mini_block( + mb_ptr, norm_delta, num_enc, _mb_bits[warp_id], _bitpack_tmp); + } + } + __syncthreads(); + + // Last warp updates global delta ptr. + if (warp_id == delta::num_mini_blocks - 1 && lane_id == 0) { + _dst = mb_ptr + _mb_bits[warp_id] * delta::values_per_mini_block / 8; + _current_idx = min(warp_idx + delta::values_per_mini_block, _num_values); + _values_in_buffer = max(_values_in_buffer - delta::block_size, 0U); + } + __syncthreads(); + + return _dst; + } +}; + +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 78873d5e8ca..1e4f061d2e0 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "delta_enc.cuh" #include "parquet_gpu.cuh" #include @@ -21,6 +22,7 @@ #include #include #include +#include #include #include @@ -41,13 +43,19 @@ #include #include +#include + namespace cudf::io::parquet::detail { namespace { using ::cudf::detail::device_2dspan; -constexpr uint32_t rle_buffer_size = (1 << 9); +constexpr int encode_block_size = 128; +constexpr int rle_buffer_size = 2 * encode_block_size; +constexpr int num_encode_warps = encode_block_size / cudf::detail::warp_size; + +constexpr int rolling_idx(int pos) { return rolling_index(pos); } // do not truncate statistics constexpr int32_t NO_TRUNC_STATS = 0; @@ -69,6 +77,7 @@ struct frag_init_state_s { PageFragment frag; }; +template struct page_enc_state_s { uint8_t* cur; //!< current output ptr uint8_t* rle_out; //!< current RLE write ptr @@ -81,14 +90,15 @@ struct page_enc_state_s { uint32_t rle_rpt_count; uint32_t page_start_val; uint32_t chunk_start_val; - volatile uint32_t rpt_map[4]; - volatile uint32_t scratch_red[32]; + volatile uint32_t rpt_map[num_encode_warps]; EncPage page; EncColumnChunk ck; parquet_column_device_view col; - uint32_t vals[rle_buffer_size]; + uint32_t vals[rle_buf_size]; }; +using rle_page_enc_state_s = page_enc_state_s; + /** * @brief Returns the size of the type in the Parquet file. */ @@ -205,6 +215,12 @@ void __device__ calculate_frag_size(frag_init_state_s* const s, int t) } } +/** + * @brief Determine the correct page encoding for the given page parameters. + * + * This is only used by the plain and dictionary encoders. Delta encoders will set the page + * encoding directly. + */ Encoding __device__ determine_encoding(PageType page_type, Type physical_type, bool use_dictionary, @@ -216,7 +232,6 @@ Encoding __device__ determine_encoding(PageType page_type, switch (page_type) { case PageType::DATA_PAGE: return use_dictionary ? Encoding::PLAIN_DICTIONARY : Encoding::PLAIN; case PageType::DATA_PAGE_V2: - // TODO need to work in delta encodings here when they're added return physical_type == BOOLEAN ? Encoding::RLE : use_dictionary ? 
Encoding::RLE_DICTIONARY : Encoding::PLAIN; @@ -236,6 +251,50 @@ struct BitwiseOr { } }; +// I is the column type from the input table +template +__device__ uint8_t const* delta_encode(page_enc_state_s<0>* s, + uint32_t valid_count, + uint64_t* buffer, + void* temp_space) +{ + using output_type = std::conditional_t, zigzag128_t, uleb128_t>; + __shared__ delta_binary_packer packer; + + auto const t = threadIdx.x; + if (t == 0) { + packer.init(s->cur, valid_count, reinterpret_cast(buffer), temp_space); + } + __syncthreads(); + + // TODO(ets): in the plain encoder the scaling is a little different for INT32 than INT64. + // might need to modify this if there's a big performance hit in the 32-bit case. + int32_t const scale = s->col.ts_scale == 0 ? 1 : s->col.ts_scale; + for (uint32_t cur_val_idx = 0; cur_val_idx < s->page.num_leaf_values;) { + uint32_t const nvals = min(s->page.num_leaf_values - cur_val_idx, delta::block_size); + + size_type const val_idx_in_block = cur_val_idx + t; + size_type const val_idx = s->page_start_val + val_idx_in_block; + + bool const is_valid = + (val_idx < s->col.leaf_column->size() && val_idx_in_block < s->page.num_leaf_values) + ? s->col.leaf_column->is_valid(val_idx) + : false; + + cur_val_idx += nvals; + + output_type v = s->col.leaf_column->element(val_idx); + if (scale < 0) { + v /= -scale; + } else { + v *= scale; + } + packer.add_value(v, is_valid); + } + + return packer.flush(); +} + } // anonymous namespace // blockDim {512,1,1} @@ -323,6 +382,29 @@ __global__ void __launch_bounds__(128) } } +__device__ size_t delta_data_len(Type physical_type, cudf::type_id type_id, uint32_t num_values) +{ + auto const dtype_len_out = physical_type_len(physical_type, type_id); + auto const dtype_len = [&]() -> uint32_t { + if (physical_type == INT32) { return int32_logical_len(type_id); } + if (physical_type == INT96) { return sizeof(int64_t); } + return dtype_len_out; + }(); + + auto const vals_per_block = delta::block_size; + size_t const num_blocks = util::div_rounding_up_unsafe(num_values, vals_per_block); + // need max dtype_len + 1 bytes for min_delta + // one byte per mini block for the bitwidth + // and block_size * dtype_len bytes for the actual encoded data + auto const block_size = dtype_len + 1 + delta::num_mini_blocks + vals_per_block * dtype_len; + + // delta header is 2 bytes for the block_size, 1 byte for number of mini-blocks, + // max 5 bytes for number of values, and max dtype_len + 1 for first value. + auto const header_size = 2 + 1 + 5 + dtype_len + 1; + + return header_size + num_blocks * block_size; +} + // blockDim {128,1,1} __global__ void __launch_bounds__(128) gpuInitPages(device_2dspan chunks, @@ -354,6 +436,14 @@ __global__ void __launch_bounds__(128) page_g = {}; } __syncthreads(); + + // if writing delta encoded values, we're going to need to know the data length to get a guess + // at the worst case number of bytes needed to encode. 
+ auto const physical_type = col_g.physical_type; + auto const type_id = col_g.leaf_column->type().id(); + auto const is_use_delta = + write_v2_headers && !ck_g.use_dictionary && (physical_type == INT32 || physical_type == INT64); + if (t < 32) { uint32_t fragments_in_chunk = 0; uint32_t rows_in_page = 0; @@ -403,9 +493,12 @@ __global__ void __launch_bounds__(128) } __syncwarp(); if (t == 0) { - if (not pages.empty()) pages[ck_g.first_page] = page_g; - if (not page_sizes.empty()) page_sizes[ck_g.first_page] = page_g.max_data_size; - if (page_grstats) page_grstats[ck_g.first_page] = pagestats_g; + if (not pages.empty()) { + page_g.kernel_mask = encode_kernel_mask::PLAIN; + pages[ck_g.first_page] = page_g; + } + if (not page_sizes.empty()) { page_sizes[ck_g.first_page] = page_g.max_data_size; } + if (page_grstats) { page_grstats[ck_g.first_page] = pagestats_g; } } num_pages = 1; } @@ -505,7 +598,12 @@ __global__ void __launch_bounds__(128) page_g.num_values = values_in_page; auto const def_level_size = max_RLE_page_size(col_g.num_def_level_bits(), values_in_page); auto const rep_level_size = max_RLE_page_size(col_g.num_rep_level_bits(), values_in_page); - auto const max_data_size = page_size + def_level_size + rep_level_size + rle_pad; + // get a different bound if using delta encoding + if (is_use_delta) { + page_size = + max(page_size, delta_data_len(physical_type, type_id, page_g.num_leaf_values)); + } + auto const max_data_size = page_size + def_level_size + rep_level_size + rle_pad; // page size must fit in 32-bit signed integer if (max_data_size > std::numeric_limits::max()) { CUDF_UNREACHABLE("page size exceeds maximum for i32"); @@ -525,7 +623,16 @@ __global__ void __launch_bounds__(128) } __syncwarp(); if (t == 0) { - if (not pages.empty()) { pages[ck_g.first_page + num_pages] = page_g; } + if (not pages.empty()) { + if (is_use_delta) { + page_g.kernel_mask = encode_kernel_mask::DELTA_BINARY; + } else if (ck_g.use_dictionary || physical_type == BOOLEAN) { + page_g.kernel_mask = encode_kernel_mask::DICTIONARY; + } else { + page_g.kernel_mask = encode_kernel_mask::PLAIN; + } + pages[ck_g.first_page + num_pages] = page_g; + } if (not page_sizes.empty()) { page_sizes[ck_g.first_page + num_pages] = page_g.max_data_size; } @@ -789,8 +896,12 @@ inline __device__ void PackLiterals( * @param[in] t thread id (0..127) */ static __device__ void RleEncode( - page_enc_state_s* s, uint32_t numvals, uint32_t nbits, uint32_t flush, uint32_t t) + rle_page_enc_state_s* s, uint32_t numvals, uint32_t nbits, uint32_t flush, uint32_t t) { + using cudf::detail::warp_size; + auto const lane_id = t % warp_size; + auto const warp_id = t / warp_size; + uint32_t rle_pos = s->rle_pos; uint32_t rle_run = s->rle_run; @@ -798,20 +909,20 @@ static __device__ void RleEncode( uint32_t pos = rle_pos + t; if (rle_run > 0 && !(rle_run & 1)) { // Currently in a long repeat run - uint32_t mask = ballot(pos < numvals && s->vals[pos & (rle_buffer_size - 1)] == s->run_val); + uint32_t mask = ballot(pos < numvals && s->vals[rolling_idx(pos)] == s->run_val); uint32_t rle_rpt_count, max_rpt_count; - if (!(t & 0x1f)) { s->rpt_map[t >> 5] = mask; } + if (lane_id == 0) { s->rpt_map[warp_id] = mask; } __syncthreads(); - if (t < 32) { + if (t < warp_size) { uint32_t c32 = ballot(t >= 4 || s->rpt_map[t] != 0xffff'ffffu); - if (!t) { + if (t == 0) { uint32_t last_idx = __ffs(c32) - 1; s->rle_rpt_count = - last_idx * 32 + ((last_idx < 4) ? __ffs(~s->rpt_map[last_idx]) - 1 : 0); + last_idx * warp_size + ((last_idx < 4) ? 
__ffs(~s->rpt_map[last_idx]) - 1 : 0); } } __syncthreads(); - max_rpt_count = min(numvals - rle_pos, 128); + max_rpt_count = min(numvals - rle_pos, encode_block_size); rle_rpt_count = s->rle_rpt_count; rle_run += rle_rpt_count << 1; rle_pos += rle_rpt_count; @@ -828,17 +939,17 @@ static __device__ void RleEncode( } } else { // New run or in a literal run - uint32_t v0 = s->vals[pos & (rle_buffer_size - 1)]; - uint32_t v1 = s->vals[(pos + 1) & (rle_buffer_size - 1)]; + uint32_t v0 = s->vals[rolling_idx(pos)]; + uint32_t v1 = s->vals[rolling_idx(pos + 1)]; uint32_t mask = ballot(pos + 1 < numvals && v0 == v1); - uint32_t maxvals = min(numvals - rle_pos, 128); + uint32_t maxvals = min(numvals - rle_pos, encode_block_size); uint32_t rle_lit_count, rle_rpt_count; - if (!(t & 0x1f)) { s->rpt_map[t >> 5] = mask; } + if (lane_id == 0) { s->rpt_map[warp_id] = mask; } __syncthreads(); - if (t < 32) { + if (t < warp_size) { // Repeat run can only start on a multiple of 8 values - uint32_t idx8 = (t * 8) >> 5; - uint32_t pos8 = (t * 8) & 0x1f; + uint32_t idx8 = (t * 8) / warp_size; + uint32_t pos8 = (t * 8) % warp_size; uint32_t m0 = (idx8 < 4) ? s->rpt_map[idx8] : 0; uint32_t m1 = (idx8 < 3) ? s->rpt_map[idx8 + 1] : 0; uint32_t needed_mask = kRleRunMask[nbits - 1]; @@ -847,8 +958,8 @@ static __device__ void RleEncode( uint32_t rle_run_start = (mask != 0) ? min((__ffs(mask) - 1) * 8, maxvals) : maxvals; uint32_t rpt_len = 0; if (rle_run_start < maxvals) { - uint32_t idx_cur = rle_run_start >> 5; - uint32_t idx_ofs = rle_run_start & 0x1f; + uint32_t idx_cur = rle_run_start / warp_size; + uint32_t idx_ofs = rle_run_start % warp_size; while (idx_cur < 4) { m0 = (idx_cur < 4) ? s->rpt_map[idx_cur] : 0; m1 = (idx_cur < 3) ? s->rpt_map[idx_cur + 1] : 0; @@ -857,7 +968,7 @@ static __device__ void RleEncode( rpt_len += __ffs(mask) - 1; break; } - rpt_len += 32; + rpt_len += warp_size; idx_cur++; } } @@ -928,7 +1039,7 @@ static __device__ void RleEncode( * @param[in] flush nonzero if last batch in block * @param[in] t thread id (0..127) */ -static __device__ void PlainBoolEncode(page_enc_state_s* s, +static __device__ void PlainBoolEncode(rle_page_enc_state_s* s, uint32_t numvals, uint32_t flush, uint32_t t) @@ -938,7 +1049,7 @@ static __device__ void PlainBoolEncode(page_enc_state_s* s, while (rle_pos < numvals) { uint32_t pos = rle_pos + t; - uint32_t v = (pos < numvals) ? s->vals[pos & (rle_buffer_size - 1)] : 0; + uint32_t v = (pos < numvals) ? s->vals[rolling_idx(pos)] : 0; uint32_t n = min(numvals - rle_pos, 128); uint32_t nbytes = (n + ((flush) ? 7 : 0)) >> 3; if (!nbytes) { break; } @@ -992,28 +1103,22 @@ __device__ auto julian_days_with_time(int64_t v) return std::make_pair(dur_time_of_day_nanos, julian_days); } +// this has been split out into its own kernel because of the amount of shared memory required +// for the state buffer. encode kernels that don't use the RLE buffer can get started while +// the level data is encoded. 
// blockDim(128, 1, 1) template -__global__ void __launch_bounds__(128, 8) - gpuEncodePages(device_span pages, - device_span> comp_in, - device_span> comp_out, - device_span comp_results, - bool write_v2_headers) +__global__ void __launch_bounds__(block_size, 8) gpuEncodePageLevels(device_span pages, + bool write_v2_headers, + encode_kernel_mask kernel_mask) { - __shared__ __align__(8) page_enc_state_s state_g; - using block_reduce = cub::BlockReduce; - using block_scan = cub::BlockScan; - __shared__ union { - typename block_reduce::TempStorage reduce_storage; - typename block_scan::TempStorage scan_storage; - } temp_storage; + __shared__ __align__(8) rle_page_enc_state_s state_g; - page_enc_state_s* const s = &state_g; - auto const t = threadIdx.x; + auto* const s = &state_g; + uint32_t const t = threadIdx.x; if (t == 0) { - state_g = page_enc_state_s{}; + state_g = rle_page_enc_state_s{}; s->page = pages[blockIdx.x]; s->ck = *s->page.chunk; s->col = *s->ck.col_desc; @@ -1026,6 +1131,8 @@ __global__ void __launch_bounds__(128, 8) } __syncthreads(); + if (BitAnd(s->page.kernel_mask, kernel_mask) == 0) { return; } + auto const is_v2 = s->page.page_type == PageType::DATA_PAGE_V2; // Encode Repetition and Definition levels @@ -1078,23 +1185,24 @@ __global__ void __launch_bounds__(128, 8) } while (is_col_struct); return def; }(); - s->vals[(rle_numvals + t) & (rle_buffer_size - 1)] = def_lvl; + s->vals[rolling_idx(rle_numvals + t)] = def_lvl; __syncthreads(); rle_numvals += nrows; RleEncode(s, rle_numvals, def_lvl_bits, (rle_numvals == s->page.num_rows), t); __syncthreads(); } if (t < 32) { - uint8_t* const cur = s->cur; - uint8_t* const rle_out = s->rle_out; - uint32_t const rle_bytes = static_cast(rle_out - cur) - (is_v2 ? 0 : 4); - if (is_v2 && t == 0) { + uint8_t* const cur = s->cur; + uint8_t* const rle_out = s->rle_out; + // V2 does not write the RLE length field + uint32_t const rle_bytes = + static_cast(rle_out - cur) - (is_v2 ? 0 : RLE_LENGTH_FIELD_LEN); + if (not is_v2 && t < RLE_LENGTH_FIELD_LEN) { cur[t] = rle_bytes >> (t * 8); } + __syncwarp(); + if (t == 0) { + s->cur = rle_out; s->page.def_lvl_bytes = rle_bytes; - } else if (not is_v2 && t < 4) { - cur[t] = rle_bytes >> (t * 8); } - __syncwarp(); - if (t == 0) { s->cur = rle_out; } } } } else if (s->page.page_type != PageType::DICTIONARY_PAGE && @@ -1121,29 +1229,121 @@ __global__ void __launch_bounds__(128, 8) uint32_t idx = page_first_val_idx + rle_numvals + t; uint32_t lvl_val = (rle_numvals + t < s->page.num_values && idx < col_last_val_idx) ? lvl_val_data[idx] : 0; - s->vals[(rle_numvals + t) & (rle_buffer_size - 1)] = lvl_val; + s->vals[rolling_idx(rle_numvals + t)] = lvl_val; __syncthreads(); rle_numvals += nvals; RleEncode(s, rle_numvals, nbits, (rle_numvals == s->page.num_values), t); __syncthreads(); } if (t < 32) { - uint8_t* const cur = s->cur; - uint8_t* const rle_out = s->rle_out; - uint32_t const rle_bytes = static_cast(rle_out - cur) - (is_v2 ? 0 : 4); - if (is_v2 && t == 0) { + uint8_t* const cur = s->cur; + uint8_t* const rle_out = s->rle_out; + // V2 does not write the RLE length field + uint32_t const rle_bytes = + static_cast(rle_out - cur) - (is_v2 ? 
0 : RLE_LENGTH_FIELD_LEN); + if (not is_v2 && t < RLE_LENGTH_FIELD_LEN) { cur[t] = rle_bytes >> (t * 8); } + __syncwarp(); + if (t == 0) { + s->cur = rle_out; lvl_bytes = rle_bytes; - } else if (not is_v2 && t < 4) { - cur[t] = rle_bytes >> (t * 8); } - __syncwarp(); - if (t == 0) { s->cur = rle_out; } } }; encode_levels(s->col.rep_values, s->col.num_rep_level_bits(), s->page.rep_lvl_bytes); __syncthreads(); encode_levels(s->col.def_values, s->col.num_def_level_bits(), s->page.def_lvl_bytes); } + + if (t == 0) { pages[blockIdx.x] = s->page; } +} + +template +__device__ void finish_page_encode(state_buf* s, + uint32_t valid_count, + uint8_t const* end_ptr, + device_span pages, + device_span> comp_in, + device_span> comp_out, + device_span comp_results, + bool write_v2_headers) +{ + auto const t = threadIdx.x; + + // V2 does not compress rep and def level data + size_t const skip_comp_size = + write_v2_headers ? s->page.def_lvl_bytes + s->page.rep_lvl_bytes : 0; + + if (t == 0) { + // only need num_nulls for v2 data page headers + if (write_v2_headers) { s->page.num_nulls = s->page.num_values - valid_count; } + uint8_t const* const base = s->page.page_data + s->page.max_hdr_size; + auto const actual_data_size = static_cast(end_ptr - base); + if (actual_data_size > s->page.max_data_size) { + CUDF_UNREACHABLE("detected possible page data corruption"); + } + s->page.max_data_size = actual_data_size; + if (not comp_in.empty()) { + comp_in[blockIdx.x] = {base + skip_comp_size, actual_data_size - skip_comp_size}; + comp_out[blockIdx.x] = {s->page.compressed_data + s->page.max_hdr_size + skip_comp_size, + 0}; // size is unused + } + pages[blockIdx.x] = s->page; + if (not comp_results.empty()) { + comp_results[blockIdx.x] = {0, compression_status::FAILURE}; + pages[blockIdx.x].comp_res = &comp_results[blockIdx.x]; + } + } + + // copy uncompressed bytes over + if (skip_comp_size != 0 && not comp_in.empty()) { + uint8_t* const src = s->page.page_data + s->page.max_hdr_size; + uint8_t* const dst = s->page.compressed_data + s->page.max_hdr_size; + for (int i = t; i < skip_comp_size; i += block_size) { + dst[i] = src[i]; + } + } +} + +// PLAIN page data encoder +// blockDim(128, 1, 1) +template +__global__ void __launch_bounds__(block_size, 8) + gpuEncodePages(device_span pages, + device_span> comp_in, + device_span> comp_out, + device_span comp_results, + bool write_v2_headers) +{ + __shared__ __align__(8) page_enc_state_s<0> state_g; + using block_reduce = cub::BlockReduce; + using block_scan = cub::BlockScan; + __shared__ union { + typename block_reduce::TempStorage reduce_storage; + typename block_scan::TempStorage scan_storage; + } temp_storage; + + auto* const s = &state_g; + uint32_t t = threadIdx.x; + + if (t == 0) { + state_g = page_enc_state_s<0>{}; + s->page = pages[blockIdx.x]; + s->ck = *s->page.chunk; + s->col = *s->ck.col_desc; + s->rle_len_pos = nullptr; + // get s->cur back to where it was at the end of encoding the rep and def level data + s->cur = + s->page.page_data + s->page.max_hdr_size + s->page.def_lvl_bytes + s->page.rep_lvl_bytes; + // if V1 data page, need space for the RLE length fields + if (s->page.page_type == PageType::DATA_PAGE) { + if (s->col.num_def_level_bits() != 0) { s->cur += RLE_LENGTH_FIELD_LEN; } + if (s->col.num_rep_level_bits() != 0) { s->cur += RLE_LENGTH_FIELD_LEN; } + } + } + __syncthreads(); + + if (BitAnd(s->page.kernel_mask, encode_kernel_mask::PLAIN) == 0) { return; } + // Encode data values __syncthreads(); auto const physical_type = 
s->col.physical_type; @@ -1155,10 +1355,6 @@ __global__ void __launch_bounds__(128, 8) return dtype_len_out; }(); - auto const dict_bits = (physical_type == BOOLEAN) ? 1 - : (s->ck.use_dictionary and s->page.page_type != PageType::DICTIONARY_PAGE) - ? s->ck.dict_rle_bits - : -1; if (t == 0) { uint8_t* dst = s->cur; s->rle_run = 0; @@ -1167,219 +1363,315 @@ __global__ void __launch_bounds__(128, 8) s->rle_out = dst; s->page.encoding = determine_encoding(s->page.page_type, physical_type, s->ck.use_dictionary, write_v2_headers); - if (dict_bits >= 0 && physical_type != BOOLEAN) { - dst[0] = dict_bits; - s->rle_out = dst + 1; - } else if (is_v2 && physical_type == BOOLEAN) { - // save space for RLE length. we don't know the total length yet. - s->rle_out = dst + RLE_LENGTH_FIELD_LEN; - s->rle_len_pos = dst; - } s->page_start_val = row_to_value_idx(s->page.start_row, s->col); s->chunk_start_val = row_to_value_idx(s->ck.start_row, s->col); } __syncthreads(); + uint32_t num_valid = 0; for (uint32_t cur_val_idx = 0; cur_val_idx < s->page.num_leaf_values;) { - uint32_t nvals = min(s->page.num_leaf_values - cur_val_idx, 128); + uint32_t nvals = min(s->page.num_leaf_values - cur_val_idx, block_size); uint32_t len, pos; auto [is_valid, val_idx] = [&]() { uint32_t val_idx; uint32_t is_valid; - size_type val_idx_in_block = cur_val_idx + t; + size_type const val_idx_in_block = cur_val_idx + t; if (s->page.page_type == PageType::DICTIONARY_PAGE) { val_idx = val_idx_in_block; is_valid = (val_idx < s->page.num_leaf_values); if (is_valid) { val_idx = s->ck.dict_data[val_idx]; } } else { - size_type val_idx_in_leaf_col = s->page_start_val + val_idx_in_block; + size_type const val_idx_in_leaf_col = s->page_start_val + val_idx_in_block; is_valid = (val_idx_in_leaf_col < s->col.leaf_column->size() && val_idx_in_block < s->page.num_leaf_values) ? s->col.leaf_column->is_valid(val_idx_in_leaf_col) : 0; - val_idx = - (s->ck.use_dictionary) ? 
val_idx_in_leaf_col - s->chunk_start_val : val_idx_in_leaf_col; + val_idx = val_idx_in_leaf_col; } return std::make_tuple(is_valid, val_idx); }(); - if (is_valid) num_valid++; - + if (is_valid) { num_valid++; } cur_val_idx += nvals; - if (dict_bits >= 0) { - // Dictionary encoding - if (dict_bits > 0) { - uint32_t rle_numvals; - uint32_t rle_numvals_in_block; - block_scan(temp_storage.scan_storage).ExclusiveSum(is_valid, pos, rle_numvals_in_block); - rle_numvals = s->rle_numvals; - if (is_valid) { - uint32_t v; - if (physical_type == BOOLEAN) { - v = s->col.leaf_column->element(val_idx); - } else { - v = s->ck.dict_index[val_idx]; - } - s->vals[(rle_numvals + pos) & (rle_buffer_size - 1)] = v; - } - rle_numvals += rle_numvals_in_block; - __syncthreads(); - if (!is_v2 && physical_type == BOOLEAN) { - PlainBoolEncode(s, rle_numvals, (cur_val_idx == s->page.num_leaf_values), t); - } else { - RleEncode(s, rle_numvals, dict_bits, (cur_val_idx == s->page.num_leaf_values), t); + + // Non-dictionary encoding + uint8_t* dst = s->cur; + + if (is_valid) { + len = dtype_len_out; + if (physical_type == BYTE_ARRAY) { + if (type_id == type_id::STRING) { + len += s->col.leaf_column->element(val_idx).size_bytes(); + } else if (s->col.output_as_byte_array && type_id == type_id::LIST) { + len += + get_element(*s->col.leaf_column, val_idx).size_bytes(); } - __syncthreads(); } - if (t == 0) { s->cur = s->rle_out; } - __syncthreads(); } else { - // Non-dictionary encoding - uint8_t* dst = s->cur; - - if (is_valid) { - len = dtype_len_out; - if (physical_type == BYTE_ARRAY) { - if (type_id == type_id::STRING) { - len += s->col.leaf_column->element(val_idx).size_bytes(); - } else if (s->col.output_as_byte_array && type_id == type_id::LIST) { - len += - get_element(*s->col.leaf_column, val_idx).size_bytes(); + len = 0; + } + uint32_t total_len = 0; + block_scan(temp_storage.scan_storage).ExclusiveSum(len, pos, total_len); + __syncthreads(); + if (t == 0) { s->cur = dst + total_len; } + if (is_valid) { + switch (physical_type) { + case INT32: [[fallthrough]]; + case FLOAT: { + auto const v = [dtype_len = dtype_len_in, + idx = val_idx, + col = s->col.leaf_column, + scale = s->col.ts_scale == 0 ? 
1 : s->col.ts_scale]() -> int32_t { + switch (dtype_len) { + case 8: return col->element(idx) * scale; + case 4: return col->element(idx) * scale; + case 2: return col->element(idx) * scale; + default: return col->element(idx) * scale; + } + }(); + + dst[pos + 0] = v; + dst[pos + 1] = v >> 8; + dst[pos + 2] = v >> 16; + dst[pos + 3] = v >> 24; + } break; + case INT64: { + int64_t v = s->col.leaf_column->element(val_idx); + int32_t ts_scale = s->col.ts_scale; + if (ts_scale != 0) { + if (ts_scale < 0) { + v /= -ts_scale; + } else { + v *= ts_scale; + } + } + dst[pos + 0] = v; + dst[pos + 1] = v >> 8; + dst[pos + 2] = v >> 16; + dst[pos + 3] = v >> 24; + dst[pos + 4] = v >> 32; + dst[pos + 5] = v >> 40; + dst[pos + 6] = v >> 48; + dst[pos + 7] = v >> 56; + } break; + case INT96: { + int64_t v = s->col.leaf_column->element(val_idx); + int32_t ts_scale = s->col.ts_scale; + if (ts_scale != 0) { + if (ts_scale < 0) { + v /= -ts_scale; + } else { + v *= ts_scale; + } } - } - } else { - len = 0; - } - uint32_t total_len = 0; - block_scan(temp_storage.scan_storage).ExclusiveSum(len, pos, total_len); - __syncthreads(); - if (t == 0) { s->cur = dst + total_len; } - if (is_valid) { - switch (physical_type) { - case INT32: [[fallthrough]]; - case FLOAT: { - auto const v = [dtype_len = dtype_len_in, - idx = val_idx, - col = s->col.leaf_column, - scale = s->col.ts_scale == 0 ? 1 : s->col.ts_scale]() -> int32_t { - switch (dtype_len) { - case 8: return col->element(idx) * scale; - case 4: return col->element(idx) * scale; - case 2: return col->element(idx) * scale; - default: return col->element(idx) * scale; - } - }(); - dst[pos + 0] = v; - dst[pos + 1] = v >> 8; - dst[pos + 2] = v >> 16; - dst[pos + 3] = v >> 24; - } break; - case INT64: { - int64_t v = s->col.leaf_column->element(val_idx); - int32_t ts_scale = s->col.ts_scale; - if (ts_scale != 0) { - if (ts_scale < 0) { - v /= -ts_scale; - } else { - v *= ts_scale; - } + auto const [last_day_nanos, julian_days] = [&] { + using namespace cuda::std::chrono; + switch (s->col.leaf_column->type().id()) { + case type_id::TIMESTAMP_SECONDS: + case type_id::TIMESTAMP_MILLISECONDS: { + return julian_days_with_time(v); + } break; + case type_id::TIMESTAMP_MICROSECONDS: + case type_id::TIMESTAMP_NANOSECONDS: { + return julian_days_with_time(v); + } break; } - dst[pos + 0] = v; - dst[pos + 1] = v >> 8; - dst[pos + 2] = v >> 16; - dst[pos + 3] = v >> 24; - dst[pos + 4] = v >> 32; - dst[pos + 5] = v >> 40; - dst[pos + 6] = v >> 48; - dst[pos + 7] = v >> 56; - } break; - case INT96: { - int64_t v = s->col.leaf_column->element(val_idx); - int32_t ts_scale = s->col.ts_scale; - if (ts_scale != 0) { - if (ts_scale < 0) { - v /= -ts_scale; - } else { - v *= ts_scale; - } + return julian_days_with_time(0); + }(); + + // the 12 bytes of fixed length data. 
+ v = last_day_nanos.count(); + dst[pos + 0] = v; + dst[pos + 1] = v >> 8; + dst[pos + 2] = v >> 16; + dst[pos + 3] = v >> 24; + dst[pos + 4] = v >> 32; + dst[pos + 5] = v >> 40; + dst[pos + 6] = v >> 48; + dst[pos + 7] = v >> 56; + uint32_t w = julian_days.count(); + dst[pos + 8] = w; + dst[pos + 9] = w >> 8; + dst[pos + 10] = w >> 16; + dst[pos + 11] = w >> 24; + } break; + + case DOUBLE: { + auto v = s->col.leaf_column->element(val_idx); + memcpy(dst + pos, &v, 8); + } break; + case BYTE_ARRAY: { + auto const bytes = [](cudf::type_id const type_id, + column_device_view const* leaf_column, + uint32_t const val_idx) -> void const* { + switch (type_id) { + case type_id::STRING: + return reinterpret_cast( + leaf_column->element(val_idx).data()); + case type_id::LIST: + return reinterpret_cast( + get_element(*(leaf_column), val_idx).data()); + default: CUDF_UNREACHABLE("invalid type id for byte array writing!"); } + }(type_id, s->col.leaf_column, val_idx); + uint32_t v = len - 4; // string length + dst[pos + 0] = v; + dst[pos + 1] = v >> 8; + dst[pos + 2] = v >> 16; + dst[pos + 3] = v >> 24; + if (v != 0) memcpy(dst + pos + 4, bytes, v); + } break; + case FIXED_LEN_BYTE_ARRAY: { + if (type_id == type_id::DECIMAL128) { + // When using FIXED_LEN_BYTE_ARRAY for decimals, the rep is encoded in big-endian + auto const v = s->col.leaf_column->element(val_idx).value(); + auto const v_char_ptr = reinterpret_cast(&v); + thrust::copy(thrust::seq, + thrust::make_reverse_iterator(v_char_ptr + sizeof(v)), + thrust::make_reverse_iterator(v_char_ptr), + dst + pos); + } + } break; + } + } + __syncthreads(); + } - auto const [last_day_nanos, julian_days] = [&] { - using namespace cuda::std::chrono; - switch (s->col.leaf_column->type().id()) { - case type_id::TIMESTAMP_SECONDS: - case type_id::TIMESTAMP_MILLISECONDS: { - return julian_days_with_time(v); - } break; - case type_id::TIMESTAMP_MICROSECONDS: - case type_id::TIMESTAMP_NANOSECONDS: { - return julian_days_with_time(v); - } break; - } - return julian_days_with_time(0); - }(); - - // the 12 bytes of fixed length data. 
- v = last_day_nanos.count(); - dst[pos + 0] = v; - dst[pos + 1] = v >> 8; - dst[pos + 2] = v >> 16; - dst[pos + 3] = v >> 24; - dst[pos + 4] = v >> 32; - dst[pos + 5] = v >> 40; - dst[pos + 6] = v >> 48; - dst[pos + 7] = v >> 56; - uint32_t w = julian_days.count(); - dst[pos + 8] = w; - dst[pos + 9] = w >> 8; - dst[pos + 10] = w >> 16; - dst[pos + 11] = w >> 24; - } break; + uint32_t const valid_count = block_reduce(temp_storage.reduce_storage).Sum(num_valid); - case DOUBLE: { - auto v = s->col.leaf_column->element(val_idx); - memcpy(dst + pos, &v, 8); - } break; - case BYTE_ARRAY: { - auto const bytes = [](cudf::type_id const type_id, - column_device_view const* leaf_column, - uint32_t const val_idx) -> void const* { - switch (type_id) { - case type_id::STRING: - return reinterpret_cast( - leaf_column->element(val_idx).data()); - case type_id::LIST: - return reinterpret_cast( - get_element(*(leaf_column), val_idx).data()); - default: CUDF_UNREACHABLE("invalid type id for byte array writing!"); - } - }(type_id, s->col.leaf_column, val_idx); - uint32_t v = len - 4; // string length - dst[pos + 0] = v; - dst[pos + 1] = v >> 8; - dst[pos + 2] = v >> 16; - dst[pos + 3] = v >> 24; - if (v != 0) memcpy(dst + pos + 4, bytes, v); - } break; - case FIXED_LEN_BYTE_ARRAY: { - if (type_id == type_id::DECIMAL128) { - // When using FIXED_LEN_BYTE_ARRAY for decimals, the rep is encoded in big-endian - auto const v = s->col.leaf_column->element(val_idx).value(); - auto const v_char_ptr = reinterpret_cast(&v); - thrust::copy(thrust::seq, - thrust::make_reverse_iterator(v_char_ptr + sizeof(v)), - thrust::make_reverse_iterator(v_char_ptr), - dst + pos); - } - } break; + finish_page_encode( + s, valid_count, s->cur, pages, comp_in, comp_out, comp_results, write_v2_headers); +} + +// DICTIONARY page data encoder +// blockDim(128, 1, 1) +template +__global__ void __launch_bounds__(block_size, 8) + gpuEncodeDictPages(device_span pages, + device_span> comp_in, + device_span> comp_out, + device_span comp_results, + bool write_v2_headers) +{ + __shared__ __align__(8) rle_page_enc_state_s state_g; + using block_reduce = cub::BlockReduce; + using block_scan = cub::BlockScan; + __shared__ union { + typename block_reduce::TempStorage reduce_storage; + typename block_scan::TempStorage scan_storage; + } temp_storage; + + auto* const s = &state_g; + uint32_t t = threadIdx.x; + + if (t == 0) { + state_g = rle_page_enc_state_s{}; + s->page = pages[blockIdx.x]; + s->ck = *s->page.chunk; + s->col = *s->ck.col_desc; + s->rle_len_pos = nullptr; + // get s->cur back to where it was at the end of encoding the rep and def level data + s->cur = + s->page.page_data + s->page.max_hdr_size + s->page.def_lvl_bytes + s->page.rep_lvl_bytes; + // if V1 data page, need space for the RLE length fields + if (s->page.page_type == PageType::DATA_PAGE) { + if (s->col.num_def_level_bits() != 0) { s->cur += RLE_LENGTH_FIELD_LEN; } + if (s->col.num_rep_level_bits() != 0) { s->cur += RLE_LENGTH_FIELD_LEN; } + } + } + __syncthreads(); + + if (BitAnd(s->page.kernel_mask, encode_kernel_mask::DICTIONARY) == 0) { return; } + + // Encode data values + __syncthreads(); + auto const physical_type = s->col.physical_type; + auto const type_id = s->col.leaf_column->type().id(); + auto const dtype_len_out = physical_type_len(physical_type, type_id); + auto const dtype_len_in = [&]() -> uint32_t { + if (physical_type == INT32) { return int32_logical_len(type_id); } + if (physical_type == INT96) { return sizeof(int64_t); } + return dtype_len_out; + }(); + + // 
TODO assert dict_bits >= 0 + auto const dict_bits = (physical_type == BOOLEAN) ? 1 + : (s->ck.use_dictionary and s->page.page_type != PageType::DICTIONARY_PAGE) + ? s->ck.dict_rle_bits + : -1; + if (t == 0) { + uint8_t* dst = s->cur; + s->rle_run = 0; + s->rle_pos = 0; + s->rle_numvals = 0; + s->rle_out = dst; + s->page.encoding = + determine_encoding(s->page.page_type, physical_type, s->ck.use_dictionary, write_v2_headers); + if (dict_bits >= 0 && physical_type != BOOLEAN) { + dst[0] = dict_bits; + s->rle_out = dst + 1; + } else if (write_v2_headers && physical_type == BOOLEAN) { + // save space for RLE length. we don't know the total length yet. + s->rle_out = dst + RLE_LENGTH_FIELD_LEN; + s->rle_len_pos = dst; + } + s->page_start_val = row_to_value_idx(s->page.start_row, s->col); + s->chunk_start_val = row_to_value_idx(s->ck.start_row, s->col); + } + __syncthreads(); + + uint32_t num_valid = 0; + for (uint32_t cur_val_idx = 0; cur_val_idx < s->page.num_leaf_values;) { + uint32_t nvals = min(s->page.num_leaf_values - cur_val_idx, block_size); + + auto [is_valid, val_idx] = [&]() { + size_type const val_idx_in_block = cur_val_idx + t; + size_type const val_idx_in_leaf_col = s->page_start_val + val_idx_in_block; + + uint32_t const is_valid = (val_idx_in_leaf_col < s->col.leaf_column->size() && + val_idx_in_block < s->page.num_leaf_values) + ? s->col.leaf_column->is_valid(val_idx_in_leaf_col) + : 0; + // need to test for use_dictionary because it might be boolean + uint32_t const val_idx = + (s->ck.use_dictionary) ? val_idx_in_leaf_col - s->chunk_start_val : val_idx_in_leaf_col; + return std::make_tuple(is_valid, val_idx); + }(); + + if (is_valid) { num_valid++; } + cur_val_idx += nvals; + + // Dictionary encoding + if (dict_bits > 0) { + uint32_t rle_numvals; + uint32_t rle_numvals_in_block; + uint32_t pos; + block_scan(temp_storage.scan_storage).ExclusiveSum(is_valid, pos, rle_numvals_in_block); + rle_numvals = s->rle_numvals; + if (is_valid) { + uint32_t v; + if (physical_type == BOOLEAN) { + v = s->col.leaf_column->element(val_idx); + } else { + v = s->ck.dict_index[val_idx]; } + s->vals[rolling_idx(rle_numvals + pos)] = v; + } + rle_numvals += rle_numvals_in_block; + __syncthreads(); + if ((!write_v2_headers) && (physical_type == BOOLEAN)) { + PlainBoolEncode(s, rle_numvals, (cur_val_idx == s->page.num_leaf_values), t); + } else { + RleEncode(s, rle_numvals, dict_bits, (cur_val_idx == s->page.num_leaf_values), t); } __syncthreads(); } + if (t == 0) { s->cur = s->rle_out; } + __syncthreads(); } uint32_t const valid_count = block_reduce(temp_storage.reduce_storage).Sum(num_valid); @@ -1392,37 +1684,137 @@ __global__ void __launch_bounds__(128, 8) __syncwarp(); } - // V2 does not compress rep and def level data - size_t const skip_comp_size = s->page.def_lvl_bytes + s->page.rep_lvl_bytes; + finish_page_encode( + s, valid_count, s->cur, pages, comp_in, comp_out, comp_results, write_v2_headers); +} + +// DELTA_BINARY_PACKED page data encoder +// blockDim(128, 1, 1) +template +__global__ void __launch_bounds__(block_size, 8) + gpuEncodeDeltaBinaryPages(device_span pages, + device_span> comp_in, + device_span> comp_out, + device_span comp_results) +{ + // block of shared memory for value storage and bit packing + __shared__ uleb128_t delta_shared[delta::buffer_size + delta::block_size]; + __shared__ __align__(8) page_enc_state_s<0> state_g; + using block_reduce = cub::BlockReduce; + __shared__ union { + typename block_reduce::TempStorage reduce_storage; + typename 
delta::index_scan::TempStorage delta_index_tmp; + typename delta::block_reduce::TempStorage delta_reduce_tmp; + typename delta::warp_reduce::TempStorage delta_warp_red_tmp[delta::num_mini_blocks]; + } temp_storage; + + auto* const s = &state_g; + uint32_t t = threadIdx.x; if (t == 0) { - s->page.num_nulls = s->page.num_values - valid_count; - uint8_t* const base = s->page.page_data + s->page.max_hdr_size; - auto const actual_data_size = static_cast(s->cur - base); - if (actual_data_size > s->page.max_data_size) { - CUDF_UNREACHABLE("detected possible page data corruption"); - } - s->page.max_data_size = actual_data_size; - if (not comp_in.empty()) { - comp_in[blockIdx.x] = {base + skip_comp_size, actual_data_size - skip_comp_size}; - comp_out[blockIdx.x] = {s->page.compressed_data + s->page.max_hdr_size + skip_comp_size, - 0}; // size is unused - } - pages[blockIdx.x] = s->page; - if (not comp_results.empty()) { - comp_results[blockIdx.x] = {0, compression_status::FAILURE}; - pages[blockIdx.x].comp_res = &comp_results[blockIdx.x]; + state_g = page_enc_state_s<0>{}; + s->page = pages[blockIdx.x]; + s->ck = *s->page.chunk; + s->col = *s->ck.col_desc; + s->rle_len_pos = nullptr; + // get s->cur back to where it was at the end of encoding the rep and def level data + s->cur = + s->page.page_data + s->page.max_hdr_size + s->page.def_lvl_bytes + s->page.rep_lvl_bytes; + } + __syncthreads(); + + if (BitAnd(s->page.kernel_mask, encode_kernel_mask::DELTA_BINARY) == 0) { return; } + + // Encode data values + __syncthreads(); + auto const physical_type = s->col.physical_type; + auto const type_id = s->col.leaf_column->type().id(); + auto const dtype_len_out = physical_type_len(physical_type, type_id); + auto const dtype_len_in = [&]() -> uint32_t { + if (physical_type == INT32) { return int32_logical_len(type_id); } + if (physical_type == INT96) { return sizeof(int64_t); } + return dtype_len_out; + }(); + + if (t == 0) { + uint8_t* dst = s->cur; + s->rle_run = 0; + s->rle_pos = 0; + s->rle_numvals = 0; + s->rle_out = dst; + s->page.encoding = Encoding::DELTA_BINARY_PACKED; + s->page_start_val = row_to_value_idx(s->page.start_row, s->col); + s->chunk_start_val = row_to_value_idx(s->ck.start_row, s->col); + } + __syncthreads(); + + // need to know the number of valid values for the null values calculation and to size + // the delta binary encoder. 
+ uint32_t valid_count = 0; + if (not s->col.leaf_column->nullable()) { + valid_count = s->page.num_leaf_values; + } else { + uint32_t num_valid = 0; + for (uint32_t cur_val_idx = 0; cur_val_idx < s->page.num_leaf_values;) { + uint32_t const nvals = min(s->page.num_leaf_values - cur_val_idx, block_size); + size_type const val_idx_in_block = cur_val_idx + t; + size_type const val_idx_in_leaf_col = s->page_start_val + val_idx_in_block; + + if (val_idx_in_leaf_col < s->col.leaf_column->size() && + val_idx_in_block < s->page.num_leaf_values && + s->col.leaf_column->is_valid(val_idx_in_leaf_col)) { + num_valid++; + } + cur_val_idx += nvals; } + valid_count = block_reduce(temp_storage.reduce_storage).Sum(num_valid); } - // copy over uncompressed data - if (skip_comp_size != 0 && not comp_in.empty()) { - uint8_t const* const src = s->page.page_data + s->page.max_hdr_size; - uint8_t* const dst = s->page.compressed_data + s->page.max_hdr_size; - for (int i = t; i < skip_comp_size; i += block_size) { - dst[i] = src[i]; + uint8_t const* delta_ptr = nullptr; // this will be the end of delta block pointer + + if (physical_type == INT32) { + switch (dtype_len_in) { + case 8: { + // only DURATIONS map to 8 bytes, so safe to just use signed here? + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); + break; + } + case 4: { + if (type_id == type_id::UINT32) { + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); + } else { + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); + } + break; + } + case 2: { + if (type_id == type_id::UINT16) { + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); + } else { + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); + } + break; + } + case 1: { + if (type_id == type_id::UINT8) { + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); + } else { + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); + } + break; + } + default: CUDF_UNREACHABLE("invalid dtype_len_in when encoding DELTA_BINARY_PACKED"); + } + } else { + if (type_id == type_id::UINT64) { + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); + } else { + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); } } + + finish_page_encode( + s, valid_count, delta_ptr, pages, comp_in, comp_out, comp_results, true); } constexpr int decide_compression_warps_in_block = 4; @@ -1457,7 +1849,8 @@ __global__ void __launch_bounds__(decide_compression_block_size) for (auto page_id = lane_id; page_id < num_pages; page_id += cudf::detail::warp_size) { auto const& curr_page = ck_g[warp_id].pages[page_id]; auto const page_data_size = curr_page.max_data_size; - auto const lvl_bytes = curr_page.def_lvl_bytes + curr_page.rep_lvl_bytes; + auto const is_v2 = curr_page.page_type == PageType::DATA_PAGE_V2; + auto const lvl_bytes = is_v2 ? curr_page.def_lvl_bytes + curr_page.rep_lvl_bytes : 0; uncompressed_data_size += page_data_size; if (auto comp_res = curr_page.comp_res; comp_res != nullptr) { compressed_data_size += comp_res->bytes_written + lvl_bytes; @@ -1920,7 +2313,8 @@ __global__ void __launch_bounds__(128) } uncompressed_page_size = page_g.max_data_size; if (ck_g.is_compressed) { - auto const lvl_bytes = page_g.def_lvl_bytes + page_g.rep_lvl_bytes; + auto const is_v2 = page_g.page_type == PageType::DATA_PAGE_V2; + auto const lvl_bytes = is_v2 ? 
page_g.def_lvl_bytes + page_g.rep_lvl_bytes : 0; hdr_start = page_g.compressed_data; compressed_page_size = static_cast(comp_results[blockIdx.x].bytes_written) + lvl_bytes; @@ -2155,6 +2549,10 @@ constexpr __device__ void* align8(void* ptr) return static_cast(ptr) - algn; } +struct mask_tform { + __device__ uint32_t operator()(EncPage const& p) { return static_cast(p.kernel_mask); } +}; + } // namespace // blockDim(1, 1, 1) @@ -2257,8 +2655,9 @@ void InitFragmentStatistics(device_span groups, rmm::cuda_stream_view stream) { int const num_fragments = fragments.size(); - int const dim = util::div_rounding_up_safe(num_fragments, 128 / cudf::detail::warp_size); - gpuInitFragmentStats<<>>(groups, fragments); + int const dim = + util::div_rounding_up_safe(num_fragments, encode_block_size / cudf::detail::warp_size); + gpuInitFragmentStats<<>>(groups, fragments); } void InitEncoderPages(device_2dspan chunks, @@ -2277,18 +2676,18 @@ void InitEncoderPages(device_2dspan chunks, { auto num_rowgroups = chunks.size().first; dim3 dim_grid(num_columns, num_rowgroups); // 1 threadblock per rowgroup - gpuInitPages<<>>(chunks, - pages, - page_sizes, - comp_page_sizes, - col_desc, - page_grstats, - chunk_grstats, - num_columns, - max_page_size_bytes, - max_page_size_rows, - page_align, - write_v2_headers); + gpuInitPages<<>>(chunks, + pages, + page_sizes, + comp_page_sizes, + col_desc, + page_grstats, + chunk_grstats, + num_columns, + max_page_size_bytes, + max_page_size_rows, + page_align, + write_v2_headers); } void EncodePages(device_span pages, @@ -2299,10 +2698,43 @@ void EncodePages(device_span pages, rmm::cuda_stream_view stream) { auto num_pages = pages.size(); + + // determine which kernels to invoke + auto mask_iter = thrust::make_transform_iterator(pages.begin(), mask_tform{}); + uint32_t kernel_mask = thrust::reduce( + rmm::exec_policy(stream), mask_iter, mask_iter + pages.size(), 0U, thrust::bit_or{}); + + // get the number of streams we need from the pool + int nkernels = std::bitset<32>(kernel_mask).count(); + auto streams = cudf::detail::fork_streams(stream, nkernels); + // A page is part of one column. This is launching 1 block per page. 1 block will exclusively // deal with one datatype. - gpuEncodePages<128><<>>( - pages, comp_in, comp_out, comp_results, write_v2_headers); + + int s_idx = 0; + if (BitAnd(kernel_mask, encode_kernel_mask::PLAIN) != 0) { + auto const strm = streams[s_idx++]; + gpuEncodePageLevels<<>>( + pages, write_v2_headers, encode_kernel_mask::PLAIN); + gpuEncodePages<<>>( + pages, comp_in, comp_out, comp_results, write_v2_headers); + } + if (BitAnd(kernel_mask, encode_kernel_mask::DELTA_BINARY) != 0) { + auto const strm = streams[s_idx++]; + gpuEncodePageLevels<<>>( + pages, write_v2_headers, encode_kernel_mask::DELTA_BINARY); + gpuEncodeDeltaBinaryPages + <<>>(pages, comp_in, comp_out, comp_results); + } + if (BitAnd(kernel_mask, encode_kernel_mask::DICTIONARY) != 0) { + auto const strm = streams[s_idx++]; + gpuEncodePageLevels<<>>( + pages, write_v2_headers, encode_kernel_mask::DICTIONARY); + gpuEncodeDictPages<<>>( + pages, comp_in, comp_out, comp_results, write_v2_headers); + } + + cudf::detail::join_streams(streams, stream); } void DecideCompression(device_span chunks, rmm::cuda_stream_view stream) @@ -2320,7 +2752,7 @@ void EncodePageHeaders(device_span pages, { // TODO: single thread task. No need for 128 threads/block. 
Earlier it used to employ rest of the // threads to coop load structs - gpuEncodePageHeaders<<>>( + gpuEncodePageHeaders<<>>( pages, comp_results, page_stats, chunk_stats); } diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 6a93fec0c46..048f1a73a9c 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -88,6 +88,37 @@ struct input_column_info { auto nesting_depth() const { return nesting.size(); } }; +// The delta encodings use ULEB128 integers, but parquet only uses max 64 bits. +using uleb128_t = uint64_t; +using zigzag128_t = int64_t; + +// this is in C++23 +#if !defined(__cpp_lib_is_scoped_enum) +template > +struct is_scoped_enum { + static const bool value = not std::is_convertible_v>; +}; + +template +struct is_scoped_enum { + static const bool value = false; +}; +#else +using std::is_scoped_enum; +#endif + +// helpers to do bit operations on scoped enums +template ::value and std::is_same_v) or + (is_scoped_enum::value and std::is_same_v) or + (is_scoped_enum::value and std::is_same_v)>* = + nullptr> +constexpr uint32_t BitAnd(T1 a, T2 b) +{ + return static_cast(a) & static_cast(b); +} + /** * @brief Enums for the flags in the page header */ @@ -371,6 +402,17 @@ constexpr uint32_t encoding_to_mask(Encoding encoding) return 1 << static_cast(encoding); } +/** + * @brief Enum of mask bits for the EncPage kernel_mask + * + * Used to control which encode kernels to run. + */ +enum class encode_kernel_mask { + PLAIN = (1 << 0), // Run plain encoding kernel + DICTIONARY = (1 << 1), // Run dictionary encoding kernel + DELTA_BINARY = (1 << 2) // Run DELTA_BINARY_PACKED encoding kernel +}; + /** * @brief Struct describing an encoder column chunk */ @@ -429,10 +471,11 @@ struct EncPage { uint32_t num_leaf_values; //!< Values in page. Different from num_rows in case of nested types uint32_t num_values; //!< Number of def/rep level values in page. 
Includes null/empty elements in //!< non-leaf levels - uint32_t def_lvl_bytes; //!< Number of bytes of encoded definition level data (V2 only) - uint32_t rep_lvl_bytes; //!< Number of bytes of encoded repetition level data (V2 only) - compression_result* comp_res; //!< Ptr to compression result - uint32_t num_nulls; //!< Number of null values (V2 only) (down here for alignment) + uint32_t def_lvl_bytes; //!< Number of bytes of encoded definition level data (V2 only) + uint32_t rep_lvl_bytes; //!< Number of bytes of encoded repetition level data (V2 only) + compression_result* comp_res; //!< Ptr to compression result + uint32_t num_nulls; //!< Number of null values (V2 only) (down here for alignment) + encode_kernel_mask kernel_mask; //!< Mask used to control which encoding kernels to run }; /** diff --git a/cpp/src/lists/contains.cu b/cpp/src/lists/contains.cu index 4733a5d63a8..cd2bc493bc7 100644 --- a/cpp/src/lists/contains.cu +++ b/cpp/src/lists/contains.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -274,12 +275,13 @@ std::unique_ptr index_of(lists_column_view const& lists, rmm::mr::device_memory_resource* mr) { if (!search_key.is_valid(stream)) { - return make_numeric_column(data_type{cudf::type_to_id()}, - lists.size(), - cudf::create_null_mask(lists.size(), mask_state::ALL_NULL, mr), - lists.size(), - stream, - mr); + return make_numeric_column( + data_type{cudf::type_to_id()}, + lists.size(), + cudf::detail::create_null_mask(lists.size(), mask_state::ALL_NULL, stream, mr), + lists.size(), + stream, + mr); } if (lists.size() == 0) { return make_numeric_column( @@ -337,7 +339,7 @@ std::unique_ptr contains_nulls(lists_column_view const& lists, auto const lists_cv = lists.parent(); auto output = make_numeric_column(data_type{type_to_id()}, lists.size(), - copy_bitmask(lists_cv, stream, mr), + cudf::detail::copy_bitmask(lists_cv, stream, mr), lists_cv.null_count(), stream, mr); diff --git a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu index c0765b48205..00a2f0bee8f 100644 --- a/cpp/src/merge/merge.cu +++ b/cpp/src/merge/merge.cu @@ -381,7 +381,7 @@ std::unique_ptr column_merger::operator()( // materialize the output buffer rmm::device_buffer validity = lcol.has_nulls() || rcol.has_nulls() - ? create_null_mask(merged_size, mask_state::UNINITIALIZED, stream, mr) + ? 
detail::create_null_mask(merged_size, mask_state::UNINITIALIZED, stream, mr) : rmm::device_buffer{}; if (lcol.has_nulls() || rcol.has_nulls()) { materialize_bitmask(lcol, diff --git a/cpp/src/round/round.cu b/cpp/src/round/round.cu index 41cce57d55b..8a6367a1f87 100644 --- a/cpp/src/round/round.cu +++ b/cpp/src/round/round.cu @@ -219,8 +219,12 @@ std::unique_ptr round_with(column_view const& input, if (decimal_places >= 0 && std::is_integral_v) return std::make_unique(input, stream, mr); - auto result = cudf::make_fixed_width_column( - input.type(), input.size(), copy_bitmask(input, stream, mr), input.null_count(), stream, mr); + auto result = cudf::make_fixed_width_column(input.type(), + input.size(), + detail::copy_bitmask(input, stream, mr), + input.null_count(), + stream, + mr); auto out_view = result->mutable_view(); T const n = std::pow(10, std::abs(decimal_places)); @@ -256,8 +260,12 @@ std::unique_ptr round_with(column_view const& input, if (input.type().scale() > -decimal_places) return cudf::detail::cast(input, result_type, stream, mr); - auto result = cudf::make_fixed_width_column( - result_type, input.size(), copy_bitmask(input, stream, mr), input.null_count(), stream, mr); + auto result = cudf::make_fixed_width_column(result_type, + input.size(), + detail::copy_bitmask(input, stream, mr), + input.null_count(), + stream, + mr); auto out_view = result->mutable_view(); diff --git a/cpp/src/search/contains_column.cu b/cpp/src/search/contains_column.cu index 85971647434..b8c7d058535 100644 --- a/cpp/src/search/contains_column.cu +++ b/cpp/src/search/contains_column.cu @@ -42,7 +42,7 @@ struct contains_column_dispatch { stream, mr); return std::make_unique( - std::move(result_v), copy_bitmask(needles, stream, mr), needles.null_count()); + std::move(result_v), detail::copy_bitmask(needles, stream, mr), needles.null_count()); } }; diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu index ee47932100a..f80ace57c69 100644 --- a/cpp/src/strings/replace/multi.cu +++ b/cpp/src/strings/replace/multi.cu @@ -383,7 +383,7 @@ std::unique_ptr replace_character_parallel(strings_column_view const& in std::move(offsets), std::move(chars->release().children.back()), input.null_count(), - copy_bitmask(input.parent(), stream, mr)); + cudf::detail::copy_bitmask(input.parent(), stream, mr)); } /** diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu index 913aec79758..045aac279e6 100644 --- a/cpp/src/strings/split/split_re.cu +++ b/cpp/src/strings/split/split_re.cu @@ -290,7 +290,7 @@ std::unique_ptr split_record_re(strings_column_view const& input, std::move(offsets), std::move(strings_output), input.null_count(), - copy_bitmask(input.parent(), stream, mr), + cudf::detail::copy_bitmask(input.parent(), stream, mr), stream, mr); } diff --git a/cpp/src/strings/split/split_record.cu b/cpp/src/strings/split/split_record.cu index 52f27c68111..7a0cfb9ef41 100644 --- a/cpp/src/strings/split/split_record.cu +++ b/cpp/src/strings/split/split_record.cu @@ -57,7 +57,7 @@ std::unique_ptr split_record_fn(strings_column_view const& input, std::move(offsets), std::move(results), input.null_count(), - copy_bitmask(input.parent(), stream, mr), + cudf::detail::copy_bitmask(input.parent(), stream, mr), stream, mr); } @@ -72,7 +72,7 @@ std::unique_ptr split_record_fn(strings_column_view const& input, std::move(offsets), std::move(strings_child), input.null_count(), - copy_bitmask(input.parent(), stream, mr), + cudf::detail::copy_bitmask(input.parent(), stream, mr), 
stream, mr); } @@ -160,7 +160,7 @@ std::unique_ptr whitespace_split_record_fn(strings_column_view const& in std::move(offsets), std::move(strings_output), input.null_count(), - copy_bitmask(input.parent(), stream, mr), + cudf::detail::copy_bitmask(input.parent(), stream, mr), stream, mr); } diff --git a/cpp/src/unary/cast_ops.cu b/cpp/src/unary/cast_ops.cu index 1c81f266200..6fa87b1f709 100644 --- a/cpp/src/unary/cast_ops.cu +++ b/cpp/src/unary/cast_ops.cu @@ -194,7 +194,7 @@ std::unique_ptr rescale(column_view input, auto const scalar = make_fixed_point_scalar(0, scale_type{scale}, stream); auto output_column = make_column_from_scalar(*scalar, input.size(), stream, mr); if (input.nullable()) { - auto const null_mask = copy_bitmask(input, stream, mr); + auto const null_mask = detail::copy_bitmask(input, stream, mr); output_column->set_null_mask(std::move(null_mask), input.null_count()); } return output_column; @@ -255,7 +255,7 @@ struct dispatch_unary_cast_to { std::make_unique(type, size, rmm::device_buffer{size * cudf::size_of(type), stream, mr}, - copy_bitmask(input, stream, mr), + detail::copy_bitmask(input, stream, mr), input.null_count()); mutable_column_view output_mutable = *output; @@ -285,7 +285,7 @@ struct dispatch_unary_cast_to { std::make_unique(type, size, rmm::device_buffer{size * cudf::size_of(type), stream, mr}, - copy_bitmask(input, stream, mr), + detail::copy_bitmask(input, stream, mr), input.null_count()); mutable_column_view output_mutable = *output; @@ -334,7 +334,7 @@ struct dispatch_unary_cast_to { auto output = std::make_unique(cudf::data_type{type.id(), input.type().scale()}, size, rmm::device_buffer{size * cudf::size_of(type), stream}, - copy_bitmask(input, stream, mr), + detail::copy_bitmask(input, stream, mr), input.null_count()); mutable_column_view output_mutable = *output; diff --git a/cpp/src/unary/math_ops.cu b/cpp/src/unary/math_ops.cu index d0cae81a9c8..d84e0171b49 100644 --- a/cpp/src/unary/math_ops.cu +++ b/cpp/src/unary/math_ops.cu @@ -291,8 +291,12 @@ std::unique_ptr unary_op_with(column_view const& input, std::is_same_v>)) return std::make_unique(input, stream, mr); - auto result = cudf::make_fixed_width_column( - input.type(), input.size(), copy_bitmask(input, stream, mr), input.null_count(), stream, mr); + auto result = cudf::make_fixed_width_column(input.type(), + input.size(), + detail::copy_bitmask(input, stream, mr), + input.null_count(), + stream, + mr); auto out_view = result->mutable_view(); diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 3e30db7abcb..16e7239ebd8 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -629,6 +629,7 @@ ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_GROUPBY_TEST streams/groupby_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_HASHING_TEST streams/hash_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_INTEROP_TEST streams/interop_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_NULL_MASK_TEST streams/null_mask_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_DICTIONARY_TEST streams/dictionary_test.cpp STREAM_MODE testing) diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 2ddb0b76544..0149a467c32 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -1957,11 +1957,11 @@ TEST_F(JsonReaderTest, 
JSONLinesRecovering) // 2 -> (invalid) R"({"b":{"a":[321})" "\n" - // 3 -> c: [1] (valid) + // 3 -> c: 1.2 (valid) R"({"c":1.2})" "\n" "\n" - // 4 -> a: 123 (valid) + // 4 -> a: 4 (valid) R"({"a":4})" "\n" // 5 -> (invalid) @@ -2020,4 +2020,71 @@ TEST_F(JsonReaderTest, JSONLinesRecovering) c_validity.cbegin()}); } +TEST_F(JsonReaderTest, JSONLinesRecoveringIgnoreExcessChars) +{ + /** + * @brief Spark has the specific need to ignore extra characters that come after the first record + * on a JSON line + */ + std::string data = + // 0 -> a: -2 (valid) + R"({"a":-2}{})" + "\n" + // 1 -> (invalid) + R"({"b":{}should_be_invalid})" + "\n" + // 2 -> b (valid) + R"({"b":{"a":3} })" + "\n" + // 3 -> c: (valid) + R"({"c":1.2 } )" + "\n" + "\n" + // 4 -> (valid) + R"({"a":4} 123)" + "\n" + // 5 -> (valid) + R"({"a":5}//Comment after record)" + "\n" + // 6 -> (valid) + R"({"a":6} //Comment after whitespace)" + "\n" + // 7 -> (invalid) + R"({"a":5 //Invalid Comment within record})"; + + auto filepath = temp_env->get_temp_dir() + "RecoveringLinesExcessChars.json"; + { + std::ofstream outfile(filepath, std::ofstream::out); + outfile << data; + } + + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}) + .lines(true) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL); + + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + + EXPECT_EQ(result.tbl->num_columns(), 3); + EXPECT_EQ(result.tbl->num_rows(), 8); + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT64); + EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::STRUCT); + EXPECT_EQ(result.tbl->get_column(2).type().id(), cudf::type_id::FLOAT64); + + std::vector a_validity{true, false, false, false, true, true, true, false}; + std::vector b_validity{false, false, true, false, false, false, false, false}; + std::vector c_validity{false, false, false, true, false, false, false, false}; + + // Child column b->a + auto b_a_col = int64_wrapper({0, 0, 3, 0, 0, 0, 0, 0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), + int64_wrapper{{-2, 0, 0, 0, 4, 5, 6, 0}, a_validity.cbegin()}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + result.tbl->get_column(1), cudf::test::structs_column_wrapper({b_a_col}, b_validity.cbegin())); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + result.tbl->get_column(2), + float64_wrapper{{0.0, 0.0, 0.0, 1.2, 0.0, 0.0, 0.0, 0.0}, c_validity.cbegin()}); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/nested_json_test.cpp index 3cb7e1f287a..5f79d5b862b 100644 --- a/cpp/tests/io/nested_json_test.cpp +++ b/cpp/tests/io/nested_json_test.cpp @@ -543,7 +543,7 @@ TEST_F(JsonTest, RecoveringTokenStream) { // Test input. Inline comments used to indicate character indexes // 012345678 <= line 0 - std::string const input = R"({"a":-2},)" + std::string const input = R"({"a":2 {})" // 9 "\n" // 01234 <= line 1 diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index fa85e3a4a1d..2a654bd7e8c 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -353,6 +353,9 @@ struct ParquetWriterSchemaTest : public ParquetWriterTest { template struct ParquetReaderSourceTest : public ParquetReaderTest {}; +template +struct ParquetWriterDeltaTest : public ParquetWriterTest {}; + // Declare typed test cases // TODO: Replace with `NumericTypes` when unsigned support is added. 
Issue #5352 using SupportedTypes = cudf::test::Types; @@ -384,7 +387,6 @@ TYPED_TEST_SUITE(ParquetChunkedWriterNumericTypeTest, SupportedTypes); class ParquetSizedTest : public ::cudf::test::BaseFixtureWithParam {}; // test the allowed bit widths for dictionary encoding -// values chosen to trigger 1, 2, 3, 4, 5, 6, 8, 10, 12, 16, 20, and 24 bit dictionaries INSTANTIATE_TEST_SUITE_P(ParquetDictionaryTest, ParquetSizedTest, testing::Range(1, 25), @@ -6698,7 +6700,7 @@ TEST_P(ParquetV2Test, CheckEncodings) // data should be PLAIN for v1, RLE for V2 auto col0_data = cudf::detail::make_counting_transform_iterator(0, [](auto i) -> bool { return i % 2 == 0; }); - // data should be PLAIN for both + // data should be PLAIN for v1, DELTA_BINARY_PACKED for v2 auto col1_data = random_values(num_rows); // data should be PLAIN_DICTIONARY for v1, PLAIN and RLE_DICTIONARY for v2 auto col2_data = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return 1; }); @@ -6733,10 +6735,10 @@ TEST_P(ParquetV2Test, CheckEncodings) // col0 should have RLE for rep/def and data EXPECT_TRUE(chunk0_enc.size() == 1); EXPECT_TRUE(contains(chunk0_enc, Encoding::RLE)); - // col1 should have RLE for rep/def and PLAIN for data + // col1 should have RLE for rep/def and DELTA_BINARY_PACKED for data EXPECT_TRUE(chunk1_enc.size() == 2); EXPECT_TRUE(contains(chunk1_enc, Encoding::RLE)); - EXPECT_TRUE(contains(chunk1_enc, Encoding::PLAIN)); + EXPECT_TRUE(contains(chunk1_enc, Encoding::DELTA_BINARY_PACKED)); // col2 should have RLE for rep/def, PLAIN for dict, and RLE_DICTIONARY for data EXPECT_TRUE(chunk2_enc.size() == 3); EXPECT_TRUE(contains(chunk2_enc, Encoding::RLE)); @@ -6758,6 +6760,104 @@ TEST_P(ParquetV2Test, CheckEncodings) } } +// removing duration_D, duration_s, and timestamp_s as they don't appear to be supported properly. +// see definition of UnsupportedChronoTypes above. 
+using DeltaDecimalTypes = cudf::test::Types; +using DeltaBinaryTypes = + cudf::test::Concat; +using SupportedDeltaTestTypes = + cudf::test::RemoveIf, DeltaBinaryTypes>; +TYPED_TEST_SUITE(ParquetWriterDeltaTest, SupportedDeltaTestTypes); + +TYPED_TEST(ParquetWriterDeltaTest, SupportedDeltaTestTypes) +{ + using T = TypeParam; + auto col0 = testdata::ascending(); + auto col1 = testdata::unordered(); + + auto const expected = table_view{{col0, col1}}; + + auto const filepath = temp_env->get_temp_filepath("DeltaBinaryPacked.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .write_v2_headers(true) + .dictionary_policy(cudf::io::dictionary_policy::NEVER); + cudf::io::write_parquet(out_opts); + + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(in_opts); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); +} + +TYPED_TEST(ParquetWriterDeltaTest, SupportedDeltaTestTypesSliced) +{ + using T = TypeParam; + constexpr int num_rows = 4'000; + auto col0 = testdata::ascending(); + auto col1 = testdata::unordered(); + + auto const expected = table_view{{col0, col1}}; + auto expected_slice = cudf::slice(expected, {num_rows, 2 * num_rows}); + ASSERT_EQ(expected_slice[0].num_rows(), num_rows); + + auto const filepath = temp_env->get_temp_filepath("DeltaBinaryPackedSliced.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected_slice) + .write_v2_headers(true) + .dictionary_policy(cudf::io::dictionary_policy::NEVER); + cudf::io::write_parquet(out_opts); + + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(in_opts); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_slice, result.tbl->view()); +} + +TYPED_TEST(ParquetWriterDeltaTest, SupportedDeltaListSliced) +{ + using T = TypeParam; + + constexpr int num_slice = 4'000; + constexpr int num_rows = 32 * 1024; + + std::mt19937 gen(6542); + std::bernoulli_distribution bn(0.7f); + auto valids = + cudf::detail::make_counting_transform_iterator(0, [&](int index) { return bn(gen); }); + auto values = thrust::make_counting_iterator(0); + + // list + constexpr int vals_per_row = 4; + auto c1_offset_iter = cudf::detail::make_counting_transform_iterator( + 0, [vals_per_row](cudf::size_type idx) { return idx * vals_per_row; }); + cudf::test::fixed_width_column_wrapper c1_offsets(c1_offset_iter, + c1_offset_iter + num_rows + 1); + cudf::test::fixed_width_column_wrapper c1_vals( + values, values + (num_rows * vals_per_row), valids); + auto [null_mask, null_count] = cudf::test::detail::make_null_mask(valids, valids + num_rows); + + auto _c1 = cudf::make_lists_column( + num_rows, c1_offsets.release(), c1_vals.release(), null_count, std::move(null_mask)); + auto c1 = cudf::purge_nonempty_nulls(*_c1); + + auto const expected = table_view{{*c1}}; + auto expected_slice = cudf::slice(expected, {num_slice, 2 * num_slice}); + ASSERT_EQ(expected_slice[0].num_rows(), num_slice); + + auto const filepath = temp_env->get_temp_filepath("DeltaBinaryPackedListSliced.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected_slice) + .write_v2_headers(true) + .dictionary_policy(cudf::io::dictionary_policy::NEVER); + 
cudf::io::write_parquet(out_opts); + + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(in_opts); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_slice, result.tbl->view()); +} + TEST_F(ParquetWriterTest, EmptyMinStringStatistics) { char const* const min_val = ""; diff --git a/cpp/tests/streams/null_mask_test.cpp b/cpp/tests/streams/null_mask_test.cpp new file mode 100644 index 00000000000..7e59201c8cf --- /dev/null +++ b/cpp/tests/streams/null_mask_test.cpp @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include + +#include +#include +#include + +class NullMaskTest : public cudf::test::BaseFixture {}; + +TEST_F(NullMaskTest, CreateNullMask) +{ + cudf::create_null_mask(10, cudf::mask_state::ALL_VALID, cudf::test::get_default_stream()); +} + +TEST_F(NullMaskTest, SetNullMask) +{ + cudf::test::fixed_width_column_wrapper col({0, 1, 0, 1, 1}, + {true, false, true, false, false}); + + cudf::set_null_mask(static_cast(col).null_mask(), + 0, + 3, + false, + cudf::test::get_default_stream()); +} + +TEST_F(NullMaskTest, CopyBitmask) +{ + cudf::test::fixed_width_column_wrapper const col({0, 1, 0, 1, 1}, + {true, false, true, false, false}); + + cudf::copy_bitmask( + static_cast(col).null_mask(), 0, 3, cudf::test::get_default_stream()); +} + +TEST_F(NullMaskTest, CopyBitmaskFromColumn) +{ + cudf::test::fixed_width_column_wrapper const col({0, 1, 0, 1, 1}, + {true, false, true, false, false}); + + cudf::copy_bitmask(col, cudf::test::get_default_stream()); +} + +TEST_F(NullMaskTest, BitMaskAnd) +{ + cudf::test::fixed_width_column_wrapper const col1({0, 1, 0, 1, 1}, + {true, false, true, false, false}); + cudf::test::fixed_width_column_wrapper const col2({0, 1, 0, 1, 1}, + {true, true, false, false, true}); + + auto tbl = cudf::table_view{{col1, col2}}; + cudf::bitmask_and(tbl, cudf::test::get_default_stream()); +} + +TEST_F(NullMaskTest, BitMaskOr) +{ + cudf::test::fixed_width_column_wrapper const col1({0, 1, 0, 1, 1}, + {true, false, true, false, false}); + cudf::test::fixed_width_column_wrapper const col2({0, 1, 0, 1, 1}, + {true, true, false, false, true}); + + auto tbl = cudf::table_view{{col1, col2}}; + cudf::bitmask_or(tbl, cudf::test::get_default_stream()); +} + +TEST_F(NullMaskTest, NullCount) +{ + cudf::test::fixed_width_column_wrapper const col({0, 1, 0, 1, 1}, + {true, true, false, false, true}); + + cudf::null_count( + static_cast(col).null_mask(), 0, 4, cudf::test::get_default_stream()); +} diff --git a/dependencies.yaml b/dependencies.yaml index c19e8765be3..e8114fa5615 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -62,6 +62,7 @@ files: includes: - cudatoolkit - docs + - libarrow_run - py_version py_build_cudf: output: pyproject @@ -225,7 +226,7 @@ dependencies: - &gmock gmock>=1.13.0 # Hard pin the patch version used during the build. 
This must be kept # in sync with the version pinned in get_arrow.cmake. - - libarrow==12.0.1.* + - &libarrow libarrow==12.0.0.* - librdkafka>=1.9.0,<1.10.0a0 # Align nvcomp version with rapids-cmake - nvcomp==2.6.1 @@ -243,7 +244,7 @@ dependencies: - cython>=3.0.0 # Hard pin the patch version used during the build. This must be kept # in sync with the version pinned in get_arrow.cmake. - - pyarrow==12.0.1.* + - &pyarrow pyarrow==12.0.0.* # TODO: Pin to numpy<1.25 until cudf requires pandas 2 - &numpy numpy>=1.21,<1.25 build_python: @@ -260,16 +261,14 @@ dependencies: - protoc-wheel libarrow_run: common: - - output_types: [conda, requirements] + - output_types: conda packages: - # Allow runtime version to float up to minor version - - libarrow==12.* + - *libarrow pyarrow_run: common: - output_types: [conda, requirements, pyproject] packages: - # Allow runtime version to float up to minor version - - pyarrow==12.* + - *pyarrow cudatoolkit: specific: - output_types: conda @@ -282,6 +281,9 @@ dependencies: - cuda-nvrtc-dev - cuda-nvtx-dev - libcurand-dev + # TODO: Remove after https://github.com/rapidsai/cudf/pull/14292 updates + # cudf_kafka to use scikit-build + - cuda-gdb - matrix: cuda: "11.8" packages: diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 39a8dca0267..ccb5d5d4416 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -8,7 +8,7 @@ requires = [ "ninja", "numpy>=1.21,<1.25", "protoc-wheel", - "pyarrow==12.0.1.*", + "pyarrow==12.0.0.*", "rmm==23.12.*", "scikit-build>=0.13.1", "setuptools", @@ -38,7 +38,7 @@ dependencies = [ "pandas>=1.3,<1.6.0dev0", "protobuf>=4.21,<5", "ptxcompiler", - "pyarrow==12.*", + "pyarrow==12.0.0.*", "rmm==23.12.*", "typing_extensions>=4.0.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index 78a7a83ac3a..ff475e5a72e 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -5,7 +5,7 @@ requires = [ "cython>=3.0.0", "numpy>=1.21,<1.25", - "pyarrow==12.0.1.*", + "pyarrow==12.0.0.*", "setuptools", "wheel", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
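
The gpuEncodeDeltaBinaryPages kernel added above emits Parquet DELTA_BINARY_PACKED data, whose headers store unsigned quantities as ULEB128 varints and signed quantities (the first value and each block's minimum delta) zigzag-encoded; this is why parquet_gpu.hpp introduces the uleb128_t and zigzag128_t aliases. Below is a minimal host-side sketch of those two primitives, included for reference only and not taken from the patch itself.

#include <cstdint>
#include <vector>

// Zigzag-encode a signed 64-bit value so small magnitudes map to small
// unsigned values (0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, ...).
inline uint64_t zigzag_encode(int64_t v)
{
  return (static_cast<uint64_t>(v) << 1) ^ static_cast<uint64_t>(v >> 63);
}

// ULEB128: emit 7 bits per byte, least-significant group first, with the
// high bit set on every byte except the last.
inline void uleb128_encode(uint64_t v, std::vector<uint8_t>& out)
{
  while (v > 0x7f) {
    out.push_back(static_cast<uint8_t>(v) | 0x80);
    v >>= 7;
  }
  out.push_back(static_cast<uint8_t>(v));
}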
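
EncodePages now ORs the per-page kernel_mask values into one bitmask and launches only the encoder kernels whose bit is set, each on its own forked stream. The sketch below restates that dispatch pattern in plain host code as an aid to reading the hunk; the names (page, encode_pages, launch_*) are illustrative stand-ins, not the cudf API, and the real code performs the reduction with thrust and joins the forked streams afterwards.

#include <cstdint>
#include <functional>
#include <vector>

enum class encode_kernel_mask : uint32_t {
  PLAIN        = 1u << 0,
  DICTIONARY   = 1u << 1,
  DELTA_BINARY = 1u << 2,
};

struct page { encode_kernel_mask kernel_mask; };

void encode_pages(std::vector<page> const& pages,
                  std::function<void()> launch_plain,
                  std::function<void()> launch_dict,
                  std::function<void()> launch_delta)
{
  // Reduce the per-page masks into a single bitmask of required encoders.
  uint32_t mask = 0;
  for (auto const& p : pages) { mask |= static_cast<uint32_t>(p.kernel_mask); }

  // One launch per encoding family whose bit is set.
  if (mask & static_cast<uint32_t>(encode_kernel_mask::PLAIN)) { launch_plain(); }
  if (mask & static_cast<uint32_t>(encode_kernel_mask::DICTIONARY)) { launch_dict(); }
  if (mask & static_cast<uint32_t>(encode_kernel_mask::DELTA_BINARY)) { launch_delta(); }
}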
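
In the INT96 branch of the encoder, a timestamp is written as 12 bytes: nanoseconds within the day (8 bytes, little-endian) followed by the Julian day number (4 bytes), which is the pair that julian_days_with_time produces on the device. As a rough illustration of that split only, assuming a non-negative timestamp in whole seconds since the Unix epoch (whose Julian day number is 2440588):

#include <cstdint>
#include <utility>

// Returns {nanoseconds within the day, Julian day number} for a timestamp
// given in whole seconds since the Unix epoch (non-negative for simplicity).
inline std::pair<int64_t, uint32_t> to_int96_parts(int64_t unix_seconds)
{
  constexpr int64_t  seconds_per_day   = 86'400;
  constexpr uint32_t unix_epoch_julian = 2'440'588;  // 1970-01-01 as a Julian day

  int64_t const days  = unix_seconds / seconds_per_day;
  int64_t const nanos = (unix_seconds % seconds_per_day) * 1'000'000'000;
  return {nanos, unix_epoch_julian + static_cast<uint32_t>(days)};
}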