diff --git a/.github/workflows/auto-merge.yml b/.github/workflows/auto-merge.yml
index 8a71a90f3b..87aad7f034 100755
--- a/.github/workflows/auto-merge.yml
+++ b/.github/workflows/auto-merge.yml
@@ -18,25 +18,57 @@ name: auto-merge HEAD to BASE
 on:
   pull_request_target:
     branches:
-      - branch-24.08
+      - branch-*
     types: [closed]
 
-env:
-  HEAD: branch-24.08
-  BASE: branch-24.10
-
 jobs:
-  auto-merge:
+  auto-merge: # TODO: use spark-rapids-common shared action when available
     if: github.event.pull_request.merged == true
     runs-on: ubuntu-latest
 
     steps:
+      - name: set HEAD ENV
+        run: echo "HEAD=${{ github.event.pull_request.base.ref }}" >> $GITHUB_ENV
+
+      - name: Generate target branch
+        run: |
+          # HEAD (exported via GITHUB_ENV above) must look like branch-YY.MM; target is two months later
+          version="${HEAD#branch-}"
+          if [[ ! "$version" =~ ^[0-9]{2}\.[0-9]{2}$ ]]; then
+            echo "Unexpected branch name '$HEAD' (expected branch-YY.MM), cannot derive target" >&2; exit 1
+          fi
+          year=$((10#${version%.*}))       # 10# forces base 10 so values like 08/09 are not read as octal
+          month=$((10#${version#*.} + 2))
+          if [ "$month" -gt 12 ]; then
+            month=$((month - 12))
+            year=$((year + 1))
+          fi
+
+          next_release=$(printf "%02d.%02d" "$year" "$month")
+          echo "Next release is $next_release"
+          echo "BASE=branch-$next_release" >> "$GITHUB_ENV"
+
+      - name: Check if target branch exists
+        run: |
+          CODE=$(curl -s -o /dev/null -w "%{http_code}" -H "Authorization: token ${{ secrets.AUTOMERGE_TOKEN }}" \
+            https://api.github.com/repos/${{ github.repository }}/branches/${{ env.BASE }})
+          echo "Response code: $CODE..."
+
+          if [ $CODE -eq 200 ]; then
+            echo "branch_exists=true" >> $GITHUB_ENV
+          else
+            echo "branch_exists=false" >> $GITHUB_ENV
+            echo "Failed to find ${{ env.BASE }}. Skip auto-merge..."
+          fi
+
       - uses: actions/checkout@v4
+        if: env.branch_exists == 'true'
         with:
           ref: ${{ env.HEAD }} # force to fetch from latest upstream instead of PR ref
           token: ${{ secrets.AUTOMERGE_TOKEN }} # workaround auto-merge token to avoid GITHUB_TOKEN insufficient permission
 
       - name: push intermediate branch for auto-merge
+        if: env.branch_exists == 'true'
         run: |
           git config user.name "spark-rapids automation"
           git config user.email "70000568+nvauto@users.noreply.github.com "
@@ -55,6 +87,7 @@ jobs:
           FILE_USE_BASE: thirdparty/cudf thirdparty/cudf-pins
 
       - name: auto-merge job
+        if: env.branch_exists == 'true'
         uses: ./.github/workflows/action-helper
         with:
           operator: auto-merge
@@ -64,4 +97,3 @@ jobs:
           HEAD: bot-auto-merge-${{ env.HEAD }}
           BASE: ${{ env.BASE }}
           TOKEN: ${{ secrets.AUTOMERGE_TOKEN }}
-
diff --git a/.github/workflows/blossom-ci.yml b/.github/workflows/blossom-ci.yml
index a17fbddd83..d58c76824d 100644
--- a/.github/workflows/blossom-ci.yml
+++ b/.github/workflows/blossom-ci.yml
@@ -68,7 +68,9 @@ jobs:
         github.actor == 'binmahone' ||
         github.actor == 'pmattione-nvidia' ||
         github.actor == 'Feng-Jiang28' ||
-        github.actor == 'pxLi'
+        github.actor == 'pxLi' ||
+        github.actor == 'ustcfy' ||
+        github.actor == 'zpuller'
       )
     steps:
       - name: Check if comment is issued by authorized person
diff --git a/.github/workflows/signoff-check.yml b/.github/workflows/signoff-check.yml
index 4a0eece7d1..c08911350b 100755
--- a/.github/workflows/signoff-check.yml
+++ b/.github/workflows/signoff-check.yml
@@ -23,12 +23,10 @@ jobs:
   signoff-check:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v4
-
-      - name: sigoff-check job
-        uses: ./.github/workflows/signoff-check
-        env:
-          OWNER: NVIDIA
-          REPO_NAME: spark-rapids-jni
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          PULL_NUMBER: ${{ github.event.number }}
+      - name: signoff
+        uses: NVIDIA/spark-rapids-common/signoff-check@main
+        with:
+          owner: ${{ github.repository_owner }}
+          repo: spark-rapids-jni
+          pull_number: ${{ github.event.number }}
+          token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/signoff-check/Dockerfile b/.github/workflows/signoff-check/Dockerfile
deleted file mode 100755
index 6d91c6c832..0000000000
--- a/.github/workflows/signoff-check/Dockerfile
+++ /dev/null
@@ -1,22 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-FROM python:3.8-slim-buster
-
-WORKDIR /
-COPY signoff-check .
-RUN pip install PyGithub && chmod +x /signoff-check
-
-# require envs: OWNER,REPO_NAME,GITHUB_TOKEN,PULL_NUMBER
-ENTRYPOINT ["/signoff-check"]
diff --git a/.github/workflows/signoff-check/action.yml b/.github/workflows/signoff-check/action.yml
deleted file mode 100755
index 8b608f8717..0000000000
--- a/.github/workflows/signoff-check/action.yml
+++ /dev/null
@@ -1,19 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-name: 'signoff check action'
-description: 'check if PR got signed off'
-runs:
-  using: 'docker'
-  image: 'Dockerfile'
diff --git a/.github/workflows/signoff-check/signoff-check b/.github/workflows/signoff-check/signoff-check
deleted file mode 100755
index e1b451efe2..0000000000
--- a/.github/workflows/signoff-check/signoff-check
+++ /dev/null
@@ -1,71 +0,0 @@
-#!/usr/bin/env python
-
-# Copyright (c) 2022, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""A signoff check
-
-The tool checks if any commit got signoff in a pull request.
-
-NOTE: this script is for github actions only, you should not use it anywhere else.
-"""
-import os
-import re
-import sys
-from argparse import ArgumentParser
-
-from github import Github
-
-SIGNOFF_REGEX = re.compile('Signed-off-by:')
-
-
-def signoff(token: str, owner: str, repo_name: str, pull_number: int):
-    gh = Github(token, per_page=100, user_agent='signoff-check', verify=True)
-    pr = gh.get_repo(f"{owner}/{repo_name}").get_pull(pull_number)
-    for c in pr.get_commits():
-        if SIGNOFF_REGEX.search(c.commit.message):
-            print('Found signoff.\n')
-            print(f"Commit sha:\n{c.commit.sha}")
-            print(f"Commit message:\n{c.commit.message}")
-            return True
-    return False
-
-
-def main(token: str, owner: str, repo_name: str, pull_number: int):
-    try:
-        if not signoff(token, owner, repo_name, pull_number):
-            raise Exception('No commits w/ signoff')
-    except Exception as e:  # pylint: disable=broad-except
-        print(e)
-        sys.exit(1)
-
-
-if __name__ == '__main__':
-    parser = ArgumentParser(description="signoff check")
-    parser.add_argument("--owner", help="repo owner", default='')
-    parser.add_argument("--repo_name", help="repo name", default='')
-    parser.add_argument("--token", help="github token, will use GITHUB_TOKEN if empty", default='')
-    parser.add_argument("--pull_number", help="pull request number", type=int)
-    args = parser.parse_args()
-
-    GITHUB_TOKEN = args.token if args.token else os.environ.get('GITHUB_TOKEN')
-    assert GITHUB_TOKEN, 'env GITHUB_TOKEN should not be empty'
-    OWNER = args.owner if args.owner else os.environ.get('OWNER')
-    assert OWNER, 'env OWNER should not be empty'
-    REPO_NAME = args.repo_name if args.repo_name else os.environ.get('REPO_NAME')
-    assert REPO_NAME, 'env REPO_NAME should not be empty'
-    PULL_NUMBER = args.pull_number if args.pull_number else int(os.environ.get('PULL_NUMBER'))
-    assert PULL_NUMBER, 'env PULL_NUMBER should not be empty'
-
-    main(token=GITHUB_TOKEN, owner=OWNER, repo_name=REPO_NAME, pull_number=PULL_NUMBER)
diff --git a/.gitmodules b/.gitmodules
index 862e1ef3e6..e2001c2c84 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,4 +1,4 @@
 [submodule "thirdparty/cudf"]
 	path = thirdparty/cudf
 	url = https://github.com/rapidsai/cudf.git
-	branch = branch-24.08
+	branch = branch-24.10
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 7f83e2169b..271e62feb1 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -165,7 +165,7 @@ $ ./build/build-in-docker install ...
 ```
 
 Now cd to ~/repos/NVIDIA/spark-rapids and build with one of the options from
-[spark-rapids instructions](https://github.com/NVIDIA/spark-rapids/blob/branch-24.08/CONTRIBUTING.md#building-from-source).
+[spark-rapids instructions](https://github.com/NVIDIA/spark-rapids/blob/branch-24.10/CONTRIBUTING.md#building-from-source).
 
 ```bash
 $ ./build/buildall
@@ -299,7 +299,7 @@ then run `cuda-gdb`. You do not necessarily need to run `cuda-gdb` in Docker):
 
 ```bash
 ./build/run-in-docker
-bash-4.2$ cuda-gdb target/cmake-build/gtests/ROW_CONVERSION
+bash-4.2$ cuda-gdb target/jni/cmake-build/gtests/ROW_CONVERSION
 ```
 
 You can also use the [NVIDIA Nsight VSCode Code Integration](https://docs.nvidia.com/nsight-visual-studio-code-edition/cuda-debugger/index.html)
diff --git a/build/apply-patches b/build/apply-patches
new file mode 100755
index 0000000000..991613e6dc
--- /dev/null
+++ b/build/apply-patches
@@ -0,0 +1,39 @@
+#!/bin/bash
+
+#
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Apply every patch file found in PATCH_DIR to the cudf checkout in CUDF_DIR
+
+set -e
+
+BASE_DIR=$( git rev-parse --show-toplevel )
+
+PATCH_DIR=${PATCH_DIR:-$(realpath "$BASE_DIR/patches/")}
+
+CUDF_DIR=${CUDF_DIR:-$(realpath "$BASE_DIR/thirdparty/cudf/")}
+
+pushd "$CUDF_DIR"
+if [ -n "$(git status --porcelain --untracked-files=no)" ] ; then
+    echo "Error: CUDF repository has uncommitted changes. No patches will be applied..."
+    exit 1
+fi
+
+find "$PATCH_DIR" -maxdepth 1 -type f -print0 | sort -zV | while IFS= read -r -d '' file; do
+    echo "patching with: $file"
+    patch --no-backup-if-mismatch -f -t --reject-file=- -p1 -i "$file"
+done
+popd
diff --git a/build/unapply-patches b/build/unapply-patches
new file mode 100755
index 0000000000..186a781ade
--- /dev/null
+++ b/build/unapply-patches
@@ -0,0 +1,44 @@
+#!/bin/bash
+
+#
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Reverse-apply every patch file found in PATCH_DIR on the cudf checkout in CUDF_DIR
+
+set -e
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+
+PATCH_DIR=${PATCH_DIR:-$(realpath "$SCRIPT_DIR/../patches/")}
+
+CUDF_DIR=${CUDF_DIR:-$(realpath "$SCRIPT_DIR/../thirdparty/cudf/")}
+
+
+pushd "$CUDF_DIR"
+if [ -n "$(git status --porcelain --untracked-files=no)" ] ; then
+  # Only try to revert the patches if the working tree looks modified
+  find "$PATCH_DIR" -maxdepth 1 -type f -print0 | sort -zV -r | while IFS= read -r -d '' file; do
+      echo "patching with: $file"
+      patch -R --no-backup-if-mismatch --reject-file=- -f -t -p1 -i "$file"
+  done
+fi
+
+# Check for modifications
+if [ -n "$(git status --porcelain --untracked-files=no)" ] ; then
+    echo "Error: CUDF repository has uncommitted changes. You might want to clean in manually if you know that is expected"
+    exit 1
+fi
+popd
diff --git a/ci/submodule-sync.sh b/ci/submodule-sync.sh
index a889d86eb0..29b0cf5dad 100755
--- a/ci/submodule-sync.sh
+++ b/ci/submodule-sync.sh
@@ -70,34 +70,32 @@ echo "Test against ${cudf_sha}..."
 
 MVN="mvn -Dmaven.wagon.http.retryHandler.count=3 -B"
 set +e
-${MVN} verify ${MVN_MIRROR} \
+# Don't do a full build. Just try to update/build CUDF with no patches on top of it.
+${MVN} validate ${MVN_MIRROR} \
   -DCPP_PARALLEL_LEVEL=${PARALLEL_LEVEL} \
   -Dlibcudf.build.configure=true \
   -Dlibcudf.dependency.mode=latest \
+  -Dsubmodule.patch.skip \
   -DUSE_GDS=ON -Dtest=*,!CuFileTest,!CudaFatalTest,!ColumnViewNonEmptyNullsTest \
   -DBUILD_TESTS=ON \
   -DUSE_SANITIZER=ON
-verify_status=$?
+validate_status=$?
 set -e
 
-test_pass="False"
-if [[ "${verify_status}" == "0" ]]; then
-  echo "Test passed, will try merge the change"
-  test_pass="True"
-else
-  echo "Test failed, will update the result"
-fi
-
-build_name=$(${MVN} help:evaluate -Dexpression=project.build.finalName -q -DforceStdout)
-cuda_version=$(${MVN} help:evaluate -Dexpression=cuda.version -q -DforceStdout)
-. ci/check-cuda-dependencies.sh "target/${build_name}-${cuda_version}.jar"
-
 LIBCUDF_BUILD_PATH=$(${MVN} help:evaluate -Dexpression=libcudf.build.path -q -DforceStdout)
 # Extract the rapids-cmake sha1 that we need to pin too
 rapids_cmake_sha=$(git -C ${LIBCUDF_BUILD_PATH}/_deps/rapids-cmake-src/ rev-parse HEAD)
 echo "Update rapids-cmake pinned SHA1 to ${rapids_cmake_sha}"
 echo "${rapids_cmake_sha}" > thirdparty/cudf-pins/rapids-cmake.sha
 
+# Force the pinned nvcomp version back to 3.0.6 (with the matching download URLs) until
+# the nvcomp version mismatch is fixed. See https://github.com/rapidsai/cudf/issues/16772.
+echo "Revert nvcomp to 3.0.6"
+sed -i -e 's/4\.0\.1\.0/3.0.6/' \
+  -e 's|https://developer.download.nvidia.com/compute/nvcomp/${version}/local_installers/nvcomp-linux-sbsa-${version}-cuda${cuda-toolkit-version-mapping}.tar.gz|https://developer.download.nvidia.com/compute/nvcomp/${version}/local_installers/nvcomp_${version}_SBSA_${cuda-toolkit-version-mapping}.tgz|' \
+  -e 's|https://developer.download.nvidia.com/compute/nvcomp/${version}/local_installers/nvcomp-linux-x86_64-${version}-cuda${cuda-toolkit-version-mapping}.tar.gz|https://developer.download.nvidia.com/compute/nvcomp/${version}/local_installers/nvcomp_${version}_x86_64_${cuda-toolkit-version-mapping}.tgz|' \
+  thirdparty/cudf-pins/versions.json
+
 # Do the git add after the build so that we get
 # the updated versions.json generated by the build
 echo "Update cudf submodule to ${cudf_sha} with updated pinned versions"
@@ -105,6 +103,29 @@ git add .
 git diff-index --quiet HEAD || git commit -s -m "Update submodule cudf to ${cudf_sha}"
 sha=$(git rev-parse HEAD)
 
+set +e
+# now build and test everything with the patches in place
+${MVN} clean verify ${MVN_MIRROR} \
+  -DCPP_PARALLEL_LEVEL=${PARALLEL_LEVEL} \
+  -Dlibcudf.build.configure=true \
+  -DUSE_GDS=ON -Dtest=*,!CuFileTest,!CudaFatalTest,!ColumnViewNonEmptyNullsTest \
+  -DBUILD_TESTS=ON \
+  -DUSE_SANITIZER=ON
+verify_status=$?
+set -e
+
+test_pass="False"
+if [[ ( "${verify_status}" == "0" ) && ( "${validate_status}" == "0" ) ]]; then
+  echo "Test passed, will try merge the change"
+  test_pass="True"
+else
+  echo "Test failed, will update the result"
+fi
+
+build_name=$(${MVN} help:evaluate -Dexpression=project.build.finalName -q -DforceStdout)
+cuda_version=$(${MVN} help:evaluate -Dexpression=cuda.version -q -DforceStdout)
+. ci/check-cuda-dependencies.sh "target/${build_name}-${cuda_version}.jar"
+
 # push the intermediate branch and create PR against REF
 # if test passed, it will try auto-merge the PR
 # if test failed, it will only comment the test result in the PR
diff --git a/patches/revert_nvcomp4.patch b/patches/revert_nvcomp4.patch
new file mode 100644
index 0000000000..88b58b14dc
--- /dev/null
+++ b/patches/revert_nvcomp4.patch
@@ -0,0 +1,907 @@
+diff --git a/ci/build_wheel_cudf.sh b/ci/build_wheel_cudf.sh
+index fb93b06dbe..e5565c4b53 100755
+--- a/ci/build_wheel_cudf.sh
++++ b/ci/build_wheel_cudf.sh
+@@ -23,6 +23,8 @@ export PIP_CONSTRAINT="/tmp/constraints.txt"
+ python -m auditwheel repair \
+     --exclude libcudf.so \
+     --exclude libnvcomp.so \
++    --exclude libnvcomp_bitcomp.so \
++    --exclude libnvcomp_gdeflate.so \
+     -w ${package_dir}/final_dist \
+     ${package_dir}/dist/*
+ 
+diff --git a/ci/build_wheel_pylibcudf.sh b/ci/build_wheel_pylibcudf.sh
+index 5e9f7f8a0c..0e4745bda2 100755
+--- a/ci/build_wheel_pylibcudf.sh
++++ b/ci/build_wheel_pylibcudf.sh
+@@ -21,6 +21,8 @@ export PIP_CONSTRAINT="/tmp/constraints.txt"
+ python -m auditwheel repair \
+     --exclude libcudf.so \
+     --exclude libnvcomp.so \
++    --exclude libnvcomp_bitcomp.so \
++    --exclude libnvcomp_gdeflate.so \
+     -w ${package_dir}/final_dist \
+     ${package_dir}/dist/*
+ 
+diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
+index 5a05dfd053..e7363645d6 100644
+--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
++++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
+@@ -58,7 +58,7 @@ dependencies:
+ - numpy>=1.23,<3.0a0
+ - numpydoc
+ - nvcc_linux-64=11.8
+-- nvcomp==4.0.1
++- nvcomp==3.0.6
+ - nvtx>=0.2.1
+ - openpyxl
+ - packaging
+diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
+index 8490296233..3559a1a341 100644
+--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
++++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
+@@ -56,7 +56,7 @@ dependencies:
+ - numba>=0.57
+ - numpy>=1.23,<3.0a0
+ - numpydoc
+-- nvcomp==4.0.1
++- nvcomp==3.0.6
+ - nvtx>=0.2.1
+ - openpyxl
+ - packaging
+diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml
+index dc75eb4b25..67d501d746 100644
+--- a/conda/recipes/libcudf/conda_build_config.yaml
++++ b/conda/recipes/libcudf/conda_build_config.yaml
+@@ -35,7 +35,7 @@ spdlog_version:
+   - ">=1.14.1,<1.15"
+ 
+ nvcomp_version:
+-  - "=4.0.1"
++  - "=3.0.6"
+ 
+ zlib_version:
+   - ">=1.2.13"
+diff --git a/cpp/include/cudf/io/nvcomp_adapter.hpp b/cpp/include/cudf/io/nvcomp_adapter.hpp
+index 0d74a4158a..f3260d0cb5 100644
+--- a/cpp/include/cudf/io/nvcomp_adapter.hpp
++++ b/cpp/include/cudf/io/nvcomp_adapter.hpp
+@@ -24,7 +24,7 @@
+ namespace CUDF_EXPORT cudf {
+ namespace io::nvcomp {
+ 
+-enum class compression_type { SNAPPY, ZSTD, DEFLATE, LZ4, GZIP };
++enum class compression_type { SNAPPY, ZSTD, DEFLATE, LZ4 };
+ 
+ /**
+  * @brief Set of parameters that impact whether nvCOMP features are enabled.
+@@ -36,20 +36,33 @@ struct feature_status_parameters {
+   int lib_patch_version;                 ///< patch version
+   bool are_all_integrations_enabled;     ///< all integrations
+   bool are_stable_integrations_enabled;  ///< stable integrations
++  int compute_capability_major;          ///< cuda compute major version
+ 
+   /**
+-   * @brief Default constructor using the current version of nvcomp and current environment
+-   * variables
++   * @brief Default Constructor
+    */
+   feature_status_parameters();
+ 
+   /**
+-   * @brief Constructor using the current version of nvcomp
++   * @brief feature_status_parameters Constructor
+    *
++   * @param major positive integer representing major value of nvcomp
++   * @param minor positive integer representing minor value of nvcomp
++   * @param patch positive integer representing patch value of nvcomp
+    * @param all_enabled if all integrations are enabled
+    * @param stable_enabled if stable integrations are enabled
++   * @param cc_major CUDA compute capability
+    */
+-  feature_status_parameters(bool all_enabled, bool stable_enabled);
++  feature_status_parameters(
++    int major, int minor, int patch, bool all_enabled, bool stable_enabled, int cc_major)
++    : lib_major_version{major},
++      lib_minor_version{minor},
++      lib_patch_version{patch},
++      are_all_integrations_enabled{all_enabled},
++      are_stable_integrations_enabled{stable_enabled},
++      compute_capability_major{cc_major}
++  {
++  }
+ };
+ 
+ /**
+@@ -61,7 +74,8 @@ inline bool operator==(feature_status_parameters const& lhs, feature_status_para
+          lhs.lib_minor_version == rhs.lib_minor_version and
+          lhs.lib_patch_version == rhs.lib_patch_version and
+          lhs.are_all_integrations_enabled == rhs.are_all_integrations_enabled and
+-         lhs.are_stable_integrations_enabled == rhs.are_stable_integrations_enabled;
++         lhs.are_stable_integrations_enabled == rhs.are_stable_integrations_enabled and
++         lhs.compute_capability_major == rhs.compute_capability_major;
+ }
+ 
+ /**
+diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp
+index c3187f73a9..3191e8f015 100644
+--- a/cpp/src/io/comp/nvcomp_adapter.cpp
++++ b/cpp/src/io/comp/nvcomp_adapter.cpp
+@@ -22,46 +22,94 @@
+ #include <cudf/io/config_utils.hpp>
+ #include <cudf/utilities/error.hpp>
+ 
+-#include <nvcomp/deflate.h>
+-#include <nvcomp/gzip.h>
+ #include <nvcomp/lz4.h>
+ #include <nvcomp/snappy.h>
+-#include <nvcomp/zstd.h>
+ 
+ #include <mutex>
+ 
++#define NVCOMP_DEFLATE_HEADER <nvcomp/deflate.h>
++#if __has_include(NVCOMP_DEFLATE_HEADER)
++#include NVCOMP_DEFLATE_HEADER
++#endif
++
++#define NVCOMP_ZSTD_HEADER <nvcomp/zstd.h>
++#if __has_include(NVCOMP_ZSTD_HEADER)
++#include NVCOMP_ZSTD_HEADER
++#endif
++
++// When building with nvcomp 4.0 or newer, map the new version macros to the old ones
++#ifndef NVCOMP_MAJOR_VERSION
++#define NVCOMP_MAJOR_VERSION NVCOMP_VER_MAJOR
++#define NVCOMP_MINOR_VERSION NVCOMP_VER_MINOR
++#define NVCOMP_PATCH_VERSION NVCOMP_VER_PATCH
++#endif
++
++#define NVCOMP_HAS_ZSTD_DECOMP(MAJOR, MINOR, PATCH) (MAJOR > 2 or (MAJOR == 2 and MINOR >= 3))
++
++#define NVCOMP_HAS_ZSTD_COMP(MAJOR, MINOR, PATCH) (MAJOR > 2 or (MAJOR == 2 and MINOR >= 4))
++
++#define NVCOMP_HAS_DEFLATE(MAJOR, MINOR, PATCH) (MAJOR > 2 or (MAJOR == 2 and MINOR >= 5))
++
++#define NVCOMP_HAS_DECOMP_TEMPSIZE_EX(MAJOR, MINOR, PATCH) \
++  (MAJOR > 2 or (MAJOR == 2 and MINOR > 3) or (MAJOR == 2 and MINOR == 3 and PATCH >= 1))
++
++#define NVCOMP_HAS_COMP_TEMPSIZE_EX(MAJOR, MINOR, PATCH) (MAJOR > 2 or (MAJOR == 2 and MINOR >= 6))
++
++// ZSTD is stable for nvcomp 2.3.2 or newer
++#define NVCOMP_ZSTD_DECOMP_IS_STABLE(MAJOR, MINOR, PATCH) \
++  (MAJOR > 2 or (MAJOR == 2 and MINOR > 3) or (MAJOR == 2 and MINOR == 3 and PATCH >= 2))
++
+ namespace cudf::io::nvcomp {
+ 
+ // Dispatcher for nvcompBatched<format>DecompressGetTempSizeEx
+ template <typename... Args>
+-auto batched_decompress_get_temp_size_ex(compression_type compression, Args&&... args)
++std::optional<nvcompStatus_t> batched_decompress_get_temp_size_ex(compression_type compression,
++                                                                  Args&&... args)
+ {
++#if NVCOMP_HAS_DECOMP_TEMPSIZE_EX(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION)
+   switch (compression) {
+     case compression_type::SNAPPY:
+       return nvcompBatchedSnappyDecompressGetTempSizeEx(std::forward<Args>(args)...);
+     case compression_type::ZSTD:
++#if NVCOMP_HAS_ZSTD_DECOMP(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION)
+       return nvcompBatchedZstdDecompressGetTempSizeEx(std::forward<Args>(args)...);
++#else
++      return std::nullopt;
++#endif
+     case compression_type::LZ4:
+       return nvcompBatchedLZ4DecompressGetTempSizeEx(std::forward<Args>(args)...);
+-    case compression_type::DEFLATE:
+-      return nvcompBatchedDeflateDecompressGetTempSizeEx(std::forward<Args>(args)...);
+-    case compression_type::GZIP:
+-      return nvcompBatchedGzipDecompressGetTempSizeEx(std::forward<Args>(args)...);
+-    default: CUDF_FAIL("Unsupported compression type");
++    case compression_type::DEFLATE: [[fallthrough]];
++    default: return std::nullopt;
+   }
++#endif
++  return std::nullopt;
+ }
+-size_t batched_decompress_temp_size(compression_type compression,
+-                                    size_t num_chunks,
+-                                    size_t max_uncomp_chunk_size,
+-                                    size_t max_total_uncomp_size)
+-{
+-  size_t temp_size             = 0;
+-  nvcompStatus_t nvcomp_status = batched_decompress_get_temp_size_ex(
+-    compression, num_chunks, max_uncomp_chunk_size, &temp_size, max_total_uncomp_size);
+ 
+-  CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess,
+-               "Unable to get scratch size for decompression");
+-  return temp_size;
++// Dispatcher for nvcompBatched<format>DecompressGetTempSize
++template <typename... Args>
++auto batched_decompress_get_temp_size(compression_type compression, Args&&... args)
++{
++  switch (compression) {
++    case compression_type::SNAPPY:
++      return nvcompBatchedSnappyDecompressGetTempSize(std::forward<Args>(args)...);
++    case compression_type::ZSTD:
++#if NVCOMP_HAS_ZSTD_DECOMP(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION)
++      return nvcompBatchedZstdDecompressGetTempSize(std::forward<Args>(args)...);
++#else
++      CUDF_FAIL("Decompression error: " +
++                nvcomp::is_decompression_disabled(nvcomp::compression_type::ZSTD).value());
++#endif
++    case compression_type::DEFLATE:
++#if NVCOMP_HAS_DEFLATE(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION)
++      return nvcompBatchedDeflateDecompressGetTempSize(std::forward<Args>(args)...);
++#else
++      CUDF_FAIL("Decompression error: " +
++                nvcomp::is_decompression_disabled(nvcomp::compression_type::DEFLATE).value());
++#endif
++    case compression_type::LZ4:
++      return nvcompBatchedLZ4DecompressGetTempSize(std::forward<Args>(args)...);
++    default: CUDF_FAIL("Unsupported compression type");
++  }
+ }
+ 
+ // Dispatcher for nvcompBatched<format>DecompressAsync
+@@ -72,12 +120,20 @@ auto batched_decompress_async(compression_type compression, Args&&... args)
+     case compression_type::SNAPPY:
+       return nvcompBatchedSnappyDecompressAsync(std::forward<Args>(args)...);
+     case compression_type::ZSTD:
++#if NVCOMP_HAS_ZSTD_DECOMP(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION)
+       return nvcompBatchedZstdDecompressAsync(std::forward<Args>(args)...);
++#else
++      CUDF_FAIL("Decompression error: " +
++                nvcomp::is_decompression_disabled(nvcomp::compression_type::ZSTD).value());
++#endif
+     case compression_type::DEFLATE:
++#if NVCOMP_HAS_DEFLATE(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION)
+       return nvcompBatchedDeflateDecompressAsync(std::forward<Args>(args)...);
++#else
++      CUDF_FAIL("Decompression error: " +
++                nvcomp::is_decompression_disabled(nvcomp::compression_type::DEFLATE).value());
++#endif
+     case compression_type::LZ4: return nvcompBatchedLZ4DecompressAsync(std::forward<Args>(args)...);
+-    case compression_type::GZIP:
+-      return nvcompBatchedGzipDecompressAsync(std::forward<Args>(args)...);
+     default: CUDF_FAIL("Unsupported compression type");
+   }
+ }
+@@ -89,11 +145,31 @@ std::string compression_type_name(compression_type compression)
+     case compression_type::ZSTD: return "Zstandard";
+     case compression_type::DEFLATE: return "Deflate";
+     case compression_type::LZ4: return "LZ4";
+-    case compression_type::GZIP: return "GZIP";
+   }
+   return "compression_type(" + std::to_string(static_cast<int>(compression)) + ")";
+ }
+ 
++size_t batched_decompress_temp_size(compression_type compression,
++                                    size_t num_chunks,
++                                    size_t max_uncomp_chunk_size,
++                                    size_t max_total_uncomp_size)
++{
++  size_t temp_size   = 0;
++  auto nvcomp_status = batched_decompress_get_temp_size_ex(
++    compression, num_chunks, max_uncomp_chunk_size, &temp_size, max_total_uncomp_size);
++
++  if (nvcomp_status.value_or(nvcompStatus_t::nvcompErrorInternal) !=
++      nvcompStatus_t::nvcompSuccess) {
++    nvcomp_status =
++      batched_decompress_get_temp_size(compression, num_chunks, max_uncomp_chunk_size, &temp_size);
++  }
++
++  CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess,
++               "Unable to get scratch size for decompression");
++
++  return temp_size;
++}
++
+ void batched_decompress(compression_type compression,
+                         device_span<device_span<uint8_t const> const> inputs,
+                         device_span<device_span<uint8_t> const> outputs,
+@@ -128,10 +204,54 @@ void batched_decompress(compression_type compression,
+   update_compression_results(nvcomp_statuses, actual_uncompressed_data_sizes, results, stream);
+ }
+ 
+-size_t batched_compress_temp_size(compression_type compression,
+-                                  size_t batch_size,
+-                                  size_t max_uncompressed_chunk_bytes,
+-                                  size_t max_total_uncompressed_bytes)
++// Wrapper for nvcompBatched<format>CompressGetTempSize; returns the scratch size it reports
++auto batched_compress_get_temp_size(compression_type compression,
++                                    size_t batch_size,
++                                    size_t max_uncompressed_chunk_bytes)
++{
++  size_t temp_size             = 0;  // filled in by the nvCOMP call selected below
++  nvcompStatus_t nvcomp_status = nvcompStatus_t::nvcompSuccess;
++  switch (compression) {
++    case compression_type::SNAPPY:
++      nvcomp_status = nvcompBatchedSnappyCompressGetTempSize(
++        batch_size, max_uncompressed_chunk_bytes, nvcompBatchedSnappyDefaultOpts, &temp_size);
++      break;
++    case compression_type::DEFLATE:
++#if NVCOMP_HAS_DEFLATE(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION)
++      nvcomp_status = nvcompBatchedDeflateCompressGetTempSize(
++        batch_size, max_uncompressed_chunk_bytes, nvcompBatchedDeflateDefaultOpts, &temp_size);
++      break;
++#else  // DEFLATE unavailable in this nvCOMP version: fail with the reported reason
++      CUDF_FAIL("Compression error: " +
++                nvcomp::is_compression_disabled(nvcomp::compression_type::DEFLATE).value());
++#endif
++    case compression_type::ZSTD:
++#if NVCOMP_HAS_ZSTD_COMP(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION)
++      nvcomp_status = nvcompBatchedZstdCompressGetTempSize(
++        batch_size, max_uncompressed_chunk_bytes, nvcompBatchedZstdDefaultOpts, &temp_size);
++      break;
++#else  // ZSTD compression unavailable in this nvCOMP version: fail with the reported reason
++      CUDF_FAIL("Compression error: " +
++                nvcomp::is_compression_disabled(nvcomp::compression_type::ZSTD).value());
++#endif
++    case compression_type::LZ4:
++      nvcomp_status = nvcompBatchedLZ4CompressGetTempSize(
++        batch_size, max_uncompressed_chunk_bytes, nvcompBatchedLZ4DefaultOpts, &temp_size);
++      break;
++    default: CUDF_FAIL("Unsupported compression type");
++  }
++
++  CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess,
++               "Unable to get scratch size for compression");
++  return temp_size;
++}
++
++#if NVCOMP_HAS_COMP_TEMPSIZE_EX(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION)
++// Wrapper for nvcompBatched<format>CompressGetTempSizeEx
++auto batched_compress_get_temp_size_ex(compression_type compression,
++                                       size_t batch_size,
++                                       size_t max_uncompressed_chunk_bytes,
++                                       size_t max_total_uncompressed_bytes)
+ {
+   size_t temp_size             = 0;
+   nvcompStatus_t nvcomp_status = nvcompStatus_t::nvcompSuccess;
+@@ -171,8 +291,28 @@ size_t batched_compress_temp_size(compression_type compression,
+                "Unable to get scratch size for compression");
+   return temp_size;
+ }
++#endif
++
++size_t batched_compress_temp_size(compression_type compression,
++                                  size_t num_chunks,
++                                  size_t max_uncomp_chunk_size,
++                                  size_t max_total_uncomp_size)  // forwarded only to the Ex query
++{
++#if NVCOMP_HAS_COMP_TEMPSIZE_EX(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION)
++  try {  // prefer the Ex query when this nvCOMP version provides it
++    return batched_compress_get_temp_size_ex(
++      compression, num_chunks, max_uncomp_chunk_size, max_total_uncomp_size);
++  } catch (...) {
++    // Ignore errors in the expanded version; fall back to the old API in case of failure
++    CUDF_LOG_WARN(
++      "CompressGetTempSizeEx call failed, falling back to CompressGetTempSize; this may increase "
++      "the memory usage");
++  }
++#endif
++  // Reached when the Ex query is compiled out or threw above: use the non-Ex query
++  return batched_compress_get_temp_size(compression, num_chunks, max_uncomp_chunk_size);
++}
+ 
+-// Wrapper for nvcompBatched<format>CompressGetMaxOutputChunkSize
+ size_t compress_max_output_chunk_size(compression_type compression,
+                                       uint32_t max_uncompressed_chunk_bytes)
+ {
+@@ -188,13 +328,23 @@ size_t compress_max_output_chunk_size(compression_type compression,
+         capped_uncomp_bytes, nvcompBatchedSnappyDefaultOpts, &max_comp_chunk_size);
+       break;
+     case compression_type::DEFLATE:
++#if NVCOMP_HAS_DEFLATE(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION)
+       status = nvcompBatchedDeflateCompressGetMaxOutputChunkSize(
+         capped_uncomp_bytes, nvcompBatchedDeflateDefaultOpts, &max_comp_chunk_size);
+       break;
++#else
++      CUDF_FAIL("Compression error: " +
++                nvcomp::is_compression_disabled(nvcomp::compression_type::DEFLATE).value());
++#endif
+     case compression_type::ZSTD:
++#if NVCOMP_HAS_ZSTD_COMP(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION)
+       status = nvcompBatchedZstdCompressGetMaxOutputChunkSize(
+         capped_uncomp_bytes, nvcompBatchedZstdDefaultOpts, &max_comp_chunk_size);
+       break;
++#else
++      CUDF_FAIL("Compression error: " +
++                nvcomp::is_compression_disabled(nvcomp::compression_type::ZSTD).value());
++#endif
+     case compression_type::LZ4:
+       status = nvcompBatchedLZ4CompressGetMaxOutputChunkSize(
+         capped_uncomp_bytes, nvcompBatchedLZ4DefaultOpts, &max_comp_chunk_size);
+@@ -234,6 +384,7 @@ static void batched_compress_async(compression_type compression,
+                                                        stream.value());
+       break;
+     case compression_type::DEFLATE:
++#if NVCOMP_HAS_DEFLATE(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION)
+       nvcomp_status = nvcompBatchedDeflateCompressAsync(device_uncompressed_ptrs,
+                                                         device_uncompressed_bytes,
+                                                         max_uncompressed_chunk_bytes,
+@@ -245,7 +396,12 @@ static void batched_compress_async(compression_type compression,
+                                                         nvcompBatchedDeflateDefaultOpts,
+                                                         stream.value());
+       break;
++#else
++      CUDF_FAIL("Compression error: " +
++                nvcomp::is_compression_disabled(nvcomp::compression_type::DEFLATE).value());
++#endif
+     case compression_type::ZSTD:
++#if NVCOMP_HAS_ZSTD_COMP(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION)
+       nvcomp_status = nvcompBatchedZstdCompressAsync(device_uncompressed_ptrs,
+                                                      device_uncompressed_bytes,
+                                                      max_uncompressed_chunk_bytes,
+@@ -257,6 +413,10 @@ static void batched_compress_async(compression_type compression,
+                                                      nvcompBatchedZstdDefaultOpts,
+                                                      stream.value());
+       break;
++#else
++      CUDF_FAIL("Compression error: " +
++                nvcomp::is_compression_disabled(nvcomp::compression_type::ZSTD).value());
++#endif
+     case compression_type::LZ4:
+       nvcomp_status = nvcompBatchedLZ4CompressAsync(device_uncompressed_ptrs,
+                                                     device_uncompressed_bytes,
+@@ -318,18 +478,16 @@ void batched_compress(compression_type compression,
+ }
+ 
+ feature_status_parameters::feature_status_parameters()
+-  : feature_status_parameters(nvcomp_integration::is_all_enabled(),
+-                              nvcomp_integration::is_stable_enabled())
+-{
+-}
+-
+-feature_status_parameters::feature_status_parameters(bool all_enabled, bool stable_enabled)
+-  : lib_major_version{NVCOMP_VER_MAJOR},
+-    lib_minor_version{NVCOMP_VER_MINOR},
+-    lib_patch_version{NVCOMP_VER_PATCH},
+-    are_all_integrations_enabled{all_enabled},
+-    are_stable_integrations_enabled{stable_enabled}
++  : lib_major_version{NVCOMP_MAJOR_VERSION},
++    lib_minor_version{NVCOMP_MINOR_VERSION},
++    lib_patch_version{NVCOMP_PATCH_VERSION},
++    are_all_integrations_enabled{nvcomp_integration::is_all_enabled()},
++    are_stable_integrations_enabled{nvcomp_integration::is_stable_enabled()}
+ {
++  int device;
++  CUDF_CUDA_TRY(cudaGetDevice(&device));
++  CUDF_CUDA_TRY(
++    cudaDeviceGetAttribute(&compute_capability_major, cudaDevAttrComputeCapabilityMajor, device));
+ }
+ 
+ // Represents all parameters required to determine status of a compression/decompression feature
+@@ -352,21 +510,42 @@ std::optional<std::string> is_compression_disabled_impl(compression_type compres
+ {
+   switch (compression) {
+     case compression_type::DEFLATE: {
++      if (not NVCOMP_HAS_DEFLATE(
++            params.lib_major_version, params.lib_minor_version, params.lib_patch_version)) {
++        return "nvCOMP 2.5 or newer is required for Deflate compression";
++      }
+       if (not params.are_all_integrations_enabled) {
+         return "DEFLATE compression is experimental, you can enable it through "
+                "`LIBCUDF_NVCOMP_POLICY` environment variable.";
+       }
+       return std::nullopt;
+     }
++    case compression_type::SNAPPY: {
++      if (not params.are_stable_integrations_enabled) {
++        return "Snappy compression has been disabled through the `LIBCUDF_NVCOMP_POLICY` "
++               "environment variable.";
++      }
++      return std::nullopt;
++    }
++    case compression_type::ZSTD: {
++      if (not NVCOMP_HAS_ZSTD_COMP(  // library version check comes before the policy check
++            params.lib_major_version, params.lib_minor_version, params.lib_patch_version)) {
++        return "nvCOMP 2.4 or newer is required for Zstandard compression";
++      }
++      if (not params.are_stable_integrations_enabled) {  // gated by the stable policy flag
++        return "Zstandard compression has been disabled through the `LIBCUDF_NVCOMP_POLICY` "
++               "environment variable.";
++      }
++      return std::nullopt;
++    }
+     case compression_type::LZ4:
+-    case compression_type::SNAPPY:
+-    case compression_type::ZSTD:
+       if (not params.are_stable_integrations_enabled) {
+-        return "nvCOMP use is disabled through the `LIBCUDF_NVCOMP_POLICY` environment variable.";
++        return "LZ4 compression has been disabled through the `LIBCUDF_NVCOMP_POLICY` "
++               "environment variable.";
+       }
+       return std::nullopt;
+-    default: return "Unsupported compression type";
+   }
++  return "Unsupported compression type";
+ }
+ 
+ std::optional<std::string> is_compression_disabled(compression_type compression,
+@@ -398,26 +577,58 @@ std::optional<std::string> is_compression_disabled(compression_type compression,
+   return reason;
+ }
+ 
++std::optional<std::string> is_zstd_decomp_disabled(feature_status_parameters const& params)
++{  // returns std::nullopt when nothing disables the feature, otherwise the reason as text
++  if (not NVCOMP_HAS_ZSTD_DECOMP(
++        params.lib_major_version, params.lib_minor_version, params.lib_patch_version)) {
++    return "nvCOMP 2.3 or newer is required for Zstandard decompression";
++  }
++
++  if (NVCOMP_ZSTD_DECOMP_IS_STABLE(
++        params.lib_major_version, params.lib_minor_version, params.lib_patch_version)) {
++    if (not params.are_stable_integrations_enabled) {  // stable version: stable flag decides
++      return "Zstandard decompression has been disabled through the `LIBCUDF_NVCOMP_POLICY` "
++             "environment variable.";
++    }
++  } else if (not params.are_all_integrations_enabled) {  // non-stable version: "all" flag decides
++    return "Zstandard decompression is experimental, you can enable it through "
++           "`LIBCUDF_NVCOMP_POLICY` environment variable.";
++  }
++
++  return std::nullopt;
++}
++
+ std::optional<std::string> is_decompression_disabled_impl(compression_type compression,
+                                                           feature_status_parameters params)
+ {
+   switch (compression) {
+-    case compression_type::DEFLATE:
+-    case compression_type::GZIP: {
++    case compression_type::DEFLATE: {
++      if (not NVCOMP_HAS_DEFLATE(
++            params.lib_major_version, params.lib_minor_version, params.lib_patch_version)) {
++        return "nvCOMP 2.5 or newer is required for Deflate decompression";
++      }
+       if (not params.are_all_integrations_enabled) {
+         return "DEFLATE decompression is experimental, you can enable it through "
+                "`LIBCUDF_NVCOMP_POLICY` environment variable.";
+       }
+       return std::nullopt;
+     }
+-    case compression_type::LZ4:
+-    case compression_type::SNAPPY:
+-    case compression_type::ZSTD: {
++    case compression_type::SNAPPY: {
+       if (not params.are_stable_integrations_enabled) {
+-        return "nvCOMP use is disabled through the `LIBCUDF_NVCOMP_POLICY` environment variable.";
++        return "Snappy decompression has been disabled through the `LIBCUDF_NVCOMP_POLICY` "
++               "environment variable.";
+       }
+       return std::nullopt;
+     }
++    case compression_type::ZSTD: return is_zstd_decomp_disabled(params);
++    case compression_type::LZ4: {
++      if (not params.are_stable_integrations_enabled) {
++        return "LZ4 decompression has been disabled through the `LIBCUDF_NVCOMP_POLICY` "
++               "environment variable.";
++      }
++      return std::nullopt;
++    }
++    default: return "Unsupported compression type";
+   }
+   return "Unsupported compression type";
+ }
+@@ -451,27 +662,43 @@ std::optional<std::string> is_decompression_disabled(compression_type compressio
+   return reason;
+ }
+ 
+-size_t required_alignment(compression_type compression)
++size_t compress_input_alignment_bits(compression_type compression)  // shift amount, see callers
+ {
+   switch (compression) {
+-    case compression_type::GZIP:
+-    case compression_type::DEFLATE: return nvcompDeflateRequiredAlignment;
+-    case compression_type::SNAPPY: return nvcompSnappyRequiredAlignment;
+-    case compression_type::ZSTD: return nvcompZstdRequiredAlignment;
+-    case compression_type::LZ4: return nvcompLZ4RequiredAlignment;
++    case compression_type::DEFLATE: return 0;  // 1u << 0 == 1
++    case compression_type::SNAPPY: return 0;   // 1u << 0 == 1
++    case compression_type::ZSTD: return 2;     // 1u << 2 == 4
++    case compression_type::LZ4: return 2;      // 1u << 2 == 4
+     default: CUDF_FAIL("Unsupported compression type");
+   }
+ }
+ 
+-std::optional<size_t> compress_max_allowed_chunk_size(compression_type compression)
++size_t compress_output_alignment_bits(compression_type compression)  // shift amount, see callers
+ {
+   switch (compression) {
+-    case compression_type::DEFLATE: return nvcompDeflateCompressionMaxAllowedChunkSize;
+-    case compression_type::SNAPPY: return nvcompSnappyCompressionMaxAllowedChunkSize;
+-    case compression_type::ZSTD: return nvcompZstdCompressionMaxAllowedChunkSize;
+-    case compression_type::LZ4: return nvcompLZ4CompressionMaxAllowedChunkSize;
++    case compression_type::DEFLATE: return 3;  // 1u << 3 == 8
++    case compression_type::SNAPPY: return 0;   // 1u << 0 == 1
++    case compression_type::ZSTD: return 0;     // 1u << 0 == 1
++    case compression_type::LZ4: return 2;      // 1u << 2 == 4
+     default: CUDF_FAIL("Unsupported compression type");
+   }
+ }
+ 
++std::optional<size_t> compress_max_allowed_chunk_size(compression_type compression)
++{
++  switch (compression) {
++    case compression_type::DEFLATE: return 64 * 1024;  // 64 KiB
++    case compression_type::SNAPPY: return std::nullopt;
++    case compression_type::ZSTD:
++#if NVCOMP_HAS_ZSTD_COMP(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION)
++      return nvcompZstdCompressionMaxAllowedChunkSize;
++#else  // ZSTD compression unavailable in this nvCOMP version: fail with the reported reason
++      CUDF_FAIL("Compression error: " +
++                nvcomp::is_compression_disabled(nvcomp::compression_type::ZSTD).value());
++#endif
++    case compression_type::LZ4: return 16 * 1024 * 1024;  // 16 MiB
++    default: return std::nullopt;
++  }
++}
++
+ }  // namespace cudf::io::nvcomp
+diff --git a/cpp/src/io/comp/nvcomp_adapter.hpp b/cpp/src/io/comp/nvcomp_adapter.hpp
+index 583bd6a352..43c79e3237 100644
+--- a/cpp/src/io/comp/nvcomp_adapter.hpp
++++ b/cpp/src/io/comp/nvcomp_adapter.hpp
+@@ -75,12 +75,20 @@ size_t batched_decompress_temp_size(compression_type compression,
+                                                     uint32_t max_uncomp_chunk_size);
+ 
+ /**
+- * @brief Gets input and output alignment requirements for the given compression type.
++ * @brief Gets input alignment requirements for the given compression type.
+  *
+  * @param compression Compression type
+- * @returns required alignment
++ * @returns required alignment as a bit count (log2); the alignment itself is `1 << value`
+  */
+-[[nodiscard]] size_t required_alignment(compression_type compression);
++[[nodiscard]] size_t compress_input_alignment_bits(compression_type compression);
++
++/**
++ * @brief Gets output alignment requirements for the given compression type.
++ *
++ * @param compression Compression type
++ * @returns required alignment as a bit count (log2); the alignment itself is `1 << value`
++ */
++[[nodiscard]] size_t compress_output_alignment_bits(compression_type compression);
+ 
+ /**
+  * @brief Maximum size of uncompressed chunks that can be compressed with nvCOMP.
+diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu
+index 60a64fb0ee..40cfbe763b 100644
+--- a/cpp/src/io/orc/writer_impl.cu
++++ b/cpp/src/io/orc/writer_impl.cu
+@@ -533,20 +533,20 @@ auto uncomp_block_alignment(CompressionKind compression_kind)
+ {
+   if (compression_kind == NONE or
+       nvcomp::is_compression_disabled(to_nvcomp_compression_type(compression_kind))) {
+-    return 1ul;
++    return 1u;
+   }
+ 
+-  return nvcomp::required_alignment(to_nvcomp_compression_type(compression_kind));
++  return 1u << nvcomp::compress_input_alignment_bits(to_nvcomp_compression_type(compression_kind));
+ }
+ 
+ auto comp_block_alignment(CompressionKind compression_kind)
+ {
+   if (compression_kind == NONE or
+       nvcomp::is_compression_disabled(to_nvcomp_compression_type(compression_kind))) {
+-    return 1ul;
++    return 1u;
+   }
+ 
+-  return nvcomp::required_alignment(to_nvcomp_compression_type(compression_kind));
++  return 1u << nvcomp::compress_output_alignment_bits(to_nvcomp_compression_type(compression_kind));
+ }
+ 
+ /**
+diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu
+index c588fedb85..bab70c126b 100644
+--- a/cpp/src/io/parquet/reader_impl_chunking.cu
++++ b/cpp/src/io/parquet/reader_impl_chunking.cu
+@@ -865,18 +865,8 @@ std::vector<row_range> compute_page_splits_by_row(device_span<cumulative_page_in
+ 
+     switch (codec.compression_type) {
+       case GZIP:
+-        if (cudf::io::nvcomp_integration::is_all_enabled()) {
+-          nvcomp::batched_decompress(nvcomp::compression_type::GZIP,
+-                                     d_comp_in_view,
+-                                     d_comp_out_view,
+-                                     d_comp_res_view,
+-                                     codec.max_decompressed_size,
+-                                     codec.total_decomp_size,
+-                                     stream);
+-        } else {
+-          gpuinflate(
+-            d_comp_in_view, d_comp_out_view, d_comp_res_view, gzip_header_included::YES, stream);
+-        }
++        gpuinflate(
++          d_comp_in_view, d_comp_out_view, d_comp_res_view, gzip_header_included::YES, stream);
+         break;
+       case SNAPPY:
+         if (cudf::io::nvcomp_integration::is_stable_enabled()) {
+diff --git a/cpp/src/io/parquet/writer_impl_helpers.cpp b/cpp/src/io/parquet/writer_impl_helpers.cpp
+index 396d44c076..e2f09f872d 100644
+--- a/cpp/src/io/parquet/writer_impl_helpers.cpp
++++ b/cpp/src/io/parquet/writer_impl_helpers.cpp
+@@ -62,7 +62,7 @@ uint32_t page_alignment(Compression codec)
+     return 1u;
+   }
+ 
+-  return nvcomp::required_alignment(to_nvcomp_compression_type(codec));
++  return 1u << nvcomp::compress_input_alignment_bits(to_nvcomp_compression_type(codec));
+ }
+ 
+ size_t max_compression_output_size(Compression codec, uint32_t compression_blocksize)
+diff --git a/cpp/tests/io/comp/decomp_test.cpp b/cpp/tests/io/comp/decomp_test.cpp
+index 840cf263ed..38c1a57eca 100644
+--- a/cpp/tests/io/comp/decomp_test.cpp
++++ b/cpp/tests/io/comp/decomp_test.cpp
+@@ -176,19 +176,23 @@ TEST_F(NvcompConfigTest, Compression)
+   using cudf::io::nvcomp::compression_type;
+   auto const& comp_disabled = cudf::io::nvcomp::is_compression_disabled;
+ 
+-  EXPECT_FALSE(comp_disabled(compression_type::DEFLATE, {true, true}));
++  EXPECT_FALSE(comp_disabled(compression_type::DEFLATE, {2, 5, 0, true, true, 0}));
++  // version 2.5 required
++  EXPECT_TRUE(comp_disabled(compression_type::DEFLATE, {2, 4, 0, true, true, 0}));
+   // all integrations enabled required
+-  EXPECT_TRUE(comp_disabled(compression_type::DEFLATE, {false, true}));
++  EXPECT_TRUE(comp_disabled(compression_type::DEFLATE, {2, 5, 0, false, true, 0}));
+ 
+-  EXPECT_FALSE(comp_disabled(compression_type::ZSTD, {true, true}));
+-  EXPECT_FALSE(comp_disabled(compression_type::ZSTD, {false, true}));
++  EXPECT_FALSE(comp_disabled(compression_type::ZSTD, {2, 4, 0, true, true, 0}));
++  EXPECT_FALSE(comp_disabled(compression_type::ZSTD, {2, 4, 0, false, true, 0}));
++  // 2.4 version required
++  EXPECT_TRUE(comp_disabled(compression_type::ZSTD, {2, 3, 1, false, true, 0}));
+   // stable integrations enabled required
+-  EXPECT_TRUE(comp_disabled(compression_type::ZSTD, {false, false}));
++  EXPECT_TRUE(comp_disabled(compression_type::ZSTD, {2, 4, 0, false, false, 0}));
+ 
+-  EXPECT_FALSE(comp_disabled(compression_type::SNAPPY, {true, true}));
+-  EXPECT_FALSE(comp_disabled(compression_type::SNAPPY, {false, true}));
++  EXPECT_FALSE(comp_disabled(compression_type::SNAPPY, {2, 5, 0, true, true, 0}));
++  EXPECT_FALSE(comp_disabled(compression_type::SNAPPY, {2, 4, 0, false, true, 0}));
+   // stable integrations enabled required
+-  EXPECT_TRUE(comp_disabled(compression_type::SNAPPY, {false, false}));
++  EXPECT_TRUE(comp_disabled(compression_type::SNAPPY, {2, 3, 0, false, false, 0}));
+ }
+ 
+ TEST_F(NvcompConfigTest, Decompression)
+@@ -196,19 +200,27 @@ TEST_F(NvcompConfigTest, Decompression)
+   using cudf::io::nvcomp::compression_type;
+   auto const& decomp_disabled = cudf::io::nvcomp::is_decompression_disabled;
+ 
+-  EXPECT_FALSE(decomp_disabled(compression_type::DEFLATE, {true, true}));
++  EXPECT_FALSE(decomp_disabled(compression_type::DEFLATE, {2, 5, 0, true, true, 7}));
++  // version 2.5 required
++  EXPECT_TRUE(decomp_disabled(compression_type::DEFLATE, {2, 4, 0, true, true, 7}));
+   // all integrations enabled required
+-  EXPECT_TRUE(decomp_disabled(compression_type::DEFLATE, {false, true}));
+-
+-  EXPECT_FALSE(decomp_disabled(compression_type::ZSTD, {true, true}));
+-  EXPECT_FALSE(decomp_disabled(compression_type::ZSTD, {false, true}));
++  EXPECT_TRUE(decomp_disabled(compression_type::DEFLATE, {2, 5, 0, false, true, 7}));
++
++  EXPECT_FALSE(decomp_disabled(compression_type::ZSTD, {2, 4, 0, true, true, 7}));
++  EXPECT_FALSE(decomp_disabled(compression_type::ZSTD, {2, 3, 2, false, true, 6}));
++  EXPECT_FALSE(decomp_disabled(compression_type::ZSTD, {2, 3, 0, true, true, 6}));
++  // 2.3.1 and earlier requires all integrations to be enabled
++  EXPECT_TRUE(decomp_disabled(compression_type::ZSTD, {2, 3, 1, false, true, 7}));
++  // 2.3 version required
++  EXPECT_TRUE(decomp_disabled(compression_type::ZSTD, {2, 2, 0, true, true, 7}));
+   // stable integrations enabled required
+-  EXPECT_TRUE(decomp_disabled(compression_type::ZSTD, {false, false}));
++  EXPECT_TRUE(decomp_disabled(compression_type::ZSTD, {2, 4, 0, false, false, 7}));
+ 
+-  EXPECT_FALSE(decomp_disabled(compression_type::SNAPPY, {true, true}));
+-  EXPECT_FALSE(decomp_disabled(compression_type::SNAPPY, {false, true}));
++  EXPECT_FALSE(decomp_disabled(compression_type::SNAPPY, {2, 4, 0, true, true, 7}));
++  EXPECT_FALSE(decomp_disabled(compression_type::SNAPPY, {2, 3, 0, false, true, 7}));
++  EXPECT_FALSE(decomp_disabled(compression_type::SNAPPY, {2, 2, 0, false, true, 7}));
+   // stable integrations enabled required
+-  EXPECT_TRUE(decomp_disabled(compression_type::SNAPPY, {false, false}));
++  EXPECT_TRUE(decomp_disabled(compression_type::SNAPPY, {2, 2, 0, false, false, 7}));
+ }
+ 
+ CUDF_TEST_PROGRAM_MAIN()
+diff --git a/dependencies.yaml b/dependencies.yaml
+index 6909eb7168..2fa58a31b1 100644
+--- a/dependencies.yaml
++++ b/dependencies.yaml
+@@ -368,7 +368,7 @@ dependencies:
+           - flatbuffers==24.3.25
+           - librdkafka>=2.5.0,<2.6.0a0
+           # Align nvcomp version with rapids-cmake
+-          - nvcomp==4.0.1
++          - nvcomp==3.0.6
+           - spdlog>=1.14.1,<1.15
+   rapids_build_skbuild:
+     common:
+diff --git a/docs/cudf/source/user_guide/io/io.md b/docs/cudf/source/user_guide/io/io.md
+index 97b961b455..adcdaa51e7 100644
+--- a/docs/cudf/source/user_guide/io/io.md
++++ b/docs/cudf/source/user_guide/io/io.md
+@@ -75,6 +75,7 @@ IO format.
+ 
+ </div>
+ 
++
+ **Notes:**
+ 
+ - \[¹\] - Not all orientations are GPU-accelerated.
+@@ -176,9 +177,4 @@ If no value is set, behavior will be the same as the "STABLE" option.
+     +-----------------------+--------+--------+--------------+--------------+---------+--------+--------------+--------------+--------+
+     | DEFLATE               | ❌     | ❌     | ❌           | ❌           | ❌      | ❌     | Experimental | Experimental | ❌     |
+     +-----------------------+--------+--------+--------------+--------------+---------+--------+--------------+--------------+--------+
+-    | LZ4                   | ❌     | ❌     | Stable       | Stable       | ❌      | ❌     | Stable       | Stable       | ❌     |
+-    +-----------------------+--------+--------+--------------+--------------+---------+--------+--------------+--------------+--------+
+-    | GZIP                  | ❌     | ❌     | Experimental | Experimental | ❌      | ❌     | ❌           | ❌           | ❌     |
+-    +-----------------------+--------+--------+--------------+--------------+---------+--------+--------------+--------------+--------+
+-
+ ```
+diff --git a/java/pom.xml b/java/pom.xml
+index e4f1cdf64e..9694e741f1 100644
+--- a/java/pom.xml
++++ b/java/pom.xml
+@@ -1,6 +1,6 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <!--
+-  Copyright (c) 2019-2024, NVIDIA CORPORATION.
++  Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ 
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+@@ -590,6 +590,8 @@
+                                         <include>libcudfjni.so</include>
+                                         <include>libcufilejni.so</include>
+                                         <include>libnvcomp.so</include>
++                                        <include>libnvcomp_gdeflate.so</include>
++                                        <include>libnvcomp_bitcomp.so</include>
+                                     </includes>
+                                 </resource>
+                                 <resource>
+diff --git a/java/src/main/java/ai/rapids/cudf/NativeDepsLoader.java b/java/src/main/java/ai/rapids/cudf/NativeDepsLoader.java
+index 58182c3e62..7ee590e3c8 100755
+--- a/java/src/main/java/ai/rapids/cudf/NativeDepsLoader.java
++++ b/java/src/main/java/ai/rapids/cudf/NativeDepsLoader.java
+@@ -54,6 +54,9 @@ public class NativeDepsLoader {
+    * subsequent stages are loaded.
+    */
+   private static final String[][] loadOrder = new String[][]{
++      new String[]{
++          "nvcomp_bitcomp", "nvcomp_gdeflate"
++      },
+       new String[]{
+           "nvcomp"
+       },
+diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt
+index 32045f3c50..c18a90140b 100644
+--- a/java/src/main/native/CMakeLists.txt
++++ b/java/src/main/native/CMakeLists.txt
+@@ -267,8 +267,9 @@ if(TARGET nvcomp::nvcomp)
+   add_custom_command(
+     TARGET cudfjni
+     PRE_LINK
+-    COMMAND ${CMAKE_COMMAND} -E copy $<TARGET_FILE:nvcomp::nvcomp>
+-            "${PROJECT_BINARY_DIR}/libnvcomp.so"
++    COMMAND
++      ${CMAKE_COMMAND} -E copy $<TARGET_FILE:nvcomp::nvcomp> $<TARGET_FILE:nvcomp::nvcomp_gdeflate>
++      $<TARGET_FILE:nvcomp::nvcomp_bitcomp> "${PROJECT_BINARY_DIR}"
+     COMMENT "Copying nvcomp libraries to ${PROJECT_BINARY_DIR}"
+   )
+ endif()
+diff --git a/python/libcudf/CMakeLists.txt b/python/libcudf/CMakeLists.txt
+index 0a8f5c4807..96eb6c3bb3 100644
+--- a/python/libcudf/CMakeLists.txt
++++ b/python/libcudf/CMakeLists.txt
+@@ -48,5 +48,6 @@ add_subdirectory(../../cpp cudf-cpp)
+ # Ensure other libraries needed by libcudf.so get installed alongside it.
+ include(cmake/Modules/WheelHelpers.cmake)
+ install_aliased_imported_targets(
+-  TARGETS cudf nvcomp::nvcomp DESTINATION ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}
++  TARGETS cudf nvcomp::nvcomp nvcomp::nvcomp_gdeflate nvcomp::nvcomp_bitcomp DESTINATION
++  ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}
+ )
diff --git a/pom.xml b/pom.xml
index 70920a5316..1028b72574 100644
--- a/pom.xml
+++ b/pom.xml
@@ -21,7 +21,7 @@
 
   <groupId>com.nvidia</groupId>
   <artifactId>spark-rapids-jni</artifactId>
-  <version>24.08.0</version>
+  <version>24.10.0</version>
   <packaging>jar</packaging>
   <name>RAPIDS Accelerator JNI for Apache Spark</name>
   <description>
@@ -110,6 +110,7 @@
     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
     <slf4j.version>1.7.30</slf4j.version>
     <submodule.check.skip>false</submodule.check.skip>
+    <submodule.patch.skip>false</submodule.patch.skip>
     <antrun.version>3.0.0</antrun.version>
     <hilbert.version>0.2.2</hilbert.version>
   </properties>
@@ -392,6 +393,38 @@
               <goal>run</goal>
             </goals>
           </execution>
+          <execution>
+            <id>cudf patch</id>
+            <phase>validate</phase>
+            <configuration>
+              <skip>${submodule.patch.skip}</skip>
+              <target>
+                <exec dir="${project.basedir}"
+                      failonerror="true"
+                      executable="${project.basedir}/build/apply-patches">
+                </exec>
+              </target>
+            </configuration>
+            <goals>
+              <goal>run</goal>
+            </goals>
+          </execution>
+          <execution>
+            <id>cudf un-patch</id>
+            <phase>clean</phase>
+            <configuration>
+              <skip>${submodule.patch.skip}</skip>
+              <target>
+                <exec dir="${project.basedir}"
+                      failonerror="true"
+                      executable="${project.basedir}/build/unapply-patches">
+                </exec>
+              </target>
+            </configuration>
+            <goals>
+              <goal>run</goal>
+            </goals>
+          </execution>
           <execution>
             <id>build-libcudf</id>
             <phase>validate</phase>
diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt
index 72220e9360..3ee308550f 100644
--- a/src/main/cpp/CMakeLists.txt
+++ b/src/main/cpp/CMakeLists.txt
@@ -51,7 +51,7 @@ rapids_cuda_init_architectures(SPARK_RAPIDS_JNI)
 
 project(
   SPARK_RAPIDS_JNI
-  VERSION 24.08.00
+  VERSION 24.10.00
   LANGUAGES C CXX CUDA
 )
 
@@ -153,28 +153,32 @@ find_library(CUDFJNI_LIB "libcudfjni.a" REQUIRED NO_DEFAULT_PATH
   HINTS "${CUDFJNI_BUILD_DIR}"
 )
 
+# arrow
+find_library(ARROW_LIB "libarrow.a" REQUIRED NO_DEFAULT_PATH
+  HINTS "${CUDFJNI_BUILD_DIR}/_deps/arrow-build/release/"
+)
+
 # parquet
 find_library(PARQUET_LIB "libparquet.a" REQUIRED NO_DEFAULT_PATH
-  HINTS "${CUDF_INSTALL_DIR}/lib64"
-  HINTS "${CUDF_INSTALL_DIR}/lib"
+  HINTS "${CUDFJNI_BUILD_DIR}/_deps/arrow-build/release/"
 )
 
 # Internal parquet headers
 set (GENERATED_PARQUET_INCLUDE
-    "${CUDF_CPP_BUILD_DIR}/_deps/arrow-src/cpp/src/"
+    "${CUDFJNI_BUILD_DIR}/_deps/arrow-src/cpp/src/"
     CACHE STRING "generated parquet thrift headers"
 )
 
 # thrift
 find_library(THRIFT_LIB "libthrift.a" REQUIRED NO_DEFAULT_PATH
-    HINTS "${CUDF_CPP_BUILD_DIR}/_deps/arrow-build/thrift_ep-install/lib/"
+    HINTS "${CUDFJNI_BUILD_DIR}/_deps/arrow-build/thrift_ep-install/lib/"
 )
 
 set(CUDFJNI_INCLUDE_DIRS
   "${CUDF_DIR}/java/src/main/native/include"
   "${CUDF_DIR}/java/src/main/native/src"
   "${GENERATED_PARQUET_INCLUDE}"
-  "${CUDF_CPP_BUILD_DIR}/_deps/arrow-build/thrift_ep-install/include/"
+  "${CUDFJNI_BUILD_DIR}/_deps/arrow-build/thrift_ep-install/include/"
 )
 
 # ##################################################################################################
@@ -191,7 +195,6 @@ add_library(
   src/HashJni.cpp
   src/HistogramJni.cpp
   src/JSONUtilsJni.cpp
-  src/MapUtilsJni.cpp
   src/NativeParquetJni.cpp
   src/ParseURIJni.cpp
   src/RegexRewriteUtilsJni.cpp
@@ -208,9 +211,9 @@ add_library(
   src/cast_string_to_float.cu
   src/datetime_rebase.cu
   src/decimal_utils.cu
+  src/from_json_to_raw_map.cu
   src/get_json_object.cu
   src/histogram.cu
-  src/map_utils.cu
   src/murmur_hash.cu
   src/parse_uri.cu
   src/regex_rewrite_utils.cu
@@ -271,6 +274,7 @@ target_link_libraries(
     cudf::cudf
     nvtx3::nvtx3-cpp
   -Wl,--no-whole-archive
+    ${ARROW_LIB}
     ${PARQUET_LIB}
     ${THRIFT_LIB}
 )
diff --git a/src/main/cpp/benchmarks/get_json_object.cu b/src/main/cpp/benchmarks/get_json_object.cu
index 51f9299dba..69deabe590 100644
--- a/src/main/cpp/benchmarks/get_json_object.cu
+++ b/src/main/cpp/benchmarks/get_json_object.cu
@@ -141,7 +141,7 @@ void BM_get_json_object(nvbench::state& state)
   auto const json_strings = generate_input(size_bytes, max_depth);
 
   using path_instruction_type = spark_rapids_jni::path_instruction_type;
-  std::vector<std::tuple<path_instruction_type, std::string, int64_t>> instructions;
+  std::vector<std::tuple<path_instruction_type, std::string, int32_t>> instructions;
   instructions.emplace_back(path_instruction_type::NAMED, "struct", -1);
   for (int i = 0; i < max_depth - list_depth; ++i) {
     instructions.emplace_back(path_instruction_type::NAMED, "0", -1);
diff --git a/src/main/cpp/src/JSONUtilsJni.cpp b/src/main/cpp/src/JSONUtilsJni.cpp
index 0da20f53f9..5a0c5dd341 100644
--- a/src/main/cpp/src/JSONUtilsJni.cpp
+++ b/src/main/cpp/src/JSONUtilsJni.cpp
@@ -15,6 +15,7 @@
  */
 
 #include "cudf_jni_apis.hpp"
+#include "from_json.hpp"
 #include "get_json_object.hpp"
 
 #include <cudf/strings/strings_column_view.hpp>
@@ -25,42 +26,49 @@ using path_instruction_type = spark_rapids_jni::path_instruction_type;
 
 extern "C" {
 
-JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_getJsonObject(
-  JNIEnv* env, jclass, jlong input_column, jobjectArray path_instructions)
+JNIEXPORT jint JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_getMaxJSONPathDepth(JNIEnv* env,
+                                                                                      jclass)
+{
+  try {
+    cudf::jni::auto_set_device(env);
+    return spark_rapids_jni::MAX_JSON_PATH_DEPTH;
+  }
+  CATCH_STD(env, 0);
+}
+
+JNIEXPORT jlong JNICALL
+Java_com_nvidia_spark_rapids_jni_JSONUtils_getJsonObject(JNIEnv* env,
+                                                         jclass,
+                                                         jlong input_column,
+                                                         jbyteArray j_type_nums,
+                                                         jobjectArray j_names,
+                                                         jintArray j_indexes)
 {
   JNI_NULL_CHECK(env, input_column, "input column is null", 0);
-  JNI_NULL_CHECK(env, path_instructions, "path_instructions is null", 0);
+  JNI_NULL_CHECK(env, j_type_nums, "j_type_nums is null", 0);
+  JNI_NULL_CHECK(env, j_names, "j_names is null", 0);
+  JNI_NULL_CHECK(env, j_indexes, "j_indexes is null", 0);
   try {
     cudf::jni::auto_set_device(env);
     auto const n_column_view      = reinterpret_cast<cudf::column_view const*>(input_column);
     auto const n_strings_col_view = cudf::strings_column_view{*n_column_view};
 
-    std::vector<std::tuple<path_instruction_type, std::string, int64_t>> instructions;
-    int size = env->GetArrayLength(path_instructions);
-    for (int i = 0; i < size; i++) {
-      jobject instruction = env->GetObjectArrayElement(path_instructions, i);
-      JNI_NULL_CHECK(env, instruction, "path_instruction is null", 0);
-      jclass instruction_class = env->GetObjectClass(instruction);
-      JNI_NULL_CHECK(env, instruction_class, "instruction_class is null", 0);
-
-      jfieldID field_id = env->GetFieldID(instruction_class, "type", "I");
-      JNI_NULL_CHECK(env, field_id, "field_id is null", 0);
-      jint type                              = env->GetIntField(instruction, field_id);
-      path_instruction_type instruction_type = static_cast<path_instruction_type>(type);
-
-      field_id = env->GetFieldID(instruction_class, "name", "Ljava/lang/String;");
-      JNI_NULL_CHECK(env, field_id, "field_id is null", 0);
-      jstring name = (jstring)env->GetObjectField(instruction, field_id);
-      JNI_NULL_CHECK(env, name, "name is null", 0);
-      const char* name_str = env->GetStringUTFChars(name, JNI_FALSE);
-
-      field_id = env->GetFieldID(instruction_class, "index", "J");
-      JNI_NULL_CHECK(env, field_id, "field_id is null", 0);
-      jlong index = env->GetLongField(instruction, field_id);
+    std::vector<std::tuple<path_instruction_type, std::string, int32_t>> instructions;
 
-      instructions.emplace_back(instruction_type, name_str, index);
+    auto const type_nums = cudf::jni::native_jbyteArray(env, j_type_nums).to_vector();
+    auto const names     = cudf::jni::native_jstringArray(env, j_names);
+    auto const indexes   = cudf::jni::native_jintArray(env, j_indexes).to_vector();
+    int size             = type_nums.size();
+    if (names.size() != size || indexes.size() != static_cast<std::size_t>(size) ||
+        type_nums.size() != static_cast<std::size_t>(size)) {
+      JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "wrong number of entries passed in", 0);
+    }
 
-      env->ReleaseStringUTFChars(name, name_str);
+    for (int i = 0; i < size; i++) {
+      path_instruction_type instruction_type = static_cast<path_instruction_type>(type_nums[i]);
+      const char* name_str                   = names[i].get();
+      jint index                             = indexes[i];  // 32-bit: matches the int32_t tuple slot
+      instructions.emplace_back(instruction_type, name_str, index);
     }
 
     return cudf::jni::release_as_jlong(
@@ -69,58 +77,60 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_getJsonObject
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlongArray JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_getJsonObjectMultiplePaths(
-  JNIEnv* env, jclass, jlong j_input, jobjectArray j_paths, jintArray j_path_offsets)
+JNIEXPORT jlongArray JNICALL
+Java_com_nvidia_spark_rapids_jni_JSONUtils_getJsonObjectMultiplePaths(JNIEnv* env,
+                                                                      jclass,
+                                                                      jlong j_input,
+                                                                      jbyteArray j_type_nums,
+                                                                      jobjectArray j_names,
+                                                                      jintArray j_indexes,
+                                                                      jintArray j_path_offsets,
+                                                                      jlong memory_budget_bytes,
+                                                                      jint parallel_override)
 {
   JNI_NULL_CHECK(env, j_input, "j_input column is null", 0);
-  JNI_NULL_CHECK(env, j_paths, "j_paths is null", 0);
+  JNI_NULL_CHECK(env, j_type_nums, "j_type_nums is null", 0);
+  JNI_NULL_CHECK(env, j_names, "j_names is null", 0);
+  JNI_NULL_CHECK(env, j_indexes, "j_indexes is null", 0);
   JNI_NULL_CHECK(env, j_path_offsets, "j_path_offsets is null", 0);
 
-  using path_type = std::vector<std::tuple<path_instruction_type, std::string, int64_t>>;
+  using path_type = std::vector<std::tuple<path_instruction_type, std::string, int32_t>>;
 
   try {
     cudf::jni::auto_set_device(env);
 
     auto const path_offsets = cudf::jni::native_jintArray(env, j_path_offsets).to_vector();
     CUDF_EXPECTS(path_offsets.size() > 1, "Invalid path offsets.");
+    auto const type_nums = cudf::jni::native_jbyteArray(env, j_type_nums).to_vector();
+    auto const names     = cudf::jni::native_jstringArray(env, j_names);
+    auto const indexes   = cudf::jni::native_jintArray(env, j_indexes).to_vector();
     auto const num_paths = path_offsets.size() - 1;
     std::vector<path_type> paths(num_paths);
+    auto const num_entries = path_offsets[num_paths];
+
+    if (num_entries < 0 || names.size() != num_entries ||
+        indexes.size() != static_cast<std::size_t>(num_entries) ||
+        type_nums.size() != static_cast<std::size_t>(num_entries)) {
+      JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "wrong number of entries passed in", 0);
+    }
 
     for (std::size_t i = 0; i < num_paths; ++i) {
       auto const path_size = path_offsets[i + 1] - path_offsets[i];
       auto path            = path_type{};
       path.reserve(path_size);
       for (int j = path_offsets[i]; j < path_offsets[i + 1]; ++j) {
-        jobject instruction = env->GetObjectArrayElement(j_paths, j);
-        JNI_NULL_CHECK(env, instruction, "path_instruction is null", 0);
-        jclass instruction_class = env->GetObjectClass(instruction);
-        JNI_NULL_CHECK(env, instruction_class, "instruction_class is null", 0);
-
-        jfieldID field_id = env->GetFieldID(instruction_class, "type", "I");
-        JNI_NULL_CHECK(env, field_id, "field_id is null", 0);
-        jint type                              = env->GetIntField(instruction, field_id);
-        path_instruction_type instruction_type = static_cast<path_instruction_type>(type);
-
-        field_id = env->GetFieldID(instruction_class, "name", "Ljava/lang/String;");
-        JNI_NULL_CHECK(env, field_id, "field_id is null", 0);
-        jstring name = (jstring)env->GetObjectField(instruction, field_id);
-        JNI_NULL_CHECK(env, name, "name is null", 0);
-        const char* name_str = env->GetStringUTFChars(name, JNI_FALSE);
-
-        field_id = env->GetFieldID(instruction_class, "index", "J");
-        JNI_NULL_CHECK(env, field_id, "field_id is null", 0);
-        jlong index = env->GetLongField(instruction, field_id);
-
+        auto const instruction_type = static_cast<path_instruction_type>(type_nums.at(j));
+        auto const name_str         = names[j].get();
+        auto const index            = indexes.at(j);  // at(): a bad offset throws, never reads OOB
         path.emplace_back(instruction_type, name_str, index);
-        env->ReleaseStringUTFChars(name, name_str);
       }
 
       paths[i] = std::move(path);
     }
 
     auto const input_cv = reinterpret_cast<cudf::column_view const*>(j_input);
-    auto output =
-      spark_rapids_jni::get_json_object_multiple_paths(cudf::strings_column_view{*input_cv}, paths);
+    auto output         = spark_rapids_jni::get_json_object_multiple_paths(
+      cudf::strings_column_view{*input_cv}, paths, memory_budget_bytes, parallel_override);
 
     auto out_handles = cudf::jni::native_jlongArray(env, output.size());
     std::transform(output.begin(), output.end(), out_handles.begin(), [](auto& col) {
@@ -130,4 +140,18 @@ JNIEXPORT jlongArray JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_getJsonO
   }
   CATCH_STD(env, 0);
 }
+
+JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_extractRawMapFromJsonString(
+  JNIEnv* env, jclass, jlong j_input)
+{
+  JNI_NULL_CHECK(env, j_input, "j_input is null", 0);
+
+  try {
+    cudf::jni::auto_set_device(env);
+    auto const input_cv = reinterpret_cast<cudf::column_view const*>(j_input);
+    return cudf::jni::ptr_as_jlong(
+      spark_rapids_jni::from_json_to_raw_map(cudf::strings_column_view{*input_cv}).release());
+  }
+  CATCH_STD(env, 0);
+}
 }
diff --git a/src/main/cpp/src/MapUtilsJni.cpp b/src/main/cpp/src/MapUtilsJni.cpp
deleted file mode 100644
index 0fc5f3c280..0000000000
--- a/src/main/cpp/src/MapUtilsJni.cpp
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "map_utils.hpp"
-
-#include <cudf_jni_apis.hpp>
-#include <dtype_utils.hpp>
-
-extern "C" {
-
-JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_MapUtils_extractRawMapFromJsonString(
-  JNIEnv* env, jclass, jlong input_handle)
-{
-  JNI_NULL_CHECK(env, input_handle, "json_column_handle is null", 0);
-
-  try {
-    cudf::jni::auto_set_device(env);
-    auto const input = reinterpret_cast<cudf::column_view const*>(input_handle);
-    return cudf::jni::ptr_as_jlong(spark_rapids_jni::from_json(*input).release());
-  }
-  CATCH_STD(env, 0);
-}
-}
diff --git a/src/main/cpp/src/SparkResourceAdaptorJni.cpp b/src/main/cpp/src/SparkResourceAdaptorJni.cpp
index b68cc16308..8eeb047ddc 100644
--- a/src/main/cpp/src/SparkResourceAdaptorJni.cpp
+++ b/src/main/cpp/src/SparkResourceAdaptorJni.cpp
@@ -24,6 +24,7 @@
 #include <spdlog/sinks/ostream_sink.h>
 #include <spdlog/spdlog.h>
 
+#include <algorithm>
 #include <chrono>
 #include <exception>
 #include <map>
@@ -203,6 +204,8 @@ struct task_metrics {
   // The amount of time that this thread has lost due to retries (not including blocked time)
   long time_lost_nanos = 0;
 
+  long gpu_max_memory_allocated = 0;  // peak of a thread's tracked GPU bytes; add() merges via max
+
   void take_from(task_metrics& other)
   {
     add(other);
@@ -215,6 +218,8 @@ struct task_metrics {
     this->num_times_split_retry_throw += other.num_times_split_retry_throw;
     this->time_blocked_nanos += other.time_blocked_nanos;
     this->time_lost_nanos += other.time_lost_nanos;
+    this->gpu_max_memory_allocated =
+      std::max(this->gpu_max_memory_allocated, other.gpu_max_memory_allocated);
   }
 
   void clear()
@@ -295,6 +300,8 @@ class full_thread_state {
   // time)
   long time_retry_running_nanos = 0;
   std::chrono::time_point<std::chrono::steady_clock> block_start;
+  long gpu_memory_allocated_bytes = 0;  // running total: + on GPU alloc success, - on GPU dealloc
+
   // metrics for the current thread
   task_metrics metrics;
 
@@ -799,6 +806,11 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource {
     return get_and_reset_metric(task_id, &task_metrics::time_lost_nanos);
   }
 
+  long get_and_reset_gpu_max_memory_allocated(long const task_id)
+  {
+    return get_and_reset_metric(task_id, &task_metrics::gpu_max_memory_allocated);
+  }
+
   void check_and_break_deadlocks()
   {
     std::unique_lock<std::mutex> lock(state_mutex);
@@ -807,7 +819,6 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource {
 
   bool cpu_prealloc(size_t const amount, bool const blocking)
   {
-    // amount is not used yet, but is here in case we want it in the future.
     std::unique_lock<std::mutex> lock(state_mutex);
     auto const thread_id = static_cast<long>(pthread_self());
     return pre_alloc_core(thread_id, true, blocking, lock);
@@ -820,10 +831,9 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource {
   {
     // addr is not used yet, but is here in case we want it in the future.
     // amount is not used yet, but is here in case we want it for debugging/metrics.
-    // blocking is not used yet. It could be used for some debugging so we are keeping it.
     std::unique_lock<std::mutex> lock(state_mutex);
     auto const thread_id = static_cast<long>(pthread_self());
-    post_alloc_success_core(thread_id, true, was_recursive, lock);
+    post_alloc_success_core(thread_id, true, was_recursive, amount, lock);
   }
 
   bool cpu_postalloc_failed(bool const was_oom, bool const blocking, bool const was_recursive)
@@ -838,7 +848,7 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource {
     // addr is not used yet, but is here in case we want it in the future.
     // amount is not used yet, but is here in case we want it for debugging/metrics.
     std::unique_lock<std::mutex> lock(state_mutex);
-    dealloc_core(true, lock);
+    dealloc_core(true, lock, amount);
   }
 
   /**
@@ -1333,15 +1343,18 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource {
    * `likely_spill` if this allocation should be treated differently, because
    * we detected recursion while handling a prior allocation in this thread.
    */
-  void post_alloc_success(long const thread_id, bool const likely_spill)
+  void post_alloc_success(long const thread_id,
+                          bool const likely_spill,
+                          std::size_t const num_bytes)
   {
     std::unique_lock<std::mutex> lock(state_mutex);
-    post_alloc_success_core(thread_id, false, likely_spill, lock);
+    post_alloc_success_core(thread_id, false, likely_spill, num_bytes, lock);
   }
 
   void post_alloc_success_core(long const thread_id,
                                bool const is_for_cpu,
                                bool const was_recursive,
+                               std::size_t const num_bytes,
                                std::unique_lock<std::mutex>& lock)
   {
     // pre allocate checks
@@ -1360,6 +1373,14 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource {
           }
           transition(thread->second, thread_state::THREAD_RUNNING);
           thread->second.is_cpu_alloc = false;
+          // num_bytes is likely not padded, which could cause slight inaccuracies
+          // but for now it shouldn't matter for watermark purposes
+          if (!is_for_cpu) {
+            thread->second.gpu_memory_allocated_bytes += num_bytes;
+            thread->second.metrics.gpu_max_memory_allocated =
+              std::max(thread->second.metrics.gpu_max_memory_allocated,
+                       thread->second.gpu_memory_allocated_bytes);
+          }
           break;
         default: break;
       }
@@ -1735,7 +1756,7 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource {
       bool const likely_spill = pre_alloc(tid);
       try {
         void* ret = resource->allocate(num_bytes, stream);
-        post_alloc_success(tid, likely_spill);
+        post_alloc_success(tid, likely_spill, num_bytes);
         return ret;
       } catch (rmm::out_of_memory const& e) {
         // rmm::out_of_memory is what is thrown when an allocation failed
@@ -1751,7 +1772,9 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource {
     throw rmm::bad_alloc("Internal Error");
   }
 
-  void dealloc_core(bool const is_for_cpu, std::unique_lock<std::mutex>& lock)
+  void dealloc_core(bool const is_for_cpu,
+                    std::unique_lock<std::mutex>& lock,
+                    std::size_t const num_bytes)
   {
     auto const tid    = static_cast<long>(pthread_self());
     auto const thread = threads.find(tid);
@@ -1779,6 +1802,7 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource {
             if (is_for_cpu == t_state.is_cpu_alloc) {
               transition(t_state, thread_state::THREAD_ALLOC_FREE);
             }
+            if (!is_for_cpu) { t_state.gpu_memory_allocated_bytes -= num_bytes; }
             break;
           default: break;
         }
@@ -1793,7 +1817,7 @@ class spark_resource_adaptor final : public rmm::mr::device_memory_resource {
     // deallocate success
     if (size > 0) {
       std::unique_lock<std::mutex> lock(state_mutex);
-      dealloc_core(false, lock);
+      dealloc_core(false, lock, size);
     }
   }
 };
@@ -2079,6 +2103,19 @@ Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_getAndResetComputeTimeLost
   CATCH_STD(env, 0)
 }
 
+JNIEXPORT jlong JNICALL
+Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_getAndResetGpuMaxMemoryAllocated(
+  JNIEnv* env, jclass, jlong ptr, jlong task_id)
+{
+  JNI_NULL_CHECK(env, ptr, "resource_adaptor is null", 0);
+  try {
+    cudf::jni::auto_set_device(env);
+    auto mr = reinterpret_cast<spark_resource_adaptor*>(ptr);
+    return mr->get_and_reset_gpu_max_memory_allocated(task_id);
+  }
+  CATCH_STD(env, 0)
+}
+
 JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_SparkResourceAdaptor_startRetryBlock(
   JNIEnv* env, jclass, jlong ptr, jlong thread_id)
 {
diff --git a/src/main/cpp/src/cast_string.cu b/src/main/cpp/src/cast_string.cu
index 156dbeb7bf..88f1f0323f 100644
--- a/src/main/cpp/src/cast_string.cu
+++ b/src/main/cpp/src/cast_string.cu
@@ -28,6 +28,7 @@
 
 #include <cooperative_groups.h>
 #include <cub/warp/warp_reduce.cuh>
+#include <cuda/std/optional>
 
 using namespace cudf;
 
@@ -245,9 +246,8 @@ CUDF_KERNEL void string_to_integer_kernel(T* out,
 }
 
 template <typename T>
-__device__ thrust::optional<thrust::tuple<bool, int, int>> validate_and_exponent(const char* chars,
-                                                                                 const int len,
-                                                                                 bool strip)
+__device__ cuda::std::optional<thrust::tuple<bool, int, int>> validate_and_exponent(
+  const char* chars, const int len, bool strip)
 {
   T exponent_val         = 0;
   int i                  = 0;
@@ -318,7 +318,7 @@ __device__ thrust::optional<thrust::tuple<bool, int, int>> validate_and_exponent
     return state;
   };
 
-  if (len == 0) { return thrust::nullopt; }
+  if (len == 0) { return cuda::std::nullopt; }
 
   processing_state state = ST_DIGITS;
 
@@ -338,7 +338,7 @@ __device__ thrust::optional<thrust::tuple<bool, int, int>> validate_and_exponent
   }
 
   // if there is no data left, this is invalid
-  if (i == len) { return thrust::nullopt; }
+  if (i == len) { return cuda::std::nullopt; }
 
   auto const first_digit = i;
   int last_digit         = len;
@@ -348,7 +348,7 @@ __device__ thrust::optional<thrust::tuple<bool, int, int>> validate_and_exponent
     auto const last_state = state;
     state                 = validate_char(state, chr, char_num);
 
-    if (state == ST_INVALID) { return thrust::nullopt; }
+    if (state == ST_INVALID) { return cuda::std::nullopt; }
 
     if (last_state == ST_DIGITS && state != ST_DIGITS && state != ST_DECIMAL_POINT) {
       // past digits, save location
@@ -359,7 +359,7 @@ __device__ thrust::optional<thrust::tuple<bool, int, int>> validate_and_exponent
       T const new_digit = chr - '0';
       auto const [success, new_val] =
         process_value(exponent_val == 0, exponent_val, new_digit, exponent_positive);
-      if (!success) { return thrust::nullopt; }
+      if (!success) { return cuda::std::nullopt; }
       exponent_val = new_val;
     }
   }
diff --git a/src/main/cpp/src/map_utils.hpp b/src/main/cpp/src/from_json.hpp
similarity index 87%
rename from src/main/cpp/src/map_utils.hpp
rename to src/main/cpp/src/from_json.hpp
index 96ba6f7e9b..75fc3bc103 100644
--- a/src/main/cpp/src/map_utils.hpp
+++ b/src/main/cpp/src/from_json.hpp
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include <cudf/column/column_view.hpp>
+#include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -26,8 +26,8 @@
 
 namespace spark_rapids_jni {
 
-std::unique_ptr<cudf::column> from_json(
-  cudf::column_view const& input,
+std::unique_ptr<cudf::column> from_json_to_raw_map(
+  cudf::strings_column_view const& input,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
diff --git a/src/main/cpp/src/map_utils.cu b/src/main/cpp/src/from_json_to_raw_map.cu
similarity index 98%
rename from src/main/cpp/src/map_utils.cu
rename to src/main/cpp/src/from_json_to_raw_map.cu
index ebb12eee93..73c2c4b559 100644
--- a/src/main/cpp/src/map_utils.cu
+++ b/src/main/cpp/src/from_json_to_raw_map.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "map_utils_debug.cuh"
+#include "from_json_to_raw_map_debug.cuh"
 
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
@@ -59,7 +59,7 @@ namespace {
 // 1. Append one comma character (',') to the end of each input string, except the last one.
 // 2. Concatenate all input strings into one string.
 // 3. Add a pair of bracket characters ('[' and ']') to the beginning and the end of the output.
-rmm::device_uvector<char> unify_json_strings(cudf::column_view const& input,
+rmm::device_uvector<char> unify_json_strings(cudf::strings_column_view const& input,
                                              rmm::cuda_stream_view stream)
 {
   if (input.is_empty()) {
@@ -67,9 +67,8 @@ rmm::device_uvector<char> unify_json_strings(cudf::column_view const& input,
       std::vector<char>{'[', ']'}, stream, rmm::mr::get_current_device_resource());
   }
 
-  auto const d_strings  = cudf::column_device_view::create(input, stream);
-  auto const input_scv  = cudf::strings_column_view{input};
-  auto const chars_size = input_scv.chars_size(stream);
+  auto const d_strings  = cudf::column_device_view::create(input.parent(), stream);
+  auto const chars_size = input.chars_size(stream);
   auto const output_size =
     2l +                                            // two extra bracket characters '[' and ']'
     static_cast<int64_t>(chars_size) +
@@ -81,7 +80,7 @@ rmm::device_uvector<char> unify_json_strings(cudf::column_view const& input,
                "The input json column is too large and causes overflow.");
 
   auto const joined_input = cudf::strings::detail::join_strings(
-    input_scv,
+    input,
     cudf::string_scalar(","),   // append `,` character between the input rows
     cudf::string_scalar("{}"),  // replacement for null rows
     stream,
@@ -641,12 +640,10 @@ rmm::device_uvector<cudf::size_type> compute_list_offsets(
 
 }  // namespace
 
-std::unique_ptr<cudf::column> from_json(cudf::column_view const& input,
-                                        rmm::cuda_stream_view stream,
-                                        rmm::device_async_resource_ref mr)
+std::unique_ptr<cudf::column> from_json_to_raw_map(cudf::strings_column_view const& input,
+                                                   rmm::cuda_stream_view stream,
+                                                   rmm::device_async_resource_ref mr)
 {
-  CUDF_EXPECTS(input.type().id() == cudf::type_id::STRING, "Invalid input format");
-
   // Firstly, concatenate all the input json strings into one giant input json string.
   // When testing/debugging, the output can be validated using
   // https://jsonformatter.curiousconcept.com.
@@ -718,7 +715,7 @@ std::unique_ptr<cudf::column> from_json(cudf::column_view const& input,
                                  std::move(offsets),
                                  std::move(structs_col),
                                  input.null_count(),
-                                 cudf::detail::copy_bitmask(input, stream, mr),
+                                 cudf::detail::copy_bitmask(input.parent(), stream, mr),
                                  stream,
                                  mr);
 }
diff --git a/src/main/cpp/src/map_utils_debug.cuh b/src/main/cpp/src/from_json_to_raw_map_debug.cuh
similarity index 100%
rename from src/main/cpp/src/map_utils_debug.cuh
rename to src/main/cpp/src/from_json_to_raw_map_debug.cuh
diff --git a/src/main/cpp/src/get_json_object.cu b/src/main/cpp/src/get_json_object.cu
index 690da3f702..f836186192 100644
--- a/src/main/cpp/src/get_json_object.cu
+++ b/src/main/cpp/src/get_json_object.cu
@@ -41,16 +41,12 @@
 #include <thrust/transform_reduce.h>
 #include <thrust/tuple.h>
 
+#include <numeric>
+
 namespace spark_rapids_jni {
 
 namespace detail {
 
-// path max depth limitation
-// There is a same constant in JSONUtil.java, keep them consistent when changing
-// Note: Spark-Rapids should guarantee the path depth is less or equal to this limit,
-// or GPU reports cudaErrorIllegalAddress
-constexpr int max_path_depth = 16;
-
 /**
  * @brief JSON style to write.
  */
@@ -379,29 +375,36 @@ struct context {
 /**
  * @brief Parse a single json string using the provided command buffer.
  *
- * @param input The incoming json string
+ * @param p The JSON parser for input string
  * @param path_commands The command buffer to be applied to the string
  * @param out_buf Buffer user to store the string resulted from the query
+ * @param max_path_depth_exceeded Output flag, set to 1 if the maximum path depth is exceeded
+ *        while evaluating the input string
  * @return A pair containing the result code and the output size
  */
 __device__ thrust::pair<bool, cudf::size_type> evaluate_path(
-  char_range input, cudf::device_span<path_instruction const> path_commands, char* out_buf)
+  json_parser& p,
+  cudf::device_span<path_instruction const> path_commands,
+  char* out_buf,
+  int8_t* max_path_depth_exceeded)
 {
-  json_parser p{input};
   p.next_token();
   if (json_token::ERROR == p.get_current_token()) { return {false, 0}; }
 
-  // define stack; plus 1 indicates root context task needs an extra memory
-  context stack[max_path_depth + 1];
+  // Define stack; plus 1 indicates root context task needs an extra memory.
+  context stack[MAX_JSON_PATH_DEPTH + 1];
   int stack_size = 0;
 
-  // push context function
-  auto push_context = [&p, &stack, &stack_size](evaluation_case_path _case_path,
-                                                json_generator _g,
-                                                write_style _style,
-                                                cudf::device_span<path_instruction const> _path) {
-    // no need to check stack is full
-    // because Spark-Rapids already checked maximum length of `path_instruction`
+  auto const push_context = [&](evaluation_case_path _case_path,
+                                json_generator _g,
+                                write_style _style,
+                                cudf::device_span<path_instruction const> _path) {
+    if (stack_size > MAX_JSON_PATH_DEPTH) {
+      *max_path_depth_exceeded = 1;
+      // Because no more context is pushed, the evaluation output should be wrong.
+      // But that is not important, since we will throw exception after the kernel finishes.
+      return;
+    }
     auto& ctx          = stack[stack_size++];
     ctx.g              = std::move(_g);
     ctx.path           = std::move(_path);
@@ -413,7 +416,6 @@ __device__ thrust::pair<bool, cudf::size_type> evaluate_path(
     ctx.task_is_done   = false;
   };
 
-  // put the first context task
   push_context(evaluation_case_path::INVALID, json_generator{}, write_style::RAW, path_commands);
 
   while (stack_size > 0) {
@@ -818,12 +820,15 @@ struct json_path_processing_data {
  * @param input The input JSON strings stored in a strings column
  * @param path_data Array containing all path data
  * @param num_threads_per_row Number of threads processing each input row
+ * @param max_path_depth_exceeded Output flag, set to 1 if the maximum path depth or the parser's
+ *        maximum nesting depth is exceeded while processing the input
  */
 template <int block_size, int min_block_per_sm>
 __launch_bounds__(block_size, min_block_per_sm) CUDF_KERNEL
   void get_json_object_kernel(cudf::column_device_view input,
                               cudf::device_span<json_path_processing_data> path_data,
-                              std::size_t num_threads_per_row)
+                              std::size_t num_threads_per_row,
+                              int8_t* max_path_depth_exceeded)
 {
   auto const tidx    = cudf::detail::grid_1d::global_thread_id();
   auto const row_idx = tidx / num_threads_per_row;
@@ -839,7 +844,17 @@ __launch_bounds__(block_size, min_block_per_sm) CUDF_KERNEL
 
   auto const str = input.element<cudf::string_view>(row_idx);
   if (str.size_bytes() > 0) {
-    thrust::tie(is_valid, out_size) = evaluate_path(char_range{str}, path.path_commands, dst);
+    json_parser p{char_range{str}};
+    thrust::tie(is_valid, out_size) =
+      evaluate_path(p, path.path_commands, dst, max_path_depth_exceeded);
+
+    // To keep `evaluate_path` simple, it is not terminated early when the maximum depth is hit.
+    // Instead, evaluation runs to completion and, if the limit was exceeded, the output is
+    // discarded entirely here.
+    if (p.max_nesting_depth_exceeded()) {
+      *max_path_depth_exceeded = 1;
+      return;
+    }
 
     auto const max_size = path.offsets[row_idx + 1] - path.offsets[row_idx];
     if (out_size > max_size) { *(path.has_out_of_bound) = 1; }
@@ -859,6 +874,7 @@ __launch_bounds__(block_size, min_block_per_sm) CUDF_KERNEL
 struct kernel_launcher {
   static void exec(cudf::column_device_view const& input,
                    cudf::device_span<json_path_processing_data> path_data,
+                   int8_t* max_path_depth_exceeded,
                    rmm::cuda_stream_view stream)
   {
     // The optimal values for block_size and min_block_per_sm were found through testing,
@@ -874,7 +890,8 @@ struct kernel_launcher {
     auto const num_blocks = cudf::util::div_rounding_up_safe(num_threads_per_row * input.size(),
                                                              static_cast<std::size_t>(block_size));
     get_json_object_kernel<block_size, min_block_per_sm>
-      <<<num_blocks, block_size, 0, stream.value()>>>(input, path_data, num_threads_per_row);
+      <<<num_blocks, block_size, 0, stream.value()>>>(
+        input, path_data, num_threads_per_row, max_path_depth_exceeded);
   }
 };
 
@@ -892,7 +909,7 @@ std::tuple<std::vector<rmm::device_uvector<path_instruction>>,
            cudf::string_scalar,
            std::string>
 construct_path_commands(
-  std::vector<std::vector<std::tuple<path_instruction_type, std::string, int64_t>>> const&
+  std::vector<cudf::host_span<std::tuple<path_instruction_type, std::string, int32_t> const>> const&
     json_paths,
   rmm::cuda_stream_view stream)
 {
@@ -951,30 +968,10 @@ construct_path_commands(
           std::move(h_inst_names)};
 }
 
-std::vector<std::unique_ptr<cudf::column>> get_json_object(
-  cudf::strings_column_view const& input,
-  std::vector<std::vector<std::tuple<path_instruction_type, std::string, int64_t>>> const&
-    json_paths,
-  rmm::cuda_stream_view stream,
-  rmm::device_async_resource_ref mr)
+int64_t calc_scratch_size(cudf::strings_column_view const& input,
+                          cudf::detail::input_offsetalator const& in_offsets,
+                          rmm::cuda_stream_view stream)
 {
-  auto const num_outputs = json_paths.size();
-  std::vector<std::unique_ptr<cudf::column>> output;
-
-  // Input is empty or all nulls - just return all null columns.
-  if (input.is_empty() || input.size() == input.null_count()) {
-    for (std::size_t idx = 0; idx < num_outputs; ++idx) {
-      output.emplace_back(std::make_unique<cudf::column>(input.parent(), stream, mr));
-    }
-    return output;
-  }
-
-  auto const d_input_ptr = cudf::column_device_view::create(input.parent(), stream);
-  auto const in_offsets =
-    cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset());
-  auto const [d_json_paths, h_json_paths, d_inst_names, h_inst_names] =
-    construct_path_commands(json_paths, stream);
-
   auto const max_row_size = thrust::transform_reduce(
     rmm::exec_policy(stream),
     thrust::make_counting_iterator(0),
@@ -994,8 +991,51 @@ std::vector<std::unique_ptr<cudf::column>> get_json_object(
     auto constexpr padding_rows = 10;
     return input.chars_size(stream) + max_row_size * padding_rows;
   }();
+  return scratch_size;
+}
+
+/**
+ * @brief Error handling using error markers gathered after kernel launch.
+ *
+ * If the input JSON has a nesting depth that exceeds the maximum allowed value, an exception is
+ * thrown as it is unacceptable. Otherwise, the out-of-bound write markers are checked.
+ *
+ * @param error_check The array of markers to check for error
+ * @return `true` if no out-of-bound write was detected, `false` otherwise
+ */
+bool check_error(cudf::detail::host_vector<int8_t> const& error_check)
+{
+  // The last value is to mark if nesting depth has exceeded.
+  CUDF_EXPECTS(error_check.back() == 0,
+               "The processed input has nesting depth exceeds depth limit.");
+
+  // Do not use parallel check since we do not have many elements.
+  // The last element is not related, but its value is already `0` thus just check until
+  // the end of the array for simplicity.
+  return std::none_of(
+    error_check.cbegin(), error_check.cend(), [](auto const val) { return val != 0; });
+}
+
+std::vector<std::unique_ptr<cudf::column>> get_json_object_batch(
+  cudf::column_device_view const& input,
+  cudf::detail::input_offsetalator const& in_offsets,
+  std::vector<cudf::host_span<std::tuple<path_instruction_type, std::string, int32_t> const>> const&
+    json_paths,
+  int64_t scratch_size,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr)
+{
+  auto const [d_json_paths, h_json_paths, d_inst_names, h_inst_names] =
+    construct_path_commands(json_paths, stream);
+
+  auto const num_outputs = json_paths.size();
+  std::vector<std::unique_ptr<cudf::column>> output;
+
+  // The error check array contains markers denoting whether any out-of-bound write occurred
+  // (first `num_outputs` elements), or whether the nesting depth exceeded its limit (last element).
+  rmm::device_uvector<int8_t> d_error_check(num_outputs + 1, stream);
+  auto const d_max_path_depth_exceeded = d_error_check.data() + num_outputs;
 
-  rmm::device_uvector<int8_t> d_has_out_of_bound(num_outputs, stream);
   std::vector<rmm::device_uvector<char>> scratch_buffers;
   std::vector<rmm::device_uvector<thrust::pair<char const*, cudf::size_type>>> out_stringviews;
   std::vector<json_path_processing_data> h_path_data;
@@ -1004,8 +1044,10 @@ std::vector<std::unique_ptr<cudf::column>> get_json_object(
   h_path_data.reserve(json_paths.size());
 
   for (std::size_t idx = 0; idx < num_outputs; ++idx) {
-    auto const& instructions = json_paths[idx];
-    if (instructions.size() > max_path_depth) { CUDF_FAIL("JSONPath query exceeds maximum depth"); }
+    auto const& path = json_paths[idx];
+    if (path.size() > MAX_JSON_PATH_DEPTH) {
+      CUDF_FAIL("JSON Path has depth exceeds the maximum allowed value.");
+    }
 
     scratch_buffers.emplace_back(rmm::device_uvector<char>(scratch_size, stream));
     out_stringviews.emplace_back(rmm::device_uvector<thrust::pair<char const*, cudf::size_type>>{
@@ -1015,18 +1057,16 @@ std::vector<std::unique_ptr<cudf::column>> get_json_object(
                                                        in_offsets,
                                                        out_stringviews.back().data(),
                                                        scratch_buffers.back().data(),
-                                                       d_has_out_of_bound.data() + idx});
+                                                       d_error_check.data() + idx});
   }
   auto d_path_data = cudf::detail::make_device_uvector_async(
     h_path_data, stream, rmm::mr::get_current_device_resource());
   thrust::uninitialized_fill(
-    rmm::exec_policy(stream), d_has_out_of_bound.begin(), d_has_out_of_bound.end(), 0);
-  kernel_launcher::exec(*d_input_ptr, d_path_data, stream);
+    rmm::exec_policy(stream), d_error_check.begin(), d_error_check.end(), 0);
 
-  // Do not use parallel check since we do not have many elements.
-  auto h_has_out_of_bound = cudf::detail::make_host_vector_sync(d_has_out_of_bound, stream);
-  auto has_no_oob         = std::none_of(
-    h_has_out_of_bound.begin(), h_has_out_of_bound.end(), [](auto const val) { return val != 0; });
+  kernel_launcher::exec(input, d_path_data, d_max_path_depth_exceeded, stream);
+  auto h_error_check = cudf::detail::make_host_vector_sync(d_error_check, stream);
+  auto has_no_oob    = check_error(h_error_check);
 
   // If we didn't see any out-of-bound write, everything is good so far.
   // Just gather the output strings and return.
@@ -1053,7 +1093,7 @@ std::vector<std::unique_ptr<cudf::column>> get_json_object(
   for (std::size_t idx = 0; idx < num_outputs; ++idx) {
     auto const& out_sview = out_stringviews[idx];
 
-    if (h_has_out_of_bound[idx]) {
+    if (h_error_check[idx]) {
       oob_indices.emplace_back(idx);
       output.emplace_back(nullptr);  // just placeholder.
 
@@ -1079,7 +1119,7 @@ std::vector<std::unique_ptr<cudf::column>> get_json_object(
                                     out_offsets_and_sizes.back().first->view()),
                                   nullptr /*out_stringviews*/,
                                   out_char_buffers.back().data(),
-                                  d_has_out_of_bound.data() + idx});
+                                  d_error_check.data() + idx});
     } else {
       output.emplace_back(cudf::make_strings_column(out_sview, stream, mr));
     }
@@ -1092,13 +1132,10 @@ std::vector<std::unique_ptr<cudf::column>> get_json_object(
   d_path_data = cudf::detail::make_device_uvector_async(
     h_path_data, stream, rmm::mr::get_current_device_resource());
   thrust::uninitialized_fill(
-    rmm::exec_policy(stream), d_has_out_of_bound.begin(), d_has_out_of_bound.end(), 0);
-  kernel_launcher::exec(*d_input_ptr, d_path_data, stream);
-
-  // Check out of bound again to make sure everything looks right.
-  h_has_out_of_bound = cudf::detail::make_host_vector_sync(d_has_out_of_bound, stream);
-  has_no_oob         = std::none_of(
-    h_has_out_of_bound.begin(), h_has_out_of_bound.end(), [](auto const val) { return val != 0; });
+    rmm::exec_policy(stream), d_error_check.begin(), d_error_check.end(), 0);
+  kernel_launcher::exec(input, d_path_data, d_max_path_depth_exceeded, stream);
+  h_error_check = cudf::detail::make_host_vector_sync(d_error_check, stream);
+  has_no_oob    = check_error(h_error_check);
 
   // The last kernel call should not encounter any out-of-bound write.
   // If OOB is still detected, there must be something wrong happened.
@@ -1116,27 +1153,104 @@ std::vector<std::unique_ptr<cudf::column>> get_json_object(
   return output;
 }
 
+std::vector<std::unique_ptr<cudf::column>> get_json_object(
+  cudf::strings_column_view const& input,
+  std::vector<std::vector<std::tuple<path_instruction_type, std::string, int32_t>>> const&
+    json_paths,
+  int64_t memory_budget_bytes,
+  int32_t parallel_override,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr)
+{
+  auto const num_outputs = json_paths.size();
+
+  // Input is empty or all nulls - just return all null columns.
+  if (input.is_empty() || input.size() == input.null_count()) {
+    std::vector<std::unique_ptr<cudf::column>> output;
+    for (std::size_t idx = 0; idx < num_outputs; ++idx) {
+      output.emplace_back(std::make_unique<cudf::column>(input.parent(), stream, mr));
+    }
+    return output;
+  }
+
+  std::vector<std::size_t> sorted_indices(json_paths.size());
+  std::iota(sorted_indices.begin(), sorted_indices.end(), 0);  // Fill with 0, 1, 2, ...
+
+  // Sort indices based on the corresponding paths.
+  std::sort(sorted_indices.begin(), sorted_indices.end(), [&json_paths](size_t i, size_t j) {
+    return json_paths[i] < json_paths[j];
+  });
+
+  auto const in_offsets =
+    cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset());
+  auto const scratch_size = calc_scratch_size(input, in_offsets, stream);
+  if (memory_budget_bytes <= 0 && parallel_override <= 0) {
+    parallel_override = static_cast<int>(sorted_indices.size());
+  }
+  auto const d_input_ptr = cudf::column_device_view::create(input.parent(), stream);
+  std::vector<std::unique_ptr<cudf::column>> output(num_outputs);
+
+  std::vector<cudf::host_span<std::tuple<path_instruction_type, std::string, int32_t> const>> batch;
+  std::vector<std::size_t> output_ids;
+
+  std::size_t starting_path = 0;
+  while (starting_path < num_outputs) {
+    std::size_t at = starting_path;
+    batch.resize(0);
+    output_ids.resize(0);
+    if (parallel_override > 0) {
+      int count = 0;
+      while (at < num_outputs && count < parallel_override) {
+        auto output_location = sorted_indices[at];
+        batch.emplace_back(json_paths[output_location]);
+        output_ids.push_back(output_location);
+        at++;
+        count++;
+      }
+    } else {
+      long budget = 0;
+      while (at < num_outputs && budget < memory_budget_bytes) {
+        auto output_location = sorted_indices[at];
+        batch.emplace_back(json_paths[output_location]);
+        output_ids.push_back(output_location);
+        at++;
+        budget += scratch_size;
+      }
+    }
+    auto tmp = get_json_object_batch(*d_input_ptr, in_offsets, batch, scratch_size, stream, mr);
+    for (std::size_t i = 0; i < tmp.size(); i++) {
+      std::size_t out_i = output_ids[i];
+      output[out_i]     = std::move(tmp[i]);
+    }
+    starting_path = at;
+  }
+  return output;
+}
+
 }  // namespace detail
 
 std::unique_ptr<cudf::column> get_json_object(
   cudf::strings_column_view const& input,
-  std::vector<std::tuple<path_instruction_type, std::string, int64_t>> const& instructions,
+  std::vector<std::tuple<path_instruction_type, std::string, int32_t>> const& instructions,
   rmm::cuda_stream_view stream,
   rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return std::move(detail::get_json_object(input, {instructions}, stream, mr).front());
+  return std::move(detail::get_json_object(input, {instructions}, -1, -1, stream, mr).front());
 }
 
 std::vector<std::unique_ptr<cudf::column>> get_json_object_multiple_paths(
   cudf::strings_column_view const& input,
-  std::vector<std::vector<std::tuple<path_instruction_type, std::string, int64_t>>> const&
+  std::vector<std::vector<std::tuple<path_instruction_type, std::string, int32_t>>> const&
     json_paths,
+  int64_t memory_budget_bytes,
+  int32_t parallel_override,
   rmm::cuda_stream_view stream,
   rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::get_json_object(input, json_paths, stream, mr);
+  return detail::get_json_object(
+    input, json_paths, memory_budget_bytes, parallel_override, stream, mr);
 }
 
 }  // namespace spark_rapids_jni
diff --git a/src/main/cpp/src/get_json_object.hpp b/src/main/cpp/src/get_json_object.hpp
index 963bc91a74..0cc773517f 100644
--- a/src/main/cpp/src/get_json_object.hpp
+++ b/src/main/cpp/src/get_json_object.hpp
@@ -25,6 +25,11 @@
 
 namespace spark_rapids_jni {
 
+/**
+ * @brief The maximum supported depth that a JSON path can reach.
+ */
+constexpr int MAX_JSON_PATH_DEPTH = 16;
+
 /**
  * @brief Type of instruction in a JSON path.
  */
@@ -38,7 +43,7 @@ enum class path_instruction_type : int8_t { WILDCARD, INDEX, NAMED };
  */
 std::unique_ptr<cudf::column> get_json_object(
   cudf::strings_column_view const& input,
-  std::vector<std::tuple<path_instruction_type, std::string, int64_t>> const& instructions,
+  std::vector<std::tuple<path_instruction_type, std::string, int32_t>> const& instructions,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
@@ -48,11 +53,19 @@ std::unique_ptr<cudf::column> get_json_object(
  * This function processes all the JSON paths in parallel, which may be faster than calling
  * to `get_json_object` on the individual JSON paths. However, it may consume much more GPU
  * memory, proportional to the number of JSON paths.
+ * @param input the input string column to parse JSON from
+ * @param json_paths the path operations to extract
+ * @param memory_budget_bytes a memory budget for temporary memory usage if > 0
+ * @param parallel_override if this value is greater than 0 then it specifies the
+ *        number of paths to process in parallel (this will cause the
+ *        `memory_budget_bytes` parameter to be ignored)
  */
 std::vector<std::unique_ptr<cudf::column>> get_json_object_multiple_paths(
   cudf::strings_column_view const& input,
-  std::vector<std::vector<std::tuple<path_instruction_type, std::string, int64_t>>> const&
+  std::vector<std::vector<std::tuple<path_instruction_type, std::string, int32_t>>> const&
     json_paths,
+  int64_t memory_budget_bytes,
+  int32_t parallel_override,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
diff --git a/src/main/cpp/src/json_parser.cuh b/src/main/cpp/src/json_parser.cuh
index 10ad2e4fcc..4e712937ed 100644
--- a/src/main/cpp/src/json_parser.cuh
+++ b/src/main/cpp/src/json_parser.cuh
@@ -43,7 +43,7 @@ enum class escape_style {
  * JSON with a greater depth is invalid
  * If set this to be a greater value, should update `context_stack`
  */
-constexpr int max_json_nesting_depth = 64;
+constexpr int MAX_JSON_NESTING_DEPTH = 64;
 
 //
 /**
@@ -220,7 +220,7 @@ class char_range_reader {
 class json_parser {
  public:
   __device__ inline explicit json_parser(char_range _chars)
-    : chars(_chars), curr_pos(0), current_token(json_token::INIT)
+    : chars(_chars), curr_pos(0), current_token(json_token::INIT), max_depth_exceeded(false)
   {
   }
 
@@ -320,7 +320,7 @@ class json_parser {
    */
   __device__ inline bool try_push_context(json_token token)
   {
-    if (stack_size < max_json_nesting_depth) {
+    if (stack_size < MAX_JSON_NESTING_DEPTH) {
       push_context(token);
       return true;
     } else {
@@ -348,14 +348,8 @@ class json_parser {
     return get_bit_value(context_stack, stack_size - 1);
   }
 
-  /**
-   * pop top context from stack
-   */
   __device__ inline void pop_curr_context() { stack_size--; }
 
-  /**
-   * is context stack is empty
-   */
   __device__ inline bool is_context_stack_empty() const { return stack_size == 0; }
 
   __device__ inline void set_current_error() { current_token = json_token::ERROR; }
@@ -376,6 +370,7 @@ class json_parser {
     switch (c) {
       case '{':
         if (!try_push_context(json_token::START_OBJECT)) {
+          max_depth_exceeded = true;
           set_current_error();
         } else {
           curr_pos++;
@@ -384,6 +379,7 @@ class json_parser {
         break;
       case '[':
         if (!try_push_context(json_token::START_ARRAY)) {
+          max_depth_exceeded = true;
           set_current_error();
         } else {
           curr_pos++;
@@ -982,7 +978,7 @@ class json_parser {
 
     if (!to_match.is_null()) {
       for (cudf::size_type i = 0; i < bytes; i++) {
-        if (!(to_match.eof() && to_match.current_char() == buff[i])) { return false; }
+        if (to_match.eof() || to_match.current_char() != buff[i]) { return false; }
         to_match.next();
       }
     }
@@ -1688,6 +1684,8 @@ class json_parser {
     return thrust::make_pair(false, 0);
   }
 
+  __device__ inline bool max_nesting_depth_exceeded() const { return max_depth_exceeded; }
+
  private:
   char_range const chars;
   cudf::size_type curr_pos;
@@ -1707,6 +1705,9 @@ class json_parser {
   cudf::size_type number_token_len;
 
   json_token current_token;
+
+  // Error check if the maximum nesting depth has been reached.
+  bool max_depth_exceeded;
 };
 
 }  // namespace spark_rapids_jni
diff --git a/src/main/cpp/src/parse_uri.cu b/src/main/cpp/src/parse_uri.cu
index f0a78f4f52..d2479403ba 100644
--- a/src/main/cpp/src/parse_uri.cu
+++ b/src/main/cpp/src/parse_uri.cu
@@ -32,6 +32,7 @@
 #include <rmm/exec_policy.hpp>
 
 #include <cuda/functional>
+#include <cuda/std/optional>
 
 #include <memory>
 #include <optional>
@@ -534,7 +535,7 @@ __device__ std::pair<string_view, bool> find_query_part(string_view haystack, st
 
 uri_parts __device__ validate_uri(const char* str,
                                   int len,
-                                  thrust::optional<column_device_view const> query_match,
+                                  cuda::std::optional<column_device_view const> query_match,
                                   size_type row_idx)
 {
   uri_parts ret;
@@ -776,7 +777,7 @@ CUDF_KERNEL void parse_uri_char_counter(column_device_view const in_strings,
                                         size_type* const out_lengths,
                                         size_type* const out_offsets,
                                         bitmask_type* out_validity,
-                                        thrust::optional<column_device_view const> query_match)
+                                        cuda::std::optional<column_device_view const> query_match)
 {
   // thread per row
   auto const tid = cudf::detail::grid_1d::global_thread_id();
@@ -916,7 +917,7 @@ std::unique_ptr<column> parse_uri(strings_column_view const& input,
     offsets_mutable_view.begin<size_type>(),
     reinterpret_cast<size_type*>(src_offsets.data()),
     reinterpret_cast<bitmask_type*>(null_mask.data()),
-    d_matches ? thrust::optional<column_device_view const>{*d_matches} : thrust::nullopt);
+    d_matches ? cuda::std::optional<column_device_view const>{*d_matches} : cuda::std::nullopt);
 
   // use scan to transform number of bytes into offsets
   thrust::exclusive_scan(rmm::exec_policy(stream),
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java
index e07f370533..3a7c4a6a53 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java
@@ -25,8 +25,7 @@ public class JSONUtils {
     NativeDepsLoader.loadNativeDeps();
   }
 
-  // Keep the same with `max_path_depth` in `get_json_object.cu'
-  public static final int MAX_PATH_DEPTH = 16;
+  public static final int MAX_PATH_DEPTH = getMaxJSONPathDepth();
 
   public enum PathInstructionType {
     WILDCARD,
@@ -35,25 +34,79 @@ public enum PathInstructionType {
   }
 
   public static class PathInstructionJni {
-    // type: Int, name: String, index: Long
-    private final int type;
+    // type: byte, name: String, index: int
+    private final byte type;
     private final String name;
-    private final long index;
+    private final int index;
 
     public PathInstructionJni(PathInstructionType type, String name, long index) {
-      this.type = type.ordinal();
+      this.type = (byte) type.ordinal();
+      this.name = name;
+      if (index > Integer.MAX_VALUE) {
+        throw new IllegalArgumentException("index is too large " + index);
+      }
+      this.index = (int) index;
+    }
+
+    public PathInstructionJni(PathInstructionType type, String name, int index) {
+      this.type = (byte) type.ordinal();
       this.name = name;
       this.index = index;
     }
   }
 
-  public static ColumnVector getJsonObject(ColumnVector input, PathInstructionJni[] path_instructions) {
+  /**
+   * Extract a JSON path from a JSON column. The path is processed in a Spark compatible way.
+   * @param input the string column containing JSON
+   * @param pathInstructions the instructions for the path processing
+   * @return the result of processing the path
+   */
+  public static ColumnVector getJsonObject(ColumnVector input, PathInstructionJni[] pathInstructions) {
     assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type";
-    return new ColumnVector(getJsonObject(input.getNativeView(), path_instructions));
+    int numTotalInstructions = pathInstructions.length;
+    byte[] typeNums = new byte[numTotalInstructions];
+    String[] names = new String[numTotalInstructions];
+    int[] indexes = new int[numTotalInstructions];
+
+    for (int i = 0; i < pathInstructions.length; i++) {
+      PathInstructionJni current = pathInstructions[i];
+      typeNums[i] = current.type;
+      names[i] = current.name;
+      indexes[i] = current.index;
+    }
+    return new ColumnVector(getJsonObject(input.getNativeView(), typeNums, names, indexes));
   }
 
+  /**
+   * Extract multiple JSON paths from a JSON column. The paths are processed in a Spark
+   * compatible way.
+   * @param input the string column containing JSON
+   * @param paths the instructions for multiple paths
+   * @return the result of processing each path in the order that they were passed in
+   */
   public static ColumnVector[] getJsonObjectMultiplePaths(ColumnVector input,
                                                           List<List<PathInstructionJni>> paths) {
+    return getJsonObjectMultiplePaths(input, paths, -1, -1);
+  }
+
+  /**
+   * Extract multiple JSON paths from a JSON column. The paths are processed in a Spark
+   * compatible way.
+   * @param input the string column containing JSON
+   * @param paths the instructions for multiple paths
+   * @param memoryBudgetBytes a budget that is used to limit the amount of memory
+   *                          that is used when processing the paths. This is a soft limit.
+   *                          A value <= 0 disables this and all paths will be processed in parallel.
+   * @param parallelOverride Set a maximum number of paths to be processed in parallel. The memory
+   *                         budget can limit how many paths can be processed in parallel. This overrides
+   *                         that automatically calculated value with a set value for benchmarking purposes.
+   *                         A value <= 0 disables this.
+   * @return the result of processing each path in the order that they were passed in
+   */
+  public static ColumnVector[] getJsonObjectMultiplePaths(ColumnVector input,
+                                                          List<List<PathInstructionJni>> paths,
+                                                          long memoryBudgetBytes,
+                                                          int parallelOverride) {
     assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type";
     int[] pathOffsets = new int[paths.size() + 1];
     int offset = 0;
@@ -62,15 +115,21 @@ public static ColumnVector[] getJsonObjectMultiplePaths(ColumnVector input,
       offset += paths.get(i).size();
     }
     pathOffsets[paths.size()] = offset;
+    int numTotalInstructions = offset;
+    byte[] typeNums = new byte[numTotalInstructions];
+    String[] names = new String[numTotalInstructions];
+    int[] indexes = new int[numTotalInstructions];
 
-    int numTotalInstructions = pathOffsets[paths.size()];
-    PathInstructionJni[] pathsArray = new PathInstructionJni[numTotalInstructions];
     for (int i = 0; i < paths.size(); i++) {
       for (int j = 0; j < paths.get(i).size(); j++) {
-        pathsArray[pathOffsets[i] + j] = paths.get(i).get(j);
+        PathInstructionJni current = paths.get(i).get(j);
+        typeNums[pathOffsets[i] + j] = current.type;
+        names[pathOffsets[i] + j] = current.name;
+        indexes[pathOffsets[i] + j] = current.index;
       }
     }
-    long[] ptrs = getJsonObjectMultiplePaths(input.getNativeView(), pathsArray, pathOffsets);
+    long[] ptrs = getJsonObjectMultiplePaths(input.getNativeView(), typeNums,
+        names, indexes, pathOffsets, memoryBudgetBytes, parallelOverride);
     ColumnVector[] ret = new ColumnVector[ptrs.length];
     for (int i = 0; i < ptrs.length; i++) {
       ret[i] = new ColumnVector(ptrs[i]);
@@ -78,8 +137,45 @@ public static ColumnVector[] getJsonObjectMultiplePaths(ColumnVector input,
     return ret;
   }
 
-  private static native long getJsonObject(long input, PathInstructionJni[] path_instructions);
 
-  private static native long[] getJsonObjectMultiplePaths(long input, PathInstructionJni[] paths,
-                                                          int[] pathOffsets);
+  /**
+   * Extract key-value pairs for each output map from the given json strings. These key-value are
+   * copied directly as substrings of the input without any type conversion.
+   * <p>
+   * Since there is not any validity check, the output of this function may be different from
+   * what generated by Spark's `from_json` function. Situations that can lead to
+   * different/incorrect outputs may include:<br>
+   * - The value in the input json string is invalid, such as 'abc' value for an integer key.<br>
+   * - The value string can be non-clean format for floating-point type, such as '1.00000'.
+   * <p>
+   * The output of these situations should all be NULL or a value '1.0', respectively. However, this
+   * function will just simply copy the input value strings to the output.
+   *
+   * @param input The input strings column in which each row specifies a json object
+   * @return A map column (i.e., a column of type {@code List<Struct<String,String>>}) in
+   * which the key-value pairs are extracted directly from the input json strings
+   */
+  public static ColumnVector extractRawMapFromJsonString(ColumnView input) {
+    assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type";
+    return new ColumnVector(extractRawMapFromJsonString(input.getNativeView()));
+  }
+
+
+  private static native int getMaxJSONPathDepth();
+
+  private static native long getJsonObject(long input,
+                                           byte[] typeNums,
+                                           String[] names,
+                                           int[] indexes);
+
+  private static native long[] getJsonObjectMultiplePaths(long input,
+                                                          byte[] typeNums,
+                                                          String[] names,
+                                                          int[] indexes,
+                                                          int[] pathOffsets,
+                                                          long memoryBudgetBytes,
+                                                          int parallelOverride);
+
+
+  private static native long extractRawMapFromJsonString(long input);
 }
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/MapUtils.java b/src/main/java/com/nvidia/spark/rapids/jni/MapUtils.java
deleted file mode 100644
index 140455b462..0000000000
--- a/src/main/java/com/nvidia/spark/rapids/jni/MapUtils.java
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.nvidia.spark.rapids.jni;
-
-import ai.rapids.cudf.ColumnVector;
-import ai.rapids.cudf.ColumnView;
-import ai.rapids.cudf.DType;
-import ai.rapids.cudf.NativeDepsLoader;
-
-public class MapUtils {
-  static {
-    NativeDepsLoader.loadNativeDeps();
-  }
-
-
-  /**
-   * Extract key-value pairs for each output map from the given json strings. These key-value are
-   * copied directly as substrings of the input without any type conversion.
-   * <p>
-   * Since there is not any validity check, the output of this function may be different from
-   * what generated by Spark's `from_json` function. Situations that can lead to
-   * different/incorrect outputs may include:<br>
-   * - The value in the input json string is invalid, such as 'abc' value for an integer key.<br>
-   * - The value string can be non-clean format for floating-point type, such as '1.00000'.
-   * <p>
-   * The output of these situations should all be NULL or a value '1.0', respectively. However, this
-   * function will just simply copy the input value strings to the output.
-   *
-   * @param jsonColumn The input strings column in which each row specifies a json object.
-   * @return A map column (i.e., a column of type {@code List<Struct<String,String>>}) in
-   * which the key-value pairs are extracted directly from the input json strings.
-   */
-  public static ColumnVector extractRawMapFromJsonString(ColumnView jsonColumn) {
-    assert jsonColumn.getType().equals(DType.STRING) : "Input type must be String";
-    return new ColumnVector(extractRawMapFromJsonString(jsonColumn.getNativeView()));
-  }
-
-
-  private static native long extractRawMapFromJsonString(long jsonColumnHandle);
-
-}
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/RmmSpark.java b/src/main/java/com/nvidia/spark/rapids/jni/RmmSpark.java
index e171894601..45a234dcca 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/RmmSpark.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/RmmSpark.java
@@ -589,6 +589,22 @@ public static long getAndResetComputeTimeLostToRetryNs(long taskId) {
     }
   }
 
+  /**
+   * Get and (per the name) reset the max device memory footprint, in bytes, allocated by a task.
+   * @param taskId the id of the task to get the metric for.
+   * @return the max device memory footprint, or 0 if sra is not set or not open.
+   */
+  public static long getAndResetGpuMaxMemoryAllocated(long taskId) {
+    synchronized (Rmm.class) {
+      if (sra != null && sra.isOpen()) {
+        return sra.getAndResetGpuMaxMemoryAllocated(taskId);
+      } else {
+        // sra is not set or is not open, so the value is by definition 0
+        return 0;
+      }
+    }
+  }
+
   /**
    * Called before doing an allocation on the CPU. This could throw an injected exception to help
    * with testing.
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/SparkResourceAdaptor.java b/src/main/java/com/nvidia/spark/rapids/jni/SparkResourceAdaptor.java
index d766c34230..9e3414f7d3 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/SparkResourceAdaptor.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/SparkResourceAdaptor.java
@@ -251,6 +251,10 @@ public long getAndResetComputeTimeLostToRetry(long taskId) {
     return getAndResetComputeTimeLostToRetry(getHandle(), taskId);
   }
 
+  public long getAndResetGpuMaxMemoryAllocated(long taskId) {
+    return getAndResetGpuMaxMemoryAllocated(getHandle(), taskId);
+  }
+
 
   /**
    * Called before doing an allocation on the CPU. This could throw an injected exception to help
@@ -319,6 +323,7 @@ public void cpuDeallocate(long ptr, long amount) {
   private static native int getAndResetSplitRetryThrowInternal(long handle, long taskId);
   private static native long getAndResetBlockTimeInternal(long handle, long taskId);
   private static native long getAndResetComputeTimeLostToRetry(long handle, long taskId);
+  private static native long getAndResetGpuMaxMemoryAllocated(long handle, long taskId);
   private static native void startRetryBlock(long handle, long threadId);
   private static native void endRetryBlock(long handle, long threadId);
   private static native void checkAndBreakDeadlocks(long handle);
diff --git a/src/test/java/com/nvidia/spark/rapids/jni/MapUtilsTest.java b/src/test/java/com/nvidia/spark/rapids/jni/FromJsonToRawMapTest.java
similarity index 93%
rename from src/test/java/com/nvidia/spark/rapids/jni/MapUtilsTest.java
rename to src/test/java/com/nvidia/spark/rapids/jni/FromJsonToRawMapTest.java
index 773ef7ac37..8edff2f4c8 100644
--- a/src/test/java/com/nvidia/spark/rapids/jni/MapUtilsTest.java
+++ b/src/test/java/com/nvidia/spark/rapids/jni/FromJsonToRawMapTest.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,7 +23,7 @@
 
 import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual;
 
-public class MapUtilsTest {
+public class FromJsonToRawMapTest {
 
   @Test
   void testFromJsonSimpleInput() {
@@ -36,7 +36,7 @@ void testFromJsonSimpleInput() {
 
     try (ColumnVector input =
              ColumnVector.fromStrings(jsonString1, jsonString2, null, jsonString3);
-         ColumnVector outputMap = MapUtils.extractRawMapFromJsonString(input);
+         ColumnVector outputMap = JSONUtils.extractRawMapFromJsonString(input);
 
          ColumnVector expectedKeys = ColumnVector.fromStrings("Zipcode", "ZipCodeType", "City",
              "State", "category", "index", "author", "title", "price");
@@ -65,7 +65,7 @@ void testFromJsonWithUTF8() {
 
     try (ColumnVector input =
              ColumnVector.fromStrings(jsonString1, jsonString2, null, jsonString3);
-         ColumnVector outputMap = MapUtils.extractRawMapFromJsonString(input);
+         ColumnVector outputMap = JSONUtils.extractRawMapFromJsonString(input);
 
          ColumnVector expectedKeys = ColumnVector.fromStrings("Zipc\u00f3de", "Z\u00edpCodeTyp" +
                  "\u00e9", "City", "St\u00e2te", "Zipc\u00f3de", "Z\u00edpCodeTyp\u00e9",
diff --git a/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java b/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java
index 59d6c2bcb0..b33b0be8ce 100644
--- a/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java
+++ b/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java
@@ -17,12 +17,14 @@
 package com.nvidia.spark.rapids.jni;
 
 import ai.rapids.cudf.ColumnVector;
+import ai.rapids.cudf.CudfException;
 import org.junit.jupiter.api.Test;
 
 import java.util.Arrays;
 import java.util.List;
 
 import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual;
+import static org.junit.jupiter.api.Assertions.assertThrows;
 
 public class GetJsonObjectTest {
   /**
@@ -665,6 +667,128 @@ void getJsonObjectMultiplePathsTest_JNIKernelCalledTwice() {
     }
   }
 
+  @Test
+  void getJsonObjectMultiplePathsTestCrazyLowMemoryBudget() {
+    List<JSONUtils.PathInstructionJni> path0 = Arrays.asList(namedPath("k0"));
+    List<JSONUtils.PathInstructionJni> path1 = Arrays.asList(namedPath("k1"));
+    List<List<JSONUtils.PathInstructionJni>> paths = Arrays.asList(path0, path1);
+    try (ColumnVector jsonCv = ColumnVector.fromStrings("{\"k0\": \"v0\", \"k1\": \"v1\"}");
+         ColumnVector expected0 = ColumnVector.fromStrings("v0");
+         ColumnVector expected1 = ColumnVector.fromStrings("v1")) {
+      ColumnVector[] output = JSONUtils.getJsonObjectMultiplePaths(jsonCv, paths, 1L, 0);
+      try {
+        assertColumnsAreEqual(expected0, output[0]);
+        assertColumnsAreEqual(expected1, output[1]);
+      } finally {
+        for (ColumnVector cv : output) {
+          cv.close();
+        }
+      }
+    }
+  }
+
+  @Test
+  void getJsonObjectMultiplePathsTestMemoryBudget() {
+    List<JSONUtils.PathInstructionJni> path0 = Arrays.asList(namedPath("k0"));
+    List<JSONUtils.PathInstructionJni> path1 = Arrays.asList(namedPath("k1"));
+    List<List<JSONUtils.PathInstructionJni>> paths = Arrays.asList(path0, path1);
+    try (ColumnVector jsonCv = ColumnVector.fromStrings("{\"k0\": \"v0\", \"k1\": \"v1\"}");
+         ColumnVector expected0 = ColumnVector.fromStrings("v0");
+         ColumnVector expected1 = ColumnVector.fromStrings("v1")) {
+      ColumnVector[] output = JSONUtils.getJsonObjectMultiplePaths(jsonCv, paths, 1024L, 0);
+      try {
+        assertColumnsAreEqual(expected0, output[0]);
+        assertColumnsAreEqual(expected1, output[1]);
+      } finally {
+        for (ColumnVector cv : output) {
+          cv.close();
+        }
+      }
+    }
+  }
+
+  /**
+   * Verifies that an exception is thrown when the input JSON path exceeds MAX_PATH_DEPTH.
+   */
+  @Test
+  void getJsonObjectTest_ExceedMaxNestingDepthInPath() {
+    JSONUtils.PathInstructionJni[] query =
+        new JSONUtils.PathInstructionJni[JSONUtils.MAX_PATH_DEPTH + 1];
+    for (int i = 0; i < JSONUtils.MAX_PATH_DEPTH + 1; ++i) {
+      query[i] = namedPath("k");
+    }
+    try (ColumnVector input = ColumnVector.fromStrings("")) {
+      assertThrows(CudfException.class, () -> JSONUtils.getJsonObject(input, query));
+    }
+  }
+
+  /**
+   * Verifies that an exception is thrown when the maximum nesting depth is exceeded
+   * while pushing onto the context stack during evaluation of the JSON path.
+   *
+   * The maximum depth limit here is the same as the limit for the input JSON path.
+   */
+  @Test
+  void getJsonObjectTest_ExceedMaxNestingDepthInContextStack() {
+    JSONUtils.PathInstructionJni[] query = new JSONUtils.PathInstructionJni[] {
+        wildcardPath(), wildcardPath()
+    };
+    String jsonStr = "\"v\"";
+    for (int i = 0; i < JSONUtils.MAX_PATH_DEPTH; ++i) {
+      jsonStr = String.format("[%s]", jsonStr);
+    }
+    // This string has one more nesting level than the valid input, exceeding the maximum depth.
+    String jsonStrTooDeep = String.format("[%s]", jsonStr);
+
+    try (ColumnVector validInput = ColumnVector.fromStrings(jsonStr);
+         ColumnVector invalidInput = ColumnVector.fromStrings(jsonStrTooDeep);
+         ColumnVector expected = ColumnVector.fromStrings("[\"v\"]");
+         ColumnVector output = JSONUtils.getJsonObject(validInput, query)) {
+      assertColumnsAreEqual(expected, output);
+      assertThrows(CudfException.class, () -> JSONUtils.getJsonObject(invalidInput, query));
+    }
+  }
+
+  /**
+   * Verifies that an exception is thrown when the maximum nesting depth is exceeded
+   * in the JSON parser. The empty JSON path simply mirrors the input back.
+   *
+   * Note that the maximum depth in the internal parser, which is being tested here, is different
+   * from the limit for the input JSON path.
+   */
+  @Test
+  void getJsonObjectTest_ExceedMaxNestingDepthInJSONParser() {
+    // An empty instruction list is equivalent to the path '$'.
+    JSONUtils.PathInstructionJni[] query = new JSONUtils.PathInstructionJni[] {};
+
+    final int MAX_PARSER_DEPTH = 64;
+    String jsonStr = "\"v\"";
+    for (int i = 0; i < MAX_PARSER_DEPTH; ++i) { // The maximum depth in the JSON parser is 64.
+      jsonStr = String.format("{\"k%d\":%s}", i, jsonStr);
+    }
+    // This string has a nesting level exceeding the maximum depth of 64.
+    String jsonStrTooDeep = String.format("{\"k%d\":%s}", MAX_PARSER_DEPTH, jsonStr);
+    try (ColumnVector validInput = ColumnVector.fromStrings(jsonStr);
+         ColumnVector invalidInput = ColumnVector.fromStrings(jsonStrTooDeep);
+         ColumnVector output = JSONUtils.getJsonObject(validInput, query)) {
+      assertColumnsAreEqual(validInput, output);
+      assertThrows(CudfException.class, () -> JSONUtils.getJsonObject(invalidInput, query));
+    }
+  }
+
+  @Test
+  void getJsonObjectTest_NamesWithEscapedCharacters() {
+    JSONUtils.PathInstructionJni[] query = new JSONUtils.PathInstructionJni[] {
+        namedPath("data")
+    };
+    try (ColumnVector input = ColumnVector.fromStrings(
+        "{'data': 'TEST1'}", "{'\\u0064\\u0061t\\u0061': 'TEST2'}");
+         ColumnVector expected = ColumnVector.fromStrings("TEST1", "TEST2");
+         ColumnVector output = JSONUtils.getJsonObject(input, query)) {
+      assertColumnsAreEqual(expected, output);
+    }
+  }
+
   private JSONUtils.PathInstructionJni wildcardPath() {
     return new JSONUtils.PathInstructionJni(JSONUtils.PathInstructionType.WILDCARD, "", -1);
   }
diff --git a/src/test/java/com/nvidia/spark/rapids/jni/RmmSparkTest.java b/src/test/java/com/nvidia/spark/rapids/jni/RmmSparkTest.java
index 373deb9ca0..987dd58534 100644
--- a/src/test/java/com/nvidia/spark/rapids/jni/RmmSparkTest.java
+++ b/src/test/java/com/nvidia/spark/rapids/jni/RmmSparkTest.java
@@ -43,6 +43,8 @@
 import static org.junit.jupiter.api.Assertions.fail;
 
 public class RmmSparkTest {
+  private static final long ALIGNMENT = 256; // alignment given to the RMM limiting/tracking adaptors
+
   @BeforeEach
   public void setup() {
     if (Rmm.isInitialized()) {
@@ -317,6 +319,7 @@ public void testInsertOOMsGpu() {
     assertEquals(0, RmmSpark.getAndResetNumRetryThrow(taskid));
     assertEquals(0, RmmSpark.getAndResetNumSplitRetryThrow(taskid));
     assertEquals(0, RmmSpark.getAndResetComputeTimeLostToRetryNs(taskid));
+    assertEquals(0, RmmSpark.getAndResetGpuMaxMemoryAllocated(taskid));
     RmmSpark.startDedicatedTaskThread(threadId, taskid, t);
     assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId));
     try {
@@ -343,6 +346,7 @@ public void testInsertOOMsGpu() {
       assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId));
       assertEquals(1, RmmSpark.getAndResetNumRetryThrow(taskid));
       assertEquals(0, RmmSpark.getAndResetNumSplitRetryThrow(taskid));
+      assertEquals(ALIGNMENT, RmmSpark.getAndResetGpuMaxMemoryAllocated(taskid));
       RmmSpark.blockThreadUntilReady();
 
       // Allocate something small and verify that it works...
@@ -356,6 +360,7 @@ public void testInsertOOMsGpu() {
       assertThrows(GpuSplitAndRetryOOM.class, () -> Rmm.alloc(100).close());
       assertEquals(0, RmmSpark.getAndResetNumRetryThrow(taskid));
       assertEquals(1, RmmSpark.getAndResetNumSplitRetryThrow(taskid));
+      assertEquals(ALIGNMENT * 2, RmmSpark.getAndResetGpuMaxMemoryAllocated(taskid));
 
       // Verify that injecting OOM does not cause the block to actually happen
       assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(threadId));
@@ -635,8 +640,8 @@ void setupRmmForTestingWithLimits(long maxAllocSize, RmmEventHandler eventHandle
     boolean succeeded = false;
     try {
       resource = new RmmCudaMemoryResource();
-      resource = new RmmLimitingResourceAdaptor<>(resource, maxAllocSize, 256);
-      resource = new RmmTrackingResourceAdaptor<>(resource, 256);
+      resource = new RmmLimitingResourceAdaptor<>(resource, maxAllocSize, ALIGNMENT);
+      resource = new RmmTrackingResourceAdaptor<>(resource, ALIGNMENT);
       Rmm.setCurrentDeviceResource(resource, null, false);
       succeeded = true;
     } finally {
@@ -760,9 +765,9 @@ public void testBasicCpuBlocking() throws ExecutionException, InterruptedExcepti
 
   @Test
   public void testBasicMixedBlocking() throws ExecutionException, InterruptedException, TimeoutException {
-    // 10 MiB
-    setupRmmForTestingWithLimits(10 * 1024 * 1024);
-    LimitingOffHeapAllocForTests.setLimit(10 * 1024 * 1024);
+    final long MB = 1024 * 1024;
+    setupRmmForTestingWithLimits(10 * MB);
+    LimitingOffHeapAllocForTests.setLimit(10 * MB);
     TaskThread taskOne = new TaskThread("TEST THREAD ONE", 1);
     TaskThread taskTwo = new TaskThread("TEST THREAD TWO", 2);
     TaskThread taskThree = new TaskThread("TEST THREAD THREE", 3);
@@ -771,6 +776,9 @@ public void testBasicMixedBlocking() throws ExecutionException, InterruptedExcep
     taskTwo.initialize();
     taskThree.initialize();
     taskFour.initialize();
+
+    final long FIVE_MB = 5 * MB;
+    final long SIX_MB = 6 * MB;
     try {
       long tOneId = taskOne.getThreadId();
       assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(tOneId));
@@ -784,18 +792,18 @@ public void testBasicMixedBlocking() throws ExecutionException, InterruptedExcep
       long tFourId = taskFour.getThreadId();
       assertEquals(RmmSparkThreadState.THREAD_RUNNING, RmmSpark.getStateOf(tFourId));
 
-      try (AllocOnAnotherThread firstGpuAlloc = new GpuAllocOnAnotherThread(taskOne, 5 * 1024 * 1024)) {
+      try (AllocOnAnotherThread firstGpuAlloc = new GpuAllocOnAnotherThread(taskOne, FIVE_MB)) {
         firstGpuAlloc.waitForAlloc();
 
-        try (AllocOnAnotherThread firstCpuAlloc = new CpuAllocOnAnotherThread(taskTwo, 5 * 1024 * 1024)) {
+        try (AllocOnAnotherThread firstCpuAlloc = new CpuAllocOnAnotherThread(taskTwo, FIVE_MB)) {
           firstCpuAlloc.waitForAlloc();
 
           // Blocking GPU Alloc
-          try (AllocOnAnotherThread secondGpuAlloc = new GpuAllocOnAnotherThread(taskThree, 6 * 1024 * 1024)) {
+          try (AllocOnAnotherThread secondGpuAlloc = new GpuAllocOnAnotherThread(taskThree, SIX_MB)) {
             taskThree.pollForState(RmmSparkThreadState.THREAD_BLOCKED, 1000, TimeUnit.MILLISECONDS);
 
             // Blocking CPU Alloc
-            try (AllocOnAnotherThread secondCpuAlloc = new CpuAllocOnAnotherThread(taskFour, 6 * 1024 * 1024)) {
+            try (AllocOnAnotherThread secondCpuAlloc = new CpuAllocOnAnotherThread(taskFour, SIX_MB)) {
               taskFour.pollForState(RmmSparkThreadState.THREAD_BLOCKED, 1000, TimeUnit.MILLISECONDS);
 
               // We want to make sure that the order of wakeup corresponds to the location of the data that was released
@@ -814,9 +822,13 @@ public void testBasicMixedBlocking() throws ExecutionException, InterruptedExcep
       }
     } finally {
       taskOne.done();
+      assertEquals(FIVE_MB, RmmSpark.getAndResetGpuMaxMemoryAllocated(1));
       taskTwo.done();
+      assertEquals(0, RmmSpark.getAndResetGpuMaxMemoryAllocated(2));
       taskThree.done();
+      assertEquals(SIX_MB, RmmSpark.getAndResetGpuMaxMemoryAllocated(3));
       taskFour.done();
+      assertEquals(0, RmmSpark.getAndResetGpuMaxMemoryAllocated(4));
     }
   }
 
diff --git a/thirdparty/cudf b/thirdparty/cudf
index 4afeb5afa7..7b0adfa253 160000
--- a/thirdparty/cudf
+++ b/thirdparty/cudf
@@ -1 +1 @@
-Subproject commit 4afeb5afa7ac483eef8f9a193c73fcce584db92b
+Subproject commit 7b0adfa2533e4792464230ee67916a04ce06caf6
diff --git a/thirdparty/cudf-pins/rapids-cmake.sha b/thirdparty/cudf-pins/rapids-cmake.sha
index dfef349453..37820d8ad4 100644
--- a/thirdparty/cudf-pins/rapids-cmake.sha
+++ b/thirdparty/cudf-pins/rapids-cmake.sha
@@ -1 +1 @@
-a409f8169ffe75e709c07dc488293bf38ad64e5c
+312909127cf0fe96e178f0ffa754908f58d489a3
diff --git a/thirdparty/cudf-pins/versions.json b/thirdparty/cudf-pins/versions.json
index b8b32c33e5..ed40c777a4 100644
--- a/thirdparty/cudf-pins/versions.json
+++ b/thirdparty/cudf-pins/versions.json
@@ -1,14 +1,6 @@
 {
   "packages" : 
   {
-    "Arrow" : 
-    {
-      "always_download" : true,
-      "git_shallow" : false,
-      "git_tag" : "7dd1d34074af176d9e861a360e135ae57b21cf96",
-      "git_url" : "https://github.com/apache/arrow.git",
-      "version" : "16.1.0"
-    },
     "CCCL" : 
     {
       "always_download" : true,
@@ -52,9 +44,9 @@
     {
       "always_download" : true,
       "git_shallow" : false,
-      "git_tag" : "9bdd0640ab0436373754ed4c9608be828330a4d6",
+      "git_tag" : "1b85263eba89c0f077fbb3da90a770b84161d20f",
       "git_url" : "https://github.com/rapidsai/kvikio.git",
-      "version" : "24.08"
+      "version" : "24.10"
     },
     "bs_thread_pool" : 
     {
@@ -68,7 +60,7 @@
     {
       "always_download" : true,
       "git_shallow" : false,
-      "git_tag" : "ee5c10456c7ad584c254152411ba3dc114537a6f",
+      "git_tag" : "d3477661d771e0d6fd22259bf6dd6f8c64a7401c",
       "git_url" : "https://github.com/NVIDIA/cuCollections.git",
       "version" : "0.0.1"
     },
@@ -92,17 +84,16 @@
     {
       "always_download" : true,
       "git_shallow" : false,
-      "git_tag" : "f5e54359df4c26b6230fc61d38aa294581393084",
+      "git_tag" : "0c9fce2ffefecfdce794e1859584e25877b7b592",
       "git_url" : "https://github.com/fmtlib/fmt.git",
       "patches" : 
       [
         {
-          "file" : "fmt/fix_10_1_1_version.diff",
-          "fixed_in" : "10.2.0",
-          "issue" : "fmt 10.1.1 produces a CMake package with version 10.1.0"
+          "file" : "fmt/fix_11_0_2_unreachable_loop.diff",
+          "issue" : "fmt 11.0.2 produces a warning about an unreachable loop when compiled with nvcc"
         }
       ],
-      "version" : "10.1.1"
+      "version" : "11.0.2"
     },
     "jitify" : 
     {
@@ -150,25 +141,17 @@
     {
       "always_download" : true,
       "git_shallow" : false,
-      "git_tag" : "9d788cc0ece2676dc8a81079ecd1fcbf8804c72f",
+      "git_tag" : "afe0a3336397b17a96bb703e82f3b6365ee7c41e",
       "git_url" : "https://github.com/rapidsai/rmm.git",
-      "version" : "24.08"
+      "version" : "24.10"
     },
     "spdlog" : 
     {
       "always_download" : true,
       "git_shallow" : false,
-      "git_tag" : "7e635fca68d014934b4af8a1cf874f63989352b7",
+      "git_tag" : "27cb4c76708608465c413f6d0e6b8d99a4d84302",
       "git_url" : "https://github.com/gabime/spdlog.git",
-      "patches" : 
-      [
-        {
-          "file" : "spdlog/nvcc_constexpr_fix.diff",
-          "fixed_in" : "1.13",
-          "issue" : "Fix constexpr mismatch between spdlog and fmt [https://github.com/gabime/spdlog/issues/2856]"
-        }
-      ],
-      "version" : "1.12.0"
+      "version" : "1.14.1"
     }
   }
 }
\ No newline at end of file