From fe3cab5595337300345573d7e64fa52cba78a6c5 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Mon, 25 Sep 2023 10:15:44 +0530 Subject: [PATCH 01/29] Fix Memcheck error found in JSON_TEST JsonReaderTest.ErrorStrings (#14164) Fix a missing null mask in string column parsing. When a parsing error occurs, the row is made null. To write the output properly, the null mask needs to be passed along so that null rows can be skipped during the output-writing stage in `parse_data`. Fixes #14141 Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - David Wendt (https://github.com/davidwendt) - Elias Stehle (https://github.com/elstehle) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/14164 --- cpp/src/io/utilities/data_casting.cu | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index 1772e5e43fa..d16237d7afe 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -924,6 +924,9 @@ std::unique_ptr parse_data( if (col_size == 0) { return make_empty_column(col_type); } auto d_null_count = rmm::device_scalar(null_count, stream); auto null_count_data = d_null_count.data(); + if (null_mask.is_empty()) { + null_mask = cudf::detail::create_null_mask(col_size, mask_state::ALL_VALID, stream, mr); + } // Prepare iterator that returns (string_ptr, string_length)-pairs needed by type conversion auto str_tuples = thrust::make_transform_iterator(offset_length_begin, to_string_view_pair{data}); From 3f47b5d463445faa9f95b1cc57c46fb5b41f60a7 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 25 Sep 2023 11:28:33 -0400 Subject: [PATCH 02/29] Move cpp/src/hash/hash_allocator.cuh to include/cudf/hashing/detail (#14163) Moves `cpp/src/hash/hash_allocator.cuh` to `include/cudf/hashing/detail` so it may be more accessible from non-src/hash source files. Also found that `cpp/src/hash/helper_functions.cuh` is used in the same way, and moved that one as well. No functional changes, just headers moved and includes fixed up.
Reference: https://github.com/rapidsai/cudf/pull/13930#discussion_r1330118935 Closes #14143 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14163 --- .../hash => include/cudf/hashing/detail}/hash_allocator.cuh | 0 .../cudf/hashing/detail}/helper_functions.cuh | 0 cpp/src/hash/concurrent_unordered_map.cuh | 4 ++-- cpp/src/hash/unordered_multiset.cuh | 3 +-- cpp/src/io/json/json_tree.cu | 4 ++-- cpp/src/join/join_common_utils.hpp | 5 ++--- cpp/src/stream_compaction/stream_compaction_common.hpp | 5 ++--- cpp/src/text/subword/bpe_tokenizer.cuh | 3 +-- 8 files changed, 10 insertions(+), 14 deletions(-) rename cpp/{src/hash => include/cudf/hashing/detail}/hash_allocator.cuh (100%) rename cpp/{src/hash => include/cudf/hashing/detail}/helper_functions.cuh (100%) diff --git a/cpp/src/hash/hash_allocator.cuh b/cpp/include/cudf/hashing/detail/hash_allocator.cuh similarity index 100% rename from cpp/src/hash/hash_allocator.cuh rename to cpp/include/cudf/hashing/detail/hash_allocator.cuh diff --git a/cpp/src/hash/helper_functions.cuh b/cpp/include/cudf/hashing/detail/helper_functions.cuh similarity index 100% rename from cpp/src/hash/helper_functions.cuh rename to cpp/include/cudf/hashing/detail/helper_functions.cuh diff --git a/cpp/src/hash/concurrent_unordered_map.cuh b/cpp/src/hash/concurrent_unordered_map.cuh index 439b1c2d066..d773c2763df 100644 --- a/cpp/src/hash/concurrent_unordered_map.cuh +++ b/cpp/src/hash/concurrent_unordered_map.cuh @@ -16,12 +16,12 @@ #pragma once -#include -#include #include #include #include +#include +#include #include #include diff --git a/cpp/src/hash/unordered_multiset.cuh b/cpp/src/hash/unordered_multiset.cuh index 87075a39ea3..183042fc0f4 100644 --- a/cpp/src/hash/unordered_multiset.cuh +++ b/cpp/src/hash/unordered_multiset.cuh @@ -16,11 +16,10 @@ #pragma once -#include - #include #include #include +#include #include #include diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index 9231040eb70..da5b0eedfbd 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -15,8 +15,6 @@ */ #include "nested_json.hpp" -#include -#include #include #include @@ -24,7 +22,9 @@ #include #include #include +#include #include +#include #include #include diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp index 4c1b1ed98b1..e96505e5ed6 100644 --- a/cpp/src/join/join_common_utils.hpp +++ b/cpp/src/join/join_common_utils.hpp @@ -17,13 +17,12 @@ #include #include +#include +#include #include #include #include -#include -#include - #include #include diff --git a/cpp/src/stream_compaction/stream_compaction_common.hpp b/cpp/src/stream_compaction/stream_compaction_common.hpp index 58d958d2ff4..18c531e3e69 100644 --- a/cpp/src/stream_compaction/stream_compaction_common.hpp +++ b/cpp/src/stream_compaction/stream_compaction_common.hpp @@ -15,12 +15,11 @@ */ #pragma once +#include +#include #include #include -#include -#include - #include #include diff --git a/cpp/src/text/subword/bpe_tokenizer.cuh b/cpp/src/text/subword/bpe_tokenizer.cuh index 83aa22aaae9..2fa879ea734 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cuh +++ b/cpp/src/text/subword/bpe_tokenizer.cuh @@ -18,10 +18,9 @@ #include -#include - #include #include +#include #include #include From 036c07d363406da9e500c3d6be9a3edca28fd6c2 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke 
<10647082+mroeschke@users.noreply.github.com> Date: Mon, 25 Sep 2023 06:36:26 -1000 Subject: [PATCH 03/29] Fix DataFrame from Series with different CategoricalIndexes (#14157) closes #14130 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/14157 --- python/cudf/cudf/core/indexed_frame.py | 7 +++++++ python/cudf/cudf/tests/test_dataframe.py | 13 +++++++++++++ 2 files changed, 20 insertions(+) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 62e091b29b5..aacf1fa8dae 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -5438,6 +5438,13 @@ def _is_same_dtype(lhs_dtype, rhs_dtype): # for matching column dtype. if lhs_dtype == rhs_dtype: return True + elif ( + is_categorical_dtype(lhs_dtype) + and is_categorical_dtype(rhs_dtype) + and lhs_dtype.categories.dtype == rhs_dtype.categories.dtype + ): + # OK if categories are not all the same + return True elif ( is_categorical_dtype(lhs_dtype) and not is_categorical_dtype(rhs_dtype) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 2f531afdeb7..67b63028fab 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10408,6 +10408,19 @@ def test_dataframe_init_from_nested_dict(): assert_eq(pdf, gdf) +def test_init_from_2_categoricalindex_series_diff_categories(): + s1 = cudf.Series( + [39, 6, 4], index=cudf.CategoricalIndex(["female", "male", "unknown"]) + ) + s2 = cudf.Series( + [2, 152, 2, 242, 150], + index=cudf.CategoricalIndex(["f", "female", "m", "male", "unknown"]), + ) + result = cudf.DataFrame([s1, s2]) + expected = pd.DataFrame([s1.to_pandas(), s2.to_pandas()]) + assert_eq(result, expected, check_dtype=False) + + def test_data_frame_values_no_cols_but_index(): result = cudf.DataFrame(index=range(5)).values expected = pd.DataFrame(index=range(5)).values From ddd2b0dfac0903c5f17d581eca5d6b945ede9451 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Mon, 25 Sep 2023 13:14:18 -0500 Subject: [PATCH 04/29] Allow explicit `shuffle="p2p"` within dask-cudf API (#13893) This PR allows explicit `shuffle="p2p"` usage within the dask-cudf API now that https://github.com/dask/distributed/pull/7743 is in. 
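For illustration, a minimal sketch of the newly supported usage, modeled on the `test_p2p_shuffle` test added in this PR (the `LocalCUDACluster` setup and toy timeseries data are scaffolding borrowed from that test, not new API):

```python
import dask.datasets
from dask.distributed import Client
from dask_cuda import LocalCUDACluster

if __name__ == "__main__":
    # "p2p" shuffling requires a distributed client; without one,
    # dask-cudf still falls back to the "tasks" method by default.
    with LocalCUDACluster(n_workers=1) as cluster, Client(cluster):
        ddf = (
            dask.datasets.timeseries(
                start="2000-01-01", end="2000-01-08", dtypes={"x": int}
            )
            .reset_index(drop=True)
            .to_backend("cudf")
        )
        # Explicitly request the peer-to-peer shuffle method
        result = ddf.sort_values("x", shuffle="p2p").compute()
```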
Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) - Ray Douglass (https://github.com/raydouglass) - gpuCI (https://github.com/GPUtester) - Mike Wendt (https://github.com/mike-wendt) - AJ Schmidt (https://github.com/ajschmidt8) - GALI PREM SAGAR (https://github.com/galipremsagar) - Lawrence Mitchell (https://github.com/wence-) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/13893 --- python/dask_cudf/dask_cudf/backends.py | 31 ++++++++++++++++--- python/dask_cudf/dask_cudf/sorting.py | 26 +++++++++++----- .../dask_cudf/tests/test_dispatch.py | 11 +++++-- .../dask_cudf/tests/test_distributed.py | 22 ++++++++++++- 4 files changed, 76 insertions(+), 14 deletions(-) diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index e3f4f04eb85..344b03c631d 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -373,22 +373,37 @@ def percentile_cudf(a, q, interpolation="linear"): @pyarrow_schema_dispatch.register((cudf.DataFrame,)) -def _get_pyarrow_schema_cudf(obj, preserve_index=True, **kwargs): +def _get_pyarrow_schema_cudf(obj, preserve_index=None, **kwargs): if kwargs: warnings.warn( "Ignoring the following arguments to " f"`pyarrow_schema_dispatch`: {list(kwargs)}" ) - return meta_nonempty(obj).to_arrow(preserve_index=preserve_index).schema + + return _cudf_to_table( + meta_nonempty(obj), preserve_index=preserve_index + ).schema @to_pyarrow_table_dispatch.register(cudf.DataFrame) -def _cudf_to_table(obj, preserve_index=True, **kwargs): +def _cudf_to_table(obj, preserve_index=None, **kwargs): if kwargs: warnings.warn( "Ignoring the following arguments to " f"`to_pyarrow_table_dispatch`: {list(kwargs)}" ) + + # TODO: Remove this logic when cudf#14159 is resolved + # (see: https://github.com/rapidsai/cudf/issues/14159) + if preserve_index and isinstance(obj.index, cudf.RangeIndex): + obj = obj.copy() + obj.index.name = ( + obj.index.name + if obj.index.name is not None + else "__index_level_0__" + ) + obj.index = obj.index._as_int_index() + return obj.to_arrow(preserve_index=preserve_index) @@ -401,7 +416,15 @@ def _table_to_cudf(obj, table, self_destruct=None, **kwargs): f"Ignoring the following arguments to " f"`from_pyarrow_table_dispatch`: {list(kwargs)}" ) - return obj.from_arrow(table) + result = obj.from_arrow(table) + + # TODO: Remove this logic when cudf#14159 is resolved + # (see: https://github.com/rapidsai/cudf/issues/14159) + if "__index_level_0__" in result.index.names: + assert len(result.index.names) == 1 + result.index.name = None + + return result @union_categoricals_dispatch.register((cudf.Series, cudf.BaseIndex)) diff --git a/python/dask_cudf/dask_cudf/sorting.py b/python/dask_cudf/dask_cudf/sorting.py index e841f2d8830..d6c9c1be73c 100644 --- a/python/dask_cudf/dask_cudf/sorting.py +++ b/python/dask_cudf/dask_cudf/sorting.py @@ -6,7 +6,7 @@ import numpy as np import tlz as toolz -import dask +from dask import config from dask.base import tokenize from dask.dataframe import methods from dask.dataframe.core import DataFrame, Index, Series @@ -18,6 +18,8 @@ from cudf.api.types import is_categorical_dtype from cudf.utils.utils import _dask_cudf_nvtx_annotate +_SHUFFLE_SUPPORT = ("tasks", "p2p") # "disk" not supported + @_dask_cudf_nvtx_annotate def set_index_post(df, index_name, drop, column_dtype): @@ -307,15 +309,25 @@ def sort_values( return df4 +def get_default_shuffle_method(): + # Note that 
`dask.utils.get_default_shuffle_method` + # will return "p2p" by default when a distributed + # client is present. Dask-cudf supports "p2p", but + # will not use it by default (yet) + default = config.get("dataframe.shuffle.method", "tasks") + if default not in _SHUFFLE_SUPPORT: + default = "tasks" + return default + + def _get_shuffle_type(shuffle): # Utility to set the shuffle-kwarg default - # and to validate user-specified options. - # The only supported options is currently "tasks" - shuffle = shuffle or dask.config.get("shuffle", "tasks") - if shuffle != "tasks": + # and to validate user-specified options + shuffle = shuffle or get_default_shuffle_method() + if shuffle not in _SHUFFLE_SUPPORT: raise ValueError( - f"Dask-cudf only supports in-memory shuffling with " - f"'tasks'. Got shuffle={shuffle}" + "Dask-cudf only supports the following shuffle " + f"methods: {_SHUFFLE_SUPPORT}. Got shuffle={shuffle}" ) return shuffle diff --git a/python/dask_cudf/dask_cudf/tests/test_dispatch.py b/python/dask_cudf/dask_cudf/tests/test_dispatch.py index cf49b1df4f4..c64e25fd437 100644 --- a/python/dask_cudf/dask_cudf/tests/test_dispatch.py +++ b/python/dask_cudf/dask_cudf/tests/test_dispatch.py @@ -22,18 +22,25 @@ def test_is_categorical_dispatch(): assert is_categorical_dtype(cudf.Index([1, 2, 3], dtype="category")) -def test_pyarrow_conversion_dispatch(): +@pytest.mark.parametrize("preserve_index", [True, False]) +def test_pyarrow_conversion_dispatch(preserve_index): from dask.dataframe.dispatch import ( from_pyarrow_table_dispatch, to_pyarrow_table_dispatch, ) df1 = cudf.DataFrame(np.random.randn(10, 3), columns=list("abc")) - df2 = from_pyarrow_table_dispatch(df1, to_pyarrow_table_dispatch(df1)) + df2 = from_pyarrow_table_dispatch( + df1, to_pyarrow_table_dispatch(df1, preserve_index=preserve_index) + ) assert type(df1) == type(df2) assert_eq(df1, df2) + # Check that preserve_index does not produce a RangeIndex + if preserve_index: + assert not isinstance(df2.index, cudf.RangeIndex) + @pytest.mark.parametrize("index", [None, [1, 2] * 5]) def test_deterministic_tokenize(index): diff --git a/python/dask_cudf/dask_cudf/tests/test_distributed.py b/python/dask_cudf/dask_cudf/tests/test_distributed.py index e24feaa2ea4..db3f3695648 100644 --- a/python/dask_cudf/dask_cudf/tests/test_distributed.py +++ b/python/dask_cudf/dask_cudf/tests/test_distributed.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. import numba.cuda import pytest @@ -77,3 +77,23 @@ def test_str_series_roundtrip(): actual = dask_series.compute() assert_eq(actual, expected) + + +def test_p2p_shuffle(): + # Check that we can use `shuffle="p2p"` + with dask_cuda.LocalCUDACluster(n_workers=1) as cluster: + with Client(cluster): + ddf = ( + dask.datasets.timeseries( + start="2000-01-01", + end="2000-01-08", + dtypes={"x": int}, + ) + .reset_index(drop=True) + .to_backend("cudf") + ) + dd.assert_eq( + ddf.sort_values("x", shuffle="p2p").compute(), + ddf.compute().sort_values("x"), + check_index=False, + ) From 1b925bfc7741eb22fed0a978fa0e1d0d5dfee601 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 25 Sep 2023 13:09:16 -0700 Subject: [PATCH 05/29] Add Parquet reader benchmarks for row selection (#14147) Re-enabled the group of benchmarks that compares row selection options in Parquet reader. Use `read_parquet_metadata` to get the column names and number of row groups. Clean up read chunk computation for ORC and Parquet benchmarks. 
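For reference, a minimal sketch of the metadata-driven setup this enables, mirroring the `get_top_level_col_names` helper in the diff below (the `cudf/io/parquet_metadata.hpp` header path is an assumption here, and `top_level_column_names` is a hypothetical standalone version of that helper):

```cpp
#include <cudf/io/parquet_metadata.hpp>  // assumed header for read_parquet_metadata
#include <cudf/io/types.hpp>             // cudf::io::source_info

#include <string>
#include <vector>

// Read the file footer once, then derive the column names (and, via
// num_rowgroups(), the row-group count) instead of hardcoding them.
std::vector<std::string> top_level_column_names(cudf::io::source_info const& source)
{
  auto const metadata = cudf::io::read_parquet_metadata(source);
  std::vector<std::string> names;
  for (auto const& column : metadata.schema().root().children()) {
    names.push_back(column.name());
  }
  return names;
}
```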
Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - https://github.com/nvdbaranec - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/14147 --- cpp/benchmarks/io/cuio_common.cpp | 18 ++--- cpp/benchmarks/io/orc/orc_reader_options.cpp | 12 ++-- .../io/parquet/parquet_reader_options.cpp | 65 +++++++++++-------- 3 files changed, 53 insertions(+), 42 deletions(-) diff --git a/cpp/benchmarks/io/cuio_common.cpp b/cpp/benchmarks/io/cuio_common.cpp index 6b8af91b842..b1aaef41340 100644 --- a/cpp/benchmarks/io/cuio_common.cpp +++ b/cpp/benchmarks/io/cuio_common.cpp @@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -141,17 +142,18 @@ std::vector select_column_names(std::vector const& col return col_names_to_read; } -std::vector segments_in_chunk(int num_segments, int num_chunks, int chunk) +std::vector segments_in_chunk(int num_segments, int num_chunks, int chunk_idx) { CUDF_EXPECTS(num_segments >= num_chunks, "Number of chunks cannot be greater than the number of segments in the file"); - auto start_segment = [num_segments, num_chunks](int chunk) { - return num_segments * chunk / num_chunks; - }; - std::vector selected_segments; - for (auto segment = start_segment(chunk); segment < start_segment(chunk + 1); ++segment) { - selected_segments.push_back(segment); - } + CUDF_EXPECTS(chunk_idx < num_chunks, + "Chunk index must be smaller than the number of chunks in the file"); + + auto const segments_in_chunk = cudf::util::div_rounding_up_unsafe(num_segments, num_chunks); + auto const begin_segment = std::min(chunk_idx * segments_in_chunk, num_segments); + auto const end_segment = std::min(begin_segment + segments_in_chunk, num_segments); + std::vector selected_segments(end_segment - begin_segment); + std::iota(selected_segments.begin(), selected_segments.end(), begin_segment); return selected_segments; } diff --git a/cpp/benchmarks/io/orc/orc_reader_options.cpp b/cpp/benchmarks/io/orc/orc_reader_options.cpp index 647a411c89d..1f656f7ea70 100644 --- a/cpp/benchmarks/io/orc/orc_reader_options.cpp +++ b/cpp/benchmarks/io/orc/orc_reader_options.cpp @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -30,7 +31,7 @@ constexpr int64_t data_size = 512 << 20; // The number of separate read calls to use when reading files in multiple chunks // Each call reads roughly equal amounts of data -constexpr int32_t chunked_read_num_chunks = 8; +constexpr int32_t chunked_read_num_chunks = 4; std::vector get_top_level_col_names(cudf::io::source_info const& source) { @@ -88,7 +89,7 @@ void BM_orc_read_varying_options(nvbench::state& state, auto const num_stripes = cudf::io::read_orc_metadata(source_sink.make_source_info()).num_stripes(); - cudf::size_type const chunk_row_cnt = view.num_rows() / num_chunks; + auto const chunk_row_cnt = cudf::util::div_rounding_up_unsafe(view.num_rows(), num_chunks); auto mem_stats_logger = cudf::memory_stats_logger(); state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); @@ -99,7 +100,6 @@ void BM_orc_read_varying_options(nvbench::state& state, timer.start(); cudf::size_type rows_read = 0; for (int32_t chunk = 0; chunk < num_chunks; ++chunk) { - auto const is_last_chunk = chunk == (num_chunks - 1); switch (RowSelection) { case row_selection::ALL: break; case row_selection::STRIPES: @@ -108,7 +108,6 @@ void BM_orc_read_varying_options(nvbench::state& state, case row_selection::NROWS: read_options.set_skip_rows(chunk * chunk_row_cnt); 
read_options.set_num_rows(chunk_row_cnt); - if (is_last_chunk) read_options.set_num_rows(-1); break; default: CUDF_FAIL("Unsupported row selection method"); } @@ -132,9 +131,6 @@ using col_selections = nvbench::enum_type_list; -using row_selections = - nvbench::enum_type_list; - NVBENCH_BENCH_TYPES(BM_orc_read_varying_options, NVBENCH_TYPE_AXES(col_selections, nvbench::enum_type_list, @@ -146,6 +142,8 @@ NVBENCH_BENCH_TYPES(BM_orc_read_varying_options, {"column_selection", "row_selection", "uses_index", "uses_numpy_dtype", "timestamp_type"}) .set_min_samples(4); +using row_selections = + nvbench::enum_type_list; NVBENCH_BENCH_TYPES(BM_orc_read_varying_options, NVBENCH_TYPE_AXES(nvbench::enum_type_list, row_selections, diff --git a/cpp/benchmarks/io/parquet/parquet_reader_options.cpp b/cpp/benchmarks/io/parquet/parquet_reader_options.cpp index 4105f2182d7..9f221de7da2 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_options.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_options.cpp @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -26,21 +27,21 @@ // Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to // run on most GPUs, but large enough to allow highest throughput -constexpr std::size_t data_size = 512 << 20; -constexpr std::size_t row_group_size = 128 << 20; +constexpr std::size_t data_size = 512 << 20; +// The number of separate read calls to use when reading files in multiple chunks +// Each call reads roughly equal amounts of data +constexpr int32_t chunked_read_num_chunks = 4; std::vector get_top_level_col_names(cudf::io::source_info const& source) { - cudf::io::parquet_reader_options const read_options = - cudf::io::parquet_reader_options::builder(source); - auto const schema = cudf::io::read_parquet(read_options).metadata.schema_info; - - std::vector names; - names.reserve(schema.size()); - std::transform(schema.cbegin(), schema.cend(), std::back_inserter(names), [](auto const& c) { - return c.name; - }); - return names; + auto const top_lvl_cols = cudf::io::read_parquet_metadata(source).schema().root().children(); + std::vector col_names; + std::transform(top_lvl_cols.cbegin(), + top_lvl_cols.cend(), + std::back_inserter(col_names), + [](auto const& col_meta) { return col_meta.name(); }); + + return col_names; } template , nvbench::enum_type>) { + auto const num_chunks = RowSelection == row_selection::ALL ? 
1 : chunked_read_num_chunks; + auto constexpr str_to_categories = ConvertsStrings == converts_strings::YES; auto constexpr uses_pd_metadata = UsesPandasMetadata == uses_pandas_metadata::YES; @@ -87,9 +90,8 @@ void BM_parquet_read_options(nvbench::state& state, .use_pandas_metadata(uses_pd_metadata) .timestamp_type(ts_type); - // TODO: add read_parquet_metadata to properly calculate #row_groups - auto constexpr num_row_groups = data_size / row_group_size; - auto constexpr num_chunks = 1; + auto const num_row_groups = read_parquet_metadata(source_sink.make_source_info()).num_rowgroups(); + auto const chunk_row_cnt = cudf::util::div_rounding_up_unsafe(view.num_rows(), num_chunks); auto mem_stats_logger = cudf::memory_stats_logger(); state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); @@ -100,18 +102,15 @@ void BM_parquet_read_options(nvbench::state& state, timer.start(); cudf::size_type rows_read = 0; for (int32_t chunk = 0; chunk < num_chunks; ++chunk) { - auto const is_last_chunk = chunk == (num_chunks - 1); switch (RowSelection) { case row_selection::ALL: break; case row_selection::ROW_GROUPS: { - auto row_groups_to_read = segments_in_chunk(num_row_groups, num_chunks, chunk); - if (is_last_chunk) { - // Need to assume that an additional "overflow" row group is present - row_groups_to_read.push_back(num_row_groups); - } - read_options.set_row_groups({row_groups_to_read}); + read_options.set_row_groups({segments_in_chunk(num_row_groups, num_chunks, chunk)}); } break; - case row_selection::NROWS: [[fallthrough]]; + case row_selection::NROWS: + read_options.set_skip_rows(chunk * chunk_row_cnt); + read_options.set_num_rows(chunk_row_cnt); + break; default: CUDF_FAIL("Unsupported row selection method"); } @@ -130,14 +129,26 @@ void BM_parquet_read_options(nvbench::state& state, state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size"); } +using row_selections = + nvbench::enum_type_list; +NVBENCH_BENCH_TYPES(BM_parquet_read_options, + NVBENCH_TYPE_AXES(nvbench::enum_type_list, + row_selections, + nvbench::enum_type_list, + nvbench::enum_type_list, + nvbench::enum_type_list)) + .set_name("parquet_read_row_selection") + .set_type_axes_names({"column_selection", + "row_selection", + "str_to_categories", + "uses_pandas_metadata", + "timestamp_type"}) + .set_min_samples(4); + using col_selections = nvbench::enum_type_list; - -// TODO: row_selection::ROW_GROUPS disabled until we add an API to read metadata from a parquet file -// and determine num row groups. https://github.com/rapidsai/cudf/pull/9963#issuecomment-1004832863 - NVBENCH_BENCH_TYPES(BM_parquet_read_options, NVBENCH_TYPE_AXES(col_selections, nvbench::enum_type_list, From f3402c402c2d0be54a6f2060e1bd74e284c1e687 Mon Sep 17 00:00:00 2001 From: Suraj Aralihalli Date: Mon, 25 Sep 2023 14:10:44 -0700 Subject: [PATCH 06/29] Add stream parameter to external dict APIs (#14115) This PR adds stream parameter to public dictionary APIs, which include: 1. `cudf::dictionary::encode` 2. `cudf::dictionary::decode` 3. `cudf::dictionary::get_index` 4. `cudf::dictionary::add_keys` 5. `cudf::dictionary::remove_keys` 6. `cudf::dictionary::remove_unused_keys` 7. `cudf::dictionary::set_keys` 8. 
`cudf::dictionary::match_dictionaries` Reference [13744](https://github.com/rapidsai/cudf/issues/13744) Authors: - Suraj Aralihalli (https://github.com/SurajAralihalli) - Yunsong Wang (https://github.com/PointKernel) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14115 --- cpp/include/cudf/dictionary/encode.hpp | 6 +- cpp/include/cudf/dictionary/search.hpp | 6 +- cpp/include/cudf/dictionary/update_keys.hpp | 16 ++- cpp/include/cudf_test/column_wrapper.hpp | 18 +++- cpp/src/dictionary/add_keys.cu | 3 +- cpp/src/dictionary/decode.cu | 5 +- cpp/src/dictionary/encode.cu | 5 +- cpp/src/dictionary/remove_keys.cu | 6 +- cpp/src/dictionary/search.cu | 11 +- cpp/src/dictionary/set_keys.cu | 9 +- cpp/tests/CMakeLists.txt | 1 + cpp/tests/streams/dictionary_test.cpp | 105 ++++++++++++++++++++ 12 files changed, 164 insertions(+), 27 deletions(-) create mode 100644 cpp/tests/streams/dictionary_test.cpp diff --git a/cpp/include/cudf/dictionary/encode.hpp b/cpp/include/cudf/dictionary/encode.hpp index fb13eabe11a..959b785bf87 100644 --- a/cpp/include/cudf/dictionary/encode.hpp +++ b/cpp/include/cudf/dictionary/encode.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -53,12 +53,14 @@ namespace dictionary { * * @param column The column to dictionary encode * @param indices_type The integer type to use for the indices + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return Returns a dictionary column */ std::unique_ptr encode( column_view const& column, data_type indices_type = data_type{type_id::UINT32}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -72,11 +74,13 @@ std::unique_ptr encode( * @endcode * * @param dictionary_column Existing dictionary column + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New column with type matching the dictionary_column's keys */ std::unique_ptr decode( dictionary_column_view const& dictionary_column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/dictionary/search.hpp b/cpp/include/cudf/dictionary/search.hpp index ed7a9c84693..1b72cf42acd 100644 --- a/cpp/include/cudf/dictionary/search.hpp +++ b/cpp/include/cudf/dictionary/search.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -37,12 +37,14 @@ namespace dictionary { * * @param dictionary The dictionary to search for the key. * @param key The value to search for in the dictionary keyset. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned scalar's device memory. 
- * @return Numeric scalar index value of the key within the dictionary + * @return Numeric scalar index value of the key within the dictionary. */ std::unique_ptr get_index( dictionary_column_view const& dictionary, scalar const& key, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/dictionary/update_keys.hpp b/cpp/include/cudf/dictionary/update_keys.hpp index 2fcfb5e1f7c..81728e1ff73 100644 --- a/cpp/include/cudf/dictionary/update_keys.hpp +++ b/cpp/include/cudf/dictionary/update_keys.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -51,13 +51,15 @@ namespace dictionary { * @throw cudf_logic_error if the new_keys contain nulls. * * @param dictionary_column Existing dictionary column. - * @param new_keys New keys to incorporate into the dictionary_column + * @param new_keys New keys to incorporate into the dictionary_column. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New dictionary column. */ std::unique_ptr add_keys( dictionary_column_view const& dictionary_column, column_view const& new_keys, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -81,13 +83,15 @@ std::unique_ptr add_keys( * @throw cudf_logic_error if the keys_to_remove contain nulls. * * @param dictionary_column Existing dictionary column. - * @param keys_to_remove The keys to remove from the dictionary_column + * @param keys_to_remove The keys to remove from the dictionary_column. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New dictionary column. */ std::unique_ptr remove_keys( dictionary_column_view const& dictionary_column, column_view const& keys_to_remove, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -103,11 +107,13 @@ std::unique_ptr remove_keys( * @endcode * * @param dictionary_column Existing dictionary column. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New dictionary column. */ std::unique_ptr remove_unused_keys( dictionary_column_view const& dictionary_column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -134,12 +140,14 @@ std::unique_ptr remove_unused_keys( * * @param dictionary_column Existing dictionary column. * @param keys New keys to use for the output column. Must not contain nulls. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New dictionary column. 
*/ std::unique_ptr set_keys( dictionary_column_view const& dictionary_column, column_view const& keys, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -149,11 +157,13 @@ std::unique_ptr set_keys( * The result is a vector of new dictionaries with a common set of keys. * * @param input Dictionary columns to match keys. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New dictionary columns. */ std::vector> match_dictionaries( cudf::host_span input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index cc8cac35ef4..c0932b81dc3 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -944,8 +944,10 @@ class dictionary_column_wrapper : public detail::column_wrapper { template dictionary_column_wrapper(InputIterator begin, InputIterator end) : column_wrapper{} { - wrapped = cudf::dictionary::encode( - fixed_width_column_wrapper(begin, end)); + wrapped = + cudf::dictionary::encode(fixed_width_column_wrapper(begin, end), + cudf::data_type{type_id::UINT32}, + cudf::test::get_default_stream()); } /** @@ -978,7 +980,9 @@ class dictionary_column_wrapper : public detail::column_wrapper { : column_wrapper{} { wrapped = cudf::dictionary::encode( - fixed_width_column_wrapper(begin, end, v)); + fixed_width_column_wrapper(begin, end, v), + cudf::data_type{type_id::UINT32}, + cudf::test::get_default_stream()); } /** @@ -1134,7 +1138,9 @@ class dictionary_column_wrapper : public detail::column_wrapper { template dictionary_column_wrapper(StringsIterator begin, StringsIterator end) : column_wrapper{} { - wrapped = cudf::dictionary::encode(strings_column_wrapper(begin, end)); + wrapped = cudf::dictionary::encode(strings_column_wrapper(begin, end), + cudf::data_type{type_id::UINT32}, + cudf::test::get_default_stream()); } /** @@ -1169,7 +1175,9 @@ class dictionary_column_wrapper : public detail::column_wrapper { dictionary_column_wrapper(StringsIterator begin, StringsIterator end, ValidityIterator v) : column_wrapper{} { - wrapped = cudf::dictionary::encode(strings_column_wrapper(begin, end, v)); + wrapped = cudf::dictionary::encode(strings_column_wrapper(begin, end, v), + cudf::data_type{type_id::UINT32}, + cudf::test::get_default_stream()); } /** diff --git a/cpp/src/dictionary/add_keys.cu b/cpp/src/dictionary/add_keys.cu index ab22c07e4d5..3973100aced 100644 --- a/cpp/src/dictionary/add_keys.cu +++ b/cpp/src/dictionary/add_keys.cu @@ -130,10 +130,11 @@ std::unique_ptr add_keys(dictionary_column_view const& dictionary_column std::unique_ptr add_keys(dictionary_column_view const& dictionary_column, column_view const& keys, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::add_keys(dictionary_column, keys, cudf::get_default_stream(), mr); + return detail::add_keys(dictionary_column, keys, stream, mr); } } // namespace dictionary diff --git a/cpp/src/dictionary/decode.cu b/cpp/src/dictionary/decode.cu index 01411d06b62..fdf546b5875 100644 --- a/cpp/src/dictionary/decode.cu +++ b/cpp/src/dictionary/decode.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA 
CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -65,10 +65,11 @@ std::unique_ptr decode(dictionary_column_view const& source, } // namespace detail std::unique_ptr decode(dictionary_column_view const& source, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::decode(source, cudf::get_default_stream(), mr); + return detail::decode(source, stream, mr); } } // namespace dictionary diff --git a/cpp/src/dictionary/encode.cu b/cpp/src/dictionary/encode.cu index fe8e777b694..c92b57f0cac 100644 --- a/cpp/src/dictionary/encode.cu +++ b/cpp/src/dictionary/encode.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -89,10 +89,11 @@ data_type get_indices_type_for_size(size_type keys_size) std::unique_ptr encode(column_view const& input_column, data_type indices_type, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::encode(input_column, indices_type, cudf::get_default_stream(), mr); + return detail::encode(input_column, indices_type, stream, mr); } } // namespace dictionary diff --git a/cpp/src/dictionary/remove_keys.cu b/cpp/src/dictionary/remove_keys.cu index 9fe4a63373b..86b70f1119b 100644 --- a/cpp/src/dictionary/remove_keys.cu +++ b/cpp/src/dictionary/remove_keys.cu @@ -195,17 +195,19 @@ std::unique_ptr remove_unused_keys(dictionary_column_view const& diction std::unique_ptr remove_keys(dictionary_column_view const& dictionary_column, column_view const& keys_to_remove, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::remove_keys(dictionary_column, keys_to_remove, cudf::get_default_stream(), mr); + return detail::remove_keys(dictionary_column, keys_to_remove, stream, mr); } std::unique_ptr remove_unused_keys(dictionary_column_view const& dictionary_column, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::remove_unused_keys(dictionary_column, cudf::get_default_stream(), mr); + return detail::remove_unused_keys(dictionary_column, stream, mr); } } // namespace dictionary diff --git a/cpp/src/dictionary/search.cu b/cpp/src/dictionary/search.cu index 8e97a387780..e35aded1984 100644 --- a/cpp/src/dictionary/search.cu +++ b/cpp/src/dictionary/search.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -79,10 +79,8 @@ struct find_index_fn { using ScalarType = cudf::scalar_type_t; auto find_key = static_cast(key).value(stream); auto keys_view = column_device_view::create(input.keys(), stream); - auto iter = thrust::equal_range(rmm::exec_policy(cudf::get_default_stream()), - keys_view->begin(), - keys_view->end(), - find_key); + auto iter = thrust::equal_range( + rmm::exec_policy(stream), keys_view->begin(), keys_view->end(), find_key); return type_dispatcher(input.indices().type(), dispatch_scalar_index{}, thrust::distance(keys_view->begin(), iter.first), @@ -176,10 +174,11 @@ std::unique_ptr get_insert_index(dictionary_column_view const& dictionar std::unique_ptr get_index(dictionary_column_view const& dictionary, scalar const& key, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::get_index(dictionary, key, cudf::get_default_stream(), mr); + return detail::get_index(dictionary, key, stream, mr); } } // namespace dictionary diff --git a/cpp/src/dictionary/set_keys.cu b/cpp/src/dictionary/set_keys.cu index 36f5021d305..b49cf7850b1 100644 --- a/cpp/src/dictionary/set_keys.cu +++ b/cpp/src/dictionary/set_keys.cu @@ -241,17 +241,20 @@ std::pair>, std::vector> match_d std::unique_ptr set_keys(dictionary_column_view const& dictionary_column, column_view const& keys, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::set_keys(dictionary_column, keys, cudf::get_default_stream(), mr); + return detail::set_keys(dictionary_column, keys, stream, mr); } std::vector> match_dictionaries( - cudf::host_span input, rmm::mr::device_memory_resource* mr) + cudf::host_span input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::match_dictionaries(input, cudf::get_default_stream(), mr); + return detail::match_dictionaries(input, stream, mr); } } // namespace dictionary diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 956bfc7c27d..68ff6c54c99 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -629,6 +629,7 @@ ConfigureTest(STREAM_HASHING_TEST streams/hash_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_INTEROP_TEST streams/interop_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_DICTIONARY_TEST streams/dictionary_test.cpp STREAM_MODE testing) ConfigureTest( STREAM_STRINGS_TEST streams/strings/case_test.cpp streams/strings/find_test.cpp STREAM_MODE testing diff --git a/cpp/tests/streams/dictionary_test.cpp b/cpp/tests/streams/dictionary_test.cpp new file mode 100644 index 00000000000..f48e64c078e --- /dev/null +++ b/cpp/tests/streams/dictionary_test.cpp @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +class DictionaryTest : public cudf::test::BaseFixture {}; + +TEST_F(DictionaryTest, Encode) +{ + cudf::test::fixed_width_column_wrapper col({1, 2, 3, 4, 5}); + cudf::data_type int32_type(cudf::type_id::UINT32); + cudf::column_view col_view = col; + cudf::dictionary::encode(col_view, int32_type, cudf::test::get_default_stream()); +} + +TEST_F(DictionaryTest, Decode) +{ + // keys = {0, 2, 6}, indices = {0, 1, 1, 2, 2} + std::vector elements{0, 2, 2, 6, 6}; + cudf::test::dictionary_column_wrapper dict_col(elements.begin(), elements.end()); + cudf::dictionary_column_view dict_col_view = dict_col; + cudf::dictionary::decode(dict_col_view, cudf::test::get_default_stream()); +} + +TEST_F(DictionaryTest, GetIndex) +{ + std::vector elements{0, 2, 2, 6, 6}; + cudf::test::dictionary_column_wrapper dict_col(elements.begin(), elements.end()); + cudf::dictionary_column_view dict_col_view = dict_col; + cudf::numeric_scalar key_scalar(2, true, cudf::test::get_default_stream()); + cudf::dictionary::get_index(dict_col_view, key_scalar, cudf::test::get_default_stream()); +} + +TEST_F(DictionaryTest, AddKeys) +{ + std::vector elements{0, 2, 2, 6, 6}; + cudf::test::dictionary_column_wrapper dict_col(elements.begin(), elements.end()); + cudf::dictionary_column_view dict_col_view = dict_col; + cudf::test::fixed_width_column_wrapper new_keys_col({8, 9}); + cudf::dictionary::add_keys(dict_col_view, new_keys_col, cudf::test::get_default_stream()); +} + +TEST_F(DictionaryTest, RemoveKeys) +{ + std::vector elements{0, 2, 2, 6, 6}; + cudf::test::dictionary_column_wrapper dict_col(elements.begin(), elements.end()); + cudf::dictionary_column_view dict_col_view = dict_col; + cudf::test::fixed_width_column_wrapper keys_to_remove_col({2}); + cudf::dictionary::remove_keys( + dict_col_view, keys_to_remove_col, cudf::test::get_default_stream()); +} + +TEST_F(DictionaryTest, RemoveUnsedKeys) +{ + std::vector elements{0, 2, 2, 6, 6}; + cudf::test::dictionary_column_wrapper dict_col(elements.begin(), elements.end()); + cudf::dictionary_column_view dict_col_view = dict_col; + cudf::dictionary::remove_unused_keys(dict_col_view, cudf::test::get_default_stream()); +} + +TEST_F(DictionaryTest, SetKeys) +{ + std::vector elements{0, 2, 2, 6, 6}; + cudf::test::dictionary_column_wrapper dict_col(elements.begin(), elements.end()); + cudf::dictionary_column_view dict_col_view = dict_col; + cudf::test::fixed_width_column_wrapper keys_col({2, 6}); + cudf::dictionary::set_keys(dict_col_view, keys_col, cudf::test::get_default_stream()); +} + +TEST_F(DictionaryTest, MatchDictionaries) +{ + std::vector elements_a{0, 2, 2, 6, 6}; + cudf::test::dictionary_column_wrapper dict_col_a(elements_a.begin(), elements_a.end()); + cudf::dictionary_column_view dict_col_view_a = dict_col_a; + + std::vector elements_b{1, 3, 4, 5, 5}; + cudf::test::dictionary_column_wrapper dict_col_b(elements_b.begin(), elements_b.end()); + cudf::dictionary_column_view dict_col_view_b = dict_col_b; + + std::vector dicts = {dict_col_view_a, dict_col_view_b}; + + cudf::test::fixed_width_column_wrapper keys_col({2, 6}); + cudf::dictionary::match_dictionaries(dicts, cudf::test::get_default_stream()); +} From 2e1a17d6519ea018921e35075306e01b4fdddf72 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 25 Sep 2023 15:53:55 -0700 Subject: [PATCH 07/29] Replace Python scalar conversions with libcudf (#14124) This PR replaces the various Cython converters for different libcudf scalar 
types by using the new libcudf `[to|from]_arrow` overloads for scalars introduced in #14121. This change dramatically simplifies the Cython code and paves the way for implementation of a pylibcudf.Scalar object. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/14124 --- python/cudf/cudf/_lib/cpp/interop.pxd | 11 +- python/cudf/cudf/_lib/interop.pyx | 95 +++++- python/cudf/cudf/_lib/scalar.pyx | 448 +++++--------------------- python/cudf/cudf/tests/test_list.py | 4 +- python/cudf/cudf/tests/test_struct.py | 35 +- python/cudf/cudf/utils/dtypes.py | 18 -- 6 files changed, 210 insertions(+), 401 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/interop.pxd b/python/cudf/cudf/_lib/cpp/interop.pxd index e81f0d617fb..88e9d83ee98 100644 --- a/python/cudf/cudf/_lib/cpp/interop.pxd +++ b/python/cudf/cudf/_lib/cpp/interop.pxd @@ -1,12 +1,13 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.string cimport string from libcpp.vector cimport vector -from pyarrow.lib cimport CTable +from pyarrow.lib cimport CScalar, CTable from cudf._lib.types import cudf_to_np_types, np_to_cudf_types +from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view @@ -24,6 +25,7 @@ cdef extern from "cudf/interop.hpp" namespace "cudf" \ ) except + cdef unique_ptr[table] from_arrow(CTable input) except + + cdef unique_ptr[scalar] from_arrow(CScalar input) except + cdef cppclass column_metadata: column_metadata() except + @@ -35,3 +37,8 @@ cdef extern from "cudf/interop.hpp" namespace "cudf" \ table_view input, vector[column_metadata] metadata, ) except + + + cdef shared_ptr[CScalar] to_arrow( + const scalar& input, + column_metadata metadata, + ) except + diff --git a/python/cudf/cudf/_lib/interop.pyx b/python/cudf/cudf/_lib/interop.pyx index 8fd2a409d90..639754fc54f 100644 --- a/python/cudf/cudf/_lib/interop.pyx +++ b/python/cudf/cudf/_lib/interop.pyx @@ -4,7 +4,14 @@ from cpython cimport pycapsule from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector -from pyarrow.lib cimport CTable, pyarrow_unwrap_table, pyarrow_wrap_table +from pyarrow.lib cimport ( + CScalar, + CTable, + pyarrow_unwrap_scalar, + pyarrow_unwrap_table, + pyarrow_wrap_scalar, + pyarrow_wrap_table, +) from cudf._lib.cpp.interop cimport ( DLManagedTensor, @@ -14,12 +21,22 @@ from cudf._lib.cpp.interop cimport ( to_arrow as cpp_to_arrow, to_dlpack as cpp_to_dlpack, ) +from cudf._lib.cpp.scalar.scalar cimport fixed_point_scalar, scalar from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view +from cudf._lib.cpp.types cimport type_id +from cudf._lib.cpp.wrappers.decimals cimport ( + decimal32, + decimal64, + decimal128, + scale_type, +) +from cudf._lib.scalar cimport DeviceScalar from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns from cudf.api.types import is_list_dtype, is_struct_dtype from cudf.core.buffer import acquire_spill_lock +from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype def from_dlpack(dlpack_capsule): @@ -182,3 +199,79 @@ def from_arrow(object input_table): c_result = move(cpp_from_arrow(cpp_arrow_table.get()[0])) return columns_from_unique_ptr(move(c_result)) + + +@acquire_spill_lock() +def 
to_arrow_scalar(DeviceScalar source_scalar): + """Convert a scalar to a PyArrow scalar. + + Parameters + ---------- + source_scalar : the scalar to convert + + Returns + ------- + pyarrow.lib.Scalar + """ + cdef vector[column_metadata] cpp_metadata = gather_metadata( + [("", source_scalar.dtype)] + ) + cdef const scalar* source_scalar_ptr = source_scalar.get_raw_ptr() + + cdef shared_ptr[CScalar] cpp_arrow_scalar + with nogil: + cpp_arrow_scalar = cpp_to_arrow( + source_scalar_ptr[0], cpp_metadata[0] + ) + + return pyarrow_wrap_scalar(cpp_arrow_scalar) + + +@acquire_spill_lock() +def from_arrow_scalar(object input_scalar, output_dtype=None): + """Convert from PyArrow scalar to a cudf scalar. + + Parameters + ---------- + input_scalar : PyArrow scalar + output_dtype : output type to cast to, ignored except for decimals + + Returns + ------- + cudf._lib.DeviceScalar + """ + cdef shared_ptr[CScalar] cpp_arrow_scalar = ( + pyarrow_unwrap_scalar(input_scalar) + ) + cdef unique_ptr[scalar] c_result + + with nogil: + c_result = move(cpp_from_arrow(cpp_arrow_scalar.get()[0])) + + cdef type_id ctype = c_result.get().type().id() + if ctype == type_id.DECIMAL128: + if output_dtype is None: + # Decimals must be cast to the cudf dtype of the right width + raise ValueError( + "Decimal scalars must be constructed with a dtype" + ) + + if isinstance(output_dtype, Decimal32Dtype): + c_result.reset( + new fixed_point_scalar[decimal32]( + ( c_result.get()).value(), + scale_type(-input_scalar.type.scale), + c_result.get().is_valid() + ) + ) + elif isinstance(output_dtype, Decimal64Dtype): + c_result.reset( + new fixed_point_scalar[decimal64]( + ( c_result.get()).value(), + scale_type(-input_scalar.type.scale), + c_result.get().is_valid() + ) + ) + # Decimal128Dtype is a no-op, no conversion needed. 
+ + return DeviceScalar.from_unique_ptr(move(c_result), output_dtype) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 0407785b2d8..5ab286c5701 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -2,22 +2,13 @@ cimport cython -import decimal +import copy import numpy as np import pandas as pd import pyarrow as pa -from libc.stdint cimport ( - int8_t, - int16_t, - int32_t, - int64_t, - uint8_t, - uint16_t, - uint32_t, - uint64_t, -) +from libc.stdint cimport int64_t from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -25,38 +16,22 @@ from libcpp.utility cimport move from rmm._lib.memory_resource cimport get_current_device_resource import cudf -from cudf._lib.types import ( - LIBCUDF_TO_SUPPORTED_NUMPY_TYPES, - datetime_unit_map, - duration_unit_map, -) +from cudf._lib.types import LIBCUDF_TO_SUPPORTED_NUMPY_TYPES from cudf.core.dtypes import ListDtype, StructDtype from cudf.core.missing import NA, NaT -from cudf._lib.column cimport Column -from cudf._lib.cpp.column.column_view cimport column_view -from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id -from cudf._lib.interop import from_arrow, to_arrow +from cudf._lib.interop import from_arrow_scalar, to_arrow_scalar cimport cudf._lib.cpp.types as libcudf_types from cudf._lib.cpp.scalar.scalar cimport ( duration_scalar, - fixed_point_scalar, list_scalar, - numeric_scalar, scalar, - string_scalar, struct_scalar, timestamp_scalar, ) -from cudf._lib.cpp.wrappers.decimals cimport ( - decimal32, - decimal64, - decimal128, - scale_type, -) from cudf._lib.cpp.wrappers.durations cimport ( duration_ms, duration_ns, @@ -69,7 +44,21 @@ from cudf._lib.cpp.wrappers.timestamps cimport ( timestamp_s, timestamp_us, ) -from cudf._lib.utils cimport columns_from_table_view, table_view_from_columns + + +def _replace_nested(obj, check, replacement): + if isinstance(obj, list): + for i, item in enumerate(obj): + if check(item): + obj[i] = replacement + elif isinstance(item, (dict, list)): + _replace_nested(item, check, replacement) + elif isinstance(obj, dict): + for k, v in obj.items(): + if check(v): + obj[k] = replacement + elif isinstance(v, (dict, list)): + _replace_nested(v, check, replacement) # The DeviceMemoryResource attribute could be released prematurely @@ -97,61 +86,61 @@ cdef class DeviceScalar: A NumPy dtype. """ self._dtype = dtype if dtype.kind != 'U' else cudf.dtype('object') - self._set_value(value, self._dtype) - - def _set_value(self, value, dtype): - # IMPORTANT: this should only ever be called from __init__ - valid = not _is_null_host_scalar(value) - - if isinstance(dtype, cudf.core.dtypes.DecimalDtype): - _set_decimal_from_scalar( - self.c_value, value, dtype, valid) - elif isinstance(dtype, cudf.ListDtype): - _set_list_from_pylist( - self.c_value, value, dtype, valid) - elif isinstance(dtype, cudf.StructDtype): - _set_struct_from_pydict(self.c_value, value, dtype, valid) + + if cudf.utils.utils.is_na_like(value): + value = None + else: + # TODO: For now we always deepcopy the input value to avoid + # overwriting the input values when replacing nulls. Since it's + # just host values it's not that expensive, but we could consider + # alternatives. 
+ value = copy.deepcopy(value) + _replace_nested(value, cudf.utils.utils.is_na_like, None) + + if isinstance(dtype, cudf.core.dtypes._BaseDtype): + pa_type = dtype.to_arrow() elif pd.api.types.is_string_dtype(dtype): - _set_string_from_np_string(self.c_value, value, valid) - elif pd.api.types.is_numeric_dtype(dtype): - _set_numeric_from_np_scalar(self.c_value, - value, - dtype, - valid) - elif pd.api.types.is_datetime64_dtype(dtype): - _set_datetime64_from_np_scalar( - self.c_value, value, dtype, valid - ) - elif pd.api.types.is_timedelta64_dtype(dtype): - _set_timedelta64_from_np_scalar( - self.c_value, value, dtype, valid - ) + # Have to manually convert object types, which we use internally + # for strings but pyarrow only supports as unicode 'U' + pa_type = pa.string() else: - raise ValueError( - f"Cannot convert value of type " - f"{type(value).__name__} to cudf scalar" - ) + pa_type = pa.from_numpy_dtype(dtype) + + pa_scalar = pa.scalar(value, type=pa_type) + + # Note: This factory-like behavior in __init__ will be removed when + # migrating to pylibcudf. + cdef DeviceScalar obj = from_arrow_scalar(pa_scalar, self._dtype) + self.c_value.swap(obj.c_value) def _to_host_scalar(self): - if isinstance(self.dtype, cudf.core.dtypes.DecimalDtype): - result = _get_py_decimal_from_fixed_point(self.c_value) - elif cudf.api.types.is_struct_dtype(self.dtype): - result = _get_py_dict_from_struct(self.c_value, self.dtype) - elif cudf.api.types.is_list_dtype(self.dtype): - result = _get_py_list_from_list(self.c_value, self.dtype) - elif pd.api.types.is_string_dtype(self.dtype): - result = _get_py_string_from_string(self.c_value) - elif pd.api.types.is_numeric_dtype(self.dtype): - result = _get_np_scalar_from_numeric(self.c_value) - elif pd.api.types.is_datetime64_dtype(self.dtype): - result = _get_np_scalar_from_timestamp64(self.c_value) - elif pd.api.types.is_timedelta64_dtype(self.dtype): - result = _get_np_scalar_from_timedelta64(self.c_value) + is_datetime = self.dtype.kind == "M" + is_timedelta = self.dtype.kind == "m" + + null_type = NaT if is_datetime or is_timedelta else NA + + ps = to_arrow_scalar(self) + if not ps.is_valid: + return null_type + + # TODO: The special handling of specific types below does not currently + # extend to nested types containing those types (e.g. List[timedelta] + # where the timedelta would overflow). We should eventually account for + # those cases, but that will require more careful consideration of how + # to traverse the contents of the nested data. 
+ if is_datetime or is_timedelta: + time_unit, _ = np.datetime_data(self.dtype) + # Cast to int64 to avoid overflow + ps_cast = ps.cast('int64').as_py() + out_type = np.datetime64 if is_datetime else np.timedelta64 + ret = out_type(ps_cast, time_unit) + elif cudf.api.types.is_numeric_dtype(self.dtype): + ret = ps.type.to_pandas_dtype()(ps.as_py()) else: - raise ValueError( - "Could not convert cudf::scalar to a Python value" - ) - return result + ret = ps.as_py() + + _replace_nested(ret, lambda item: item is None, NA) + return ret @property def dtype(self): @@ -236,42 +225,9 @@ cdef class DeviceScalar: return s -cdef _set_string_from_np_string(unique_ptr[scalar]& s, value, bool valid=True): - value = value if valid else "" - s.reset(new string_scalar(value.encode(), valid)) - - -cdef _set_numeric_from_np_scalar(unique_ptr[scalar]& s, - object value, - object dtype, - bool valid=True): - value = value if valid else 0 - if dtype == "int8": - s.reset(new numeric_scalar[int8_t](value, valid)) - elif dtype == "int16": - s.reset(new numeric_scalar[int16_t](value, valid)) - elif dtype == "int32": - s.reset(new numeric_scalar[int32_t](value, valid)) - elif dtype == "int64": - s.reset(new numeric_scalar[int64_t](value, valid)) - elif dtype == "uint8": - s.reset(new numeric_scalar[uint8_t](value, valid)) - elif dtype == "uint16": - s.reset(new numeric_scalar[uint16_t](value, valid)) - elif dtype == "uint32": - s.reset(new numeric_scalar[uint32_t](value, valid)) - elif dtype == "uint64": - s.reset(new numeric_scalar[uint64_t](value, valid)) - elif dtype == "float32": - s.reset(new numeric_scalar[float](value, valid)) - elif dtype == "float64": - s.reset(new numeric_scalar[double](value, valid)) - elif dtype == "bool": - s.reset(new numeric_scalar[bool](value, valid)) - else: - raise ValueError(f"dtype not supported: {dtype}") - - +# TODO: Currently the only uses of this function and the one below are in +# _create_proxy_nat_scalar. See if that code path can be simplified to excise +# or at least simplify these implementations. 
cdef _set_datetime64_from_np_scalar(unique_ptr[scalar]& s, object value, object dtype, @@ -324,253 +280,6 @@ cdef _set_timedelta64_from_np_scalar(unique_ptr[scalar]& s, else: raise ValueError(f"dtype not supported: {dtype}") -cdef _set_decimal_from_scalar(unique_ptr[scalar]& s, - object value, - object dtype, - bool valid=True): - value = cudf.utils.dtypes._decimal_to_int64(value) if valid else 0 - if isinstance(dtype, cudf.Decimal64Dtype): - s.reset( - new fixed_point_scalar[decimal64]( - np.int64(value), scale_type(-dtype.scale), valid - ) - ) - elif isinstance(dtype, cudf.Decimal32Dtype): - s.reset( - new fixed_point_scalar[decimal32]( - np.int32(value), scale_type(-dtype.scale), valid - ) - ) - elif isinstance(dtype, cudf.Decimal128Dtype): - s.reset( - new fixed_point_scalar[decimal128]( - value, scale_type(-dtype.scale), valid - ) - ) - else: - raise ValueError(f"dtype not supported: {dtype}") - -cdef _set_struct_from_pydict(unique_ptr[scalar]& s, - object value, - object dtype, - bool valid=True): - arrow_schema = dtype.to_arrow() - columns = [str(i) for i in range(len(arrow_schema))] - if valid: - pyarrow_table = pa.Table.from_arrays( - [ - pa.array([value[f.name]], from_pandas=True, type=f.type) - for f in arrow_schema - ], - names=columns - ) - else: - pyarrow_table = pa.Table.from_arrays( - [ - pa.array([NA], from_pandas=True, type=f.type) - for f in arrow_schema - ], - names=columns - ) - - data = from_arrow(pyarrow_table) - cdef table_view struct_view = table_view_from_columns(data) - - s.reset( - new struct_scalar(struct_view, valid) - ) - -cdef _get_py_dict_from_struct(unique_ptr[scalar]& s, dtype): - if not s.get()[0].is_valid(): - return NA - - cdef table_view struct_table_view = (s.get()).view() - columns = columns_from_table_view(struct_table_view, None) - struct_col = cudf.core.column.build_struct_column( - names=dtype.fields.keys(), - children=tuple(columns), - size=1, - ) - table = to_arrow([struct_col], [("None", dtype)]) - python_dict = table.to_pydict()["None"][0] - return {k: _nested_na_replace([python_dict[k]])[0] for k in python_dict} - -cdef _set_list_from_pylist(unique_ptr[scalar]& s, - object value, - object dtype, - bool valid=True): - - value = value if valid else [NA] - cdef Column col - if isinstance(dtype.element_type, ListDtype): - pa_type = dtype.element_type.to_arrow() - else: - pa_type = dtype.to_arrow().value_type - col = cudf.core.column.as_column( - pa.array(value, from_pandas=True, type=pa_type) - ) - cdef column_view col_view = col.view() - s.reset( - new list_scalar(col_view, valid) - ) - - -cdef _get_py_list_from_list(unique_ptr[scalar]& s, dtype): - - if not s.get()[0].is_valid(): - return NA - - cdef column_view list_col_view = (s.get()).view() - cdef Column element_col = Column.from_column_view(list_col_view, None) - - arrow_obj = to_arrow([element_col], [("None", dtype.element_type)])["None"] - - result = arrow_obj.to_pylist() - return _nested_na_replace(result) - - -cdef _get_py_string_from_string(unique_ptr[scalar]& s): - if not s.get()[0].is_valid(): - return NA - return (s.get())[0].to_string().decode() - - -cdef _get_np_scalar_from_numeric(unique_ptr[scalar]& s): - cdef scalar* s_ptr = s.get() - if not s_ptr[0].is_valid(): - return NA - - cdef libcudf_types.data_type cdtype = s_ptr[0].type() - - if cdtype.id() == libcudf_types.type_id.INT8: - return np.int8((s_ptr)[0].value()) - elif cdtype.id() == libcudf_types.type_id.INT16: - return np.int16((s_ptr)[0].value()) - elif cdtype.id() == libcudf_types.type_id.INT32: - return 
np.int32((s_ptr)[0].value()) - elif cdtype.id() == libcudf_types.type_id.INT64: - return np.int64((s_ptr)[0].value()) - elif cdtype.id() == libcudf_types.type_id.UINT8: - return np.uint8((s_ptr)[0].value()) - elif cdtype.id() == libcudf_types.type_id.UINT16: - return np.uint16((s_ptr)[0].value()) - elif cdtype.id() == libcudf_types.type_id.UINT32: - return np.uint32((s_ptr)[0].value()) - elif cdtype.id() == libcudf_types.type_id.UINT64: - return np.uint64((s_ptr)[0].value()) - elif cdtype.id() == libcudf_types.type_id.FLOAT32: - return np.float32((s_ptr)[0].value()) - elif cdtype.id() == libcudf_types.type_id.FLOAT64: - return np.float64((s_ptr)[0].value()) - elif cdtype.id() == libcudf_types.type_id.BOOL8: - return np.bool_((s_ptr)[0].value()) - else: - raise ValueError("Could not convert cudf::scalar to numpy scalar") - - -cdef _get_py_decimal_from_fixed_point(unique_ptr[scalar]& s): - cdef scalar* s_ptr = s.get() - if not s_ptr[0].is_valid(): - return NA - - cdef libcudf_types.data_type cdtype = s_ptr[0].type() - - if cdtype.id() == libcudf_types.type_id.DECIMAL64: - rep_val = int((s_ptr)[0].value()) - scale = int((s_ptr)[0].type().scale()) - return decimal.Decimal(rep_val).scaleb(scale) - elif cdtype.id() == libcudf_types.type_id.DECIMAL32: - rep_val = int((s_ptr)[0].value()) - scale = int((s_ptr)[0].type().scale()) - return decimal.Decimal(rep_val).scaleb(scale) - elif cdtype.id() == libcudf_types.type_id.DECIMAL128: - rep_val = int((s_ptr)[0].value()) - scale = int((s_ptr)[0].type().scale()) - return decimal.Decimal(rep_val).scaleb(scale) - else: - raise ValueError("Could not convert cudf::scalar to numpy scalar") - -cdef _get_np_scalar_from_timestamp64(unique_ptr[scalar]& s): - - cdef scalar* s_ptr = s.get() - - if not s_ptr[0].is_valid(): - return NaT - - cdef libcudf_types.data_type cdtype = s_ptr[0].type() - - if cdtype.id() == libcudf_types.type_id.TIMESTAMP_SECONDS: - return np.datetime64( - ( - s_ptr - )[0].ticks_since_epoch_64(), - datetime_unit_map[(cdtype.id())] - ) - elif cdtype.id() == libcudf_types.type_id.TIMESTAMP_MILLISECONDS: - return np.datetime64( - ( - s_ptr - )[0].ticks_since_epoch_64(), - datetime_unit_map[(cdtype.id())] - ) - elif cdtype.id() == libcudf_types.type_id.TIMESTAMP_MICROSECONDS: - return np.datetime64( - ( - s_ptr - )[0].ticks_since_epoch_64(), - datetime_unit_map[(cdtype.id())] - ) - elif cdtype.id() == libcudf_types.type_id.TIMESTAMP_NANOSECONDS: - return np.datetime64( - ( - s_ptr - )[0].ticks_since_epoch_64(), - datetime_unit_map[(cdtype.id())] - ) - else: - raise ValueError("Could not convert cudf::scalar to numpy scalar") - - -cdef _get_np_scalar_from_timedelta64(unique_ptr[scalar]& s): - - cdef scalar* s_ptr = s.get() - - if not s_ptr[0].is_valid(): - return NaT - - cdef libcudf_types.data_type cdtype = s_ptr[0].type() - - if cdtype.id() == libcudf_types.type_id.DURATION_SECONDS: - return np.timedelta64( - ( - s_ptr - )[0].ticks(), - duration_unit_map[(cdtype.id())] - ) - elif cdtype.id() == libcudf_types.type_id.DURATION_MILLISECONDS: - return np.timedelta64( - ( - s_ptr - )[0].ticks(), - duration_unit_map[(cdtype.id())] - ) - elif cdtype.id() == libcudf_types.type_id.DURATION_MICROSECONDS: - return np.timedelta64( - ( - s_ptr - )[0].ticks(), - duration_unit_map[(cdtype.id())] - ) - elif cdtype.id() == libcudf_types.type_id.DURATION_NANOSECONDS: - return np.timedelta64( - ( - s_ptr - )[0].ticks(), - duration_unit_map[(cdtype.id())] - ) - else: - raise ValueError("Could not convert cudf::scalar to numpy scalar") - def as_device_scalar(val, 
dtype=None): if isinstance(val, (cudf.Scalar, DeviceScalar)): @@ -607,16 +316,3 @@ def _create_proxy_nat_scalar(dtype): return result else: raise TypeError('NAT only valid for datetime and timedelta') - - -def _nested_na_replace(input_list): - ''' - Replace `None` with `cudf.NA` in the result of - `__getitem__` calls to list type columns - ''' - for idx, value in enumerate(input_list): - if isinstance(value, list): - _nested_na_replace(value) - elif value is None: - input_list[idx] = NA - return input_list diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 5dd58d8a875..ac10dd97c56 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -895,14 +895,14 @@ def test_memory_usage(): "data, idx", [ ( - [[{"f2": {"a": 100}, "f1": "a"}, {"f1": "sf12", "f2": None}]], + [[{"f2": {"a": 100}, "f1": "a"}, {"f1": "sf12", "f2": NA}]], 0, ), ( [ [ {"f2": {"a": 100, "c": 90, "f2": 10}, "f1": "a"}, - {"f1": "sf12", "f2": None}, + {"f1": "sf12", "f2": NA}, ] ], 0, diff --git a/python/cudf/cudf/tests/test_struct.py b/python/cudf/cudf/tests/test_struct.py index a3593e55b97..ce6dc587320 100644 --- a/python/cudf/cudf/tests/test_struct.py +++ b/python/cudf/cudf/tests/test_struct.py @@ -150,9 +150,7 @@ def test_struct_setitem(data, item): "data", [ {"a": 1, "b": "rapids", "c": [1, 2, 3, 4]}, - {"a": 1, "b": "rapids", "c": [1, 2, 3, 4], "d": cudf.NA}, {"a": "Hello"}, - {"b": [], "c": [1, 2, 3]}, ], ) def test_struct_scalar_host_construction(data): @@ -161,6 +159,39 @@ def test_struct_scalar_host_construction(data): assert list(slr.device_value.value.values()) == list(data.values()) +@pytest.mark.parametrize( + ("data", "dtype"), + [ + ( + {"a": 1, "b": "rapids", "c": [1, 2, 3, 4], "d": cudf.NA}, + cudf.StructDtype( + { + "a": np.dtype(np.int64), + "b": np.dtype(np.str_), + "c": cudf.ListDtype(np.dtype(np.int64)), + "d": np.dtype(np.int64), + } + ), + ), + ( + {"b": [], "c": [1, 2, 3]}, + cudf.StructDtype( + { + "b": cudf.ListDtype(np.dtype(np.int64)), + "c": cudf.ListDtype(np.dtype(np.int64)), + } + ), + ), + ], +) +def test_struct_scalar_host_construction_no_dtype_inference(data, dtype): + # cudf cannot infer the dtype of the scalar when it contains only nulls or + # is empty. + slr = cudf.Scalar(data, dtype=dtype) + assert slr.value == data + assert list(slr.device_value.value.values()) == list(data.values()) + + def test_struct_scalar_null(): slr = cudf.Scalar(cudf.NA, dtype=StructDtype) assert slr.device_value.value is cudf.NA diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 1b94db75340..73ea8e2cfc4 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -463,24 +463,6 @@ def _get_nan_for_dtype(dtype): return np.float64("nan") -def _decimal_to_int64(decimal: Decimal) -> int: - """ - Scale a Decimal such that the result is the integer - that would result from removing the decimal point. 
- - Examples - -------- - >>> _decimal_to_int64(Decimal('1.42')) - 142 - >>> _decimal_to_int64(Decimal('0.0042')) - 42 - >>> _decimal_to_int64(Decimal('-1.004201')) - -1004201 - - """ - return int(f"{decimal:0f}".replace(".", "")) - - def get_allowed_combinations_for_operator(dtype_l, dtype_r, op): error = TypeError( f"{op} not supported between {dtype_l} and {dtype_r} scalars" From daea8c8bc37ec53b7347857a3b6795bcb0ad86ff Mon Sep 17 00:00:00 2001 From: AJ Schmidt Date: Tue, 26 Sep 2023 09:11:31 -0400 Subject: [PATCH 08/29] Disable `Recently Updated` Check (#14193) This check occasionally hangs for `cudf` for unknown reasons. Upon checking the application logs, the GitHub API seems to be returning responses that aren't helpful in troubleshooting the problem. Therefore, it's probably best to just remove the check to avoid confusion. [skip ci] Authors: - AJ Schmidt (https://github.com/ajschmidt8) --- .github/ops-bot.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/ops-bot.yaml b/.github/ops-bot.yaml index 9a0b4155035..d2ca78924e1 100644 --- a/.github/ops-bot.yaml +++ b/.github/ops-bot.yaml @@ -5,4 +5,3 @@ auto_merger: true branch_checker: true label_checker: true release_drafter: true -recently_updated: true From 3196f6c36140962818aa8d12fe4fbd0dc522e31e Mon Sep 17 00:00:00 2001 From: Jake Awe <50372925+AyodeAwe@users.noreply.github.com> Date: Tue, 26 Sep 2023 11:54:18 -0500 Subject: [PATCH 09/29] update rmm tag path (#14195) PR updates the download path of the `rmm` tag used in `build_docs.sh` following the re-arrangement of the docs directories. Authors: - Jake Awe (https://github.com/AyodeAwe) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/cudf/pull/14195 --- ci/build_docs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 1ed047a500b..9149b5e6bfe 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -30,7 +30,7 @@ export RAPIDS_DOCS_DIR="$(mktemp -d)" rapids-logger "Build CPP docs" pushd cpp/doxygen -aws s3 cp s3://rapidsai-docs/librmm/${RAPIDS_VERSION_NUMBER}/html/rmm.tag . || echo "Failed to download rmm Doxygen tag" +aws s3 cp s3://rapidsai-docs/librmm/html/${RAPIDS_VERSION_NUMBER}/rmm.tag . || echo "Failed to download rmm Doxygen tag" doxygen Doxyfile mkdir -p "${RAPIDS_DOCS_DIR}/libcudf/html" mv html/* "${RAPIDS_DOCS_DIR}/libcudf/html" From a9ec350217331979359c50ea1da9457e9973f719 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 26 Sep 2023 14:32:04 -0500 Subject: [PATCH 10/29] Fix pytorch related pytest (#14198) Calling `cudf.Index([])` results in `str` dtype `Index`. This PR fixes an issue with a pytorch related pytest by explicitly passing a `float64` dtype. xref: https://github.com/rapidsai/cudf/pull/14116 Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/14198 --- python/cudf/cudf/tests/test_cuda_array_interface.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/tests/test_cuda_array_interface.py b/python/cudf/cudf/tests/test_cuda_array_interface.py index e81f4ec795a..848c77206b2 100644 --- a/python/cudf/cudf/tests/test_cuda_array_interface.py +++ b/python/cudf/cudf/tests/test_cuda_array_interface.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2023, NVIDIA CORPORATION. 
import types from contextlib import ExitStack as does_not_raise @@ -193,7 +193,7 @@ def test_cuda_array_interface_pytorch(): assert_eq(got, cudf.Series(buffer, dtype=np.bool_)) - index = cudf.Index([]) + index = cudf.Index([], dtype="float64") tensor = torch.tensor(index) got = cudf.Index(tensor) assert_eq(got, index) From 030c0f4995ec458fcfc00a4ebb3aa8bccb2b27a0 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 26 Sep 2023 12:42:12 -0700 Subject: [PATCH 11/29] Refactor `contains_table` with cuco::static_set (#14064) Contributes to #12261 This PR refactors `contains_table` to use the new `cuco::static_set` data structure. It also adds a `contains_table` benchmark to track the performance before and after this work. Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14064 --- cpp/benchmarks/CMakeLists.txt | 2 +- .../{contains.cpp => contains_scalar.cpp} | 0 cpp/benchmarks/search/contains_table.cpp | 73 ++++ cpp/include/cudf/detail/search.hpp | 2 + cpp/src/search/contains_table.cu | 319 +++++++++--------- 5 files changed, 229 insertions(+), 167 deletions(-) rename cpp/benchmarks/search/{contains.cpp => contains_scalar.cpp} (100%) create mode 100644 cpp/benchmarks/search/contains_table.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 5e7862f4b3b..cd6b3cfdc03 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -173,7 +173,7 @@ ConfigureBench(ITERATOR_BENCH iterator/iterator.cu) # ################################################################################################## # * search benchmark ------------------------------------------------------------------------------ ConfigureBench(SEARCH_BENCH search/search.cpp) -ConfigureNVBench(SEARCH_NVBENCH search/contains.cpp) +ConfigureNVBench(SEARCH_NVBENCH search/contains_scalar.cpp search/contains_table.cpp) # ################################################################################################## # * sort benchmark -------------------------------------------------------------------------------- diff --git a/cpp/benchmarks/search/contains.cpp b/cpp/benchmarks/search/contains_scalar.cpp similarity index 100% rename from cpp/benchmarks/search/contains.cpp rename to cpp/benchmarks/search/contains_scalar.cpp diff --git a/cpp/benchmarks/search/contains_table.cpp b/cpp/benchmarks/search/contains_table.cpp new file mode 100644 index 00000000000..17702d0741c --- /dev/null +++ b/cpp/benchmarks/search/contains_table.cpp @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include +#include + +#include + +#include + +auto constexpr num_unique_elements = 1000; + +template +static void nvbench_contains_table(nvbench::state& state, nvbench::type_list) +{ + auto const size = state.get_int64("table_size"); + auto const dtype = cudf::type_to_id(); + double const null_probability = state.get_float64("null_probability"); + + auto builder = data_profile_builder().null_probability(null_probability); + if (dtype == cudf::type_id::LIST) { + builder.distribution(dtype, distribution_id::UNIFORM, 0, num_unique_elements) + .distribution(cudf::type_id::INT32, distribution_id::UNIFORM, 0, num_unique_elements) + .list_depth(1); + } else { + builder.distribution(dtype, distribution_id::UNIFORM, 0, num_unique_elements); + } + + auto const haystack = create_random_table( + {dtype}, table_size_bytes{static_cast(size)}, data_profile{builder}, 0); + auto const needles = create_random_table( + {dtype}, table_size_bytes{static_cast(size)}, data_profile{builder}, 1); + + auto mem_stats_logger = cudf::memory_stats_logger(); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto const stream_view = rmm::cuda_stream_view{launch.get_stream()}; + [[maybe_unused]] auto const result = + cudf::detail::contains(haystack->view(), + needles->view(), + cudf::null_equality::EQUAL, + cudf::nan_equality::ALL_EQUAL, + stream_view, + rmm::mr::get_current_device_resource()); + }); + + state.add_buffer_size( + mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); +} + +NVBENCH_BENCH_TYPES(nvbench_contains_table, + NVBENCH_TYPE_AXES(nvbench::type_list)) + .set_name("contains_table") + .set_type_axes_names({"type"}) + .add_float64_axis("null_probability", {0.0, 0.1}) + .add_int64_axis("table_size", {10'000, 100'000, 1'000'000, 10'000'000}); diff --git a/cpp/include/cudf/detail/search.hpp b/cpp/include/cudf/detail/search.hpp index 4c4ad7834f4..4277baf3edd 100644 --- a/cpp/include/cudf/detail/search.hpp +++ b/cpp/include/cudf/detail/search.hpp @@ -81,6 +81,8 @@ std::unique_ptr contains(column_view const& haystack, * output = { false, true, true } * @endcode * + * @throws cudf::logic_error If column types of haystack and needles don't match + * * @param haystack The table containing the search space * @param needles A table of rows whose existence to check in the search space * @param compare_nulls Control whether nulls should be compared as equal or not diff --git a/cpp/src/search/contains_table.cu b/cpp/src/search/contains_table.cu index e37f0686ac3..43624ba691d 100644 --- a/cpp/src/search/contains_table.cu +++ b/cpp/src/search/contains_table.cu @@ -26,7 +26,7 @@ #include -#include +#include #include @@ -37,69 +37,59 @@ namespace { using cudf::experimental::row::lhs_index_type; using cudf::experimental::row::rhs_index_type; -using static_map = cuco::static_map>>; - /** - * @brief Check if the given type `T` is a strong index type (i.e., `lhs_index_type` or - * `rhs_index_type`). 
- *
- * @return A boolean value indicating if `T` is a strong index type
+ * @brief A hasher adapter wrapping both the haystack hasher and the needles hasher
  */
-template 
-constexpr auto is_strong_index_type()
-{
-  return std::is_same_v || std::is_same_v;
-}
+template 
+struct hasher_adapter {
+  hasher_adapter(HaystackHasher const& haystack_hasher, NeedleHasher const& needle_hasher)
+    : _haystack_hasher{haystack_hasher}, _needle_hasher{needle_hasher}
+  {
+  }
 
-/**
- * @brief An adapter functor to support strong index types for row hasher that must be operating on
- * `cudf::size_type`.
- */
-template 
-struct strong_index_hasher_adapter {
-  strong_index_hasher_adapter(Hasher const& hasher) : _hasher{hasher} {}
+  __device__ constexpr auto operator()(lhs_index_type idx) const noexcept
+  {
+    return _haystack_hasher(static_cast(idx));
+  }
 
-  template ())>
-  __device__ constexpr auto operator()(T const idx) const noexcept
+  __device__ constexpr auto operator()(rhs_index_type idx) const noexcept
   {
-    return _hasher(static_cast(idx));
+    return _needle_hasher(static_cast(idx));
   }
 
  private:
-  Hasher const _hasher;
+  HaystackHasher const _haystack_hasher;
+  NeedleHasher const _needle_hasher;
 };
 
 /**
- * @brief An adapter functor to support strong index type for table row comparator that must be
- * operating on `cudf::size_type`.
+ * @brief A comparator adapter wrapping both the self comparator and the two-table comparator
  */
-template 
-struct strong_index_comparator_adapter {
-  strong_index_comparator_adapter(Comparator const& comparator) : _comparator{comparator} {}
-
-  template () && is_strong_index_type())>
-  __device__ constexpr auto operator()(T const lhs_index, U const rhs_index) const noexcept
+template 
+struct comparator_adapter {
+  comparator_adapter(SelfEqual const& self_equal, TwoTableEqual const& two_table_equal)
+    : _self_equal{self_equal}, _two_table_equal{two_table_equal}
+  {
+  }
+
+  __device__ constexpr auto operator()(lhs_index_type lhs_index,
+                                       lhs_index_type rhs_index) const noexcept
   {
     auto const lhs = static_cast(lhs_index);
     auto const rhs = static_cast(rhs_index);
 
-    if constexpr (std::is_same_v || std::is_same_v) {
-      return _comparator(lhs, rhs);
-    } else {
-      // Here we have T == rhs_index_type.
-      // This is when the indices are provided in wrong order for two table comparator, so we need
-      // to switch them back to the right order before calling the underlying comparator.
-      return _comparator(rhs, lhs);
-    }
+    return _self_equal(lhs, rhs);
+  }
+
+  __device__ constexpr auto operator()(lhs_index_type lhs_index,
+                                       rhs_index_type rhs_index) const noexcept
+  {
+    return _two_table_equal(lhs_index, rhs_index);
   }
 
  private:
-  Comparator const _comparator;
+  SelfEqual const _self_equal;
+  TwoTableEqual const _two_table_equal;
 };
 
 /**
@@ -134,38 +124,62 @@ std::pair build_row_bitmask(table_view
 }
 
 /**
- * @brief Invoke an `operator()` template with a row equality comparator based on the specified
- * `compare_nans` parameter.
+ * @brief Invokes the given `func` with desired comparators based on the specified `compare_nans` + * parameter + * + * @tparam HasNested Flag indicating whether there are nested columns in haystack or needles + * @tparam Hasher Type of device hash function + * @tparam Func Type of the helper function doing `contains` check * - * @param compare_nans The flag to specify whether NaNs should be compared equal or not + * @param compare_nulls Control whether nulls should be compared as equal or not + * @param compare_nans Control whether floating-point NaNs values should be compared as equal or not + * @param haystack_has_nulls Flag indicating whether haystack has nulls or not + * @param has_any_nulls Flag indicating whether there are nested nulls is either haystack or needles + * @param self_equal Self table comparator + * @param two_table_equal Two table comparator + * @param d_hasher Device hash functor * @param func The input functor to invoke */ -template -void dispatch_nan_comparator(nan_equality compare_nans, Func&& func) +template +void dispatch_nan_comparator( + null_equality compare_nulls, + nan_equality compare_nans, + bool haystack_has_nulls, + bool has_any_nulls, + cudf::experimental::row::equality::self_comparator self_equal, + cudf::experimental::row::equality::two_table_comparator two_table_equal, + Hasher const& d_hasher, + Func&& func) { + // Distinguish probing scheme CG sizes between nested and flat types for better performance + auto const probing_scheme = [&]() { + if constexpr (HasNested) { + return cuco::experimental::linear_probing<4, Hasher>{d_hasher}; + } else { + return cuco::experimental::linear_probing<1, Hasher>{d_hasher}; + } + }(); + if (compare_nans == nan_equality::ALL_EQUAL) { using nan_equal_comparator = cudf::experimental::row::equality::nan_equal_physical_equality_comparator; - func(nan_equal_comparator{}); + auto const d_self_equal = self_equal.equal_to( + nullate::DYNAMIC{haystack_has_nulls}, compare_nulls, nan_equal_comparator{}); + auto const d_two_table_equal = two_table_equal.equal_to( + nullate::DYNAMIC{has_any_nulls}, compare_nulls, nan_equal_comparator{}); + func(d_self_equal, d_two_table_equal, probing_scheme); } else { using nan_unequal_comparator = cudf::experimental::row::equality::physical_equality_comparator; - func(nan_unequal_comparator{}); + auto const d_self_equal = self_equal.equal_to( + nullate::DYNAMIC{haystack_has_nulls}, compare_nulls, nan_unequal_comparator{}); + auto const d_two_table_equal = two_table_equal.equal_to( + nullate::DYNAMIC{has_any_nulls}, compare_nulls, nan_unequal_comparator{}); + func(d_self_equal, d_two_table_equal, probing_scheme); } } } // namespace -/** - * @brief Check if rows in the given `needles` table exist in the `haystack` table. 
- * - * @param haystack The table containing the search space - * @param needles A table of rows whose existence to check in the search space - * @param compare_nulls Control whether nulls should be compared as equal or not - * @param compare_nans Control whether floating-point NaNs values should be compared as equal or not - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned vector - * @return A vector of bools indicating if each row in `needles` has matching rows in `haystack` - */ rmm::device_uvector contains(table_view const& haystack, table_view const& needles, null_equality compare_nulls, @@ -173,124 +187,97 @@ rmm::device_uvector contains(table_view const& haystack, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto map = static_map(compute_hash_table_size(haystack.num_rows()), - cuco::empty_key{lhs_index_type{std::numeric_limits::max()}}, - cuco::empty_value{detail::JoinNoneValue}, - detail::hash_table_allocator_type{default_allocator{}, stream}, - stream.value()); + CUDF_EXPECTS(cudf::have_same_types(haystack, needles), "Column types mismatch"); auto const haystack_has_nulls = has_nested_nulls(haystack); auto const needles_has_nulls = has_nested_nulls(needles); auto const has_any_nulls = haystack_has_nulls || needles_has_nulls; + auto const preprocessed_needles = + cudf::experimental::row::equality::preprocessed_table::create(needles, stream); auto const preprocessed_haystack = cudf::experimental::row::equality::preprocessed_table::create(haystack, stream); - // Insert row indices of the haystack table as map keys. - { - auto const haystack_it = cudf::detail::make_counting_transform_iterator( - size_type{0}, - [] __device__(auto const idx) { return cuco::make_pair(lhs_index_type{idx}, 0); }); - - auto const hasher = cudf::experimental::row::hash::row_hasher(preprocessed_haystack); - auto const d_hasher = - strong_index_hasher_adapter{hasher.device_hasher(nullate::DYNAMIC{has_any_nulls})}; - - auto const comparator = - cudf::experimental::row::equality::self_comparator(preprocessed_haystack); - - // If the haystack table has nulls but they are compared unequal, don't insert them. - // Otherwise, it was known to cause performance issue: - // - https://github.com/rapidsai/cudf/pull/6943 - // - https://github.com/rapidsai/cudf/pull/8277 - if (haystack_has_nulls && compare_nulls == null_equality::UNEQUAL) { - auto const bitmask_buffer_and_ptr = build_row_bitmask(haystack, stream); - auto const row_bitmask_ptr = bitmask_buffer_and_ptr.second; - - auto const insert_map = [&](auto const value_comp) { - if (cudf::detail::has_nested_columns(haystack)) { - auto const d_eqcomp = strong_index_comparator_adapter{comparator.equal_to( - nullate::DYNAMIC{haystack_has_nulls}, compare_nulls, value_comp)}; - map.insert_if(haystack_it, - haystack_it + haystack.num_rows(), - thrust::counting_iterator(0), // stencil - row_is_valid{row_bitmask_ptr}, - d_hasher, - d_eqcomp, - stream.value()); - } else { - auto const d_eqcomp = strong_index_comparator_adapter{comparator.equal_to( - nullate::DYNAMIC{haystack_has_nulls}, compare_nulls, value_comp)}; - map.insert_if(haystack_it, - haystack_it + haystack.num_rows(), - thrust::counting_iterator(0), // stencil - row_is_valid{row_bitmask_ptr}, - d_hasher, - d_eqcomp, - stream.value()); - } - }; - - // Insert only rows that do not have any null at any level. 
- dispatch_nan_comparator(compare_nans, insert_map); - } else { // haystack_doesn't_have_nulls || compare_nulls == null_equality::EQUAL - auto const insert_map = [&](auto const value_comp) { - if (cudf::detail::has_nested_columns(haystack)) { - auto const d_eqcomp = strong_index_comparator_adapter{comparator.equal_to( - nullate::DYNAMIC{haystack_has_nulls}, compare_nulls, value_comp)}; - map.insert( - haystack_it, haystack_it + haystack.num_rows(), d_hasher, d_eqcomp, stream.value()); - } else { - auto const d_eqcomp = strong_index_comparator_adapter{comparator.equal_to( - nullate::DYNAMIC{haystack_has_nulls}, compare_nulls, value_comp)}; - map.insert( - haystack_it, haystack_it + haystack.num_rows(), d_hasher, d_eqcomp, stream.value()); - } - }; - - dispatch_nan_comparator(compare_nans, insert_map); - } - } + + auto const haystack_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_haystack); + auto const d_haystack_hasher = haystack_hasher.device_hasher(nullate::DYNAMIC{has_any_nulls}); + auto const needle_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_needles); + auto const d_needle_hasher = needle_hasher.device_hasher(nullate::DYNAMIC{has_any_nulls}); + auto const d_hasher = hasher_adapter{d_haystack_hasher, d_needle_hasher}; + + auto const self_equal = cudf::experimental::row::equality::self_comparator(preprocessed_haystack); + auto const two_table_equal = cudf::experimental::row::equality::two_table_comparator( + preprocessed_haystack, preprocessed_needles); // The output vector. auto contained = rmm::device_uvector(needles.num_rows(), stream, mr); - auto const preprocessed_needles = - cudf::experimental::row::equality::preprocessed_table::create(needles, stream); - // Check existence for each row of the needles table in the haystack table. 
- { - auto const needles_it = cudf::detail::make_counting_transform_iterator( - size_type{0}, [] __device__(auto const idx) { return rhs_index_type{idx}; }); - - auto const hasher = cudf::experimental::row::hash::row_hasher(preprocessed_needles); - auto const d_hasher = - strong_index_hasher_adapter{hasher.device_hasher(nullate::DYNAMIC{has_any_nulls})}; - - auto const comparator = cudf::experimental::row::equality::two_table_comparator( - preprocessed_haystack, preprocessed_needles); - - auto const check_contains = [&](auto const value_comp) { - if (cudf::detail::has_nested_columns(haystack) or cudf::detail::has_nested_columns(needles)) { - auto const d_eqcomp = - comparator.equal_to(nullate::DYNAMIC{has_any_nulls}, compare_nulls, value_comp); - map.contains(needles_it, - needles_it + needles.num_rows(), - contained.begin(), - d_hasher, - d_eqcomp, - stream.value()); + auto const haystack_iter = cudf::detail::make_counting_transform_iterator( + size_type{0}, [] __device__(auto idx) { return lhs_index_type{idx}; }); + auto const needles_iter = cudf::detail::make_counting_transform_iterator( + size_type{0}, [] __device__(auto idx) { return rhs_index_type{idx}; }); + + auto const helper_func = + [&](auto const& d_self_equal, auto const& d_two_table_equal, auto const& probing_scheme) { + auto const d_equal = comparator_adapter{d_self_equal, d_two_table_equal}; + + auto set = cuco::experimental::static_set{ + cuco::experimental::extent{compute_hash_table_size(haystack.num_rows())}, + cuco::empty_key{lhs_index_type{-1}}, + d_equal, + probing_scheme, + detail::hash_table_allocator_type{default_allocator{}, stream}, + stream.value()}; + + if (haystack_has_nulls && compare_nulls == null_equality::UNEQUAL) { + auto const bitmask_buffer_and_ptr = build_row_bitmask(haystack, stream); + auto const row_bitmask_ptr = bitmask_buffer_and_ptr.second; + + // If the haystack table has nulls but they are compared unequal, don't insert them. 
+ // Otherwise, it was known to cause performance issue: + // - https://github.com/rapidsai/cudf/pull/6943 + // - https://github.com/rapidsai/cudf/pull/8277 + set.insert_if_async(haystack_iter, + haystack_iter + haystack.num_rows(), + thrust::counting_iterator(0), // stencil + row_is_valid{row_bitmask_ptr}, + stream.value()); } else { - auto const d_eqcomp = - comparator.equal_to(nullate::DYNAMIC{has_any_nulls}, compare_nulls, value_comp); - map.contains(needles_it, - needles_it + needles.num_rows(), - contained.begin(), - d_hasher, - d_eqcomp, - stream.value()); + set.insert_async(haystack_iter, haystack_iter + haystack.num_rows(), stream.value()); + } + + if (needles_has_nulls && compare_nulls == null_equality::UNEQUAL) { + auto const bitmask_buffer_and_ptr = build_row_bitmask(needles, stream); + auto const row_bitmask_ptr = bitmask_buffer_and_ptr.second; + set.contains_if_async(needles_iter, + needles_iter + needles.num_rows(), + thrust::counting_iterator(0), // stencil + row_is_valid{row_bitmask_ptr}, + contained.begin(), + stream.value()); + } else { + set.contains_async( + needles_iter, needles_iter + needles.num_rows(), contained.begin(), stream.value()); } }; - dispatch_nan_comparator(compare_nans, check_contains); + if (cudf::detail::has_nested_columns(haystack)) { + dispatch_nan_comparator(compare_nulls, + compare_nans, + haystack_has_nulls, + has_any_nulls, + self_equal, + two_table_equal, + d_hasher, + helper_func); + } else { + dispatch_nan_comparator(compare_nulls, + compare_nans, + haystack_has_nulls, + has_any_nulls, + self_equal, + two_table_equal, + d_hasher, + helper_func); } return contained; From b25b292f7f97cbb681f0244e1a20b30a925145a1 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 26 Sep 2023 18:53:43 -0400 Subject: [PATCH 12/29] Add nvtext::tokenize_with_vocabulary API (#13930) Adds tokenize with vocabulary APIs to libcudf. ``` struct tokenize_vocabulary{ ... }; std::unique_ptr load_vocabulary( cudf::strings_column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); std::unique_ptr tokenize_with_vocabulary( cudf::strings_column_view const& input, tokenize_vocabulary const& vocabulary, cudf::string_scalar const& delimiter, cudf::size_type default_id, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); ``` Returns an integer lists column replacing individual tokens as resolved from the `input` using `delimiter` with id values which are the row indices of the input `vocabulary` column. If a token is not found in the `vocabulary` it is assigned `default_id`. The vocabulary can be loaded once using the `nvtext::load_vocabulary()` API and then used in repeated calls to `nvtext::tokenize_with_vocabulary()` with different input columns. 
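By way of illustration only (not part of this change), a minimal C++ sketch of the load-once/tokenize-many pattern: it assumes the usual libcudf/nvtext headers, uses the test utility `strings_column_wrapper` from this PR's unit tests purely for brevity, and leaves the stream and memory-resource arguments at their defaults:

```
// Build the vocabulary once; each row index becomes that term's token id.
cudf::test::strings_column_wrapper terms({"hello", "there", "world"});
auto vocab = nvtext::load_vocabulary(cudf::strings_column_view(terms));

// Reuse the same vocabulary against any number of input columns.
cudf::test::strings_column_wrapper input({"hello world", "hello there"});
auto delimiter = cudf::string_scalar(" ");
auto token_ids = nvtext::tokenize_with_vocabulary(
  cudf::strings_column_view(input), *vocab, delimiter, /*default_id=*/-1);
// token_ids is a lists column of INT32 token ids: [[0, 2], [0, 1]]
```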
The Python interface is a new class `TokenizeVocabulary`, which can be used as follows:
```
>>> import cudf
>>> from cudf.core.tokenize_vocabulary import TokenizeVocabulary
>>> words = cudf.Series( ['brown', 'the', 'dog', 'jumps'] )
>>> vocab = TokenizeVocabulary(words)
>>> s = cudf.Series( ['the brown dog jumps over the brown cat'] )
>>> print(vocab.tokenize(s))
0    [1, 0, 2, 3, -1, 1, 0, -1]
dtype: list
```

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Robert Maynard (https://github.com/robertmaynard)
  - https://github.com/nvdbaranec
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/13930
---
 cpp/CMakeLists.txt                            |   1 +
 cpp/include/nvtext/tokenize.hpp               |  78 ++++++
 cpp/src/text/vocabulary_tokenize.cu           | 257 ++++++++++++++++++
 cpp/tests/text/tokenize_tests.cpp             |  93 +++++--
 python/cudf/cudf/_lib/cpp/nvtext/tokenize.pxd |  17 +-
 python/cudf/cudf/_lib/nvtext/tokenize.pyx     |  40 ++-
 python/cudf/cudf/_lib/strings/__init__.py     |   1 +
 python/cudf/cudf/core/tokenize_vocabulary.py  |  48 ++++
 .../cudf/cudf/tests/text/test_text_methods.py |  59 ++++
 9 files changed, 574 insertions(+), 20 deletions(-)
 create mode 100644 cpp/src/text/vocabulary_tokenize.cu
 create mode 100644 python/cudf/cudf/core/tokenize_vocabulary.py

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index a84f7bd5224..9656bc40fd7 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -618,6 +618,7 @@ add_library(
   src/text/subword/subword_tokenize.cu
   src/text/subword/wordpiece_tokenizer.cu
   src/text/tokenize.cu
+  src/text/vocabulary_tokenize.cu
   src/transform/bools_to_mask.cu
   src/transform/compute_column.cu
   src/transform/encode.cu
diff --git a/cpp/include/nvtext/tokenize.hpp b/cpp/include/nvtext/tokenize.hpp
index a72f7dcfa59..44f8f44557c 100644
--- a/cpp/include/nvtext/tokenize.hpp
+++ b/cpp/include/nvtext/tokenize.hpp
@@ -215,5 +215,83 @@ std::unique_ptr detokenize(
   cudf::string_scalar const& separator = cudf::string_scalar(" "),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief Vocabulary object to be used with nvtext::tokenize_with_vocabulary
+ *
+ * Use nvtext::load_vocabulary to create this object.
+ */
+struct tokenize_vocabulary {
+  /**
+   * @brief Vocabulary object constructor
+   *
+   * Token ids are the row indices within the vocabulary column.
+   * Each vocabulary entry is expected to be unique otherwise the behavior is undefined.
+   *
+   * @throw cudf::logic_error if `vocabulary` contains nulls or is empty
+   *
+   * @param input Strings for the vocabulary
+   * @param stream CUDA stream used for device memory operations and kernel launches
+   * @param mr Device memory resource used to allocate the returned column's device memory
+   */
+  tokenize_vocabulary(cudf::strings_column_view const& input,
+                      rmm::cuda_stream_view stream = cudf::get_default_stream(),
+                      rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  ~tokenize_vocabulary();
+
+  struct tokenize_vocabulary_impl;
+  tokenize_vocabulary_impl* _impl{};
+};
+
+/**
+ * @brief Create a tokenize_vocabulary object from a strings column
+ *
+ * Token ids are the row indices within the vocabulary column.
+ * Each vocabulary entry is expected to be unique otherwise the behavior is undefined.
+ * + * @throw cudf::logic_error if `vocabulary` contains nulls or is empty + * + * @param input Strings for the vocabulary + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return Object to be used with nvtext::tokenize_with_vocabulary + */ +std::unique_ptr load_vocabulary( + cudf::strings_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Returns the token ids for the input string by looking up each delimited + * token in the given vocabulary + * + * @code{.pseudo} + * Example: + * s = ["hello world", "hello there", "there there world", "watch out world"] + * v = load_vocabulary(["hello", "there", "world"]) + * r = tokenize_with_vocabulary(s,v) + * r is now [[0,2], [0,1], [1,1,2], [-1,-1,2]] + * @endcode + * + * Any null row entry results in a corresponding null entry in the output + * + * @throw cudf::logic_error if `delimiter` is invalid + * + * @param input Strings column to tokenize + * @param vocabulary Used to lookup tokens within + * @param delimiter Used to identify tokens within `input` + * @param default_id The token id to be used for tokens not found in the `vocabulary`; + * Default is -1 + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return Lists column of token ids + */ +std::unique_ptr tokenize_with_vocabulary( + cudf::strings_column_view const& input, + tokenize_vocabulary const& vocabulary, + cudf::string_scalar const& delimiter, + cudf::size_type default_id = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of tokenize group } // namespace nvtext diff --git a/cpp/src/text/vocabulary_tokenize.cu b/cpp/src/text/vocabulary_tokenize.cu new file mode 100644 index 00000000000..f998c9ec239 --- /dev/null +++ b/cpp/src/text/vocabulary_tokenize.cu @@ -0,0 +1,257 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +namespace nvtext { +namespace detail { +namespace { + +using string_hasher_type = cudf::hashing::detail::MurmurHash3_x86_32; +using hash_value_type = string_hasher_type::result_type; + +/** + * @brief Hasher function used for building and using the cuco static-map + * + * This takes advantage of heterogeneous lookup feature in cuco static-map which + * allows inserting with one type (index) and looking up with a different type (string). 
+ */ +struct vocab_hasher { + cudf::column_device_view const d_strings; + string_hasher_type hasher{}; + // used by insert + __device__ hash_value_type operator()(cudf::size_type index) const + { + return hasher(d_strings.element(index)); + } + // used by find + __device__ hash_value_type operator()(cudf::string_view const& s) const { return hasher(s); } +}; + +/** + * @brief Equal function used for building and using the cuco static-map + * + * This takes advantage of heterogeneous lookup feature in cuco static-map which + * allows inserting with one type (index) and looking up with a different type (string). + */ +struct vocab_equal { + cudf::column_device_view const d_strings; + // used by insert + __device__ bool operator()(cudf::size_type lhs, cudf::size_type rhs) const noexcept + { + return lhs == rhs; // all rows are expected to be unique + } + // used by find + __device__ bool operator()(cudf::size_type lhs, cudf::string_view const& rhs) const noexcept + { + return d_strings.element(lhs) == rhs; + } +}; + +using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor>; +using probe_scheme = cuco::experimental::linear_probing<1, vocab_hasher>; +using vocabulary_map_type = cuco::experimental::static_map, + cuda::thread_scope_device, + vocab_equal, + probe_scheme, + hash_table_allocator_type>; +} // namespace +} // namespace detail + +// since column_device_view::create returns is a little more than +// std::unique_ptr this helper simplifies the return type in a maintainable way +using col_device_view = std::invoke_result_t; + +struct tokenize_vocabulary::tokenize_vocabulary_impl { + std::unique_ptr const vocabulary; + col_device_view const d_vocabulary; + std::unique_ptr vocabulary_map; + + auto get_map_ref() const { return vocabulary_map->ref(cuco::experimental::op::find); } + + tokenize_vocabulary_impl(std::unique_ptr&& vocab, + col_device_view&& d_vocab, + std::unique_ptr&& map) + : vocabulary(std::move(vocab)), d_vocabulary(std::move(d_vocab)), vocabulary_map(std::move(map)) + { + } +}; + +struct key_pair { + __device__ auto operator()(cudf::size_type idx) const noexcept + { + return cuco::make_pair(idx, idx); + } +}; + +tokenize_vocabulary::tokenize_vocabulary(cudf::strings_column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(not input.is_empty(), "vocabulary must not be empty"); + CUDF_EXPECTS(not input.has_nulls(), "vocabulary must not have nulls"); + + // need to hold a copy of the input + auto vocabulary = std::make_unique(input.parent(), stream, mr); + auto d_vocabulary = cudf::column_device_view::create(vocabulary->view(), stream); + + auto vocab_map = std::make_unique( + static_cast(vocabulary->size() * 2), + cuco::empty_key{-1}, + cuco::empty_value{-1}, + detail::vocab_equal{*d_vocabulary}, + detail::probe_scheme{detail::vocab_hasher{*d_vocabulary}}, + detail::hash_table_allocator_type{default_allocator{}, stream}, + stream.value()); + + // the row index is the token id (value for each key in the map) + auto iter = cudf::detail::make_counting_transform_iterator(0, key_pair{}); + vocab_map->insert_async(iter, iter + vocabulary->size(), stream.value()); + + _impl = new tokenize_vocabulary_impl( + std::move(vocabulary), std::move(d_vocabulary), std::move(vocab_map)); +} +tokenize_vocabulary::~tokenize_vocabulary() { delete _impl; } + +std::unique_ptr load_vocabulary(cudf::strings_column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return 
std::make_unique(input, stream, mr); +} + +namespace detail { +namespace { + +/** + * @brief Tokenizes each string and uses the map to assign token id values + * + * @tparam MapRefType Type of the static_map reference for calling find() + */ +template +struct vocabulary_tokenizer_fn { + cudf::column_device_view const d_strings; + cudf::string_view const d_delimiter; + MapRefType d_map; + cudf::size_type const default_id; + cudf::size_type const* d_offsets; + cudf::size_type* d_results; + + __device__ void operator()(cudf::size_type idx) const + { + if (d_strings.is_null(idx)) { return; } + + auto const d_str = d_strings.element(idx); + characters_tokenizer tokenizer(d_str, d_delimiter); + auto d_tokens = d_results + d_offsets[idx]; + + cudf::size_type token_idx = 0; + while (tokenizer.next_token()) { + auto const pos = tokenizer.token_byte_positions(); + auto const token = cudf::string_view{d_str.data() + pos.first, (pos.second - pos.first)}; + // lookup token in map + auto const itr = d_map.find(token); + auto const id = (itr != d_map.end()) ? itr->second : default_id; + // set value into the output + d_tokens[token_idx++] = id; + } + } +}; + +} // namespace + +std::unique_ptr tokenize_with_vocabulary(cudf::strings_column_view const& input, + tokenize_vocabulary const& vocabulary, + cudf::string_scalar const& delimiter, + cudf::size_type default_id, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(delimiter.is_valid(stream), "Parameter delimiter must be valid"); + + auto const output_type = cudf::data_type{cudf::type_to_id()}; + if (input.is_empty()) { return cudf::make_empty_column(output_type); } + + // count the tokens per string and build the offsets from the counts + auto const d_strings = cudf::column_device_view::create(input.parent(), stream); + auto const d_delimiter = delimiter.value(stream); + auto const sizes_itr = + cudf::detail::make_counting_transform_iterator(0, strings_tokenizer{*d_strings, d_delimiter}); + auto [token_offsets, total_count] = + cudf::detail::make_offsets_child_column(sizes_itr, sizes_itr + input.size(), stream, mr); + + // build the output column to hold all the token ids + auto tokens = + cudf::make_numeric_column(output_type, total_count, cudf::mask_state::UNALLOCATED, stream, mr); + auto map_ref = vocabulary._impl->get_map_ref(); + auto d_offsets = token_offsets->view().data(); + auto d_tokens = tokens->mutable_view().data(); + vocabulary_tokenizer_fn tokenizer{ + *d_strings, d_delimiter, map_ref, default_id, d_offsets, d_tokens}; + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + input.size(), + tokenizer); + + return cudf::make_lists_column(input.size(), + std::move(token_offsets), + std::move(tokens), + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + stream, + mr); +} + +} // namespace detail + +std::unique_ptr tokenize_with_vocabulary(cudf::strings_column_view const& input, + tokenize_vocabulary const& vocabulary, + cudf::string_scalar const& delimiter, + cudf::size_type default_id, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::tokenize_with_vocabulary(input, vocabulary, delimiter, default_id, stream, mr); +} + +} // namespace nvtext diff --git a/cpp/tests/text/tokenize_tests.cpp b/cpp/tests/text/tokenize_tests.cpp index 14fc4f8c6db..d78f2dfbdf3 100644 --- a/cpp/tests/text/tokenize_tests.cpp +++ b/cpp/tests/text/tokenize_tests.cpp @@ -14,14 +14,16 @@ * limitations under the 
License. */ -#include -#include -#include -#include - #include #include #include +#include + +#include + +#include +#include +#include #include @@ -125,29 +127,37 @@ TEST_F(TextTokenizeTest, CharacterTokenize) TEST_F(TextTokenizeTest, TokenizeEmptyTest) { - auto strings = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); - cudf::test::strings_column_wrapper all_empty({"", "", ""}); - cudf::test::strings_column_wrapper all_null({"", "", ""}, {0, 0, 0}); - cudf::test::fixed_width_column_wrapper expected({0, 0, 0}); - - auto results = nvtext::tokenize(cudf::strings_column_view(strings->view())); + auto input = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); + auto view = cudf::strings_column_view(input->view()); + cudf::test::strings_column_wrapper all_empty_wrapper({"", "", ""}); + auto all_empty = cudf::strings_column_view(all_empty_wrapper); + cudf::test::strings_column_wrapper all_null_wrapper({"", "", ""}, {0, 0, 0}); + auto all_null = cudf::strings_column_view(all_null_wrapper); + cudf::test::fixed_width_column_wrapper expected({0, 0, 0}); + + auto results = nvtext::tokenize(view); EXPECT_EQ(results->size(), 0); - results = nvtext::tokenize(cudf::strings_column_view(all_empty)); + results = nvtext::tokenize(all_empty); EXPECT_EQ(results->size(), 0); - results = nvtext::tokenize(cudf::strings_column_view(all_null)); + results = nvtext::tokenize(all_null); EXPECT_EQ(results->size(), 0); - results = nvtext::count_tokens(cudf::strings_column_view(strings->view())); + results = nvtext::count_tokens(view); EXPECT_EQ(results->size(), 0); - results = nvtext::count_tokens(cudf::strings_column_view(all_empty)); + results = nvtext::count_tokens(all_empty); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); results = nvtext::count_tokens(cudf::strings_column_view(all_null)); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - results = nvtext::character_tokenize(cudf::strings_column_view(strings->view())); + results = nvtext::character_tokenize(view); EXPECT_EQ(results->size(), 0); - results = nvtext::character_tokenize(cudf::strings_column_view(all_empty)); + results = nvtext::character_tokenize(all_empty); EXPECT_EQ(results->size(), 0); - results = nvtext::character_tokenize(cudf::strings_column_view(all_null)); + results = nvtext::character_tokenize(all_null); EXPECT_EQ(results->size(), 0); + auto const delimiter = cudf::string_scalar{""}; + results = nvtext::tokenize_with_vocabulary(view, all_empty, delimiter); + EXPECT_EQ(results->size(), 0); + results = nvtext::tokenize_with_vocabulary(all_null, all_empty, delimiter); + EXPECT_EQ(results->size(), results->null_count()); } TEST_F(TextTokenizeTest, Detokenize) @@ -191,3 +201,50 @@ TEST_F(TextTokenizeTest, DetokenizeErrors) EXPECT_THROW(nvtext::detokenize(strings_view, one, cudf::string_scalar("", false)), cudf::logic_error); } + +TEST_F(TextTokenizeTest, Vocabulary) +{ + cudf::test::strings_column_wrapper vocabulary( // leaving out 'cat' on purpose + {"ate", "chased", "cheese", "dog", "fox", "jumped", "mouse", "mousé", "over", "the"}); + auto vocab = nvtext::load_vocabulary(cudf::strings_column_view(vocabulary)); + + auto validity = cudf::test::iterators::null_at(1); + cudf::test::strings_column_wrapper input({"the fox jumped over the dog", + "the dog chased the cat", + "the cat chased the mouse", + "the mousé ate cheese", + "", + ""}, + validity); + auto input_view = cudf::strings_column_view(input); + auto delimiter = cudf::string_scalar(" "); + auto default_id = -7; // should be the token for the missing 
'cat' + auto results = nvtext::tokenize_with_vocabulary(input_view, *vocab, delimiter, default_id); + + using LCW = cudf::test::lists_column_wrapper; + // clang-format off + LCW expected({LCW{ 9, 4, 5, 8, 9, 3}, + LCW{ 9, 3, 1, 9,-7}, + LCW{ 9,-7, 1, 9, 6}, + LCW{ 9, 7, 0, 2}, + LCW{}, LCW{}}, + validity); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + +TEST_F(TextTokenizeTest, TokenizeErrors) +{ + cudf::test::strings_column_wrapper empty{}; + cudf::strings_column_view view(empty); + EXPECT_THROW(nvtext::load_vocabulary(view), cudf::logic_error); + + cudf::test::strings_column_wrapper vocab_nulls({""}, {0}); + cudf::strings_column_view nulls(vocab_nulls); + EXPECT_THROW(nvtext::load_vocabulary(nulls), cudf::logic_error); + + cudf::test::strings_column_wrapper some{"hello"}; + auto vocab = nvtext::load_vocabulary(cudf::strings_column_view(some)); + EXPECT_THROW(nvtext::tokenize_with_vocabulary(view, *vocab, cudf::string_scalar("", false)), + cudf::logic_error); +} diff --git a/python/cudf/cudf/_lib/cpp/nvtext/tokenize.pxd b/python/cudf/cudf/_lib/cpp/nvtext/tokenize.pxd index 8b80f50e381..3cc3fd6251a 100644 --- a/python/cudf/cudf/_lib/cpp/nvtext/tokenize.pxd +++ b/python/cudf/cudf/_lib/cpp/nvtext/tokenize.pxd @@ -1,10 +1,11 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.scalar.scalar cimport string_scalar +from cudf._lib.cpp.types cimport size_type cdef extern from "nvtext/tokenize.hpp" namespace "nvtext" nogil: @@ -38,3 +39,17 @@ cdef extern from "nvtext/tokenize.hpp" namespace "nvtext" nogil: const column_view & row_indices, const string_scalar & separator ) except + + + cdef struct tokenize_vocabulary "nvtext::tokenize_vocabulary": + pass + + cdef unique_ptr[tokenize_vocabulary] load_vocabulary( + const column_view & strings + ) except + + + cdef unique_ptr[column] tokenize_with_vocabulary( + const column_view & strings, + const tokenize_vocabulary & vocabulary, + const string_scalar & delimiter, + size_type default_id + ) except + diff --git a/python/cudf/cudf/_lib/nvtext/tokenize.pyx b/python/cudf/cudf/_lib/nvtext/tokenize.pyx index 2bb4fa8e108..bee9d6f6c4d 100644 --- a/python/cudf/cudf/_lib/nvtext/tokenize.pyx +++ b/python/cudf/cudf/_lib/nvtext/tokenize.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. 
from cudf.core.buffer import acquire_spill_lock @@ -12,9 +12,13 @@ from cudf._lib.cpp.nvtext.tokenize cimport ( character_tokenize as cpp_character_tokenize, count_tokens as cpp_count_tokens, detokenize as cpp_detokenize, + load_vocabulary as cpp_load_vocabulary, tokenize as cpp_tokenize, + tokenize_vocabulary as cpp_tokenize_vocabulary, + tokenize_with_vocabulary as cpp_tokenize_with_vocabulary, ) from cudf._lib.cpp.scalar.scalar cimport string_scalar +from cudf._lib.cpp.types cimport size_type from cudf._lib.scalar cimport DeviceScalar @@ -122,3 +126,37 @@ def detokenize(Column strings, Column indices, object py_separator): ) return Column.from_unique_ptr(move(c_result)) + + +cdef class TokenizeVocabulary: + cdef unique_ptr[cpp_tokenize_vocabulary] c_obj + + def __cinit__(self, Column vocab): + cdef column_view c_vocab = vocab.view() + with nogil: + self.c_obj = move(cpp_load_vocabulary(c_vocab)) + + +@acquire_spill_lock() +def tokenize_with_vocabulary(Column strings, + TokenizeVocabulary vocabulary, + object py_delimiter, + size_type default_id): + + cdef DeviceScalar delimiter = py_delimiter.device_value + cdef column_view c_strings = strings.view() + cdef const string_scalar* c_delimiter = delimiter\ + .get_raw_ptr() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_tokenize_with_vocabulary( + c_strings, + vocabulary.c_obj.get()[0], + c_delimiter[0], + default_id + ) + ) + + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index 16875e4397e..47a194c4fda 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ b/python/cudf/cudf/_lib/strings/__init__.py @@ -23,6 +23,7 @@ _tokenize_scalar, character_tokenize, detokenize, + tokenize_with_vocabulary, ) from cudf._lib.strings.attributes import ( code_points, diff --git a/python/cudf/cudf/core/tokenize_vocabulary.py b/python/cudf/cudf/core/tokenize_vocabulary.py new file mode 100644 index 00000000000..afb3496311b --- /dev/null +++ b/python/cudf/cudf/core/tokenize_vocabulary.py @@ -0,0 +1,48 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +from __future__ import annotations + +import cudf +from cudf._lib.nvtext.tokenize import ( + TokenizeVocabulary as cpp_tokenize_vocabulary, + tokenize_with_vocabulary as cpp_tokenize_with_vocabulary, +) + + +class TokenizeVocabulary: + """ + A vocabulary object used to tokenize input text. + + Parameters + ---------- + vocabulary : str + Strings column of vocabulary terms + """ + + def __init__(self, vocabulary: "cudf.Series"): + self.vocabulary = cpp_tokenize_vocabulary(vocabulary._column) + + def tokenize(self, text, delimiter: str = "", default_id: int = -1): + """ + Parameters + ---------- + text : cudf string series + The strings to be tokenized. + delimiter : str + Delimiter to identify tokens. Default is whitespace. + default_id : int + Value to use for tokens not found in the vocabulary. + Default is -1. 
+ + Returns + ------- + Series of lists of token ids + """ + if delimiter is None: + delimiter = "" + delim = cudf.Scalar(delimiter, dtype="str") + result = cpp_tokenize_with_vocabulary( + text._column, self.vocabulary, delim, default_id + ) + + return cudf.Series(result) diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index 8cda15e4acc..2241390a531 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -7,6 +7,7 @@ import pytest import cudf +from cudf.core.tokenize_vocabulary import TokenizeVocabulary from cudf.testing._utils import assert_eq @@ -156,6 +157,64 @@ def test_token_count(delimiter, expected_token_counts): assert_eq(expected, actual, check_dtype=False) + +@pytest.mark.parametrize( + "delimiter, input, default_id, results", + [ + ( + "", + "the quick brown fox jumps over the lazy brown dog", + 99, + [0, 1, 2, 3, 4, 5, 0, 99, 2, 6], + ), + ( + " ", + " the sable siamésé cat jumps under the brown sofa ", + -1, + [0, 7, 8, 9, 4, 10, 0, 2, 11], + ), + ( + "_", + "the_quick_brown_fox_jumped__over_the_lazy_brown_dog", + -99, + [0, 1, 2, 3, -99, 5, 0, -99, 2, 6], + ), + ], +) +def test_tokenize_with_vocabulary(delimiter, input, default_id, results): + vocabulary = cudf.Series( + [ + "the", + "quick", + "brown", + "fox", + "jumps", + "over", + "dog", + "sable", + "siamésé", + "cat", + "under", + "sofa", + ] + ) + tokenizer = TokenizeVocabulary(vocabulary) + + strings = cudf.Series([input, None, "", input]) + + expected = cudf.Series( + [ + cudf.Series(results, dtype=np.int32), + None, + cudf.Series([], dtype=np.int32), + cudf.Series(results, dtype=np.int32), + ] + ) + + actual = tokenizer.tokenize(strings, delimiter, default_id) + assert type(expected) == type(actual) + assert_eq(expected, actual) + + def test_normalize_spaces(): strings = cudf.Series( [ From 31e56702fe15f44b3e849207d31d0bb79c307367 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Wed, 27 Sep 2023 09:29:35 +0530 Subject: [PATCH 13/29] Workaround for illegal instruction error in sm90 for warp intrinsics with mask (#14201) Workaround for illegal instruction error in sm90 for warp intrinsics with a non-`0xffffffff` mask. Removed the mask and used ~0u (`0xffffffff`) as MASK because - all threads in the warp have correct data on error, since the threads with is_within_bounds==true update the error. - init_state is not required in the last iteration, which is the only place where MASK is not ~0u. Fixes #14183 Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - Divye Gala (https://github.com/divyegala) - Elias Stehle (https://github.com/elstehle) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cudf/pull/14201 --- cpp/src/io/utilities/data_casting.cu | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index d16237d7afe..9e5c5c76392 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -534,8 +534,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, char_index < cudf::util::round_up_safe(in_end - in_begin, static_cast(BLOCK_SIZE)); char_index += BLOCK_SIZE) { bool const is_within_bounds = char_index < (in_end - in_begin); - auto const MASK = is_warp ? __ballot_sync(0xffffffff, is_within_bounds) : 0xffffffff; - auto const c = is_within_bounds ?
in_begin[char_index] : '\0'; + auto const c = is_within_bounds ? in_begin[char_index] : '\0'; auto const prev_c = (char_index > 0 and is_within_bounds) ? in_begin[char_index - 1] : '\0'; auto const escaped_char = get_escape_char(c); @@ -571,7 +570,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, __shared__ typename SlashScan::TempStorage temp_slash[num_warps]; SlashScan(temp_slash[warp_id]).InclusiveScan(curr, scanned, composite_op); is_escaping_backslash = scanned.get(init_state); - init_state = __shfl_sync(MASK, is_escaping_backslash, BLOCK_SIZE - 1); + init_state = __shfl_sync(~0u, is_escaping_backslash, BLOCK_SIZE - 1); __syncwarp(); is_slash.shift(warp_id); is_slash.set_bits(warp_id, is_escaping_backslash); @@ -604,7 +603,7 @@ __global__ void parse_fn_string_parallel(str_tuple_it str_tuples, } // Make sure all threads have no errors before continuing if constexpr (is_warp) { - error = __any_sync(MASK, error); + error = __any_sync(~0u, error); } else { using ErrorReduce = cub::BlockReduce; __shared__ typename ErrorReduce::TempStorage temp_storage_error; @@ -932,13 +931,8 @@ std::unique_ptr parse_data( auto str_tuples = thrust::make_transform_iterator(offset_length_begin, to_string_view_pair{data}); if (col_type == cudf::data_type{cudf::type_id::STRING}) { - return parse_string(str_tuples, - col_size, - std::forward(null_mask), - d_null_count, - options, - stream, - mr); + return parse_string( + str_tuples, col_size, std::move(null_mask), d_null_count, options, stream, mr); } auto out_col = From cdc03a73db880e294f8c4916d942a4568a64d5db Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 27 Sep 2023 14:15:33 +0100 Subject: [PATCH 14/29] Marginally reduce memory footprint of joins (#14197) If we drop the gather maps as soon as we are done with them, we have a little more headroom for joins that are close to hitting the device memory limit. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Bradley Dice (https://github.com/bdice) - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/14197 --- python/cudf/cudf/core/join/join.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 6a6e37180ca..b94f8f583f4 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -203,6 +203,7 @@ def perform_merge(self) -> cudf.DataFrame: if left_rows is not None else cudf.DataFrame._from_data({}) ) + del left_rows right_result = ( self.rhs._gather( GatherMap.from_column_unchecked( @@ -213,7 +214,7 @@ def perform_merge(self) -> cudf.DataFrame: if right_rows is not None else cudf.DataFrame._from_data({}) ) - + del right_rows result = cudf.DataFrame._from_data( *self._merge_results(left_result, right_result) ) From ce247961216dd70f389763dc086f137c11ad7346 Mon Sep 17 00:00:00 2001 From: Nghia Truong <7416935+ttnghia@users.noreply.github.com> Date: Wed, 27 Sep 2023 10:10:31 -0700 Subject: [PATCH 15/29] Implement `HISTOGRAM` and `MERGE_HISTOGRAM` aggregations (#14045) This adds two more aggregations for groupby and reduction: * `HISTOGRAM`: Count the number of occurrences (aka frequency) for each element, and * `MERGE_HISTOGRAM`: Merge different outputs generated by `HISTOGRAM` aggregations This is the prerequisite for implementing the exact distributed percentile aggregation (https://github.com/rapidsai/cudf/issues/13885). 
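For illustration, an editor's sketch of the intended semantics (not part of the original change; the exact types are shown in the diff below): given keys = [0, 0, 0, 1, 1, 1] and values = [2, 1, 1, 3, 5, 2], a groupby HISTOGRAM yields [[<1, 2>, <2, 1>], [<2, 1>, <3, 1>, <5, 1>]], and a MERGE_HISTOGRAM over those two partial histograms yields [<1, 2>, <2, 2>, <3, 1>, <5, 1>]. Each histogram is a LIST of STRUCT<value, count>, and the order of the distinct elements within a histogram is unspecified.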
However, these two new aggregations may be useful in other use-cases that need to do frequency counting. Closes https://github.com/rapidsai/cudf/issues/13885. Merging checklist: * [X] Working prototypes. * [X] Cleanup and docs. * [X] Unit test. * [ ] Test with spark-rapids integration tests. Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Robert Maynard (https://github.com/robertmaynard) - Yunsong Wang (https://github.com/PointKernel) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/14045 --- cpp/CMakeLists.txt | 2 + cpp/include/cudf/aggregation.hpp | 22 +- .../cudf/detail/aggregation/aggregation.hpp | 60 +++ .../cudf/detail/hash_reduce_by_row.cuh | 4 + .../cudf/reduction/detail/histogram.hpp | 57 +++ .../reduction/detail/reduction_functions.hpp | 27 ++ cpp/src/aggregation/aggregation.cpp | 42 ++ cpp/src/groupby/groupby.cu | 10 + cpp/src/groupby/sort/aggregate.cpp | 30 ++ cpp/src/groupby/sort/group_histogram.cu | 152 +++++++ cpp/src/groupby/sort/group_reductions.hpp | 57 ++- cpp/src/reductions/histogram.cu | 273 ++++++++++++ cpp/src/reductions/reductions.cpp | 12 + cpp/tests/CMakeLists.txt | 1 + cpp/tests/groupby/histogram_tests.cpp | 396 ++++++++++++++++++ cpp/tests/reductions/reduction_tests.cpp | 207 +++++++++ 16 files changed, 1349 insertions(+), 3 deletions(-) create mode 100644 cpp/include/cudf/reduction/detail/histogram.hpp create mode 100644 cpp/src/groupby/sort/group_histogram.cu create mode 100644 cpp/src/reductions/histogram.cu create mode 100644 cpp/tests/groupby/histogram_tests.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 9656bc40fd7..ec58c391001 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -323,6 +323,7 @@ add_library( src/groupby/sort/group_collect.cu src/groupby/sort/group_correlation.cu src/groupby/sort/group_count.cu + src/groupby/sort/group_histogram.cu src/groupby/sort/group_m2.cu src/groupby/sort/group_max.cu src/groupby/sort/group_min.cu @@ -471,6 +472,7 @@ add_library( src/reductions/all.cu src/reductions/any.cu src/reductions/collect_ops.cu + src/reductions/histogram.cu src/reductions/max.cu src/reductions/mean.cu src/reductions/min.cu diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp index d319041f8b1..d458c831f19 100644 --- a/cpp/include/cudf/aggregation.hpp +++ b/cpp/include/cudf/aggregation.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -116,7 +116,9 @@ class aggregation { COVARIANCE, ///< covariance between two sets of elements CORRELATION, ///< correlation between two sets of elements TDIGEST, ///< create a tdigest from a set of input values - MERGE_TDIGEST ///< create a tdigest by merging multiple tdigests together + MERGE_TDIGEST, ///< create a tdigest by merging multiple tdigests together + HISTOGRAM, ///< compute frequency of each element + MERGE_HISTOGRAM ///< merge partial values of HISTOGRAM aggregation, }; aggregation() = delete; @@ -288,6 +290,11 @@ std::unique_ptr make_any_aggregation(); template std::unique_ptr make_all_aggregation(); +/// Factory to create a HISTOGRAM aggregation +/// @return A HISTOGRAM aggregation object +template +std::unique_ptr make_histogram_aggregation(); + /// Factory to create a SUM_OF_SQUARES aggregation /// @return A SUM_OF_SQUARES aggregation object template @@ -610,6 +617,17 @@ std::unique_ptr make_merge_sets_aggregation( template std::unique_ptr make_merge_m2_aggregation(); +/** + * @brief Factory to create a MERGE_HISTOGRAM aggregation + * + * Merges the results of `HISTOGRAM` aggregations on independent sets into a new `HISTOGRAM` value + * equivalent to if a single `HISTOGRAM` aggregation was done across all of the sets at once. + * + * @return A MERGE_HISTOGRAM aggregation object + */ +template +std::unique_ptr make_merge_histogram_aggregation(); + /** * @brief Factory to create a COVARIANCE aggregation * diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index 4d3984cab93..784f05a964e 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -45,6 +45,8 @@ class simple_aggregations_collector { // Declares the interface for the simple class max_aggregation const& agg); virtual std::vector> visit(data_type col_type, class count_aggregation const& agg); + virtual std::vector> visit(data_type col_type, + class histogram_aggregation const& agg); virtual std::vector> visit(data_type col_type, class any_aggregation const& agg); virtual std::vector> visit(data_type col_type, @@ -89,6 +91,8 @@ class simple_aggregations_collector { // Declares the interface for the simple class merge_sets_aggregation const& agg); virtual std::vector> visit(data_type col_type, class merge_m2_aggregation const& agg); + virtual std::vector> visit( + data_type col_type, class merge_histogram_aggregation const& agg); virtual std::vector> visit(data_type col_type, class covariance_aggregation const& agg); virtual std::vector> visit(data_type col_type, @@ -108,6 +112,7 @@ class aggregation_finalizer { // Declares the interface for the finalizer virtual void visit(class min_aggregation const& agg); virtual void visit(class max_aggregation const& agg); virtual void visit(class count_aggregation const& agg); + virtual void visit(class histogram_aggregation const& agg); virtual void visit(class any_aggregation const& agg); virtual void visit(class all_aggregation const& agg); virtual void visit(class sum_of_squares_aggregation const& agg); @@ -130,6 +135,7 @@ class aggregation_finalizer { // Declares the interface for the finalizer virtual void visit(class merge_lists_aggregation const& agg); virtual void visit(class merge_sets_aggregation const& agg); virtual void visit(class merge_m2_aggregation const& agg); + virtual void visit(class merge_histogram_aggregation const& agg); virtual void visit(class covariance_aggregation const& agg); virtual void visit(class 
correlation_aggregation const& agg); virtual void visit(class tdigest_aggregation const& agg); @@ -251,6 +257,25 @@ class count_aggregation final : public rolling_aggregation, void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } }; +/** + * @brief Derived class for specifying a histogram aggregation + */ +class histogram_aggregation final : public groupby_aggregation, public reduce_aggregation { + public: + histogram_aggregation() : aggregation(HISTOGRAM) {} + + [[nodiscard]] std::unique_ptr clone() const override + { + return std::make_unique(*this); + } + std::vector> get_simple_aggregations( + data_type col_type, simple_aggregations_collector& collector) const override + { + return collector.visit(col_type, *this); + } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } +}; + /** * @brief Derived class for specifying an any aggregation */ @@ -972,6 +997,25 @@ class merge_m2_aggregation final : public groupby_aggregation { void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } }; +/** + * @brief Derived aggregation class for specifying MERGE_HISTOGRAM aggregation + */ +class merge_histogram_aggregation final : public groupby_aggregation, public reduce_aggregation { + public: + explicit merge_histogram_aggregation() : aggregation{MERGE_HISTOGRAM} {} + + [[nodiscard]] std::unique_ptr clone() const override + { + return std::make_unique(*this); + } + std::vector> get_simple_aggregations( + data_type col_type, simple_aggregations_collector& collector) const override + { + return collector.visit(col_type, *this); + } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } +}; + /** * @brief Derived aggregation class for specifying COVARIANCE aggregation */ @@ -1148,6 +1192,12 @@ struct target_type_impl { using type = size_type; }; +// Use list for HISTOGRAM +template +struct target_type_impl { + using type = list_view; +}; + // Computing ANY of any type, use bool accumulator template struct target_type_impl { @@ -1326,6 +1376,12 @@ struct target_type_impl { using type = struct_view; }; +// Use list for MERGE_HISTOGRAM +template +struct target_type_impl { + using type = list_view; +}; + // Always use double for COVARIANCE template struct target_type_impl { @@ -1417,6 +1473,8 @@ CUDF_HOST_DEVICE inline decltype(auto) aggregation_dispatcher(aggregation::Kind return f.template operator()(std::forward(args)...); case aggregation::COUNT_ALL: return f.template operator()(std::forward(args)...); + case aggregation::HISTOGRAM: + return f.template operator()(std::forward(args)...); case aggregation::ANY: return f.template operator()(std::forward(args)...); case aggregation::ALL: @@ -1460,6 +1518,8 @@ CUDF_HOST_DEVICE inline decltype(auto) aggregation_dispatcher(aggregation::Kind return f.template operator()(std::forward(args)...); case aggregation::MERGE_M2: return f.template operator()(std::forward(args)...); + case aggregation::MERGE_HISTOGRAM: + return f.template operator()(std::forward(args)...); case aggregation::COVARIANCE: return f.template operator()(std::forward(args)...); case aggregation::CORRELATION: diff --git a/cpp/include/cudf/detail/hash_reduce_by_row.cuh b/cpp/include/cudf/detail/hash_reduce_by_row.cuh index 2d2b43f1d4a..f63d1922950 100644 --- a/cpp/include/cudf/detail/hash_reduce_by_row.cuh +++ b/cpp/include/cudf/detail/hash_reduce_by_row.cuh @@ -14,12 +14,15 @@ * limitations under the License. 
*/ +#include +#include #include #include #include #include +#include #include #include @@ -29,6 +32,7 @@ namespace cudf::detail { +using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor>; using hash_map_type = cuco::static_map; diff --git a/cpp/include/cudf/reduction/detail/histogram.hpp b/cpp/include/cudf/reduction/detail/histogram.hpp new file mode 100644 index 00000000000..97c711fda4e --- /dev/null +++ b/cpp/include/cudf/reduction/detail/histogram.hpp @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include +#include + +#include +#include + +namespace cudf::reduction::detail { + +/** + * @brief Compute the frequency for each distinct row in the input table. + * + * @param input The input table to compute histogram + * @param partial_counts An optional column containing count for each row + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate memory of the returned objects + * @return A pair containing the (stable-order) indices of the distinct rows in the input + * table and their corresponding distinct counts + */ +[[nodiscard]] std::pair>, std::unique_ptr> +compute_row_frequencies(table_view const& input, + std::optional const& partial_counts, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +/** + * @brief Create an empty histogram column. + * + * A histogram column is a structs column `STRUCT<T, int64_t>` where T is the type of the input + * values. + * + * @returns An empty histogram column + */ +[[nodiscard]] std::unique_ptr make_empty_histogram_like(column_view const& values); + +} // namespace cudf::reduction::detail diff --git a/cpp/include/cudf/reduction/detail/reduction_functions.hpp b/cpp/include/cudf/reduction/detail/reduction_functions.hpp index 014a6ba70eb..704332c8e1d 100644 --- a/cpp/include/cudf/reduction/detail/reduction_functions.hpp +++ b/cpp/include/cudf/reduction/detail/reduction_functions.hpp @@ -131,6 +131,33 @@ std::unique_ptr all(column_view const& col, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +/** + * @brief Compute frequency for each unique element in the input column. + * + * The result histogram is stored in a structs column having two children. The first child contains + * unique elements from the input, and the second child contains their corresponding frequencies. + * + * @param input The column to compute histogram + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned scalar's device memory + * @return A list_scalar storing a structs column as the result histogram + */ +std::unique_ptr histogram(column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +/** + * @brief Merge multiple histograms together.
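+ * + * For illustration (an editor's sketch; the order of distinct values in the output is + * unspecified): + * @code{.pseudo} + * input  = [<1, 2>, <2, 1>, <2, 2>, <3, 1>]  // two histograms concatenated together + * output = [<1, 2>, <2, 3>, <3, 1>] + * @endcode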
+ * + * @param input The input given as multiple histograms concatenated together + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned scalar's device memory + * @return A list_scalar storing the result histogram + */ +std::unique_ptr merge_histogram(column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + /** * @brief Computes product of elements in input column * diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp index 2e6a643484e..b3f2a774a60 100644 --- a/cpp/src/aggregation/aggregation.cpp +++ b/cpp/src/aggregation/aggregation.cpp @@ -64,6 +64,12 @@ std::vector> simple_aggregations_collector::visit( return visit(col_type, static_cast(agg)); } +std::vector> simple_aggregations_collector::visit( + data_type col_type, histogram_aggregation const& agg) +{ + return visit(col_type, static_cast(agg)); +} + std::vector> simple_aggregations_collector::visit( data_type col_type, any_aggregation const& agg) { @@ -196,6 +202,12 @@ std::vector> simple_aggregations_collector::visit( return visit(col_type, static_cast(agg)); } +std::vector> simple_aggregations_collector::visit( + data_type col_type, merge_histogram_aggregation const& agg) +{ + return visit(col_type, static_cast(agg)); +} + std::vector> simple_aggregations_collector::visit( data_type col_type, covariance_aggregation const& agg) { @@ -246,6 +258,10 @@ void aggregation_finalizer::visit(count_aggregation const& agg) { visit(static_cast(agg)); } +void aggregation_finalizer::visit(histogram_aggregation const& agg) +{ + visit(static_cast(agg)); +} void aggregation_finalizer::visit(any_aggregation const& agg) { @@ -357,6 +373,11 @@ void aggregation_finalizer::visit(merge_m2_aggregation const& agg) visit(static_cast(agg)); } +void aggregation_finalizer::visit(merge_histogram_aggregation const& agg) +{ + visit(static_cast(agg)); +} + void aggregation_finalizer::visit(covariance_aggregation const& agg) { visit(static_cast(agg)); @@ -460,6 +481,16 @@ template std::unique_ptr make_count_aggregation make_count_aggregation( null_policy null_handling); +/// Factory to create a HISTOGRAM aggregation +template +std::unique_ptr make_histogram_aggregation() +{ + return std::make_unique(); +} +template std::unique_ptr make_histogram_aggregation(); +template std::unique_ptr make_histogram_aggregation(); +template std::unique_ptr make_histogram_aggregation(); + /// Factory to create a ANY aggregation template std::unique_ptr make_any_aggregation() @@ -764,6 +795,17 @@ std::unique_ptr make_merge_m2_aggregation() template std::unique_ptr make_merge_m2_aggregation(); template std::unique_ptr make_merge_m2_aggregation(); +/// Factory to create a MERGE_HISTOGRAM aggregation +template +std::unique_ptr make_merge_histogram_aggregation() +{ + return std::make_unique(); +} +template std::unique_ptr make_merge_histogram_aggregation(); +template std::unique_ptr +make_merge_histogram_aggregation(); +template std::unique_ptr make_merge_histogram_aggregation(); + /// Factory to create a COVARIANCE aggregation template std::unique_ptr make_covariance_aggregation(size_type min_periods, size_type ddof) diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index ce1fc71968f..e3c021eb66a 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -110,6 +111,15 @@ struct 
empty_column_constructor { 0, make_empty_column(type_to_id()), empty_like(values), 0, {}); } + if constexpr (k == aggregation::Kind::HISTOGRAM) { + return make_lists_column(0, + make_empty_column(type_to_id()), + cudf::reduction::detail::make_empty_histogram_like(values), + 0, + {}); + } + if constexpr (k == aggregation::Kind::MERGE_HISTOGRAM) { return empty_like(values); } + if constexpr (k == aggregation::Kind::RANK) { auto const& rank_agg = dynamic_cast(agg); if (rank_agg._method == cudf::rank_method::AVERAGE or diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index 3f977dc81d7..10c271f76f9 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -89,6 +89,18 @@ void aggregate_result_functor::operator()(aggregation co detail::group_count_all(helper.group_offsets(stream), helper.num_groups(stream), stream, mr)); } +template <> +void aggregate_result_functor::operator()(aggregation const& agg) +{ + if (cache.has_result(values, agg)) return; + + cache.add_result( + values, + agg, + detail::group_histogram( + get_grouped_values(), helper.group_labels(stream), helper.num_groups(stream), stream, mr)); +} + template <> void aggregate_result_functor::operator()(aggregation const& agg) { @@ -534,6 +546,24 @@ void aggregate_result_functor::operator()(aggregation con get_grouped_values(), helper.group_offsets(stream), helper.num_groups(stream), stream, mr)); } +/** + * @brief Perform merging for multiple histograms that correspond to the same key value. + * + * The partial results input to this aggregation is a structs column that is concatenated from + * multiple outputs of HISTOGRAM aggregations. + */ +template <> +void aggregate_result_functor::operator()(aggregation const& agg) +{ + if (cache.has_result(values, agg)) { return; } + + cache.add_result( + values, + agg, + detail::group_merge_histogram( + get_grouped_values(), helper.group_offsets(stream), helper.num_groups(stream), stream, mr)); +} + /** * @brief Creates column views with only valid elements in both input column views * diff --git a/cpp/src/groupby/sort/group_histogram.cu b/cpp/src/groupby/sort/group_histogram.cu new file mode 100644 index 00000000000..bb70037aaef --- /dev/null +++ b/cpp/src/groupby/sort/group_histogram.cu @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +namespace cudf::groupby::detail { + +namespace { + +std::unique_ptr build_histogram(column_view const& values, + cudf::device_span group_labels, + std::optional const& partial_counts, + size_type num_groups, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(static_cast(values.size()) == group_labels.size(), + "Size of values column should be the same as that of group labels.", + std::invalid_argument); + + // Attach group labels to the input values. 
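+ // Pairing each value with its group label reduces the per-group histogram problem to a + // single distinct-count pass over (label, value) rows, so the generic + // compute_row_frequencies() path below can be reused unchanged.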
+ auto const labels_cv = column_view{data_type{type_to_id()}, + static_cast(group_labels.size()), + group_labels.data(), + nullptr, + 0}; + auto const labeled_values = table_view{{labels_cv, values}}; + + // Build histogram for the labeled values. + auto [distinct_indices, distinct_counts] = + cudf::reduction::detail::compute_row_frequencies(labeled_values, partial_counts, stream, mr); + + // Gather the distinct rows for the output histogram. + auto out_table = cudf::detail::gather(labeled_values, + *distinct_indices, + out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); + + // Build offsets for the output lists column containing output histograms. + // Each list will be a histogram corresponding to one value group. + auto out_offsets = cudf::lists::detail::reconstruct_offsets( + out_table->get_column(0).view(), num_groups, stream, mr); + + std::vector> struct_children; + struct_children.emplace_back(std::move(out_table->release().back())); + struct_children.emplace_back(std::move(distinct_counts)); + auto out_structs = make_structs_column(static_cast(distinct_indices->size()), + std::move(struct_children), + 0, + {}, + stream, + mr); + + return make_lists_column( + num_groups, std::move(out_offsets), std::move(out_structs), 0, {}, stream, mr); +} + +} // namespace + +std::unique_ptr group_histogram(column_view const& values, + cudf::device_span group_labels, + size_type num_groups, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // Empty group should be handled before reaching here. + CUDF_EXPECTS(num_groups > 0, "Group should not be empty.", std::invalid_argument); + + return build_histogram(values, group_labels, std::nullopt, num_groups, stream, mr); +} + +std::unique_ptr group_merge_histogram(column_view const& values, + cudf::device_span group_offsets, + size_type num_groups, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // Empty group should be handled before reaching here. + CUDF_EXPECTS(num_groups > 0, "Group should not be empty.", std::invalid_argument); + + // The input must be a lists column without nulls. + CUDF_EXPECTS(!values.has_nulls(), "The input column must not have nulls.", std::invalid_argument); + CUDF_EXPECTS(values.type().id() == type_id::LIST, + "The input of MERGE_HISTOGRAM aggregation must be a lists column.", + std::invalid_argument); + + // Child of the input lists column must be a structs column without nulls, + // and its second child is a columns of integer type having no nulls. + auto const lists_cv = lists_column_view{values}; + auto const histogram_cv = lists_cv.get_sliced_child(stream); + CUDF_EXPECTS(!histogram_cv.has_nulls(), + "Child of the input lists column must not have nulls.", + std::invalid_argument); + CUDF_EXPECTS(histogram_cv.type().id() == type_id::STRUCT && histogram_cv.num_children() == 2, + "The input column has invalid histograms structure.", + std::invalid_argument); + CUDF_EXPECTS( + cudf::is_integral(histogram_cv.child(1).type()) && !histogram_cv.child(1).has_nulls(), + "The input column has invalid histograms structure.", + std::invalid_argument); + + // Concatenate the histograms corresponding to the same key values. + // That is equivalent to creating a new lists column (view) from the input lists column + // with new offsets gathered as below. 
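+ // For example, with group_offsets = [0, 2, 4] and input list offsets [0, 2, 3, 5, 7], the + // gather below yields new_offsets = [0, 3, 7]: each new list spans all the histograms that + // belong to one key, i.e. the histograms are concatenated per group.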
+ auto new_offsets = rmm::device_uvector(num_groups + 1, stream); + thrust::gather(rmm::exec_policy(stream), + group_offsets.begin(), + group_offsets.end(), + lists_cv.offsets_begin(), + new_offsets.begin()); + + // Generate labels for the new lists. + auto key_labels = rmm::device_uvector(histogram_cv.size(), stream); + cudf::detail::label_segments( + new_offsets.begin(), new_offsets.end(), key_labels.begin(), key_labels.end(), stream); + + auto const structs_cv = structs_column_view{histogram_cv}; + auto const input_values = structs_cv.get_sliced_child(0, stream); + auto const input_counts = structs_cv.get_sliced_child(1, stream); + + return build_histogram(input_values, key_labels, input_counts, num_groups, stream, mr); +} + +} // namespace cudf::groupby::detail diff --git a/cpp/src/groupby/sort/group_reductions.hpp b/cpp/src/groupby/sort/group_reductions.hpp index fc24b679db5..3aa79f226a3 100644 --- a/cpp/src/groupby/sort/group_reductions.hpp +++ b/cpp/src/groupby/sort/group_reductions.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -216,6 +216,33 @@ std::unique_ptr group_count_all(cudf::device_span group size_type num_groups, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +/** + * @brief Internal API to compute histogram for each group in @p values. + * + * The returned column is a lists column, each list corresponds to one input group and stores the + * histogram of the distinct elements in that group in the form of `STRUCT`. + * + * Note that the order of distinct elements in each output list is not specified. + * + * @code{.pseudo} + * values = [2, 1, 1, 3, 5, 2, 2, 3, 1, 4] + * group_labels = [0, 0, 0, 1, 1, 1, 1, 1, 2, 2] + * num_groups = 3 + * + * output = [[<1, 2>, <2, 1>], [<2, 2>, <3, 2>, <5, 1>], [<1, 1>, <4, 1>]] + * @endcode + * + * @param values Grouped values to compute histogram + * @param group_labels ID of group that the corresponding value belongs to + * @param num_groups Number of groups + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + */ +std::unique_ptr group_histogram(column_view const& values, + cudf::device_span group_labels, + size_type num_groups, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Internal API to calculate sum of squares of differences from means. @@ -441,6 +468,34 @@ std::unique_ptr group_merge_m2(column_view const& values, size_type num_groups, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); + +/** + * @brief Internal API to merge multiple output of HISTOGRAM aggregation. + * + * The input values column should be given as a lists column in the form of + * `LIST>`. + * After merging, the order of distinct elements in each output list is not specified. 
+ * + * @code{.pseudo} + * values = [ [<1, 2>, <2, 1>], [<2, 2>], [<3, 2>, <2, 1>], [<1, 1>, <2, 1>] ] + * group_offsets = [ 0, 2, 4] + * num_groups = 2 + * + * output = [[<1, 2>, <2, 3>], [<1, 1>, <2, 2>, <3, 2>]] + * @endcode + * + * @param values Grouped histograms to be merged + * @param group_offsets Offsets of groups' starting points within @p values + * @param num_groups Number of groups + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + */ +std::unique_ptr group_merge_histogram(column_view const& values, + cudf::device_span group_offsets, + size_type num_groups, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + /** * @brief Internal API to find covariance of child columns of a non-nullable struct column. * diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu new file mode 100644 index 00000000000..fa84bbeb25d --- /dev/null +++ b/cpp/src/reductions/histogram.cu @@ -0,0 +1,273 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include + +namespace cudf::reduction::detail { + +namespace { + +// Always use 64-bit signed integer for storing count. +using histogram_count_type = int64_t; + +/** + * @brief The functor to accumulate the frequency of each distinct row in the input table. + */ +template +struct reduce_fn : cudf::detail::reduce_by_row_fn_base { + CountType const* d_partial_output; + + reduce_fn(MapView const& d_map, + KeyHasher const& d_hasher, + KeyEqual const& d_equal, + CountType* const d_output, + CountType const* const d_partial_output) + : cudf::detail::reduce_by_row_fn_base{d_map, + d_hasher, + d_equal, + d_output}, + d_partial_output{d_partial_output} + { + } + + // Count the number of rows in each group of rows that are compared equal. + __device__ void operator()(size_type const idx) const + { + auto const increment = d_partial_output ? d_partial_output[idx] : CountType{1}; + auto const count = + cuda::atomic_ref(*this->get_output_ptr(idx)); + count.fetch_add(increment, cuda::std::memory_order_relaxed); + } +}; + +/** + * @brief The builder to construct an instance of `reduce_fn` functor. + */ +template +struct reduce_func_builder { + CountType const* const d_partial_output; + + reduce_func_builder(CountType const* const d_partial_output) : d_partial_output{d_partial_output} + { + } + + template + auto build(MapView const& d_map, + KeyHasher const& d_hasher, + KeyEqual const& d_equal, + CountType* const d_output) + { + return reduce_fn{ + d_map, d_hasher, d_equal, d_output, d_partial_output}; + } +}; + +/** + * @brief Specialized functor to check for not-zero of the second component of the input.
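+ * + * The reduction in `compute_row_frequencies` below writes a positive group size only at each + * distinct row's representative index and zero everywhere else, so filtering on a non-zero + * second component keeps exactly one (index, count) pair per distinct row.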
+ */ +struct is_not_zero { + template + __device__ bool operator()(Pair const input) const + { + return thrust::get<1>(input) != 0; + } +}; + +/** + * @brief Building a histogram by gathering distinct rows from the input table and their + * corresponding distinct counts. + * + * @param input The input table + * @param distinct_indices Indices of the distinct rows + * @param distinct_counts Distinct counts corresponding to the distinct rows + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned object's device memory + * @return A list_scalar storing the output histogram + */ +auto gather_histogram(table_view const& input, + device_span distinct_indices, + std::unique_ptr&& distinct_counts, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto distinct_rows = cudf::detail::gather(input, + distinct_indices, + out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); + + std::vector> struct_children; + struct_children.emplace_back(std::move(distinct_rows->release().front())); + struct_children.emplace_back(std::move(distinct_counts)); + auto output_structs = make_structs_column( + static_cast(distinct_indices.size()), std::move(struct_children), 0, {}, stream, mr); + + return std::make_unique( + std::move(*output_structs.release()), true, stream, mr); +} + +} // namespace + +std::unique_ptr make_empty_histogram_like(column_view const& values) +{ + std::vector> struct_children; + struct_children.emplace_back(empty_like(values)); + struct_children.emplace_back(make_numeric_column(data_type{type_id::INT64}, 0)); + return std::make_unique(data_type{type_id::STRUCT}, + 0, + rmm::device_buffer{}, + rmm::device_buffer{}, + 0, + std::move(struct_children)); +} + +std::pair>, std::unique_ptr> +compute_row_frequencies(table_view const& input, + std::optional const& partial_counts, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const has_nested_columns = cudf::detail::has_nested_columns(input); + + // Nested types are not tested, thus we just throw exception if we see such input for now. + // We should remove this check after having enough tests. + CUDF_EXPECTS(!has_nested_columns, + "Nested types are not yet supported in histogram aggregation.", + std::invalid_argument); + + auto map = cudf::detail::hash_map_type{ + compute_hash_table_size(input.num_rows()), + cuco::empty_key{-1}, + cuco::empty_value{std::numeric_limits::min()}, + cudf::detail::hash_table_allocator_type{default_allocator{}, stream}, + stream.value()}; + + auto const preprocessed_input = + cudf::experimental::row::hash::preprocessed_table::create(input, stream); + auto const has_nulls = nullate::DYNAMIC{cudf::has_nested_nulls(input)}; + + auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input); + auto const key_hasher = row_hasher.device_hasher(has_nulls); + auto const row_comp = cudf::experimental::row::equality::self_comparator(preprocessed_input); + + auto const pair_iter = cudf::detail::make_counting_transform_iterator( + size_type{0}, [] __device__(size_type const i) { return cuco::make_pair(i, i); }); + + // Always compare NaNs as equal. 
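+ // With this comparator every NaN compares equal to every other NaN, so a column containing + // n NaN values produces a single histogram entry with count n rather than n entries of + // count 1.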
+ using nan_equal_comparator = + cudf::experimental::row::equality::nan_equal_physical_equality_comparator; + auto const value_comp = nan_equal_comparator{}; + + if (has_nested_columns) { + auto const key_equal = row_comp.equal_to(has_nulls, null_equality::EQUAL, value_comp); + map.insert(pair_iter, pair_iter + input.num_rows(), key_hasher, key_equal, stream.value()); + } else { + auto const key_equal = row_comp.equal_to(has_nulls, null_equality::EQUAL, value_comp); + map.insert(pair_iter, pair_iter + input.num_rows(), key_hasher, key_equal, stream.value()); + } + + // Gather the indices of distinct rows. + auto distinct_indices = std::make_unique>( + static_cast(map.get_size()), stream, mr); + + // Store the number of occurrences of each distinct row. + auto distinct_counts = make_numeric_column(data_type{type_to_id()}, + static_cast(map.get_size()), + mask_state::UNALLOCATED, + stream, + mr); + + // Compute frequencies (aka distinct counts) for the input rows. + // Note that we consider null and NaNs as always equal. + auto const reduction_results = cudf::detail::hash_reduce_by_row( + map, + preprocessed_input, + input.num_rows(), + has_nulls, + has_nested_columns, + null_equality::EQUAL, + nan_equality::ALL_EQUAL, + reduce_func_builder{ + partial_counts ? partial_counts.value().begin() : nullptr}, + histogram_count_type{0}, + stream, + rmm::mr::get_current_device_resource()); + + auto const input_it = thrust::make_zip_iterator( + thrust::make_tuple(thrust::make_counting_iterator(0), reduction_results.begin())); + auto const output_it = thrust::make_zip_iterator(thrust::make_tuple( + distinct_indices->begin(), distinct_counts->mutable_view().begin())); + + // Reduction results above are either group sizes of equal rows, or `0`. + // The final output is non-zero group sizes only. + thrust::copy_if( + rmm::exec_policy(stream), input_it, input_it + input.num_rows(), output_it, is_not_zero{}); + + return {std::move(distinct_indices), std::move(distinct_counts)}; +} + +std::unique_ptr histogram(column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // Empty group should be handled before reaching here. + CUDF_EXPECTS(input.size() > 0, "Input should not be empty.", std::invalid_argument); + + auto const input_tv = table_view{{input}}; + auto [distinct_indices, distinct_counts] = + compute_row_frequencies(input_tv, std::nullopt, stream, mr); + return gather_histogram(input_tv, *distinct_indices, std::move(distinct_counts), stream, mr); +} + +std::unique_ptr merge_histogram(column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // Empty group should be handled before reaching here. 
+ CUDF_EXPECTS(input.size() > 0, "Input should not be empty.", std::invalid_argument); + CUDF_EXPECTS(!input.has_nulls(), "The input column must not have nulls.", std::invalid_argument); + CUDF_EXPECTS(input.type().id() == type_id::STRUCT && input.num_children() == 2, + "The input must be a structs column having two children.", + std::invalid_argument); + CUDF_EXPECTS(cudf::is_integral(input.child(1).type()) && !input.child(1).has_nulls(), + "The second child of the input column must be of integral type and without nulls.", + std::invalid_argument); + + auto const structs_cv = structs_column_view{input}; + auto const input_values = structs_cv.get_sliced_child(0, stream); + auto const input_counts = structs_cv.get_sliced_child(1, stream); + + auto const values_tv = table_view{{input_values}}; + auto [distinct_indices, distinct_counts] = + compute_row_frequencies(values_tv, input_counts, stream, mr); + return gather_histogram(values_tv, *distinct_indices, std::move(distinct_counts), stream, mr); +} + +} // namespace cudf::reduction::detail diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp index 2fef8aa8785..23171baaa45 100644 --- a/cpp/src/reductions/reductions.cpp +++ b/cpp/src/reductions/reductions.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -59,6 +60,8 @@ struct reduce_dispatch_functor { case aggregation::MAX: return max(col, output_dtype, init, stream, mr); case aggregation::ANY: return any(col, output_dtype, init, stream, mr); case aggregation::ALL: return all(col, output_dtype, init, stream, mr); + case aggregation::HISTOGRAM: return histogram(col, stream, mr); + case aggregation::MERGE_HISTOGRAM: return merge_histogram(col, stream, mr); case aggregation::SUM_OF_SQUARES: return sum_of_squares(col, output_dtype, stream, mr); case aggregation::MEAN: return mean(col, output_dtype, stream, mr); case aggregation::VARIANCE: { @@ -165,6 +168,15 @@ std::unique_ptr reduce(column_view const& col, return tdigest::detail::make_empty_tdigest_scalar(stream, mr); } + if (agg.kind == aggregation::HISTOGRAM) { + return std::make_unique( + std::move(*reduction::detail::make_empty_histogram_like(col)), true, stream, mr); + } + if (agg.kind == aggregation::MERGE_HISTOGRAM) { + return std::make_unique( + std::move(*reduction::detail::make_empty_histogram_like(col.child(0))), true, stream, mr); + } + if (output_dtype.id() == type_id::LIST) { if (col.type() == output_dtype) { return make_empty_scalar_like(col, stream, mr); } // Under some circumstance, the output type will become the List of input type, diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 68ff6c54c99..04939f3cd6d 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -116,6 +116,7 @@ ConfigureTest( groupby/covariance_tests.cpp groupby/groupby_test_util.cpp groupby/groups_tests.cpp + groupby/histogram_tests.cpp groupby/keys_tests.cpp groupby/lists_tests.cpp groupby/m2_tests.cpp diff --git a/cpp/tests/groupby/histogram_tests.cpp b/cpp/tests/groupby/histogram_tests.cpp new file mode 100644 index 00000000000..c5833f40cf2 --- /dev/null +++ b/cpp/tests/groupby/histogram_tests.cpp @@ -0,0 +1,396 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +using int32s_col = cudf::test::fixed_width_column_wrapper; +using int64s_col = cudf::test::fixed_width_column_wrapper; +using structs_col = cudf::test::structs_column_wrapper; + +auto groupby_histogram(cudf::column_view const& keys, + cudf::column_view const& values, + cudf::aggregation::Kind agg_kind) +{ + CUDF_EXPECTS( + agg_kind == cudf::aggregation::HISTOGRAM || agg_kind == cudf::aggregation::MERGE_HISTOGRAM, + "Aggregation must be either HISTOGRAM or MERGE_HISTOGRAM."); + + std::vector requests; + requests.emplace_back(); + requests[0].values = values; + if (agg_kind == cudf::aggregation::HISTOGRAM) { + requests[0].aggregations.push_back( + cudf::make_histogram_aggregation()); + } else { + requests[0].aggregations.push_back( + cudf::make_merge_histogram_aggregation()); + } + + auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys})); + auto const agg_results = gb_obj.aggregate(requests, cudf::test::get_default_stream()); + auto const agg_histogram = agg_results.second[0].results[0]->view(); + EXPECT_EQ(agg_histogram.type().id(), cudf::type_id::LIST); + EXPECT_EQ(agg_histogram.null_count(), 0); + + auto const histograms = cudf::lists_column_view{agg_histogram}.child(); + EXPECT_EQ(histograms.num_children(), 2); + EXPECT_EQ(histograms.null_count(), 0); + EXPECT_EQ(histograms.child(1).null_count(), 0); + + auto const key_sort_order = cudf::sorted_order(agg_results.first->view(), {}, {}); + auto sorted_keys = + std::move(cudf::gather(agg_results.first->view(), *key_sort_order)->release().front()); + auto const sorted_vals = + std::move(cudf::gather(cudf::table_view{{agg_histogram}}, *key_sort_order)->release().front()); + auto sorted_histograms = cudf::lists::sort_lists(cudf::lists_column_view{*sorted_vals}, + cudf::order::ASCENDING, + cudf::null_order::BEFORE, + rmm::mr::get_current_device_resource()); + + return std::pair{std::move(sorted_keys), std::move(sorted_histograms)}; +} + +template +struct GroupbyHistogramTest : public cudf::test::BaseFixture {}; + +template +struct GroupbyMergeHistogramTest : public cudf::test::BaseFixture {}; + +// Avoid unsigned types, as the tests below have negative values in their input. +using HistogramTestTypes = cudf::test::Concat, + cudf::test::FloatingPointTypes, + cudf::test::FixedPointTypes, + cudf::test::ChronoTypes>; +TYPED_TEST_SUITE(GroupbyHistogramTest, HistogramTestTypes); +TYPED_TEST_SUITE(GroupbyMergeHistogramTest, HistogramTestTypes); + +TYPED_TEST(GroupbyHistogramTest, EmptyInput) +{ + using col_data = cudf::test::fixed_width_column_wrapper; + + auto const keys = int32s_col{}; + auto const values = col_data{}; + auto const [res_keys, res_histogram] = + groupby_histogram(keys, values, cudf::aggregation::HISTOGRAM); + + // The structure of the output is already verified in the function `groupby_histogram`. 
+ ASSERT_EQ(res_histogram->size(), 0); +} + +TYPED_TEST(GroupbyHistogramTest, SimpleInputNoNull) +{ + using col_data = cudf::test::fixed_width_column_wrapper; + + // key = 0: values = [2, 2, -3, -2, 2] + // key = 1: values = [2, 0, 5, 2, 1] + // key = 2: values = [-3, 1, 1, 2, 2] + auto const keys = int32s_col{2, 0, 2, 1, 1, 1, 0, 0, 0, 1, 2, 2, 1, 0, 2}; + auto const values = col_data{-3, 2, 1, 2, 0, 5, 2, -3, -2, 2, 1, 2, 1, 2, 2}; + + auto const expected_keys = int32s_col{0, 1, 2}; + auto const expected_histogram = [] { + auto structs = [] { + auto values = col_data{-3, -2, 2, 0, 1, 2, 5, -3, 1, 2}; + auto counts = int64s_col{1, 1, 3, 1, 1, 2, 1, 1, 2, 2}; + return structs_col{{values, counts}}; + }(); + return cudf::make_lists_column( + 3, int32s_col{0, 3, 7, 10}.release(), structs.release(), 0, rmm::device_buffer{}); + }(); + + auto const [res_keys, res_histogram] = + groupby_histogram(keys, values, cudf::aggregation::HISTOGRAM); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_keys, *res_keys); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected_histogram, *res_histogram); +} + +TYPED_TEST(GroupbyHistogramTest, SlicedInputNoNull) +{ + using col_data = cudf::test::fixed_width_column_wrapper; + + auto const keys_original = int32s_col{2, 0, 2, 1, 0, 2, 0, 2, 1, 1, 1, 0, 0, 0, 1, 2, 2, 1, 0, 2}; + auto const values_original = + col_data{1, 2, 0, 2, 1, -3, 2, 1, 2, 0, 5, 2, -3, -2, 2, 1, 2, 1, 2, 2}; + // key = 0: values = [2, 2, -3, -2, 2] + // key = 1: values = [2, 0, 5, 2, 1] + // key = 2: values = [-3, 1, 1, 2, 2] + auto const keys = cudf::slice(keys_original, {5, 20})[0]; + auto const values = cudf::slice(values_original, {5, 20})[0]; + + auto const expected_keys = int32s_col{0, 1, 2}; + auto const expected_histogram = [] { + auto structs = [] { + auto values = col_data{-3, -2, 2, 0, 1, 2, 5, -3, 1, 2}; + auto counts = int64s_col{1, 1, 3, 1, 1, 2, 1, 1, 2, 2}; + return structs_col{{values, counts}}; + }(); + return cudf::make_lists_column( + 3, int32s_col{0, 3, 7, 10}.release(), structs.release(), 0, rmm::device_buffer{}); + }(); + + auto const [res_keys, res_histogram] = + groupby_histogram(keys, values, cudf::aggregation::HISTOGRAM); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_keys, *res_keys); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected_histogram, *res_histogram); +} + +TYPED_TEST(GroupbyHistogramTest, InputWithNulls) +{ + using col_data = cudf::test::fixed_width_column_wrapper; + using namespace cudf::test::iterators; + auto constexpr null{0}; + + // key = 0: values = [-3, null, 2, null, 2] + // key = 1: values = [1, 2, null, 5, 2, -3, 1, 1] + // key = 2: values = [null, 2, 0, -2, 2, null, 2] + auto const keys = int32s_col{2, 0, 2, 1, 1, 1, 2, 1, 1, 0, 1, 2, 0, 0, 1, 2, 2, 1, 0, 2}; + auto const values = + col_data{{null, -3, 2, 1, 2, null, 0, 5, 2, null, -3, -2, 2, null, 1, 2, null, 1, 2, 2}, + nulls_at({0, 5, 9, 13, 16})}; + + auto const expected_keys = int32s_col{0, 1, 2}; + auto const expected_histogram = [] { + auto structs = [] { + auto values = col_data{{null, -3, 2, null, -3, 1, 2, 5, null, -2, 0, 2}, nulls_at({0, 3, 8})}; + auto counts = int64s_col{2, 1, 2, 1, 1, 3, 2, 1, 2, 1, 1, 3}; + return structs_col{{values, counts}}; + }(); + return cudf::make_lists_column( + 3, int32s_col{0, 3, 8, 12}.release(), structs.release(), 0, rmm::device_buffer{}); + }(); + + auto const [res_keys, res_histogram] = + groupby_histogram(keys, values, cudf::aggregation::HISTOGRAM); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_keys, *res_keys); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected_histogram, *res_histogram); 
+} + +TYPED_TEST(GroupbyHistogramTest, SlicedInputWithNulls) +{ + using col_data = cudf::test::fixed_width_column_wrapper; + using namespace cudf::test::iterators; + auto constexpr null{0}; + + auto const keys_original = + int32s_col{1, 0, 2, 2, 0, 2, 0, 2, 1, 1, 1, 2, 1, 1, 0, 1, 2, 0, 0, 1, 2, 2, 1, 0, 2, 0, 1, 2}; + auto const values_original = + col_data{{null, 1, 1, 2, 1, null, -3, 2, 1, 2, null, 0, 5, 2, + null, -3, -2, 2, null, 1, 2, null, 1, 2, 2, null, 1, 2}, + nulls_at({0, 5, 10, 14, 18, 21, 25})}; + + // key = 0: values = [-3, null, 2, null, 2] + // key = 1: values = [1, 2, null, 5, 2, -3, 1, 1] + // key = 2: values = [null, 2, 0, -2, 2, null, 2] + auto const keys = cudf::slice(keys_original, {5, 25})[0]; + auto const values = cudf::slice(values_original, {5, 25})[0]; + + auto const expected_keys = int32s_col{0, 1, 2}; + auto const expected_histogram = [] { + auto structs = [] { + auto values = col_data{{null, -3, 2, null, -3, 1, 2, 5, null, -2, 0, 2}, nulls_at({0, 3, 8})}; + auto counts = int64s_col{2, 1, 2, 1, 1, 3, 2, 1, 2, 1, 1, 3}; + return structs_col{{values, counts}}; + }(); + return cudf::make_lists_column( + 3, int32s_col{0, 3, 8, 12}.release(), structs.release(), 0, rmm::device_buffer{}); + }(); + + auto const [res_keys, res_histogram] = + groupby_histogram(keys, values, cudf::aggregation::HISTOGRAM); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_keys, *res_keys); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected_histogram, *res_histogram); +} + +TYPED_TEST(GroupbyMergeHistogramTest, EmptyInput) +{ + using col_data = cudf::test::fixed_width_column_wrapper; + + auto const keys = int32s_col{}; + auto const values = [] { + auto structs = [] { + auto values = col_data{}; + auto counts = int64s_col{}; + return structs_col{{values, counts}}; + }(); + return cudf::make_lists_column( + 0, int32s_col{}.release(), structs.release(), 0, rmm::device_buffer{}); + }(); + auto const [res_keys, res_histogram] = + groupby_histogram(keys, *values, cudf::aggregation::MERGE_HISTOGRAM); + + // The structure of the output is already verified in the function `groupby_histogram`. 
+ ASSERT_EQ(res_histogram->size(), 0); +} + +TYPED_TEST(GroupbyMergeHistogramTest, SimpleInputNoNull) +{ + using col_data = cudf::test::fixed_width_column_wrapper; + + // key = 0: histograms = [[<-3, 1>, <-2, 1>, <2, 3>], [<0, 1>, <1, 1>], [<-3, 3>, <0, 1>, <1, 2>]] + // key = 1: histograms = [[<-2, 1>, <1, 3>, <2, 2>], [<0, 2>, <1, 1>, <2, 2>]] + auto const keys = int32s_col{0, 1, 0, 1, 0}; + auto const values = [] { + auto structs = [] { + auto values = col_data{-3, -2, 2, -2, 1, 2, 0, 1, 0, 1, 2, -3, 0, 1}; + auto counts = int64s_col{1, 1, 3, 1, 3, 2, 1, 1, 2, 1, 2, 3, 1, 2}; + return structs_col{{values, counts}}; + }(); + return cudf::make_lists_column( + 5, int32s_col{0, 3, 6, 8, 11, 14}.release(), structs.release(), 0, rmm::device_buffer{}); + }(); + + auto const expected_keys = int32s_col{0, 1}; + auto const expected_histogram = [] { + auto structs = [] { + auto values = col_data{-3, -2, 0, 1, 2, -2, 0, 1, 2}; + auto counts = int64s_col{4, 1, 2, 3, 3, 1, 2, 4, 4}; + return structs_col{{values, counts}}; + }(); + return cudf::make_lists_column( + 2, int32s_col{0, 5, 9}.release(), structs.release(), 0, rmm::device_buffer{}); + }(); + + auto const [res_keys, res_histogram] = + groupby_histogram(keys, *values, cudf::aggregation::MERGE_HISTOGRAM); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_keys, *res_keys); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected_histogram, *res_histogram); +} + +TYPED_TEST(GroupbyMergeHistogramTest, SlicedInputNoNull) +{ + using col_data = cudf::test::fixed_width_column_wrapper; + + // key = 0: histograms = [[<-3, 1>, <-2, 1>, <2, 3>], [<0, 1>, <1, 1>], [<-3, 3>, <0, 1>, <1, 2>]] + // key = 1: histograms = [[<-2, 1>, <1, 3>, <2, 2>], [<0, 2>, <1, 1>, <2, 2>]] + auto const keys_original = int32s_col{0, 1, 0, 1, 0, 1, 0}; + auto const values_original = [] { + auto structs = [] { + auto values = col_data{0, 2, -3, 1, -3, -2, 2, -2, 1, 2, 0, 1, 0, 1, 2, -3, 0, 1}; + auto counts = int64s_col{1, 2, 3, 1, 1, 1, 3, 1, 3, 2, 1, 1, 2, 1, 2, 3, 1, 2}; + return structs_col{{values, counts}}; + }(); + return cudf::make_lists_column(7, + int32s_col{0, 2, 4, 7, 10, 12, 15, 18}.release(), + structs.release(), + 0, + rmm::device_buffer{}); + }(); + auto const keys = cudf::slice(keys_original, {2, 7})[0]; + auto const values = cudf::slice(*values_original, {2, 7})[0]; + + auto const expected_keys = int32s_col{0, 1}; + auto const expected_histogram = [] { + auto structs = [] { + auto values = col_data{-3, -2, 0, 1, 2, -2, 0, 1, 2}; + auto counts = int64s_col{4, 1, 2, 3, 3, 1, 2, 4, 4}; + return structs_col{{values, counts}}; + }(); + return cudf::make_lists_column( + 2, int32s_col{0, 5, 9}.release(), structs.release(), 0, rmm::device_buffer{}); + }(); + + auto const [res_keys, res_histogram] = + groupby_histogram(keys, values, cudf::aggregation::MERGE_HISTOGRAM); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_keys, *res_keys); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected_histogram, *res_histogram); +} + +TYPED_TEST(GroupbyMergeHistogramTest, InputWithNulls) +{ + using col_data = cudf::test::fixed_width_column_wrapper; + using namespace cudf::test::iterators; + auto constexpr null{0}; + + // key = 0: histograms = [[<null, 1>, <2, 3>], [<null, 2>, <1, 1>], [<0, 1>, <1, 2>]] + // key = 1: histograms = [[<null, 1>, <1, 3>, <2, 2>], [<0, 2>, <1, 1>, <2, 2>]] + auto const keys = int32s_col{0, 1, 1, 0, 0}; + auto const values = [] { + auto structs = [] { + auto values = col_data{{null, 2, null, 1, 2, 0, 1, 2, null, 1, 0, 1}, nulls_at({0, 2, 8})}; + auto counts = int64s_col{1, 3, 1, 3, 2, 2, 1, 2, 2, 1, 1, 2}; + return structs_col{{values, counts}}; + }(); + return cudf::make_lists_column( + 5, int32s_col{0, 2, 5, 8, 10, 12}.release(), structs.release(), 0, rmm::device_buffer{}); + }(); + + auto const expected_keys = int32s_col{0, 1}; + auto const expected_histogram = [] { + auto structs = [] { + auto values = col_data{{null, 0, 1, 2, null, 0, 1, 2}, nulls_at({0, 4})}; + auto counts = int64s_col{3, 1, 3, 3, 1, 2, 4, 4}; + return structs_col{{values, counts}}; + }(); + return cudf::make_lists_column( + 2, int32s_col{0, 4, 8}.release(), structs.release(), 0, rmm::device_buffer{}); + }(); + + auto const [res_keys, res_histogram] = + groupby_histogram(keys, *values, cudf::aggregation::MERGE_HISTOGRAM); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_keys, *res_keys); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected_histogram, *res_histogram); +} + +TYPED_TEST(GroupbyMergeHistogramTest, SlicedInputWithNulls) +{ + using col_data = cudf::test::fixed_width_column_wrapper; + using namespace cudf::test::iterators; + auto constexpr null{0}; + + // key = 0: histograms = [[<null, 1>, <2, 3>], [<null, 2>, <1, 1>], [<0, 1>, <1, 2>]] + // key = 1: histograms = [[<null, 1>, <1, 3>, <2, 2>], [<0, 2>, <1, 1>, <2, 2>]] + auto const keys_original = int32s_col{0, 1, 0, 1, 1, 0, 0}; + auto const values_original = [] { + auto structs = [] { + auto values = col_data{{null, 2, null, 1, null, 2, null, 1, 2, 0, 1, 2, null, 1, 0, 1}, + nulls_at({0, 2, 4, 6, 12})}; + auto counts = int64s_col{1, 3, 2, 1, 1, 3, 1, 3, 2, 2, 1, 2, 2, 1, 1, 2}; + return structs_col{{values, counts}}; + }(); + return cudf::make_lists_column(7, + int32s_col{0, 2, 4, 6, 9, 12, 14, 16}.release(), + structs.release(), + 0, + rmm::device_buffer{}); + }(); + auto const keys = cudf::slice(keys_original, {2, 7})[0]; + auto const values = cudf::slice(*values_original, {2, 7})[0]; + + auto const expected_keys = int32s_col{0, 1}; + auto const expected_histogram = [] { + auto structs = [] { + auto values = col_data{{null, 0, 1, 2, null, 0, 1, 2}, nulls_at({0, 4})}; + auto counts = int64s_col{3, 1, 3, 3, 1, 2, 4, 4}; + return structs_col{{values, counts}}; + }(); + return cudf::make_lists_column( + 2, int32s_col{0, 4, 8}.release(), structs.release(), 0, rmm::device_buffer{}); + }(); + + auto const [res_keys, res_histogram] = + groupby_histogram(keys, values, cudf::aggregation::MERGE_HISTOGRAM); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_keys, *res_keys); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected_histogram, *res_histogram); +} diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp index 2561f3f9886..7644ac48892 100644 --- a/cpp/tests/reductions/reduction_tests.cpp +++ b/cpp/tests/reductions/reduction_tests.cpp @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -379,6 +380,212 @@ TYPED_TEST(ReductionTest, SumOfSquare) expected_null_value); } +auto histogram_reduction(cudf::column_view const& input, + std::unique_ptr const& agg) +{ + CUDF_EXPECTS( + agg->kind == cudf::aggregation::HISTOGRAM || agg->kind == cudf::aggregation::MERGE_HISTOGRAM, + "Aggregation must be either HISTOGRAM or MERGE_HISTOGRAM."); + + auto const result_scalar = cudf::reduce(input, *agg, cudf::data_type{cudf::type_id::INT64}); + EXPECT_EQ(result_scalar->is_valid(), true); + + auto const result_list_scalar = dynamic_cast(result_scalar.get()); + EXPECT_NE(result_list_scalar, nullptr); + + auto const histogram = result_list_scalar->view(); + EXPECT_EQ(histogram.num_children(), 2); + EXPECT_EQ(histogram.null_count(), 0); + EXPECT_EQ(histogram.child(1).null_count(), 0); + + //
Sort the histogram based on the first column (unique input values). + auto const sort_order = cudf::sorted_order(cudf::table_view{{histogram.child(0)}}, {}, {}); + return std::move(cudf::gather(cudf::table_view{{histogram}}, *sort_order)->release().front()); +} + +template +struct ReductionHistogramTest : public cudf::test::BaseFixture {}; + +// Avoid unsigned types, as the tests below have negative values in their input. +using HistogramTestTypes = cudf::test::Concat, + cudf::test::FloatingPointTypes, + cudf::test::FixedPointTypes, + cudf::test::ChronoTypes>; +TYPED_TEST_SUITE(ReductionHistogramTest, HistogramTestTypes); + +TYPED_TEST(ReductionHistogramTest, Histogram) +{ + using data_col = cudf::test::fixed_width_column_wrapper; + using int64_col = cudf::test::fixed_width_column_wrapper; + using structs_col = cudf::test::structs_column_wrapper; + + auto const agg = cudf::make_histogram_aggregation(); + + // Empty input. + { + auto const input = data_col{}; + auto const expected = [] { + auto child1 = data_col{}; + auto child2 = int64_col{}; + return structs_col{{child1, child2}}; + }(); + auto const result = histogram_reduction(input, agg); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } + + { + auto const input = data_col{-3, 2, 1, 2, 0, 5, 2, -3, -2, 2, 1}; + auto const expected = [] { + auto child1 = data_col{-3, -2, 0, 1, 2, 5}; + auto child2 = int64_col{2, 1, 1, 2, 4, 1}; + return structs_col{{child1, child2}}; + }(); + auto const result = histogram_reduction(input, agg); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } + + // Test without nulls, sliced input. + { + auto const input_original = data_col{-3, 2, 1, 2, 0, 5, 2, -3, -2, 2, 1}; + auto const input = cudf::slice(input_original, {0, 7})[0]; + auto const expected = [] { + auto child1 = data_col{-3, 0, 1, 2, 5}; + auto child2 = int64_col{1, 1, 1, 3, 1}; + return structs_col{{child1, child2}}; + }(); + auto const result = histogram_reduction(input, agg); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } + + // Test with nulls. + using namespace cudf::test::iterators; + auto constexpr null{0}; + { + auto const input = data_col{{null, -3, 2, 1, 2, 0, null, 5, 2, null, -3, -2, null, 2, 1}, + nulls_at({0, 6, 9, 12})}; + auto const expected = [] { + auto child1 = data_col{{null, -3, -2, 0, 1, 2, 5}, null_at(0)}; + auto child2 = int64_col{4, 2, 1, 1, 2, 4, 1}; + return structs_col{{child1, child2}}; + }(); + auto const result = histogram_reduction(input, agg); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } + + // Test with nulls, sliced input. + { + auto const input_original = data_col{ + {null, -3, 2, 1, 2, 0, null, 5, 2, null, -3, -2, null, 2, 1}, nulls_at({0, 6, 9, 12})}; + auto const input = cudf::slice(input_original, {0, 9})[0]; + auto const expected = [] { + auto child1 = data_col{{null, -3, 0, 1, 2, 5}, null_at(0)}; + auto child2 = int64_col{2, 1, 1, 1, 3, 1}; + return structs_col{{child1, child2}}; + }(); + auto const result = histogram_reduction(input, agg); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } +} + +TYPED_TEST(ReductionHistogramTest, MergeHistogram) +{ + using data_col = cudf::test::fixed_width_column_wrapper; + using int64_col = cudf::test::fixed_width_column_wrapper; + using structs_col = cudf::test::structs_column_wrapper; + + auto const agg = cudf::make_merge_histogram_aggregation(); + + // Empty input. 
+ { + auto const input = [] { + auto child1 = data_col{}; + auto child2 = int64_col{}; + return structs_col{{child1, child2}}; + }(); + auto const expected = [] { + auto child1 = data_col{}; + auto child2 = int64_col{}; + return structs_col{{child1, child2}}; + }(); + auto const result = histogram_reduction(input, agg); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } + + // Test without nulls. + { + auto const input = [] { + auto child1 = data_col{-3, 2, 1, 2, 0, 5, 2, -3, -2, 2, 1}; + auto child2 = int64_col{2, 1, 1, 2, 4, 1, 2, 3, 5, 3, 4}; + return structs_col{{child1, child2}}; + }(); + + auto const expected = [] { + auto child1 = data_col{-3, -2, 0, 1, 2, 5}; + auto child2 = int64_col{5, 5, 4, 5, 8, 1}; + return structs_col{{child1, child2}}; + }(); + auto const result = histogram_reduction(input, agg); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } + + // Test without nulls, sliced input. + { + auto const input_original = [] { + auto child1 = data_col{-3, 2, 1, 2, 0, 5, 2, -3, -2, 2, 1}; + auto child2 = int64_col{2, 1, 1, 2, 4, 1, 2, 3, 5, 3, 4}; + return structs_col{{child1, child2}}; + }(); + auto const input = cudf::slice(input_original, {0, 7})[0]; + + auto const expected = [] { + auto child1 = data_col{-3, 0, 1, 2, 5}; + auto child2 = int64_col{2, 4, 1, 5, 1}; + return structs_col{{child1, child2}}; + }(); + auto const result = histogram_reduction(input, agg); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } + + // Test with nulls. + using namespace cudf::test::iterators; + auto constexpr null{0}; + { + auto const input = [] { + auto child1 = data_col{{-3, 2, null, 1, 2, null, 0, 5, null, 2, -3, null, -2, 2, 1, null}, + nulls_at({2, 5, 8, 11, 15})}; + auto child2 = int64_col{2, 1, 12, 1, 2, 11, 4, 1, 10, 2, 3, 15, 5, 3, 4, 19}; + return structs_col{{child1, child2}}; + }(); + + auto const expected = [] { + auto child1 = data_col{{null, -3, -2, 0, 1, 2, 5}, null_at(0)}; + auto child2 = int64_col{67, 5, 5, 4, 5, 8, 1}; + return structs_col{{child1, child2}}; + }(); + auto const result = histogram_reduction(input, agg); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } + + // Test with nulls, sliced input. + { + auto const input_original = [] { + auto child1 = data_col{{-3, 2, null, 1, 2, null, 0, 5, null, 2, -3, null, -2, 2, 1, null}, + nulls_at({2, 5, 8, 11, 15})}; + auto child2 = int64_col{2, 1, 12, 1, 2, 11, 4, 1, 10, 2, 3, 15, 5, 3, 4, 19}; + return structs_col{{child1, child2}}; + }(); + auto const input = cudf::slice(input_original, {0, 9})[0]; + + auto const expected = [] { + auto child1 = data_col{{null, -3, 0, 1, 2, 5}, null_at(0)}; + auto child2 = int64_col{33, 2, 4, 1, 3, 1}; + return structs_col{{child1, child2}}; + }(); + auto const result = histogram_reduction(input, agg); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } +} + template struct ReductionAnyAllTest : public ReductionTest {}; using AnyAllTypes = cudf::test::Types; From a97020f9c7e4e2be86788b5f7d83608839d3207b Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Wed, 27 Sep 2023 13:33:48 -0400 Subject: [PATCH 16/29] Correct numerous 20054-D: dynamic initialization errors found on arm+12.2 (#14108) Compile issues found by compiling libcudf with the `rapidsai/devcontainers:23.10-cpp-gcc9-cuda12.2-ubuntu20.04` docker container. 
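The fix is the same pattern in every file touched here: give the device-side structs a constant-evaluable default constructor (`= default`, or a user-written `constexpr ... () noexcept {}` when in-class `{}` initializers are also added) so that declaring them in `__shared__` memory no longer requires dynamic initialization. A minimal before/after sketch of the pattern; `state_s_before`/`state_s_after` are illustrative names, not structs from this diff:

```cpp
#include <cstdint>

// Before: user-provided, non-constexpr constructor body. A __shared__
// variable of this type needs dynamic initialization, which nvcc on
// ctk-12.2 + aarch64 rejects with 20054-D.
struct state_s_before {
  state_s_before() {}
  uint8_t const* base;  // members also left uninitialized
  int32_t error;
};

// After: constexpr constructor plus in-class {} initializers, mirroring
// the changes to unsnap_state_s, page_state_s, ColumnChunkDesc, etc.
struct state_s_after {
  constexpr state_s_after() noexcept {}  // required to compile on ctk-12.2 + aarch64
  uint8_t const* base{};                 // value-initialized to nullptr
  int32_t error{};                       // value-initialized to 0
};

__global__ void example_kernel()
{
  __shared__ state_s_after state;  // no dynamic initialization required
  (void)state;
}
```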
Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Mark Harris (https://github.com/harrism) - David Wendt (https://github.com/davidwendt) - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/14108 --- cpp/src/io/avro/avro_common.hpp | 3 +- cpp/src/io/comp/unsnap.cu | 18 +++--- cpp/src/io/orc/orc_gpu.hpp | 39 +++++------- cpp/src/io/orc/stats_enc.cu | 10 +-- cpp/src/io/orc/stripe_init.cu | 29 ++++----- cpp/src/io/parquet/page_decode.cuh | 67 +++++++++++---------- cpp/src/io/parquet/page_hdr.cu | 12 ++-- cpp/src/io/parquet/parquet_gpu.hpp | 56 ++++++++--------- cpp/src/io/statistics/column_statistics.cuh | 12 ++-- cpp/src/io/statistics/statistics.cuh | 30 ++++----- 10 files changed, 138 insertions(+), 138 deletions(-) diff --git a/cpp/src/io/avro/avro_common.hpp b/cpp/src/io/avro/avro_common.hpp index ff8ee206dd4..0058d236d8c 100644 --- a/cpp/src/io/avro/avro_common.hpp +++ b/cpp/src/io/avro/avro_common.hpp @@ -25,7 +25,8 @@ namespace cudf { namespace io { namespace avro { struct block_desc_s { - block_desc_s() {} + block_desc_s() = default; // required to compile on ctk-12.2 + aarch64 + explicit constexpr block_desc_s( size_t offset_, uint32_t size_, uint32_t row_offset_, uint32_t first_row_, uint32_t num_rows_) : offset(offset_), diff --git a/cpp/src/io/comp/unsnap.cu b/cpp/src/io/comp/unsnap.cu index c699502317f..504a2fe377c 100644 --- a/cpp/src/io/comp/unsnap.cu +++ b/cpp/src/io/comp/unsnap.cu @@ -52,6 +52,8 @@ struct unsnap_batch_s { * @brief Queue structure used to exchange data between warps */ struct unsnap_queue_s { + unsnap_queue_s() = default; // required to compile on ctk-12.2 + aarch64 + uint32_t prefetch_wrpos; ///< Prefetcher write position uint32_t prefetch_rdpos; ///< Prefetch consumer read position int32_t prefetch_end; ///< Prefetch enable flag (nonzero stops prefetcher) @@ -64,13 +66,15 @@ struct unsnap_queue_s { * @brief snappy decompression state */ struct unsnap_state_s { - uint8_t const* base; ///< base ptr of compressed stream - uint8_t const* end; ///< end of compressed stream - uint32_t uncompressed_size; ///< uncompressed stream size - uint32_t bytes_left; ///< remaining bytes to decompress - int32_t error; ///< current error status - uint32_t tstart; ///< start time for perf logging - volatile unsnap_queue_s q; ///< queue for cross-warp communication + constexpr unsnap_state_s() noexcept {} // required to compile on ctk-12.2 + aarch64 + + uint8_t const* base{}; ///< base ptr of compressed stream + uint8_t const* end{}; ///< end of compressed stream + uint32_t uncompressed_size{}; ///< uncompressed stream size + uint32_t bytes_left{}; ///< remaining bytes to decompress + int32_t error{}; ///< current error status + uint32_t tstart{}; ///< start time for perf logging + volatile unsnap_queue_s q{}; ///< queue for cross-warp communication device_span src; ///< input for current block device_span dst; ///< output for current block }; diff --git a/cpp/src/io/orc/orc_gpu.hpp b/cpp/src/io/orc/orc_gpu.hpp index 9b8df50a22a..dba7a9ffda5 100644 --- a/cpp/src/io/orc/orc_gpu.hpp +++ b/cpp/src/io/orc/orc_gpu.hpp @@ -59,31 +59,24 @@ struct CompressedStreamInfo { explicit constexpr CompressedStreamInfo(uint8_t const* compressed_data_, size_t compressed_size_) : compressed_data(compressed_data_), uncompressed_data(nullptr), - compressed_data_size(compressed_size_), - dec_in_ctl(nullptr), - dec_out_ctl(nullptr), - 
copy_in_ctl(nullptr), - copy_out_ctl(nullptr), - num_compressed_blocks(0), - num_uncompressed_blocks(0), - max_uncompressed_size(0), - max_uncompressed_block_size(0) + compressed_data_size(compressed_size_) { } - uint8_t const* compressed_data; // [in] base ptr to compressed stream data - uint8_t* uncompressed_data; // [in] base ptr to uncompressed stream data or NULL if not known yet - size_t compressed_data_size; // [in] compressed data size for this stream - device_span* dec_in_ctl; // [in] input buffer to decompress - device_span* dec_out_ctl; // [in] output buffer to decompress into - device_span dec_res; // [in] results of decompression - device_span* copy_in_ctl; // [out] input buffer to copy - device_span* copy_out_ctl; // [out] output buffer to copy to - uint32_t num_compressed_blocks; // [in,out] number of entries in decctl(in), number of compressed - // blocks(out) - uint32_t num_uncompressed_blocks; // [in,out] number of entries in dec_in_ctl(in), number of - // uncompressed blocks(out) - uint64_t max_uncompressed_size; // [out] maximum uncompressed data size of stream - uint32_t max_uncompressed_block_size; // [out] maximum uncompressed size of any block in stream + uint8_t const* compressed_data{}; // [in] base ptr to compressed stream data + uint8_t* + uncompressed_data{}; // [in] base ptr to uncompressed stream data or NULL if not known yet + size_t compressed_data_size{}; // [in] compressed data size for this stream + device_span* dec_in_ctl{}; // [in] input buffer to decompress + device_span* dec_out_ctl{}; // [in] output buffer to decompress into + device_span dec_res{}; // [in] results of decompression + device_span* copy_in_ctl{}; // [out] input buffer to copy + device_span* copy_out_ctl{}; // [out] output buffer to copy to + uint32_t num_compressed_blocks{}; // [in,out] number of entries in decctl(in), number of + // compressed blocks(out) + uint32_t num_uncompressed_blocks{}; // [in,out] number of entries in dec_in_ctl(in), number of + // uncompressed blocks(out) + uint64_t max_uncompressed_size{}; // [out] maximum uncompressed data size of stream + uint32_t max_uncompressed_block_size{}; // [out] maximum uncompressed size of any block in stream }; enum StreamIndexType { diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu index 69d7ec95acd..95f1db5bfd1 100644 --- a/cpp/src/io/orc/stats_enc.cu +++ b/cpp/src/io/orc/stats_enc.cu @@ -134,11 +134,11 @@ __global__ void __launch_bounds__(block_size, 1) } struct stats_state_s { - uint8_t* base; ///< Output buffer start - uint8_t* end; ///< Output buffer end - statistics_chunk chunk; - statistics_merge_group group; - statistics_dtype stats_dtype; //!< Statistics data type for this column + uint8_t* base{}; ///< Output buffer start + uint8_t* end{}; ///< Output buffer end + statistics_chunk chunk{}; + statistics_merge_group group{}; + statistics_dtype stats_dtype{}; //!< Statistics data type for this column }; /* diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu index d8a60350356..8eeca504121 100644 --- a/cpp/src/io/orc/stripe_init.cu +++ b/cpp/src/io/orc/stripe_init.cu @@ -30,14 +30,14 @@ namespace orc { namespace gpu { struct comp_in_out { - uint8_t const* in_ptr; - size_t in_size; - uint8_t* out_ptr; - size_t out_size; + uint8_t const* in_ptr{}; + size_t in_size{}; + uint8_t* out_ptr{}; + size_t out_size{}; }; struct compressed_stream_s { - CompressedStreamInfo info; - comp_in_out ctl; + CompressedStreamInfo info{}; + comp_in_out ctl{}; }; // blockDim {128,1,1} @@ -208,14 +208,15 
@@ __global__ void __launch_bounds__(128, 8) * @brief Shared mem state for gpuParseRowGroupIndex */ struct rowindex_state_s { - ColumnDesc chunk; - uint32_t rowgroup_start; - uint32_t rowgroup_end; - int is_compressed; - uint32_t row_index_entry[3][CI_PRESENT]; // NOTE: Assumes CI_PRESENT follows CI_DATA and CI_DATA2 - CompressedStreamInfo strm_info[2]; - RowGroup rowgroups[128]; - uint32_t compressed_offset[128][2]; + ColumnDesc chunk{}; + uint32_t rowgroup_start{}; + uint32_t rowgroup_end{}; + int is_compressed{}; + uint32_t row_index_entry[3] + [CI_PRESENT]{}; // NOTE: Assumes CI_PRESENT follows CI_DATA and CI_DATA2 + CompressedStreamInfo strm_info[2]{}; + RowGroup rowgroups[128]{}; + uint32_t compressed_offset[128][2]{}; }; enum row_entry_state_e { diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index 26e3c951b2e..5e66885d746 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -26,48 +26,49 @@ namespace cudf::io::parquet::gpu { struct page_state_s { - uint8_t const* data_start; - uint8_t const* data_end; - uint8_t const* lvl_end; - uint8_t const* dict_base; // ptr to dictionary page data - int32_t dict_size; // size of dictionary data - int32_t first_row; // First row in page to output - int32_t num_rows; // Rows in page to decode (including rows to be skipped) - int32_t first_output_value; // First value in page to output - int32_t num_input_values; // total # of input/level values in the page - int32_t dtype_len; // Output data type length - int32_t dtype_len_in; // Can be larger than dtype_len if truncating 32-bit into 8-bit - int32_t dict_bits; // # of bits to store dictionary indices - uint32_t dict_run; - int32_t dict_val; - uint32_t initial_rle_run[NUM_LEVEL_TYPES]; // [def,rep] - int32_t initial_rle_value[NUM_LEVEL_TYPES]; // [def,rep] - int32_t error; - PageInfo page; - ColumnChunkDesc col; + constexpr page_state_s() noexcept {} + uint8_t const* data_start{}; + uint8_t const* data_end{}; + uint8_t const* lvl_end{}; + uint8_t const* dict_base{}; // ptr to dictionary page data + int32_t dict_size{}; // size of dictionary data + int32_t first_row{}; // First row in page to output + int32_t num_rows{}; // Rows in page to decode (including rows to be skipped) + int32_t first_output_value{}; // First value in page to output + int32_t num_input_values{}; // total # of input/level values in the page + int32_t dtype_len{}; // Output data type length + int32_t dtype_len_in{}; // Can be larger than dtype_len if truncating 32-bit into 8-bit + int32_t dict_bits{}; // # of bits to store dictionary indices + uint32_t dict_run{}; + int32_t dict_val{}; + uint32_t initial_rle_run[NUM_LEVEL_TYPES]{}; // [def,rep] + int32_t initial_rle_value[NUM_LEVEL_TYPES]{}; // [def,rep] + int32_t error{}; + PageInfo page{}; + ColumnChunkDesc col{}; // (leaf) value decoding - int32_t nz_count; // number of valid entries in nz_idx (write position in circular buffer) - int32_t dict_pos; // write position of dictionary indices - int32_t src_pos; // input read position of final output value - int32_t ts_scale; // timestamp scale: <0: divide by -ts_scale, >0: multiply by ts_scale + int32_t nz_count{}; // number of valid entries in nz_idx (write position in circular buffer) + int32_t dict_pos{}; // write position of dictionary indices + int32_t src_pos{}; // input read position of final output value + int32_t ts_scale{}; // timestamp scale: <0: divide by -ts_scale, >0: multiply by ts_scale // repetition/definition level decoding - int32_t 
input_value_count; // how many values of the input we've processed - int32_t input_row_count; // how many rows of the input we've processed - int32_t input_leaf_count; // how many leaf values of the input we've processed - uint8_t const* lvl_start[NUM_LEVEL_TYPES]; // [def,rep] - uint8_t const* abs_lvl_start[NUM_LEVEL_TYPES]; // [def,rep] - uint8_t const* abs_lvl_end[NUM_LEVEL_TYPES]; // [def,rep] - int32_t lvl_count[NUM_LEVEL_TYPES]; // how many of each of the streams we've decoded - int32_t row_index_lower_bound; // lower bound of row indices we should process + int32_t input_value_count{}; // how many values of the input we've processed + int32_t input_row_count{}; // how many rows of the input we've processed + int32_t input_leaf_count{}; // how many leaf values of the input we've processed + uint8_t const* lvl_start[NUM_LEVEL_TYPES]{}; // [def,rep] + uint8_t const* abs_lvl_start[NUM_LEVEL_TYPES]{}; // [def,rep] + uint8_t const* abs_lvl_end[NUM_LEVEL_TYPES]{}; // [def,rep] + int32_t lvl_count[NUM_LEVEL_TYPES]{}; // how many of each of the streams we've decoded + int32_t row_index_lower_bound{}; // lower bound of row indices we should process // a shared-memory cache of frequently used data when decoding. The source of this data is // normally stored in global memory which can yield poor performance. So, when possible // we copy that info here prior to decoding - PageNestingDecodeInfo nesting_decode_cache[max_cacheable_nesting_decode_info]; + PageNestingDecodeInfo nesting_decode_cache[max_cacheable_nesting_decode_info]{}; // points to either nesting_decode_cache above when possible, or to the global source otherwise - PageNestingDecodeInfo* nesting_info; + PageNestingDecodeInfo* nesting_info{}; }; // buffers only used in the decode kernel. separated from page_state_s to keep diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index 0d611643b46..6f8b2f50443 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -45,13 +45,13 @@ static const __device__ __constant__ uint8_t g_list2struct[16] = {0, ST_FLD_LIST}; struct byte_stream_s { - uint8_t const* cur; - uint8_t const* end; - uint8_t const* base; + uint8_t const* cur{}; + uint8_t const* end{}; + uint8_t const* base{}; // Parsed symbols - PageType page_type; - PageInfo page; - ColumnChunkDesc ck; + PageType page_type{}; + PageInfo page{}; + ColumnChunkDesc ck{}; }; /** diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index a3cc37dee4f..a760c2448dc 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -228,7 +228,7 @@ struct PageInfo { * @brief Struct describing a particular chunk of column data */ struct ColumnChunkDesc { - ColumnChunkDesc() = default; + constexpr ColumnChunkDesc() noexcept {}; explicit ColumnChunkDesc(size_t compressed_size_, uint8_t* compressed_data_, size_t num_values_, @@ -275,34 +275,34 @@ struct ColumnChunkDesc { { } - uint8_t const* compressed_data; // pointer to compressed column chunk data - size_t compressed_size; // total compressed data size for this chunk - size_t num_values; // total number of values in this column - size_t start_row; // starting row of this chunk - uint32_t num_rows; // number of rows in this chunk - int16_t max_level[level_type::NUM_LEVEL_TYPES]; // max definition/repetition level - int16_t max_nesting_depth; // max nesting depth of the output - uint16_t data_type; // basic column data type, ((type_length << 3) | - // parquet::Type) + uint8_t const* 
compressed_data{}; // pointer to compressed column chunk data + size_t compressed_size{}; // total compressed data size for this chunk + size_t num_values{}; // total number of values in this column + size_t start_row{}; // starting row of this chunk + uint32_t num_rows{}; // number of rows in this chunk + int16_t max_level[level_type::NUM_LEVEL_TYPES]{}; // max definition/repetition level + int16_t max_nesting_depth{}; // max nesting depth of the output + uint16_t data_type{}; // basic column data type, ((type_length << 3) | + // parquet::Type) uint8_t - level_bits[level_type::NUM_LEVEL_TYPES]; // bits to encode max definition/repetition levels - int32_t num_data_pages; // number of data pages - int32_t num_dict_pages; // number of dictionary pages - int32_t max_num_pages; // size of page_info array - PageInfo* page_info; // output page info for up to num_dict_pages + - // num_data_pages (dictionary pages first) - string_index_pair* str_dict_index; // index for string dictionary - bitmask_type** valid_map_base; // base pointers of valid bit map for this column - void** column_data_base; // base pointers of column data - void** column_string_base; // base pointers of column string data - int8_t codec; // compressed codec enum - int8_t converted_type; // converted type enum - LogicalType logical_type; // logical type - int8_t decimal_precision; // Decimal precision - int32_t ts_clock_rate; // output timestamp clock frequency (0=default, 1000=ms, 1000000000=ns) - - int32_t src_col_index; // my input column index - int32_t src_col_schema; // my schema index in the file + level_bits[level_type::NUM_LEVEL_TYPES]{}; // bits to encode max definition/repetition levels + int32_t num_data_pages{}; // number of data pages + int32_t num_dict_pages{}; // number of dictionary pages + int32_t max_num_pages{}; // size of page_info array + PageInfo* page_info{}; // output page info for up to num_dict_pages + + // num_data_pages (dictionary pages first) + string_index_pair* str_dict_index{}; // index for string dictionary + bitmask_type** valid_map_base{}; // base pointers of valid bit map for this column + void** column_data_base{}; // base pointers of column data + void** column_string_base{}; // base pointers of column string data + int8_t codec{}; // compressed codec enum + int8_t converted_type{}; // converted type enum + LogicalType logical_type{}; // logical type + int8_t decimal_precision{}; // Decimal precision + int32_t ts_clock_rate{}; // output timestamp clock frequency (0=default, 1000=ms, 1000000000=ns) + + int32_t src_col_index{}; // my input column index + int32_t src_col_schema{}; // my schema index in the file }; /** diff --git a/cpp/src/io/statistics/column_statistics.cuh b/cpp/src/io/statistics/column_statistics.cuh index 28e77f62a43..f71fb95949f 100644 --- a/cpp/src/io/statistics/column_statistics.cuh +++ b/cpp/src/io/statistics/column_statistics.cuh @@ -34,18 +34,18 @@ namespace io { * @brief shared state for statistics calculation kernel */ struct stats_state_s { - stats_column_desc col; ///< Column information - statistics_group group; ///< Group description - statistics_chunk ck; ///< Output statistics chunk + stats_column_desc col{}; ///< Column information + statistics_group group{}; ///< Group description + statistics_chunk ck{}; ///< Output statistics chunk }; /** * @brief shared state for statistics merge kernel */ struct merge_state_s { - stats_column_desc col; ///< Column information - statistics_merge_group group; ///< Group description - statistics_chunk ck; ///< Resulting 
statistics chunk + stats_column_desc col{}; ///< Column information + statistics_merge_group group{}; ///< Group description + statistics_chunk ck{}; ///< Resulting statistics chunk }; template diff --git a/cpp/src/io/statistics/statistics.cuh b/cpp/src/io/statistics/statistics.cuh index 805ca43553e..b6e698fee11 100644 --- a/cpp/src/io/statistics/statistics.cuh +++ b/cpp/src/io/statistics/statistics.cuh @@ -98,27 +98,27 @@ union statistics_val { }; struct statistics_chunk { - uint32_t non_nulls; //!< number of non-null values in chunk - uint32_t null_count; //!< number of null values in chunk - statistics_val min_value; //!< minimum value in chunk - statistics_val max_value; //!< maximum value in chunk - statistics_val sum; //!< sum of chunk - uint8_t has_minmax; //!< Nonzero if min_value and max_values are valid - uint8_t has_sum; //!< Nonzero if sum is valid + uint32_t non_nulls{}; //!< number of non-null values in chunk + uint32_t null_count{}; //!< number of null values in chunk + statistics_val min_value{}; //!< minimum value in chunk + statistics_val max_value{}; //!< maximum value in chunk + statistics_val sum{}; //!< sum of chunk + uint8_t has_minmax{}; //!< Nonzero if min_value and max_values are valid + uint8_t has_sum{}; //!< Nonzero if sum is valid }; struct statistics_group { - stats_column_desc const* col; //!< Column information - uint32_t start_row; //!< Start row of this group - uint32_t num_rows; //!< Number of rows in group - uint32_t non_leaf_nulls; //!< Number of null non-leaf values in the group + stats_column_desc const* col{}; //!< Column information + uint32_t start_row{}; //!< Start row of this group + uint32_t num_rows{}; //!< Number of rows in group + uint32_t non_leaf_nulls{}; //!< Number of null non-leaf values in the group }; struct statistics_merge_group { - data_type col_dtype; //!< Column data type - statistics_dtype stats_dtype; //!< Statistics data type for this column - uint32_t start_chunk; //!< Start chunk of this group - uint32_t num_chunks; //!< Number of chunks in group + data_type col_dtype; //!< Column data type + statistics_dtype stats_dtype{dtype_none}; //!< Statistics data type for this column + uint32_t start_chunk{}; //!< Start chunk of this group + uint32_t num_chunks{}; //!< Number of chunks in group }; template >* = nullptr> From bff0fcd721320210c53d3533e63fb34eac883f4e Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Wed, 27 Sep 2023 11:25:25 -0700 Subject: [PATCH 17/29] [Java] Add JNI bindings for `integers_to_hex` (#14205) This PR adds a method to ColumnView class to allow for conversion from Integers to hex closes #14081 Authors: - Raza Jafri (https://github.com/razajafri) Approvers: - Kuhu Shukla (https://github.com/kuhushukla) - Robert (Bobby) Evans (https://github.com/revans2) URL: https://github.com/rapidsai/cudf/pull/14205 --- .../main/java/ai/rapids/cudf/ColumnView.java | 27 +++++++++++++++++++ java/src/main/java/ai/rapids/cudf/DType.java | 19 +++++++++++++ java/src/main/native/src/ColumnViewJni.cpp | 9 +++++++ .../java/ai/rapids/cudf/ColumnVectorTest.java | 10 +++++++ 4 files changed, 65 insertions(+) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 3f3a55f0970..0b66701629b 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -4089,6 +4089,8 @@ static DeviceMemoryBufferView getOffsetsBuffer(long viewHandle) { private static native long isFixedPoint(long viewHandle, int nativeTypeId, int 
scale); + private static native long toHex(long viewHandle); + /** * Native method to concatenate a list column of strings (each row is a list of strings), * concatenates the strings within each row and returns a single strings column result. @@ -5231,4 +5233,29 @@ static ColumnView[] getColumnViewsFromPointers(long[] nativeHandles) { } } } + + /** + * Convert this integer column to hexadecimal column and return a new strings column + * + * Any null entries will result in corresponding null entries in the output column. + * + * The output character set is '0'-'9' and 'A'-'F'. The output string width will + * be a multiple of 2 depending on the size of the integer type. A single leading + * zero is applied to the first non-zero output byte if it is less than 0x10. + * + * Example: + * input = [123, -1, 0, 27, 342718233] + * s = input.toHex() + * s is [ '04D2', 'FFFFFFFF', '00', '1B', '146D7719'] + * + * The example above shows an `INT32` type column where each integer is 4 bytes. + * Leading zeros are suppressed unless filling out a complete byte as in + * `123 -> '04D2'` instead of `000004D2` or `4D2`. + * + * @return new string ColumnVector + */ + public ColumnVector toHex() { + assert getType().isIntegral() : "Only integers are supported"; + return new ColumnVector(toHex(this.getNativeView())); + } } diff --git a/java/src/main/java/ai/rapids/cudf/DType.java b/java/src/main/java/ai/rapids/cudf/DType.java index d0bb7761da4..07bc4fe3bbf 100644 --- a/java/src/main/java/ai/rapids/cudf/DType.java +++ b/java/src/main/java/ai/rapids/cudf/DType.java @@ -413,6 +413,14 @@ public boolean isDurationType() { } /** + * Returns true for strictly Integer types not a type backed by + * ints + */ + public boolean isIntegral() { + return INTEGRALS.contains(this.typeId); + } + + /** * Returns true for nested types */ public boolean isNestedType() { @@ -506,4 +514,15 @@ public boolean hasOffsets() { DTypeEnum.STRING, DTypeEnum.LIST ); + + private static final EnumSet INTEGRALS = EnumSet.of( + DTypeEnum.INT8, + DTypeEnum.INT16, + DTypeEnum.INT32, + DTypeEnum.INT64, + DTypeEnum.UINT8, + DTypeEnum.UINT16, + DTypeEnum.UINT32, + DTypeEnum.UINT64 + ); } diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index d5aad03645f..0ddaa2c15b5 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -2563,4 +2563,13 @@ Java_ai_rapids_cudf_ColumnView_purgeNonEmptyNulls(JNIEnv *env, jclass, jlong col CATCH_STD(env, 0); } +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_toHex(JNIEnv *env, jclass, jlong input_ptr) { + JNI_NULL_CHECK(env, input_ptr, "input is null", 0); + try { + cudf::jni::auto_set_device(env); + const cudf::column_view *input = reinterpret_cast(input_ptr); + return release_as_jlong(cudf::strings::integers_to_hex(*input)); + } + CATCH_STD(env, 0); +} } // extern "C" diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index f6dffc88b92..9a0f8bda994 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -6876,4 +6876,14 @@ public void testUseAfterFree() { vector.close(); assertThrows(NullPointerException.class, vector::getDeviceMemorySize); } + + @Test + public void testConvertIntegerToHex() { + try ( + ColumnVector input = ColumnVector.fromInts(14, 2621, 50); + ColumnVector expected = ColumnVector.fromStrings("0E", "0A3D", "32"); + ColumnVector actual = 
input.toHex()) { + assertColumnsAreEqual(expected, actual); + } + } } From 66ac962dbeb69eade22b3bcaf186e3df2bae71b5 Mon Sep 17 00:00:00 2001 From: Nghia Truong <7416935+ttnghia@users.noreply.github.com> Date: Wed, 27 Sep 2023 12:20:20 -0700 Subject: [PATCH 18/29] JNI for `HISTOGRAM` and `MERGE_HISTOGRAM` aggregations (#14154) This implements JNI for `HISTOGRAM` and `MERGE_HISTOGRAM` aggregations in both groupby and reduction. Depends on: * https://github.com/rapidsai/cudf/pull/14045 Contributes to: * https://github.com/rapidsai/cudf/issues/13885. Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/14154 --- .../main/java/ai/rapids/cudf/Aggregation.java | 26 ++++- .../ai/rapids/cudf/GroupByAggregation.java | 24 +++- .../ai/rapids/cudf/ReductionAggregation.java | 20 +++- java/src/main/native/src/AggregationJni.cpp | 7 +- .../test/java/ai/rapids/cudf/TableTest.java | 109 ++++++++++++++++++ 5 files changed, 181 insertions(+), 5 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Aggregation.java b/java/src/main/java/ai/rapids/cudf/Aggregation.java index d10329ca0f2..379750bb0b7 100644 --- a/java/src/main/java/ai/rapids/cudf/Aggregation.java +++ b/java/src/main/java/ai/rapids/cudf/Aggregation.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -68,7 +68,9 @@ enum Kind { DENSE_RANK(29), PERCENT_RANK(30), TDIGEST(31), // This can take a delta argument for accuracy level - MERGE_TDIGEST(32); // This can take a delta argument for accuracy level + MERGE_TDIGEST(32), // This can take a delta argument for accuracy level + HISTOGRAM(33), + MERGE_HISTOGRAM(34); final int nativeId; @@ -918,6 +920,26 @@ static TDigestAggregation mergeTDigest(int delta) { return new TDigestAggregation(Kind.MERGE_TDIGEST, delta); } + static final class HistogramAggregation extends NoParamAggregation { + private HistogramAggregation() { + super(Kind.HISTOGRAM); + } + } + + static final class MergeHistogramAggregation extends NoParamAggregation { + private MergeHistogramAggregation() { + super(Kind.MERGE_HISTOGRAM); + } + } + + static HistogramAggregation histogram() { + return new HistogramAggregation(); + } + + static MergeHistogramAggregation mergeHistogram() { + return new MergeHistogramAggregation(); + } + /** * Create one of the aggregations that only needs a kind, no other parameters. This does not * work for all types and for code safety reasons each kind is added separately. diff --git a/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java b/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java index 500d18f7eae..0fae33927b6 100644 --- a/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java +++ b/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -315,4 +315,26 @@ public static GroupByAggregation createTDigest(int delta) { public static GroupByAggregation mergeTDigest(int delta) { return new GroupByAggregation(Aggregation.mergeTDigest(delta)); } + + /** + * Histogram aggregation, computing the frequencies for each unique row. 
+ * + * A histogram is given as a lists column, in which the first child stores unique rows from + * the input values and the second child stores their corresponding frequencies. + * + * @return A lists of structs column in which each list contains a histogram corresponding to + * an input key. + */ + public static GroupByAggregation histogram() { + return new GroupByAggregation(Aggregation.histogram()); + } + + /** + * MergeHistogram aggregation, to merge multiple histograms. + * + * @return A new histogram in which the frequencies of the unique rows are sum up. + */ + public static GroupByAggregation mergeHistogram() { + return new GroupByAggregation(Aggregation.mergeHistogram()); + } } diff --git a/java/src/main/java/ai/rapids/cudf/ReductionAggregation.java b/java/src/main/java/ai/rapids/cudf/ReductionAggregation.java index eab1c94fd2c..ba8ae379bae 100644 --- a/java/src/main/java/ai/rapids/cudf/ReductionAggregation.java +++ b/java/src/main/java/ai/rapids/cudf/ReductionAggregation.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -286,4 +286,22 @@ public static ReductionAggregation mergeSets(NullEquality nullEquality, NaNEqual return new ReductionAggregation(Aggregation.mergeSets(nullEquality, nanEquality)); } + /** + * Create HistogramAggregation, computing the frequencies for each unique row. + * + * @return A structs column in which the first child stores unique rows from the input and the + * second child stores their corresponding frequencies. + */ + public static ReductionAggregation histogram() { + return new ReductionAggregation(Aggregation.histogram()); + } + + /** + * Create MergeHistogramAggregation, to merge multiple histograms. + * + * @return A new histogram in which the frequencies of the unique rows are sum up. + */ + public static ReductionAggregation mergeHistogram() { + return new ReductionAggregation(Aggregation.mergeHistogram()); + } } diff --git a/java/src/main/native/src/AggregationJni.cpp b/java/src/main/native/src/AggregationJni.cpp index 6ac73282615..bc62e95c36a 100644 --- a/java/src/main/native/src/AggregationJni.cpp +++ b/java/src/main/native/src/AggregationJni.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -90,6 +90,11 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation_createNoParamAgg(JNIEnv case 30: // ANSI SQL PERCENT_RANK return cudf::make_rank_aggregation(cudf::rank_method::MIN, {}, cudf::null_policy::INCLUDE, {}, cudf::rank_percentage::ONE_NORMALIZED); + case 33: // HISTOGRAM + return cudf::make_histogram_aggregation(); + case 34: // MERGE_HISTOGRAM + return cudf::make_merge_histogram_aggregation(); + default: throw std::logic_error("Unsupported No Parameter Aggregation Operation"); } }(); diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 59f0d180c6e..faa73ac4322 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -4129,6 +4129,115 @@ void testMergeTDigestReduction() { } } + @Test + void testGroupbyHistogram() { + StructType histogramStruct = new StructType(false, + new BasicType(false, DType.INT32), // values + new BasicType(false, DType.INT64)); // frequencies + ListType histogramList = new ListType(false, histogramStruct); + + // key = 0: values = [2, 2, -3, -2, 2] + // key = 1: values = [2, 0, 5, 2, 1] + // key = 2: values = [-3, 1, 1, 2, 2] + try (Table input = new Table.TestBuilder() + .column(2, 0, 2, 1, 1, 1, 0, 0, 0, 1, 2, 2, 1, 0, 2) + .column(-3, 2, 1, 2, 0, 5, 2, -3, -2, 2, 1, 2, 1, 2, 2) + .build(); + Table result = input.groupBy(0) + .aggregate(GroupByAggregation.histogram().onColumn(1)); + Table sortedResult = result.orderBy(OrderByArg.asc(0)); + ColumnVector sortedOutHistograms = sortedResult.getColumn(1).listSortRows(false, false); + + ColumnVector expectedKeys = ColumnVector.fromInts(0, 1, 2); + ColumnVector expectedHistograms = ColumnVector.fromLists(histogramList, + Arrays.asList(new StructData(-3, 1L), new StructData(-2, 1L), new StructData(2, 3L)), + Arrays.asList(new StructData(0, 1L), new StructData(1, 1L), new StructData(2, 2L), + new StructData(5, 1L)), + Arrays.asList(new StructData(-3, 1L), new StructData(1, 2L), new StructData(2, 2L))) + ) { + assertColumnsAreEqual(expectedKeys, sortedResult.getColumn(0)); + assertColumnsAreEqual(expectedHistograms, sortedOutHistograms); + } + } + + @Test + void testGroupbyMergeHistogram() { + StructType histogramStruct = new StructType(false, + new BasicType(false, DType.INT32), // values + new BasicType(false, DType.INT64)); // frequencies + ListType histogramList = new ListType(false, histogramStruct); + + // key = 0: histograms = [[<-3, 1>, <-2, 1>, <2, 3>], [<0, 1>, <1, 1>], [<-3, 3>, <0, 1>, <1, 2>]] + // key = 1: histograms = [[<-2, 1>, <1, 3>, <2, 2>], [<0, 2>, <1, 1>, <2, 2>]] + try (Table input = new Table.TestBuilder() + .column(0, 1, 0, 1, 0) + .column(histogramStruct, + new StructData[]{new StructData(-3, 1L), new StructData(-2, 1L), new StructData(2, 3L)}, + new StructData[]{new StructData(-2, 1L), new StructData(1, 3L), new StructData(2, 2L)}, + new StructData[]{new StructData(0, 1L), new StructData(1, 1L)}, + new StructData[]{new StructData(0, 2L), new StructData(1, 1L), new StructData(2, 2L)}, + new StructData[]{new StructData(-3, 3L), new StructData(0, 1L), new StructData(1, 2L)}) + .build(); + Table result = input.groupBy(0) + .aggregate(GroupByAggregation.mergeHistogram().onColumn(1)); + Table sortedResult = result.orderBy(OrderByArg.asc(0)); + ColumnVector sortedOutHistograms = sortedResult.getColumn(1).listSortRows(false, false); + + ColumnVector expectedKeys = ColumnVector.fromInts(0, 1); + ColumnVector expectedHistograms = 
ColumnVector.fromLists(histogramList,
+ Arrays.asList(new StructData(-3, 4L), new StructData(-2, 1L), new StructData(0, 2L),
+ new StructData(1, 3L), new StructData(2, 3L)),
+ Arrays.asList(new StructData(-2, 1L), new StructData(0, 2L), new StructData(1, 4L),
+ new StructData(2, 4L)))
+ ) {
+ assertColumnsAreEqual(expectedKeys, sortedResult.getColumn(0));
+ assertColumnsAreEqual(expectedHistograms, sortedOutHistograms);
+ }
+ }
+
+ @Test
+ void testReductionHistogram() {
+ StructType histogramStruct = new StructType(false,
+ new BasicType(false, DType.INT32), // values
+ new BasicType(false, DType.INT64)); // frequencies
+
+ try (ColumnVector input = ColumnVector.fromInts(-3, 2, 1, 2, 0, 5, 2, -3, -2, 2, 1);
+ Scalar result = input.reduce(ReductionAggregation.histogram(), DType.LIST);
+ ColumnVector resultCV = result.getListAsColumnView().copyToColumnVector();
+ Table resultTable = new Table(resultCV);
+ Table sortedResult = resultTable.orderBy(OrderByArg.asc(0));
+
+ ColumnVector expectedHistograms = ColumnVector.fromStructs(histogramStruct,
+ new StructData(-3, 2L), new StructData(-2, 1L), new StructData(0, 1L),
+ new StructData(1, 2L), new StructData(2, 4L), new StructData(5, 1L))
+ ) {
+ assertColumnsAreEqual(expectedHistograms, sortedResult.getColumn(0));
+ }
+ }
+
+ @Test
+ void testReductionMergeHistogram() {
+ StructType histogramStruct = new StructType(false,
+ new BasicType(false, DType.INT32), // values
+ new BasicType(false, DType.INT64)); // frequencies
+
+ try (ColumnVector input = ColumnVector.fromStructs(histogramStruct,
+ new StructData(-3, 2L), new StructData(2, 1L), new StructData(1, 1L),
+ new StructData(2, 2L), new StructData(0, 4L), new StructData(5, 1L),
+ new StructData(2, 2L), new StructData(-3, 3L), new StructData(-2, 5L),
+ new StructData(2, 3L), new StructData(1, 4L));
+ Scalar result = input.reduce(ReductionAggregation.mergeHistogram(), DType.LIST);
+ ColumnVector resultCV = result.getListAsColumnView().copyToColumnVector();
+ Table resultTable = new Table(resultCV);
+ Table sortedResult = resultTable.orderBy(OrderByArg.asc(0));
+
+ ColumnVector expectedHistograms = ColumnVector.fromStructs(histogramStruct,
+ new StructData(-3, 5L), new StructData(-2, 5L), new StructData(0, 4L),
+ new StructData(1, 5L), new StructData(2, 8L), new StructData(5, 1L))
+ ) {
+ assertColumnsAreEqual(expectedHistograms, sortedResult.getColumn(0));
+ }
+ }
 }

 @Test
 void testGroupByMinMaxDecimal() {
 try (Table t1 = new Table.TestBuilder()
From b789d4ce3c090a3f25a8657d9a8582a1edb54f12 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR
Date: Wed, 27 Sep 2023 12:20:46 -0700
Subject: [PATCH 19/29] Preserve name of the column while initializing a `DataFrame` (#14110)

Fixes: #14088

This PR preserves the `names` of the `column` object while constructing a `DataFrame` through its various constructor flows.
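A small usage sketch of the fixed behavior, distilled from the new tests in this diff (values are illustrative): a named `pd.Index` passed as `columns` now keeps its name, matching pandas.

```python
import pandas as pd
import cudf

cols = pd.Index(["a", "b"], name="rapids")

gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, columns=cols)
pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, columns=cols)

# Previously the name was dropped on the cudf side; now both agree.
assert gdf.columns.name == pdf.columns.name == "rapids"
```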
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/14110 --- python/cudf/cudf/core/column_accessor.py | 2 -- python/cudf/cudf/core/dataframe.py | 26 ++++++++++++++++++--- python/cudf/cudf/core/indexed_frame.py | 4 +++- python/cudf/cudf/tests/test_dataframe.py | 29 ++++++++++++++++++++---- 4 files changed, 51 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index bec9c367ba9..cb79a30422e 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -197,8 +197,6 @@ def nlevels(self) -> int: @property def name(self) -> Any: - if len(self._data) == 0: - return None return self.level_names[-1] @property diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 8a3dbe77787..ead2f182e2d 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -665,7 +665,10 @@ def __init__( len(self), dtype="object", masked=True ) for k in columns - } + }, + level_names=tuple(columns.names) + if isinstance(columns, pd.Index) + else None, ) elif isinstance(data, ColumnAccessor): raise TypeError( @@ -712,6 +715,11 @@ def __init__( self._data = new_df._data self._index = new_df._index + self._data._level_names = ( + tuple(columns.names) + if isinstance(columns, pd.Index) + else self._data._level_names + ) elif len(data) > 0 and isinstance(data[0], Series): self._init_from_series_list( data=data, columns=columns, index=index @@ -834,6 +842,11 @@ def _init_from_series_list(self, data, columns, index): self._data[col_name] = column.column_empty( row_count=len(self), dtype=None, masked=True ) + self._data._level_names = ( + tuple(columns.names) + if isinstance(columns, pd.Index) + else self._data._level_names + ) self._data = self._data.select_by_label(columns) @_cudf_nvtx_annotate @@ -957,6 +970,11 @@ def _init_from_dict_like( data[col_name], nan_as_null=nan_as_null, ) + self._data._level_names = ( + tuple(columns.names) + if isinstance(columns, pd.Index) + else self._data._level_names + ) @classmethod def _from_data( @@ -5131,7 +5149,7 @@ def from_pandas(cls, dataframe, nan_as_null=None): index = cudf.from_pandas(dataframe.index, nan_as_null=nan_as_null) df = cls._from_data(data, index) - df._data._level_names = list(dataframe.columns.names) + df._data._level_names = tuple(dataframe.columns.names) # Set columns only if it is a MultiIndex if isinstance(dataframe.columns, pd.MultiIndex): @@ -5377,6 +5395,8 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False): df = df.set_index(index) else: df._index = as_index(index) + if isinstance(columns, pd.Index): + df._data._level_names = tuple(columns.names) return df @classmethod @@ -5434,7 +5454,7 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): data, nan_as_null=nan_as_null ) if isinstance(columns, pd.Index): - df._data._level_names = list(columns.names) + df._data._level_names = tuple(columns.names) if index is None: df._index = RangeIndex(start=0, stop=len(data)) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index aacf1fa8dae..1008cbdb67f 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -2661,7 +2661,9 @@ def _reindex( data=cudf.core.column_accessor.ColumnAccessor( cols, 
multiindex=self._data.multiindex,
- level_names=self._data.level_names,
+ level_names=tuple(column_names.names)
+ if isinstance(column_names, pd.Index)
+ else None,
 ),
 index=index,
 )
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 67b63028fab..c297748f7e5 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -6394,6 +6394,7 @@ def test_df_series_dataframe_astype_dtype_dict(copy):
 ([range(100), range(100)], ["range" + str(i) for i in range(100)]),
 (((1, 2, 3), (1, 2, 3)), ["tuple0", "tuple1", "tuple2"]),
 ([[1, 2, 3]], ["list col1", "list col2", "list col3"]),
+ ([[1, 2, 3]], pd.Index(["col1", "col2", "col3"], name="rapids")),
 ([range(100)], ["range" + str(i) for i in range(100)]),
 (((1, 2, 3),), ["k1", "k2", "k3"]),
 ],
@@ -7969,6 +7970,7 @@ def test_series_empty(ps):
 @pytest.mark.parametrize(
 "data",
 [
+ None,
 [],
 [1],
 {"a": [10, 11, 12]},
 {
 "a": [10, 11, 12],
 "b": [10, 11, 12],
 "c": None,
 "d": cudf.NA,
 },
 ],
 )
-@pytest.mark.parametrize("columns", [["a"], ["another column name"], None])
+@pytest.mark.parametrize(
+ "columns",
+ [["a"], ["another column name"], None, pd.Index(["a"], name="index name")],
+)
 def test_dataframe_init_with_columns(data, columns):
 pdf = pd.DataFrame(data, columns=columns)
 gdf = cudf.DataFrame(data, columns=columns)
@@ -8047,7 +8052,16 @@ def test_dataframe_init_with_columns(data, columns):
 ],
 )
 @pytest.mark.parametrize(
- "columns", [None, ["0"], [0], ["abc"], [144, 13], [2, 1, 0]]
+ "columns",
+ [
+ None,
+ ["0"],
+ [0],
+ ["abc"],
+ [144, 13],
+ [2, 1, 0],
+ pd.Index(["abc"], name="custom_name"),
+ ],
 )
 def test_dataframe_init_from_series_list(data, ignore_dtype, columns):
 gd_data = [cudf.from_pandas(obj) for obj in data]
@@ -10239,14 +10253,21 @@ def test_dataframe_binop_with_datetime_index():
 @pytest.mark.parametrize(
- "columns", ([], ["c", "a"], ["a", "d", "b", "e", "c"], ["a", "b", "c"])
+ "columns",
+ (
+ [],
+ ["c", "a"],
+ ["a", "d", "b", "e", "c"],
+ ["a", "b", "c"],
+ pd.Index(["b", "a", "c"], name="custom_name"),
+ ),
 )
 @pytest.mark.parametrize("index", (None, [4, 5, 6]))
 def test_dataframe_dict_like_with_columns(columns, index):
 data = {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}
 expect = pd.DataFrame(data, columns=columns, index=index)
 actual = cudf.DataFrame(data, columns=columns, index=index)
- if index is None and columns == []:
+ if index is None and len(columns) == 0:
 # We make an empty range index, pandas makes an empty index
 expect = expect.reset_index(drop=True)
 assert_eq(expect, actual)
From 2c19bf328ffefb97d17e5ae600197a4ea9ca4445 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic
Date: Wed, 27 Sep 2023 20:37:04 -0700
Subject: [PATCH 20/29] Propagate errors from Parquet reader kernels back to host (#14167)

Pass the error code to the host when a kernel detects invalid input. If multiple error types are detected, they are combined using a bitwise OR so that the caller gets the aggregate error code that includes all types of errors that occurred. Does not change the kernel-side checks.
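Condensed, the mechanism is: each error class gets its own bit, each block that detects a problem ORs its bits into a single device-side `int32_t`, and the host inspects that aggregate after the kernels complete. A simplified sketch of the pattern; the kernel body and bit values are illustrative, not the exact reader code (in the real kernels the error state lives in shared memory):

```cpp
#include <cuda/atomic>
#include <cstdint>

// One bit per error class, so independent failures survive the OR-merge.
enum class decode_error : int32_t {
  DATA_STREAM_OVERRUN  = 0x1,
  UNSUPPORTED_ENCODING = 0x2,
  INVALID_DATA_TYPE    = 0x4,
};

__global__ void decode_kernel(uint8_t const* cur, uint8_t const* end, int32_t* error_code)
{
  int32_t block_error = 0;
  // ... decoding work; on invalid input, set the matching bit:
  if (cur > end) { block_error |= static_cast<int32_t>(decode_error::DATA_STREAM_OVERRUN); }

  if (threadIdx.x == 0 && block_error != 0) {
    // Relaxed ordering suffices: the host reads only after the kernel finishes.
    cuda::atomic_ref<int32_t, cuda::thread_scope_device> ref{*error_code};
    ref.fetch_or(block_error, cuda::std::memory_order_relaxed);
  }
}
```

On the host side, the reader then copies this single value back after synchronizing and reports an error if it is nonzero (see the `reader_impl.cpp` changes below), so one word aggregates failures from all pages and all decode kernels.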
Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - https://github.com/nvdbaranec - Divye Gala (https://github.com/divyegala) - Yunsong Wang (https://github.com/PointKernel) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14167 --- cpp/src/io/parquet/page_data.cu | 25 ++++++++--- cpp/src/io/parquet/page_decode.cuh | 57 +++++++++++++++++------- cpp/src/io/parquet/page_delta_decode.cu | 25 ++++++++--- cpp/src/io/parquet/page_string_decode.cu | 25 ++++++++--- cpp/src/io/parquet/parquet_gpu.hpp | 21 +++++++++ cpp/src/io/parquet/reader_impl.cpp | 19 ++++++-- 6 files changed, 130 insertions(+), 42 deletions(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index c26802aa3c2..230834632dd 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -430,10 +430,15 @@ static __device__ void gpuOutputGeneric( * @param chunks List of column chunks * @param min_row Row index to start reading at * @param num_rows Maximum number of rows to read + * @param error_code Error code to set if an error is encountered */ template -__global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( - PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) +__global__ void __launch_bounds__(decode_block_size) + gpuDecodePageData(PageInfo* pages, + device_span chunks, + size_t min_row, + size_t num_rows, + int32_t* error_code) { __shared__ __align__(16) page_state_s state_g; __shared__ __align__(16) @@ -472,7 +477,8 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( // skipped_leaf_values will always be 0 for flat hierarchies. uint32_t skipped_leaf_values = s->page.skipped_leaf_values; - while (!s->error && (s->input_value_count < s->num_input_values || s->src_pos < s->nz_count)) { + while (s->error == 0 && + (s->input_value_count < s->num_input_values || s->src_pos < s->nz_count)) { int target_pos; int src_pos = s->src_pos; @@ -596,6 +602,10 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodePageData( } __syncthreads(); } + if (t == 0 and s->error != 0) { + cuda::atomic_ref ref{*error_code}; + ref.fetch_or(s->error, cuda::std::memory_order_relaxed); + } } struct mask_tform { @@ -621,6 +631,7 @@ void __host__ DecodePageData(cudf::detail::hostdevice_vector& pages, size_t num_rows, size_t min_row, int level_type_size, + int32_t* error_code, rmm::cuda_stream_view stream) { CUDF_EXPECTS(pages.size() > 0, "There is no page to decode"); @@ -629,11 +640,11 @@ void __host__ DecodePageData(cudf::detail::hostdevice_vector& pages, dim3 dim_grid(pages.size(), 1); // 1 threadblock per page if (level_type_size == 1) { - gpuDecodePageData - <<>>(pages.device_ptr(), chunks, min_row, num_rows); + gpuDecodePageData<<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); } else { - gpuDecodePageData - <<>>(pages.device_ptr(), chunks, min_row, num_rows); + gpuDecodePageData<<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); } } diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index 5e66885d746..cdc29197eb3 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -21,6 +21,7 @@ #include +#include #include namespace cudf::io::parquet::gpu { @@ -69,6 +70,18 @@ struct page_state_s { PageNestingDecodeInfo nesting_decode_cache[max_cacheable_nesting_decode_info]{}; // points to either nesting_decode_cache above when possible, or to the global source otherwise 
PageNestingDecodeInfo* nesting_info{}; + + inline __device__ void set_error_code(decode_error err) volatile + { + cuda::atomic_ref ref{const_cast(error)}; + ref.fetch_or(static_cast(err), cuda::std::memory_order_relaxed); + } + + inline __device__ void reset_error_code() volatile + { + cuda::atomic_ref ref{const_cast(error)}; + ref.store(0, cuda::std::memory_order_release); + } }; // buffers only used in the decode kernel. separated from page_state_s to keep @@ -471,7 +484,7 @@ __device__ void gpuDecodeStream( int32_t value_count = s->lvl_count[lvl]; int32_t batch_coded_count = 0; - while (value_count < target_count && value_count < num_input_values) { + while (s->error == 0 && value_count < target_count && value_count < num_input_values) { int batch_len; if (level_run <= 1) { // Get a new run symbol from the byte stream @@ -487,7 +500,14 @@ __device__ void gpuDecodeStream( cur++; } } - if (cur > end || level_run <= 1) { s->error = 0x10; } + if (cur > end) { + s->set_error_code(decode_error::LEVEL_STREAM_OVERRUN); + break; + } + if (level_run <= 1) { + s->set_error_code(decode_error::INVALID_LEVEL_RUN); + break; + } sym_len = (int32_t)(cur - cur_def); __threadfence_block(); } @@ -496,7 +516,7 @@ __device__ void gpuDecodeStream( level_run = shuffle(level_run); cur_def += sym_len; } - if (s->error) { break; } + if (s->error != 0) { break; } batch_len = min(num_input_values - value_count, 32); if (level_run & 1) { @@ -852,7 +872,7 @@ __device__ void gpuDecodeLevels(page_state_s* s, constexpr int batch_size = 32; int cur_leaf_count = target_leaf_count; - while (!s->error && s->nz_count < target_leaf_count && + while (s->error == 0 && s->nz_count < target_leaf_count && s->input_value_count < s->num_input_values) { if (has_repetition) { gpuDecodeStream(rep, s, cur_leaf_count, t, level_type::REPETITION); @@ -916,7 +936,7 @@ inline __device__ uint32_t InitLevelSection(page_state_s* s, } s->lvl_start[lvl] = cur; - if (cur > end) { s->error = 2; } + if (cur > end) { s->set_error_code(decode_error::LEVEL_STREAM_OVERRUN); } }; // this is a little redundant. if level_bits == 0, then nothing should be encoded @@ -941,8 +961,8 @@ inline __device__ uint32_t InitLevelSection(page_state_s* s, // add back the 4 bytes for the length len += 4; } else { - len = 0; - s->error = 2; + len = 0; + s->set_error_code(decode_error::LEVEL_STREAM_OVERRUN); } } else if (encoding == Encoding::BIT_PACKED) { len = (s->page.num_input_values * level_bits + 7) >> 3; @@ -951,8 +971,8 @@ inline __device__ uint32_t InitLevelSection(page_state_s* s, s->lvl_start[lvl] = cur; s->abs_lvl_start[lvl] = cur; } else { - s->error = 3; - len = 0; + len = 0; + s->set_error_code(decode_error::UNSUPPORTED_ENCODING); } s->abs_lvl_end[lvl] = start + len; @@ -1094,7 +1114,7 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, } if (!t) { - s->error = 0; + s->reset_error_code(); // IMPORTANT : nested schemas can have 0 rows in a page but still have // values. The case is: @@ -1152,7 +1172,7 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, break; default: // FIXED_LEN_BYTE_ARRAY: s->dtype_len = dtype_len_out; - s->error |= (s->dtype_len <= 0); + if (s->dtype_len <= 0) { s->set_error_code(decode_error::INVALID_DATA_TYPE); } break; } // Special check for downconversions @@ -1268,7 +1288,9 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, s->dict_run = 0; s->dict_val = 0; s->dict_bits = (cur < end) ? 
*cur++ : 0; - if (s->dict_bits > 32 || !s->dict_base) { s->error = (10 << 8) | s->dict_bits; } + if (s->dict_bits > 32 || !s->dict_base) { + s->set_error_code(decode_error::INVALID_DICT_WIDTH); + } break; case Encoding::PLAIN: s->dict_size = static_cast(end - cur); @@ -1279,22 +1301,23 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, // first 4 bytes are length of RLE data int const len = (cur[0]) + (cur[1] << 8) + (cur[2] << 16) + (cur[3] << 24); cur += 4; - if (cur + len > end) { s->error = 2; } + if (cur + len > end) { s->set_error_code(decode_error::DATA_STREAM_OVERRUN); } s->dict_run = 0; } break; case Encoding::DELTA_BINARY_PACKED: // nothing to do, just don't error break; - default: - s->error = 1; // Unsupported encoding + default: { + s->set_error_code(decode_error::UNSUPPORTED_ENCODING); break; + } } - if (cur > end) { s->error = 1; } + if (cur > end) { s->set_error_code(decode_error::DATA_STREAM_OVERRUN); } s->lvl_end = cur; s->data_start = cur; s->data_end = end; } else { - s->error = 1; + s->set_error_code(decode_error::EMPTY_PAGE); } s->lvl_count[level_type::REPETITION] = 0; diff --git a/cpp/src/io/parquet/page_delta_decode.cu b/cpp/src/io/parquet/page_delta_decode.cu index 35f33a761be..2b78dead205 100644 --- a/cpp/src/io/parquet/page_delta_decode.cu +++ b/cpp/src/io/parquet/page_delta_decode.cu @@ -32,8 +32,12 @@ namespace { // with V2 page headers; see https://www.mail-archive.com/dev@parquet.apache.org/msg11826.html). // this kernel only needs 96 threads (3 warps)(for now). template -__global__ void __launch_bounds__(96) gpuDecodeDeltaBinary( - PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) +__global__ void __launch_bounds__(96) + gpuDecodeDeltaBinary(PageInfo* pages, + device_span chunks, + size_t min_row, + size_t num_rows, + int32_t* error_code) { using cudf::detail::warp_size; __shared__ __align__(16) delta_binary_decoder db_state; @@ -79,7 +83,8 @@ __global__ void __launch_bounds__(96) gpuDecodeDeltaBinary( // that has a value we need. 
if (skipped_leaf_values > 0) { db->skip_values(skipped_leaf_values); } - while (!s->error && (s->input_value_count < s->num_input_values || s->src_pos < s->nz_count)) { + while (s->error == 0 && + (s->input_value_count < s->num_input_values || s->src_pos < s->nz_count)) { uint32_t target_pos; uint32_t const src_pos = s->src_pos; @@ -145,6 +150,11 @@ __global__ void __launch_bounds__(96) gpuDecodeDeltaBinary( } __syncthreads(); } + + if (t == 0 and s->error != 0) { + cuda::atomic_ref ref{*error_code}; + ref.fetch_or(s->error, cuda::std::memory_order_relaxed); + } } } // anonymous namespace @@ -157,6 +167,7 @@ void __host__ DecodeDeltaBinary(cudf::detail::hostdevice_vector& pages size_t num_rows, size_t min_row, int level_type_size, + int32_t* error_code, rmm::cuda_stream_view stream) { CUDF_EXPECTS(pages.size() > 0, "There is no page to decode"); @@ -165,11 +176,11 @@ void __host__ DecodeDeltaBinary(cudf::detail::hostdevice_vector& pages dim3 dim_grid(pages.size(), 1); // 1 threadblock per page if (level_type_size == 1) { - gpuDecodeDeltaBinary - <<>>(pages.device_ptr(), chunks, min_row, num_rows); + gpuDecodeDeltaBinary<<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); } else { - gpuDecodeDeltaBinary - <<>>(pages.device_ptr(), chunks, min_row, num_rows); + gpuDecodeDeltaBinary<<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); } } diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index 1ac4c95f713..d79abe4a6d2 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -582,8 +582,12 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz * @tparam level_t Type used to store decoded repetition and definition levels */ template -__global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( - PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) +__global__ void __launch_bounds__(decode_block_size) + gpuDecodeStringPageData(PageInfo* pages, + device_span chunks, + size_t min_row, + size_t num_rows, + int32_t* error_code) { __shared__ __align__(16) page_state_s state_g; __shared__ __align__(4) size_type last_offset; @@ -617,7 +621,8 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( // skipped_leaf_values will always be 0 for flat hierarchies. 
uint32_t skipped_leaf_values = s->page.skipped_leaf_values; - while (!s->error && (s->input_value_count < s->num_input_values || s->src_pos < s->nz_count)) { + while (s->error == 0 && + (s->input_value_count < s->num_input_values || s->src_pos < s->nz_count)) { int target_pos; int src_pos = s->src_pos; @@ -742,6 +747,11 @@ __global__ void __launch_bounds__(decode_block_size) gpuDecodeStringPageData( auto const offptr = reinterpret_cast(nesting_info_base[leaf_level_index].data_out); block_excl_sum(offptr, value_count, s->page.str_offset); + + if (t == 0 and s->error != 0) { + cuda::atomic_ref ref{*error_code}; + ref.fetch_or(s->error, cuda::std::memory_order_relaxed); + } } } // anonymous namespace @@ -775,6 +785,7 @@ void __host__ DecodeStringPageData(cudf::detail::hostdevice_vector& pa size_t num_rows, size_t min_row, int level_type_size, + int32_t* error_code, rmm::cuda_stream_view stream) { CUDF_EXPECTS(pages.size() > 0, "There is no page to decode"); @@ -783,11 +794,11 @@ void __host__ DecodeStringPageData(cudf::detail::hostdevice_vector& pa dim3 dim_grid(pages.size(), 1); // 1 threadblock per page if (level_type_size == 1) { - gpuDecodeStringPageData - <<>>(pages.device_ptr(), chunks, min_row, num_rows); + gpuDecodeStringPageData<<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); } else { - gpuDecodeStringPageData - <<>>(pages.device_ptr(), chunks, min_row, num_rows); + gpuDecodeStringPageData<<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); } } diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index a760c2448dc..3c37c0df021 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -54,6 +54,21 @@ constexpr int rolling_index(int index) return index % rolling_size; } +/** + * @brief Enum for the different types of errors that can occur during decoding. + * + * These values are used as bitmasks, so they must be powers of 2. + */ +enum class decode_error : int32_t { + DATA_STREAM_OVERRUN = 0x1, + LEVEL_STREAM_OVERRUN = 0x2, + UNSUPPORTED_ENCODING = 0x4, + INVALID_LEVEL_RUN = 0x8, + INVALID_DATA_TYPE = 0x10, + EMPTY_PAGE = 0x20, + INVALID_DICT_WIDTH = 0x40, +}; + /** * @brief Struct representing an input column in the file. 
*/ @@ -566,6 +581,7 @@ void ComputePageStringSizes(cudf::detail::hostdevice_vector& pages, * @param[in] num_rows Total number of rows to read * @param[in] min_row Minimum number of rows to read * @param[in] level_type_size Size in bytes of the type for level decoding + * @param[out] error_code Error code for kernel failures * @param[in] stream CUDA stream to use */ void DecodePageData(cudf::detail::hostdevice_vector& pages, @@ -573,6 +589,7 @@ void DecodePageData(cudf::detail::hostdevice_vector& pages, size_t num_rows, size_t min_row, int level_type_size, + int32_t* error_code, rmm::cuda_stream_view stream); /** @@ -586,6 +603,7 @@ void DecodePageData(cudf::detail::hostdevice_vector& pages, * @param[in] num_rows Total number of rows to read * @param[in] min_row Minimum number of rows to read * @param[in] level_type_size Size in bytes of the type for level decoding + * @param[out] error_code Error code for kernel failures * @param[in] stream CUDA stream to use */ void DecodeStringPageData(cudf::detail::hostdevice_vector& pages, @@ -593,6 +611,7 @@ void DecodeStringPageData(cudf::detail::hostdevice_vector& pages, size_t num_rows, size_t min_row, int level_type_size, + int32_t* error_code, rmm::cuda_stream_view stream); /** @@ -606,6 +625,7 @@ void DecodeStringPageData(cudf::detail::hostdevice_vector& pages, * @param[in] num_rows Total number of rows to read * @param[in] min_row Minimum number of rows to read * @param[in] level_type_size Size in bytes of the type for level decoding + * @param[out] error_code Error code for kernel failures * @param[in] stream CUDA stream to use, default 0 */ void DecodeDeltaBinary(cudf::detail::hostdevice_vector& pages, @@ -613,6 +633,7 @@ void DecodeDeltaBinary(cudf::detail::hostdevice_vector& pages, size_t num_rows, size_t min_row, int level_type_size, + int32_t* error_code, rmm::cuda_stream_view stream); /** diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 8b0a0bd4eb0..6cbe64e227b 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -163,6 +163,8 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) chunk_nested_valids.host_to_device_async(_stream); chunk_nested_data.host_to_device_async(_stream); + rmm::device_scalar error_code(0, _stream); + // get the number of streams we need from the pool and tell them to wait on the H2D copies int const nkernels = std::bitset<32>(kernel_mask).count(); auto streams = cudf::detail::fork_streams(_stream, nkernels); @@ -174,17 +176,20 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) if (has_strings) { auto& stream = streams[s_idx++]; chunk_nested_str_data.host_to_device_async(stream); - gpu::DecodeStringPageData(pages, chunks, num_rows, skip_rows, level_type_size, stream); + gpu::DecodeStringPageData( + pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), stream); } // launch delta binary decoder if ((kernel_mask & gpu::KERNEL_MASK_DELTA_BINARY) != 0) { - gpu::DecodeDeltaBinary(pages, chunks, num_rows, skip_rows, level_type_size, streams[s_idx++]); + gpu::DecodeDeltaBinary( + pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), streams[s_idx++]); } // launch the catch-all page decoder if ((kernel_mask & gpu::KERNEL_MASK_GENERAL) != 0) { - gpu::DecodePageData(pages, chunks, num_rows, skip_rows, level_type_size, streams[s_idx++]); + gpu::DecodePageData( + pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), streams[s_idx++]); } // synchronize 
the streams
@@ -193,7 +198,13 @@
   pages.device_to_host_async(_stream);
   page_nesting.device_to_host_async(_stream);
   page_nesting_decode.device_to_host_async(_stream);
-  _stream.synchronize();
+
+  auto const decode_error = error_code.value(_stream);
+  if (decode_error != 0) {
+    std::stringstream stream;
+    stream << std::hex << decode_error;
+    CUDF_FAIL("Parquet data decode failed with code(s) 0x" + stream.str());
+  }

   // for list columns, add the final offset to every offset buffer.
   // TODO : make this happen in more efficiently. Maybe use thrust::for_each

From 53f0f74f6c6d66441225278f19a69885fb8b43c6 Mon Sep 17 00:00:00 2001
From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com>
Date: Wed, 27 Sep 2023 23:32:46 -0500
Subject: [PATCH 21/29] Support for progressive parquet chunked reading. (#14079)

Previously, the parquet chunked reader operated by controlling the size of
output chunks only. It would still ingest the entire input file and
decompress it, which can take up a considerable amount of memory.

With this new 'progressive' support, we also 'chunk' at the input level.
Specifically, the user can pass a `pass_read_limit` value which controls how
much memory is used for storing compressed/decompressed data. The reader
will make multiple passes over the file, reading as many row groups as it
can to attempt to fit within this limit. Within each pass, chunks are
emitted as before.

From the external user's perspective, the chunked read mechanism is the
same. You call `has_next()` and `read_chunk()`. If the user has specified a
value for `pass_read_limit`, the set of chunks produced might end up being
different (although the concatenation of all of them will still be the
same).

The core of the code change is the addition of an internal `pass`.
Previously we had a `file_intermediate_data` struct which held data across
`read_chunk()` calls. There is now a `pass_intermediate_data` struct which
holds information specific to a given pass. Many of the things that were
previously invariant at the file level (row groups and chunks to process)
are now stored in the pass intermediate data. As we begin each pass, we take
the subset of global row groups and chunks that we are going to process for
this pass, copy them to our intermediate data, and the remainder of the
reader references this instead of the file-level data.
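From the caller's side the loop is unchanged; only construction differs. A
hedged sketch of the consumption loop, using the new constructor introduced
below (the file name and limit values are made up):

```
#include <cudf/io/parquet.hpp>

#include <cstddef>

void read_progressively()
{
  auto const opts =
    cudf::io::parquet_reader_options::builder(cudf::io::source_info{"example.parquet"}).build();

  std::size_t const chunk_read_limit = 480 * 1024 * 1024;   // ~480 MB output tables
  std::size_t const pass_read_limit  = 1024 * 1024 * 1024;  // ~1 GB of temporary decode memory
  auto reader = cudf::io::chunked_parquet_reader(chunk_read_limit, pass_read_limit, opts);

  while (reader.has_next()) {
    auto chunk = reader.read_chunk();  // a table_with_metadata for this output chunk
    // ... consume chunk.tbl; concatenating all chunks reproduces the whole file ...
  }
}
```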
In order to avoid breaking pre-existing interfaces, there's a new constructor
for the `chunked_parquet_reader` class:

```
chunked_parquet_reader(
    std::size_t chunk_read_limit,
    std::size_t pass_read_limit,
    parquet_reader_options const& options,
    rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
```

Authors:
  - https://github.com/nvdbaranec

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/cudf/pull/14079
---
 cpp/include/cudf/io/detail/parquet.hpp       |  39 ++-
 cpp/include/cudf/io/parquet.hpp              |  24 ++
 cpp/src/io/functions.cpp                     |  17 +
 cpp/src/io/parquet/parquet_gpu.hpp           |  69 +++-
 cpp/src/io/parquet/reader.cpp                |   4 +-
 cpp/src/io/parquet/reader_impl.cpp           | 128 ++++---
 cpp/src/io/parquet/reader_impl.hpp           |  52 ++-
 cpp/src/io/parquet/reader_impl_helpers.cpp   |   4 +-
 cpp/src/io/parquet/reader_impl_helpers.hpp   |  15 +-
 cpp/src/io/parquet/reader_impl_preprocess.cu | 344 +++++++++++++------
 cpp/tests/io/parquet_chunked_reader_test.cpp |  68 +++-
 11 files changed, 561 insertions(+), 203 deletions(-)

diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp
index 3f2e1fa5e6c..074f690d2c7 100644
--- a/cpp/include/cudf/io/detail/parquet.hpp
+++ b/cpp/include/cudf/io/detail/parquet.hpp
@@ -91,7 +91,8 @@ class reader {
 class chunked_reader : private reader {
  public:
   /**
-   * @brief Constructor from a read size limit and an array of data sources with reader options.
+   * @brief Constructor from an output size memory limit and an input size memory limit and an array
+   * of data sources with reader options.
    *
    * The typical usage should be similar to this:
    * ```
    *  do {
    *    auto const chunk = reader.read_chunk();
    *    // Process chunk
    *  } while (reader.has_next());
    *
    * ```
    *
-   * If `chunk_read_limit == 0` (i.e., no reading limit), a call to `read_chunk()` will read the
-   * whole file and return a table containing all rows.
+   * If `chunk_read_limit == 0` (i.e., no output limit) and `pass_read_limit == 0` (i.e., no input
+   * temporary memory size limit), a call to `read_chunk()` will read the whole file and return a
+   * table containing all rows.
+   *
+   * The chunk_read_limit parameter controls the size of the output chunks produced. If the user
+   * specifies 100 MB of data, the reader will attempt to return chunks containing tables that have
+   * a total byte size (over all columns) of 100 MB or less. This is a soft limit and the code
+   * will not fail if it cannot satisfy the limit. It will make a best-effort attempt only.
+   *
+   * The pass_read_limit parameter controls how much temporary memory is used in the process of
+   * decoding the file. The primary contributor to this memory usage is the uncompressed size of
+   * the data read out of the file and the decompressed (but not yet decoded) size of the data. The
+   * granularity of a given pass is at the row group level. It will not attempt to read at the sub
+   * row-group level.
+   *
+   * Combined, the way to visualize passes and chunks is as follows:
+   *
+   * @code{.pseudo}
+   * for(each pass){
+   *   for(each output chunk within a pass){
+   *     return a table that fits within the output chunk limit
+   *   }
+   * }
+   * @endcode
+   *
+   * With a pass_read_limit of `0` you are simply saying you have one pass that reads the entire
+   * file as normal.
   *
   * @param chunk_read_limit Limit on total number of bytes to be returned per read,
-   * or `0` if there is no limit
+   *        or `0` if there is no limit
+   * @param pass_read_limit Limit on total amount of memory used for temporary computations during
+   *        loading, or `0` if there is no limit
    * @param sources Input `datasource` objects to read the dataset from
    * @param options Settings for controlling reading behavior
-   * @param stream CUDA stream used for device memory operations and kernel launches.
+   * @param stream CUDA stream used for device memory operations and kernel launches
    * @param mr Device memory resource to use for device memory allocation
    */
   explicit chunked_reader(std::size_t chunk_read_limit,
+                          std::size_t pass_read_limit,
                           std::vector<std::unique_ptr<datasource>>&& sources,
                           parquet_reader_options const& options,
                           rmm::cuda_stream_view stream,
diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp
index 788ff15f3c1..deaf23d405a 100644
--- a/cpp/include/cudf/io/parquet.hpp
+++ b/cpp/include/cudf/io/parquet.hpp
@@ -445,6 +445,30 @@ class chunked_parquet_reader {
     parquet_reader_options const& options,
     rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

+  /**
+   * @brief Constructor for chunked reader.
+   *
+   * This constructor requires the same `parquet_reader_options` parameter as in
+   * `cudf::read_parquet()`, with additional parameters to specify the byte size limit of the
+   * output table for each read, and a byte limit on the amount of temporary memory to use
+   * when reading. pass_read_limit affects how many row groups we can read at a time by limiting
+   * the amount of memory dedicated to decompression space. pass_read_limit is a hint, not an
+   * absolute limit - if a single row group cannot fit within the limit given, it will still be
+   * loaded.
+   *
+   * @param chunk_read_limit Limit on total number of bytes to be returned per read,
+   *        or `0` if there is no limit
+   * @param pass_read_limit Limit on the amount of memory used for reading and decompressing data
+   *        or `0` if there is no limit
+   * @param options The options used to read the Parquet file
+   * @param mr Device memory resource to use for device memory allocation
+   */
+  chunked_parquet_reader(
+    std::size_t chunk_read_limit,
+    std::size_t pass_read_limit,
+    parquet_reader_options const& options,
+    rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
   /**
    * @brief Destructor, destroying the internal reader instance.
* diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 45f8b0f8822..392a7850886 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -562,6 +562,23 @@ chunked_parquet_reader::chunked_parquet_reader(std::size_t chunk_read_limit, parquet_reader_options const& options, rmm::mr::device_memory_resource* mr) : reader{std::make_unique(chunk_read_limit, + 0, + make_datasources(options.get_source()), + options, + cudf::get_default_stream(), + mr)} +{ +} + +/** + * @copydoc cudf::io::chunked_parquet_reader::chunked_parquet_reader + */ +chunked_parquet_reader::chunked_parquet_reader(std::size_t chunk_read_limit, + std::size_t pass_read_limit, + parquet_reader_options const& options, + rmm::mr::device_memory_resource* mr) + : reader{std::make_unique(chunk_read_limit, + pass_read_limit, make_datasources(options.get_source()), options, cudf::get_default_stream(), diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 3c37c0df021..51c862b376b 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -321,33 +321,74 @@ struct ColumnChunkDesc { }; /** - * @brief Struct to store raw/intermediate file data before parsing. + * @brief The row_group_info class + */ +struct row_group_info { + size_type index; // row group index within a file. aggregate_reader_metadata::get_row_group() is + // called with index and source_index + size_t start_row; + size_type source_index; // file index. + + row_group_info() = default; + + row_group_info(size_type index, size_t start_row, size_type source_index) + : index{index}, start_row{start_row}, source_index{source_index} + { + } +}; + +/** + * @brief Struct to store file-level data that remains constant for + * all passes/chunks for the file. */ struct file_intermediate_data { + // all row groups to read + std::vector row_groups{}; + + // all chunks from the selected row groups. We may end up reading these chunks progressively + // instead of all at once + std::vector chunks{}; + + // skip_rows/num_rows values for the entire file. these need to be adjusted per-pass because we + // may not be visiting every row group that contains these bounds + size_t global_skip_rows; + size_t global_num_rows; +}; + +/** + * @brief Structs to identify the reading row range for each chunk of rows in chunked reading. + */ +struct chunk_read_info { + size_t skip_rows; + size_t num_rows; +}; + +/** + * @brief Struct to store pass-level data that remains constant for a single pass. + */ +struct pass_intermediate_data { std::vector> raw_page_data; rmm::device_buffer decomp_page_data; + + // rowgroup, chunk and page information for the current pass. + std::vector row_groups{}; cudf::detail::hostdevice_vector chunks{}; cudf::detail::hostdevice_vector pages_info{}; cudf::detail::hostdevice_vector page_nesting_info{}; cudf::detail::hostdevice_vector page_nesting_decode_info{}; - rmm::device_buffer level_decode_data; - int level_type_size; -}; - -/** - * @brief Struct to store intermediate page data for parsing each chunk of rows in chunked reading. - */ -struct chunk_intermediate_data { rmm::device_uvector page_keys{0, rmm::cuda_stream_default}; rmm::device_uvector page_index{0, rmm::cuda_stream_default}; rmm::device_uvector str_dict_index{0, rmm::cuda_stream_default}; -}; -/** - * @brief Structs to identify the reading row range for each chunk of rows in chunked reading. 
- */ -struct chunk_read_info { + std::vector output_chunk_read_info; + std::size_t current_output_chunk{0}; + + rmm::device_buffer level_decode_data{}; + int level_type_size{0}; + + // skip_rows and num_rows values for this particular pass. these may be adjusted values from the + // global values stored in file_intermediate_data. size_t skip_rows; size_t num_rows; }; diff --git a/cpp/src/io/parquet/reader.cpp b/cpp/src/io/parquet/reader.cpp index 7365c102d8f..1e87447006d 100644 --- a/cpp/src/io/parquet/reader.cpp +++ b/cpp/src/io/parquet/reader.cpp @@ -43,12 +43,14 @@ table_with_metadata reader::read(parquet_reader_options const& options) } chunked_reader::chunked_reader(std::size_t chunk_read_limit, + std::size_t pass_read_limit, std::vector>&& sources, parquet_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - _impl = std::make_unique(chunk_read_limit, std::move(sources), options, stream, mr); + _impl = std::make_unique( + chunk_read_limit, pass_read_limit, std::move(sources), options, stream, mr); } chunked_reader::~chunked_reader() = default; diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 6cbe64e227b..ea40f29a070 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -29,10 +29,10 @@ namespace cudf::io::detail::parquet { void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) { - auto& chunks = _file_itm_data.chunks; - auto& pages = _file_itm_data.pages_info; - auto& page_nesting = _file_itm_data.page_nesting_info; - auto& page_nesting_decode = _file_itm_data.page_nesting_decode_info; + auto& chunks = _pass_itm_data->chunks; + auto& pages = _pass_itm_data->pages_info; + auto& page_nesting = _pass_itm_data->page_nesting_info; + auto& page_nesting_decode = _pass_itm_data->page_nesting_decode_info; // Should not reach here if there is no page data. 
CUDF_EXPECTS(pages.size() > 0, "There is no page to decode"); @@ -55,7 +55,7 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) std::vector col_sizes(_input_columns.size(), 0L); if (has_strings) { gpu::ComputePageStringSizes( - pages, chunks, skip_rows, num_rows, _file_itm_data.level_type_size, _stream); + pages, chunks, skip_rows, num_rows, _pass_itm_data->level_type_size, _stream); col_sizes = calculate_page_string_offsets(); @@ -169,7 +169,7 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) int const nkernels = std::bitset<32>(kernel_mask).count(); auto streams = cudf::detail::fork_streams(_stream, nkernels); - auto const level_type_size = _file_itm_data.level_type_size; + auto const level_type_size = _pass_itm_data->level_type_size; // launch string decoder int s_idx = 0; @@ -277,6 +277,7 @@ reader::impl::impl(std::vector>&& sources, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) : impl(0 /*chunk_read_limit*/, + 0 /*input_pass_read_limit*/, std::forward>>(sources), options, stream, @@ -285,11 +286,16 @@ reader::impl::impl(std::vector>&& sources, } reader::impl::impl(std::size_t chunk_read_limit, + std::size_t pass_read_limit, std::vector>&& sources, parquet_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - : _stream{stream}, _mr{mr}, _sources{std::move(sources)}, _chunk_read_limit{chunk_read_limit} + : _stream{stream}, + _mr{mr}, + _sources{std::move(sources)}, + _output_chunk_read_limit{chunk_read_limit}, + _input_pass_read_limit{pass_read_limit} { // Open and parse the source dataset metadata _metadata = std::make_unique(_sources); @@ -313,11 +319,8 @@ reader::impl::impl(std::size_t chunk_read_limit, _timestamp_type.id()); // Save the states of the output buffers for reuse in `chunk_read()`. - // Don't need to do it if we read the file all at once. - if (_chunk_read_limit > 0) { - for (auto const& buff : _output_buffers) { - _output_buffers_template.emplace_back(inline_column_buffer::empty_like(buff)); - } + for (auto const& buff : _output_buffers) { + _output_buffers_template.emplace_back(inline_column_buffer::empty_like(buff)); } } @@ -327,32 +330,62 @@ void reader::impl::prepare_data(int64_t skip_rows, host_span const> row_group_indices, std::optional> filter) { - if (_file_preprocessed) { return; } + // if we have not preprocessed at the whole-file level, do that now + if (!_file_preprocessed) { + // if filter is not empty, then create output types as vector and pass for filtering. + std::vector output_types; + if (filter.has_value()) { + std::transform(_output_buffers.cbegin(), + _output_buffers.cend(), + std::back_inserter(output_types), + [](auto const& col) { return col.type; }); + } + std::tie( + _file_itm_data.global_skip_rows, _file_itm_data.global_num_rows, _file_itm_data.row_groups) = + _metadata->select_row_groups( + row_group_indices, skip_rows, num_rows, output_types, filter, _stream); + + if (_file_itm_data.global_num_rows > 0 && not _file_itm_data.row_groups.empty() && + not _input_columns.empty()) { + // fills in chunk information without physically loading or decompressing + // the associated data + load_global_chunk_info(); + + // compute schedule of input reads. Each rowgroup contains 1 chunk per column. For now + // we will read an entire row group at a time. 
However, it is possible to do + // sub-rowgroup reads if we made some estimates on individual chunk sizes (tricky) and + // changed the high level structure such that we weren't always reading an entire table's + // worth of columns at once. + compute_input_pass_row_group_info(); + } - // if filter is not empty, then create output types as vector and pass for filtering. - std::vector output_types; - if (filter.has_value()) { - std::transform(_output_buffers.cbegin(), - _output_buffers.cend(), - std::back_inserter(output_types), - [](auto const& col) { return col.type; }); + _file_preprocessed = true; } - auto const [skip_rows_corrected, num_rows_corrected, row_groups_info] = - _metadata->select_row_groups( - row_group_indices, skip_rows, num_rows, output_types, filter, _stream); - - if (num_rows_corrected > 0 && not row_groups_info.empty() && not _input_columns.empty()) { - load_and_decompress_data(row_groups_info, num_rows_corrected); - preprocess_pages( - skip_rows_corrected, num_rows_corrected, uses_custom_row_bounds, _chunk_read_limit); - - if (_chunk_read_limit == 0) { // read the whole file at once - CUDF_EXPECTS(_chunk_read_info.size() == 1, - "Reading the whole file should yield only one chunk."); + + // if we have to start a new pass, do that now + if (!_pass_preprocessed) { + auto const num_passes = _input_pass_row_group_offsets.size() - 1; + + // always create the pass struct, even if we end up with no passes. + // this will also cause the previous pass information to be deleted + _pass_itm_data = std::make_unique(); + + if (_file_itm_data.global_num_rows > 0 && not _file_itm_data.row_groups.empty() && + not _input_columns.empty() && _current_input_pass < num_passes) { + // setup the pass_intermediate_info for this pass. + setup_pass(); + + load_and_decompress_data(); + preprocess_pages(uses_custom_row_bounds, _output_chunk_read_limit); + + if (_output_chunk_read_limit == 0) { // read the whole file at once + CUDF_EXPECTS(_pass_itm_data->output_chunk_read_info.size() == 1, + "Reading the whole file should yield only one chunk."); + } } - } - _file_preprocessed = true; + _pass_preprocessed = true; + } } void reader::impl::populate_metadata(table_metadata& out_metadata) @@ -382,11 +415,12 @@ table_with_metadata reader::impl::read_chunk_internal( auto out_columns = std::vector>{}; out_columns.reserve(_output_buffers.size()); - if (!has_next() || _chunk_read_info.empty()) { + if (!has_next() || _pass_itm_data->output_chunk_read_info.empty()) { return finalize_output(out_metadata, out_columns, filter); } - auto const& read_info = _chunk_read_info[_current_read_chunk++]; + auto const& read_info = + _pass_itm_data->output_chunk_read_info[_pass_itm_data->current_output_chunk]; // Allocate memory buffers for the output columns. 
allocate_columns(read_info.skip_rows, read_info.num_rows, uses_custom_row_bounds); @@ -439,6 +473,17 @@ table_with_metadata reader::impl::finalize_output( _output_metadata = std::make_unique(out_metadata); } + // advance chunks/passes as necessary + _pass_itm_data->current_output_chunk++; + _chunk_count++; + if (_pass_itm_data->current_output_chunk >= _pass_itm_data->output_chunk_read_info.size()) { + _pass_itm_data->current_output_chunk = 0; + _pass_itm_data->output_chunk_read_info.clear(); + + _current_input_pass++; + _pass_preprocessed = false; + } + if (filter.has_value()) { auto read_table = std::make_unique(std::move(out_columns)); auto predicate = cudf::detail::compute_column( @@ -458,7 +503,8 @@ table_with_metadata reader::impl::read( host_span const> row_group_indices, std::optional> filter) { - CUDF_EXPECTS(_chunk_read_limit == 0, "Reading the whole file must not have non-zero byte_limit."); + CUDF_EXPECTS(_output_chunk_read_limit == 0, + "Reading the whole file must not have non-zero byte_limit."); table_metadata metadata; populate_metadata(metadata); auto expr_conv = named_to_reference_converter(filter, metadata); @@ -472,7 +518,7 @@ table_with_metadata reader::impl::read_chunk() { // Reset the output buffers to their original states (right after reader construction). // Don't need to do it if we read the file all at once. - if (_chunk_read_limit > 0) { + if (_chunk_count > 0) { _output_buffers.resize(0); for (auto const& buff : _output_buffers_template) { _output_buffers.emplace_back(inline_column_buffer::empty_like(buff)); @@ -494,7 +540,11 @@ bool reader::impl::has_next() true /*uses_custom_row_bounds*/, {} /*row_group_indices, empty means read all row groups*/, std::nullopt /*filter*/); - return _current_read_chunk < _chunk_read_info.size(); + + auto const num_input_passes = + _input_pass_row_group_offsets.size() == 0 ? 0 : _input_pass_row_group_offsets.size() - 1; + return (_pass_itm_data->current_output_chunk < _pass_itm_data->output_chunk_read_info.size()) || + (_current_input_pass < num_input_passes); } namespace { diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index a980670e465..9445e4d1648 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -90,17 +90,20 @@ class reader::impl { * ``` * * Reading the whole given file at once through `read()` function is still supported if - * `chunk_read_limit == 0` (i.e., no reading limit). - * In such case, `read_chunk()` will also return rows of the entire file. + * `chunk_read_limit == 0` (i.e., no reading limit) and `pass_read_limit == 0` (no temporary + * memory limit) In such case, `read_chunk()` will also return rows of the entire file. * * @param chunk_read_limit Limit on total number of bytes to be returned per read, * or `0` if there is no limit + * @param pass_read_limit Limit on memory usage for the purposes of decompression and processing + * of input, or `0` if there is no limit. 
* @param sources Dataset sources * @param options Settings for controlling reading behavior * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ explicit impl(std::size_t chunk_read_limit, + std::size_t pass_read_limit, std::vector>&& sources, parquet_reader_options const& options, rmm::cuda_stream_view stream, @@ -133,22 +136,22 @@ class reader::impl { host_span const> row_group_indices, std::optional> filter); + void load_global_chunk_info(); + void compute_input_pass_row_group_info(); + void setup_pass(); + /** * @brief Create chunk information and start file reads * - * @param row_groups_info vector of information about row groups to read - * @param num_rows Maximum number of rows to read * @return pair of boolean indicating if compressed chunks were found and a vector of futures for * read completion */ - std::pair>> create_and_read_column_chunks( - cudf::host_span const row_groups_info, size_type num_rows); + std::pair>> read_and_decompress_column_chunks(); /** * @brief Load and decompress the input file(s) into memory. */ - void load_and_decompress_data(cudf::host_span const row_groups_info, - size_type num_rows); + void load_and_decompress_data(); /** * @brief Perform some preprocessing for page data and also compute the split locations @@ -161,17 +164,12 @@ class reader::impl { * * For flat schemas, these values are computed during header decoding (see gpuDecodePageHeaders). * - * @param skip_rows Crop all rows below skip_rows - * @param num_rows Maximum number of rows to read * @param uses_custom_row_bounds Whether or not num_rows and skip_rows represents user-specific * bounds * @param chunk_read_limit Limit on total number of bytes to be returned per read, * or `0` if there is no limit */ - void preprocess_pages(size_t skip_rows, - size_t num_rows, - bool uses_custom_row_bounds, - size_t chunk_read_limit); + void preprocess_pages(bool uses_custom_row_bounds, size_t chunk_read_limit); /** * @brief Allocate nesting information storage for all pages and set pointers to it. @@ -278,12 +276,28 @@ class reader::impl { std::optional> _reader_column_schema; data_type _timestamp_type{type_id::EMPTY}; - // Variables used for chunked reading: + // chunked reading happens in 2 parts: + // + // At the top level there is the "pass" in which we try and limit the + // total amount of temporary memory (compressed data, decompressed data) in use + // via _input_pass_read_limit. + // + // Within a pass, we produce one or more chunks of output, whose maximum total + // byte size is controlled by _output_chunk_read_limit. + cudf::io::parquet::gpu::file_intermediate_data _file_itm_data; - cudf::io::parquet::gpu::chunk_intermediate_data _chunk_itm_data; - std::vector _chunk_read_info; - std::size_t _chunk_read_limit{0}; - std::size_t _current_read_chunk{0}; + std::unique_ptr _pass_itm_data; + + // an array of offsets into _file_itm_data::global_chunks. Each pair of offsets represents + // the start/end of the chunks to be loaded for a given pass. 
+ std::vector _input_pass_row_group_offsets{}; + std::vector _input_pass_row_count{}; + std::size_t _current_input_pass{0}; + std::size_t _chunk_count{0}; + + std::size_t _output_chunk_read_limit{0}; + std::size_t _input_pass_read_limit{0}; + bool _pass_preprocessed{false}; bool _file_preprocessed{false}; }; diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index f6dbeb275fc..fcaa610fbb7 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -344,7 +344,7 @@ std::vector aggregate_reader_metadata::get_pandas_index_names() con return names; } -std::tuple> +std::tuple> aggregate_reader_metadata::select_row_groups( host_span const> row_group_indices, int64_t skip_rows_opt, @@ -362,7 +362,7 @@ aggregate_reader_metadata::select_row_groups( host_span const>(filtered_row_group_indices.value()); } } - std::vector selection; + std::vector selection; auto [rows_to_skip, rows_to_read] = [&]() { if (not row_group_indices.empty()) { return std::pair{}; } auto const from_opts = cudf::io::detail::skip_rows_num_rows_from_options( diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp index 751ffc33123..61e4f94df0f 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.hpp +++ b/cpp/src/io/parquet/reader_impl_helpers.hpp @@ -53,19 +53,6 @@ using namespace cudf::io::parquet; : data_type{t_id}; } -/** - * @brief The row_group_info class - */ -struct row_group_info { - size_type const index; - size_t const start_row; // TODO source index - size_type const source_index; - row_group_info(size_type index, size_t start_row, size_type source_index) - : index(index), start_row(start_row), source_index(source_index) - { - } -}; - /** * @brief Class for parsing dataset metadata */ @@ -194,7 +181,7 @@ class aggregate_reader_metadata { * @return A tuple of corrected row_start, row_count and list of row group indexes and its * starting row */ - [[nodiscard]] std::tuple> select_row_groups( + [[nodiscard]] std::tuple> select_row_groups( host_span const> row_group_indices, int64_t row_start, std::optional const& row_count, diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index a2db0de26bb..c731c467f2c 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -577,10 +577,10 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c void reader::impl::allocate_nesting_info() { - auto const& chunks = _file_itm_data.chunks; - auto& pages = _file_itm_data.pages_info; - auto& page_nesting_info = _file_itm_data.page_nesting_info; - auto& page_nesting_decode_info = _file_itm_data.page_nesting_decode_info; + auto const& chunks = _pass_itm_data->chunks; + auto& pages = _pass_itm_data->pages_info; + auto& page_nesting_info = _pass_itm_data->page_nesting_info; + auto& page_nesting_decode_info = _pass_itm_data->page_nesting_decode_info; // compute total # of page_nesting infos needed and allocate space. doing this in one // buffer to keep it to a single gpu allocation @@ -702,38 +702,39 @@ void reader::impl::allocate_nesting_info() void reader::impl::allocate_level_decode_space() { - auto& pages = _file_itm_data.pages_info; + auto& pages = _pass_itm_data->pages_info; // TODO: this could be made smaller if we ignored dictionary pages and pages with no // repetition data. 
size_t const per_page_decode_buf_size = - LEVEL_DECODE_BUF_SIZE * 2 * _file_itm_data.level_type_size; + LEVEL_DECODE_BUF_SIZE * 2 * _pass_itm_data->level_type_size; auto const decode_buf_size = per_page_decode_buf_size * pages.size(); - _file_itm_data.level_decode_data = + _pass_itm_data->level_decode_data = rmm::device_buffer(decode_buf_size, _stream, rmm::mr::get_current_device_resource()); // distribute the buffers - uint8_t* buf = static_cast(_file_itm_data.level_decode_data.data()); + uint8_t* buf = static_cast(_pass_itm_data->level_decode_data.data()); for (size_t idx = 0; idx < pages.size(); idx++) { auto& p = pages[idx]; p.lvl_decode_buf[gpu::level_type::DEFINITION] = buf; - buf += (LEVEL_DECODE_BUF_SIZE * _file_itm_data.level_type_size); + buf += (LEVEL_DECODE_BUF_SIZE * _pass_itm_data->level_type_size); p.lvl_decode_buf[gpu::level_type::REPETITION] = buf; - buf += (LEVEL_DECODE_BUF_SIZE * _file_itm_data.level_type_size); + buf += (LEVEL_DECODE_BUF_SIZE * _pass_itm_data->level_type_size); } } -std::pair>> reader::impl::create_and_read_column_chunks( - cudf::host_span const row_groups_info, size_type num_rows) +std::pair>> reader::impl::read_and_decompress_column_chunks() { - auto& raw_page_data = _file_itm_data.raw_page_data; - auto& chunks = _file_itm_data.chunks; + auto const& row_groups_info = _pass_itm_data->row_groups; + auto const num_rows = _pass_itm_data->num_rows; + + auto& raw_page_data = _pass_itm_data->raw_page_data; + auto& chunks = _pass_itm_data->chunks; // Descriptors for all the chunks that make up the selected columns auto const num_input_columns = _input_columns.size(); auto const num_chunks = row_groups_info.size() * num_input_columns; - chunks = cudf::detail::hostdevice_vector(0, num_chunks, _stream); // Association between each column chunk and its source std::vector chunk_source_map(num_chunks); @@ -747,13 +748,68 @@ std::pair>> reader::impl::create_and_read_co // Initialize column chunk information size_t total_decompressed_size = 0; auto remaining_rows = num_rows; - std::vector> read_rowgroup_tasks; + std::vector> read_chunk_tasks; + size_type chunk_count = 0; for (auto const& rg : row_groups_info) { auto const& row_group = _metadata->get_row_group(rg.index, rg.source_index); - auto const row_group_start = rg.start_row; auto const row_group_source = rg.source_index; auto const row_group_rows = std::min(remaining_rows, row_group.num_rows); + // generate ColumnChunkDesc objects for everything to be decoded (all input columns) + for (size_t i = 0; i < num_input_columns; ++i) { + auto const& col = _input_columns[i]; + // look up metadata + auto& col_meta = _metadata->get_column_metadata(rg.index, rg.source_index, col.schema_idx); + + column_chunk_offsets[chunk_count] = + (col_meta.dictionary_page_offset != 0) + ? 
std::min(col_meta.data_page_offset, col_meta.dictionary_page_offset) + : col_meta.data_page_offset; + + // Map each column chunk to its column index and its source index + chunk_source_map[chunk_count] = row_group_source; + + if (col_meta.codec != Compression::UNCOMPRESSED) { + total_decompressed_size += col_meta.total_uncompressed_size; + } + + chunk_count++; + } + remaining_rows -= row_group_rows; + } + + // Read compressed chunk data to device memory + read_chunk_tasks.push_back(read_column_chunks_async(_sources, + raw_page_data, + chunks, + 0, + chunks.size(), + column_chunk_offsets, + chunk_source_map, + _stream)); + + CUDF_EXPECTS(remaining_rows == 0, "All rows data must be read."); + + return {total_decompressed_size > 0, std::move(read_chunk_tasks)}; +} + +void reader::impl::load_global_chunk_info() +{ + auto const num_rows = _file_itm_data.global_num_rows; + auto const& row_groups_info = _file_itm_data.row_groups; + auto& chunks = _file_itm_data.chunks; + + // Descriptors for all the chunks that make up the selected columns + auto const num_input_columns = _input_columns.size(); + auto const num_chunks = row_groups_info.size() * num_input_columns; + + // Initialize column chunk information + auto remaining_rows = num_rows; + for (auto const& rg : row_groups_info) { + auto const& row_group = _metadata->get_row_group(rg.index, rg.source_index); + auto const row_group_start = rg.start_row; + auto const row_group_rows = std::min(remaining_rows, row_group.num_rows); + // generate ColumnChunkDesc objects for everything to be decoded (all input columns) for (size_t i = 0; i < num_input_columns; ++i) { auto col = _input_columns[i]; @@ -768,11 +824,6 @@ std::pair>> reader::impl::create_and_read_co schema.converted_type, schema.type_length); - column_chunk_offsets[chunks.size()] = - (col_meta.dictionary_page_offset != 0) - ? std::min(col_meta.data_page_offset, col_meta.dictionary_page_offset) - : col_meta.data_page_offset; - chunks.push_back(gpu::ColumnChunkDesc(col_meta.total_compressed_size, nullptr, col_meta.num_values, @@ -792,92 +843,171 @@ std::pair>> reader::impl::create_and_read_co clock_rate, i, col.schema_idx)); - - // Map each column chunk to its column index and its source index - chunk_source_map[chunks.size() - 1] = row_group_source; - - if (col_meta.codec != Compression::UNCOMPRESSED) { - total_decompressed_size += col_meta.total_uncompressed_size; - } } + remaining_rows -= row_group_rows; } +} - // Read compressed chunk data to device memory - read_rowgroup_tasks.push_back(read_column_chunks_async(_sources, - raw_page_data, - chunks, - 0, - chunks.size(), - column_chunk_offsets, - chunk_source_map, - _stream)); +void reader::impl::compute_input_pass_row_group_info() +{ + // at this point, row_groups has already been filtered down to just the row groups we need to + // handle optional skip_rows/num_rows parameters. + auto const& row_groups_info = _file_itm_data.row_groups; + + // if the user hasn't specified an input size limit, read everything in a single pass. + if (_input_pass_read_limit == 0) { + _input_pass_row_group_offsets.push_back(0); + _input_pass_row_group_offsets.push_back(row_groups_info.size()); + return; + } - CUDF_EXPECTS(remaining_rows == 0, "All rows data must be read."); + // generate passes. make sure to account for the case where a single row group doesn't fit within + // + std::size_t const read_limit = + _input_pass_read_limit > 0 ? 
_input_pass_read_limit : std::numeric_limits::max(); + std::size_t cur_pass_byte_size = 0; + std::size_t cur_rg_start = 0; + std::size_t cur_row_count = 0; + _input_pass_row_group_offsets.push_back(0); + _input_pass_row_count.push_back(0); + + for (size_t cur_rg_index = 0; cur_rg_index < row_groups_info.size(); cur_rg_index++) { + auto const& rgi = row_groups_info[cur_rg_index]; + auto const& row_group = _metadata->get_row_group(rgi.index, rgi.source_index); + + // can we add this row group + if (cur_pass_byte_size + row_group.total_byte_size >= read_limit) { + // A single row group (the current one) is larger than the read limit: + // We always need to include at least one row group, so end the pass at the end of the current + // row group + if (cur_rg_start == cur_rg_index) { + _input_pass_row_group_offsets.push_back(cur_rg_index + 1); + _input_pass_row_count.push_back(cur_row_count + row_group.num_rows); + cur_rg_start = cur_rg_index + 1; + cur_pass_byte_size = 0; + } + // End the pass at the end of the previous row group + else { + _input_pass_row_group_offsets.push_back(cur_rg_index); + _input_pass_row_count.push_back(cur_row_count); + cur_rg_start = cur_rg_index; + cur_pass_byte_size = row_group.total_byte_size; + } + } else { + cur_pass_byte_size += row_group.total_byte_size; + } + cur_row_count += row_group.num_rows; + } + // add the last pass if necessary + if (_input_pass_row_group_offsets.back() != row_groups_info.size()) { + _input_pass_row_group_offsets.push_back(row_groups_info.size()); + _input_pass_row_count.push_back(cur_row_count); + } +} - return {total_decompressed_size > 0, std::move(read_rowgroup_tasks)}; +void reader::impl::setup_pass() +{ + // this will also cause the previous pass information to be deleted + _pass_itm_data = std::make_unique(); + + // setup row groups to be loaded for this pass + auto const row_group_start = _input_pass_row_group_offsets[_current_input_pass]; + auto const row_group_end = _input_pass_row_group_offsets[_current_input_pass + 1]; + auto const num_row_groups = row_group_end - row_group_start; + _pass_itm_data->row_groups.resize(num_row_groups); + std::copy(_file_itm_data.row_groups.begin() + row_group_start, + _file_itm_data.row_groups.begin() + row_group_end, + _pass_itm_data->row_groups.begin()); + + auto const num_passes = _input_pass_row_group_offsets.size() - 1; + CUDF_EXPECTS(_current_input_pass < num_passes, "Encountered an invalid read pass index"); + + auto const chunks_per_rowgroup = _input_columns.size(); + auto const num_chunks = chunks_per_rowgroup * num_row_groups; + + auto chunk_start = _file_itm_data.chunks.begin() + (row_group_start * chunks_per_rowgroup); + auto chunk_end = _file_itm_data.chunks.begin() + (row_group_end * chunks_per_rowgroup); + + _pass_itm_data->chunks = + cudf::detail::hostdevice_vector(num_chunks, _stream); + std::copy(chunk_start, chunk_end, _pass_itm_data->chunks.begin()); + + // adjust skip_rows and num_rows by what's available in the row groups we are processing + if (num_passes == 1) { + _pass_itm_data->skip_rows = _file_itm_data.global_skip_rows; + _pass_itm_data->num_rows = _file_itm_data.global_num_rows; + } else { + auto const global_start_row = _file_itm_data.global_skip_rows; + auto const global_end_row = global_start_row + _file_itm_data.global_num_rows; + auto const start_row = std::max(_input_pass_row_count[_current_input_pass], global_start_row); + auto const end_row = std::min(_input_pass_row_count[_current_input_pass + 1], global_end_row); + + // skip_rows is always global in the 
sense that it is relative to the first row of + // everything we will be reading, regardless of what pass we are on. + // num_rows is how many rows we are reading this pass. + _pass_itm_data->skip_rows = global_start_row + _input_pass_row_count[_current_input_pass]; + _pass_itm_data->num_rows = end_row - start_row; + } } -void reader::impl::load_and_decompress_data( - cudf::host_span const row_groups_info, size_type num_rows) +void reader::impl::load_and_decompress_data() { // This function should never be called if `num_rows == 0`. - CUDF_EXPECTS(num_rows > 0, "Number of reading rows must not be zero."); + CUDF_EXPECTS(_pass_itm_data->num_rows > 0, "Number of reading rows must not be zero."); - auto& raw_page_data = _file_itm_data.raw_page_data; - auto& decomp_page_data = _file_itm_data.decomp_page_data; - auto& chunks = _file_itm_data.chunks; - auto& pages = _file_itm_data.pages_info; + auto& raw_page_data = _pass_itm_data->raw_page_data; + auto& decomp_page_data = _pass_itm_data->decomp_page_data; + auto& chunks = _pass_itm_data->chunks; + auto& pages = _pass_itm_data->pages_info; - auto const [has_compressed_data, read_rowgroup_tasks] = - create_and_read_column_chunks(row_groups_info, num_rows); + auto const [has_compressed_data, read_chunks_tasks] = read_and_decompress_column_chunks(); - for (auto& task : read_rowgroup_tasks) { + for (auto& task : read_chunks_tasks) { task.wait(); } // Process dataset chunk pages into output columns auto const total_pages = count_page_headers(chunks, _stream); + if (total_pages <= 0) { return; } pages = cudf::detail::hostdevice_vector(total_pages, total_pages, _stream); - if (total_pages > 0) { - // decoding of column/page information - _file_itm_data.level_type_size = decode_page_headers(chunks, pages, _stream); - if (has_compressed_data) { - decomp_page_data = decompress_page_data(chunks, pages, _stream); - // Free compressed data - for (size_t c = 0; c < chunks.size(); c++) { - if (chunks[c].codec != parquet::Compression::UNCOMPRESSED) { raw_page_data[c].reset(); } - } + // decoding of column/page information + _pass_itm_data->level_type_size = decode_page_headers(chunks, pages, _stream); + if (has_compressed_data) { + decomp_page_data = decompress_page_data(chunks, pages, _stream); + // Free compressed data + for (size_t c = 0; c < chunks.size(); c++) { + if (chunks[c].codec != parquet::Compression::UNCOMPRESSED) { raw_page_data[c].reset(); } } + } - // build output column info - // walk the schema, building out_buffers that mirror what our final cudf columns will look - // like. important : there is not necessarily a 1:1 mapping between input columns and output - // columns. For example, parquet does not explicitly store a ColumnChunkDesc for struct - // columns. The "structiness" is simply implied by the schema. For example, this schema: - // required group field_id=1 name { - // required binary field_id=2 firstname (String); - // required binary field_id=3 middlename (String); - // required binary field_id=4 lastname (String); - // } - // will only contain 3 columns of data (firstname, middlename, lastname). But of course - // "name" is a struct column that we want to return, so we have to make sure that we - // create it ourselves. 
- // std::vector output_info = build_output_column_info(); - - // the following two allocate functions modify the page data - pages.device_to_host_sync(_stream); - { - // nesting information (sizes, etc) stored -per page- - // note : even for flat schemas, we allocate 1 level of "nesting" info - allocate_nesting_info(); + // build output column info + // walk the schema, building out_buffers that mirror what our final cudf columns will look + // like. important : there is not necessarily a 1:1 mapping between input columns and output + // columns. For example, parquet does not explicitly store a ColumnChunkDesc for struct + // columns. The "structiness" is simply implied by the schema. For example, this schema: + // required group field_id=1 name { + // required binary field_id=2 firstname (String); + // required binary field_id=3 middlename (String); + // required binary field_id=4 lastname (String); + // } + // will only contain 3 columns of data (firstname, middlename, lastname). But of course + // "name" is a struct column that we want to return, so we have to make sure that we + // create it ourselves. + // std::vector output_info = build_output_column_info(); + + // the following two allocate functions modify the page data + pages.device_to_host_sync(_stream); + { + // nesting information (sizes, etc) stored -per page- + // note : even for flat schemas, we allocate 1 level of "nesting" info + allocate_nesting_info(); - // level decode space - allocate_level_decode_space(); - } - pages.host_to_device_async(_stream); + // level decode space + allocate_level_decode_space(); } + pages.host_to_device_async(_stream); } namespace { @@ -1183,7 +1313,7 @@ std::vector find_splits(std::vector c */ std::vector compute_splits( cudf::detail::hostdevice_vector& pages, - gpu::chunk_intermediate_data const& id, + gpu::pass_intermediate_data const& id, size_t num_rows, size_t chunk_read_limit, rmm::cuda_stream_view stream) @@ -1539,13 +1669,12 @@ struct page_offset_output_iter { } // anonymous namespace -void reader::impl::preprocess_pages(size_t skip_rows, - size_t num_rows, - bool uses_custom_row_bounds, - size_t chunk_read_limit) +void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_read_limit) { - auto& chunks = _file_itm_data.chunks; - auto& pages = _file_itm_data.pages_info; + auto const skip_rows = _pass_itm_data->skip_rows; + auto const num_rows = _pass_itm_data->num_rows; + auto& chunks = _pass_itm_data->chunks; + auto& pages = _pass_itm_data->pages_info; // compute page ordering. 
// @@ -1636,7 +1765,7 @@ void reader::impl::preprocess_pages(size_t skip_rows, // Build index for string dictionaries since they can't be indexed // directly due to variable-sized elements - _chunk_itm_data.str_dict_index = + _pass_itm_data->str_dict_index = cudf::detail::make_zeroed_device_uvector_async( total_str_dict_indexes, _stream, rmm::mr::get_current_device_resource()); @@ -1646,7 +1775,7 @@ void reader::impl::preprocess_pages(size_t skip_rows, CUDF_EXPECTS(input_col.schema_idx == chunks[c].src_col_schema, "Column/page schema index mismatch"); if (is_dict_chunk(chunks[c])) { - chunks[c].str_dict_index = _chunk_itm_data.str_dict_index.data() + str_ofs; + chunks[c].str_dict_index = _pass_itm_data->str_dict_index.data() + str_ofs; str_ofs += pages[page_count].num_input_values; } @@ -1677,7 +1806,7 @@ void reader::impl::preprocess_pages(size_t skip_rows, std::numeric_limits::max(), true, // compute num_rows chunk_read_limit > 0, // compute string sizes - _file_itm_data.level_type_size, + _pass_itm_data->level_type_size, _stream); // computes: @@ -1699,20 +1828,21 @@ void reader::impl::preprocess_pages(size_t skip_rows, } // preserve page ordering data for string decoder - _chunk_itm_data.page_keys = std::move(page_keys); - _chunk_itm_data.page_index = std::move(page_index); + _pass_itm_data->page_keys = std::move(page_keys); + _pass_itm_data->page_index = std::move(page_index); // compute splits if necessary. otherwise return a single split representing // the whole file. - _chunk_read_info = chunk_read_limit > 0 - ? compute_splits(pages, _chunk_itm_data, num_rows, chunk_read_limit, _stream) - : std::vector{{skip_rows, num_rows}}; + _pass_itm_data->output_chunk_read_info = + _output_chunk_read_limit > 0 + ? compute_splits(pages, *_pass_itm_data, num_rows, chunk_read_limit, _stream) + : std::vector{{skip_rows, num_rows}}; } void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses_custom_row_bounds) { - auto const& chunks = _file_itm_data.chunks; - auto& pages = _file_itm_data.pages_info; + auto const& chunks = _pass_itm_data->chunks; + auto& pages = _pass_itm_data->pages_info; // Should not reach here if there is no page data. 
CUDF_EXPECTS(pages.size() > 0, "There is no page to parse");
@@ -1729,7 +1859,7 @@ void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses
 num_rows,
 false, // num_rows is already computed
 false, // no need to compute string sizes
- _file_itm_data.level_type_size,
+ _pass_itm_data->level_type_size,
 _stream);

 // print_pages(pages, _stream);
@@ -1766,7 +1896,7 @@ void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses

 // compute output column sizes by examining the pages of the -input- columns
 if (has_lists) {
- auto& page_index = _chunk_itm_data.page_index;
+ auto& page_index = _pass_itm_data->page_index;

 std::vector h_cols_info;
 h_cols_info.reserve(_input_columns.size());
@@ -1846,10 +1976,10 @@ void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses

 std::vector reader::impl::calculate_page_string_offsets()
 {
- auto& chunks = _file_itm_data.chunks;
- auto& pages = _file_itm_data.pages_info;
- auto const& page_keys = _chunk_itm_data.page_keys;
- auto const& page_index = _chunk_itm_data.page_index;
+ auto& chunks = _pass_itm_data->chunks;
+ auto& pages = _pass_itm_data->pages_info;
+ auto const& page_keys = _pass_itm_data->page_keys;
+ auto const& page_index = _pass_itm_data->page_index;

 std::vector col_sizes(_input_columns.size(), 0L);
 rmm::device_uvector d_col_sizes(col_sizes.size(), _stream);
diff --git a/cpp/tests/io/parquet_chunked_reader_test.cpp b/cpp/tests/io/parquet_chunked_reader_test.cpp
index 9815304b965..05fb9a3ec48 100644
--- a/cpp/tests/io/parquet_chunked_reader_test.cpp
+++ b/cpp/tests/io/parquet_chunked_reader_test.cpp
@@ -100,11 +100,13 @@ auto write_file(std::vector>& input_columns,
 return std::pair{std::move(input_table), std::move(filepath)};
 }

-auto chunked_read(std::string const& filepath, std::size_t byte_limit)
+auto chunked_read(std::string const& filepath,
+                  std::size_t output_limit,
+                  std::size_t input_limit = 0)
 {
 auto const read_opts =
 cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}).build();
- auto reader = cudf::io::chunked_parquet_reader(byte_limit, read_opts);
+ auto reader = cudf::io::chunked_parquet_reader(output_limit, input_limit, read_opts);

 auto num_chunks = 0;
 auto out_tables = std::vector>{};
@@ -950,3 +952,65 @@ TEST_F(ParquetChunkedReaderTest, TestChunkedReadNullCount)
 EXPECT_EQ(reader.read_chunk().tbl->get_column(0).null_count(), page_limit_rows / 4);
 } while (reader.has_next());
 }
+
+TEST_F(ParquetChunkedReaderTest, InputLimitSimple)
+{
+ auto const filepath = temp_env->get_temp_filepath("input_limit_10_rowgroups.parquet");
+
+ // This results in 10 row groups, at 4001150 bytes per row group
+ constexpr int num_rows = 25'000'000;
+ auto value_iter = cudf::detail::make_counting_transform_iterator(0, [](int i) { return i; });
+ cudf::test::fixed_width_column_wrapper expected(value_iter, value_iter + num_rows);
+ cudf::io::parquet_writer_options opts =
+ cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath},
+                                           cudf::table_view{{expected}})
+ // note: it is unnecessary to force compression to NONE here because the size we are using in
+ // the row group is the uncompressed data size. But forcing the dictionary policy to
+ // dictionary_policy::NEVER is necessary to prevent changes in the
+ // decompressed-but-not-yet-decoded data.
+ .dictionary_policy(cudf::io::dictionary_policy::NEVER); + + cudf::io::write_parquet(opts); + + { + // no chunking + auto const [result, num_chunks] = chunked_read(filepath, 0, 0); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->get_column(0)); + } + + { + // 25 chunks of 100k rows each + auto const [result, num_chunks] = chunked_read(filepath, 0, 1); + EXPECT_EQ(num_chunks, 25); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->get_column(0)); + } + + { + // 25 chunks of 100k rows each + auto const [result, num_chunks] = chunked_read(filepath, 0, 4000000); + EXPECT_EQ(num_chunks, 25); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->get_column(0)); + } + + { + // 25 chunks of 100k rows each + auto const [result, num_chunks] = chunked_read(filepath, 0, 4100000); + EXPECT_EQ(num_chunks, 25); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->get_column(0)); + } + + { + // 12 chunks of 200k rows each, plus 1 final chunk of 100k rows. + auto const [result, num_chunks] = chunked_read(filepath, 0, 8002301); + EXPECT_EQ(num_chunks, 13); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->get_column(0)); + } + + { + // 1 big chunk + auto const [result, num_chunks] = chunked_read(filepath, 0, size_t{1} * 1024 * 1024 * 1024); + EXPECT_EQ(num_chunks, 1); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->get_column(0)); + } +} From 23d24d43fac8615166c38231f13fe8751a8aec42 Mon Sep 17 00:00:00 2001 From: Martin Marenz Date: Thu, 28 Sep 2023 19:08:55 +0200 Subject: [PATCH 22/29] Add `bytes_per_second` to distinct_count of stream_compaction nvbench. (#14172) This patch relates to #13735. Benchmark: [benchmark_distinct_count.txt](https://github.com/rapidsai/cudf/files/12700496/benchmark_distinct_count.txt) Authors: - Martin Marenz (https://github.com/Blonck) - Mark Harris (https://github.com/harrism) Approvers: - David Wendt (https://github.com/davidwendt) - Karthikeyan (https://github.com/karthikeyann) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cudf/pull/14172 --- cpp/benchmarks/stream_compaction/distinct_count.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cpp/benchmarks/stream_compaction/distinct_count.cpp b/cpp/benchmarks/stream_compaction/distinct_count.cpp index 2b2c901b90f..3e324013d4e 100644 --- a/cpp/benchmarks/stream_compaction/distinct_count.cpp +++ b/cpp/benchmarks/stream_compaction/distinct_count.cpp @@ -40,6 +40,14 @@ static void bench_distinct_count(nvbench::state& state, nvbench::type_list auto const& data_column = data_table->get_column(0); auto const input_table = cudf::table_view{{data_column, data_column, data_column}}; + // Collect memory statistics for input and output. + state.add_global_memory_reads(input_table.num_rows() * input_table.num_columns()); + state.add_global_memory_writes(1); + if (null_probability > 0) { + state.add_global_memory_reads( + input_table.num_columns() * cudf::bitmask_allocation_size_bytes(input_table.num_rows())); + } + auto mem_stats_logger = cudf::memory_stats_logger(); // init stats logger state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { From b2f00809f40e2e81b01214177b412456d40404cc Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 28 Sep 2023 12:16:29 -0500 Subject: [PATCH 23/29] Pin dask and distributed for 23.10 release (#14225) This PR pins `dask` and `distributed` to `2023.9.2` for `23.10` release. 
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Ray Douglass (https://github.com/raydouglass) - Peter Andreas Entschev (https://github.com/pentschev) --- ci/test_wheel_dask_cudf.sh | 2 +- conda/environments/all_cuda-118_arch-x86_64.yaml | 6 +++--- conda/environments/all_cuda-120_arch-x86_64.yaml | 6 +++--- conda/recipes/custreamz/meta.yaml | 6 +++--- conda/recipes/dask-cudf/meta.yaml | 12 ++++++------ conda/recipes/dask-cudf/run_test.sh | 4 ++-- dependencies.yaml | 6 +++--- python/dask_cudf/pyproject.toml | 4 ++-- 8 files changed, 23 insertions(+), 23 deletions(-) diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh index d6e7f4bf65e..0abee09ca8a 100755 --- a/ci/test_wheel_dask_cudf.sh +++ b/ci/test_wheel_dask_cudf.sh @@ -11,7 +11,7 @@ RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from python -m pip install --no-deps ./local-cudf-dep/cudf*.whl # Always install latest dask for testing -python -m pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.10 +python -m pip install git+https://github.com/dask/dask.git@2023.9.2 git+https://github.com/dask/distributed.git@2023.9.2 git+https://github.com/rapidsai/dask-cuda.git@branch-23.10 # echo to expand wildcard before adding `[extra]` requires for pip python -m pip install $(echo ./dist/dask_cudf*.whl)[test] diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 9fb991f9075..46b0b3799f2 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -25,10 +25,10 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-core>=2023.7.1 +- dask-core==2023.9.2 - dask-cuda==23.10.* -- dask>=2023.7.1 -- distributed>=2023.7.1 +- dask==2023.9.2 +- distributed==2023.9.2 - dlpack>=0.5,<0.6.0a0 - doxygen=1.9.1 - fastavro>=0.22.9 diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index 9ba0dd8dc38..0e137c91120 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -26,10 +26,10 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-core>=2023.7.1 +- dask-core==2023.9.2 - dask-cuda==23.10.* -- dask>=2023.7.1 -- distributed>=2023.7.1 +- dask==2023.9.2 +- distributed==2023.9.2 - dlpack>=0.5,<0.6.0a0 - doxygen=1.9.1 - fastavro>=0.22.9 diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml index 7aaa40bffd0..233d51baf31 100644 --- a/conda/recipes/custreamz/meta.yaml +++ b/conda/recipes/custreamz/meta.yaml @@ -45,9 +45,9 @@ requirements: - streamz - cudf ={{ version }} - cudf_kafka ={{ version }} - - dask >=2023.7.1 - - dask-core >=2023.7.1 - - distributed >=2023.7.1 + - dask ==2023.9.2 + - dask-core ==2023.9.2 + - distributed ==2023.9.2 - python-confluent-kafka >=1.9.0,<1.10.0a0 - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml index 12809ba648f..4c8af071074 100644 --- a/conda/recipes/dask-cudf/meta.yaml +++ b/conda/recipes/dask-cudf/meta.yaml @@ -38,16 +38,16 @@ requirements: host: - python - cudf ={{ version }} - - dask >=2023.7.1 - - dask-core >=2023.7.1 - - distributed >=2023.7.1 + - dask ==2023.9.2 + - dask-core ==2023.9.2 + - distributed ==2023.9.2 - cuda-version ={{ cuda_version }} run: 
- python - cudf ={{ version }} - - dask >=2023.7.1 - - dask-core >=2023.7.1 - - distributed >=2023.7.1 + - dask ==2023.9.2 + - dask-core ==2023.9.2 + - distributed ==2023.9.2 - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} test: diff --git a/conda/recipes/dask-cudf/run_test.sh b/conda/recipes/dask-cudf/run_test.sh index 7dc54747a0c..c79c014a89a 100644 --- a/conda/recipes/dask-cudf/run_test.sh +++ b/conda/recipes/dask-cudf/run_test.sh @@ -18,10 +18,10 @@ if [ "${ARCH}" = "aarch64" ]; then fi # Dask & Distributed option to install main(nightly) or `conda-forge` packages. -export INSTALL_DASK_MAIN=1 +export INSTALL_DASK_MAIN=0 # Dask version to install when `INSTALL_DASK_MAIN=0` -export DASK_STABLE_VERSION="2023.7.1" +export DASK_STABLE_VERSION="2023.9.2" # Install the conda-forge or nightly version of dask and distributed if [[ "${INSTALL_DASK_MAIN}" == 1 ]]; then diff --git a/dependencies.yaml b/dependencies.yaml index 5586f54348c..b21472df4fd 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -491,12 +491,12 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - dask>=2023.7.1 - - distributed>=2023.7.1 + - dask==2023.9.2 + - distributed==2023.9.2 - output_types: conda packages: - cupy>=12.0.0 - - dask-core>=2023.7.1 # dask-core in conda is the actual package & dask is the meta package + - dask-core==2023.9.2 # dask-core in conda is the actual package & dask is the meta package - output_types: pyproject packages: - &cudf cudf==23.10.* diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 922da366422..41b57b71749 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -20,8 +20,8 @@ requires-python = ">=3.9" dependencies = [ "cudf==23.10.*", "cupy-cuda11x>=12.0.0", - "dask>=2023.7.1", - "distributed>=2023.7.1", + "dask==2023.9.2", + "distributed==2023.9.2", "fsspec>=0.6.0", "numpy>=1.21,<1.25", "pandas>=1.3,<1.6.0dev0", From 59b09fd097e39bd15646eac1156889692974dc5f Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 29 Sep 2023 11:10:25 -0500 Subject: [PATCH 24/29] cuDF: Build CUDA 12.0 ARM conda packages. (#14112) This PR builds conda packages using CUDA 12 on ARM. This work is targeting 23.12 and depends on https://github.com/rapidsai/rmm/pull/1330. Closes #14128. 
Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cudf/pull/14112 --- .github/workflows/build.yaml | 16 ++++++++-------- .github/workflows/pr.yaml | 28 ++++++++++++++-------------- .github/workflows/test.yaml | 16 ++++++++-------- dependencies.yaml | 20 ++------------------ 4 files changed, 32 insertions(+), 48 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index ab028eb89cc..dc2c81d1c77 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@cuda-120-arm with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@cuda-120-arm with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@cuda-120-arm with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-120-arm with: arch: "amd64" branch: ${{ inputs.branch }} @@ -69,7 +69,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-cudf: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@cuda-120-arm with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -79,7 +79,7 @@ jobs: wheel-publish-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-publish.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-publish.yaml@cuda-120-arm with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -89,7 +89,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-publish-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@cuda-120-arm with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: ${{ inputs.build_type || 'branch' }} @@ -100,7 +100,7 @@ jobs: wheel-publish-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-publish.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-publish.yaml@cuda-120-arm with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 
214f9c90b41..047b80f2e5c 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -26,34 +26,34 @@ jobs: - wheel-build-dask-cudf - wheel-tests-dask-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@cuda-120-arm checks: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@cuda-120-arm with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@cuda-120-arm with: build_type: pull-request conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-120-arm with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@cuda-120-arm with: build_type: pull-request conda-python-cudf-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@cuda-120-arm with: build_type: pull-request test_script: "ci/test_python_cudf.sh" @@ -61,14 +61,14 @@ jobs: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@cuda-120-arm with: build_type: pull-request test_script: "ci/test_python_other.sh" conda-java-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-120-arm with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -78,7 +78,7 @@ jobs: conda-notebook-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-120-arm with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -88,7 +88,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-120-arm with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -98,21 +98,21 @@ jobs: wheel-build-cudf: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@cuda-120-arm with: build_type: pull-request script: "ci/build_wheel_cudf.sh" wheel-tests-cudf: needs: wheel-build-cudf secrets: inherit - uses: 
rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@cuda-120-arm with: build_type: pull-request script: ci/test_wheel_cudf.sh wheel-build-dask-cudf: needs: wheel-tests-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@cuda-120-arm with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: pull-request @@ -120,7 +120,7 @@ jobs: wheel-tests-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@cuda-120-arm with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: pull-request diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 9ca32bcfe03..e58227c30dc 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-120-arm with: build_type: nightly branch: ${{ inputs.branch }} @@ -24,7 +24,7 @@ jobs: sha: ${{ inputs.sha }} conda-cpp-memcheck-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-120-arm with: build_type: nightly branch: ${{ inputs.branch }} @@ -36,7 +36,7 @@ jobs: run_script: "ci/test_cpp_memcheck.sh" conda-python-cudf-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@cuda-120-arm with: build_type: nightly branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: conda-python-other-tests: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@cuda-120-arm with: build_type: nightly branch: ${{ inputs.branch }} @@ -55,7 +55,7 @@ jobs: test_script: "ci/test_python_other.sh" conda-java-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-120-arm with: build_type: nightly branch: ${{ inputs.branch }} @@ -67,7 +67,7 @@ jobs: run_script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-120-arm with: build_type: nightly branch: ${{ inputs.branch }} @@ -79,7 +79,7 @@ jobs: run_script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@cuda-120-arm with: build_type: nightly branch: 
${{ inputs.branch }} @@ -88,7 +88,7 @@ jobs: script: ci/test_wheel_cudf.sh wheel-tests-dask-cudf: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@cuda-120-arm with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: nightly diff --git a/dependencies.yaml b/dependencies.yaml index c8ee66bd99f..c19e8765be3 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -227,25 +227,9 @@ dependencies: # in sync with the version pinned in get_arrow.cmake. - libarrow==12.0.1.* - librdkafka>=1.9.0,<1.10.0a0 + # Align nvcomp version with rapids-cmake + - nvcomp==2.6.1 - spdlog>=1.11.0,<1.12 - specific: - - output_types: conda - matrices: - - matrix: - arch: x86_64 - packages: - # Align nvcomp version with rapids-cmake - # TODO: not yet available for aarch64 CUDA 12 - - &nvcomp nvcomp==2.6.1 - - matrix: - arch: aarch64 - cuda: "11.8" - packages: - - *nvcomp - # TODO: Fallback matrix for aarch64 CUDA 12. After migrating to nvcomp 3, - # all CUDA/arch combinations should be supported by existing packages. - - matrix: - packages: build_wheels: common: - output_types: pyproject From 66a655ce80e8b0accb80ea4e23799d23a82a35a2 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 3 Oct 2023 08:00:44 -0500 Subject: [PATCH 25/29] Fix inaccuracy in decimal128 rounding. (#14233) Fixes a bug where floating-point values were used in decimal128 rounding, giving wrong results. Closes https://github.com/rapidsai/cudf/issues/14210. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Divye Gala (https://github.com/divyegala) - Mark Harris (https://github.com/harrism) --- cpp/src/round/round.cu | 5 ++- cpp/tests/round/round_tests.cpp | 79 +++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+), 1 deletion(-) diff --git a/cpp/src/round/round.cu b/cpp/src/round/round.cu index 4b3f80fc6e2..41cce57d55b 100644 --- a/cpp/src/round/round.cu +++ b/cpp/src/round/round.cu @@ -271,7 +271,10 @@ std::unique_ptr round_with(column_view const& input, out_view.template end(), static_cast(0)); } else { - Type const n = std::pow(10, scale_movement); + Type n = 10; + for (int i = 1; i < scale_movement; ++i) { + n *= 10; + } thrust::transform(rmm::exec_policy(stream), input.begin(), input.end(), diff --git a/cpp/tests/round/round_tests.cpp b/cpp/tests/round/round_tests.cpp index d802c0c2706..f97bb7a5323 100644 --- a/cpp/tests/round/round_tests.cpp +++ b/cpp/tests/round/round_tests.cpp @@ -703,4 +703,83 @@ TEST_F(RoundTests, BoolTestHalfUp) EXPECT_THROW(cudf::round(input, -2, cudf::rounding_method::HALF_UP), cudf::logic_error); } +// Use __uint128_t for demonstration. 
+constexpr __uint128_t operator""_uint128_t(const char* s) +{ + __uint128_t ret = 0; + for (int i = 0; s[i] != '\0'; ++i) { + ret *= 10; + if ('0' <= s[i] && s[i] <= '9') { ret += s[i] - '0'; } + } + return ret; +} + +TEST_F(RoundTests, HalfEvenErrorsA) +{ + using namespace numeric; + using RepType = cudf::device_storage_type_t; + using fp_wrapper = cudf::test::fixed_point_column_wrapper; + + { + // 0.5 at scale -37 should round HALF_EVEN to 0, because 0 is an even number + auto const input = + fp_wrapper{{5000000000000000000000000000000000000_uint128_t}, scale_type{-37}}; + auto const expected = fp_wrapper{{0}, scale_type{0}}; + auto const result = cudf::round(input, 0, cudf::rounding_method::HALF_EVEN); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } +} + +TEST_F(RoundTests, HalfEvenErrorsB) +{ + using namespace numeric; + using RepType = cudf::device_storage_type_t; + using fp_wrapper = cudf::test::fixed_point_column_wrapper; + + { + // 0.125 at scale -37 should round HALF_EVEN to 0.12, because 2 is an even number + auto const input = + fp_wrapper{{1250000000000000000000000000000000000_uint128_t}, scale_type{-37}}; + auto const expected = fp_wrapper{{12}, scale_type{-2}}; + auto const result = cudf::round(input, 2, cudf::rounding_method::HALF_EVEN); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } +} + +TEST_F(RoundTests, HalfEvenErrorsC) +{ + using namespace numeric; + using RepType = cudf::device_storage_type_t; + using fp_wrapper = cudf::test::fixed_point_column_wrapper; + + { + // 0.0625 at scale -37 should round HALF_EVEN to 0.062, because 2 is an even number + auto const input = + fp_wrapper{{0625000000000000000000000000000000000_uint128_t}, scale_type{-37}}; + auto const expected = fp_wrapper{{62}, scale_type{-3}}; + auto const result = cudf::round(input, 3, cudf::rounding_method::HALF_EVEN); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } +} + +TEST_F(RoundTests, HalfUpErrorsA) +{ + using namespace numeric; + using RepType = cudf::device_storage_type_t; + using fp_wrapper = cudf::test::fixed_point_column_wrapper; + + { + // 0.25 at scale -37 should round HALF_UP to 0.3 + auto const input = + fp_wrapper{{2500000000000000000000000000000000000_uint128_t}, scale_type{-37}}; + auto const expected = fp_wrapper{{3}, scale_type{-1}}; + auto const result = cudf::round(input, 1, cudf::rounding_method::HALF_UP); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } +} + CUDF_TEST_PROGRAM_MAIN() From 3964950ba2fecf7f962917276058a6381d396246 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 3 Oct 2023 15:11:15 -0500 Subject: [PATCH 26/29] Fix inaccurate ceil/floor and inaccurate rescaling casts of fixed-point values. (#14242) This is a follow-up PR to #14233. This PR fixes a bug where floating-point values were used as intermediates in ceil/floor unary operations and cast operations that require rescaling for fixed-point types, giving inaccurate results. 
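As a standalone illustration (not part of this patch, and assuming IEEE-754
doubles and a compiler with __int128 support), the core problem is that most
large powers of ten are not exactly representable in a double, so a
pow()-based scale factor is already rounded before any fixed-point arithmetic
happens:

    #include <cmath>
    #include <cstdio>

    int main()
    {
      __int128 exact = 1;
      for (int i = 0; i < 23; ++i) { exact *= 10; }  // 10^23, exact in integer math
      // The same factor computed through a double, as the old code path did:
      auto approx = static_cast<__int128>(std::pow(10.0, 23));
      // On typical platforms this prints 8388608 (half a ulp of 1e23): the
      // scale factor is wrong before any rescaling division even runs.
      printf("%lld\n", static_cast<long long>(exact - approx));
      return 0;
    }

Replacing std::pow with a plain integer multiplication loop keeps every
intermediate value exact.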
See also: - https://github.com/rapidsai/cudf/pull/14233#discussion_r1340786769 - https://github.com/rapidsai/cudf/issues/14243 Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Mike Wilson (https://github.com/hyperbolic2346) - Vukasin Milovanovic (https://github.com/vuule) --- cpp/src/unary/cast_ops.cu | 8 +++++- cpp/src/unary/math_ops.cu | 8 ++++-- cpp/tests/unary/cast_tests.cpp | 40 ++++++++++++++++++++++++++++++ cpp/tests/unary/unary_ops_test.cpp | 33 ++++++++++++++++++++++++ 4 files changed, 86 insertions(+), 3 deletions(-) diff --git a/cpp/src/unary/cast_ops.cu b/cpp/src/unary/cast_ops.cu index f40ace8d10b..1c81f266200 100644 --- a/cpp/src/unary/cast_ops.cu +++ b/cpp/src/unary/cast_ops.cu @@ -199,7 +199,13 @@ std::unique_ptr rescale(column_view input, } return output_column; } - auto const scalar = make_fixed_point_scalar(std::pow(10, -diff), scale_type{diff}, stream); + + RepType scalar_value = 10; + for (int i = 1; i < -diff; ++i) { + scalar_value *= 10; + } + + auto const scalar = make_fixed_point_scalar(scalar_value, scale_type{diff}, stream); return detail::binary_operation(input, *scalar, binary_operator::DIV, type, stream, mr); } }; diff --git a/cpp/src/unary/math_ops.cu b/cpp/src/unary/math_ops.cu index 961f3a9e720..d0cae81a9c8 100644 --- a/cpp/src/unary/math_ops.cu +++ b/cpp/src/unary/math_ops.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -295,7 +295,11 @@ std::unique_ptr unary_op_with(column_view const& input, input.type(), input.size(), copy_bitmask(input, stream, mr), input.null_count(), stream, mr); auto out_view = result->mutable_view(); - Type const n = std::pow(10, -input.type().scale()); + + Type n = 10; + for (int i = 1; i < -input.type().scale(); ++i) { + n *= 10; + } thrust::transform(rmm::exec_policy(stream), input.begin(), diff --git a/cpp/tests/unary/cast_tests.cpp b/cpp/tests/unary/cast_tests.cpp index 9506e1918c0..d565359a4ea 100644 --- a/cpp/tests/unary/cast_tests.cpp +++ b/cpp/tests/unary/cast_tests.cpp @@ -30,6 +30,8 @@ #include #include +#include + #include #include @@ -967,6 +969,44 @@ TYPED_TEST(FixedPointTests, Decimal128ToDecimalXXWithLargerScale) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } +TYPED_TEST(FixedPointTests, ValidateCastRescalePrecision) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = cudf::device_storage_type_t; + using fp_wrapper = cudf::test::fixed_point_column_wrapper; + + // This test is designed to protect against floating point conversion + // introducing errors in fixed-point arithmetic. The rescaling that occurs + // during casting to different scales should only use fixed-precision math. + // Realistically, we are only able to show precision failures due to floating + // conversion in a few very specific circumstances where dividing by specific + // powers of 10 works against us. Some examples: 10^23, 10^25, 10^26, 10^27, + // 10^30, 10^32, 10^36. See https://godbolt.org/z/cP1MddP8P for a derivation. + // For completeness and to ensure that we are not missing any other cases, we + // test casting to/from all scales in the range of each decimal type. Values + // that are powers of ten show this error more readily than non-powers of 10 + // because the rescaling factor is a power of 10, meaning that errors in + // division are more visible. 
+ constexpr auto min_scale = -cuda::std::numeric_limits::digits10;
+ for (int input_scale = 0; input_scale >= min_scale; --input_scale) {
+ for (int result_scale = 0; result_scale >= min_scale; --result_scale) {
+ RepType input_value = 1;
+ for (int k = 0; k > input_scale; --k) {
+ input_value *= 10;
+ }
+ RepType result_value = 1;
+ for (int k = 0; k > result_scale; --k) {
+ result_value *= 10;
+ }
+ auto const input = fp_wrapper{{input_value}, scale_type{input_scale}};
+ auto const expected = fp_wrapper{{result_value}, scale_type{result_scale}};
+ auto const result = cudf::cast(input, make_fixed_point_data_type(result_scale));
+ CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+ }
+ }
+}
+
 TYPED_TEST(FixedPointTests, Decimal32ToDecimalXXWithLargerScaleAndNullMask)
 {
 using namespace numeric;
diff --git a/cpp/tests/unary/unary_ops_test.cpp b/cpp/tests/unary/unary_ops_test.cpp
index 49764f22373..76d1f769856 100644
--- a/cpp/tests/unary/unary_ops_test.cpp
+++ b/cpp/tests/unary/unary_ops_test.cpp
@@ -24,6 +24,8 @@

 #include

+#include
+
 template
 cudf::test::fixed_width_column_wrapper create_fixed_columns(cudf::size_type start,
                                                             cudf::size_type size,
@@ -372,4 +374,35 @@ TYPED_TEST(FixedPointUnaryTests, FixedPointUnaryFloorLarge)
 CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }

+TYPED_TEST(FixedPointUnaryTests, ValidateCeilFloorPrecision)
+{
+ using namespace numeric;
+ using decimalXX = TypeParam;
+ using RepType = cudf::device_storage_type_t;
+ using fp_wrapper = cudf::test::fixed_point_column_wrapper;
+
+ // This test is designed to protect against floating point conversion
+ // introducing errors in fixed-point arithmetic. The rounding that occurs
+ // during ceil/floor should only use fixed-precision math. Realistically,
+ // we are only able to show precision failures due to floating conversion in
+ // a few very specific circumstances where dividing by specific powers of 10
+ // works against us. Some examples: 10^23, 10^25, 10^26, 10^27, 10^30,
+ // 10^32, 10^36. See https://godbolt.org/z/cP1MddP8P for a derivation. For
+ // completeness and to ensure that we are not missing any other cases, we
+ // test all scales representable in the range of each decimal type.
+ constexpr auto min_scale = -cuda::std::numeric_limits::digits10;
+ for (int input_scale = 0; input_scale >= min_scale; --input_scale) {
+ RepType input_value = 1;
+ for (int k = 0; k > input_scale; --k) {
+ input_value *= 10;
+ }
+ auto const input = fp_wrapper{{input_value}, scale_type{input_scale}};
+ auto const expected = fp_wrapper{{input_value}, scale_type{input_scale}};
+ auto const ceil_result = cudf::unary_operation(input, cudf::unary_operator::CEIL);
+ CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, ceil_result->view());
+ auto const floor_result = cudf::unary_operation(input, cudf::unary_operator::FLOOR);
+ CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, floor_result->view());
+ }
+}
+
 CUDF_TEST_PROGRAM_MAIN()

From 29556a2514f4d274164a27a80539410da7e132d6 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic
Date: Tue, 3 Oct 2023 14:44:28 -0700
Subject: [PATCH 27/29] Remove the use of volatile in ORC (#14175)

`volatile` should not be required in our code, unless there are compiler or
synchronization issues. This PR removes the use in ORC reader and writer.
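A minimal sketch of the reasoning (not taken from this patch): in CUDA,
inter-thread visibility of shared memory is established by barriers such as
__syncthreads(), not by volatile, which only forces each access to go through
memory and provides no synchronization or ordering guarantees of its own:

    __global__ void broadcast_kernel(int* out)
    {
      __shared__ int value;  // no volatile needed
      if (threadIdx.x == 0) { value = 42; }
      __syncthreads();  // the barrier makes thread 0's store visible to the block
      out[threadIdx.x] = value;
    }

Because the decode kernels below already synchronize their shared state with
barriers in this way, dropping the qualifier should not change behavior, while
allowing the compiler to keep values in registers.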
Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14175 --- cpp/src/io/orc/stats_enc.cu | 4 +- cpp/src/io/orc/stripe_data.cu | 82 +++++++++++++++-------------------- cpp/src/io/orc/stripe_enc.cu | 14 +++--- cpp/src/io/orc/stripe_init.cu | 2 +- 4 files changed, 46 insertions(+), 56 deletions(-) diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu index 95f1db5bfd1..479a2dfada3 100644 --- a/cpp/src/io/orc/stats_enc.cu +++ b/cpp/src/io/orc/stats_enc.cu @@ -76,8 +76,8 @@ __global__ void __launch_bounds__(block_size, 1) { using block_scan = cub::BlockScan; __shared__ typename block_scan::TempStorage temp_storage; - volatile uint32_t stats_size = 0; - auto t = threadIdx.x; + uint32_t stats_size = 0; + auto t = threadIdx.x; __syncthreads(); for (thread_index_type start = 0; start < statistics_count; start += block_size) { uint32_t stats_len = 0, stats_pos; diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 3edcd3d83b2..0b249bbdafe 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -142,9 +142,7 @@ struct orcdec_state_s { * @param[in] base Pointer to raw byte stream data * @param[in] len Stream length in bytes */ -static __device__ void bytestream_init(volatile orc_bytestream_s* bs, - uint8_t const* base, - uint32_t len) +static __device__ void bytestream_init(orc_bytestream_s* bs, uint8_t const* base, uint32_t len) { uint32_t pos = (len > 0) ? static_cast(7 & reinterpret_cast(base)) : 0; bs->base = base - pos; @@ -160,8 +158,7 @@ static __device__ void bytestream_init(volatile orc_bytestream_s* bs, * @param[in] bs Byte stream input * @param[in] bytes_consumed Number of bytes that were consumed */ -static __device__ void bytestream_flush_bytes(volatile orc_bytestream_s* bs, - uint32_t bytes_consumed) +static __device__ void bytestream_flush_bytes(orc_bytestream_s* bs, uint32_t bytes_consumed) { uint32_t pos = bs->pos; uint32_t len = bs->len; @@ -197,7 +194,7 @@ static __device__ void bytestream_fill(orc_bytestream_s* bs, int t) * @param[in] pos Position in byte stream * @return byte */ -inline __device__ uint8_t bytestream_readbyte(volatile orc_bytestream_s* bs, int pos) +inline __device__ uint8_t bytestream_readbyte(orc_bytestream_s* bs, int pos) { return bs->buf.u8[pos & (bytestream_buffer_size - 1)]; } @@ -209,7 +206,7 @@ inline __device__ uint8_t bytestream_readbyte(volatile orc_bytestream_s* bs, int * @param[in] pos Position in byte stream * @result bits */ -inline __device__ uint32_t bytestream_readu32(volatile orc_bytestream_s* bs, int pos) +inline __device__ uint32_t bytestream_readu32(orc_bytestream_s* bs, int pos) { uint32_t a = bs->buf.u32[(pos & (bytestream_buffer_size - 1)) >> 2]; uint32_t b = bs->buf.u32[((pos + 4) & (bytestream_buffer_size - 1)) >> 2]; @@ -224,7 +221,7 @@ inline __device__ uint32_t bytestream_readu32(volatile orc_bytestream_s* bs, int * @param[in] numbits number of bits * @return bits */ -inline __device__ uint64_t bytestream_readu64(volatile orc_bytestream_s* bs, int pos) +inline __device__ uint64_t bytestream_readu64(orc_bytestream_s* bs, int pos) { uint32_t a = bs->buf.u32[(pos & (bytestream_buffer_size - 1)) >> 2]; uint32_t b = bs->buf.u32[((pos + 4) & (bytestream_buffer_size - 1)) >> 2]; @@ -245,9 +242,7 @@ inline __device__ uint64_t bytestream_readu64(volatile orc_bytestream_s* bs, int * @param[in] numbits number of bits * @return 
decoded value */ -inline __device__ uint32_t bytestream_readbits(volatile orc_bytestream_s* bs, - int bitpos, - uint32_t numbits) +inline __device__ uint32_t bytestream_readbits(orc_bytestream_s* bs, int bitpos, uint32_t numbits) { int idx = bitpos >> 5; uint32_t a = __byte_perm(bs->buf.u32[(idx + 0) & bytestream_buffer_mask], 0, 0x0123); @@ -263,9 +258,7 @@ inline __device__ uint32_t bytestream_readbits(volatile orc_bytestream_s* bs, * @param[in] numbits number of bits * @return decoded value */ -inline __device__ uint64_t bytestream_readbits64(volatile orc_bytestream_s* bs, - int bitpos, - uint32_t numbits) +inline __device__ uint64_t bytestream_readbits64(orc_bytestream_s* bs, int bitpos, uint32_t numbits) { int idx = bitpos >> 5; uint32_t a = __byte_perm(bs->buf.u32[(idx + 0) & bytestream_buffer_mask], 0, 0x0123); @@ -288,7 +281,7 @@ inline __device__ uint64_t bytestream_readbits64(volatile orc_bytestream_s* bs, * @param[in] numbits number of bits * @param[out] result decoded value */ -inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, +inline __device__ void bytestream_readbe(orc_bytestream_s* bs, int bitpos, uint32_t numbits, uint32_t& result) @@ -304,7 +297,7 @@ inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, * @param[in] numbits number of bits * @param[out] result decoded value */ -inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, +inline __device__ void bytestream_readbe(orc_bytestream_s* bs, int bitpos, uint32_t numbits, int32_t& result) @@ -321,7 +314,7 @@ inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, * @param[in] numbits number of bits * @param[out] result decoded value */ -inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, +inline __device__ void bytestream_readbe(orc_bytestream_s* bs, int bitpos, uint32_t numbits, uint64_t& result) @@ -337,7 +330,7 @@ inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, * @param[in] numbits number of bits * @param[out] result decoded value */ -inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, +inline __device__ void bytestream_readbe(orc_bytestream_s* bs, int bitpos, uint32_t numbits, int64_t& result) @@ -354,7 +347,7 @@ inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, * @return length of varint in bytes */ template -inline __device__ uint32_t varint_length(volatile orc_bytestream_s* bs, int pos) +inline __device__ uint32_t varint_length(orc_bytestream_s* bs, int pos) { if (bytestream_readbyte(bs, pos) > 0x7f) { uint32_t next32 = bytestream_readu32(bs, pos + 1); @@ -392,7 +385,7 @@ inline __device__ uint32_t varint_length(volatile orc_bytestream_s* bs, int pos) * @return new position in byte stream buffer */ template -inline __device__ int decode_base128_varint(volatile orc_bytestream_s* bs, int pos, T& result) +inline __device__ int decode_base128_varint(orc_bytestream_s* bs, int pos, T& result) { uint32_t v = bytestream_readbyte(bs, pos++); if (v > 0x7f) { @@ -446,7 +439,7 @@ inline __device__ int decode_base128_varint(volatile orc_bytestream_s* bs, int p /** * @brief Decodes a signed int128 encoded as base-128 varint (used for decimals) */ -inline __device__ __int128_t decode_varint128(volatile orc_bytestream_s* bs, int pos) +inline __device__ __int128_t decode_varint128(orc_bytestream_s* bs, int pos) { auto byte = bytestream_readbyte(bs, pos++); __int128_t const sign_mask = -(int32_t)(byte & 1); @@ -463,7 +456,7 @@ inline __device__ __int128_t 
decode_varint128(volatile orc_bytestream_s* bs, int /** * @brief Decodes an unsigned 32-bit varint */ -inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, uint32_t& result) +inline __device__ int decode_varint(orc_bytestream_s* bs, int pos, uint32_t& result) { uint32_t u; pos = decode_base128_varint(bs, pos, u); @@ -474,7 +467,7 @@ inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, uint /** * @brief Decodes an unsigned 64-bit varint */ -inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, uint64_t& result) +inline __device__ int decode_varint(orc_bytestream_s* bs, int pos, uint64_t& result) { uint64_t u; pos = decode_base128_varint(bs, pos, u); @@ -485,7 +478,7 @@ inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, uint /** * @brief Signed version of 32-bit decode_varint */ -inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, int32_t& result) +inline __device__ int decode_varint(orc_bytestream_s* bs, int pos, int32_t& result) { uint32_t u; pos = decode_base128_varint(bs, pos, u); @@ -496,7 +489,7 @@ inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, int3 /** * @brief Signed version of 64-bit decode_varint */ -inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, int64_t& result) +inline __device__ int decode_varint(orc_bytestream_s* bs, int pos, int64_t& result) { uint64_t u; pos = decode_base128_varint(bs, pos, u); @@ -514,7 +507,7 @@ inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, int6 * @return number of values decoded */ template -inline __device__ void lengths_to_positions(volatile T* vals, uint32_t numvals, unsigned int t) +inline __device__ void lengths_to_positions(T* vals, uint32_t numvals, unsigned int t) { for (uint32_t n = 1; n < numvals; n <<= 1) { __syncthreads(); @@ -534,8 +527,8 @@ inline __device__ void lengths_to_positions(volatile T* vals, uint32_t numvals, * @return number of values decoded */ template -static __device__ uint32_t Integer_RLEv1( - orc_bytestream_s* bs, volatile orc_rlev1_state_s* rle, volatile T* vals, uint32_t maxvals, int t) +static __device__ uint32_t +Integer_RLEv1(orc_bytestream_s* bs, orc_rlev1_state_s* rle, T* vals, uint32_t maxvals, int t) { uint32_t numvals, numruns; if (t == 0) { @@ -642,8 +635,8 @@ static const __device__ __constant__ uint8_t ClosestFixedBitsMap[65] = { */ template static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs, - volatile orc_rlev2_state_s* rle, - volatile T* vals, + orc_rlev2_state_s* rle, + T* vals, uint32_t maxvals, int t, bool has_buffered_values = false) @@ -883,7 +876,7 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs, * * @return 32-bit value */ -inline __device__ uint32_t rle8_read_bool32(volatile uint32_t* vals, uint32_t bitpos) +inline __device__ uint32_t rle8_read_bool32(uint32_t* vals, uint32_t bitpos) { uint32_t a = vals[(bitpos >> 5) + 0]; uint32_t b = vals[(bitpos >> 5) + 1]; @@ -903,11 +896,8 @@ inline __device__ uint32_t rle8_read_bool32(volatile uint32_t* vals, uint32_t bi * * @return number of values decoded */ -static __device__ uint32_t Byte_RLE(orc_bytestream_s* bs, - volatile orc_byterle_state_s* rle, - volatile uint8_t* vals, - uint32_t maxvals, - int t) +static __device__ uint32_t +Byte_RLE(orc_bytestream_s* bs, orc_byterle_state_s* rle, uint8_t* vals, uint32_t maxvals, int t) { uint32_t numvals, numruns; int r, tr; @@ -1006,8 +996,8 @@ static const __device__ __constant__ 
int64_t kPow5i[28] = {1, * @return number of values decoded */ static __device__ int Decode_Decimals(orc_bytestream_s* bs, - volatile orc_byterle_state_s* scratch, - volatile orcdec_state_s::values& vals, + orc_byterle_state_s* scratch, + orcdec_state_s::values& vals, int val_scale, int numvals, type_id dtype_id, @@ -1241,8 +1231,8 @@ __global__ void __launch_bounds__(block_size) } __syncthreads(); while (s->top.dict.dict_len > 0) { - uint32_t numvals = min(s->top.dict.dict_len, blockDim.x), len; - volatile uint32_t* vals = s->vals.u32; + uint32_t numvals = min(s->top.dict.dict_len, blockDim.x), len; + uint32_t* vals = s->vals.u32; bytestream_fill(&s->bs, t); __syncthreads(); if (is_rlev1(s->chunk.encoding_kind)) { @@ -1310,12 +1300,12 @@ static __device__ void DecodeRowPositions(orcdec_state_s* s, min((row_decoder_buffer_size - s->u.rowdec.nz_count) * 2, blockDim.x)); if (s->chunk.valid_map_base != nullptr) { // We have a present stream - uint32_t rmax = s->top.data.end_row - min((uint32_t)first_row, s->top.data.end_row); - auto r = (uint32_t)(s->top.data.cur_row + s->top.data.nrows + t - first_row); - uint32_t valid = (t < nrows && r < rmax) - ? (((uint8_t const*)s->chunk.valid_map_base)[r >> 3] >> (r & 7)) & 1 - : 0; - volatile auto* row_ofs_plus1 = (volatile uint16_t*)&s->u.rowdec.row[s->u.rowdec.nz_count]; + uint32_t rmax = s->top.data.end_row - min((uint32_t)first_row, s->top.data.end_row); + auto r = (uint32_t)(s->top.data.cur_row + s->top.data.nrows + t - first_row); + uint32_t valid = (t < nrows && r < rmax) + ? (((uint8_t const*)s->chunk.valid_map_base)[r >> 3] >> (r & 7)) & 1 + : 0; + auto* row_ofs_plus1 = (uint16_t*)&s->u.rowdec.row[s->u.rowdec.nz_count]; uint32_t nz_pos, row_plus1, nz_count = s->u.rowdec.nz_count, last_row; if (t < nrows) { row_ofs_plus1[t] = valid; } lengths_to_positions(row_ofs_plus1, nrows, t); diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 73c41e2bbcd..4841fb1141a 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -53,7 +53,7 @@ constexpr bool zero_pll_war = true; struct byterle_enc_state_s { uint32_t literal_run; uint32_t repeat_run; - volatile uint32_t rpt_map[(512 / 32) + 1]; + uint32_t rpt_map[(512 / 32) + 1]; }; struct intrle_enc_state_s { @@ -63,7 +63,7 @@ struct intrle_enc_state_s { uint32_t literal_w; uint32_t hdr_bytes; uint32_t pl_bytes; - volatile uint32_t delta_map[(512 / 32) + 1]; + uint32_t delta_map[(512 / 32) + 1]; }; struct strdata_enc_state_s { @@ -366,7 +366,7 @@ static __device__ uint32_t IntegerRLE( using block_reduce = cub::BlockReduce; uint8_t* dst = s->stream.data_ptrs[cid] + s->strm_pos[cid]; uint32_t out_cnt = 0; - __shared__ volatile uint64_t block_vmin; + __shared__ uint64_t block_vmin; while (numvals > 0) { T v0 = (t < numvals) ? 
inbuf[(inpos + t) & inmask] : 0; @@ -615,7 +615,7 @@ static __device__ void StoreStringData(uint8_t* dst, * @param[in] t thread id */ template -inline __device__ void lengths_to_positions(volatile T* vals, uint32_t numvals, unsigned int t) +inline __device__ void lengths_to_positions(T* vals, uint32_t numvals, unsigned int t) { for (uint32_t n = 1; n < numvals; n <<= 1) { __syncthreads(); @@ -1143,7 +1143,7 @@ __global__ void __launch_bounds__(256) uint32_t comp_block_align) { __shared__ __align__(16) StripeStream ss; - __shared__ uint8_t* volatile uncomp_base_g; + __shared__ uint8_t* uncomp_base_g; auto const padded_block_header_size = util::round_up_unsafe(block_header_size, comp_block_align); auto const padded_comp_block_size = util::round_up_unsafe(max_comp_blk_size, comp_block_align); @@ -1196,8 +1196,8 @@ __global__ void __launch_bounds__(1024) uint32_t max_comp_blk_size) { __shared__ __align__(16) StripeStream ss; - __shared__ uint8_t const* volatile comp_src_g; - __shared__ uint32_t volatile comp_len_g; + __shared__ uint8_t const* comp_src_g; + __shared__ uint32_t comp_len_g; auto const stripe_id = blockIdx.x; auto const stream_id = blockIdx.y; diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu index 8eeca504121..b31a4a081d1 100644 --- a/cpp/src/io/orc/stripe_init.cu +++ b/cpp/src/io/orc/stripe_init.cu @@ -499,7 +499,7 @@ __global__ void __launch_bounds__(128, 8) gpuParseRowGroupIndex(RowGroup* row_gr : row_groups[(s->rowgroup_start + i) * num_columns + blockIdx.x].start_row; for (int j = t4; j < rowgroup_size4; j += 4) { ((uint32_t*)&row_groups[(s->rowgroup_start + i) * num_columns + blockIdx.x])[j] = - ((volatile uint32_t*)&s->rowgroups[i])[j]; + ((uint32_t*)&s->rowgroups[i])[j]; } row_groups[(s->rowgroup_start + i) * num_columns + blockIdx.x].num_rows = num_rows; // Updating in case of struct From d87e181daa67d8fb1a029fc2c09e2f561d1e7234 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 4 Oct 2023 13:25:56 -0700 Subject: [PATCH 28/29] Expose streams in binaryop APIs (#14187) Contributes to #925 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Nghia Truong (https://github.com/ttnghia) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/14187 --- cpp/include/cudf/binaryop.hpp | 8 ++ cpp/src/binaryop/binaryop.cpp | 12 ++- cpp/src/binaryop/compiled/binary_ops.cu | 6 +- cpp/tests/CMakeLists.txt | 1 + cpp/tests/streams/binaryop_test.cpp | 126 ++++++++++++++++++++++++ 5 files changed, 147 insertions(+), 6 deletions(-) create mode 100644 cpp/tests/streams/binaryop_test.cpp diff --git a/cpp/include/cudf/binaryop.hpp b/cpp/include/cudf/binaryop.hpp index 77d6a4d1e89..9df4b4eb00f 100644 --- a/cpp/include/cudf/binaryop.hpp +++ b/cpp/include/cudf/binaryop.hpp @@ -102,6 +102,7 @@ enum class binary_operator : int32_t { * @param rhs The right operand column * @param op The binary operator * @param output_type The desired data type of the output column + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return Output column of `output_type` type containing the result of * the binary operation @@ -115,6 +116,7 @@ std::unique_ptr binary_operation( column_view const& rhs, binary_operator op, data_type output_type, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -131,6 +133,7 @@ 
std::unique_ptr binary_operation( * @param rhs The right operand scalar * @param op The binary operator * @param output_type The desired data type of the output column + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return Output column of `output_type` type containing the result of * the binary operation @@ -144,6 +147,7 @@ std::unique_ptr binary_operation( scalar const& rhs, binary_operator op, data_type output_type, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -158,6 +162,7 @@ std::unique_ptr binary_operation( * @param rhs The right operand column * @param op The binary operator * @param output_type The desired data type of the output column + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return Output column of `output_type` type containing the result of * the binary operation @@ -172,6 +177,7 @@ std::unique_ptr binary_operation( column_view const& rhs, binary_operator op, data_type output_type, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -189,6 +195,7 @@ std::unique_ptr binary_operation( * @param output_type The desired data type of the output column. It is assumed * that output_type is compatible with the output data type * of the function in the PTX code + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return Output column of `output_type` type containing the result of * the binary operation @@ -201,6 +208,7 @@ std::unique_ptr binary_operation( column_view const& rhs, std::string const& ptx, data_type output_type, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp index ef07de8c461..6b413ab2be4 100644 --- a/cpp/src/binaryop/binaryop.cpp +++ b/cpp/src/binaryop/binaryop.cpp @@ -405,38 +405,42 @@ std::unique_ptr binary_operation(scalar const& lhs, column_view const& rhs, binary_operator op, data_type output_type, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::binary_operation(lhs, rhs, op, output_type, cudf::get_default_stream(), mr); + return detail::binary_operation(lhs, rhs, op, output_type, stream, mr); } std::unique_ptr binary_operation(column_view const& lhs, scalar const& rhs, binary_operator op, data_type output_type, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::binary_operation(lhs, rhs, op, output_type, cudf::get_default_stream(), mr); + return detail::binary_operation(lhs, rhs, op, output_type, stream, mr); } std::unique_ptr binary_operation(column_view const& lhs, column_view const& rhs, binary_operator op, data_type output_type, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::binary_operation(lhs, rhs, op, output_type, cudf::get_default_stream(), mr); + return detail::binary_operation(lhs, rhs, op, output_type, stream, mr); } std::unique_ptr 
binary_operation(column_view const& lhs,
                 column_view const& rhs,
                 std::string const& ptx,
                 data_type output_type,
+                rmm::cuda_stream_view stream,
                 rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::binary_operation(lhs, rhs, ptx, output_type, cudf::get_default_stream(), mr);
+  return detail::binary_operation(lhs, rhs, ptx, output_type, stream, mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/src/binaryop/compiled/binary_ops.cu b/cpp/src/binaryop/compiled/binary_ops.cu
index 1f7f342632a..85ab5c6d6cb 100644
--- a/cpp/src/binaryop/compiled/binary_ops.cu
+++ b/cpp/src/binaryop/compiled/binary_ops.cu
@@ -47,14 +47,16 @@ namespace {
 struct scalar_as_column_view {
   using return_type = typename std::pair<column_view, std::unique_ptr<column>>;
   template <typename T, CUDF_ENABLE_IF(is_fixed_width<T>())>
-  return_type operator()(scalar const& s, rmm::cuda_stream_view, rmm::mr::device_memory_resource*)
+  return_type operator()(scalar const& s,
+                         rmm::cuda_stream_view stream,
+                         rmm::mr::device_memory_resource*)
   {
     auto& h_scalar_type_view = static_cast<cudf::scalar_type_t<T>&>(const_cast<scalar&>(s));
     auto col_v               = column_view(s.type(),
                              1,
                              h_scalar_type_view.data(),
                              reinterpret_cast<bitmask_type const*>(s.validity_data()),
-                             !s.is_valid());
+                             !s.is_valid(stream));
     return std::pair{col_v, std::unique_ptr<column>(nullptr)};
   }
   template <typename T, CUDF_ENABLE_IF(!is_fixed_width<T>())>
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 04939f3cd6d..ac13c121530 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -622,6 +622,7 @@ ConfigureTest(
   STREAM_IDENTIFICATION_TEST identify_stream_usage/test_default_stream_identification.cu
 )
 
+ConfigureTest(STREAM_BINARYOP_TEST streams/binaryop_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_CONCATENATE_TEST streams/concatenate_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_COPYING_TEST streams/copying_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing)
diff --git a/cpp/tests/streams/binaryop_test.cpp b/cpp/tests/streams/binaryop_test.cpp
new file mode 100644
index 00000000000..2520aed0458
--- /dev/null
+++ b/cpp/tests/streams/binaryop_test.cpp
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <tests/binaryop/util/runtime_support.h>
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/default_stream.hpp>
+
+#include <cudf/binaryop.hpp>
+#include <cudf/scalar/scalar.hpp>
+#include <cudf/types.hpp>
+
+class BinaryopTest : public cudf::test::BaseFixture {};
+
+TEST_F(BinaryopTest, ColumnColumn)
+{
+  cudf::test::fixed_width_column_wrapper<int32_t> lhs{10, 20, 30, 40, 50};
+  cudf::test::fixed_width_column_wrapper<int32_t> rhs{15, 25, 35, 45, 55};
+
+  cudf::binary_operation(lhs,
+                         rhs,
+                         cudf::binary_operator::ADD,
+                         cudf::data_type(cudf::type_to_id<int32_t>()),
+                         cudf::test::get_default_stream());
+}
+
+TEST_F(BinaryopTest, ColumnScalar)
+{
+  cudf::test::fixed_width_column_wrapper<int32_t> lhs{10, 20, 30, 40, 50};
+  cudf::numeric_scalar<int32_t> rhs{23, true, cudf::test::get_default_stream()};
+
+  cudf::binary_operation(lhs,
+                         rhs,
+                         cudf::binary_operator::ADD,
+                         cudf::data_type(cudf::type_to_id<int32_t>()),
+                         cudf::test::get_default_stream());
+}
+
+TEST_F(BinaryopTest, ScalarColumn)
+{
+  cudf::numeric_scalar<int32_t> lhs{42, true, cudf::test::get_default_stream()};
+  cudf::test::fixed_width_column_wrapper<int32_t> rhs{15, 25, 35, 45, 55};
+
+  cudf::binary_operation(lhs,
+                         rhs,
+                         cudf::binary_operator::ADD,
+                         cudf::data_type(cudf::type_to_id<int32_t>()),
+                         cudf::test::get_default_stream());
+}
+
+class BinaryopPTXTest : public BinaryopTest {
+ protected:
+  void SetUp() override
+  {
+    if (!can_do_runtime_jit()) { GTEST_SKIP() << "Skipping tests that require 11.5 runtime"; }
+  }
+};
+
+TEST_F(BinaryopPTXTest, ColumnColumnPTX)
+{
+  cudf::test::fixed_width_column_wrapper<int32_t> lhs{10, 20, 30, 40, 50};
+  cudf::test::fixed_width_column_wrapper<int64_t> rhs{15, 25, 35, 45, 55};
+
+  // c = a*a*a + b*b
+  char const* ptx =
+    R"***(
+//
+// Generated by NVIDIA NVVM Compiler
+//
+// Compiler Build ID: CL-24817639
+// Cuda compilation tools, release 10.0, V10.0.130
+// Based on LLVM 3.4svn
+//
+
+.version 6.3
+.target sm_70
+.address_size 64
+
+  // .globl _ZN8__main__7add$241Eix
+.common .global .align 8 .u64 _ZN08NumbaEnv8__main__7add$241Eix;
+.common .global .align 8 .u64 _ZN08NumbaEnv5numba7targets7numbers14int_power_impl12$3clocals$3e13int_power$242Exx;
+
+.visible .func  (.param .b32 func_retval0) _ZN8__main__7add$241Eix(
+  .param .b64 _ZN8__main__7add$241Eix_param_0,
+  .param .b32 _ZN8__main__7add$241Eix_param_1,
+  .param .b64 _ZN8__main__7add$241Eix_param_2
+)
+{
+  .reg .b32 %r<3>;
+  .reg .b64 %rd<8>;
+
+
+  ld.param.u64 %rd1, [_ZN8__main__7add$241Eix_param_0];
+  ld.param.u32 %r1, [_ZN8__main__7add$241Eix_param_1];
+  ld.param.u64 %rd2, [_ZN8__main__7add$241Eix_param_2];
+  cvt.s64.s32 %rd3, %r1;
+  mul.wide.s32 %rd4, %r1, %r1;
+  mul.lo.s64 %rd5, %rd4, %rd3;
+  mul.lo.s64 %rd6, %rd2, %rd2;
+  add.s64 %rd7, %rd6, %rd5;
+  st.u64 [%rd1], %rd7;
+  mov.u32 %r2, 0;
+  st.param.b32 [func_retval0+0], %r2;
+  ret;
+}
+
+)***";
+
+  cudf::binary_operation(
+    lhs, rhs, ptx, cudf::data_type(cudf::type_to_id<int64_t>()), cudf::test::get_default_stream());
+  cudf::binary_operation(lhs, rhs, ptx, cudf::data_type(cudf::type_to_id<int64_t>()));
+}

From b120f7e73e882b4eaa6b5a2cb91aeed20bf1198d Mon Sep 17 00:00:00 2001
From: Yunsong Wang
Date: Wed, 4 Oct 2023 14:23:24 -0700
Subject: [PATCH 29/29] Improve `contains_column` by invoking `contains_table`
 (#14238)

Part of https://github.com/rapidsai/cudf/issues/12261

This PR simplifies the `contains_column` implementation by invoking
`contains_table`, removing the use of the cudf `unordered_multiset`. It also
removes the `unordered_multiset` header file from libcudf.
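
The forwarding itself is small. A minimal sketch of the idea, assuming a
`table_view`-based `detail::contains` overload that returns one `bool` per
needles row; the helper name `contains_via_table` and the
`null_equality`/`nan_equality` arguments are illustrative assumptions, not the
exact code in this diff:

```cpp
#include <cudf/column/column.hpp>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/search.hpp>
#include <cudf/table/table_view.hpp>

#include <rmm/cuda_stream_view.hpp>

namespace cudf::detail {
// Sketch: wrap each column in a single-column table_view and reuse the
// table-based search path instead of a bespoke hash multiset.
std::unique_ptr<column> contains_via_table(column_view const& haystack,
                                           column_view const& needles,
                                           rmm::cuda_stream_view stream,
                                           rmm::mr::device_memory_resource* mr)
{
  auto result_v = detail::contains(table_view{{haystack}},
                                   table_view{{needles}},
                                   null_equality::EQUAL,
                                   nan_equality::ALL_EQUAL,
                                   stream,
                                   mr);
  // Rebuild a BOOL8 column, carrying over the needles' null mask.
  return std::make_unique<column>(
    std::move(result_v), detail::copy_bitmask(needles, stream, mr), needles.null_count());
}
}  // namespace cudf::detail
```

One column-wise hash-table implementation fewer to maintain, and nested types
fall out of the same code path as fixed-width and string types.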
Authors:
  - Yunsong Wang (https://github.com/PointKernel)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/14238
---
 cpp/src/hash/unordered_multiset.cuh | 159 ----------------------------
 cpp/src/search/contains_column.cu   |  67 +-----------
 2 files changed, 1 insertion(+), 225 deletions(-)
 delete mode 100644 cpp/src/hash/unordered_multiset.cuh

diff --git a/cpp/src/hash/unordered_multiset.cuh b/cpp/src/hash/unordered_multiset.cuh
deleted file mode 100644
index 183042fc0f4..00000000000
--- a/cpp/src/hash/unordered_multiset.cuh
+++ /dev/null
@@ -1,159 +0,0 @@
-/*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <cudf/column/column_device_view.cuh>
-#include <cudf/column/column_view.hpp>
-#include <cudf/detail/utilities/vector_factories.hpp>
-#include <cudf/hashing/detail/default_hash.cuh>
-#include <cudf/hashing/detail/helper_functions.cuh>
-
-#include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_uvector.hpp>
-#include <rmm/exec_policy.hpp>
-
-#include <thrust/copy.h>
-#include <thrust/for_each.h>
-#include <thrust/iterator/counting_iterator.h>
-#include <thrust/scan.h>
-
-#include <cuda/atomic>
-
-namespace cudf {
-namespace detail {
-/*
- * Device view of the unordered multiset
- */
-template <typename Element,
-          typename Hasher   = default_hash<Element>,
-          typename Equality = equal_to<Element>>
-class unordered_multiset_device_view {
- public:
-  unordered_multiset_device_view(size_type hash_size,
-                                 size_type const* hash_begin,
-                                 Element const* hash_data)
-    : hash_size{hash_size}, hash_begin{hash_begin}, hash_data{hash_data}, hasher(), equals()
-  {
-  }
-
-  bool __device__ contains(Element e) const
-  {
-    size_type loc = hasher(e) % (2 * hash_size);
-
-    for (size_type i = hash_begin[loc]; i < hash_begin[loc + 1]; ++i) {
-      if (equals(hash_data[i], e)) return true;
-    }
-
-    return false;
-  }
-
- private:
-  Hasher hasher;
-  Equality equals;
-  size_type hash_size;
-  size_type const* hash_begin;
-  Element const* hash_data;
-};
-
-/*
- * Fixed size set on a device.
- */
-template <typename Element,
-          typename Hasher   = default_hash<Element>,
-          typename Equality = equal_to<Element>>
-class unordered_multiset {
- public:
-  /**
-   * @brief Factory to construct a new unordered_multiset
-   */
-  static unordered_multiset create(column_view const& col, rmm::cuda_stream_view stream)
-  {
-    auto d_column = column_device_view::create(col, stream);
-    auto d_col    = *d_column;
-
-    auto hash_bins_start = cudf::detail::make_zeroed_device_uvector_async<size_type>(
-      2 * d_col.size() + 1, stream, rmm::mr::get_current_device_resource());
-    auto hash_bins_end = cudf::detail::make_zeroed_device_uvector_async<size_type>(
-      2 * d_col.size() + 1, stream, rmm::mr::get_current_device_resource());
-    auto hash_data = rmm::device_uvector<Element>(d_col.size(), stream);
-
-    Hasher hasher;
-    size_type* d_hash_bins_start = hash_bins_start.data();
-    size_type* d_hash_bins_end   = hash_bins_end.data();
-    Element* d_hash_data         = hash_data.data();
-
-    thrust::for_each(
-      rmm::exec_policy(stream),
-      thrust::make_counting_iterator(0),
-      thrust::make_counting_iterator(col.size()),
-      [d_hash_bins_start, d_col, hasher] __device__(size_t idx) {
-        if (!d_col.is_null(idx)) {
-          Element e     = d_col.element<Element>(idx);
-          size_type tmp = hasher(e) % (2 * d_col.size());
-          cuda::atomic_ref<size_type, cuda::thread_scope_device> ref{*(d_hash_bins_start + tmp)};
-          ref.fetch_add(1, cuda::std::memory_order_relaxed);
-        }
-      });
-
-    thrust::exclusive_scan(rmm::exec_policy(stream),
-                           hash_bins_start.begin(),
-                           hash_bins_start.end(),
-                           hash_bins_end.begin());
-
-    thrust::copy(rmm::exec_policy(stream),
-                 hash_bins_end.begin(),
-                 hash_bins_end.end(),
-                 hash_bins_start.begin());
-
-    thrust::for_each(
-      rmm::exec_policy(stream),
-      thrust::make_counting_iterator(0),
-      thrust::make_counting_iterator(col.size()),
-      [d_hash_bins_end, d_hash_data, d_col, hasher] __device__(size_t idx) {
-        if (!d_col.is_null(idx)) {
-          Element e           = d_col.element<Element>(idx);
-          size_type tmp       = hasher(e) % (2 * d_col.size());
-          cuda::atomic_ref<size_type, cuda::thread_scope_device> ref{*(d_hash_bins_end + tmp)};
-          size_type offset    = ref.fetch_add(1, cuda::std::memory_order_relaxed);
-          d_hash_data[offset] = e;
-        }
-      });
-
-    return unordered_multiset(d_col.size(), std::move(hash_bins_start), std::move(hash_data));
-  }
-
-  unordered_multiset_device_view<Element, Hasher, Equality> to_device() const
-  {
-    return unordered_multiset_device_view<Element, Hasher, Equality>(
-      size, hash_bins.data(), hash_data.data());
-  }
-
- private:
-  unordered_multiset(size_type size,
-                     rmm::device_uvector<size_type>&& hash_bins,
-                     rmm::device_uvector<Element>&& hash_data)
-    : size{size}, hash_bins{std::move(hash_bins)}, hash_data{std::move(hash_data)}
-  {
-  }
-
-  size_type size;
-  rmm::device_uvector<size_type> hash_bins;
-  rmm::device_uvector<Element> hash_data;
-};
-
-}  // namespace detail
-}  // namespace cudf
diff --git a/cpp/src/search/contains_column.cu b/cpp/src/search/contains_column.cu
index 4363bd212fe..85971647434 100644
--- a/cpp/src/search/contains_column.cu
+++ b/cpp/src/search/contains_column.cu
@@ -14,23 +14,14 @@
  * limitations under the License.
  */
 
-#include <hash/unordered_multiset.cuh>
-
-#include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/search.hpp>
 #include <cudf/dictionary/detail/search.hpp>
 #include <cudf/dictionary/detail/update_keys.hpp>
-#include <cudf/scalar/scalar_device_view.cuh>
 #include <cudf/table/table_view.hpp>
-#include <cudf/utilities/error.hpp>
-
-#include <thrust/iterator/counting_iterator.h>
-#include <thrust/transform.h>
-#include <thrust/uninitialized_fill.h>
 
 namespace cudf {
 namespace detail {
 
@@ -38,61 +29,7 @@ namespace {
 
 struct contains_column_dispatch {
-  template <typename Element, typename Haystack>
-  struct contains_fn {
-    bool __device__ operator()(size_type const idx) const
-    {
-      if (needles_have_nulls && needles.is_null_nocheck(idx)) {
-        // Exit early. The value doesn't matter, and will be masked as a null element.
-        return true;
-      }
-
-      return haystack.contains(needles.template element<Element>(idx));
-    }
-
-    Haystack const haystack;
-    column_device_view const needles;
-    bool const needles_have_nulls;
-  };
-
-  template <typename Element, CUDF_ENABLE_IF(!is_nested<Element>())>
-  std::unique_ptr<column> operator()(column_view const& haystack,
-                                     column_view const& needles,
-                                     rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr) const
-  {
-    auto result = make_numeric_column(data_type{type_to_id<bool>()},
-                                      needles.size(),
-                                      copy_bitmask(needles, stream, mr),
-                                      needles.null_count(),
-                                      stream,
-                                      mr);
-    if (needles.is_empty()) { return result; }
-
-    auto const out_begin = result->mutable_view().template begin<bool>();
-    if (haystack.is_empty()) {
-      thrust::uninitialized_fill(
-        rmm::exec_policy(stream), out_begin, out_begin + needles.size(), false);
-      return result;
-    }
-
-    auto const haystack_set    = cudf::detail::unordered_multiset<Element>::create(haystack, stream);
-    auto const haystack_set_dv = haystack_set.to_device();
-    auto const needles_cdv_ptr = column_device_view::create(needles, stream);
-
-    thrust::transform(rmm::exec_policy(stream),
-                      thrust::make_counting_iterator(0),
-                      thrust::make_counting_iterator(needles.size()),
-                      out_begin,
-                      contains_fn<Element, decltype(haystack_set_dv)>{
-                        haystack_set_dv, *needles_cdv_ptr, needles.has_nulls()});
-
-    result->set_null_count(needles.null_count());
-
-    return result;
-  }
-
-  template <typename Element, CUDF_ENABLE_IF(is_nested<Element>())>
+  template <typename Element>
   std::unique_ptr<column> operator()(column_view const& haystack,
                                      column_view const& needles,
                                      rmm::cuda_stream_view stream,
                                      rmm::mr::device_memory_resource* mr) const
@@ -144,8 +81,6 @@ std::unique_ptr<column> contains(column_view const& haystack,
                                  rmm::cuda_stream_view stream,
                                  rmm::mr::device_memory_resource* mr)
 {
-  CUDF_EXPECTS(haystack.type() == needles.type(), "DTYPE mismatch");
-
   return cudf::type_dispatcher(
     haystack.type(), contains_column_dispatch{}, haystack, needles, stream, mr);
 }
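
With patch 28, the stream parameter is now part of the public binaryop
signatures (defaulted to `cudf::get_default_stream()`), so existing call sites
compile unchanged while new callers can opt in to their own streams. A minimal
caller-side sketch, assuming the caller already owns `lhs`, `rhs`, and a
stream (illustrative usage, not part of the patch):

```cpp
#include <cudf/binaryop.hpp>
#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/types.hpp>

#include <rmm/cuda_stream_view.hpp>

// Run an elementwise ADD on a user-owned stream instead of the default
// stream; the trailing memory-resource argument keeps its default.
std::unique_ptr<cudf::column> add_on_stream(cudf::column_view const& lhs,
                                            cudf::column_view const& rhs,
                                            rmm::cuda_stream_view stream)
{
  return cudf::binary_operation(lhs,
                                rhs,
                                cudf::binary_operator::ADD,
                                cudf::data_type{cudf::type_id::INT32},
                                stream);
}
```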