From 985f671e1308c97de992887f3bccedced494fa44 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 11 Sep 2024 13:03:19 -0400 Subject: [PATCH 01/32] Fix slice_strings wide strings logic with multi-byte characters (#16777) Fixes logic error in computing character and byte counts for slice positions in strings with specific pattern of multi-byte characters. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Jason Lowe (https://github.com/jlowe) - Zach Puller (https://github.com/zpuller) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/16777 --- cpp/src/strings/slice.cu | 8 +++++--- cpp/tests/strings/slice_tests.cpp | 19 +++++++++++++++++++ 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/cpp/src/strings/slice.cu b/cpp/src/strings/slice.cu index 978a844c476..4c39fc96397 100644 --- a/cpp/src/strings/slice.cu +++ b/cpp/src/strings/slice.cu @@ -122,26 +122,28 @@ CUDF_KERNEL void substring_from_kernel(column_device_view const d_strings, break; } size_type const cc = (itr < end) && is_begin_utf8_char(*itr); - size_type const bc = (itr < end); + size_type const bc = (itr < end) ? bytes_in_utf8_byte(*itr) : 0; char_count += cg::reduce(warp, cc, cg::plus()); byte_count += cg::reduce(warp, bc, cg::plus()); itr += cudf::detail::warp_size; } + __syncwarp(); + if (warp.thread_rank() == 0) { if (start >= char_count) { d_output[str_idx] = string_index_pair{"", 0}; return; } - // we are just below start/stop and must now increment up to it from here + // we are just below start/stop and must now increment up to them from here auto first_byte = start_counts.second; if (start_counts.first < start) { auto const sub_str = string_view(d_str.data() + first_byte, d_str.size_bytes() - first_byte); first_byte += std::get<0>(bytes_to_character_position(sub_str, start - start_counts.first)); } - stop = max(stop, char_count); + stop = min(stop, char_count); auto last_byte = stop_counts.second; if (stop_counts.first < stop) { auto const sub_str = string_view(d_str.data() + last_byte, d_str.size_bytes() - last_byte); diff --git a/cpp/tests/strings/slice_tests.cpp b/cpp/tests/strings/slice_tests.cpp index 52e439bd93f..7f7fd9d521b 100644 --- a/cpp/tests/strings/slice_tests.cpp +++ b/cpp/tests/strings/slice_tests.cpp @@ -268,6 +268,25 @@ TEST_F(StringsSliceTest, MaxPositions) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } +TEST_F(StringsSliceTest, MultiByteChars) +{ + auto input = cudf::test::strings_column_wrapper({ + // clang-format off + "quick brown fox jumped over the lazy brown dog; the fat cats jump in place without moving " + "the following code snippet demonstrates how to use search for values in an ordered range " + // this placement tests proper multi-byte chars handling ------vvvvv + "it returns the last position where value could be inserted without the ééééé ordering ", + "algorithms execution is parallelized as determined by an execution policy; this is a 12345" + "continuation of previous row to make sure string boundaries are honored 012345678901234567" + // v--- this one also + "01234567890é34567890012345678901234567890" + // clang-format on + }); + + auto results = cudf::strings::slice_strings(cudf::strings_column_view(input), 0); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, input); +} + TEST_F(StringsSliceTest, Error) { cudf::test::strings_column_wrapper strings{"this string intentionally left blank"}; From 0b32f55b1ed38507437770d21da1e4e1a1c4a17d Mon Sep 17 00:00:00 2001 From: 
David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 11 Sep 2024 13:33:37 -0400 Subject: [PATCH 02/32] Fix nvbench output for sha512 (#16773) Fixes the `sha512` output for nvbench for `GlobalMem BW`. Previously: ``` | 65536 | 0 | sha512 | 1216x | 417.898 us | 1.40% | 412.669 us | 0.61% | 24.139 GB/s | 3.14% | | 16777216 | 0 | sha512 | 11x | 71.392 ms | 0.03% | 71.387 ms | 0.03% | 258404.649 PB/s | 33642233417.78% | | 65536 | 0.1 | sha512 | 1184x | 433.031 us | 1.58% | 427.815 us | 1.01% | 22.919 GB/s | 2.98% | | 16777216 | 0.1 | sha512 | 11x | 73.457 ms | 0.03% | 73.452 ms | 0.03% | 251140.174 PB/s | 32696456458.71% | ``` Fixed integer overflow calculation: ``` | 65536 | 0 | sha512 | 1200x | 423.838 us | 1.42% | 418.561 us | 0.66% | 23.799 GB/s | 3.10% | | 16777216 | 0 | sha512 | 11x | 72.773 ms | 0.11% | 72.767 ms | 0.11% | 35.041 GB/s | 4.56% | | 65536 | 0.1 | sha512 | 1168x | 439.078 us | 1.60% | 433.843 us | 1.05% | 22.601 GB/s | 2.94% | | 16777216 | 0.1 | sha512 | 19x | 75.108 ms | 0.49% | 75.102 ms | 0.49% | 33.412 GB/s | 4.35% | ``` Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Karthikeyan (https://github.com/karthikeyann) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/16773 --- cpp/benchmarks/hashing/hash.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/cpp/benchmarks/hashing/hash.cpp b/cpp/benchmarks/hashing/hash.cpp index 61e79a47a50..e4ff0c8c4a7 100644 --- a/cpp/benchmarks/hashing/hash.cpp +++ b/cpp/benchmarks/hashing/hash.cpp @@ -50,7 +50,7 @@ static void bench_hash(nvbench::state& state) state.add_global_memory_reads(num_rows); // add memory read from bitmaks if (!no_nulls) { - state.add_global_memory_reads(2 * + state.add_global_memory_reads(2L * cudf::bitmask_allocation_size_bytes(num_rows)); } // memory written depends on used hash @@ -63,37 +63,37 @@ static void bench_hash(nvbench::state& state) }); } else if (hash_name == "md5") { // md5 creates a 32-byte string - state.add_global_memory_writes(32 * num_rows); + state.add_global_memory_writes(32L * num_rows); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto result = cudf::hashing::md5(data->view()); }); } else if (hash_name == "sha1") { // sha1 creates a 40-byte string - state.add_global_memory_writes(40 * num_rows); + state.add_global_memory_writes(40L * num_rows); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto result = cudf::hashing::sha1(data->view()); }); } else if (hash_name == "sha224") { // sha224 creates a 56-byte string - state.add_global_memory_writes(56 * num_rows); + state.add_global_memory_writes(56L * num_rows); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto result = cudf::hashing::sha224(data->view()); }); } else if (hash_name == "sha256") { // sha256 creates a 64-byte string - state.add_global_memory_writes(64 * num_rows); + state.add_global_memory_writes(64L * num_rows); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto result = cudf::hashing::sha256(data->view()); }); } else if (hash_name == "sha384") { // sha384 creates a 96-byte string - state.add_global_memory_writes(96 * num_rows); + state.add_global_memory_writes(96L * num_rows); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto result = cudf::hashing::sha384(data->view()); }); } else if (hash_name == "sha512") { // sha512 creates a 128-byte string - 
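A self-contained sketch of the arithmetic behind the bogus `GlobalMem BW` rows above; this is illustrative code, not part of the patch. At the 16777216-row size, sha512's 128-byte digest is the only case in this file where `bytes * num_rows` exceeds `INT32_MAX` (96 * 16777216 still fits), which is why only the sha512 rows reported absurd `PB/s` figures. The `L` suffix widens the multiply to 64 bits on the LP64 platforms cudf targets:

```cpp
#include <cstdint>
#include <iostream>

int main()
{
  std::int32_t num_rows = 16'777'216;  // the large row count in the tables above
  // 128 * 16'777'216 == 2^31, one past INT32_MAX: the 32-bit multiply
  // overflows (formally undefined behavior; in practice it wraps negative).
  std::int32_t const wrapped = 128 * num_rows;
  // Promoting one operand to 64 bits, which is all the `128L` change does,
  // preserves the true byte count.
  std::int64_t const widened = 128L * num_rows;
  std::cout << wrapped << " vs " << widened << "\n";  // -2147483648 vs 2147483648
  return 0;
}
```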
state.add_global_memory_writes(128 * num_rows); + state.add_global_memory_writes(128L * num_rows); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto result = cudf::hashing::sha512(data->view()); }); From e063baa7a447a8273c213c6fbef2ffc93a95ff99 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Wed, 11 Sep 2024 15:14:26 -0700 Subject: [PATCH 03/32] Support reading multiple PQ sources with mismatching nullability for columns (#16639) Related to #12702. This PR adds support of reading multiple Parquet files with mismatched nullability for input columns. i.e. A column may not be nullable in one input file and nullable in another file. Authors: - Muhammad Haseeb (https://github.com/mhaseeb123) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/16639 --- cpp/src/io/parquet/page_decode.cuh | 2 +- cpp/src/io/parquet/parquet.hpp | 7 +- cpp/src/io/parquet/parquet_gpu.hpp | 7 +- cpp/src/io/parquet/reader_impl_chunking.cu | 18 +- cpp/src/io/parquet/reader_impl_helpers.cpp | 120 ++++++--- cpp/src/io/parquet/reader_impl_helpers.hpp | 27 +- cpp/src/io/parquet/reader_impl_preprocess.cu | 104 +++++--- python/cudf/cudf/tests/test_parquet.py | 254 ++++++++++++++++--- 8 files changed, 418 insertions(+), 121 deletions(-) diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index a3f91f6859b..9ed2929a70e 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -893,7 +893,7 @@ __device__ void gpuDecodeLevels(page_state_s* s, { bool has_repetition = s->col.max_level[level_type::REPETITION] > 0; - constexpr int batch_size = 32; + constexpr int batch_size = cudf::detail::warp_size; int cur_leaf_count = target_leaf_count; while (s->error == 0 && s->nz_count < target_leaf_count && s->input_value_count < s->num_input_values) { diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index 5d10472b0ae..7c985643887 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -203,10 +203,9 @@ struct SchemaElement { bool operator==(SchemaElement const& other) const { return type == other.type && converted_type == other.converted_type && - type_length == other.type_length && repetition_type == other.repetition_type && - name == other.name && num_children == other.num_children && - decimal_scale == other.decimal_scale && decimal_precision == other.decimal_precision && - field_id == other.field_id; + type_length == other.type_length && name == other.name && + num_children == other.num_children && decimal_scale == other.decimal_scale && + decimal_precision == other.decimal_precision && field_id == other.field_id; } // the parquet format is a little squishy when it comes to interpreting diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 125d35f6499..1390339c1ae 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -400,7 +400,8 @@ struct ColumnChunkDesc { int32_t src_col_schema_, column_chunk_info const* chunk_info_, float list_bytes_per_row_est_, - bool strings_to_categorical_) + bool strings_to_categorical_, + int32_t src_file_idx_) : compressed_data(compressed_data_), compressed_size(compressed_size_), 
num_values(num_values_), @@ -419,7 +420,8 @@ struct ColumnChunkDesc { src_col_schema(src_col_schema_), h_chunk_info(chunk_info_), list_bytes_per_row_est(list_bytes_per_row_est_), - is_strings_to_cat(strings_to_categorical_) + is_strings_to_cat(strings_to_categorical_), + src_file_idx(src_file_idx_) { } @@ -456,6 +458,7 @@ struct ColumnChunkDesc { bool is_strings_to_cat{}; // convert strings to hashes bool is_large_string_col{}; // `true` if string data uses 64-bit offsets + int32_t src_file_idx{}; // source file index }; /** diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 245e1829c72..c588fedb85c 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -1511,10 +1511,13 @@ void reader::impl::create_global_chunk_info() std::transform( _input_columns.begin(), _input_columns.end(), column_mapping.begin(), [&](auto const& col) { // translate schema_idx into something we can use for the page indexes - if (auto it = std::find_if( - columns.begin(), - columns.end(), - [&col](auto const& col_chunk) { return col_chunk.schema_idx == col.schema_idx; }); + if (auto it = std::find_if(columns.begin(), + columns.end(), + [&](auto const& col_chunk) { + return col_chunk.schema_idx == + _metadata->map_schema_index(col.schema_idx, + rg.source_index); + }); it != columns.end()) { return std::distance(columns.begin(), it); } @@ -1535,7 +1538,8 @@ void reader::impl::create_global_chunk_info() auto col = _input_columns[i]; // look up metadata auto& col_meta = _metadata->get_column_metadata(rg.index, rg.source_index, col.schema_idx); - auto& schema = _metadata->get_schema(col.schema_idx); + auto& schema = _metadata->get_schema( + _metadata->map_schema_index(col.schema_idx, rg.source_index), rg.source_index); auto [clock_rate, logical_type] = conversion_info(to_type_id(schema, _strings_to_categorical, _options.timestamp_type.id()), @@ -1574,9 +1578,9 @@ void reader::impl::create_global_chunk_info() col.schema_idx, chunk_info, list_bytes_per_row_est, - schema.type == BYTE_ARRAY and _strings_to_categorical)); + schema.type == BYTE_ARRAY and _strings_to_categorical, + rg.source_index)); } - // Adjust for skip_rows when updating the remaining rows after the first group remaining_rows -= (skip_rows) ? std::min(rg.start_row + row_group.num_rows - skip_rows, remaining_rows) diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index 8b5678f202b..6d566b5815e 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -423,8 +423,13 @@ void aggregate_reader_metadata::column_info_for_row_group(row_group_info& rg_inf std::vector chunks(rg.columns.size()); for (size_t col_idx = 0; col_idx < rg.columns.size(); col_idx++) { - auto const& col_chunk = rg.columns[col_idx]; - auto& schema = get_schema(col_chunk.schema_idx); + auto const& col_chunk = rg.columns[col_idx]; + auto const is_schema_idx_mapped = + is_schema_index_mapped(col_chunk.schema_idx, rg_info.source_index); + auto const mapped_schema_idx = is_schema_idx_mapped + ? map_schema_index(col_chunk.schema_idx, rg_info.source_index) + : col_chunk.schema_idx; + auto& schema = get_schema(mapped_schema_idx, is_schema_idx_mapped ? 
rg_info.source_index : 0); auto const max_def_level = schema.max_definition_level; auto const max_rep_level = schema.max_repetition_level; @@ -559,22 +564,40 @@ aggregate_reader_metadata::aggregate_reader_metadata( num_rows(calc_num_rows()), num_row_groups(calc_num_row_groups()) { - // Validate that all sources have the same schema unless we are reading select columns - // from mismatched sources, in which case, we will only check the projected columns later. - if (per_file_metadata.size() > 1 and not has_cols_from_mismatched_srcs) { - auto const& first_meta = per_file_metadata.front(); + if (per_file_metadata.size() > 1) { + auto& first_meta = per_file_metadata.front(); auto const num_cols = first_meta.row_groups.size() > 0 ? first_meta.row_groups.front().columns.size() : 0; - auto const& schema = first_meta.schema; - - // Verify that the input files have matching numbers of columns and schema. - for (auto const& pfm : per_file_metadata) { - if (pfm.row_groups.size() > 0) { - CUDF_EXPECTS(num_cols == pfm.row_groups.front().columns.size(), - "All sources must have the same number of columns"); + auto& schema = first_meta.schema; + + // Validate that all sources have the same schema unless we are reading select columns + // from mismatched sources, in which case, we will only check the projected columns later. + if (not has_cols_from_mismatched_srcs) { + // Verify that the input files have matching numbers of columns and schema. + for (auto const& pfm : per_file_metadata) { + if (pfm.row_groups.size() > 0) { + CUDF_EXPECTS(num_cols == pfm.row_groups.front().columns.size(), + "All sources must have the same number of columns"); + } + CUDF_EXPECTS(schema == pfm.schema, "All sources must have the same schema"); } - CUDF_EXPECTS(schema == pfm.schema, "All sources must have the same schema"); } + + // Mark the column schema in the first (default) source as nullable if it is nullable in any of + // the input sources. This avoids recomputing this within build_column() and + // populate_metadata(). + std::for_each( + thrust::make_counting_iterator(static_cast(1)), + thrust::make_counting_iterator(schema.size()), + [&](auto const schema_idx) { + if (schema[schema_idx].repetition_type == REQUIRED and + std::any_of( + per_file_metadata.begin() + 1, per_file_metadata.end(), [&](auto const& pfm) { + return pfm.schema[schema_idx].repetition_type != REQUIRED; + })) { + schema[schema_idx].repetition_type = OPTIONAL; + } + }); } // Collect and apply arrow:schema from Parquet's key value metadata section @@ -884,15 +907,8 @@ ColumnChunkMetaData const& aggregate_reader_metadata::get_column_metadata(size_t size_type src_idx, int schema_idx) const { - // schema_idx_maps will only have > 0 size when we are reading matching column projection from - // mismatched Parquet sources. 
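A minimal standalone sketch of the nullability-unification rule added above, using simplified stand-in types (`repetition_t`, `schema_elem`, `unify_nullability` are illustrative names, not cudf's). Like the code above, it assumes every source file has an identical schema layout, so index `i` refers to the same column in every source:

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

enum class repetition_t { REQUIRED, OPTIONAL, REPEATED };
struct schema_elem {
  repetition_t repetition_type;
};

// Mark a column nullable (OPTIONAL) in the primary schema if any other source
// declares it nullable, so one merged schema can describe every input file.
void unify_nullability(std::vector<schema_elem>& primary,
                       std::vector<std::vector<schema_elem>> const& other_sources)
{
  for (std::size_t idx = 1; idx < primary.size(); ++idx) {  // index 0 is the root
    if (primary[idx].repetition_type == repetition_t::REQUIRED and
        std::any_of(other_sources.begin(), other_sources.end(), [idx](auto const& src) {
          return src[idx].repetition_type != repetition_t::REQUIRED;
        })) {
      primary[idx].repetition_type = repetition_t::OPTIONAL;
    }
  }
}
```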
- if (src_idx and not schema_idx_maps.empty()) { - auto const& schema_idx_map = schema_idx_maps[src_idx - 1]; - CUDF_EXPECTS(schema_idx_map.find(schema_idx) != schema_idx_map.end(), - "Unmapped schema index encountered in the specified source tree", - std::range_error); - schema_idx = schema_idx_map.at(schema_idx); - } + // Map schema index to the provided source file index + schema_idx = map_schema_index(schema_idx, src_idx); auto col = std::find_if(per_file_metadata[src_idx].row_groups[row_group_index].columns.begin(), @@ -924,6 +940,46 @@ aggregate_reader_metadata::get_rowgroup_metadata() const return rg_metadata; } +bool aggregate_reader_metadata::is_schema_index_mapped(int schema_idx, int pfm_idx) const +{ + // Check if schema_idx or pfm_idx is invalid + CUDF_EXPECTS( + schema_idx >= 0 and pfm_idx >= 0 and pfm_idx < static_cast(per_file_metadata.size()), + "Parquet reader encountered an invalid schema_idx or pfm_idx", + std::out_of_range); + + // True if root index requested or zeroth file index or schema_idx maps doesn't exist. (i.e. + // schemas are identical). + if (schema_idx == 0 or pfm_idx == 0 or schema_idx_maps.empty()) { return true; } + + // Check if mapped + auto const& schema_idx_map = schema_idx_maps[pfm_idx - 1]; + return schema_idx_map.find(schema_idx) != schema_idx_map.end(); +} + +int aggregate_reader_metadata::map_schema_index(int schema_idx, int pfm_idx) const +{ + // Check if schema_idx or pfm_idx is invalid + CUDF_EXPECTS( + schema_idx >= 0 and pfm_idx >= 0 and pfm_idx < static_cast(per_file_metadata.size()), + "Parquet reader encountered an invalid schema_idx or pfm_idx", + std::out_of_range); + + // Check if pfm_idx is zero or root index requested or schema_idx_maps doesn't exist (i.e. + // schemas are identical). + if (schema_idx == 0 or pfm_idx == 0 or schema_idx_maps.empty()) { return schema_idx; } + + // schema_idx_maps will only have > 0 size when we are reading matching column projection from + // mismatched Parquet sources. + auto const& schema_idx_map = schema_idx_maps[pfm_idx - 1]; + CUDF_EXPECTS(schema_idx_map.find(schema_idx) != schema_idx_map.end(), + "Unmapped schema index encountered in the specified source tree", + std::out_of_range); + + // Return the mapped schema idx. + return schema_idx_map.at(schema_idx); +} + std::string aggregate_reader_metadata::get_pandas_index() const { // Assumes that all input files have the same metadata @@ -1185,8 +1241,8 @@ aggregate_reader_metadata::select_columns( // Compares two schema elements to be equal except their number of children auto const equal_to_except_num_children = [](SchemaElement const& lhs, SchemaElement const& rhs) { return lhs.type == rhs.type and lhs.converted_type == rhs.converted_type and - lhs.type_length == rhs.type_length and lhs.repetition_type == rhs.repetition_type and - lhs.name == rhs.name and lhs.decimal_scale == rhs.decimal_scale and + lhs.type_length == rhs.type_length and lhs.name == rhs.name and + lhs.decimal_scale == rhs.decimal_scale and lhs.decimal_precision == rhs.decimal_precision and lhs.field_id == rhs.field_id; }; @@ -1209,6 +1265,11 @@ aggregate_reader_metadata::select_columns( "the selected path", std::invalid_argument); + // Get the schema_idx_map for this data source (pfm) + auto& schema_idx_map = schema_idx_maps[pfm_idx - 1]; + // Map the schema index from 0th tree (src) to the one in the current (dst) tree. + schema_idx_map[src_schema_idx] = dst_schema_idx; + // If src_schema_elem is a stub, it does not exist in the column_name_info and column_buffer // hierarchy. 
So continue on with mapping. if (src_schema_elem.is_stub()) { @@ -1262,15 +1323,6 @@ aggregate_reader_metadata::select_columns( pfm_idx); }); } - - // We're at a leaf and this is an input column (one with actual data stored) so map it. - if (src_schema_elem.num_children == 0) { - // Get the schema_idx_map for this data source (pfm) - auto& schema_idx_map = schema_idx_maps[pfm_idx - 1]; - - // Map the schema index from 0th tree (src) to the one in the current (dst) tree. - schema_idx_map[src_schema_idx] = dst_schema_idx; - } }; std::vector output_column_schemas; diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp index 6f2863136b2..6487c92f48f 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.hpp +++ b/cpp/src/io/parquet/reader_impl_helpers.hpp @@ -234,6 +234,26 @@ class aggregate_reader_metadata { [[nodiscard]] auto get_num_row_groups() const { return num_row_groups; } + /** + * @brief Checks if a schema index from 0th source is mapped to the specified file index + * + * @param schema_idx The index of the SchemaElement in the zeroth file. + * @param pfm_idx The index of the file (per_file_metadata) to check mappings for. + * + * @return True if schema index is mapped + */ + [[nodiscard]] bool is_schema_index_mapped(int schema_idx, int pfm_idx) const; + + /** + * @brief Maps schema index from 0th source file to the specified file index + * + * @param schema_idx The index of the SchemaElement in the zeroth file. + * @param pfm_idx The index of the file (per_file_metadata) to map the schema_idx to. + * + * @return Mapped schema index + */ + [[nodiscard]] int map_schema_index(int schema_idx, int pfm_idx) const; + /** * @brief Extracts the schema_idx'th SchemaElement from the pfm_idx'th file * @@ -248,7 +268,7 @@ class aggregate_reader_metadata { CUDF_EXPECTS( schema_idx >= 0 and pfm_idx >= 0 and pfm_idx < static_cast(per_file_metadata.size()), "Parquet reader encountered an invalid schema_idx or pfm_idx", - std::invalid_argument); + std::out_of_range); return per_file_metadata[pfm_idx].schema[schema_idx]; } @@ -256,7 +276,10 @@ class aggregate_reader_metadata { [[nodiscard]] auto&& get_key_value_metadata() && { return std::move(keyval_maps); } /** - * @brief Gets the concrete nesting depth of output cudf columns + * @brief Gets the concrete nesting depth of output cudf columns. + * + * Gets the nesting depth of the output cudf column for the given schema. + * The nesting depth must be equal for the given schema_index across all sources. * * @param schema_index Schema index of the input column * diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 52918f5bc80..8e67f233213 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -79,23 +79,30 @@ void print_pages(cudf::detail::hostdevice_vector& pages, rmm::cuda_str * is indicated when adding new values. 
This function generates the mappings of * the R/D levels to those start/end bounds * - * @param remap Maps column schema index to the R/D remapping vectors for that column - * @param src_col_schema The column schema to generate the new mapping for + * @param remap Maps column schema index to the R/D remapping vectors for that column for a + * particular input source file + * @param src_col_schema The source column schema to generate the new mapping for + * @param mapped_src_col_schema Mapped column schema for src_file_idx'th file + * @param src_file_idx The input source file index for the column schema * @param md File metadata information */ -void generate_depth_remappings(std::map, std::vector>>& remap, - int src_col_schema, - aggregate_reader_metadata const& md) +void generate_depth_remappings( + std::map, std::pair, std::vector>>& remap, + int const src_col_schema, + int const mapped_src_col_schema, + int const src_file_idx, + aggregate_reader_metadata const& md) { // already generated for this level - if (remap.find(src_col_schema) != remap.end()) { return; } - auto schema = md.get_schema(src_col_schema); - int max_depth = md.get_output_nesting_depth(src_col_schema); + if (remap.find({src_col_schema, src_file_idx}) != remap.end()) { return; } + auto const& schema = md.get_schema(mapped_src_col_schema, src_file_idx); + auto const max_depth = md.get_output_nesting_depth(src_col_schema); - CUDF_EXPECTS(remap.find(src_col_schema) == remap.end(), + CUDF_EXPECTS(remap.find({src_col_schema, src_file_idx}) == remap.end(), "Attempting to remap a schema more than once"); auto inserted = - remap.insert(std::pair, std::vector>>{src_col_schema, {}}); + remap.insert(std::pair, std::pair, std::vector>>{ + {src_col_schema, src_file_idx}, {}}); auto& depth_remap = inserted.first->second; std::vector& rep_depth_remap = (depth_remap.first); @@ -136,15 +143,15 @@ void generate_depth_remappings(std::map, std::ve auto find_shallowest = [&](int r) { int shallowest = -1; int cur_depth = max_depth - 1; - int schema_idx = src_col_schema; + int schema_idx = mapped_src_col_schema; while (schema_idx > 0) { - auto cur_schema = md.get_schema(schema_idx); + auto& cur_schema = md.get_schema(schema_idx, src_file_idx); if (cur_schema.max_repetition_level == r) { // if this is a repeated field, map it one level deeper shallowest = cur_schema.is_stub() ? cur_depth + 1 : cur_depth; } // if it's one-level encoding list - else if (cur_schema.is_one_level_list(md.get_schema(cur_schema.parent_idx))) { + else if (cur_schema.is_one_level_list(md.get_schema(cur_schema.parent_idx, src_file_idx))) { shallowest = cur_depth - 1; } if (!cur_schema.is_stub()) { cur_depth--; } @@ -159,10 +166,10 @@ void generate_depth_remappings(std::map, std::ve for (int s_idx = schema.max_definition_level; s_idx >= 0; s_idx--) { auto find_deepest = [&](int d) { SchemaElement prev_schema; - int schema_idx = src_col_schema; + int schema_idx = mapped_src_col_schema; int r1 = 0; while (schema_idx > 0) { - SchemaElement cur_schema = md.get_schema(schema_idx); + SchemaElement cur_schema = md.get_schema(schema_idx, src_file_idx); if (cur_schema.max_definition_level == d) { // if this is a repeated field, map it one level deeper r1 = cur_schema.is_stub() ? prev_schema.max_repetition_level @@ -175,10 +182,10 @@ void generate_depth_remappings(std::map, std::ve // we now know R1 from above. 
return the deepest nesting level that has the // same repetition level - schema_idx = src_col_schema; + schema_idx = mapped_src_col_schema; int depth = max_depth - 1; while (schema_idx > 0) { - SchemaElement cur_schema = md.get_schema(schema_idx); + SchemaElement cur_schema = md.get_schema(schema_idx, src_file_idx); if (cur_schema.max_repetition_level == r1) { // if this is a repeated field, map it one level deeper depth = cur_schema.is_stub() ? depth + 1 : depth; @@ -783,9 +790,20 @@ void reader::impl::allocate_nesting_info() std::vector per_page_nesting_info_size(num_columns); auto iter = thrust::make_counting_iterator(size_type{0}); std::transform(iter, iter + num_columns, per_page_nesting_info_size.begin(), [&](size_type i) { + // Schema index of the current input column auto const schema_idx = _input_columns[i].schema_idx; - auto const& schema = _metadata->get_schema(schema_idx); - return max(schema.max_definition_level + 1, _metadata->get_output_nesting_depth(schema_idx)); + // Get the max_definition_level of this column across all sources. + auto max_definition_level = _metadata->get_schema(schema_idx).max_definition_level + 1; + std::for_each(thrust::make_counting_iterator(static_cast(1)), + thrust::make_counting_iterator(_sources.size()), + [&](auto const src_file_idx) { + auto const& schema = _metadata->get_schema( + _metadata->map_schema_index(schema_idx, src_file_idx), src_file_idx); + max_definition_level = + std::max(max_definition_level, schema.max_definition_level + 1); + }); + + return std::max(max_definition_level, _metadata->get_output_nesting_depth(schema_idx)); }); // compute total # of page_nesting infos needed and allocate space. doing this in one @@ -813,6 +831,8 @@ void reader::impl::allocate_nesting_info() page_nesting_decode_info.device_ptr() + src_info_index; pages[target_page_index + p_idx].nesting_info_size = per_page_nesting_info_size[idx]; + // Set the number of output nesting levels from the zeroth source as nesting must be + // identical across sources. 
pages[target_page_index + p_idx].num_output_nesting_levels = _metadata->get_output_nesting_depth(src_col_schema); @@ -821,25 +841,36 @@ void reader::impl::allocate_nesting_info() target_page_index += subpass.column_page_count[idx]; } + // Reset the target_page_index + target_page_index = 0; + // fill in int nesting_info_index = 0; - std::map, std::vector>> depth_remapping; for (size_t idx = 0; idx < _input_columns.size(); idx++) { auto const src_col_schema = _input_columns[idx].schema_idx; - // schema of the input column - auto& schema = _metadata->get_schema(src_col_schema); // real depth of the output cudf column hierarchy (1 == no nesting, 2 == 1 level, etc) + // nesting depth must be same across sources so getting it from the zeroth source is ok int const max_output_depth = _metadata->get_output_nesting_depth(src_col_schema); + // Map to store depths if this column has lists + std::map, std::pair, std::vector>> depth_remapping; // if this column has lists, generate depth remapping - std::map, std::vector>> depth_remapping; - if (schema.max_repetition_level > 0) { - generate_depth_remappings(depth_remapping, src_col_schema, *_metadata); - } + std::for_each( + thrust::make_counting_iterator(static_cast(0)), + thrust::make_counting_iterator(_sources.size()), + [&](auto const src_file_idx) { + auto const mapped_schema_idx = _metadata->map_schema_index(src_col_schema, src_file_idx); + if (_metadata->get_schema(mapped_schema_idx, src_file_idx).max_repetition_level > 0) { + generate_depth_remappings( + depth_remapping, src_col_schema, mapped_schema_idx, src_file_idx, *_metadata); + } + }); // fill in host-side nesting info - int schema_idx = src_col_schema; + int schema_idx = src_col_schema; + // This is okay as we only use this to check stubness of cur_schema and + // to get its parent's indices, both of which are one to one mapped. auto cur_schema = _metadata->get_schema(schema_idx); int cur_depth = max_output_depth - 1; while (schema_idx > 0) { @@ -848,6 +879,9 @@ void reader::impl::allocate_nesting_info() if (!cur_schema.is_stub()) { // initialize each page within the chunk for (size_t p_idx = 0; p_idx < subpass.column_page_count[idx]; p_idx++) { + // Source file index for the current page. + auto const src_file_idx = + pass.chunks[pages[target_page_index + p_idx].chunk_idx].src_file_idx; PageNestingInfo* pni = &page_nesting_info[nesting_info_index + (p_idx * per_page_nesting_info_size[idx])]; @@ -855,9 +889,11 @@ void reader::impl::allocate_nesting_info() &page_nesting_decode_info[nesting_info_index + (p_idx * per_page_nesting_info_size[idx])]; + auto const mapped_src_col_schema = + _metadata->map_schema_index(src_col_schema, src_file_idx); // if we have lists, set our start and end depth remappings - if (schema.max_repetition_level > 0) { - auto remap = depth_remapping.find(src_col_schema); + if (_metadata->get_schema(mapped_src_col_schema, src_file_idx).max_repetition_level > 0) { + auto remap = depth_remapping.find({src_col_schema, src_file_idx}); CUDF_EXPECTS(remap != depth_remapping.end(), "Could not find depth remapping for schema"); std::vector const& rep_depth_remap = (remap->second.first); @@ -871,11 +907,15 @@ void reader::impl::allocate_nesting_info() } } + // Get the schema from the current input source. 
+ auto& actual_cur_schema = _metadata->get_schema( + _metadata->map_schema_index(schema_idx, src_file_idx), src_file_idx); + // values indexed by output column index - nesting_info[cur_depth].max_def_level = cur_schema.max_definition_level; + nesting_info[cur_depth].max_def_level = actual_cur_schema.max_definition_level; pni[cur_depth].size = 0; pni[cur_depth].type = - to_type_id(cur_schema, _strings_to_categorical, _options.timestamp_type.id()); + to_type_id(actual_cur_schema, _strings_to_categorical, _options.timestamp_type.id()); pni[cur_depth].nullable = cur_schema.repetition_type == OPTIONAL; } @@ -888,6 +928,8 @@ void reader::impl::allocate_nesting_info() cur_schema = _metadata->get_schema(schema_idx); } + // Offset the page and nesting info indices + target_page_index += subpass.column_page_count[idx]; nesting_info_index += (per_page_nesting_info_size[idx] * subpass.column_page_count[idx]); } diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 8b59a7eef08..7f1b0b1cd46 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -3822,8 +3822,8 @@ def test_parquet_reader_with_mismatched_tables(store_schema): df1 = cudf.DataFrame( { "i32": cudf.Series([None, None, None], dtype="int32"), - "i64": cudf.Series([1234, None, 123], dtype="int64"), - "list": list([[1, 2], [None, 4], [5, 6]]), + "i64": cudf.Series([1234, 467, 123], dtype="int64"), + "list": list([[1, 2], None, [None, 6]]), "time": cudf.Series([1234, 123, 4123], dtype="datetime64[ms]"), "str": ["vfd", None, "ghu"], "d_list": list( @@ -3838,14 +3838,14 @@ def test_parquet_reader_with_mismatched_tables(store_schema): df2 = cudf.DataFrame( { - "str": ["abc", "def", None], + "str": ["abc", "def", "ghi"], "i64": cudf.Series([None, 65, 98], dtype="int64"), "times": cudf.Series([1234, None, 4123], dtype="datetime64[us]"), - "list": list([[7, 8], [9, 10], [None, 12]]), + "list": list([[7, 8], [9, 10], [11, 12]]), "d_list": list( [ [pd.Timedelta(minutes=4), None], - [None, None], + None, [pd.Timedelta(minutes=6), None], ] ), @@ -3900,38 +3900,27 @@ def test_parquet_reader_with_mismatched_structs(): { "a": 1, "b": { - "inner_a": 10, - "inner_b": {"inner_inner_b": 1, "inner_inner_a": 2}, + "a_a": 10, + "b_b": {"b_b_b": 1, "b_b_a": 2}, }, "c": 2, }, { "a": 3, - "b": {"inner_a": 30, "inner_b": {"inner_inner_a": 210}}, + "b": {"b_a": 30, "b_b": {"b_b_a": 210}}, "c": 4, }, - {"a": 5, "b": {"inner_a": 50, "inner_b": None}, "c": 6}, + {"a": 5, "b": {"b_a": 50, "b_b": None}, "c": 6}, {"a": 7, "b": None, "c": 8}, - {"a": None, "b": {"inner_a": None, "inner_b": None}, "c": None}, - None, - { - "a": None, - "b": { - "inner_a": None, - "inner_b": {"inner_inner_b": None, "inner_inner_a": 10}, - }, - "c": 10, - }, + {"a": 5, "b": {"b_a": None, "b_b": None}, "c": None}, ] data2 = [ - {"a": 1, "b": {"inner_b": {"inner_inner_a": None}}}, - {"a": 3, "b": {"inner_b": {"inner_inner_a": 1}}}, - {"a": 5, "b": {"inner_b": None}}, - {"a": 7, "b": {"inner_b": {"inner_inner_b": 1, "inner_inner_a": 0}}}, - {"a": None, "b": {"inner_b": None}}, + {"a": 1, "b": {"b_b": {"b_b_a": None}}}, + {"a": 5, "b": {"b_b": None}}, + {"a": 7, "b": {"b_b": {"b_b_b": 1, "b_b_a": 0}}}, + {"a": None, "b": {"b_b": None}}, None, - {"a": None, "b": {"inner_b": {"inner_inner_a": 1}}}, ] # cuDF tables from struct data @@ -3949,20 +3938,20 @@ def test_parquet_reader_with_mismatched_structs(): # Read the struct.b.inner_b.inner_inner_a column from parquet got = cudf.read_parquet( [buf1, buf2], - 
columns=["struct.b.inner_b.inner_inner_a"], + columns=["struct.b.b_b.b_b_a"], allow_mismatched_pq_schemas=True, ) got = ( cudf.Series(got["struct"]) .struct.field("b") - .struct.field("inner_b") - .struct.field("inner_inner_a") + .struct.field("b_b") + .struct.field("b_b_a") ) # Read with chunked reader got_chunked = read_parquet_chunked( [buf1, buf2], - columns=["struct.b.inner_b.inner_inner_a"], + columns=["struct.b.b_b.b_b_a"], chunk_read_limit=240, pass_read_limit=240, allow_mismatched_pq_schemas=True, @@ -3970,8 +3959,8 @@ def test_parquet_reader_with_mismatched_structs(): got_chunked = ( cudf.Series(got_chunked["struct"]) .struct.field("b") - .struct.field("inner_b") - .struct.field("inner_inner_a") + .struct.field("b_b") + .struct.field("b_b_a") ) # Construct the expected series @@ -3979,12 +3968,12 @@ def test_parquet_reader_with_mismatched_structs(): [ cudf.Series(df1["struct"]) .struct.field("b") - .struct.field("inner_b") - .struct.field("inner_inner_a"), + .struct.field("b_b") + .struct.field("b_b_a"), cudf.Series(df2["struct"]) .struct.field("b") - .struct.field("inner_b") - .struct.field("inner_inner_a"), + .struct.field("b_b") + .struct.field("b_b_a"), ] ).reset_index(drop=True) @@ -4023,12 +4012,12 @@ def test_parquet_reader_with_mismatched_schemas_error(): ) data1 = [ - {"a": 1, "b": {"inner_a": 1, "inner_b": 6}}, - {"a": 3, "b": {"inner_a": None, "inner_b": 2}}, + {"a": 1, "b": {"b_a": 1, "b_b": 6}}, + {"a": 3, "b": {"b_a": None, "b_b": 2}}, ] data2 = [ - {"b": {"inner_a": 1}, "c": "str"}, - {"b": {"inner_a": None}, "c": None}, + {"b": {"b_a": 1}, "c": "str"}, + {"b": {"b_a": None}, "c": None}, ] # cuDF tables from struct data @@ -4059,6 +4048,191 @@ def test_parquet_reader_with_mismatched_schemas_error(): ): cudf.read_parquet( [buf1, buf2], - columns=["struct.b.inner_b"], + columns=["struct.b.b_b"], allow_mismatched_pq_schemas=True, ) + + +def test_parquet_reader_mismatched_nullability(): + # Ensure that we can faithfully read the tables with mismatched nullabilities + df1 = cudf.DataFrame( + { + "timedelta": cudf.Series([12, 54, 1231], dtype="timedelta64[ms]"), + "duration_list": list( + [ + [ + [ + [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)], + None, + [pd.Timedelta(minutes=8), None], + ], + None, + ], + None, + [ + [ + [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)], + [pd.Timedelta(minutes=5), pd.Timedelta(minutes=3)], + [pd.Timedelta(minutes=8), pd.Timedelta(minutes=4)], + ] + ], + ] + ), + "int64": cudf.Series([1234, None, 4123], dtype="int64"), + "int32": cudf.Series([1234, 123, 4123], dtype="int32"), + "list": list([[1, 2], [1, 2], [1, 2]]), + "datetime": cudf.Series([1234, 123, 4123], dtype="datetime64[ms]"), + "string": cudf.Series(["kitten", "puppy", "cub"]), + } + ) + + df2 = cudf.DataFrame( + { + "timedelta": cudf.Series( + [None, None, None], dtype="timedelta64[ms]" + ), + "duration_list": list( + [ + [ + [ + [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)], + [pd.Timedelta(minutes=8), pd.Timedelta(minutes=1)], + ], + ], + [ + [ + [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)], + [pd.Timedelta(minutes=5), pd.Timedelta(minutes=3)], + [pd.Timedelta(minutes=8), pd.Timedelta(minutes=4)], + ] + ], + [ + [ + [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)], + [pd.Timedelta(minutes=5), pd.Timedelta(minutes=3)], + [pd.Timedelta(minutes=8), pd.Timedelta(minutes=4)], + ] + ], + ] + ), + "int64": cudf.Series([1234, 123, 4123], dtype="int64"), + "int32": cudf.Series([1234, None, 4123], dtype="int32"), + "list": list([[1, 2], None, [1, 2]]), + 
"datetime": cudf.Series( + [1234, None, 4123], dtype="datetime64[ms]" + ), + "string": cudf.Series(["kitten", None, "cub"]), + } + ) + + # Write tables to parquet with arrow schema for compatibility for duration column(s) + fname1 = BytesIO() + df1.to_parquet(fname1, store_schema=True) + fname2 = BytesIO() + df2.to_parquet(fname2, store_schema=True) + + # Read tables back with cudf and arrow in either order and compare + assert_eq( + cudf.read_parquet([fname1, fname2]), + cudf.concat([df1, df2]).reset_index(drop=True), + ) + assert_eq( + cudf.read_parquet([fname2, fname1]), + cudf.concat([df2, df1]).reset_index(drop=True), + ) + + +def test_parquet_reader_mismatched_nullability_structs(tmpdir): + data1 = [ + { + "a": "a", + "b": { + "b_a": 10, + "b_b": {"b_b_b": 1, "b_b_a": 12}, + }, + "c": [1, 2], + }, + { + "a": "b", + "b": { + "b_a": 30, + "b_b": {"b_b_b": 2, "b_b_a": 2}, + }, + "c": [3, 4], + }, + { + "a": "c", + "b": { + "b_a": 50, + "b_b": {"b_b_b": 4, "b_b_a": 5}, + }, + "c": [5, 6], + }, + { + "a": "d", + "b": { + "b_a": 135, + "b_b": {"b_b_b": 12, "b_b_a": 32}, + }, + "c": [7, 8], + }, + { + "a": "e", + "b": { + "b_a": 1, + "b_b": {"b_b_b": 1, "b_b_a": 5}, + }, + "c": [9, 10], + }, + { + "a": "f", + "b": { + "b_a": 32, + "b_b": {"b_b_b": 1, "b_b_a": 6}, + }, + "c": [11, 12], + }, + ] + + data2 = [ + { + "a": "g", + "b": { + "b_a": 10, + "b_b": {"b_b_b": None, "b_b_a": 2}, + }, + "c": None, + }, + {"a": None, "b": {"b_a": None, "b_b": None}, "c": [15, 16]}, + {"a": "j", "b": None, "c": [8, 10]}, + {"a": None, "b": {"b_a": None, "b_b": None}, "c": None}, + None, + { + "a": None, + "b": {"b_a": None, "b_b": {"b_b_b": 1}}, + "c": [18, 19], + }, + {"a": None, "b": None, "c": None}, + ] + + pa_table1 = pa.Table.from_pydict({"struct": data1}) + df1 = cudf.DataFrame.from_arrow(pa_table1) + + pa_table2 = pa.Table.from_pydict({"struct": data2}) + df2 = cudf.DataFrame.from_arrow(pa_table2) + + # Write tables to parquet + buf1 = BytesIO() + df1.to_parquet(buf1) + buf2 = BytesIO() + df2.to_parquet(buf2) + + # Read tables back with cudf and compare with expected. 
+    assert_eq(
+        cudf.read_parquet([buf1, buf2]),
+        cudf.concat([df1, df2]).reset_index(drop=True),
+    )
+    assert_eq(
+        cudf.read_parquet([buf2, buf1]),
+        cudf.concat([df2, df1]).reset_index(drop=True),
+    )

From 1b402dfc2f078656bcbbb8a0386008601620e6e2 Mon Sep 17 00:00:00 2001
From: Mike McCarty
Date: Wed, 11 Sep 2024 19:00:45 -0400
Subject: [PATCH 04/32] Recommending `miniforge` for conda install (#16782)

Recommending miniforge for conda install in README

Authors:
  - Mike McCarty (https://github.com/mmccarty)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16782
---
 README.md                  | 2 +-
 python/custreamz/README.md | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index f62f7885d63..8f8c2adac2f 100644
--- a/README.md
+++ b/README.md
@@ -79,7 +79,7 @@ pip install --extra-index-url=https://pypi.nvidia.com cudf-cu12

 ### Conda

-cuDF can be installed with conda (via [miniconda](https://docs.conda.io/projects/miniconda/en/latest/) or the full [Anaconda distribution](https://www.anaconda.com/download) from the `rapidsai` channel:
+cuDF can be installed with conda (via [miniforge](https://github.com/conda-forge/miniforge)) from the `rapidsai` channel:

 ```bash
 conda install -c rapidsai -c conda-forge -c nvidia \
diff --git a/python/custreamz/README.md b/python/custreamz/README.md
index 1509dac9e61..8da17ef09dc 100644
--- a/python/custreamz/README.md
+++ b/python/custreamz/README.md
@@ -54,7 +54,7 @@ Please see the [Demo Docker Repository](https://hub.docker.com/r/rapidsai/rapids

 ### Conda

-cuStreamz is installed with conda ([miniconda](https://conda.io/miniconda.html), or the full [Anaconda distribution](https://www.anaconda.com/download)) from the `rapidsai` or `rapidsai-nightly` channel:
+cuStreamz can be installed with conda (via [miniforge](https://github.com/conda-forge/miniforge)) from the `rapidsai` channel:

 Release:
 ```bash

From 3dbc33a5cb1cf7052cd67f5654b34594403fbfef Mon Sep 17 00:00:00 2001
From: Jihoon Son
Date: Wed, 11 Sep 2024 19:11:20 -0700
Subject: [PATCH 05/32] Revert "Fix empty cluster handling in tdigest merge
 (#16675)" (#16800)

This PR reverts #16675, which introduced another bug. Our nightly CI
pipeline is failing because of this bug
(https://github.com/NVIDIA/spark-rapids/issues/11463). I can reproduce the
bug within a libcudf unit test. I will make another PR to fix both the
original issue and the new bug.

Authors:
  - Jihoon Son (https://github.com/jihoonson)

Approvers:
  - Alessandro Bellina (https://github.com/abellina)
  - Nghia Truong (https://github.com/ttnghia)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16800
---
 cpp/include/cudf/detail/tdigest/tdigest.hpp   | 17 ++--
 cpp/include/cudf_test/tdigest_utilities.cuh   | 20 ++---
 cpp/src/quantiles/tdigest/tdigest.cu          | 23 +++--
 .../quantiles/tdigest/tdigest_aggregation.cu  | 70 ++++++---------
 cpp/tests/groupby/tdigest_tests.cu            | 90 ++-----------------
 .../quantiles/percentile_approx_test.cpp      |  4 +-
 6 files changed, 62 insertions(+), 162 deletions(-)

diff --git a/cpp/include/cudf/detail/tdigest/tdigest.hpp b/cpp/include/cudf/detail/tdigest/tdigest.hpp
index 672b95e2d01..80a4460023f 100644
--- a/cpp/include/cudf/detail/tdigest/tdigest.hpp
+++ b/cpp/include/cudf/detail/tdigest/tdigest.hpp
@@ -143,29 +143,28 @@ std::unique_ptr<column> make_tdigest_column(size_type num_rows,
                                             rmm::device_async_resource_ref mr);

 /**
- * @brief Create a tdigest column of empty clusters.
+ * @brief Create an empty tdigest column. * - * The column created contains the specified number of rows of empty clusters. + * An empty tdigest column contains a single row of length 0 * * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * - * @returns A tdigest column of empty clusters. + * @returns An empty tdigest column. */ CUDF_EXPORT -std::unique_ptr make_tdigest_column_of_empty_clusters(size_type num_rows, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); +std::unique_ptr make_empty_tdigest_column(rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); /** - * @brief Create a scalar of an empty tdigest cluster. + * @brief Create an empty tdigest scalar. * - * The returned scalar is a struct_scalar that contains a single row of an empty cluster. + * An empty tdigest scalar is a struct_scalar that contains a single row of length 0 * * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * - * @returns A scalar of an empty tdigest cluster. + * @returns An empty tdigest scalar. */ std::unique_ptr make_empty_tdigest_scalar(rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); diff --git a/cpp/include/cudf_test/tdigest_utilities.cuh b/cpp/include/cudf_test/tdigest_utilities.cuh index be7d19b2227..1758790cd64 100644 --- a/cpp/include/cudf_test/tdigest_utilities.cuh +++ b/cpp/include/cudf_test/tdigest_utilities.cuh @@ -270,8 +270,8 @@ void tdigest_simple_all_nulls_aggregation(Func op) static_cast(values).type(), tdigest_gen{}, op, values, delta); // NOTE: an empty tdigest column still has 1 row. 
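// For orientation (a reader's note, not text from the patch): a tdigest column
// in libcudf is a struct column laid out as
//   struct<centroids: list<struct<mean: double, weight: double>>,
//          min: double,
//          max: double>
// so an "empty" tdigest is not a zero-row column. It is a single row whose
// centroids list has zero elements and whose min/max are 0, which is what the
// NOTE above and the expected columns in these tests rely on.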
- auto expected = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters( - 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto expected = cudf::tdigest::detail::make_empty_tdigest_column( + cudf::get_default_stream(), cudf::get_current_device_resource_ref()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, *expected); } @@ -562,12 +562,12 @@ template void tdigest_merge_empty(MergeFunc merge_op) { // 3 empty tdigests all in the same group - auto a = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters( - 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); - auto b = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters( - 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); - auto c = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters( - 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto a = cudf::tdigest::detail::make_empty_tdigest_column( + cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto b = cudf::tdigest::detail::make_empty_tdigest_column( + cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto c = cudf::tdigest::detail::make_empty_tdigest_column( + cudf::get_default_stream(), cudf::get_current_device_resource_ref()); std::vector cols; cols.push_back(*a); cols.push_back(*b); @@ -577,8 +577,8 @@ void tdigest_merge_empty(MergeFunc merge_op) auto const delta = 1000; auto result = merge_op(*values, delta); - auto expected = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters( - 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto expected = cudf::tdigest::detail::make_empty_tdigest_column( + cudf::get_default_stream(), cudf::get_current_device_resource_ref()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *result); } diff --git a/cpp/src/quantiles/tdigest/tdigest.cu b/cpp/src/quantiles/tdigest/tdigest.cu index 76cd55bf994..0d017cf1f13 100644 --- a/cpp/src/quantiles/tdigest/tdigest.cu +++ b/cpp/src/quantiles/tdigest/tdigest.cu @@ -292,33 +292,32 @@ std::unique_ptr make_tdigest_column(size_type num_rows, return make_structs_column(num_rows, std::move(children), 0, {}, stream, mr); } -std::unique_ptr make_tdigest_column_of_empty_clusters(size_type num_rows, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +std::unique_ptr make_empty_tdigest_column(rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { auto offsets = cudf::make_fixed_width_column( - data_type(type_id::INT32), num_rows + 1, mask_state::UNALLOCATED, stream, mr); + data_type(type_id::INT32), 2, mask_state::UNALLOCATED, stream, mr); thrust::fill(rmm::exec_policy(stream), offsets->mutable_view().begin(), offsets->mutable_view().end(), 0); - auto min_col = cudf::make_numeric_column( - data_type(type_id::FLOAT64), num_rows, mask_state::UNALLOCATED, stream, mr); + auto min_col = + cudf::make_numeric_column(data_type(type_id::FLOAT64), 1, mask_state::UNALLOCATED, stream, mr); thrust::fill(rmm::exec_policy(stream), min_col->mutable_view().begin(), min_col->mutable_view().end(), 0); - auto max_col = cudf::make_numeric_column( - data_type(type_id::FLOAT64), num_rows, mask_state::UNALLOCATED, stream, mr); + auto max_col = + cudf::make_numeric_column(data_type(type_id::FLOAT64), 1, mask_state::UNALLOCATED, stream, mr); thrust::fill(rmm::exec_policy(stream), max_col->mutable_view().begin(), max_col->mutable_view().end(), 0); - return make_tdigest_column(num_rows, - cudf::make_empty_column(type_id::FLOAT64), 
- cudf::make_empty_column(type_id::FLOAT64), + return make_tdigest_column(1, + make_empty_column(type_id::FLOAT64), + make_empty_column(type_id::FLOAT64), std::move(offsets), std::move(min_col), std::move(max_col), @@ -339,7 +338,7 @@ std::unique_ptr make_tdigest_column_of_empty_clusters(size_type num_rows std::unique_ptr make_empty_tdigest_scalar(rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - auto contents = make_tdigest_column_of_empty_clusters(1, stream, mr)->release(); + auto contents = make_empty_tdigest_column(stream, mr)->release(); return std::make_unique( std::move(*std::make_unique(std::move(contents.children))), true, stream, mr); } diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu index d591fb5c171..2dd25a7b890 100644 --- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu +++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu @@ -366,8 +366,8 @@ std::unique_ptr to_tdigest_scalar(std::unique_ptr&& tdigest, * @param group_cluster_wl Output. The set of cluster weight limits for each group. * @param group_num_clusters Output. The number of output clusters for each input group. * @param group_cluster_offsets Offsets per-group to the start of it's clusters - * @param may_have_empty_clusters Whether or not there could be empty clusters. Must only be - * set to false when there is no empty cluster, true otherwise. + * @param has_nulls Whether or not the input contains nulls + * */ template @@ -379,7 +379,7 @@ CUDF_KERNEL void generate_cluster_limits_kernel(int delta, double* group_cluster_wl, size_type* group_num_clusters, size_type const* group_cluster_offsets, - bool may_have_empty_clusters) + bool has_nulls) { int const tid = threadIdx.x + blockIdx.x * blockDim.x; @@ -399,12 +399,11 @@ CUDF_KERNEL void generate_cluster_limits_kernel(int delta, // a group with nothing in it. group_num_clusters[group_index] = 0; if (total_weight <= 0) { - // If the input contains empty clusters, we can potentially have a group that also generates - // empty clusters because -all- of the input values are null or empty cluster. In that case, the - // `reduce_by_key` call in the tdigest generation step will need a location to store the unused - // reduction value for that group of nulls and empty clusters. These "stubs" will be - // postprocessed out afterwards. - if (may_have_empty_clusters) { group_num_clusters[group_index] = 1; } + // if the input contains nulls we can potentially have a group that generates no + // clusters because -all- of the input values are null. in that case, the reduce_by_key call + // in the tdigest generation step will need a location to store the unused reduction value for + // that group of nulls. these "stubs" will be postprocessed out afterwards. + if (has_nulls) { group_num_clusters[group_index] = 1; } return; } @@ -503,8 +502,7 @@ CUDF_KERNEL void generate_cluster_limits_kernel(int delta, * stream that falls before our current cluster limit * @param group_info A functor which returns the info for the specified group (total weight, * size and start offset) - * @param may_have_empty_clusters Whether or not there could be empty clusters. It should be - * set to false only when there is no empty cluster. + * @param has_nulls Whether or not the input data contains nulls * @param stream CUDA stream used for device memory operations and kernel launches. 
* @param mr Device memory resource used to allocate the returned column's device memory * @@ -518,7 +516,7 @@ generate_group_cluster_info(int delta, NearestWeight nearest_weight, GroupInfo group_info, CumulativeWeight cumulative_weight, - bool may_have_empty_clusters, + bool has_nulls, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -537,7 +535,7 @@ generate_group_cluster_info(int delta, nullptr, group_num_clusters.begin(), nullptr, - may_have_empty_clusters); + has_nulls); // generate group cluster offsets (where the clusters for a given group start and end) auto group_cluster_offsets = cudf::make_numeric_column( @@ -569,7 +567,7 @@ generate_group_cluster_info(int delta, group_cluster_wl.begin(), group_num_clusters.begin(), group_cluster_offsets->view().begin(), - may_have_empty_clusters); + has_nulls); return {std::move(group_cluster_wl), std::move(group_cluster_offsets), @@ -582,7 +580,7 @@ std::unique_ptr build_output_column(size_type num_rows, std::unique_ptr&& offsets, std::unique_ptr&& min_col, std::unique_ptr&& max_col, - bool may_have_empty_clusters, + bool has_nulls, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -597,7 +595,7 @@ std::unique_ptr build_output_column(size_type num_rows, size_type i) { return is_stub_weight(offsets[i]) ? 1 : 0; }; size_type const num_stubs = [&]() { - if (!may_have_empty_clusters) { return 0; } + if (!has_nulls) { return 0; } auto iter = cudf::detail::make_counting_transform_iterator( 0, cuda::proclaim_return_type(is_stub_digest)); return thrust::reduce(rmm::exec_policy(stream), iter, iter + num_rows); @@ -663,10 +661,6 @@ std::unique_ptr build_output_column(size_type num_rows, mr); } -/** - * @brief A functor which returns the cluster index within a group that the value at - * the given value index falls into. - */ template struct compute_tdigests_keys_fn { int const delta; @@ -712,8 +706,8 @@ struct compute_tdigests_keys_fn { * boundaries. * * @param delta tdigest compression level - * @param centroids_begin Beginning of the range of centroids. - * @param centroids_end End of the range of centroids. + * @param values_begin Beginning of the range of input values. + * @param values_end End of the range of input values. * @param cumulative_weight Functor which returns cumulative weight and group information for * an absolute input value index. * @param min_col Column containing the minimum value per group. @@ -721,8 +715,7 @@ struct compute_tdigests_keys_fn { * @param group_cluster_wl Cluster weight limits for each group. * @param group_cluster_offsets R-value reference of offsets into the cluster weight limits. * @param total_clusters Total number of clusters in all groups. - * @param may_have_empty_clusters Whether or not there could be empty clusters. It should be - * set to false only when there is no empty cluster. + * @param has_nulls Whether or not the input contains nulls * @param stream CUDA stream used for device memory operations and kernel launches. 
* @param mr Device memory resource used to allocate the returned column's device memory * @@ -738,7 +731,7 @@ std::unique_ptr compute_tdigests(int delta, rmm::device_uvector const& group_cluster_wl, std::unique_ptr&& group_cluster_offsets, size_type total_clusters, - bool may_have_empty_clusters, + bool has_nulls, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -757,9 +750,7 @@ std::unique_ptr compute_tdigests(int delta, // double // max // } // - if (total_clusters == 0) { - return cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(1, stream, mr); - } + if (total_clusters == 0) { return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr); } // each input group represents an individual tdigest. within each tdigest, we want the keys // to represent cluster indices (for example, if a tdigest had 100 clusters, the keys should fall @@ -802,7 +793,7 @@ std::unique_ptr compute_tdigests(int delta, std::move(group_cluster_offsets), std::move(min_col), std::move(max_col), - may_have_empty_clusters, + has_nulls, stream, mr); } @@ -1154,13 +1145,8 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, auto merged = cudf::detail::concatenate(tdigest_views, stream, cudf::get_current_device_resource_ref()); - auto merged_weights = merged->get_column(1).view(); - // If there are no values, we can simply return a column that has only empty tdigests. - if (merged_weights.size() == 0) { - return cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(num_groups, stream, mr); - } - // generate cumulative weights + auto merged_weights = merged->get_column(1).view(); auto cumulative_weights = cudf::make_numeric_column( data_type{type_id::FLOAT64}, merged_weights.size(), mask_state::UNALLOCATED, stream); auto keys = cudf::detail::make_counting_transform_iterator( @@ -1175,10 +1161,6 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, auto const delta = max_centroids; - // We do not know whether there is any empty cluster in the input without actually reading the - // data, which could be expensive. So, we just assume that there could be empty clusters. 
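// Reader's note on the removal in progress here (an interpretation of the
// surrounding code, not text from the patch): when a group contributes no
// usable inputs, generate_cluster_limits_kernel still emits one placeholder
// "stub" cluster so the later reduce_by_key has a slot to write into, and
// build_output_column filters those stubs back out. The reverted change made
// the merge path assume empty clusters may always be present; the revert
// restores the earlier assumption (passing false) that merge inputs never
// contain empty clusters.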
- auto const may_have_empty_clusters = true; - // generate cluster info auto [group_cluster_wl, group_cluster_offsets, total_clusters] = generate_group_cluster_info( delta, @@ -1195,7 +1177,7 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, group_labels, group_offsets, {tdigest_offsets.begin(), static_cast(tdigest_offsets.size())}}, - may_have_empty_clusters, + false, stream, mr); @@ -1220,7 +1202,7 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, group_cluster_wl, std::move(group_cluster_offsets), total_clusters, - may_have_empty_clusters, + false, stream, mr); } @@ -1285,9 +1267,7 @@ std::unique_ptr group_tdigest(column_view const& col, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - if (col.size() == 0) { - return cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(1, stream, mr); - } + if (col.size() == 0) { return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr); } auto const delta = max_centroids; return cudf::type_dispatcher(col.type(), @@ -1313,7 +1293,7 @@ std::unique_ptr group_merge_tdigest(column_view const& input, tdigest_column_view tdv(input); if (num_groups == 0 || input.size() == 0) { - return cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(1, stream, mr); + return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr); } // bring group offsets back to the host diff --git a/cpp/tests/groupby/tdigest_tests.cu b/cpp/tests/groupby/tdigest_tests.cu index 3780dbb1d95..baa59026b07 100644 --- a/cpp/tests/groupby/tdigest_tests.cu +++ b/cpp/tests/groupby/tdigest_tests.cu @@ -469,16 +469,16 @@ TEST_F(TDigestMergeTest, EmptyGroups) cudf::test::fixed_width_column_wrapper keys{0, 0, 0, 0, 0, 0, 0}; int const delta = 1000; - auto a = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters( - 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto a = cudf::tdigest::detail::make_empty_tdigest_column( + cudf::get_default_stream(), cudf::get_current_device_resource_ref()); auto b = cudf::type_dispatcher( static_cast(values_b).type(), tdigest_gen_grouped{}, keys, values_b, delta); - auto c = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters( - 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto c = cudf::tdigest::detail::make_empty_tdigest_column( + cudf::get_default_stream(), cudf::get_current_device_resource_ref()); auto d = cudf::type_dispatcher( static_cast(values_d).type(), tdigest_gen_grouped{}, keys, values_d, delta); - auto e = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters( - 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto e = cudf::tdigest::detail::make_empty_tdigest_column( + cudf::get_default_stream(), cudf::get_current_device_resource_ref()); std::vector cols; cols.push_back(*a); @@ -507,81 +507,3 @@ TEST_F(TDigestMergeTest, EmptyGroups) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *result.second[0].results[0]); } - -std::unique_ptr do_agg( - cudf::column_view key, - cudf::column_view val, - std::function()> make_agg) -{ - std::vector keys; - keys.push_back(key); - cudf::table_view const key_table(keys); - - cudf::groupby::groupby gb(key_table); - std::vector requests; - cudf::groupby::aggregation_request req; - req.values = val; - req.aggregations.push_back(make_agg()); - requests.push_back(std::move(req)); - - auto result = gb.aggregate(std::move(requests)); - - std::vector> result_columns; - for (auto&& c : result.first->release()) { - 
result_columns.push_back(std::move(c)); - } - - EXPECT_EQ(result.second.size(), 1); - EXPECT_EQ(result.second[0].results.size(), 1); - result_columns.push_back(std::move(result.second[0].results[0])); - - return std::make_unique(std::move(result_columns)); -} - -TEST_F(TDigestMergeTest, AllGroupsHaveEmptyClusters) -{ - // The input must be sorted by the key. - // See `aggregate_result_functor::operator()` for details. - auto const keys = cudf::test::fixed_width_column_wrapper{{0, 0, 1, 1, 2}}; - auto const keys_view = cudf::column_view(keys); - auto val_elems = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; }); - auto val_valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { - // All values are null - return false; - }); - auto const vals = cudf::test::fixed_width_column_wrapper{ - val_elems, val_elems + keys_view.size(), val_valids}; - - auto const delta = 10000; - - // Compute tdigest. The result should have 3 empty clusters, one per group. - auto const compute_result = do_agg(keys_view, cudf::column_view(vals), [&delta]() { - return cudf::make_tdigest_aggregation(delta); - }); - - auto const expected_computed_keys = cudf::test::fixed_width_column_wrapper{{0, 1, 2}}; - cudf::column_view const expected_computed_keys_view{expected_computed_keys}; - auto const expected_computed_vals = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters( - expected_computed_keys_view.size(), - cudf::get_default_stream(), - rmm::mr::get_current_device_resource()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_computed_keys_view, compute_result->get_column(0).view()); - // The computed values are nullable even though the input values are not. - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_computed_vals->view(), - compute_result->get_column(1).view()); - - // Merge tdigest. The result should have 3 empty clusters, one per group. 
- auto const merge_result = - do_agg(compute_result->get_column(0).view(), compute_result->get_column(1).view(), [&delta]() { - return cudf::make_merge_tdigest_aggregation(delta); - }); - - auto const expected_merged_keys = cudf::test::fixed_width_column_wrapper{{0, 1, 2}}; - cudf::column_view const expected_merged_keys_view{expected_merged_keys}; - auto const expected_merged_vals = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters( - expected_merged_keys_view.size(), - cudf::get_default_stream(), - rmm::mr::get_current_device_resource()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_merged_keys_view, merge_result->get_column(0).view()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_merged_vals->view(), merge_result->get_column(1).view()); -} diff --git a/cpp/tests/quantiles/percentile_approx_test.cpp b/cpp/tests/quantiles/percentile_approx_test.cpp index 7359f0406fc..915717713df 100644 --- a/cpp/tests/quantiles/percentile_approx_test.cpp +++ b/cpp/tests/quantiles/percentile_approx_test.cpp @@ -371,8 +371,8 @@ struct PercentileApproxTest : public cudf::test::BaseFixture {}; TEST_F(PercentileApproxTest, EmptyInput) { - auto empty_ = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters( - 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto empty_ = cudf::tdigest::detail::make_empty_tdigest_column( + cudf::get_default_stream(), cudf::get_current_device_resource_ref()); cudf::test::fixed_width_column_wrapper percentiles{0.0, 0.25, 0.3}; std::vector input; From 124d3e353eeebd595da113dbef3d5bad842a791d Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Mon, 16 Sep 2024 12:17:58 -0500 Subject: [PATCH 06/32] Migrate dask-cudf README improvements to dask-cudf sphinx docs (#16765) Follow up to https://github.com/rapidsai/cudf/pull/16671 - Moves most of the information recently added to the dask-cudf README into the dask-cudf Sphinx documentation - Adds a "Quick-start" example to the simplified dask-cudf README Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - Bradley Dice (https://github.com/bdice) - Benjamin Zaitlen (https://github.com/quasiben) - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/cudf/pull/16765 --- docs/cudf/source/user_guide/10min.ipynb | 31 ++-- docs/dask_cudf/source/index.rst | 210 ++++++++++++++++++------ python/dask_cudf/README.md | 148 +++++------------ 3 files changed, 213 insertions(+), 176 deletions(-) diff --git a/docs/cudf/source/user_guide/10min.ipynb b/docs/cudf/source/user_guide/10min.ipynb index 2eaa75b3189..95f5f9734dd 100644 --- a/docs/cudf/source/user_guide/10min.ipynb +++ b/docs/cudf/source/user_guide/10min.ipynb @@ -5,9 +5,9 @@ "id": "4c6c548b", "metadata": {}, "source": [ - "# 10 Minutes to cuDF and Dask-cuDF\n", + "# 10 Minutes to cuDF and Dask cuDF\n", "\n", - "Modelled after 10 Minutes to Pandas, this is a short introduction to cuDF and Dask-cuDF, geared mainly towards new users.\n", + "Modelled after 10 Minutes to Pandas, this is a short introduction to cuDF and Dask cuDF, geared mainly towards new users.\n", "\n", "## What are these Libraries?\n", "\n", @@ -18,13 +18,14 @@ "[Dask cuDF](https://github.com/rapidsai/cudf/tree/main/python/dask_cudf) extends Dask where necessary to allow its DataFrame partitions to be processed using cuDF GPU DataFrames instead of Pandas DataFrames. 
For instance, when you call `dask_cudf.read_csv(...)`, your cluster's GPUs do the work of parsing the CSV file(s) by calling [`cudf.read_csv()`](https://docs.rapids.ai/api/cudf/stable/api_docs/api/cudf.read_csv.html).\n", "\n", "\n", - "> [!NOTE] \n", - "> This notebook uses the explicit Dask cuDF API (`dask_cudf`) for clarity. However, we strongly recommend that you use Dask's [configuration infrastructure](https://docs.dask.org/en/latest/configuration.html) to set the `\"dataframe.backend\"` to `\"cudf\"`, and work with the `dask.dataframe` API directly. Please see the [Dask cuDF documentation](https://github.com/rapidsai/cudf/tree/main/python/dask_cudf) for more information.\n", + "
\n", + "Note: This notebook uses the explicit Dask cuDF API (dask_cudf) for clarity. However, we strongly recommend that you use Dask's configuration infrastructure to set the \"dataframe.backend\" option to \"cudf\", and work with the Dask DataFrame API directly. Please see the Dask cuDF documentation for more information.\n", + "
\n", "\n", "\n", - "## When to use cuDF and Dask-cuDF\n", + "## When to use cuDF and Dask cuDF\n", "\n", - "If your workflow is fast enough on a single GPU or your data comfortably fits in memory on a single GPU, you would want to use cuDF. If you want to distribute your workflow across multiple GPUs, have more data than you can fit in memory on a single GPU, or want to analyze data spread across many files at once, you would want to use Dask-cuDF." + "If your workflow is fast enough on a single GPU or your data comfortably fits in memory on a single GPU, you would want to use cuDF. If you want to distribute your workflow across multiple GPUs, have more data than you can fit in memory on a single GPU, or want to analyze data spread across many files at once, you would want to use Dask cuDF." ] }, { @@ -115,7 +116,7 @@ "source": [ "ds = dask_cudf.from_cudf(s, npartitions=2)\n", "# Note the call to head here to show the first few entries, unlike\n", - "# cuDF objects, dask-cuDF objects do not have a printing\n", + "# cuDF objects, Dask-cuDF objects do not have a printing\n", "# representation that shows values since they may not be in local\n", "# memory.\n", "ds.head(n=3)" @@ -331,11 +332,11 @@ "id": "b17db919", "metadata": {}, "source": [ - "Now we will convert our cuDF dataframe into a dask-cuDF equivalent. Here we call out a key difference: to inspect the data we must call a method (here `.head()` to look at the first few values). In the general case (see the end of this notebook), the data in `ddf` will be distributed across multiple GPUs.\n", + "Now we will convert our cuDF dataframe into a Dask-cuDF equivalent. Here we call out a key difference: to inspect the data we must call a method (here `.head()` to look at the first few values). In the general case (see the end of this notebook), the data in `ddf` will be distributed across multiple GPUs.\n", "\n", - "In this small case, we could call `ddf.compute()` to obtain a cuDF object from the dask-cuDF object. In general, we should avoid calling `.compute()` on large dataframes, and restrict ourselves to using it when we have some (relatively) small postprocessed result that we wish to inspect. Hence, throughout this notebook we will generally call `.head()` to inspect the first few values of a dask-cuDF dataframe, occasionally calling out places where we use `.compute()` and why.\n", + "In this small case, we could call `ddf.compute()` to obtain a cuDF object from the Dask-cuDF object. In general, we should avoid calling `.compute()` on large dataframes, and restrict ourselves to using it when we have some (relatively) small postprocessed result that we wish to inspect. 
Hence, throughout this notebook we will generally call `.head()` to inspect the first few values of a Dask-cuDF dataframe, occasionally calling out places where we use `.compute()` and why.\n", "\n", - "*To understand more of the differences between how cuDF and dask-cuDF behave here, visit the [10 Minutes to Dask](https://docs.dask.org/en/stable/10-minutes-to-dask.html) tutorial after this one.*" + "*To understand more of the differences between how cuDF and Dask cuDF behave here, visit the [10 Minutes to Dask](https://docs.dask.org/en/stable/10-minutes-to-dask.html) tutorial after this one.*" ] }, { @@ -1680,7 +1681,7 @@ "id": "7aa0089f", "metadata": {}, "source": [ - "Note here we call `compute()` rather than `head()` on the dask-cuDF dataframe since we are happy that the number of matching rows will be small (and hence it is reasonable to bring the entire result back)." + "Note here we call `compute()` rather than `head()` on the Dask-cuDF dataframe since we are happy that the number of matching rows will be small (and hence it is reasonable to bring the entire result back)." ] }, { @@ -2393,7 +2394,7 @@ "id": "f6094cbe", "metadata": {}, "source": [ - "Applying functions to a `Series`. Note that applying user defined functions directly with Dask-cuDF is not yet implemented. For now, you can use [map_partitions](http://docs.dask.org/en/stable/generated/dask.dataframe.DataFrame.map_partitions.html) to apply a function to each partition of the distributed dataframe." + "Applying functions to a `Series`. Note that applying user defined functions directly with Dask cuDF is not yet implemented. For now, you can use [map_partitions](http://docs.dask.org/en/stable/generated/dask.dataframe.DataFrame.map_partitions.html) to apply a function to each partition of the distributed dataframe." ] }, { @@ -3492,7 +3493,7 @@ "id": "5ac3b004", "metadata": {}, "source": [ - "Transposing a dataframe, using either the `transpose` method or `T` property. Currently, all columns must have the same type. Transposing is not currently implemented in Dask-cuDF." + "Transposing a dataframe, using either the `transpose` method or `T` property. Currently, all columns must have the same type. Transposing is not currently implemented in Dask cuDF." ] }, { @@ -4181,7 +4182,7 @@ "id": "aa8a445b", "metadata": {}, "source": [ - "To convert the first few entries to pandas, we similarly call `.head()` on the dask-cuDF dataframe to obtain a local cuDF dataframe, which we can then convert." + "To convert the first few entries to pandas, we similarly call `.head()` on the Dask-cuDF dataframe to obtain a local cuDF dataframe, which we can then convert." ] }, { @@ -4899,7 +4900,7 @@ "id": "787eae14", "metadata": {}, "source": [ - "Note that for the dask-cuDF case, we use `dask_cudf.read_csv` in preference to `dask_cudf.from_cudf(cudf.read_csv)` since the former can parallelize across multiple GPUs and handle larger CSV files that would fit in memory on a single GPU." + "Note that for the Dask-cuDF case, we use `dask_cudf.read_csv` in preference to `dask_cudf.from_cudf(cudf.read_csv)` since the former can parallelize across multiple GPUs and handle larger CSV files that would fit in memory on a single GPU." ] }, { diff --git a/docs/dask_cudf/source/index.rst b/docs/dask_cudf/source/index.rst index 9a216690384..7fe6cbd45fa 100644 --- a/docs/dask_cudf/source/index.rst +++ b/docs/dask_cudf/source/index.rst @@ -3,39 +3,42 @@ You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. 
-Welcome to dask-cudf's documentation! +Welcome to Dask cuDF's documentation! ===================================== -**Dask-cuDF** (pronounced "DASK KOO-dee-eff") is an extension +**Dask cuDF** (pronounced "DASK KOO-dee-eff") is an extension library for the `Dask `__ parallel computing -framework that provides a `cuDF -`__-backed distributed -dataframe with the same API as `Dask dataframes -`__. +framework. When installed, Dask cuDF is automatically registered +as the ``"cudf"`` dataframe backend for +`Dask DataFrame `__. + +.. note:: + Neither Dask cuDF nor Dask DataFrame provide support for multi-GPU + or multi-node execution on their own. You must also deploy a + `dask.distributed ` cluster + to leverage multiple GPUs. We strongly recommend using `Dask-CUDA + `__ to simplify the + setup of the cluster, taking advantage of all features of the GPU + and networking hardware. If you are familiar with Dask and `pandas `__ or -`cuDF `__, then Dask-cuDF +`cuDF `__, then Dask cuDF should feel familiar to you. If not, we recommend starting with `10 minutes to Dask `__ followed -by `10 minutes to cuDF and Dask-cuDF +by `10 minutes to cuDF and Dask cuDF `__. -When running on multi-GPU systems, `Dask-CUDA -`__ is recommended to -simplify the setup of the cluster, taking advantage of all features of -the GPU and networking hardware. -Using Dask-cuDF +Using Dask cuDF --------------- -When installed, Dask-cuDF registers itself as a dataframe backend for -Dask. This means that in many cases, using cuDF-backed dataframes requires -only small changes to an existing workflow. The minimal change is to -select cuDF as the dataframe backend in :doc:`Dask's -configuration `. To do so, we must set the option -``dataframe.backend`` to ``cudf``. From Python, this can be achieved -like so:: +The Dask DataFrame API (Recommended) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Simply use the `Dask configuration ` system to +set the ``"dataframe.backend"`` option to ``"cudf"``. From Python, +this can be achieved like so:: import dask @@ -44,52 +47,157 @@ like so:: Alternatively, you can set ``DASK_DATAFRAME__BACKEND=cudf`` in the environment before running your code. -Dataframe creation from on-disk formats -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -If your workflow creates Dask dataframes from on-disk formats -(for example using :func:`dask.dataframe.read_parquet`), then setting -the backend may well be enough to migrate your workflow. - -For example, consider reading a dataframe from parquet:: +Once this is done, the public Dask DataFrame API will leverage +``cudf`` automatically when a new DataFrame collection is created +from an on-disk format using any of the following ``dask.dataframe`` +functions:: - import dask.dataframe as dd +* :func:`dask.dataframe.read_parquet` +* :func:`dask.dataframe.read_json` +* :func:`dask.dataframe.read_csv` +* :func:`dask.dataframe.read_orc` +* :func:`dask.dataframe.read_hdf` +* :func:`dask.dataframe.from_dict` - # By default, we obtain a pandas-backed dataframe - df = dd.read_parquet("data.parquet", ...) +For example:: + import dask.dataframe as dd -To obtain a cuDF-backed dataframe, we must set the -``dataframe.backend`` configuration option:: + # By default, we obtain a pandas-backed dataframe + df = dd.read_parquet("data.parquet", ...) import dask - import dask.dataframe as dd dask.config.set({"dataframe.backend": "cudf"}) - # This gives us a cuDF-backed dataframe + # This now gives us a cuDF-backed dataframe df = dd.read_parquet("data.parquet", ...) 
-This code will use cuDF's GPU-accelerated :func:`parquet reader
-` to read partitions of the data.
+When other functions are used to create a new collection
+(e.g. :func:`from_map`, :func:`from_pandas`, :func:`from_delayed`,
+and :func:`from_array`), the backend of the new collection will
+depend on the inputs to those functions. For example::
+
+    import pandas as pd
+    import cudf
+
+    # This gives us a pandas-backed dataframe
+    dd.from_pandas(pd.DataFrame({"a": range(10)}))
+
+    # This gives us a cuDF-backed dataframe
+    dd.from_pandas(cudf.DataFrame({"a": range(10)}))
+
+An existing collection can always be moved to a specific backend
+using the :func:`dask.dataframe.DataFrame.to_backend` API::
+
+    # This ensures that we have a cuDF-backed dataframe
+    df = df.to_backend("cudf")
+
+    # This ensures that we have a pandas-backed dataframe
+    df = df.to_backend("pandas")
+
+The explicit Dask cuDF API
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In addition to providing the ``"cudf"`` backend for Dask DataFrame,
+Dask cuDF also provides an explicit ``dask_cudf`` API::
+
+    import dask_cudf
+
+    # This always gives us a cuDF-backed dataframe
+    df = dask_cudf.read_parquet("data.parquet", ...)
+
+This API is used implicitly by the Dask DataFrame API when the ``"cudf"``
+backend is enabled. Therefore, using it directly will not provide any
+performance benefit over the CPU/GPU-portable ``dask.dataframe`` API.
+Also, using some parts of the explicit API is incompatible with
+automatic query planning (see the next section).
+
+Query Planning
+~~~~~~~~~~~~~~
+
+Dask cuDF now provides automatic query planning by default (RAPIDS 24.06+).
+As long as the ``"dataframe.query-planning"`` configuration is set to
+``True`` (the default) when ``dask.dataframe`` is first imported, `Dask
+Expressions <https://github.com/dask/dask-expr>`__ will be used under the hood.
+
+For example, the following code will automatically benefit from predicate
+pushdown when the result is computed::
+
+    df = dd.read_parquet("/my/parquet/dataset/")
+    result = df.sort_values('B')['A']
+
+Unoptimized expression graph (``df.pprint()``)::
+
+    Projection: columns='A'
+      SortValues: by=['B'] shuffle_method='tasks' options={}
+        ReadParquetFSSpec: path='/my/parquet/dataset/' ...
+
+Simplified expression graph (``df.simplify().pprint()``)::
+
+    Projection: columns='A'
+      SortValues: by=['B'] shuffle_method='tasks' options={}
+        ReadParquetFSSpec: path='/my/parquet/dataset/' columns=['A', 'B'] ...
+
+.. note::
+  Dask will automatically simplify the expression graph (within
+  :func:`optimize`) when the result is converted to a task graph
+  (via :func:`compute` or :func:`persist`). You do not need to call
+  :func:`simplify` yourself.
+
+
+Using Multiple GPUs and Multiple Nodes
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Whenever possible, Dask cuDF (i.e. Dask DataFrame) will automatically try
+to partition your data into small-enough tasks to fit comfortably in the
+memory of a single GPU. This means the necessary compute tasks needed to
+compute a query can often be streamed to a single GPU process for
+out-of-core computing. This also means that the compute tasks can be
+executed in parallel over a multi-GPU cluster.
+
+In order to execute your Dask workflow on multiple GPUs, you will
+typically need to use `Dask-CUDA <https://docs.rapids.ai/api/dask-cuda/stable/>`__
+to deploy a distributed Dask cluster, and
+`Distributed <https://distributed.dask.org/en/stable/client.html>`__
+to define a client object.
For example:: + + from dask_cuda import LocalCUDACluster + from distributed import Client + + if __name__ == "__main__": + + client = Client( + LocalCUDACluster( + CUDA_VISIBLE_DEVICES="0,1", # Use two workers (on devices 0 and 1) + rmm_pool_size=0.9, # Use 90% of GPU memory as a pool for faster allocations + enable_cudf_spill=True, # Improve device memory stability + local_directory="/fast/scratch/", # Use fast local storage for spilling + ) + ) + + df = dd.read_parquet("/my/parquet/dataset/") + agg = df.groupby('B').sum() + agg.compute() # This will use the cluster defined above + +.. note:: + This example uses :func:`compute` to materialize a concrete + ``cudf.DataFrame`` object in local memory. Never call :func:`compute` + on a large collection that cannot fit comfortably in the memory of a + single GPU! See Dask's `documentation on managing computation + `__ + for more details. -Dataframe creation from in-memory formats -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Please see the `Dask-CUDA `__ +documentation for more information about deploying GPU-aware clusters +(including `best practices +`__). -If you already have a dataframe in memory and want to convert it to a -cuDF-backend one, there are two options depending on whether the -dataframe is already a Dask one or not. If you have a Dask dataframe, -then you can call :func:`dask.dataframe.to_backend` passing ``"cudf"`` -as the backend; if you have a pandas dataframe then you can either -call :func:`dask.dataframe.from_pandas` followed by -:func:`~dask.dataframe.to_backend` or first convert the dataframe with -:func:`cudf.from_pandas` and then parallelise this with -:func:`dask_cudf.from_cudf`. API Reference ------------- -Generally speaking, Dask-cuDF tries to offer exactly the same API as -Dask itself. There are, however, some minor differences mostly because +Generally speaking, Dask cuDF tries to offer exactly the same API as +Dask DataFrame. There are, however, some minor differences mostly because cuDF does not :doc:`perfectly mirror ` the pandas API, or because cuDF provides additional configuration flags (these mostly occur in data reading and writing interfaces). @@ -97,7 +205,7 @@ flags (these mostly occur in data reading and writing interfaces). As a result, straightforward workflows can be migrated without too much trouble, but more complex ones that utilise more features may need a bit of tweaking. The API documentation describes details of the -differences and all functionality that Dask-cuDF supports. +differences and all functionality that Dask cuDF supports. .. toctree:: :maxdepth: 2 diff --git a/python/dask_cudf/README.md b/python/dask_cudf/README.md index 6edb9f87d48..4655d2165f0 100644 --- a/python/dask_cudf/README.md +++ b/python/dask_cudf/README.md @@ -1,135 +1,63 @@ #
 Dask cuDF - A GPU Backend for Dask DataFrame
-Dask cuDF (a.k.a. dask-cudf or `dask_cudf`) is an extension library for [Dask DataFrame](https://docs.dask.org/en/stable/dataframe.html). When installed, Dask cuDF is automatically registered as the `"cudf"` [dataframe backend](https://docs.dask.org/en/stable/how-to/selecting-the-collection-backend.html) for Dask DataFrame. - -## Using Dask cuDF - -### The Dask DataFrame API (Recommended) - -Simply set the `"dataframe.backend"` [configuration](https://docs.dask.org/en/stable/configuration.html) to `"cudf"` in Dask, and the public Dask DataFrame API will leverage `cudf` automatically: - -```python -import dask -dask.config.set({"dataframe.backend": "cudf"}) - -import dask.dataframe as dd -# This gives us a cuDF-backed dataframe -df = dd.read_parquet("data.parquet", ...) -``` +Dask cuDF (a.k.a. dask-cudf or `dask_cudf`) is an extension library for [Dask DataFrame](https://docs.dask.org/en/stable/dataframe.html) that provides a Pandas-like API for parallel and larger-than-memory DataFrame computing on GPUs. When installed, Dask cuDF is automatically registered as the `"cudf"` [dataframe backend](https://docs.dask.org/en/stable/how-to/selecting-the-collection-backend.html) for Dask DataFrame. > [!IMPORTANT] -> The `"dataframe.backend"` configuration will only be used for collection creation when the following APIs are used: `read_parquet`, `read_json`, `read_csv`, `read_orc`, `read_hdf`, and `from_dict`. For example, if `from_map`, `from_pandas`, `from_delayed`, or `from_array` are used, the backend of the new collection will depend on the input to the function: - -```python -import pandas as pd -import cudf - -# This gives us a Pandas-backed dataframe -dd.from_pandas(pd.DataFrame({"a": range(10)})) - -# This gives us a cuDF-backed dataframe -dd.from_pandas(cudf.DataFrame({"a": range(10)})) -``` - -A cuDF-backed DataFrame collection can be moved to the `"pandas"` backend: - -```python -df = df.to_backend("pandas") -``` - -Similarly, a Pandas-backed DataFrame collection can be moved to the `"cudf"` backend: - -```python -df = df.to_backend("cudf") -``` - -### The Explicit Dask cuDF API - -In addition to providing the `"cudf"` backend for Dask DataFrame, Dask cuDF also provides an explicit `dask_cudf` API: - -```python -import dask_cudf - -# This always gives us a cuDF-backed dataframe -df = dask_cudf.read_parquet("data.parquet", ...) -``` - -> [!NOTE] -> This API is used implicitly by the Dask DataFrame API when the `"cudf"` backend is enabled. Therefore, using it directly will not provide any performance benefit over the CPU/GPU-portable `dask.dataframe` API. Also, using some parts of the explicit API are incompatible with automatic query planning (see the next section). +> Dask cuDF does not provide support for multi-GPU or multi-node execution on its own. You must also deploy a distributed cluster (ideally with [Dask-CUDA](https://docs.rapids.ai/api/dask-cuda/stable/)) to leverage multiple GPUs efficiently. -See the [Dask cuDF's API documentation](https://docs.rapids.ai/api/dask-cudf/stable/) for further information. - -## Query Planning - -Dask cuDF now provides automatic query planning by default (RAPIDS 24.06+). As long as the `"dataframe.query-planning"` configuration is set to `True` (the default) when `dask.dataframe` is first imported, [Dask Expressions](https://github.com/dask/dask-expr) will be used under the hood. - -For example, the following user code will automatically benefit from predicate pushdown when the result is computed. 
- -```python -df = dd.read_parquet("/my/parquet/dataset/") -result = df.sort_values('B')['A'] -``` - -Unoptimized expression graph (`df.pprint()`): -``` -Projection: columns='A' - SortValues: by=['B'] shuffle_method='tasks' options={} - ReadParquetFSSpec: path='/my/parquet/dataset/' ... -``` +## Using Dask cuDF -Simplified expression graph (`df.simplify().pprint()`): -``` -Projection: columns='A' - SortValues: by=['B'] shuffle_method='tasks' options={} - ReadParquetFSSpec: path='/my/parquet/dataset/' columns=['A', 'B'] ... -``` +Please visit [the official documentation page](https://docs.rapids.ai/api/dask-cudf/stable/) for detailed information about using Dask cuDF. -> [!NOTE] -> Dask will automatically simplify the expression graph (within `optimize`) when the result is converted to a task graph (via `compute` or `persist`). The user does not need to call `simplify` themself. +## Installation +See the [RAPIDS install page](https://docs.rapids.ai/install) for the most up-to-date information and commands for installing Dask cuDF and other RAPIDS packages. -## Using Multiple GPUs and Multiple Nodes +## Resources -Whenever possible, Dask cuDF (i.e. Dask DataFrame) will automatically try to partition your data into small-enough tasks to fit comfortably in the memory of a single GPU. This means the necessary compute tasks needed to compute a query can often be streamed to a single GPU process for out-of-core computing. This also means that the compute tasks can be executed in parallel over a multi-GPU cluster. +- [Dask cuDF documentation](https://docs.rapids.ai/api/dask-cudf/stable/) +- [cuDF documentation](https://docs.rapids.ai/api/cudf/stable/) +- [10 Minutes to cuDF and Dask cuDF](https://docs.rapids.ai/api/cudf/stable/user_guide/10min/) +- [Dask-CUDA documentation](https://docs.rapids.ai/api/dask-cuda/stable/) +- [Deployment](https://docs.rapids.ai/deployment/stable/) +- [RAPIDS Community](https://rapids.ai/learn-more/#get-involved): Get help, contribute, and collaborate. -> [!IMPORTANT] -> Neither Dask cuDF nor Dask DataFrame provide support for multi-GPU or multi-node execution on their own. You must deploy a distributed cluster (ideally with [Dask CUDA](https://docs.rapids.ai/api/dask-cuda/stable/)) to leverage multiple GPUs. +### Quick-start example -In order to execute your Dask workflow on multiple GPUs, you will typically need to use [Dask CUDA](https://docs.rapids.ai/api/dask-cuda/stable/) to deploy distributed Dask cluster, and [Distributed](https://distributed.dask.org/en/stable/client.html) to define a `client` object. For example: +A very common Dask cuDF use case is single-node multi-GPU data processing. 
These workflows typically use the following pattern: ```python - +import dask +import dask.dataframe as dd from dask_cuda import LocalCUDACluster from distributed import Client -client = Client( +if __name__ == "__main__": + + # Define a GPU-aware cluster to leverage multiple GPUs + client = Client( LocalCUDACluster( - CUDA_VISIBLE_DEVICES="0,1", # Use two workers (on devices 0 and 1) - rmm_pool_size=0.9, # Use 90% of GPU memory as a pool for faster allocations - enable_cudf_spill=True, # Improve device memory stability - local_directory="/fast/scratch/", # Use fast local storage for spilling + CUDA_VISIBLE_DEVICES="0,1", # Use two workers (on devices 0 and 1) + rmm_pool_size=0.9, # Use 90% of GPU memory as a pool for faster allocations + enable_cudf_spill=True, # Improve device memory stability + local_directory="/fast/scratch/", # Use fast local storage for spilling ) -) + ) -df = dd.read_parquet("/my/parquet/dataset/") -agg = df.groupby('B').sum() -agg.compute() # This will use the cluster defined above -``` + # Set the default dataframe backend to "cudf" + dask.config.set({"dataframe.backend": "cudf"}) -> [!NOTE] -> This example uses `compute` to materialize a concrete `cudf.DataFrame` object in local memory. Never call `compute` on a large collection that cannot fit comfortably in the memory of a single GPU! See Dask's [documentation on managing computation](https://distributed.dask.org/en/stable/manage-computation.html) for more details. + # Create your DataFrame collection from on-disk + # or in-memory data + df = dd.read_parquet("/my/parquet/dataset/") -Please see the [Dask CUDA](https://docs.rapids.ai/api/dask-cuda/stable/) documentation for more information about deploying GPU-aware clusters (including [best practices](https://docs.rapids.ai/api/dask-cuda/stable/examples/best-practices/)). + # Use cudf-like syntax to transform and/or query your data + query = df.groupby('item')['price'].mean() -## Install - -See the [RAPIDS install page](https://docs.rapids.ai/install) for the most up-to-date information and commands for installing Dask cuDF and other RAPIDS packages. + # Compute, persist, or write out the result + query.head() +``` -## Resources +If you do not have multiple GPUs available, using `LocalCUDACluster` is optional. However, it is still a good idea to [enable cuDF spilling](https://docs.rapids.ai/api/cudf/stable/developer_guide/library_design/#spilling-to-host-memory). -- [Dask cuDF API documentation](https://docs.rapids.ai/api/dask-cudf/stable/) -- [cuDF API documentation](https://docs.rapids.ai/api/cudf/stable/) -- [10 Minutes to cuDF and Dask cuDF](https://docs.rapids.ai/api/cudf/stable/user_guide/10min/) -- [Dask CUDA documentation](https://docs.rapids.ai/api/dask-cuda/stable/) -- [Deployment](https://docs.rapids.ai/deployment/stable/) -- [RAPIDS Community](https://rapids.ai/learn-more/#get-involved): Get help, contribute, and collaborate. +If you wish to scale across multiple nodes, you will need to use a different mechanism to deploy your Dask-CUDA workers. Please see [the RAPIDS deployment documentation](https://docs.rapids.ai/deployment/stable/) for more instructions. From 40333854b5efadb5b482ec80663b837680af1598 Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Mon, 16 Sep 2024 17:04:47 -0500 Subject: [PATCH 07/32] Java: Make ColumnVector.fromViewWithContiguousAllocation public (#16784) Exposes ColumnVector's fromViewWithContiguousAllocation method so code outside of cudf that builds contiguous table views can expose those columns in Java. 
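As a rough illustration of the intended call pattern (a sketch only, not part of this
change): the two helper calls below are hypothetical stand-ins for whatever custom JNI
code actually builds the contiguous table view and owns its backing device allocation.

```java
// Hypothetical helpers: `nativeViewAddress` must point at a valid native column_view
// whose data lives inside `contiguousBuffer`; neither helper is a cudf API.
long nativeViewAddress = MyJni.buildContiguousColumnView();
DeviceMemoryBuffer contiguousBuffer = MyJni.getBackingDeviceBuffer();
try (ColumnVector cv =
       ColumnVector.fromViewWithContiguousAllocation(nativeViewAddress, contiguousBuffer)) {
  // cv can now be used like any other cudf Java column
}
```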
Authors: - Jason Lowe (https://github.com/jlowe) Approvers: - Alessandro Bellina (https://github.com/abellina) URL: https://github.com/rapidsai/cudf/pull/16784 --- java/src/main/java/ai/rapids/cudf/ColumnVector.java | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java index 5a0fbd224ad..6a0f0f6f169 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java @@ -218,7 +218,13 @@ static long initViewHandle(DType type, int numRows, int nullCount, od, vd, nullCount, numRows, childHandles); } - static ColumnVector fromViewWithContiguousAllocation(long columnViewAddress, DeviceMemoryBuffer buffer) { + /** + * Creates a ColumnVector from a native column_view using a contiguous device allocation. + * + * @param columnViewAddress address of the native column_view + * @param buffer device buffer containing the data referenced by the column view + */ + public static ColumnVector fromViewWithContiguousAllocation(long columnViewAddress, DeviceMemoryBuffer buffer) { return new ColumnVector(columnViewAddress, buffer); } From 86861e08d9f7b1ae0a61d6b05bbfc6690107ca0f Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Mon, 16 Sep 2024 19:14:18 -0500 Subject: [PATCH 08/32] Fix `cov`/`corr` bug in dask-cudf (#16786) Closes https://github.com/rapidsai/cudf/issues/14935 Overrides `_prepare_cov_corr` method to avoid cudf compatibility issues in dask-cudf. Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16786 --- python/dask_cudf/dask_cudf/expr/_collection.py | 18 +++++++++++++++++- python/dask_cudf/dask_cudf/tests/test_core.py | 17 +++++++++++++++++ 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/python/dask_cudf/dask_cudf/expr/_collection.py b/python/dask_cudf/dask_cudf/expr/_collection.py index f60e4ff81ef..97e1dffc65b 100644 --- a/python/dask_cudf/dask_cudf/expr/_collection.py +++ b/python/dask_cudf/dask_cudf/expr/_collection.py @@ -49,8 +49,24 @@ def to_dask_dataframe(self, **kwargs): return self.to_backend("pandas", **kwargs) + def _prepare_cov_corr(self, min_periods, numeric_only): + # Upstream version of this method sets min_periods + # to 2 by default (which is not supported by cudf) + # TODO: Remove when cudf supports both min_periods + # and numeric_only + # See: https://github.com/rapidsai/cudf/issues/12626 + # See: https://github.com/rapidsai/cudf/issues/9009 + self._meta.cov(min_periods=min_periods) + + frame = self + if numeric_only: + numerics = self._meta._get_numeric_data() + if len(numerics.columns) != len(self.columns): + frame = frame[list(numerics.columns)] + return frame, min_periods + # var can be removed if cudf#15179 is addressed. 
- # See: https://github.com/rapidsai/cudf/issues/15179 + # See: https://github.com/rapidsai/cudf/issues/14935 def var( self, axis=0, diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index 905d8c08135..7aa0f6320f2 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -1007,3 +1007,20 @@ def test_to_backend_simplify(): df2 = df.to_backend("cudf")[["y"]].simplify() df3 = df[["y"]].to_backend("cudf").to_backend("cudf").simplify() assert df2._name == df3._name + + +@pytest.mark.parametrize("numeric_only", [True, False]) +@pytest.mark.parametrize("op", ["corr", "cov"]) +def test_cov_corr(op, numeric_only): + df = cudf.DataFrame.from_dict( + { + "x": np.random.randint(0, 5, size=10), + "y": np.random.normal(size=10), + } + ) + ddf = dd.from_pandas(df, npartitions=2) + res = getattr(ddf, op)(numeric_only=numeric_only) + # Use to_pandas until cudf supports numeric_only + # (See: https://github.com/rapidsai/cudf/issues/12626) + expect = getattr(df.to_pandas(), op)(numeric_only=numeric_only) + dd.assert_eq(res, expect) From f8d50639fffb541dee3b860c19756af2c4a5a850 Mon Sep 17 00:00:00 2001 From: Paul Mattione <156858817+pmattione-nvidia@users.noreply.github.com> Date: Mon, 16 Sep 2024 21:27:38 -0400 Subject: [PATCH 09/32] Add ability to set parquet row group max #rows and #bytes in java (#16805) Adds the ability to set the max # rows per row group and max # bytes per row group for parquet files. Authors: - Paul Mattione (https://github.com/pmattione-nvidia) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) - Muhammad Haseeb (https://github.com/mhaseeb123) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/16805 --- .../ai/rapids/cudf/ParquetWriterOptions.java | 26 ++++++- java/src/main/java/ai/rapids/cudf/Table.java | 68 +++++++++++-------- java/src/main/native/src/TableJni.cpp | 8 +++ .../test/java/ai/rapids/cudf/TableTest.java | 8 ++- 4 files changed, 80 insertions(+), 30 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ParquetWriterOptions.java b/java/src/main/java/ai/rapids/cudf/ParquetWriterOptions.java index 7b58817550d..8c8180436e6 100644 --- a/java/src/main/java/ai/rapids/cudf/ParquetWriterOptions.java +++ b/java/src/main/java/ai/rapids/cudf/ParquetWriterOptions.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
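A minimal usage sketch of the two builder knobs added below (not part of the patch
itself; the values are arbitrary and the column/schema configuration is elided):

```java
// Hypothetical sketch: close each Parquet row group once it reaches
// 100k rows or ~64MB of data, whichever limit is hit first.
ParquetWriterOptions options = ParquetWriterOptions.builder()
    .withRowGroupSizeRows(100_000)
    .withRowGroupSizeBytes(64L * 1024 * 1024)
    .withCompressionType(CompressionType.NONE)
    .build();
try (TableWriter writer = Table.writeParquetChunked(options, new File("/tmp/out.parquet"))) {
  writer.write(table); // `table` is an existing ai.rapids.cudf.Table
}
```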
@@ -24,9 +24,13 @@ */ public final class ParquetWriterOptions extends CompressionMetadataWriterOptions { private final StatisticsFrequency statsGranularity; + private int rowGroupSizeRows; + private long rowGroupSizeBytes; private ParquetWriterOptions(Builder builder) { super(builder); + this.rowGroupSizeRows = builder.rowGroupSizeRows; + this.rowGroupSizeBytes = builder.rowGroupSizeBytes; this.statsGranularity = builder.statsGranularity; } @@ -51,18 +55,38 @@ public static Builder builder() { return new Builder(); } + public int getRowGroupSizeRows() { + return rowGroupSizeRows; + } + + public long getRowGroupSizeBytes() { + return rowGroupSizeBytes; + } + public StatisticsFrequency getStatisticsFrequency() { return statsGranularity; } public static class Builder extends CompressionMetadataWriterOptions.Builder { + private int rowGroupSizeRows = 1000000; //Max of 1 million rows per row group + private long rowGroupSizeBytes = 128 * 1024 * 1024; //Max of 128MB per row group private StatisticsFrequency statsGranularity = StatisticsFrequency.ROWGROUP; public Builder() { super(); } + public Builder withRowGroupSizeRows(int rowGroupSizeRows) { + this.rowGroupSizeRows = rowGroupSizeRows; + return this; + } + + public Builder withRowGroupSizeBytes(long rowGroupSizeBytes) { + this.rowGroupSizeBytes = rowGroupSizeBytes; + return this; + } + public Builder withStatisticsFrequency(StatisticsFrequency statsGranularity) { this.statsGranularity = statsGranularity; return this; diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index cbb126d7ee5..09da43374ae 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -332,20 +332,22 @@ private static native long[] readAvroFromDataSource(String[] filterColumnNames, /** * Setup everything to write parquet formatted data to a file. 
- * @param columnNames names that correspond to the table columns - * @param numChildren Children of the top level - * @param flatNumChildren flattened list of children per column - * @param nullable true if the column can have nulls else false - * @param metadataKeys Metadata key names to place in the Parquet file - * @param metadataValues Metadata values corresponding to metadataKeys - * @param compression native compression codec ID - * @param statsFreq native statistics frequency ID - * @param isInt96 true if timestamp type is int96 - * @param precisions precision list containing all the precisions of the decimal types in - * the columns - * @param isMapValues true if a column is a map - * @param isBinaryValues true if a column is a binary - * @param filename local output path + * @param columnNames names that correspond to the table columns + * @param numChildren Children of the top level + * @param flatNumChildren flattened list of children per column + * @param nullable true if the column can have nulls else false + * @param metadataKeys Metadata key names to place in the Parquet file + * @param metadataValues Metadata values corresponding to metadataKeys + * @param compression native compression codec ID + * @param rowGroupSizeRows max #rows in a row group + * @param rowGroupSizeBytes max #bytes in a row group + * @param statsFreq native statistics frequency ID + * @param isInt96 true if timestamp type is int96 + * @param precisions precision list containing all the precisions of the decimal types in + * the columns + * @param isMapValues true if a column is a map + * @param isBinaryValues true if a column is a binary + * @param filename local output path * @return a handle that is used in later calls to writeParquetChunk and writeParquetEnd. */ private static native long writeParquetFileBegin(String[] columnNames, @@ -355,6 +357,8 @@ private static native long writeParquetFileBegin(String[] columnNames, String[] metadataKeys, String[] metadataValues, int compression, + int rowGroupSizeRows, + long rowGroupSizeBytes, int statsFreq, boolean[] isInt96, int[] precisions, @@ -366,20 +370,22 @@ private static native long writeParquetFileBegin(String[] columnNames, /** * Setup everything to write parquet formatted data to a buffer. - * @param columnNames names that correspond to the table columns - * @param numChildren Children of the top level - * @param flatNumChildren flattened list of children per column - * @param nullable true if the column can have nulls else false - * @param metadataKeys Metadata key names to place in the Parquet file - * @param metadataValues Metadata values corresponding to metadataKeys - * @param compression native compression codec ID - * @param statsFreq native statistics frequency ID - * @param isInt96 true if timestamp type is int96 - * @param precisions precision list containing all the precisions of the decimal types in - * the columns - * @param isMapValues true if a column is a map - * @param isBinaryValues true if a column is a binary - * @param consumer consumer of host buffers produced. 
+ * @param columnNames names that correspond to the table columns + * @param numChildren Children of the top level + * @param flatNumChildren flattened list of children per column + * @param nullable true if the column can have nulls else false + * @param metadataKeys Metadata key names to place in the Parquet file + * @param metadataValues Metadata values corresponding to metadataKeys + * @param compression native compression codec ID + * @param rowGroupSizeRows max #rows in a row group + * @param rowGroupSizeBytes max #bytes in a row group + * @param statsFreq native statistics frequency ID + * @param isInt96 true if timestamp type is int96 + * @param precisions precision list containing all the precisions of the decimal types in + * the columns + * @param isMapValues true if a column is a map + * @param isBinaryValues true if a column is a binary + * @param consumer consumer of host buffers produced. * @return a handle that is used in later calls to writeParquetChunk and writeParquetEnd. */ private static native long writeParquetBufferBegin(String[] columnNames, @@ -389,6 +395,8 @@ private static native long writeParquetBufferBegin(String[] columnNames, String[] metadataKeys, String[] metadataValues, int compression, + int rowGroupSizeRows, + long rowGroupSizeBytes, int statsFreq, boolean[] isInt96, int[] precisions, @@ -1820,6 +1828,8 @@ private ParquetTableWriter(ParquetWriterOptions options, File outputFile) { options.getMetadataKeys(), options.getMetadataValues(), options.getCompressionType().nativeId, + options.getRowGroupSizeRows(), + options.getRowGroupSizeBytes(), options.getStatisticsFrequency().nativeId, options.getFlatIsTimeTypeInt96(), options.getFlatPrecision(), @@ -1840,6 +1850,8 @@ private ParquetTableWriter(ParquetWriterOptions options, HostBufferConsumer cons options.getMetadataKeys(), options.getMetadataValues(), options.getCompressionType().nativeId, + options.getRowGroupSizeRows(), + options.getRowGroupSizeBytes(), options.getStatisticsFrequency().nativeId, options.getFlatIsTimeTypeInt96(), options.getFlatPrecision(), diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 40a111209b0..92e213bcb60 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -2150,6 +2150,8 @@ Java_ai_rapids_cudf_Table_writeParquetBufferBegin(JNIEnv* env, jobjectArray j_metadata_keys, jobjectArray j_metadata_values, jint j_compression, + jint j_row_group_size_rows, + jlong j_row_group_size_bytes, jint j_stats_freq, jbooleanArray j_isInt96, jintArray j_precisions, @@ -2205,6 +2207,8 @@ Java_ai_rapids_cudf_Table_writeParquetBufferBegin(JNIEnv* env, chunked_parquet_writer_options::builder(sink) .metadata(std::move(metadata)) .compression(static_cast(j_compression)) + .row_group_size_rows(j_row_group_size_rows) + .row_group_size_bytes(j_row_group_size_bytes) .stats_level(static_cast(j_stats_freq)) .key_value_metadata({kv_metadata}) .compression_statistics(stats) @@ -2227,6 +2231,8 @@ Java_ai_rapids_cudf_Table_writeParquetFileBegin(JNIEnv* env, jobjectArray j_metadata_keys, jobjectArray j_metadata_values, jint j_compression, + jint j_row_group_size_rows, + jlong j_row_group_size_bytes, jint j_stats_freq, jbooleanArray j_isInt96, jintArray j_precisions, @@ -2280,6 +2286,8 @@ Java_ai_rapids_cudf_Table_writeParquetFileBegin(JNIEnv* env, chunked_parquet_writer_options::builder(sink) .metadata(std::move(metadata)) .compression(static_cast(j_compression)) + .row_group_size_rows(j_row_group_size_rows) + 
.row_group_size_bytes(j_row_group_size_bytes) .stats_level(static_cast(j_stats_freq)) .key_value_metadata({kv_metadata}) .compression_statistics(stats) diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 56fe63598d9..830f2b33b32 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -9122,7 +9122,11 @@ void testParquetWriteToBufferChunked() { columns.add(Columns.STRUCT.name); WriteUtils.buildWriterOptions(optBuilder, columns); ParquetWriterOptions options = optBuilder.build(); - ParquetWriterOptions optionsNoCompress = optBuilder.withCompressionType(CompressionType.NONE).build(); + ParquetWriterOptions optionsNoCompress = + optBuilder.withCompressionType(CompressionType.NONE) + .withRowGroupSizeRows(10000) + .withRowGroupSizeBytes(10000) + .build(); try (Table table0 = getExpectedFileTable(columns); MyBufferConsumer consumer = new MyBufferConsumer()) { try (TableWriter writer = Table.writeParquetChunked(options, consumer)) { @@ -9208,6 +9212,8 @@ void testParquetWriteToFileUncompressedNoStats() throws IOException { .withDecimalColumn("_c7", 4) .withDecimalColumn("_c8", 6) .withCompressionType(CompressionType.NONE) + .withRowGroupSizeRows(10000) + .withRowGroupSizeBytes(10000) .withStatisticsFrequency(ParquetWriterOptions.StatisticsFrequency.NONE) .build(); try (TableWriter writer = Table.writeParquetChunked(options, tempFile.getAbsoluteFile())) { From 7285efbeee12fa7f327933bcf6a52726bfa07790 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 16 Sep 2024 18:41:27 -1000 Subject: [PATCH 10/32] Support drop_first in get_dummies (#16795) closes #16791 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/16795 --- python/cudf/cudf/core/reshape.py | 11 +++++++---- python/cudf/cudf/tests/test_onehot.py | 17 +++++++++++++++++ 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 3d205957126..c026579b8b5 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -738,7 +738,8 @@ def get_dummies( sparse : boolean, optional Right now this is NON-FUNCTIONAL argument in rapids. drop_first : boolean, optional - Right now this is NON-FUNCTIONAL argument in rapids. + Whether to get k-1 dummies out of k categorical levels by removing the + first level. columns : sequence of str, optional Names of columns to encode. If not provided, will attempt to encode all columns. 
Note this is different from pandas default behavior, which @@ -806,9 +807,6 @@ def get_dummies( if sparse: raise NotImplementedError("sparse is not supported yet") - if drop_first: - raise NotImplementedError("drop_first is not supported yet") - if isinstance(data, cudf.DataFrame): encode_fallback_dtypes = ["object", "category"] @@ -862,6 +860,7 @@ def get_dummies( prefix=prefix_map.get(name, prefix), prefix_sep=prefix_sep_map.get(name, prefix_sep), dtype=dtype, + drop_first=drop_first, ) result_data.update(col_enc_data) return cudf.DataFrame._from_data(result_data, index=data.index) @@ -874,6 +873,7 @@ def get_dummies( prefix=prefix, prefix_sep=prefix_sep, dtype=dtype, + drop_first=drop_first, ) return cudf.DataFrame._from_data(data, index=ser.index) @@ -1256,6 +1256,7 @@ def _one_hot_encode_column( prefix: str | None, prefix_sep: str | None, dtype: Dtype | None, + drop_first: bool, ) -> dict[str, ColumnBase]: """Encode a single column with one hot encoding. The return dictionary contains pairs of (category, encodings). The keys may be prefixed with @@ -1276,6 +1277,8 @@ def _one_hot_encode_column( ) data = one_hot_encode(column, categories) + if drop_first and len(data): + data.pop(next(iter(data))) if prefix is not None and prefix_sep is not None: data = {f"{prefix}{prefix_sep}{col}": enc for col, enc in data.items()} if dtype: diff --git a/python/cudf/cudf/tests/test_onehot.py b/python/cudf/cudf/tests/test_onehot.py index cc17dc46e0a..e054143b438 100644 --- a/python/cudf/cudf/tests/test_onehot.py +++ b/python/cudf/cudf/tests/test_onehot.py @@ -161,3 +161,20 @@ def test_get_dummies_cats_deprecated(): df = cudf.DataFrame(range(3)) with pytest.warns(FutureWarning): cudf.get_dummies(df, cats={0: [0, 1, 2]}) + + +def test_get_dummies_drop_first_series(): + result = cudf.get_dummies(cudf.Series(list("abcaa")), drop_first=True) + expected = pd.get_dummies(pd.Series(list("abcaa")), drop_first=True) + assert_eq(result, expected) + + +def test_get_dummies_drop_first_dataframe(): + result = cudf.get_dummies( + cudf.DataFrame({"A": list("abcaa"), "B": list("bcaab")}), + drop_first=True, + ) + expected = pd.get_dummies( + pd.DataFrame({"A": list("abcaa"), "B": list("bcaab")}), drop_first=True + ) + assert_eq(result, expected) From 27c29ebd81864d1662dd8a3e8e807955bd8fd9c5 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 17 Sep 2024 09:17:43 -0500 Subject: [PATCH 11/32] Use cupy 12.2.0 as oldest dependency pinning on CUDA 12 ARM (#16808) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Uses cupy 12.2.0 as oldest dependency pinning on ARM to ensure CUDA 12 support. This will fix nightly CI failures that look like: ``` LibMambaUnsatisfiableError: Encountered problems while solving: - package cupy-12.0.0-py311h308989c_2 requires python_abi 3.11.* *_cp311, but none of the providers can be installed Could not solve for environment specs The following packages are incompatible ├─ cuda-version 12.2** is installable and it requires │ └─ cudatoolkit 12.2|12.2.* , which can be installed; ├─ cupy 12.0.0 is installable with the potential options │ ├─ cupy 12.0.0 would require │ │ └─ cudatoolkit >=11.2,<12 , which conflicts with any installable versions previously reported; ... 
```

Authors:
   - Bradley Dice (https://github.com/bdice)

Approvers:
   - Kyle Edwards (https://github.com/KyleFromNVIDIA)

URL: https://github.com/rapidsai/cudf/pull/16808
---
 dependencies.yaml | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/dependencies.yaml b/dependencies.yaml
index 483335c02ff..7a13043cc5f 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -710,7 +710,16 @@ dependencies:
           - numpy==1.23.*
           - pandas==2.0.*
           - pyarrow==14.0.0
-          - cupy==12.0.0  # ignored as pip constraint
+      - matrix:
+        packages:
+  - output_types: conda
+    matrices:
+      - matrix: {dependencies: "oldest", arch: "aarch64", cuda: "12.*"}
+        packages:
+          - cupy==12.2.0  # cupy 12.2.0 is the earliest with CUDA 12 ARM packages.
+      - matrix: {dependencies: "oldest"}
+        packages:
+          - cupy==12.0.0
       - matrix:
         packages:
   - output_types: requirements

From 23351aa15f5334b7582c53d4cb6b7421c5c2fd74 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 17 Sep 2024 13:14:32 -0400
Subject: [PATCH 12/32] Word-based nvtext::minhash function (#15368)

Experimental implementation for #15055
The input is a lists column of strings where each string in each row is
treated as a word to be hashed. The minimum hash for that row is returned
in a lists column where each row contains a minhash per input hash seed.
Here the caller is expected to produce the words to be hashed.
```
std::unique_ptr<cudf::column> word_minhash(
  cudf::lists_column_view const& input,
  cudf::device_span<uint32_t const> seeds,
  rmm::cuda_stream_view stream,
  rmm::device_async_resource_ref mr);
```

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Nghia Truong (https://github.com/ttnghia)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15368
---
 cpp/benchmarks/CMakeLists.txt                 |   2 +-
 cpp/benchmarks/text/word_minhash.cpp          |  77 +++++++++
 cpp/include/nvtext/minhash.hpp                |  61 +++++++-
 cpp/src/text/minhash.cu                       | 147 +++++++++++++++++-
 cpp/tests/text/minhash_tests.cpp              |  35 +++++
 python/cudf/cudf/_lib/nvtext/minhash.pyx      |  38 +++++
 python/cudf/cudf/_lib/strings/__init__.py     |   9 +-
 python/cudf/cudf/core/column/string.py        |  70 +++++++++
 .../cudf/cudf/tests/text/test_text_methods.py |  60 +++++++
 .../pylibcudf/libcudf/nvtext/minhash.pxd      |  10 ++
 create mode 100644 cpp/benchmarks/text/word_minhash.cpp

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 3bf9d02b384..6c5f4a68a4c 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -337,7 +337,7 @@ ConfigureBench(TEXT_BENCH text/ngrams.cpp text/subword.cpp)

 ConfigureNVBench(
   TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp
-  text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp
+  text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp text/word_minhash.cpp
 )

 # ##################################################################################################
diff --git a/cpp/benchmarks/text/word_minhash.cpp b/cpp/benchmarks/text/word_minhash.cpp
new file mode 100644
index 00000000000..adc3dddc59c
--- /dev/null
+++ b/cpp/benchmarks/text/word_minhash.cpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include + +#include + +#include + +#include + +static void bench_word_minhash(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + auto const seed_count = static_cast(state.get_int64("seed_count")); + auto const base64 = state.get_int64("hash_type") == 64; + + data_profile const strings_profile = + data_profile_builder().distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, 5); + auto strings_table = + create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile); + + auto const num_offsets = (num_rows / row_width) + 1; + auto offsets = cudf::sequence(num_offsets, + cudf::numeric_scalar(0), + cudf::numeric_scalar(row_width)); + + auto source = cudf::make_lists_column(num_offsets - 1, + std::move(offsets), + std::move(strings_table->release().front()), + 0, + rmm::device_buffer{}); + + data_profile const seeds_profile = data_profile_builder().no_validity().distribution( + cudf::type_to_id(), distribution_id::NORMAL, 0, 256); + auto const seed_type = base64 ? cudf::type_id::UINT64 : cudf::type_id::UINT32; + auto const seeds_table = create_random_table({seed_type}, row_count{seed_count}, seeds_profile); + auto seeds = seeds_table->get_column(0); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + + cudf::strings_column_view input(cudf::lists_column_view(source->view()).child()); + auto chars_size = input.chars_size(cudf::get_default_stream()); + state.add_global_memory_reads(chars_size); + state.add_global_memory_writes(num_rows); // output are hashes + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = base64 ? 
nvtext::word_minhash64(source->view(), seeds.view()) + : nvtext::word_minhash(source->view(), seeds.view()); + }); +} + +NVBENCH_BENCH(bench_word_minhash) + .set_name("word_minhash") + .add_int64_axis("num_rows", {131072, 262144, 524288, 1048576, 2097152}) + .add_int64_axis("row_width", {10, 100, 1000}) + .add_int64_axis("seed_count", {2, 25}) + .add_int64_axis("hash_type", {32, 64}); diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp index c83a4260c19..7c909f1a948 100644 --- a/cpp/include/nvtext/minhash.hpp +++ b/cpp/include/nvtext/minhash.hpp @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -72,7 +73,7 @@ std::unique_ptr minhash( * * @throw std::invalid_argument if the width < 2 * @throw std::invalid_argument if seeds is empty - * @throw std::overflow_error if `seeds * input.size()` exceeds the column size limit + * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit * * @param input Strings column to compute minhash * @param seeds Seed values used for the hash algorithm @@ -133,7 +134,7 @@ std::unique_ptr minhash64( * * @throw std::invalid_argument if the width < 2 * @throw std::invalid_argument if seeds is empty - * @throw std::overflow_error if `seeds * input.size()` exceeds the column size limit + * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit * * @param input Strings column to compute minhash * @param seeds Seed values used for the hash algorithm @@ -150,5 +151,61 @@ std::unique_ptr minhash64( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** + * @brief Returns the minhash values for each row of strings per seed + * + * Hash values are computed from each string in each row and the + * minimum hash value is returned for each row for each seed. + * Each row of the output list column are seed results for the corresponding + * input row. The order of the elements in each row match the order of + * the seeds provided in the `seeds` parameter. + * + * This function uses MurmurHash3_x86_32 for the hash algorithm. + * + * Any null row entries result in corresponding null output rows. + * + * @throw std::invalid_argument if seeds is empty + * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit + * + * @param input Lists column of strings to compute minhash + * @param seeds Seed values used for the hash algorithm + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return List column of minhash values for each string per seed + */ +std::unique_ptr word_minhash( + cudf::lists_column_view const& input, + cudf::device_span seeds, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + +/** + * @brief Returns the minhash values for each row of strings per seed + * + * Hash values are computed from each string in each row and the + * minimum hash value is returned for each row for each seed. + * Each row of the output list column are seed results for the corresponding + * input row. The order of the elements in each row match the order of + * the seeds provided in the `seeds` parameter. 
+ * + * This function uses MurmurHash3_x64_128 for the hash algorithm though + * only the first 64-bits of the hash are used in computing the output. + * + * Any null row entries result in corresponding null output rows. + * + * @throw std::invalid_argument if seeds is empty + * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit + * + * @param input Lists column of strings to compute minhash + * @param seeds Seed values used for the hash algorithm + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return List column of minhash values for each string per seed + */ +std::unique_ptr word_minhash64( + cudf::lists_column_view const& input, + cudf::device_span seeds, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @} */ // end of group } // namespace CUDF_EXPORT nvtext diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index 605582f28a6..a03a34f5fa7 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -25,6 +25,8 @@ #include #include #include +#include +#include #include #include #include @@ -151,15 +153,111 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, mr); auto d_hashes = hashes->mutable_view().data(); - constexpr int block_size = 256; - cudf::detail::grid_1d grid{input.size() * cudf::detail::warp_size, block_size}; + constexpr cudf::thread_index_type block_size = 256; + cudf::detail::grid_1d grid{ + static_cast(input.size()) * cudf::detail::warp_size, block_size}; minhash_kernel<<>>( *d_strings, seeds, width, d_hashes); return hashes; } -std::unique_ptr build_list_result(cudf::strings_column_view const& input, +/** + * @brief Compute the minhash of each list row of strings for each seed + * + * This is a warp-per-row algorithm where parallel threads within a warp + * work on strings in a single list row. + * + * @tparam HashFunction hash function to use on each string + * + * @param d_input List of strings to process + * @param seeds Seeds for hashing each string + * @param d_hashes Minhash output values (one per row) + */ +template < + typename HashFunction, + typename hash_value_type = std:: + conditional_t, uint32_t, uint64_t>> +CUDF_KERNEL void minhash_word_kernel(cudf::detail::lists_column_device_view const d_input, + cudf::device_span seeds, + hash_value_type* d_hashes) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + auto const row_idx = idx / cudf::detail::warp_size; + + if (row_idx >= d_input.size()) { return; } + if (d_input.is_null(row_idx)) { return; } + + auto const d_row = cudf::list_device_view(d_input, row_idx); + auto const d_output = d_hashes + (row_idx * seeds.size()); + + // initialize hashes output for this row + auto const lane_idx = static_cast(idx % cudf::detail::warp_size); + if (lane_idx == 0) { + auto const init = d_row.size() == 0 ? 0 : std::numeric_limits::max(); + thrust::fill(thrust::seq, d_output, d_output + seeds.size(), init); + } + __syncwarp(); + + // each lane hashes a string from the input row + for (auto str_idx = lane_idx; str_idx < d_row.size(); str_idx += cudf::detail::warp_size) { + auto const hash_str = + d_row.is_null(str_idx) ? 
cudf::string_view{} : d_row.element(str_idx); + for (std::size_t seed_idx = 0; seed_idx < seeds.size(); ++seed_idx) { + auto const hasher = HashFunction(seeds[seed_idx]); + // hash string and store the min value + hash_value_type hv; + if constexpr (std::is_same_v) { + hv = hasher(hash_str); + } else { + // This code path assumes the use of MurmurHash3_x64_128 which produces 2 uint64 values + // but only uses the first uint64 value as requested by the LLM team. + hv = thrust::get<0>(hasher(hash_str)); + } + cuda::atomic_ref ref{*(d_output + seed_idx)}; + ref.fetch_min(hv, cuda::std::memory_order_relaxed); + } + } +} + +template < + typename HashFunction, + typename hash_value_type = std:: + conditional_t, uint32_t, uint64_t>> +std::unique_ptr word_minhash_fn(cudf::lists_column_view const& input, + cudf::device_span seeds, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_EXPECTS(!seeds.empty(), "Parameter seeds cannot be empty", std::invalid_argument); + CUDF_EXPECTS((static_cast(input.size()) * seeds.size()) < + static_cast(std::numeric_limits::max()), + "The number of seeds times the number of input rows exceeds the column size limit", + std::overflow_error); + + auto const output_type = cudf::data_type{cudf::type_to_id()}; + if (input.is_empty()) { return cudf::make_empty_column(output_type); } + + auto const d_input = cudf::column_device_view::create(input.parent(), stream); + + auto hashes = cudf::make_numeric_column(output_type, + input.size() * static_cast(seeds.size()), + cudf::mask_state::UNALLOCATED, + stream, + mr); + auto d_hashes = hashes->mutable_view().data(); + auto lcdv = cudf::detail::lists_column_device_view(*d_input); + + constexpr cudf::thread_index_type block_size = 256; + cudf::detail::grid_1d grid{ + static_cast(input.size()) * cudf::detail::warp_size, block_size}; + minhash_word_kernel + <<>>(lcdv, seeds, d_hashes); + + return hashes; +} + +std::unique_ptr build_list_result(cudf::column_view const& input, std::unique_ptr&& hashes, cudf::size_type seeds_size, rmm::cuda_stream_view stream, @@ -176,7 +274,7 @@ std::unique_ptr build_list_result(cudf::strings_column_view const& std::move(offsets), std::move(hashes), input.null_count(), - cudf::detail::copy_bitmask(input.parent(), stream, mr), + cudf::detail::copy_bitmask(input, stream, mr), stream, mr); // expect this condition to be very rare @@ -208,7 +306,7 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, { using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32; auto hashes = detail::minhash_fn(input, seeds, width, stream, mr); - return build_list_result(input, std::move(hashes), seeds.size(), stream, mr); + return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); } std::unique_ptr minhash64(cudf::strings_column_view const& input, @@ -232,7 +330,27 @@ std::unique_ptr minhash64(cudf::strings_column_view const& input, { using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128; auto hashes = detail::minhash_fn(input, seeds, width, stream, mr); - return build_list_result(input, std::move(hashes), seeds.size(), stream, mr); + return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); +} + +std::unique_ptr word_minhash(cudf::lists_column_view const& input, + cudf::device_span seeds, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32; + auto hashes = detail::word_minhash_fn(input, seeds, stream, mr); + return 
build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); +} + +std::unique_ptr word_minhash64(cudf::lists_column_view const& input, + cudf::device_span seeds, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128; + auto hashes = detail::word_minhash_fn(input, seeds, stream, mr); + return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); } } // namespace detail @@ -276,4 +394,21 @@ std::unique_ptr minhash64(cudf::strings_column_view const& input, return detail::minhash64(input, seeds, width, stream, mr); } +std::unique_ptr word_minhash(cudf::lists_column_view const& input, + cudf::device_span seeds, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::word_minhash(input, seeds, stream, mr); +} + +std::unique_ptr word_minhash64(cudf::lists_column_view const& input, + cudf::device_span seeds, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::word_minhash64(input, seeds, stream, mr); +} } // namespace nvtext diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp index 7575a3ba846..e23f3f6e7d8 100644 --- a/cpp/tests/text/minhash_tests.cpp +++ b/cpp/tests/text/minhash_tests.cpp @@ -139,6 +139,41 @@ TEST_F(MinHashTest, MultiSeedWithNullInputRow) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); } +TEST_F(MinHashTest, WordsMinHash) +{ + using LCWS = cudf::test::lists_column_wrapper; + auto validity = cudf::test::iterators::null_at(1); + + LCWS input( + {LCWS({"hello", "abcdéfgh"}), + LCWS{}, + LCWS({"rapids", "moré", "test", "text"}), + LCWS({"The", "quick", "brown", "fox", "jumpéd", "over", "the", "lazy", "brown", "dog"})}, + validity); + + auto view = cudf::lists_column_view(input); + + auto seeds = cudf::test::fixed_width_column_wrapper({1, 2}); + auto results = nvtext::word_minhash(view, cudf::column_view(seeds)); + using LCW32 = cudf::test::lists_column_wrapper; + LCW32 expected({LCW32{2069617641u, 1975382903u}, + LCW32{}, + LCW32{657297235u, 1010955999u}, + LCW32{644643885u, 310002789u}}, + validity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto seeds64 = cudf::test::fixed_width_column_wrapper({11, 22}); + auto results64 = nvtext::word_minhash64(view, cudf::column_view(seeds64)); + using LCW64 = cudf::test::lists_column_wrapper; + LCW64 expected64({LCW64{1940333969930105370ul, 272615362982418219ul}, + LCW64{}, + LCW64{5331949571924938590ul, 2088583894581919741ul}, + LCW64{3400468157617183341ul, 2398577492366130055ul}}, + validity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); +} + TEST_F(MinHashTest, EmptyTest) { auto input = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); diff --git a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx index 5ee15d0e409..59cb8d51440 100644 --- a/python/cudf/cudf/_lib/nvtext/minhash.pyx +++ b/python/cudf/cudf/_lib/nvtext/minhash.pyx @@ -10,6 +10,8 @@ from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.nvtext.minhash cimport ( minhash as cpp_minhash, minhash64 as cpp_minhash64, + word_minhash as cpp_word_minhash, + word_minhash64 as cpp_word_minhash64, ) from pylibcudf.libcudf.types cimport size_type @@ -54,3 +56,39 @@ def minhash64(Column strings, Column seeds, int width): ) return Column.from_unique_ptr(move(c_result)) + + +@acquire_spill_lock() +def 
word_minhash(Column input, Column seeds): + + cdef column_view c_input = input.view() + cdef column_view c_seeds = seeds.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_word_minhash( + c_input, + c_seeds + ) + ) + + return Column.from_unique_ptr(move(c_result)) + + +@acquire_spill_lock() +def word_minhash64(Column input, Column seeds): + + cdef column_view c_input = input.view() + cdef column_view c_seeds = seeds.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_word_minhash64( + c_input, + c_seeds + ) + ) + + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index 47a194c4fda..4bf8a9b1a8f 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ b/python/cudf/cudf/_lib/strings/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. from cudf._lib.nvtext.edit_distance import edit_distance, edit_distance_matrix from cudf._lib.nvtext.generate_ngrams import ( generate_character_ngrams, @@ -6,7 +6,12 @@ hash_character_ngrams, ) from cudf._lib.nvtext.jaccard import jaccard_index -from cudf._lib.nvtext.minhash import minhash, minhash64 +from cudf._lib.nvtext.minhash import ( + minhash, + minhash64, + word_minhash, + word_minhash64, +) from cudf._lib.nvtext.ngrams_tokenize import ngrams_tokenize from cudf._lib.nvtext.normalize import normalize_characters, normalize_spaces from cudf._lib.nvtext.replace import filter_tokens, replace_tokens diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 16e6908f308..e059917b0b8 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5349,6 +5349,76 @@ def minhash64( libstrings.minhash64(self._column, seeds_column, width) ) + def word_minhash(self, seeds: ColumnLike | None = None) -> SeriesOrIndex: + """ + Compute the minhash of a list column of strings. + This uses the MurmurHash3_x86_32 algorithm for the hash function. + + Parameters + ---------- + seeds : ColumnLike + The seeds used for the hash algorithm. + Must be of type uint32. + + Examples + -------- + >>> import cudf + >>> import numpy as np + >>> ls = cudf.Series([["this", "is", "my"], ["favorite", "book"]]) + >>> seeds = cudf.Series([0, 1, 2], dtype=np.uint32) + >>> ls.str.word_minhash(seeds=seeds) + 0 [21141582, 1232889953, 1268336794] + 1 [962346254, 2321233602, 1354839212] + dtype: list + """ + if seeds is None: + seeds_column = column.as_column(0, dtype=np.uint32, length=1) + else: + seeds_column = column.as_column(seeds) + if seeds_column.dtype != np.uint32: + raise ValueError( + f"Expecting a Series with dtype uint32, got {type(seeds)}" + ) + return self._return_or_inplace( + libstrings.word_minhash(self._column, seeds_column) + ) + + def word_minhash64(self, seeds: ColumnLike | None = None) -> SeriesOrIndex: + """ + Compute the minhash of a list column of strings. + This uses the MurmurHash3_x64_128 algorithm for the hash function. + This function generates 2 uint64 values but only the first + uint64 value is used. + + Parameters + ---------- + seeds : ColumnLike + The seeds used for the hash algorithm. + Must be of type uint64. 
+ + Examples + -------- + >>> import cudf + >>> import numpy as np + >>> ls = cudf.Series([["this", "is", "my"], ["favorite", "book"]]) + >>> seeds = cudf.Series([0, 1, 2], dtype=np.uint64) + >>> ls.str.word_minhash64(seeds) + 0 [2603139454418834912, 8644371945174847701, 5541030711534384340] + 1 [5240044617220523711, 5847101123925041457, 153762819128779913] + dtype: list + """ + if seeds is None: + seeds_column = column.as_column(0, dtype=np.uint64, length=1) + else: + seeds_column = column.as_column(seeds) + if seeds_column.dtype != np.uint64: + raise ValueError( + f"Expecting a Series with dtype uint64, got {type(seeds)}" + ) + return self._return_or_inplace( + libstrings.word_minhash64(self._column, seeds_column) + ) + def jaccard_index(self, input: cudf.Series, width: int) -> SeriesOrIndex: """ Compute the Jaccard index between this column and the given diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index 52179f55da3..997ca357986 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -946,6 +946,66 @@ def test_minhash(): strings.str.minhash64(seeds=seeds) +def test_word_minhash(): + ls = cudf.Series([["this", "is", "my"], ["favorite", "book"]]) + + expected = cudf.Series( + [ + cudf.Series([21141582], dtype=np.uint32), + cudf.Series([962346254], dtype=np.uint32), + ] + ) + actual = ls.str.word_minhash() + assert_eq(expected, actual) + seeds = cudf.Series([0, 1, 2], dtype=np.uint32) + expected = cudf.Series( + [ + cudf.Series([21141582, 1232889953, 1268336794], dtype=np.uint32), + cudf.Series([962346254, 2321233602, 1354839212], dtype=np.uint32), + ] + ) + actual = ls.str.word_minhash(seeds=seeds) + assert_eq(expected, actual) + + expected = cudf.Series( + [ + cudf.Series([2603139454418834912], dtype=np.uint64), + cudf.Series([5240044617220523711], dtype=np.uint64), + ] + ) + actual = ls.str.word_minhash64() + assert_eq(expected, actual) + seeds = cudf.Series([0, 1, 2], dtype=np.uint64) + expected = cudf.Series( + [ + cudf.Series( + [ + 2603139454418834912, + 8644371945174847701, + 5541030711534384340, + ], + dtype=np.uint64, + ), + cudf.Series( + [5240044617220523711, 5847101123925041457, 153762819128779913], + dtype=np.uint64, + ), + ] + ) + actual = ls.str.word_minhash64(seeds=seeds) + assert_eq(expected, actual) + + # test wrong seed types + with pytest.raises(ValueError): + ls.str.word_minhash(seeds="a") + with pytest.raises(ValueError): + seeds = cudf.Series([0, 1, 2], dtype=np.int32) + ls.str.word_minhash(seeds=seeds) + with pytest.raises(ValueError): + seeds = cudf.Series([0, 1, 2], dtype=np.uint32) + ls.str.word_minhash64(seeds=seeds) + + def test_jaccard_index(): str1 = cudf.Series(["the brown dog", "jumped about"]) str2 = cudf.Series(["the black cat", "jumped around"]) diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd index 0c352a5068b..f2dd22f43aa 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd @@ -19,3 +19,13 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: const column_view &seeds, const size_type width, ) except + + + cdef unique_ptr[column] word_minhash( + const column_view &input, + const column_view &seeds + ) except + + + cdef unique_ptr[column] word_minhash64( + const column_view &input, + const column_view &seeds + ) except + From e98e10981fc245a6837a51e9b6c2b933a5d7acd8 
Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 17 Sep 2024 13:19:40 -0400 Subject: [PATCH 13/32] Support multiple new-line characters in regex APIs (#15961) Add support for multiple new-line characters for BOL (`^` / `\A`) and EOL (`$` / `\Z`): - `\n` line-feed (already supported) - `\r` carriage-return - `\u0085` next line (NEL) - `\u2028` line separator - `\u2029` paragraph separator Reference #15746 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Nghia Truong (https://github.com/ttnghia) - Navin Kumar (https://github.com/NVnavkumar) URL: https://github.com/rapidsai/cudf/pull/15961 --- cpp/doxygen/regex.md | 6 +++ cpp/include/cudf/strings/regex/flags.hpp | 20 ++++++-- cpp/include/cudf/strings/string_view.cuh | 11 +++-- cpp/src/strings/regex/regcomp.cpp | 21 ++++++-- cpp/src/strings/regex/regex.inl | 46 +++++++++++++----- cpp/tests/strings/contains_tests.cpp | 59 +++++++++++++++++++++++ cpp/tests/strings/extract_tests.cpp | 40 +++++++++++++++ cpp/tests/strings/findall_tests.cpp | 28 +++++++++++ cpp/tests/strings/replace_regex_tests.cpp | 49 +++++++++++++++++++ cpp/tests/strings/special_chars.h | 25 ++++++++++ 10 files changed, 281 insertions(+), 24 deletions(-) create mode 100644 cpp/tests/strings/special_chars.h diff --git a/cpp/doxygen/regex.md b/cpp/doxygen/regex.md index 8d206f245dc..6d1c91a5752 100644 --- a/cpp/doxygen/regex.md +++ b/cpp/doxygen/regex.md @@ -17,6 +17,12 @@ The details are based on features documented at https://www.regular-expressions. **Note:** The alternation character is the pipe character `|` and not the character included in the tables on this page. There is an issue including the pipe character inside the table markdown that is rendered by doxygen. +By default, only the `\n` character is recognized as a line break. The [cudf::strings::regex_flags::EXT_NEWLINE](@ref cudf::strings::regex_flags) increases the set of line break characters to include: +- Paragraph separator (Unicode: `2029`, UTF-8: `E280A9`) +- Line separator (Unicode: `2028`, UTF-8: `E280A8`) +- Next line (Unicode: `0085`, UTF-8: `C285`) +- Carriage return (Unicode: `000D`, UTF-8: `0D`) + **Invalid regex patterns will result in undefined behavior**. This includes but is not limited to the following: - Unescaped special characters (listed in the third row of the Characters table below) when they are intended to match as literals. - Unmatched paired special characters like `()`, `[]`, and `{}`. diff --git a/cpp/include/cudf/strings/regex/flags.hpp b/cpp/include/cudf/strings/regex/flags.hpp index f7108129dee..4f3fc7086f2 100644 --- a/cpp/include/cudf/strings/regex/flags.hpp +++ b/cpp/include/cudf/strings/regex/flags.hpp @@ -35,10 +35,11 @@ namespace strings { * and to match the Python flag values. */ enum regex_flags : uint32_t { - DEFAULT = 0, ///< default - MULTILINE = 8, ///< the '^' and '$' honor new-line characters - DOTALL = 16, ///< the '.' matching includes new-line characters - ASCII = 256 ///< use only ASCII when matching built-in character classes + DEFAULT = 0, ///< default + MULTILINE = 8, ///< the '^' and '$' honor new-line characters + DOTALL = 16, ///< the '.' 
matching includes new-line characters + ASCII = 256, ///< use only ASCII when matching built-in character classes + EXT_NEWLINE = 512 ///< new-line matches extended characters }; /** @@ -74,6 +75,17 @@ constexpr bool is_ascii(regex_flags const f) return (f & regex_flags::ASCII) == regex_flags::ASCII; } +/** + * @brief Returns true if the given flags contain EXT_NEWLINE + * + * @param f Regex flags to check + * @return true if `f` includes EXT_NEWLINE + */ +constexpr bool is_ext_newline(regex_flags const f) +{ + return (f & regex_flags::EXT_NEWLINE) == regex_flags::EXT_NEWLINE; +} + /** * @brief Capture groups setting * diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index abb26d7ccb4..14695c3bb27 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -191,9 +191,14 @@ __device__ inline string_view::const_iterator& string_view::const_iterator::oper __device__ inline string_view::const_iterator& string_view::const_iterator::operator--() { - if (byte_pos > 0) - while (strings::detail::bytes_in_utf8_byte(static_cast(p[--byte_pos])) == 0) - ; + if (byte_pos > 0) { + if (byte_pos == char_pos) { + --byte_pos; + } else { + while (strings::detail::bytes_in_utf8_byte(static_cast(p[--byte_pos])) == 0) + ; + } + } --char_pos; return *this; } diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index adf650a4f27..7c4c89bd3fb 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -539,15 +539,26 @@ class regex_parser { : static_cast(LBRA); case ')': return RBRA; case '^': { - _chr = is_multiline(_flags) ? chr : '\n'; + if (is_ext_newline(_flags)) { + _chr = is_multiline(_flags) ? 'S' : 'N'; + } else { + _chr = is_multiline(_flags) ? chr : '\n'; + } return BOL; } case '$': { - _chr = is_multiline(_flags) ? chr : '\n'; + if (is_ext_newline(_flags)) { + _chr = is_multiline(_flags) ? 'S' : 'N'; + } else { + _chr = is_multiline(_flags) ? chr : '\n'; + } return EOL; } case '[': return build_cclass(); - case '.': return dot_type; + case '.': { + _chr = is_ext_newline(_flags) ? 
'N' : chr; + return dot_type; + } } if (std::find(quantifiers.begin(), quantifiers.end(), static_cast(chr)) == @@ -959,7 +970,7 @@ class regex_compiler { _prog.inst_at(inst_id).u1.cls_id = class_id; } else if (token == CHAR) { _prog.inst_at(inst_id).u1.c = yy; - } else if (token == BOL || token == EOL) { + } else if (token == BOL || token == EOL || token == ANY) { _prog.inst_at(inst_id).u1.c = yy; } push_and(inst_id, inst_id); @@ -1194,7 +1205,7 @@ void reprog::print(regex_flags const flags) case STAR: printf(" STAR next=%d", inst.u2.next_id); break; case PLUS: printf(" PLUS next=%d", inst.u2.next_id); break; case QUEST: printf(" QUEST next=%d", inst.u2.next_id); break; - case ANY: printf(" ANY next=%d", inst.u2.next_id); break; + case ANY: printf(" ANY '%c', next=%d", inst.u1.c, inst.u2.next_id); break; case ANYNL: printf(" ANYNL next=%d", inst.u2.next_id); break; case NOP: printf(" NOP next=%d", inst.u2.next_id); break; case BOL: { diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl index 3b899e4edc1..e34a1e12015 100644 --- a/cpp/src/strings/regex/regex.inl +++ b/cpp/src/strings/regex/regex.inl @@ -126,6 +126,16 @@ __device__ __forceinline__ void reprog_device::reljunk::swaplist() list2 = tmp; } +/** + * @brief Check for supported new-line characters + * + * '\n, \r, \u0085, \u2028, or \u2029' + */ +constexpr bool is_newline(char32_t const ch) +{ + return (ch == '\n' || ch == '\r' || ch == 0x00c285 || ch == 0x00e280a8 || ch == 0x00e280a9); +} + /** * @brief Utility to check a specific character against this class instance. * @@ -258,11 +268,14 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const if (checkstart) { auto startchar = static_cast(jnk.startchar); switch (jnk.starttype) { - case BOL: - if (pos == 0) break; - if (jnk.startchar != '^') { return cuda::std::nullopt; } + case BOL: { + if (pos == 0) { break; } + if (startchar != '^' && startchar != 'S') { return cuda::std::nullopt; } + if (startchar != '\n') { break; } --itr; startchar = static_cast('\n'); + [[fallthrough]]; + } case CHAR: { auto const find_itr = find_char(startchar, dstr, itr); if (find_itr.byte_offset() >= dstr.size_bytes()) { return cuda::std::nullopt; } @@ -312,26 +325,34 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const id_activate = inst.u2.next_id; expanded = true; break; - case BOL: - if ((pos == 0) || ((inst.u1.c == '^') && (dstr[pos - 1] == '\n'))) { + case BOL: { + auto titr = itr; + auto const prev_c = pos > 0 ? *(--titr) : 0; + if ((pos == 0) || ((inst.u1.c == '^') && (prev_c == '\n')) || + ((inst.u1.c == 'S') && (is_newline(prev_c)))) { id_activate = inst.u2.next_id; expanded = true; } break; - case EOL: + } + case EOL: { // after the last character OR: // - for MULTILINE, if current character is new-line // - for non-MULTILINE, the very last character of the string can also be a new-line + bool const nl = (inst.u1.c == 'S' || inst.u1.c == 'N') ? is_newline(c) : (c == '\n'); if (last_character || - ((c == '\n') && (inst.u1.c != 'Z') && - ((inst.u1.c == '$') || (itr.byte_offset() + 1 == dstr.size_bytes())))) { + (nl && (inst.u1.c != 'Z') && + ((inst.u1.c == '$' || inst.u1.c == 'S') || + (itr.byte_offset() + bytes_in_char_utf8(c) == dstr.size_bytes())))) { id_activate = inst.u2.next_id; expanded = true; } break; + } case BOW: case NBOW: { - auto const prev_c = pos > 0 ? dstr[pos - 1] : 0; + auto titr = itr; + auto const prev_c = pos > 0 ? 
*(--titr) : 0; auto const word_class = reclass_device{CCLASS_W}; bool const curr_is_word = word_class.is_match(c, _codepoint_flags); bool const prev_is_word = word_class.is_match(prev_c, _codepoint_flags); @@ -366,9 +387,10 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const case CHAR: if (inst.u1.c == c) id_activate = inst.u2.next_id; break; - case ANY: - if (c != '\n') id_activate = inst.u2.next_id; - break; + case ANY: { + if ((c == '\n') || ((inst.u1.c == 'N') && is_newline(c))) { break; } + [[fallthrough]]; + } case ANYNL: id_activate = inst.u2.next_id; break; case NCCLASS: case CCLASS: { diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index c816316d0ff..acf850c7a66 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "special_chars.h" + #include #include #include @@ -613,6 +615,63 @@ TEST_F(StringsContainsTests, MultiLine) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); } +TEST_F(StringsContainsTests, SpecialNewLines) +{ + auto input = cudf::test::strings_column_wrapper({"zzé" LINE_SEPARATOR "qqq" NEXT_LINE "zzé", + "qqq\rzzé" LINE_SEPARATOR "lll", + "zzé", + "", + "zzé" PARAGRAPH_SEPARATOR, + "abc\nzzé" NEXT_LINE}); + auto view = cudf::strings_column_view(input); + + auto pattern = std::string("^zzé$"); + auto prog = + cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::EXT_NEWLINE); + auto ml_flags = static_cast(cudf::strings::regex_flags::EXT_NEWLINE | + cudf::strings::regex_flags::MULTILINE); + auto prog_ml = cudf::strings::regex_program::create(pattern, ml_flags); + + auto expected = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1, 0}); + auto results = cudf::strings::contains_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + expected = cudf::test::fixed_width_column_wrapper({1, 1, 1, 0, 1, 1}); + results = cudf::strings::contains_re(view, *prog_ml); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + expected = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1, 0}); + results = cudf::strings::matches_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + expected = cudf::test::fixed_width_column_wrapper({1, 0, 1, 0, 1, 0}); + results = cudf::strings::matches_re(view, *prog_ml); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto counts = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1, 0}); + results = cudf::strings::count_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, counts); + counts = cudf::test::fixed_width_column_wrapper({2, 1, 1, 0, 1, 1}); + results = cudf::strings::count_re(view, *prog_ml); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, counts); + + pattern = std::string("q.*l"); + prog = cudf::strings::regex_program::create(pattern); + expected = cudf::test::fixed_width_column_wrapper({0, 1, 0, 0, 0, 0}); + results = cudf::strings::contains_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + // inst ANY will stop matching on first 'newline' and so should not match anything here + prog = cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::EXT_NEWLINE); + expected = cudf::test::fixed_width_column_wrapper({0, 0, 0, 0, 0, 0}); + results = cudf::strings::contains_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + // including the DOTALL flag accepts the newline characters + auto dot_flags = 
static_cast(cudf::strings::regex_flags::EXT_NEWLINE | + cudf::strings::regex_flags::DOTALL); + prog = cudf::strings::regex_program::create(pattern, dot_flags); + expected = cudf::test::fixed_width_column_wrapper({0, 1, 0, 0, 0, 0}); + results = cudf::strings::contains_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + TEST_F(StringsContainsTests, EndOfString) { auto input = cudf::test::strings_column_wrapper( diff --git a/cpp/tests/strings/extract_tests.cpp b/cpp/tests/strings/extract_tests.cpp index b26cbd5a549..1491da758d5 100644 --- a/cpp/tests/strings/extract_tests.cpp +++ b/cpp/tests/strings/extract_tests.cpp @@ -14,9 +14,12 @@ * limitations under the License. */ +#include "special_chars.h" + #include #include #include +#include #include #include @@ -200,6 +203,43 @@ TEST_F(StringsExtractTests, DotAll) CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); } +TEST_F(StringsExtractTests, SpecialNewLines) +{ + auto input = cudf::test::strings_column_wrapper({"zzé" NEXT_LINE "qqq" LINE_SEPARATOR "zzé", + "qqq" LINE_SEPARATOR "zzé\rlll", + "zzé", + "", + "zzé" NEXT_LINE, + "abc" PARAGRAPH_SEPARATOR "zzé\n"}); + auto view = cudf::strings_column_view(input); + + auto prog = + cudf::strings::regex_program::create("(^zzé$)", cudf::strings::regex_flags::EXT_NEWLINE); + auto results = cudf::strings::extract(view, *prog); + auto expected = + cudf::test::strings_column_wrapper({"", "", "zzé", "", "zzé", ""}, {0, 0, 1, 0, 1, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); + + auto both_flags = static_cast( + cudf::strings::regex_flags::EXT_NEWLINE | cudf::strings::regex_flags::MULTILINE); + auto prog_ml = cudf::strings::regex_program::create("^(zzé)$", both_flags); + results = cudf::strings::extract(view, *prog_ml); + expected = + cudf::test::strings_column_wrapper({"zzé", "zzé", "zzé", "", "zzé", "zzé"}, {1, 1, 1, 0, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); + + prog = cudf::strings::regex_program::create("q(q.*l)l"); + expected = cudf::test::strings_column_wrapper({"", "qq" LINE_SEPARATOR "zzé\rll", "", "", "", ""}, + {0, 1, 0, 0, 0, 0}); + results = cudf::strings::extract(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); + // expect no matches here since the newline(s) interrupts the pattern + prog = cudf::strings::regex_program::create("q(q.*l)l", cudf::strings::regex_flags::EXT_NEWLINE); + expected = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, {0, 0, 0, 0, 0, 0}); + results = cudf::strings::extract(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); +} + TEST_F(StringsExtractTests, EmptyExtractTest) { std::vector h_strings{nullptr, "AAA", "AAA_A", "AAA_AAA_", "A__", ""}; diff --git a/cpp/tests/strings/findall_tests.cpp b/cpp/tests/strings/findall_tests.cpp index 4582dcb1e38..47606b9b3ed 100644 --- a/cpp/tests/strings/findall_tests.cpp +++ b/cpp/tests/strings/findall_tests.cpp @@ -14,6 +14,8 @@ * limitations under the License. 
*/ +#include "special_chars.h" + #include #include #include @@ -80,6 +82,32 @@ TEST_F(StringsFindallTests, DotAll) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); } +TEST_F(StringsFindallTests, SpecialNewLines) +{ + auto input = cudf::test::strings_column_wrapper({"zzé" PARAGRAPH_SEPARATOR "qqq\nzzé", + "qqq\nzzé" PARAGRAPH_SEPARATOR "lll", + "zzé", + "", + "zzé\r", + "zzé" LINE_SEPARATOR "zzé" NEXT_LINE}); + auto view = cudf::strings_column_view(input); + + auto prog = + cudf::strings::regex_program::create("(^zzé$)", cudf::strings::regex_flags::EXT_NEWLINE); + auto results = cudf::strings::findall(view, *prog); + using LCW = cudf::test::lists_column_wrapper; + LCW expected({LCW{}, LCW{}, LCW{"zzé"}, LCW{}, LCW{"zzé"}, LCW{}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + + auto both_flags = static_cast( + cudf::strings::regex_flags::EXT_NEWLINE | cudf::strings::regex_flags::MULTILINE); + auto prog_ml = cudf::strings::regex_program::create("^(zzé)$", both_flags); + results = cudf::strings::findall(view, *prog_ml); + LCW expected_ml( + {LCW{"zzé", "zzé"}, LCW{"zzé"}, LCW{"zzé"}, LCW{}, LCW{"zzé"}, LCW{"zzé", "zzé"}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected_ml); +} + TEST_F(StringsFindallTests, MediumRegex) { // This results in 15 regex instructions and falls in the 'medium' range. diff --git a/cpp/tests/strings/replace_regex_tests.cpp b/cpp/tests/strings/replace_regex_tests.cpp index 8c0482653fb..9847d8d6bb5 100644 --- a/cpp/tests/strings/replace_regex_tests.cpp +++ b/cpp/tests/strings/replace_regex_tests.cpp @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "special_chars.h" + #include #include #include @@ -245,6 +247,53 @@ TEST_F(StringsReplaceRegexTest, Multiline) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, br_expected); } +TEST_F(StringsReplaceRegexTest, SpecialNewLines) +{ + auto input = cudf::test::strings_column_wrapper({"zzé" NEXT_LINE "qqq" NEXT_LINE "zzé", + "qqq" NEXT_LINE "zzé" NEXT_LINE "lll", + "zzé", + "", + "zzé" PARAGRAPH_SEPARATOR, + "abc\rzzé\r"}); + auto view = cudf::strings_column_view(input); + auto repl = cudf::string_scalar("_"); + auto pattern = std::string("^zzé$"); + auto prog = + cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::EXT_NEWLINE); + auto results = cudf::strings::replace_re(view, *prog, repl); + auto expected = cudf::test::strings_column_wrapper({"zzé" NEXT_LINE "qqq" NEXT_LINE "zzé", + "qqq" NEXT_LINE "zzé" NEXT_LINE "lll", + "_", + "", + "_" PARAGRAPH_SEPARATOR, + "abc\rzzé\r"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + + auto both_flags = static_cast( + cudf::strings::regex_flags::EXT_NEWLINE | cudf::strings::regex_flags::MULTILINE); + auto prog_ml = cudf::strings::regex_program::create(pattern, both_flags); + results = cudf::strings::replace_re(view, *prog_ml, repl); + expected = cudf::test::strings_column_wrapper({"_" NEXT_LINE "qqq" NEXT_LINE "_", + "qqq" NEXT_LINE "_" NEXT_LINE "lll", + "_", + "", + "_" PARAGRAPH_SEPARATOR, + "abc\r_\r"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + + auto repl_template = std::string("[\\1]"); + pattern = std::string("(^zzé$)"); + prog = cudf::strings::regex_program::create(pattern, both_flags); + results = cudf::strings::replace_with_backrefs(view, *prog, repl_template); + expected = cudf::test::strings_column_wrapper({"[zzé]" NEXT_LINE "qqq" NEXT_LINE "[zzé]", + "qqq" NEXT_LINE "[zzé]" NEXT_LINE "lll", + "[zzé]", + "", + "[zzé]" PARAGRAPH_SEPARATOR, + "abc\r[zzé]\r"}); + 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); +} + TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexTest) { std::vector h_strings{"the quick brown fox jumps over the lazy dog", diff --git a/cpp/tests/strings/special_chars.h b/cpp/tests/strings/special_chars.h new file mode 100644 index 00000000000..0d630f6bb52 --- /dev/null +++ b/cpp/tests/strings/special_chars.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +namespace cudf::test { + +// special new-line characters for use with regex_flags::EXT_NEWLINE +#define NEXT_LINE "\xC2\x85" +#define LINE_SEPARATOR "\xE2\x80\xA8" +#define PARAGRAPH_SEPARATOR "\xE2\x80\xA9" + +} // namespace cudf::test From a112f684318e24b2321df48004ca58180f169410 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 17 Sep 2024 11:31:38 -0700 Subject: [PATCH 14/32] Add io_type axis with default `PINNED_BUFFER` to nvbench PQ multithreaded reader (#16809) Closes #16758 This PR adds an `io_type` axis to the benchmarks in `PARQUET_MULTITHREAD_READER_NVBENCH` with `PINNED_BUFFER` as default value. More description at #16758. Authors: - Muhammad Haseeb (https://github.com/mhaseeb123) Approvers: - Yunsong Wang (https://github.com/PointKernel) - David Wendt (https://github.com/davidwendt) - Tianyu Liu (https://github.com/kingcrimsontianyu) URL: https://github.com/rapidsai/cudf/pull/16809 --- .../io/parquet/parquet_reader_multithread.cpp | 36 ++++++++++++------- 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp index 3abd4280081..7121cb9f034 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp @@ -50,7 +50,7 @@ std::string get_label(std::string const& test_name, nvbench::state const& state) } std::tuple, size_t, size_t> write_file_data( - nvbench::state& state, std::vector const& d_types) + nvbench::state& state, std::vector const& d_types, io_type io_source_type) { cudf::size_type const cardinality = state.get_int64("cardinality"); cudf::size_type const run_length = state.get_int64("run_length"); @@ -63,7 +63,7 @@ std::tuple, size_t, size_t> write_file_data( size_t total_file_size = 0; for (size_t i = 0; i < num_files; ++i) { - cuio_source_sink_pair source_sink{io_type::HOST_BUFFER}; + cuio_source_sink_pair source_sink{io_source_type}; auto const tbl = create_random_table( cycle_dtypes(d_types, num_cols), @@ -92,11 +92,13 @@ void BM_parquet_multithreaded_read_common(nvbench::state& state, { size_t const data_size = state.get_int64("total_data_size"); auto const num_threads = state.get_int64("num_threads"); + auto const source_type = retrieve_io_type_enum(state.get_string("io_type")); auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); BS::thread_pool threads(num_threads); - 
auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types); + auto [source_sink_vector, total_file_size, num_files] = + write_file_data(state, d_types, source_type); std::vector source_info_vector; std::transform(source_sink_vector.begin(), source_sink_vector.end(), @@ -173,10 +175,12 @@ void BM_parquet_multithreaded_read_chunked_common(nvbench::state& state, auto const num_threads = state.get_int64("num_threads"); size_t const input_limit = state.get_int64("input_limit"); size_t const output_limit = state.get_int64("output_limit"); + auto const source_type = retrieve_io_type_enum(state.get_string("io_type")); auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); BS::thread_pool threads(num_threads); - auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types); + auto [source_sink_vector, total_file_size, num_files] = + write_file_data(state, d_types, source_type); std::vector source_info_vector; std::transform(source_sink_vector.begin(), source_sink_vector.end(), @@ -264,7 +268,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_mixed) .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) .add_int64_axis("num_threads", {1, 2, 4, 8}) .add_int64_axis("num_cols", {4}) - .add_int64_axis("run_length", {8}); + .add_int64_axis("run_length", {8}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); NVBENCH_BENCH(BM_parquet_multithreaded_read_fixed_width) .set_name("parquet_multithreaded_read_decode_fixed_width") @@ -273,7 +278,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_fixed_width) .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) .add_int64_axis("num_threads", {1, 2, 4, 8}) .add_int64_axis("num_cols", {4}) - .add_int64_axis("run_length", {8}); + .add_int64_axis("run_length", {8}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); NVBENCH_BENCH(BM_parquet_multithreaded_read_string) .set_name("parquet_multithreaded_read_decode_string") @@ -282,7 +288,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_string) .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) .add_int64_axis("num_threads", {1, 2, 4, 8}) .add_int64_axis("num_cols", {4}) - .add_int64_axis("run_length", {8}); + .add_int64_axis("run_length", {8}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); NVBENCH_BENCH(BM_parquet_multithreaded_read_list) .set_name("parquet_multithreaded_read_decode_list") @@ -291,7 +298,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_list) .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) .add_int64_axis("num_threads", {1, 2, 4, 8}) .add_int64_axis("num_cols", {4}) - .add_int64_axis("run_length", {8}); + .add_int64_axis("run_length", {8}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); // mixed data types: fixed width, strings NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_mixed) @@ -303,7 +311,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_mixed) .add_int64_axis("num_cols", {4}) .add_int64_axis("run_length", {8}) .add_int64_axis("input_limit", {640 * 1024 * 1024}) - .add_int64_axis("output_limit", {640 * 1024 * 1024}); + .add_int64_axis("output_limit", {640 * 1024 * 1024}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_fixed_width) .set_name("parquet_multithreaded_read_decode_chunked_fixed_width") @@ -314,7 +323,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_fixed_width) .add_int64_axis("num_cols", {4}) .add_int64_axis("run_length", {8}) 
.add_int64_axis("input_limit", {640 * 1024 * 1024}) - .add_int64_axis("output_limit", {640 * 1024 * 1024}); + .add_int64_axis("output_limit", {640 * 1024 * 1024}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_string) .set_name("parquet_multithreaded_read_decode_chunked_string") @@ -325,7 +335,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_string) .add_int64_axis("num_cols", {4}) .add_int64_axis("run_length", {8}) .add_int64_axis("input_limit", {640 * 1024 * 1024}) - .add_int64_axis("output_limit", {640 * 1024 * 1024}); + .add_int64_axis("output_limit", {640 * 1024 * 1024}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_list) .set_name("parquet_multithreaded_read_decode_chunked_list") @@ -336,4 +347,5 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_list) .add_int64_axis("num_cols", {4}) .add_int64_axis("run_length", {8}) .add_int64_axis("input_limit", {640 * 1024 * 1024}) - .add_int64_axis("output_limit", {640 * 1024 * 1024}); + .add_int64_axis("output_limit", {640 * 1024 * 1024}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); From 57ae3e372e93a16db8aef143759ef58392c4215f Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 18 Sep 2024 02:10:58 -0500 Subject: [PATCH 15/32] Enable cudf.pandas REPL and -c command support (#16428) This PR enables support for two features: - `python -m cudf.pandas` gives a REPL experience (previously it raised an error) - `python -m cudf.pandas -c ""` runs the provided commands (previously unsupported) Authors: - Bradley Dice (https://github.com/bdice) - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/16428 --- docs/cudf/source/cudf_pandas/usage.md | 20 +++++ python/cudf/cudf/pandas/__main__.py | 36 +++++++- python/cudf/cudf_pandas_tests/test_main.py | 100 +++++++++++++++++++++ 3 files changed, 154 insertions(+), 2 deletions(-) create mode 100644 python/cudf/cudf_pandas_tests/test_main.py diff --git a/docs/cudf/source/cudf_pandas/usage.md b/docs/cudf/source/cudf_pandas/usage.md index 0398a8d7086..41838e01dd9 100644 --- a/docs/cudf/source/cudf_pandas/usage.md +++ b/docs/cudf/source/cudf_pandas/usage.md @@ -120,3 +120,23 @@ To profile a script being run from the command line, pass the ```bash python -m cudf.pandas --profile script.py ``` + +### cudf.pandas CLI Features + +Several of the ways to provide input to the `python` interpreter also work with `python -m cudf.pandas`, such as the REPL, the `-c` flag, and reading from stdin. + +Executing `python -m cudf.pandas` with no script name will enter a REPL (read-eval-print loop) similar to the behavior of the normal `python` interpreter. 
+ +The `-c` flag accepts a code string to run, like this: + +```bash +$ python -m cudf.pandas -c "import pandas; print(pandas)" + +``` + +Users can also provide code to execute from stdin, like this: + +```bash +$ echo "import pandas; print(pandas)" | python -m cudf.pandas + +``` diff --git a/python/cudf/cudf/pandas/__main__.py b/python/cudf/cudf/pandas/__main__.py index 3a82829eb7a..e0d3d9101a9 100644 --- a/python/cudf/cudf/pandas/__main__.py +++ b/python/cudf/cudf/pandas/__main__.py @@ -10,6 +10,7 @@ """ import argparse +import code import runpy import sys import tempfile @@ -21,6 +22,8 @@ @contextmanager def profile(function_profile, line_profile, fn): + if fn is None and (line_profile or function_profile): + raise RuntimeError("Enabling the profiler requires a script name.") if line_profile: with open(fn) as f: lines = f.readlines() @@ -54,6 +57,11 @@ def main(): dest="module", nargs=1, ) + parser.add_argument( + "-c", + dest="cmd", + nargs=1, + ) parser.add_argument( "--profile", action="store_true", @@ -72,9 +80,18 @@ def main(): args = parser.parse_args() + if args.cmd: + f = tempfile.NamedTemporaryFile(mode="w+b", suffix=".py") + f.write(args.cmd[0].encode()) + f.seek(0) + args.args.insert(0, f.name) + install() - with profile(args.profile, args.line_profile, args.args[0]) as fn: - args.args[0] = fn + + script_name = args.args[0] if len(args.args) > 0 else None + with profile(args.profile, args.line_profile, script_name) as fn: + if script_name is not None: + args.args[0] = fn if args.module: (module,) = args.module # run the module passing the remaining arguments @@ -85,6 +102,21 @@ def main(): # Remove ourself from argv and continue sys.argv[:] = args.args runpy.run_path(args.args[0], run_name="__main__") + else: + if sys.stdin.isatty(): + banner = f"Python {sys.version} on {sys.platform}" + site_import = not sys.flags.no_site + if site_import: + cprt = 'Type "help", "copyright", "credits" or "license" for more information.' + banner += "\n" + cprt + else: + # Don't show prompts or banners if stdin is not a TTY + sys.ps1 = "" + sys.ps2 = "" + banner = "" + + # Launch an interactive interpreter + code.interact(banner=banner, exitmsg="") if __name__ == "__main__": diff --git a/python/cudf/cudf_pandas_tests/test_main.py b/python/cudf/cudf_pandas_tests/test_main.py new file mode 100644 index 00000000000..326224c8fc0 --- /dev/null +++ b/python/cudf/cudf_pandas_tests/test_main.py @@ -0,0 +1,100 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +import subprocess +import tempfile +import textwrap + + +def _run_python(*, cudf_pandas, command): + executable = "python " + if cudf_pandas: + executable += "-m cudf.pandas " + return subprocess.run( + executable + command, + shell=True, + capture_output=True, + check=True, + text=True, + ) + + +def test_run_cudf_pandas_with_script(): + with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=True) as f: + code = textwrap.dedent( + """ + import pandas as pd + df = pd.DataFrame({'a': [1, 2, 3]}) + print(df['a'].sum()) + """ + ) + f.write(code) + f.flush() + + res = _run_python(cudf_pandas=True, command=f.name) + expect = _run_python(cudf_pandas=False, command=f.name) + + assert res.stdout != "" + assert res.stdout == expect.stdout + + +def test_run_cudf_pandas_with_script_with_cmd_args(): + input_args_and_code = """-c 'import pandas as pd; df = pd.DataFrame({"a": [1, 2, 3]}); print(df["a"].sum())'""" + + res = _run_python(cudf_pandas=True, command=input_args_and_code) + expect = _run_python(cudf_pandas=False, command=input_args_and_code) + + assert res.stdout != "" + assert res.stdout == expect.stdout + + +def test_run_cudf_pandas_with_script_with_cmd_args_check_cudf(): + """Verify that cudf is active with -m cudf.pandas.""" + input_args_and_code = """-c 'import pandas as pd; print(pd)'""" + + res = _run_python(cudf_pandas=True, command=input_args_and_code) + expect = _run_python(cudf_pandas=False, command=input_args_and_code) + + assert "cudf" in res.stdout + assert "cudf" not in expect.stdout + + +def test_cudf_pandas_script_repl(): + def start_repl_process(cmd): + return subprocess.Popen( + cmd.split(), + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + text=True, + ) + + def get_repl_output(process, commands): + for command in commands: + process.stdin.write(command) + process.stdin.flush() + return process.communicate() + + p1 = start_repl_process("python -m cudf.pandas") + p2 = start_repl_process("python") + commands = [ + "import pandas as pd\n", + "print(pd.Series(range(2)).sum())\n", + "print(pd.Series(range(5)).sum())\n", + "import sys\n", + "print(pd.Series(list('abcd')), out=sys.stderr)\n", + ] + + res = get_repl_output(p1, commands) + expect = get_repl_output(p2, commands) + + # Check stdout + assert res[0] != "" + assert res[0] == expect[0] + + # Check stderr + assert res[1] != "" + assert res[1] == expect[1] + + p1.kill() + p2.kill() From 44a9c10105ab06538264e727188a04d623b0811e Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Wed, 18 Sep 2024 01:25:59 -0700 Subject: [PATCH 16/32] Add a benchmark to study Parquet reader's performance for wide tables (#16751) Related to #16750 This PR adds a benchmark to study read throughput of Parquet reader for wide tables. 
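
For intuition, the wide-table read scenario this benchmark exercises can be
reproduced ad hoc from Python; a minimal illustrative sketch (not part of this
change; the file name and table shape are arbitrary):
```python
import numpy as np

import cudf

# Build a "wide" table: many fixed-width columns, relatively few rows each.
ncols, nrows = 1024, 100_000
df = cudf.DataFrame({f"col{i}": np.zeros(nrows, dtype="float64") for i in range(ncols)})
df.to_parquet("wide.parquet", compression=None)

wide = cudf.read_parquet("wide.parquet")  # the read path whose throughput is measured
```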
Authors: - Muhammad Haseeb (https://github.com/mhaseeb123) Approvers: - Paul Mattione (https://github.com/pmattione-nvidia) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/16751 --- .../io/parquet/parquet_reader_input.cpp | 87 ++++++++++++++++++- 1 file changed, 85 insertions(+), 2 deletions(-) diff --git a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp index 7563c823454..ce115fd7723 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp @@ -32,7 +32,8 @@ constexpr cudf::size_type num_cols = 64; void parquet_read_common(cudf::size_type num_rows_to_read, cudf::size_type num_cols_to_read, cuio_source_sink_pair& source_sink, - nvbench::state& state) + nvbench::state& state, + size_t table_data_size = data_size) { cudf::io::parquet_reader_options read_opts = cudf::io::parquet_reader_options::builder(source_sink.make_source_info()); @@ -52,7 +53,7 @@ void parquet_read_common(cudf::size_type num_rows_to_read, }); auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); - state.add_element_count(static_cast(data_size) / time, "bytes_per_second"); + state.add_element_count(static_cast(table_data_size) / time, "bytes_per_second"); state.add_buffer_size( mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size"); @@ -231,6 +232,70 @@ void BM_parquet_read_chunks(nvbench::state& state, nvbench::type_list +void BM_parquet_read_wide_tables(nvbench::state& state, + nvbench::type_list> type_list) +{ + auto const d_type = get_type_or_group(static_cast(DataType)); + + auto const n_col = static_cast(state.get_int64("num_cols")); + auto const data_size_bytes = static_cast(state.get_int64("data_size_mb") << 20); + auto const cardinality = static_cast(state.get_int64("cardinality")); + auto const run_length = static_cast(state.get_int64("run_length")); + auto const source_type = io_type::DEVICE_BUFFER; + cuio_source_sink_pair source_sink(source_type); + + auto const num_rows_written = [&]() { + auto const tbl = create_random_table( + cycle_dtypes(d_type, n_col), + table_size_bytes{data_size_bytes}, + data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); + auto const view = tbl->view(); + + cudf::io::parquet_writer_options write_opts = + cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view) + .compression(cudf::io::compression_type::NONE); + cudf::io::write_parquet(write_opts); + return view.num_rows(); + }(); + + parquet_read_common(num_rows_written, n_col, source_sink, state, data_size_bytes); +} + +void BM_parquet_read_wide_tables_mixed(nvbench::state& state) +{ + auto const d_type = []() { + auto d_type1 = get_type_or_group(static_cast(data_type::INTEGRAL)); + auto d_type2 = get_type_or_group(static_cast(data_type::FLOAT)); + d_type1.reserve(d_type1.size() + d_type2.size()); + std::move(d_type2.begin(), d_type2.end(), std::back_inserter(d_type1)); + return d_type1; + }(); + + auto const n_col = static_cast(state.get_int64("num_cols")); + auto const data_size_bytes = static_cast(state.get_int64("data_size_mb") << 20); + auto const cardinality = static_cast(state.get_int64("cardinality")); + auto const run_length = static_cast(state.get_int64("run_length")); + auto const source_type = io_type::DEVICE_BUFFER; + cuio_source_sink_pair source_sink(source_type); + + auto 
const num_rows_written = [&]() { + auto const tbl = create_random_table( + cycle_dtypes(d_type, n_col), + table_size_bytes{data_size_bytes}, + data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); + auto const view = tbl->view(); + + cudf::io::parquet_writer_options write_opts = + cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view) + .compression(cudf::io::compression_type::NONE); + cudf::io::write_parquet(write_opts); + return view.num_rows(); + }(); + + parquet_read_common(num_rows_written, n_col, source_sink, state, data_size_bytes); +} + using d_type_list = nvbench::enum_type_list; +NVBENCH_BENCH_TYPES(BM_parquet_read_wide_tables, NVBENCH_TYPE_AXES(d_type_list_wide_table)) + .set_name("parquet_read_wide_tables") + .set_min_samples(4) + .set_type_axes_names({"data_type"}) + .add_int64_axis("data_size_mb", {1024, 2048, 4096}) + .add_int64_axis("num_cols", {256, 512, 1024}) + .add_int64_axis("cardinality", {0, 1000}) + .add_int64_axis("run_length", {1, 32}); + +NVBENCH_BENCH(BM_parquet_read_wide_tables_mixed) + .set_name("parquet_read_wide_tables_mixed") + .set_min_samples(4) + .add_int64_axis("data_size_mb", {1024, 2048, 4096}) + .add_int64_axis("num_cols", {256, 512, 1024}) + .add_int64_axis("cardinality", {0, 1000}) + .add_int64_axis("run_length", {1, 32}); + // a benchmark for structs that only contain fixed-width types using d_type_list_struct_only = nvbench::enum_type_list; NVBENCH_BENCH_TYPES(BM_parquet_read_fixed_width_struct, NVBENCH_TYPE_AXES(d_type_list_struct_only)) From 2a9a8f5b95ea62824147f1629de1fe52fdbf1254 Mon Sep 17 00:00:00 2001 From: Jake Awe <50372925+AyodeAwe@users.noreply.github.com> Date: Wed, 18 Sep 2024 09:02:41 -0500 Subject: [PATCH 17/32] use get-pr-info from nv-gha-runners (#16819) There are two implementations of the same action; one in [rapidsai/shared-actions](https://github.com/rapidsai/shared-actions/tree/main/get-pr-info) and [the other](https://github.com/nv-gha-runners/get-pr-info) in the nv-gha-runners org. This PR switches to the implementation in the nv-gha-runners group in order to keep a single source of truth. Tested in https://github.com/rapidsai/cudf/actions/runs/10906617425/job/30268277178?pr=16819#step:4:5 --- .github/workflows/pr.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index a4a8f036174..d7d14ea12ff 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -52,7 +52,7 @@ jobs: steps: - name: Get PR info id: get-pr-info - uses: rapidsai/shared-actions/get-pr-info@main + uses: nv-gha-runners/get-pr-info@main - name: Checkout code repo uses: actions/checkout@v4 with: From 2a3026dec9dca553c2be7d49f2d0e6c09a9f4589 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Wed, 18 Sep 2024 10:04:31 -0700 Subject: [PATCH 18/32] Change the Parquet writer's `default_row_group_size_bytes` from 128MB to inf (#16750) Closes #16733. This PR changes the default value of Parquet writer's default max row group size from 128MB to 1Million rows. This allows avoiding thin row group strips when writing wide (> 512 cols) tables resulting in a significantly improved read throughput for wide tables (especially when low cardinality) with virtually no impact to narrow-tables read performance. 
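Only the defaults change here; callers that preferred the old byte-based cap can still impose it per write. A minimal C++ sketch under that assumption, with the `sink_info` and `input_table` setup elided as placeholders:

```cpp
#include <cudf/io/parquet.hpp>

// Restore the former 128MB byte cap explicitly; the 1,000,000-row cap
// remains the default either way.
auto const options =
  cudf::io::parquet_writer_options::builder(sink_info, input_table)
    .row_group_size_bytes(128 * 1024 * 1024)
    .row_group_size_rows(1'000'000)
    .build();
cudf::io::write_parquet(options);
```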
Benchmarked using: #16751 ## Results ### Hardware ``` GPU: NVIDIA RTX 5880 Ada Generation SM Version: 890 (PTX Version: 860) Number of SMs: 110 SM Default Clock Rate: 18446744071874 MHz Global Memory: 23879 MiB Free / 48632 MiB Total Global Memory Bus Peak: 960 GB/sec (384-bit DDR @10001MHz) Max Shared Memory: 100 KiB/SM, 48 KiB/Block L2 Cache Size: 98304 KiB Maximum Active Blocks: 24/SM Maximum Active Threads: 1536/SM, 1024/Block Available Registers: 65536/SM, 65536/Block ECC Enabled: No ``` ### Read Throughput ``` ## parquet_read_wide_tables_mixed | T | num_rows | num_cols | GPU Time_old | GPU Time_new | bytes_per_second_old | bytes_per_second_new | peak_memory_usage_old | peak_memory_usage_new | encoded_file_size_old | encoded_file_size_new | |-----------|----------|----------|----------------|----------------|----------------------|----------------------|-----------------------|-----------------------|-----------------------|-----------------------| | INTEGRAL | 10000 | 64 | 940.690 us | 928.387 us | 570720378014 | 578283256754 | 3.405 MiB | 3.405 MiB | 748.248 KiB | 748.248 KiB | | INTEGRAL | 100000 | 64 | 2.053 ms | 2.037 ms | 261541794543 | 263500220325 | 28.308 MiB | 28.308 MiB | 5.164 MiB | 5.164 MiB | | INTEGRAL | 500000 | 64 | 5.783 ms | 5.693 ms | 92838553328 | 94296134644 | 139.928 MiB | 139.042 MiB | 24.698 MiB | 24.325 MiB | | INTEGRAL | 1000000 | 64 | 11.400 ms | 10.775 ms | 47092763803 | 49824643807 | 279.254 MiB | 277.470 MiB | 49.042 MiB | 48.284 MiB | | INTEGRAL | 10000 | 256 | 1.718 ms | 1.732 ms | 312407306091 | 309935794547 | 13.752 MiB | 13.752 MiB | 2.956 MiB | 2.956 MiB | | INTEGRAL | 100000 | 256 | 5.726 ms | 5.818 ms | 93765292338 | 92275580643 | 114.366 MiB | 114.366 MiB | 20.743 MiB | 20.743 MiB | | INTEGRAL | 500000 | 256 | 25.179 ms | 22.159 ms | 21322289603 | 24228371776 | 572.905 MiB | 561.786 MiB | 103.796 MiB | 97.677 MiB | | INTEGRAL | 1000000 | 256 | 48.259 ms | 42.428 ms | 11124725758 | 12653746472 | 1.117 GiB | 1.095 GiB | 206.155 MiB | 193.886 MiB | | INTEGRAL | 10000 | 512 | 2.741 ms | 2.758 ms | 195853280055 | 194632437549 | 27.508 MiB | 27.508 MiB | 5.918 MiB | 5.918 MiB | | INTEGRAL | 100000 | 512 | 11.197 ms | 10.600 ms | 47945685016 | 50646524148 | 235.910 MiB | 228.755 MiB | 44.559 MiB | 41.510 MiB | | INTEGRAL | 500000 | 512 | 54.929 ms | 43.554 ms | 9773962645 | 12326557981 | 1.146 GiB | 1.097 GiB | 221.266 MiB | 195.384 MiB | | INTEGRAL | 1000000 | 512 | 103.779 ms | 82.403 ms | 5173195193 | 6515218035 | 2.288 GiB | 2.190 GiB | 442.101 MiB | 387.861 MiB | | INTEGRAL | 10000 | 1024 | 5.210 ms | 5.405 ms | 103040438112 | 99319591295 | 54.937 MiB | 54.937 MiB | 11.829 MiB | 11.829 MiB | | INTEGRAL | 100000 | 1024 | 26.891 ms | 20.194 ms | 19964357393 | 26585391032 | 498.410 MiB | 456.756 MiB | 99.962 MiB | 82.939 MiB | | INTEGRAL | 500000 | 1024 | 135.404 ms | 84.676 ms | 3964957208 | 6340314329 | 2.434 GiB | 2.191 GiB | 500.554 MiB | 390.418 MiB | | INTEGRAL | 1000000 | 1024 | 256.033 ms | 162.217 ms | 2096879057 | 3309593393 | 4.869 GiB | 4.372 GiB | 1001.573 MiB | 775.040 MiB | | FLOAT | 10000 | 64 | 962.219 us | 951.565 us | 557950915640 | 564197923891 | 5.275 MiB | 5.275 MiB | 1012.101 KiB | 1012.101 KiB | | FLOAT | 100000 | 64 | 2.032 ms | 2.032 ms | 264218700681 | 264250413360 | 45.321 MiB | 45.321 MiB | 6.316 MiB | 6.316 MiB | | FLOAT | 500000 | 64 | 6.660 ms | 6.693 ms | 80611279094 | 80219014175 | 224.129 MiB | 222.946 MiB | 29.685 MiB | 29.044 MiB | | FLOAT | 1000000 | 64 | 13.560 ms | 13.758 ms | 39591771965 | 39023315442 | 
447.103 MiB | 445.007 MiB | 58.762 MiB | 57.482 MiB | | FLOAT | 10000 | 256 | 1.808 ms | 1.825 ms | 297020886609 | 294226222306 | 21.109 MiB | 21.109 MiB | 3.968 MiB | 3.968 MiB | | FLOAT | 100000 | 256 | 6.921 ms | 6.307 ms | 77571490752 | 85116522574 | 185.578 MiB | 181.271 MiB | 27.393 MiB | 25.256 MiB | | FLOAT | 500000 | 256 | 30.064 ms | 25.955 ms | 17857874786 | 20684696586 | 914.366 MiB | 891.787 MiB | 128.981 MiB | 116.186 MiB | | FLOAT | 1000000 | 256 | 59.189 ms | 48.592 ms | 9070460126 | 11048464794 | 1.787 GiB | 1.738 GiB | 258.075 MiB | 229.920 MiB | | FLOAT | 10000 | 512 | 2.998 ms | 3.006 ms | 179078195058 | 178594968077 | 42.222 MiB | 42.222 MiB | 7.941 MiB | 7.941 MiB | | FLOAT | 100000 | 512 | 14.160 ms | 12.314 ms | 37915291403 | 43597041127 | 376.553 MiB | 362.567 MiB | 60.136 MiB | 50.537 MiB | | FLOAT | 500000 | 512 | 69.524 ms | 50.251 ms | 7722076774 | 10683715204 | 1.826 GiB | 1.742 GiB | 292.552 MiB | 232.393 MiB | | FLOAT | 1000000 | 512 | 130.729 ms | 95.458 ms | 4106742786 | 5624164002 | 3.647 GiB | 3.477 GiB | 581.180 MiB | 459.927 MiB | | FLOAT | 10000 | 1024 | 6.351 ms | 6.492 ms | 84532884515 | 82693769317 | 84.452 MiB | 84.452 MiB | 15.893 MiB | 15.893 MiB | | FLOAT | 100000 | 1024 | 36.898 ms | 26.302 ms | 14550146722 | 20411596018 | 778.441 MiB | 725.125 MiB | 136.809 MiB | 101.066 MiB | | FLOAT | 500000 | 1024 | 166.699 ms | 98.340 ms | 3220600409 | 5459311820 | 3.802 GiB | 3.484 GiB | 685.702 MiB | 464.775 MiB | | FLOAT | 1000000 | 1024 | 339.687 ms | 188.463 ms | 1580487011 | 2848673918 | 7.606 GiB | 6.953 GiB | 1.340 GiB | 919.840 MiB | | DECIMAL | 10000 | 64 | 1.076 ms | 1.092 ms | 498752693210 | 491676757508 | 7.485 MiB | 7.485 MiB | 1.216 MiB | 1.216 MiB | | DECIMAL | 100000 | 64 | 2.166 ms | 2.172 ms | 247840684988 | 247198078197 | 65.498 MiB | 65.498 MiB | 6.658 MiB | 6.658 MiB | | DECIMAL | 500000 | 64 | 7.421 ms | 7.058 ms | 72343289850 | 76066836305 | 325.515 MiB | 322.466 MiB | 31.349 MiB | 29.384 MiB | | DECIMAL | 1000000 | 64 | 15.239 ms | 14.020 ms | 35230516583 | 38291860266 | 649.547 MiB | 643.714 MiB | 61.759 MiB | 57.826 MiB | | DECIMAL | 10000 | 256 | 1.989 ms | 1.989 ms | 269930562597 | 269886680781 | 30.119 MiB | 30.119 MiB | 4.896 MiB | 4.896 MiB | | DECIMAL | 100000 | 256 | 7.839 ms | 6.966 ms | 68483613468 | 77073587059 | 269.638 MiB | 263.547 MiB | 30.588 MiB | 26.664 MiB | | DECIMAL | 500000 | 256 | 35.199 ms | 26.893 ms | 15252335676 | 19963411264 | 1.312 GiB | 1.267 GiB | 150.948 MiB | 117.601 MiB | | DECIMAL | 1000000 | 256 | 72.584 ms | 50.944 ms | 7396511691 | 10538553316 | 2.622 GiB | 2.529 GiB | 301.231 MiB | 231.353 MiB | | DECIMAL | 10000 | 512 | 3.612 ms | 3.595 ms | 148642296188 | 149335059500 | 60.283 MiB | 60.283 MiB | 9.801 MiB | 9.801 MiB | | DECIMAL | 100000 | 512 | 19.820 ms | 14.084 ms | 27087819156 | 38119174003 | 562.417 MiB | 527.494 MiB | 75.263 MiB | 53.349 MiB | | DECIMAL | 500000 | 512 | 94.913 ms | 51.910 ms | 5656452419 | 10342308581 | 2.747 GiB | 2.536 GiB | 377.112 MiB | 235.187 MiB | | DECIMAL | 1000000 | 512 | 180.513 ms | 98.562 ms | 2974131976 | 5447057883 | 5.494 GiB | 5.063 GiB | 754.738 MiB | 462.785 MiB | | DECIMAL | 10000 | 1024 | 7.667 ms | 6.777 ms | 70025338013 | 79218913933 | 120.656 MiB | 120.656 MiB | 19.616 MiB | 19.616 MiB | | DECIMAL | 100000 | 1024 | 61.182 ms | 26.946 ms | 8775038947 | 19923803470 | 1.184 GiB | 1.031 GiB | 201.928 MiB | 106.705 MiB | | DECIMAL | 500000 | 1024 | 261.218 ms | 102.314 ms | 2055261558 | 5247292283 | 5.921 GiB | 5.076 GiB | 1012.826 MiB | 470.402 
MiB | | DECIMAL | 1000000 | 1024 | 513.386 ms | 196.347 ms | 1045744543 | 2734301880 | 11.843 GiB | 10.133 GiB | 1.980 GiB | 925.576 MiB | | TIMESTAMP | 10000 | 64 | 1.014 ms | 1.016 ms | 529606978079 | 528414399822 | 6.079 MiB | 6.079 MiB | 1.068 MiB | 1.068 MiB | | TIMESTAMP | 100000 | 64 | 2.057 ms | 2.053 ms | 261019684779 | 261455248599 | 52.688 MiB | 52.688 MiB | 6.436 MiB | 6.436 MiB | | TIMESTAMP | 500000 | 64 | 6.950 ms | 6.761 ms | 77245644716 | 79410211533 | 260.606 MiB | 259.304 MiB | 29.924 MiB | 29.164 MiB | | TIMESTAMP | 1000000 | 64 | 14.506 ms | 13.832 ms | 37010291008 | 38813599633 | 521.240 MiB | 517.604 MiB | 59.878 MiB | 57.601 MiB | | TIMESTAMP | 10000 | 256 | 1.878 ms | 1.889 ms | 285887176743 | 284275145551 | 24.328 MiB | 24.328 MiB | 4.290 MiB | 4.290 MiB | | TIMESTAMP | 100000 | 256 | 7.198 ms | 6.458 ms | 74586920018 | 83128450019 | 215.854 MiB | 210.739 MiB | 28.681 MiB | 25.734 MiB | | TIMESTAMP | 500000 | 256 | 34.185 ms | 26.654 ms | 15705060785 | 20142331826 | 1.044 GiB | 1.013 GiB | 137.016 MiB | 116.663 MiB | | TIMESTAMP | 1000000 | 256 | 66.420 ms | 49.599 ms | 8083007343 | 10824295857 | 2.085 GiB | 2.022 GiB | 272.580 MiB | 230.395 MiB | | TIMESTAMP | 10000 | 512 | 3.143 ms | 3.150 ms | 170821086658 | 170446277893 | 48.702 MiB | 48.702 MiB | 8.591 MiB | 8.591 MiB | | TIMESTAMP | 100000 | 512 | 17.652 ms | 12.615 ms | 30413872283 | 42557024194 | 440.115 MiB | 421.891 MiB | 63.197 MiB | 51.502 MiB | | TIMESTAMP | 500000 | 512 | 75.454 ms | 50.955 ms | 7115233856 | 10536117334 | 2.146 GiB | 2.028 GiB | 315.073 MiB | 233.355 MiB | | TIMESTAMP | 1000000 | 512 | 140.692 ms | 95.964 ms | 3815935506 | 5594485106 | 4.285 GiB | 4.048 GiB | 627.348 MiB | 460.885 MiB | | TIMESTAMP | 10000 | 1024 | 6.436 ms | 6.975 ms | 83411903593 | 76971777095 | 97.454 MiB | 97.454 MiB | 17.196 MiB | 17.196 MiB | | TIMESTAMP | 100000 | 1024 | 45.659 ms | 26.728 ms | 11758159876 | 20086145129 | 936.005 MiB | 844.159 MiB | 159.908 MiB | 103.000 MiB | | TIMESTAMP | 500000 | 1024 | 199.636 ms | 99.231 ms | 2689242353 | 5410303529 | 4.557 GiB | 4.057 GiB | 794.728 MiB | 466.703 MiB | | TIMESTAMP | 1000000 | 1024 | 372.691 ms | 192.598 ms | 1440523696 | 2787517681 | 9.104 GiB | 8.099 GiB | 1.551 GiB | 921.760 MiB | | DURATION | 10000 | 64 | 986.208 us | 989.153 us | 544379023579 | 542758221495 | 6.417 MiB | 6.417 MiB | 932.501 KiB | 932.501 KiB | | DURATION | 100000 | 64 | 2.222 ms | 2.018 ms | 241594183626 | 266034888500 | 57.291 MiB | 57.291 MiB | 6.079 MiB | 6.079 MiB | | DURATION | 500000 | 64 | 6.642 ms | 6.673 ms | 80830328889 | 80453377113 | 284.029 MiB | 283.224 MiB | 28.819 MiB | 28.288 MiB | | DURATION | 1000000 | 64 | 13.150 ms | 13.488 ms | 40828039129 | 39804805295 | 567.280 MiB | 565.669 MiB | 57.137 MiB | 56.075 MiB | | DURATION | 10000 | 256 | 1.805 ms | 1.815 ms | 297459887040 | 295856879191 | 25.686 MiB | 25.686 MiB | 3.665 MiB | 3.665 MiB | | DURATION | 100000 | 256 | 6.839 ms | 6.270 ms | 78502421937 | 85630914910 | 232.874 MiB | 229.165 MiB | 25.863 MiB | 24.323 MiB | | DURATION | 500000 | 256 | 29.886 ms | 26.234 ms | 17964080662 | 20464503730 | 1.125 GiB | 1.106 GiB | 123.885 MiB | 113.179 MiB | | DURATION | 1000000 | 256 | 58.290 ms | 48.418 ms | 9210348188 | 11088351436 | 2.250 GiB | 2.210 GiB | 247.272 MiB | 224.312 MiB | | DURATION | 10000 | 512 | 3.035 ms | 2.964 ms | 176885037888 | 181108374773 | 51.383 MiB | 51.383 MiB | 7.342 MiB | 7.342 MiB | | DURATION | 100000 | 512 | 14.492 ms | 12.136 ms | 37044853523 | 44237579412 | 474.355 MiB | 458.371 MiB | 55.996 
MiB | 48.689 MiB | | DURATION | 500000 | 512 | 70.131 ms | 51.095 ms | 7655286246 | 10507294503 | 2.299 GiB | 2.213 GiB | 271.064 MiB | 226.438 MiB | | DURATION | 1000000 | 512 | 132.495 ms | 95.019 ms | 4051999205 | 5650150759 | 4.593 GiB | 4.419 GiB | 541.495 MiB | 448.815 MiB | | DURATION | 10000 | 1024 | 6.576 ms | 6.318 ms | 81638807422 | 84977253627 | 102.782 MiB | 102.782 MiB | 14.701 MiB | 14.701 MiB | | DURATION | 100000 | 1024 | 38.001 ms | 26.011 ms | 14127627316 | 20640219375 | 964.471 MiB | 916.755 MiB | 127.532 MiB | 97.394 MiB | | DURATION | 500000 | 1024 | 159.928 ms | 98.126 ms | 3356945213 | 5471258270 | 4.711 GiB | 4.426 GiB | 639.050 MiB | 452.925 MiB | | DURATION | 1000000 | 1024 | 305.818 ms | 188.647 ms | 1755524869 | 2845895428 | 9.422 GiB | 8.839 GiB | 1.249 GiB | 897.737 MiB | | STRING | 10000 | 64 | 2.241 ms | 2.244 ms | 239611491431 | 239240518530 | 15.926 MiB | 15.926 MiB | 2.075 MiB | 2.075 MiB | | STRING | 100000 | 64 | 4.862 ms | 4.822 ms | 110419679907 | 111346705245 | 132.646 MiB | 132.646 MiB | 8.087 MiB | 8.087 MiB | | STRING | 500000 | 64 | 20.498 ms | 17.812 ms | 26191957819 | 30140554720 | 664.294 MiB | 645.028 MiB | 40.456 MiB | 30.817 MiB | | STRING | 1000000 | 64 | 37.773 ms | 34.985 ms | 14213079575 | 15345709268 | 1.298 GiB | 1.255 GiB | 80.941 MiB | 59.259 MiB | | STRING | 10000 | 256 | 4.125 ms | 4.171 ms | 130163506067 | 128706550148 | 63.789 MiB | 63.789 MiB | 8.319 MiB | 8.319 MiB | | STRING | 100000 | 256 | 22.074 ms | 17.799 ms | 24321103825 | 30162947098 | 584.754 MiB | 530.912 MiB | 58.602 MiB | 32.330 MiB | | STRING | 500000 | 256 | 93.278 ms | 66.770 ms | 5755572906 | 8040584271 | 2.857 GiB | 2.521 GiB | 294.130 MiB | 123.271 MiB | | STRING | 1000000 | 256 | 190.999 ms | 122.359 ms | 2810851154 | 4387682165 | 5.715 GiB | 5.023 GiB | 588.586 MiB | 237.018 MiB | | STRING | 10000 | 512 | 7.520 ms | 8.010 ms | 71390390607 | 67021971176 | 127.538 MiB | 127.538 MiB | 16.634 MiB | 16.634 MiB | | STRING | 100000 | 512 | 51.666 ms | 32.251 ms | 10391219810 | 16646741143 | 1.259 GiB | 1.037 GiB | 173.940 MiB | 64.682 MiB | | STRING | 500000 | 512 | 251.723 ms | 125.963 ms | 2132782858 | 4262141577 | 6.300 GiB | 5.040 GiB | 873.437 MiB | 246.559 MiB | | STRING | 1000000 | 512 | 477.668 ms | 244.912 ms | 1123940871 | 2192101011 | 12.602 GiB | 10.044 GiB | 1.707 GiB | 474.121 MiB | | STRING | 10000 | 1024 | 17.184 ms | 16.128 ms | 31242201518 | 33288874029 | 276.395 MiB | 254.971 MiB | 40.126 MiB | 33.243 MiB | | STRING | 100000 | 1024 | 132.094 ms | 63.304 ms | 4064323158 | 8480799642 | 2.721 GiB | 2.073 GiB | 414.092 MiB | 129.316 MiB | | STRING | 500000 | 1024 | 608.283 ms | 251.026 ms | 882600977 | 2138709222 | 13.618 GiB | 10.076 GiB | 2.028 GiB | 493.067 MiB | | STRING | 1000000 | 1024 | 1.249 s | 485.734 ms | 429750505 | 1105276473 | 27.239 GiB | 20.079 GiB | 4.059 GiB | 948.185 MiB | ``` Authors: - Muhammad Haseeb (https://github.com/mhaseeb123) Approvers: - Nghia Truong (https://github.com/ttnghia) - Vukasin Milovanovic (https://github.com/vuule) - Bradley Dice (https://github.com/bdice) - Charles Blackmon-Luca (https://github.com/charlesbluca) URL: https://github.com/rapidsai/cudf/pull/16750 --- cpp/include/cudf/io/parquet.hpp | 5 +++-- cpp/src/io/parquet/writer_impl.cu | 10 ++++++++-- python/cudf/cudf/_lib/parquet.pyx | 16 ++++++++-------- python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/io/parquet.py | 8 ++++---- python/cudf/cudf/utils/ioutils.py | 12 ++++-------- python/dask_cudf/dask_cudf/io/parquet.py | 7 ++----- 7 files 
changed, 30 insertions(+), 30 deletions(-) diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index ed7b2ac0850..ee03a382bec 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -39,8 +39,9 @@ namespace io { * @file */ -constexpr size_t default_row_group_size_bytes = 128 * 1024 * 1024; ///< 128MB per row group -constexpr size_type default_row_group_size_rows = 1000000; ///< 1 million rows per row group +constexpr size_t default_row_group_size_bytes = + std::numeric_limits::max(); ///< Infinite bytes per row group +constexpr size_type default_row_group_size_rows = 1'000'000; ///< 1 million rows per row group constexpr size_t default_max_page_size_bytes = 512 * 1024; ///< 512KB per page constexpr size_type default_max_page_size_rows = 20000; ///< 20k rows per page constexpr int32_t default_column_index_truncate_length = 64; ///< truncate to 64 bytes diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 81fd4ab9f82..ec05f35d405 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1819,8 +1819,14 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, auto const table_size = std::reduce(column_sizes.begin(), column_sizes.end()); auto const avg_row_len = util::div_rounding_up_safe(table_size, input.num_rows()); if (avg_row_len > 0) { - auto const rg_frag_size = util::div_rounding_up_safe(max_row_group_size, avg_row_len); - max_page_fragment_size = std::min(rg_frag_size, max_page_fragment_size); + // Ensure `rg_frag_size` is not bigger than size_type::max for default max_row_group_size + // value (=uint64::max) to avoid a sign overflow when comparing + auto const rg_frag_size = + std::min(std::numeric_limits::max(), + util::div_rounding_up_safe(max_row_group_size, avg_row_len)); + // Safe comparison as rg_frag_size fits in size_type + max_page_fragment_size = + std::min(static_cast(rg_frag_size), max_page_fragment_size); } // dividing page size by average row length will tend to overshoot the desired diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index a0155671a26..e6c9d60b05b 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -438,7 +438,7 @@ def write_parquet( object statistics="ROWGROUP", object metadata_file_path=None, object int96_timestamps=False, - object row_group_size_bytes=_ROW_GROUP_SIZE_BYTES_DEFAULT, + object row_group_size_bytes=None, object row_group_size_rows=None, object max_page_size_bytes=None, object max_page_size_rows=None, @@ -616,9 +616,9 @@ cdef class ParquetWriter: Name of the compression to use. Use ``None`` for no compression. statistics : {'ROWGROUP', 'PAGE', 'COLUMN', 'NONE'}, default 'ROWGROUP' Level at which column statistics should be included in file. - row_group_size_bytes: int, default 134217728 + row_group_size_bytes: int, default ``uint64 max`` Maximum size of each stripe of the output. - By default, 134217728 (128MB) will be used. + By default, a virtually infinite size equal to ``uint64 max`` will be used. row_group_size_rows: int, default 1000000 Maximum number of rows of each stripe of the output. By default, 1000000 (10^6 rows) will be used. 
@@ -661,11 +661,11 @@ cdef class ParquetWriter: def __cinit__(self, object filepath_or_buffer, object index=None, object compression="snappy", str statistics="ROWGROUP", - int row_group_size_bytes=_ROW_GROUP_SIZE_BYTES_DEFAULT, - int row_group_size_rows=1000000, - int max_page_size_bytes=524288, - int max_page_size_rows=20000, - int max_dictionary_size=1048576, + size_t row_group_size_bytes=_ROW_GROUP_SIZE_BYTES_DEFAULT, + size_type row_group_size_rows=1000000, + size_t max_page_size_bytes=524288, + size_type max_page_size_rows=20000, + size_t max_dictionary_size=1048576, bool use_dictionary=True, bool store_schema=False): filepaths_or_buffers = ( diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 58a16a6d504..d73ad8225ca 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6840,7 +6840,7 @@ def to_parquet( statistics="ROWGROUP", metadata_file_path=None, int96_timestamps=False, - row_group_size_bytes=ioutils._ROW_GROUP_SIZE_BYTES_DEFAULT, + row_group_size_bytes=None, row_group_size_rows=None, max_page_size_bytes=None, max_page_size_rows=None, diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 62be7378e9e..ce99f98b559 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -64,7 +64,7 @@ def _write_parquet( statistics="ROWGROUP", metadata_file_path=None, int96_timestamps=False, - row_group_size_bytes=ioutils._ROW_GROUP_SIZE_BYTES_DEFAULT, + row_group_size_bytes=None, row_group_size_rows=None, max_page_size_bytes=None, max_page_size_rows=None, @@ -149,7 +149,7 @@ def write_to_dataset( return_metadata=False, statistics="ROWGROUP", int96_timestamps=False, - row_group_size_bytes=ioutils._ROW_GROUP_SIZE_BYTES_DEFAULT, + row_group_size_bytes=None, row_group_size_rows=None, max_page_size_bytes=None, max_page_size_rows=None, @@ -205,7 +205,7 @@ def write_to_dataset( If ``False``, timestamps will not be altered. row_group_size_bytes: integer or None, default None Maximum size of each stripe of the output. - If None, 134217728 (128MB) will be used. + If None, no limit on row group stripe size will be used. row_group_size_rows: integer or None, default None Maximum number of rows of each stripe of the output. If None, 1000000 will be used. @@ -980,7 +980,7 @@ def to_parquet( statistics="ROWGROUP", metadata_file_path=None, int96_timestamps=False, - row_group_size_bytes=ioutils._ROW_GROUP_SIZE_BYTES_DEFAULT, + row_group_size_bytes=None, row_group_size_rows=None, max_page_size_bytes=None, max_page_size_rows=None, diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 1627107b57d..1180da321e6 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -27,7 +27,7 @@ fsspec_parquet = None _BYTES_PER_THREAD_DEFAULT = 256 * 1024 * 1024 -_ROW_GROUP_SIZE_BYTES_DEFAULT = 128 * 1024 * 1024 +_ROW_GROUP_SIZE_BYTES_DEFAULT = np.iinfo(np.uint64).max _docstring_remote_sources = """ - cuDF supports local and remote data stores. See configuration details for @@ -275,10 +275,9 @@ timestamp[us] to the int96 format, which is the number of Julian days and the number of nanoseconds since midnight of 1970-01-01. If ``False``, timestamps will not be altered. -row_group_size_bytes: integer, default {row_group_size_bytes_val} +row_group_size_bytes: integer, default None Maximum size of each stripe of the output. - If None, {row_group_size_bytes_val} - ({row_group_size_bytes_val_in_mb} MB) will be used. 
+ If None, no limit on row group stripe size will be used. row_group_size_rows: integer or None, default None Maximum number of rows of each stripe of the output. If None, 1000000 will be used. @@ -346,10 +345,7 @@ See Also -------- cudf.read_parquet -""".format( - row_group_size_bytes_val=_ROW_GROUP_SIZE_BYTES_DEFAULT, - row_group_size_bytes_val_in_mb=_ROW_GROUP_SIZE_BYTES_DEFAULT / 1024 / 1024, -) +""" doc_to_parquet = docfmt_partial(docstring=_docstring_to_parquet) _docstring_merge_parquet_filemetadata = """ diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index e793d4381d1..a781b8242fe 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -23,7 +23,6 @@ from cudf.io import write_to_dataset from cudf.io.parquet import _apply_post_filters, _normalize_filters from cudf.utils.dtypes import cudf_dtype_from_pa_type -from cudf.utils.ioutils import _ROW_GROUP_SIZE_BYTES_DEFAULT class CudfEngine(ArrowDatasetEngine): @@ -341,9 +340,7 @@ def write_partition( return_metadata=return_metadata, statistics=kwargs.get("statistics", "ROWGROUP"), int96_timestamps=kwargs.get("int96_timestamps", False), - row_group_size_bytes=kwargs.get( - "row_group_size_bytes", _ROW_GROUP_SIZE_BYTES_DEFAULT - ), + row_group_size_bytes=kwargs.get("row_group_size_bytes", None), row_group_size_rows=kwargs.get("row_group_size_rows", None), max_page_size_bytes=kwargs.get("max_page_size_bytes", None), max_page_size_rows=kwargs.get("max_page_size_rows", None), @@ -365,7 +362,7 @@ def write_partition( statistics=kwargs.get("statistics", "ROWGROUP"), int96_timestamps=kwargs.get("int96_timestamps", False), row_group_size_bytes=kwargs.get( - "row_group_size_bytes", _ROW_GROUP_SIZE_BYTES_DEFAULT + "row_group_size_bytes", None ), row_group_size_rows=kwargs.get( "row_group_size_rows", None From e68f55c98f257bdeedeb31e68c9737264bd0b393 Mon Sep 17 00:00:00 2001 From: Srinivas Yadav <43375352+srinivasyadav18@users.noreply.github.com> Date: Wed, 18 Sep 2024 12:12:23 -0500 Subject: [PATCH 19/32] Refactor mixed_semi_join using cuco::static_set (#16230) This PR refactors `mixed_semi_join` by replacing **cuco** legacy `static_map` with latest `static_set`. Contributes to #12261. 
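For context, the core `static_set` pattern the refactor adopts, bulk-inserting the build-side keys and then probing membership with `contains`, looks roughly like the host-side sketch below. It uses plain `int` keys rather than cudf row indices and composed comparators, and the constructor shape follows cuco's published host-bulk example; treat it as an assumption about current cuco releases rather than part of this patch:

```cpp
#include <cuco/static_set.cuh>

#include <thrust/device_vector.h>
#include <thrust/sequence.h>

#include <vector>

int main()
{
  // Build side: keys 0..999.
  thrust::device_vector<int> build_keys(1000);
  thrust::sequence(build_keys.begin(), build_keys.end());

  // The sentinel marks empty slots, playing the role cuco::empty_key{JoinNoneValue}
  // plays in the patch; capacity is oversized to keep the load factor low.
  cuco::static_set<int> set{cuco::extent<std::size_t>{2'000}, cuco::empty_key{-1}};
  set.insert(build_keys.begin(), build_keys.end());

  // Probe side: membership flags land in a device vector of bools.
  std::vector<int> h_probe{42, 4242};
  thrust::device_vector<int> probe_keys(h_probe);
  thrust::device_vector<bool> found(probe_keys.size());
  set.contains(probe_keys.begin(), probe_keys.end(), found.begin());
  // found now holds {true, false}.
  return 0;
}
```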
Authors:
  - Srinivas Yadav (https://github.com/srinivasyadav18)
  - Muhammad Haseeb (https://github.com/mhaseeb123)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/16230
---
 cpp/src/join/join_common_utils.hpp       |  6 --
 cpp/src/join/mixed_join_common_utils.cuh | 33 +++++++++
 cpp/src/join/mixed_join_kernels_semi.cu  | 35 ++++-----
 cpp/src/join/mixed_join_kernels_semi.cuh |  6 +-
 cpp/src/join/mixed_join_semi.cu          | 90 +++++++-----------------
 cpp/tests/join/mixed_join_tests.cu       | 30 ++++++++
 6 files changed, 109 insertions(+), 91 deletions(-)

diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp
index 86402a0e7de..573101cefd9 100644
--- a/cpp/src/join/join_common_utils.hpp
+++ b/cpp/src/join/join_common_utils.hpp
@@ -22,7 +22,6 @@
 #include
 #include
-#include

 #include
 #include
@@ -51,11 +50,6 @@ using mixed_multimap_type =
                               cudf::detail::cuco_allocator,
                               cuco::legacy::double_hashing<1, hash_type, hash_type>>;

-using semi_map_type = cuco::legacy::static_map>;
-
 using row_hash_legacy = cudf::row_hasher;

diff --git a/cpp/src/join/mixed_join_common_utils.cuh b/cpp/src/join/mixed_join_common_utils.cuh
index 19701816867..89c13285cfe 100644
--- a/cpp/src/join/mixed_join_common_utils.cuh
+++ b/cpp/src/join/mixed_join_common_utils.cuh
@@ -25,6 +25,7 @@
 #include
 #include
+#include

 namespace cudf {
 namespace detail {
@@ -160,6 +161,38 @@ struct pair_expression_equality : public expression_equality {
   }
 };

+/**
+ * @brief Equality comparator that composes two row_equality comparators.
+ */
+struct double_row_equality_comparator {
+  row_equality const equality_comparator;
+  row_equality const conditional_comparator;
+
+  __device__ bool operator()(size_type lhs_row_index, size_type rhs_row_index) const noexcept
+  {
+    using experimental::row::lhs_index_type;
+    using experimental::row::rhs_index_type;
+
+    return equality_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index}) &&
+           conditional_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index});
+  }
+};
+
+// A CUDA Cooperative Group of 4 threads for the hash set.
+auto constexpr DEFAULT_MIXED_JOIN_CG_SIZE = 4;
+
+// The hash set type used by mixed_semi_join with the build_table.
+using hash_set_type = cuco::static_set,
+                                       cuda::thread_scope_device,
+                                       double_row_equality_comparator,
+                                       cuco::linear_probing,
+                                       cudf::detail::cuco_allocator,
+                                       cuco::storage<1>>;
+
+// The hash_set_ref_type used by mixed_semi_join kernels for probing.
+using hash_set_ref_type = hash_set_type::ref_type; + } // namespace detail } // namespace cudf diff --git a/cpp/src/join/mixed_join_kernels_semi.cu b/cpp/src/join/mixed_join_kernels_semi.cu index 7459ac3e99c..f2c5ff13638 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cu +++ b/cpp/src/join/mixed_join_kernels_semi.cu @@ -38,12 +38,16 @@ CUDF_KERNEL void __launch_bounds__(block_size) table_device_view right_table, table_device_view probe, table_device_view build, - row_hash const hash_probe, row_equality const equality_probe, - cudf::detail::semi_map_type::device_view hash_table_view, + hash_set_ref_type set_ref, cudf::device_span left_table_keep_mask, cudf::ast::detail::expression_device_view device_expression_data) { + auto constexpr cg_size = hash_set_ref_type::cg_size; + + auto const tile = + cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); + // Normally the casting of a shared memory array is used to create multiple // arrays of different types from the shared memory buffer, but here it is // used to circumvent conflicts between arrays of different types between @@ -52,24 +56,24 @@ CUDF_KERNEL void __launch_bounds__(block_size) cudf::ast::detail::IntermediateDataType* intermediate_storage = reinterpret_cast*>(raw_intermediate_storage); auto thread_intermediate_storage = - &intermediate_storage[threadIdx.x * device_expression_data.num_intermediates]; - - cudf::size_type const left_num_rows = left_table.num_rows(); - cudf::size_type const right_num_rows = right_table.num_rows(); - auto const outer_num_rows = left_num_rows; + &intermediate_storage[tile.meta_group_rank() * device_expression_data.num_intermediates]; - cudf::size_type outer_row_index = threadIdx.x + blockIdx.x * block_size; + cudf::size_type const outer_num_rows = left_table.num_rows(); + auto const outer_row_index = cudf::detail::grid_1d::global_thread_id() / cg_size; auto evaluator = cudf::ast::detail::expression_evaluator( left_table, right_table, device_expression_data); if (outer_row_index < outer_num_rows) { + // Make sure to swap_tables here as hash_set will use probe table as the left one. + auto constexpr swap_tables = true; // Figure out the number of elements for this key. 
auto equality = single_expression_equality{ - evaluator, thread_intermediate_storage, false, equality_probe}; + evaluator, thread_intermediate_storage, swap_tables, equality_probe}; - left_table_keep_mask[outer_row_index] = - hash_table_view.contains(outer_row_index, hash_probe, equality); + auto const set_ref_equality = set_ref.with_key_eq(equality); + auto const result = set_ref_equality.contains(tile, outer_row_index); + if (tile.thread_rank() == 0) left_table_keep_mask[outer_row_index] = result; } } @@ -78,9 +82,8 @@ void launch_mixed_join_semi(bool has_nulls, table_device_view right_table, table_device_view probe, table_device_view build, - row_hash const hash_probe, row_equality const equality_probe, - cudf::detail::semi_map_type::device_view hash_table_view, + hash_set_ref_type set_ref, cudf::device_span left_table_keep_mask, cudf::ast::detail::expression_device_view device_expression_data, detail::grid_1d const config, @@ -94,9 +97,8 @@ void launch_mixed_join_semi(bool has_nulls, right_table, probe, build, - hash_probe, equality_probe, - hash_table_view, + set_ref, left_table_keep_mask, device_expression_data); } else { @@ -106,9 +108,8 @@ void launch_mixed_join_semi(bool has_nulls, right_table, probe, build, - hash_probe, equality_probe, - hash_table_view, + set_ref, left_table_keep_mask, device_expression_data); } diff --git a/cpp/src/join/mixed_join_kernels_semi.cuh b/cpp/src/join/mixed_join_kernels_semi.cuh index 43714ffb36a..b08298e64e4 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cuh +++ b/cpp/src/join/mixed_join_kernels_semi.cuh @@ -45,9 +45,8 @@ namespace detail { * @param[in] right_table The right table * @param[in] probe The table with which to probe the hash table for matches. * @param[in] build The table with which the hash table was built. - * @param[in] hash_probe The hasher used for the probe table. * @param[in] equality_probe The equality comparator used when probing the hash table. - * @param[in] hash_table_view The hash table built from `build`. + * @param[in] set_ref The hash table device view built from `build`. * @param[out] left_table_keep_mask The result of the join operation with "true" element indicating * the corresponding index from left table is present in output * @param[in] device_expression_data Container of device data required to evaluate the desired @@ -58,9 +57,8 @@ void launch_mixed_join_semi(bool has_nulls, table_device_view right_table, table_device_view probe, table_device_view build, - row_hash const hash_probe, row_equality const equality_probe, - cudf::detail::semi_map_type::device_view hash_table_view, + hash_set_ref_type set_ref, cudf::device_span left_table_keep_mask, cudf::ast::detail::expression_device_view device_expression_data, detail::grid_1d const config, diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu index cfb785e242c..719b1d47105 100644 --- a/cpp/src/join/mixed_join_semi.cu +++ b/cpp/src/join/mixed_join_semi.cu @@ -46,45 +46,6 @@ namespace cudf { namespace detail { -namespace { -/** - * @brief Device functor to create a pair of hash value and index for a given row. - */ -struct make_pair_function_semi { - __device__ __forceinline__ cudf::detail::pair_type operator()(size_type i) const noexcept - { - // The value is irrelevant since we only ever use the hash map to check for - // membership of a particular row index. - return cuco::make_pair(static_cast(i), 0); - } -}; - -/** - * @brief Equality comparator that composes two row_equality comparators. 
- */ -class double_row_equality { - public: - double_row_equality(row_equality equality_comparator, row_equality conditional_comparator) - : _equality_comparator{equality_comparator}, _conditional_comparator{conditional_comparator} - { - } - - __device__ bool operator()(size_type lhs_row_index, size_type rhs_row_index) const noexcept - { - using experimental::row::lhs_index_type; - using experimental::row::rhs_index_type; - - return _equality_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index}) && - _conditional_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index}); - } - - private: - row_equality _equality_comparator; - row_equality _conditional_comparator; -}; - -} // namespace - std::unique_ptr> mixed_join_semi( table_view const& left_equality, table_view const& right_equality, @@ -96,7 +57,7 @@ std::unique_ptr> mixed_join_semi( rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_EXPECTS((join_type != join_kind::INNER_JOIN) && (join_type != join_kind::LEFT_JOIN) && + CUDF_EXPECTS((join_type != join_kind::INNER_JOIN) and (join_type != join_kind::LEFT_JOIN) and (join_type != join_kind::FULL_JOIN), "Inner, left, and full joins should use mixed_join."); @@ -137,7 +98,7 @@ std::unique_ptr> mixed_join_semi( // output column and follow the null-supporting expression evaluation code // path. auto const has_nulls = cudf::nullate::DYNAMIC{ - cudf::has_nulls(left_equality) || cudf::has_nulls(right_equality) || + cudf::has_nulls(left_equality) or cudf::has_nulls(right_equality) or binary_predicate.may_evaluate_null(left_conditional, right_conditional, stream)}; auto const parser = ast::detail::expression_parser{ @@ -156,27 +117,20 @@ std::unique_ptr> mixed_join_semi( auto right_conditional_view = table_device_view::create(right_conditional, stream); auto const preprocessed_build = - experimental::row::equality::preprocessed_table::create(build, stream); + cudf::experimental::row::equality::preprocessed_table::create(build, stream); auto const preprocessed_probe = - experimental::row::equality::preprocessed_table::create(probe, stream); + cudf::experimental::row::equality::preprocessed_table::create(probe, stream); auto const row_comparator = - cudf::experimental::row::equality::two_table_comparator{preprocessed_probe, preprocessed_build}; + cudf::experimental::row::equality::two_table_comparator{preprocessed_build, preprocessed_probe}; auto const equality_probe = row_comparator.equal_to(has_nulls, compare_nulls); - semi_map_type hash_table{ - compute_hash_table_size(build.num_rows()), - cuco::empty_key{std::numeric_limits::max()}, - cuco::empty_value{cudf::detail::JoinNoneValue}, - cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, - stream.value()}; - // Create hash table containing all keys found in right table // TODO: To add support for nested columns we will need to flatten in many // places. However, this probably isn't worth adding any time soon since we // won't be able to support AST conditions for those types anyway. auto const build_nulls = cudf::nullate::DYNAMIC{cudf::has_nulls(build)}; auto const row_hash_build = cudf::experimental::row::hash::row_hasher{preprocessed_build}; - auto const hash_build = row_hash_build.device_hasher(build_nulls); + // Since we may see multiple rows that are identical in the equality tables // but differ in the conditional tables, the equality comparator used for // insertion must account for both sets of tables. 
An alternative solution @@ -191,20 +145,28 @@ std::unique_ptr> mixed_join_semi( auto const equality_build_equality = row_comparator_build.equal_to(build_nulls, compare_nulls); auto const preprocessed_build_condtional = - experimental::row::equality::preprocessed_table::create(right_conditional, stream); + cudf::experimental::row::equality::preprocessed_table::create(right_conditional, stream); auto const row_comparator_conditional_build = cudf::experimental::row::equality::two_table_comparator{preprocessed_build_condtional, preprocessed_build_condtional}; auto const equality_build_conditional = row_comparator_conditional_build.equal_to(build_nulls, compare_nulls); - double_row_equality equality_build{equality_build_equality, equality_build_conditional}; - make_pair_function_semi pair_func_build{}; - auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func_build); + hash_set_type row_set{ + {compute_hash_table_size(build.num_rows())}, + cuco::empty_key{JoinNoneValue}, + {equality_build_equality, equality_build_conditional}, + {row_hash_build.device_hasher(build_nulls)}, + {}, + {}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + {stream.value()}}; + + auto iter = thrust::make_counting_iterator(0); // skip rows that are null here. if ((compare_nulls == null_equality::EQUAL) or (not nullable(build))) { - hash_table.insert(iter, iter + right_num_rows, hash_build, equality_build, stream.value()); + row_set.insert(iter, iter + right_num_rows, stream.value()); } else { thrust::counting_iterator stencil(0); auto const [row_bitmask, _] = @@ -212,18 +174,19 @@ std::unique_ptr> mixed_join_semi( row_is_valid pred{static_cast(row_bitmask.data())}; // insert valid rows - hash_table.insert_if( - iter, iter + right_num_rows, stencil, pred, hash_build, equality_build, stream.value()); + row_set.insert_if(iter, iter + right_num_rows, stencil, pred, stream.value()); } - auto hash_table_view = hash_table.get_device_view(); - detail::grid_1d const config(outer_num_rows, DEFAULT_JOIN_BLOCK_SIZE); - auto const shmem_size_per_block = parser.shmem_per_thread * config.num_threads_per_block; + auto const shmem_size_per_block = + parser.shmem_per_thread * + cuco::detail::int_div_ceil(config.num_threads_per_block, hash_set_type::cg_size); auto const row_hash = cudf::experimental::row::hash::row_hasher{preprocessed_probe}; auto const hash_probe = row_hash.device_hasher(has_nulls); + hash_set_ref_type const row_set_ref = row_set.ref(cuco::contains).with_hash_function(hash_probe); + // Vector used to indicate indices from left/probe table which are present in output auto left_table_keep_mask = rmm::device_uvector(probe.num_rows(), stream); @@ -232,9 +195,8 @@ std::unique_ptr> mixed_join_semi( *right_conditional_view, *probe_view, *build_view, - hash_probe, equality_probe, - hash_table_view, + row_set_ref, cudf::device_span(left_table_keep_mask), parser.device_expression_data, config, diff --git a/cpp/tests/join/mixed_join_tests.cu b/cpp/tests/join/mixed_join_tests.cu index 6c147c8a128..08a0136700d 100644 --- a/cpp/tests/join/mixed_join_tests.cu +++ b/cpp/tests/join/mixed_join_tests.cu @@ -778,6 +778,21 @@ TYPED_TEST(MixedLeftSemiJoinTest, BasicEquality) {1}); } +TYPED_TEST(MixedLeftSemiJoinTest, MixedLeftSemiJoinGatherMap) +{ + auto const col_ref_left_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::LEFT); + auto const col_ref_right_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT); + auto left_one_greater_right_one = + 
cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_left_1, col_ref_right_1); + + this->test({{2, 3, 9, 0, 1, 7, 4, 6, 5, 8}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 0}}, + {{6, 5, 9, 8, 10, 32}, {0, 1, 2, 3, 4, 5}, {7, 8, 9, 0, 1, 2}}, + {0}, + {1}, + left_one_greater_right_one, + {2, 7, 8}); +} + TYPED_TEST(MixedLeftSemiJoinTest, BasicEqualityDuplicates) { this->test({{0, 1, 2, 1}, {3, 4, 5, 6}, {10, 20, 30, 40}}, @@ -900,3 +915,18 @@ TYPED_TEST(MixedLeftAntiJoinTest, AsymmetricLeftLargerEquality) left_zero_eq_right_zero, {0, 1, 3}); } + +TYPED_TEST(MixedLeftAntiJoinTest, MixedLeftAntiJoinGatherMap) +{ + auto const col_ref_left_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::LEFT); + auto const col_ref_right_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT); + auto left_one_greater_right_one = + cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_left_1, col_ref_right_1); + + this->test({{2, 3, 9, 0, 1, 7, 4, 6, 5, 8}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 0}}, + {{6, 5, 9, 8, 10, 32}, {0, 1, 2, 3, 4, 5}, {7, 8, 9, 0, 1, 2}}, + {0}, + {1}, + left_one_greater_right_one, + {0, 1, 3, 4, 5, 6, 9}); +} From 42c53247bd3933c83fde18d378902a76d1506c57 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 18 Sep 2024 14:42:09 -0500 Subject: [PATCH 20/32] Use CI workflow branch 'branch-24.10' again (#16832) All RAPIDS libraries have been updated with Python 3.12 support, so Python 3.12 changes have been merged into `branch-24.10` of `shared-workflows`: https://github.com/rapidsai/shared-workflows/pull/213 This updates GitHub Actions configs here to that branch. --- .github/workflows/build.yaml | 28 +++++------ .github/workflows/pandas-tests.yaml | 2 +- .github/workflows/pr.yaml | 48 +++++++++---------- .../workflows/pr_issue_status_automation.yml | 6 +-- .github/workflows/test.yaml | 24 +++++----- 5 files changed, 54 insertions(+), 54 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index d6d3e3fdd33..b5d17022a3a 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -69,7 +69,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-libcudf: secrets: inherit - uses: 
rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: # build for every combination of arch and CUDA version, but only for the latest Python matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) @@ -81,7 +81,7 @@ jobs: wheel-publish-libcudf: needs: wheel-build-libcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -92,7 +92,7 @@ jobs: wheel-build-pylibcudf: needs: [wheel-publish-libcudf] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -102,7 +102,7 @@ jobs: wheel-publish-pylibcudf: needs: wheel-build-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -113,7 +113,7 @@ jobs: wheel-build-cudf: needs: wheel-publish-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -123,7 +123,7 @@ jobs: wheel-publish-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -134,7 +134,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-publish-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -146,7 +146,7 @@ jobs: wheel-publish-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -157,7 +157,7 @@ jobs: wheel-build-cudf-polars: needs: wheel-publish-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -169,7 +169,7 @@ jobs: wheel-publish-cudf-polars: needs: wheel-build-cudf-polars secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pandas-tests.yaml b/.github/workflows/pandas-tests.yaml index d670132cca9..10c803f7921 100644 --- a/.github/workflows/pandas-tests.yaml +++ b/.github/workflows/pandas-tests.yaml @@ -17,7 +17,7 @@ jobs: pandas-tests: # run the Pandas unit tests secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index d7d14ea12ff..b515dbff9f3 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -37,7 +37,7 @@ jobs: - pandas-tests - pandas-tests-diff secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.10 if: always() with: needs: ${{ toJSON(needs) }} @@ -104,39 +104,39 @@ jobs: - '!notebooks/**' checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.10 with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10 with: build_type: pull-request conda-cpp-checks: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.10 with: build_type: pull-request enable_check_symbols: true conda-cpp-tests: needs: [conda-cpp-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10 if: needs.changed-files.outputs.test_cpp == 'true' with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10 with: build_type: pull-request conda-python-cudf-tests: needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 if: needs.changed-files.outputs.test_python == 'true' with: build_type: pull-request @@ -145,7 +145,7 @@ jobs: # Tests for dask_cudf, custreamz, cudf_kafka are 
separated for CI parallelism needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 if: needs.changed-files.outputs.test_python == 'true' with: build_type: pull-request @@ -153,7 +153,7 @@ jobs: conda-java-tests: needs: [conda-cpp-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 if: needs.changed-files.outputs.test_java == 'true' with: build_type: pull-request @@ -164,7 +164,7 @@ jobs: static-configure: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: pull-request # Use the wheel container so we can skip conda solves and since our @@ -174,7 +174,7 @@ jobs: conda-notebook-tests: needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 if: needs.changed-files.outputs.test_notebooks == 'true' with: build_type: pull-request @@ -185,7 +185,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -195,7 +195,7 @@ jobs: wheel-build-libcudf: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: # build for every combination of arch and CUDA version, but only for the latest Python matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) @@ -204,21 +204,21 @@ jobs: wheel-build-pylibcudf: needs: [checks, wheel-build-libcudf] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: build_type: pull-request script: "ci/build_wheel_pylibcudf.sh" wheel-build-cudf: needs: wheel-build-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: build_type: pull-request script: "ci/build_wheel_cudf.sh" wheel-tests-cudf: needs: [wheel-build-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 if: needs.changed-files.outputs.test_python == 'true' with: build_type: pull-request @@ -226,7 +226,7 @@ jobs: wheel-build-cudf-polars: needs: wheel-build-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -235,7 +235,7 @@ jobs: wheel-tests-cudf-polars: needs: [wheel-build-cudf-polars, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 if: needs.changed-files.outputs.test_python == 'true' with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". @@ -247,7 +247,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -256,7 +256,7 @@ jobs: wheel-tests-dask-cudf: needs: [wheel-build-dask-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 if: needs.changed-files.outputs.test_python == 'true' with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". @@ -265,7 +265,7 @@ jobs: script: ci/test_wheel_dask_cudf.sh devcontainer: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.10 with: arch: '["amd64"]' cuda: '["12.5"]' @@ -276,7 +276,7 @@ jobs: unit-tests-cudf-pandas: needs: [wheel-build-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 if: needs.changed-files.outputs.test_python == 'true' with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". @@ -287,7 +287,7 @@ jobs: # run the Pandas unit tests using PR branch needs: [wheel-build-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 if: needs.changed-files.outputs.test_python == 'true' with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
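The `matrix_filter` jq expression above recurs throughout these workflows; mirrored in plain Python it is easier to read. The sketch below is illustrative only: the matrix entries are made up, and the real matrix is defined by the rapidsai/shared-workflows repository; it assumes entries shaped like `{ARCH, PY_VER, CUDA_VER}`.

```python
from itertools import groupby

# Made-up matrix entries purely for illustration.
matrix = [
    {"ARCH": "amd64", "PY_VER": "3.10", "CUDA_VER": "11.8.0"},
    {"ARCH": "arm64", "PY_VER": "3.12", "CUDA_VER": "12.5.1"},
    {"ARCH": "amd64", "PY_VER": "3.11", "CUDA_VER": "12.5.1"},
    {"ARCH": "amd64", "PY_VER": "3.12", "CUDA_VER": "12.5.1"},
]

def ver(s):
    # "12.5.1" -> (12, 5, 1), mirroring jq's split(".") | map(tonumber)
    return tuple(int(x) for x in s.split("."))

amd64 = sorted((e for e in matrix if e["ARCH"] == "amd64"),
               key=lambda e: ver(e["CUDA_VER"])[0])  # jq's group_by also sorts
picked = [max(grp, key=lambda e: (ver(e["PY_VER"]), ver(e["CUDA_VER"])))
          for _, grp in groupby(amd64, key=lambda e: ver(e["CUDA_VER"])[0])]
print(picked)  # one entry per CUDA major: CUDA 11 + py3.10, CUDA 12 + py3.12
```

Note that jq's `group_by` sorts by its key, so the Python mirror likewise pre-sorts the amd64 entries by CUDA major version before grouping.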
@@ -299,7 +299,7 @@ jobs: pandas-tests-diff: # diff the results of running the Pandas unit tests and publish a job summary needs: pandas-tests - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: node_type: cpu4 build_type: pull-request diff --git a/.github/workflows/pr_issue_status_automation.yml b/.github/workflows/pr_issue_status_automation.yml index fe77ad4b6b2..45e5191eb54 100644 --- a/.github/workflows/pr_issue_status_automation.yml +++ b/.github/workflows/pr_issue_status_automation.yml @@ -23,7 +23,7 @@ on: jobs: get-project-id: - uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@branch-24.10 if: github.event.pull_request.state == 'open' secrets: inherit permissions: @@ -34,7 +34,7 @@ jobs: update-status: # This job sets the PR and its linked issues to "In Progress" status - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-24.10 if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: get-project-id with: @@ -50,7 +50,7 @@ jobs: update-sprint: # This job sets the PR and its linked issues to the current "Weekly Sprint" - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-24.10 if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: get-project-id with: diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 4af6a0d690d..8605fa46f68 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -25,7 +25,7 @@ jobs: enable_check_symbols: true conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -33,7 +33,7 @@ jobs: sha: ${{ inputs.sha }} conda-cpp-memcheck-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -45,7 +45,7 @@ jobs: run_script: "ci/test_cpp_memcheck.sh" static-configure: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: pull-request # Use the wheel container so we can skip conda solves and since our @@ -54,7 +54,7 @@ jobs: run_script: "ci/configure_cpp_static.sh" conda-python-cudf-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12 + uses: 
rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -64,7 +64,7 @@ jobs: conda-python-other-tests: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -73,7 +73,7 @@ jobs: script: "ci/test_python_other.sh" conda-java-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -85,7 +85,7 @@ jobs: run_script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -97,7 +97,7 @@ jobs: run_script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -106,7 +106,7 @@ jobs: script: ci/test_wheel_cudf.sh wheel-tests-dask-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -117,7 +117,7 @@ jobs: script: ci/test_wheel_dask_cudf.sh unit-tests-cudf-pandas: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -126,7 +126,7 @@ jobs: script: ci/cudf_pandas_scripts/run_tests.sh third-party-integration-tests-cudf-pandas: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} From a0c6fc8300bb713721c355feec21e43c83268b47 Mon Sep 17 00:00:00 2001 From: Jayjeet Chakraborty Date: Wed, 18 Sep 2024 20:52:23 -0700 Subject: [PATCH 21/32] Rename the NDS-H benchmark binaries (#16831) Renames the NDS-H benchmark binaries with 0 prefixes for better lexicographical sorting Authors: - Jayjeet Chakraborty (https://github.com/JayjeetAtGithub) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16831 --- cpp/benchmarks/CMakeLists.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 6c5f4a68a4c..abc6f74fccf 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -177,11 +177,11 @@ ConfigureBench(TRANSPOSE_BENCH transpose/transpose.cpp) # ################################################################################################## # * 
nds-h benchmark -------------------------------------------------------------------------------- -ConfigureNVBench(NDSH_Q1 ndsh/q01.cpp ndsh/utilities.cpp) -ConfigureNVBench(NDSH_Q5 ndsh/q05.cpp ndsh/utilities.cpp) -ConfigureNVBench(NDSH_Q6 ndsh/q06.cpp ndsh/utilities.cpp) -ConfigureNVBench(NDSH_Q9 ndsh/q09.cpp ndsh/utilities.cpp) -ConfigureNVBench(NDSH_Q10 ndsh/q10.cpp ndsh/utilities.cpp) +ConfigureNVBench(NDSH_Q01_NVBENCH ndsh/q01.cpp ndsh/utilities.cpp) +ConfigureNVBench(NDSH_Q05_NVBENCH ndsh/q05.cpp ndsh/utilities.cpp) +ConfigureNVBench(NDSH_Q06_NVBENCH ndsh/q06.cpp ndsh/utilities.cpp) +ConfigureNVBench(NDSH_Q09_NVBENCH ndsh/q09.cpp ndsh/utilities.cpp) +ConfigureNVBench(NDSH_Q10_NVBENCH ndsh/q10.cpp ndsh/utilities.cpp) # ################################################################################################## # * stream_compaction benchmark ------------------------------------------------------------------- From 30e3946ae79396b7fd09ea368fada0df4babea85 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Thu, 19 Sep 2024 01:44:30 -0400 Subject: [PATCH 22/32] Whitespace normalization of nested column coerced as string column in JSONL inputs (#16759) Addresses #15280 Whitespace normalization is expected to remove unquoted whitespace characters in JSON lines inputs. However, when a JSON line is invalid because an unquoted whitespace occurs between numbers or literals, the existing normalization is incorrect: it removes the very whitespace characters that invalidate the line and thereby makes it valid. This PR instead implements the normalization as a post-processing step applied only to nested columns coerced to string columns. Idea: 1. Create a single buffer by concatenating the rows of the string column. Create segment offsets and lengths arrays for the concatenated buffer 2. Run a whitespace normalization FST that is a no-op for non-whitespace and quoted whitespace characters, and outputs the indices of unquoted whitespace characters 3. Update segment lengths based on the number of output indices between segment offsets 4. Remove characters at output indices from concatenated buffer. 5.
Return updated buffer, segment lengths and updated segment offsets (a small host-side sketch of steps 3-5 follows below) Authors: - Shruti Shivakumar (https://github.com/shrshi) - Karthikeyan (https://github.com/karthikeyann) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) - Vukasin Milovanovic (https://github.com/vuule) - Nghia Truong (https://github.com/ttnghia) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/16759 --- cpp/include/cudf/io/detail/json.hpp | 16 +- cpp/src/io/json/json_column.cu | 149 +++++++++----- cpp/src/io/json/json_normalization.cu | 165 ++++++++++++---- cpp/src/io/json/nested_json_gpu.cu | 10 +- cpp/src/io/json/read_json.cu | 6 - cpp/src/io/utilities/parsing_utils.cuh | 6 + cpp/tests/io/json/json_test.cpp | 43 +++++ .../json_whitespace_normalization_test.cu | 182 +++++++++--------- 8 files changed, 388 insertions(+), 189 deletions(-) diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index 73ff17b2b93..940d03cdb41 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -69,11 +69,21 @@ void normalize_single_quotes(datasource::owning_buffer& inda * @brief Normalize unquoted whitespace (space and tab characters) using FST * * @param indata Input device buffer + * @param col_offsets Offsets to column contents in input buffer + * @param col_lengths Length of contents of each row in column * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation + * + * @returns Tuple of the normalized column, offsets to each row in column, and lengths of contents + * of each row */ -void normalize_whitespace(datasource::owning_buffer& indata, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); +std:: + tuple<rmm::device_uvector<char>, rmm::device_uvector<size_type>, rmm::device_uvector<size_type>> + normalize_whitespace(device_span<char const> d_input, + device_span<size_type const> col_offsets, + device_span<size_type const> col_lengths, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + } // namespace io::json::detail } // namespace CUDF_EXPORT cudf diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 8890c786287..756047d383a 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -625,6 +626,8 @@ void make_device_json_column(device_span input, auto ignore_vals = cudf::detail::make_host_vector(num_columns, stream); std::vector is_mixed_type_column(num_columns, 0); std::vector is_pruned(num_columns, 0); + // for columns that are not mixed type but have been forced as string + std::vector forced_as_string_column(num_columns); columns.try_emplace(parent_node_sentinel, std::ref(root)); std::function remove_child_columns = @@ -695,11 +698,14 @@ void make_device_json_column(device_span input, // Struct, List, String, Value auto [name, parent_col_id] = name_and_parent_index(this_col_id); - // if parent is mixed type column or this column is pruned, ignore this column. + // if parent is mixed type column or this column is pruned or if parent + // has been forced as string, ignore this column.
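The host-side sketch of steps 3-5 promised in the idea list above: plain Python with made-up row data, illustrating only the segment bookkeeping. It is not the device implementation, which instead uses a binary search (`thrust::upper_bound`) with an atomic decrement per index, a scatter into a stencil, `remove_if`, and an exclusive scan.

```python
from bisect import bisect_right
from itertools import accumulate

def remove_unquoted_ws(buf, offsets, lengths, drop_idx):
    # Step 3: each dropped character shortens the row that owns it; the
    # owning row is found by binary search over the segment offsets.
    lengths = list(lengths)
    for i in drop_idx:
        lengths[bisect_right(offsets, i) - 1] -= 1
    # Step 4: remove the flagged characters from the concatenated buffer.
    drop = set(drop_idx)
    buf = bytes(b for j, b in enumerate(buf) if j not in drop)
    # Step 5: exclusive scan of the shrunken lengths gives new offsets.
    return buf, [0] + list(accumulate(lengths))[:-1], lengths

# Two concatenated rows; unquoted spaces sit at indices 1, 6 and 8.
rows = b'{ "a": 1 }{"b":2}'
print(remove_unquoted_ws(rows, [0, 10], [10, 7], [1, 6, 8]))
# -> (b'{"a":1}{"b":2}', [0, 7], [7, 7])
```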
if (parent_col_id != parent_node_sentinel && - (is_mixed_type_column[parent_col_id] || is_pruned[this_col_id])) { + (is_mixed_type_column[parent_col_id] || is_pruned[this_col_id]) || + forced_as_string_column[parent_col_id]) { ignore_vals[this_col_id] = 1; if (is_mixed_type_column[parent_col_id]) { is_mixed_type_column[this_col_id] = 1; } + if (forced_as_string_column[parent_col_id]) { forced_as_string_column[this_col_id] = true; } continue; } @@ -765,22 +771,26 @@ void make_device_json_column(device_span input, } auto this_column_category = column_categories[this_col_id]; - if (is_enabled_mixed_types_as_string) { - // get path of this column, check if it is a struct/list forced as string, and enforce it - auto const nt = tree_path.get_path(this_col_id); - std::optional const user_dtype = get_path_data_type(nt, options); - if ((column_categories[this_col_id] == NC_STRUCT or - column_categories[this_col_id] == NC_LIST) and - user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) { - is_mixed_type_column[this_col_id] = 1; - this_column_category = NC_STR; - } + // get path of this column, check if it is a struct/list forced as string, and enforce it + auto const nt = tree_path.get_path(this_col_id); + std::optional const user_dtype = get_path_data_type(nt, options); + if ((column_categories[this_col_id] == NC_STRUCT or + column_categories[this_col_id] == NC_LIST) and + user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) { + this_column_category = NC_STR; } CUDF_EXPECTS(parent_col.child_columns.count(name) == 0, "duplicate column name: " + name); // move into parent device_json_column col(stream, mr); initialize_json_columns(this_col_id, col, this_column_category); + if ((column_categories[this_col_id] == NC_STRUCT or + column_categories[this_col_id] == NC_LIST) and + user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) { + col.forced_as_string_column = true; + forced_as_string_column[this_col_id] = true; + } + auto inserted = parent_col.child_columns.try_emplace(name, std::move(col)).second; CUDF_EXPECTS(inserted, "child column insertion failed, duplicate column name in the parent"); if (not replaced) parent_col.column_order.push_back(name); @@ -802,12 +812,30 @@ void make_device_json_column(device_span input, is_mixed_type_column[this_col_id] == 1) column_categories[this_col_id] = NC_STR; } - cudaMemcpyAsync(d_column_tree.node_categories.begin(), - column_categories.data(), - column_categories.size() * sizeof(column_categories[0]), - cudaMemcpyDefault, - stream.value()); + cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(), + column_categories.data(), + column_categories.size() * sizeof(column_categories[0]), + cudf::detail::host_memory_kind::PAGEABLE, + stream); + } + + // ignore all children of columns forced as string + for (auto const this_col_id : unique_col_ids) { + auto parent_col_id = column_parent_ids[this_col_id]; + if (parent_col_id != parent_node_sentinel and forced_as_string_column[parent_col_id]) { + forced_as_string_column[this_col_id] = true; + ignore_vals[this_col_id] = 1; + } + // Convert only mixed type columns as string (so to copy), but not its children + if (parent_col_id != parent_node_sentinel and not forced_as_string_column[parent_col_id] and + forced_as_string_column[this_col_id]) + column_categories[this_col_id] = NC_STR; } + cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(), + column_categories.data(), + column_categories.size() * sizeof(column_categories[0]), + 
cudf::detail::host_memory_kind::PAGEABLE, + stream); // restore unique_col_ids order std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { @@ -982,39 +1010,58 @@ std::pair, std::vector> device_json_co "string offset, string length mismatch"); rmm::device_uvector d_string_data(col_size, stream); // TODO how about directly storing pair in json_column? - auto offset_length_it = - thrust::make_zip_iterator(json_col.string_offsets.begin(), json_col.string_lengths.begin()); - data_type target_type{}; + auto [result_bitmask, null_count] = make_validity(json_col); - if (schema.has_value()) { + data_type target_type{}; + std::unique_ptr col{}; + if (options.normalize_whitespace && json_col.forced_as_string_column) { + CUDF_EXPECTS(prune_columns || options.mixed_types_as_string, + "Whitespace normalization of nested columns requested as string requires " + "either prune_columns or mixed_types_as_string to be enabled"); + auto [normalized_d_input, col_offsets, col_lengths] = + cudf::io::json::detail::normalize_whitespace( + d_input, json_col.string_offsets, json_col.string_lengths, stream, mr); + auto offset_length_it = thrust::make_zip_iterator(col_offsets.begin(), col_lengths.begin()); + target_type = data_type{type_id::STRING}; + // Convert strings to the inferred data type + col = parse_data(normalized_d_input.data(), + offset_length_it, + col_size, + target_type, + std::move(result_bitmask), + null_count, + options.view(), + stream, + mr); + } else { + auto offset_length_it = thrust::make_zip_iterator(json_col.string_offsets.begin(), + json_col.string_lengths.begin()); + if (schema.has_value()) { #ifdef NJP_DEBUG_PRINT - std::cout << "-> explicit type: " - << (schema.has_value() ? std::to_string(static_cast(schema->type.id())) - : "n/a"); + std::cout << "-> explicit type: " + << (schema.has_value() ? 
std::to_string(static_cast(schema->type.id())) + : "n/a"); #endif - target_type = schema.value().type; - } else if (json_col.forced_as_string_column) { - target_type = data_type{type_id::STRING}; - } - // Infer column type, if we don't have an explicit type for it - else { - target_type = cudf::io::detail::infer_data_type( - options.json_view(), d_input, offset_length_it, col_size, stream); + target_type = schema.value().type; + } + // Infer column type, if we don't have an explicit type for it + else { + target_type = cudf::io::detail::infer_data_type( + options.json_view(), d_input, offset_length_it, col_size, stream); + } + // Convert strings to the inferred data type + col = parse_data(d_input.data(), + offset_length_it, + col_size, + target_type, + std::move(result_bitmask), + null_count, + options.view(), + stream, + mr); } - auto [result_bitmask, null_count] = make_validity(json_col); - // Convert strings to the inferred data type - auto col = parse_data(d_input.data(), - offset_length_it, - col_size, - target_type, - std::move(result_bitmask), - null_count, - options.view(), - stream, - mr); - // Reset nullable if we do not have nulls // This is to match the existing JSON reader's behaviour: // - Non-string columns will always be returned as nullable @@ -1120,11 +1167,15 @@ table_with_metadata device_parse_nested_json(device_span d_input, const auto [tokens_gpu, token_indices_gpu] = get_token_stream(d_input, options, stream, cudf::get_current_device_resource_ref()); // gpu tree generation - return get_tree_representation(tokens_gpu, - token_indices_gpu, - options.is_enabled_mixed_types_as_string(), - stream, - cudf::get_current_device_resource_ref()); + // Note that to normalize whitespaces in nested columns coerced to be string, we need the column + // to either be of mixed type or we need to request the column to be returned as string by + // pruning it with the STRING dtype + return get_tree_representation( + tokens_gpu, + token_indices_gpu, + options.is_enabled_mixed_types_as_string() || options.is_enabled_prune_columns(), + stream, + cudf::get_current_device_resource_ref()); }(); // IILE used to free memory of token data. #ifdef NJP_DEBUG_PRINT auto h_input = cudf::detail::make_host_vector_async(d_input, stream); diff --git a/cpp/src/io/json/json_normalization.cu b/cpp/src/io/json/json_normalization.cu index 97d5884fef1..2d435dc8e1a 100644 --- a/cpp/src/io/json/json_normalization.cu +++ b/cpp/src/io/json/json_normalization.cu @@ -17,6 +17,7 @@ #include "io/fst/lookup_tables.cuh" #include +#include #include #include #include @@ -25,8 +26,17 @@ #include #include #include - +#include + +#include +#include +#include +#include +#include +#include #include +#include +#include #include #include @@ -215,14 +225,6 @@ std::array, NUM_SYMBOL_GROUPS - 1> const wna_sgs{ * | state is necessary to process escaped double-quote characters. Without this * | state, whitespaces following escaped double quotes inside strings may be removed. * - * NOTE: An important case NOT handled by this FST is that of whitespace following newline - * characters within a string. Consider the following example - * Input: {"a":"x\n y"} - * FST output: {"a":"x\ny"} - * Expected output: {"a":"x\n y"} - * Such strings are not part of the JSON standard (characters allowed within quotes should - * have ASCII at least 0x20 i.e. 
space character and above) but may be encountered while - * reading JSON files */ enum class dfa_states : StateT { TT_OOS = 0U, TT_DQS, TT_DEC, TT_NUM_STATES }; // Aliases for readability of the transition table @@ -255,17 +257,17 @@ struct TransduceToNormalizedWS { // Let the alphabet set be Sigma // --------------------------------------- // ---------- NON-SPECIAL CASES: ---------- - // Output symbol same as input symbol + // Input symbol translates to output symbol // state | read_symbol -> output_symbol - // DQS | Sigma -> Sigma - // OOS | Sigma\{<SPC>,\t} -> Sigma\{<SPC>,\t} - // DEC | Sigma -> Sigma + // DQS | Sigma -> <nop> + // OOS | Sigma\{<SPC>,\t} -> <nop> + // DEC | Sigma -> <nop> // ---------- SPECIAL CASES: -------------- - // Input symbol translates to output symbol - // OOS | {<SPC>} -> <nop> - // OOS | {\t} -> <nop> + // Output symbol same as input symbol + // OOS | {<SPC>} -> {<SPC>} + // OOS | {\t} -> {\t} - // Case when read symbol is a space or tab but is unquoted + // Case when read symbol is not an unquoted space or tab // This will be the same condition as in `operator()(state_id, match_id, read_symbol)` function // However, since there is no output in this case i.e. the count returned by // operator()(state_id, match_id, read_symbol) is zero, this function is never called. @@ -287,8 +289,8 @@ struct TransduceToNormalizedWS { SymbolT const read_symbol) const { // Case when read symbol is a space or tab but is unquoted - if (match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::WHITESPACE_SYMBOLS) && - state_id == static_cast<StateT>(dfa_states::TT_OOS)) { + if (!(match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::WHITESPACE_SYMBOLS) && + state_id == static_cast<StateT>(dfa_states::TT_OOS))) { return 0; } return 1; } @@ -328,33 +330,126 @@ void normalize_single_quotes(datasource::owning_buffer& inda std::swap(indata, outdata); } -void normalize_whitespace(datasource::owning_buffer& indata, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +std:: + tuple<rmm::device_uvector<char>, rmm::device_uvector<size_type>, rmm::device_uvector<size_type>> + normalize_whitespace(device_span<char const> d_input, + device_span<size_type const> col_offsets, + device_span<size_type const> col_lengths, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); - static constexpr std::int32_t min_out = 0; - static constexpr std::int32_t max_out = 2; + /* + * Algorithm: + 1. Create a single buffer by concatenating the rows of the string column. Create segment offsets + and lengths array for concatenated buffer + 2. Run a whitespace normalization FST that performs NOP for non-whitespace and quoted + whitespace characters, and outputs indices of unquoted whitespace characters + 3. Update segment lengths based on the number of output indices between segment offsets + 4. Remove characters at output indices from concatenated buffer. + 5.
Return updated buffer, segment lengths and updated segment offsets + */ + auto inbuf_lengths = cudf::detail::make_device_uvector_async( + col_lengths, stream, cudf::get_current_device_resource_ref()); + size_t inbuf_lengths_size = inbuf_lengths.size(); + size_type inbuf_size = + thrust::reduce(rmm::exec_policy_nosync(stream), inbuf_lengths.begin(), inbuf_lengths.end()); + rmm::device_uvector inbuf(inbuf_size, stream); + rmm::device_uvector inbuf_offsets(inbuf_lengths_size, stream); + thrust::exclusive_scan(rmm::exec_policy_nosync(stream), + inbuf_lengths.begin(), + inbuf_lengths.end(), + inbuf_offsets.begin(), + 0); + + auto input_it = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type( + [d_input = d_input.begin(), col_offsets = col_offsets.begin()] __device__( + size_t i) -> char const* { return &d_input[col_offsets[i]]; })); + auto output_it = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type( + [inbuf = inbuf.begin(), inbuf_offsets = inbuf_offsets.cbegin()] __device__( + size_t i) -> char* { return &inbuf[inbuf_offsets[i]]; })); + + { + // cub device batched copy + size_t temp_storage_bytes = 0; + cub::DeviceCopy::Batched(nullptr, + temp_storage_bytes, + input_it, + output_it, + inbuf_lengths.begin(), + inbuf_lengths_size, + stream.value()); + rmm::device_buffer temp_storage(temp_storage_bytes, stream); + cub::DeviceCopy::Batched(temp_storage.data(), + temp_storage_bytes, + input_it, + output_it, + inbuf_lengths.begin(), + inbuf_lengths_size, + stream.value()); + } + + // whitespace normalization : get the indices of the unquoted whitespace characters auto parser = fst::detail::make_fst(fst::detail::make_symbol_group_lut(normalize_whitespace::wna_sgs), fst::detail::make_transition_table(normalize_whitespace::wna_state_tt), - fst::detail::make_translation_functor( + fst::detail::make_translation_functor( normalize_whitespace::TransduceToNormalizedWS{}), stream); - rmm::device_buffer outbuf(indata.size(), stream, mr); - rmm::device_scalar outbuf_size(stream, mr); - parser.Transduce(reinterpret_cast(indata.data()), - static_cast(indata.size()), - static_cast(outbuf.data()), + rmm::device_uvector outbuf_indices(inbuf.size(), stream, mr); + rmm::device_scalar outbuf_indices_size(stream, mr); + parser.Transduce(inbuf.data(), + static_cast(inbuf.size()), thrust::make_discard_iterator(), - outbuf_size.data(), + outbuf_indices.data(), + outbuf_indices_size.data(), normalize_whitespace::start_state, stream); - outbuf.resize(outbuf_size.value(stream), stream); - datasource::owning_buffer outdata(std::move(outbuf)); - std::swap(indata, outdata); + auto const num_deletions = outbuf_indices_size.value(stream); + outbuf_indices.resize(num_deletions, stream); + + // now these indices need to be removed + // TODO: is there a better way to do this? 
+ thrust::for_each( + rmm::exec_policy_nosync(stream), + outbuf_indices.begin(), + outbuf_indices.end(), + [inbuf_offsets_begin = inbuf_offsets.begin(), + inbuf_offsets_end = inbuf_offsets.end(), + inbuf_lengths = inbuf_lengths.begin()] __device__(size_type idx) { + auto it = thrust::upper_bound(thrust::seq, inbuf_offsets_begin, inbuf_offsets_end, idx); + auto pos = thrust::distance(inbuf_offsets_begin, it) - 1; + cuda::atomic_ref ref{*(inbuf_lengths + pos)}; + ref.fetch_add(-1, cuda::std::memory_order_relaxed); + }); + + auto stencil = cudf::detail::make_zeroed_device_uvector_async( + static_cast(inbuf_size), stream, cudf::get_current_device_resource_ref()); + thrust::scatter(rmm::exec_policy_nosync(stream), + thrust::make_constant_iterator(true), + thrust::make_constant_iterator(true) + num_deletions, + outbuf_indices.begin(), + stencil.begin()); + thrust::remove_if(rmm::exec_policy_nosync(stream), + inbuf.begin(), + inbuf.end(), + stencil.begin(), + thrust::identity()); + inbuf.resize(inbuf_size - num_deletions, stream); + + thrust::exclusive_scan(rmm::exec_policy_nosync(stream), + inbuf_lengths.begin(), + inbuf_lengths.end(), + inbuf_offsets.begin(), + 0); + + stream.synchronize(); + return std::tuple{std::move(inbuf), std::move(inbuf_offsets), std::move(inbuf_lengths)}; } } // namespace detail diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 4e513d3495c..1c15e147b13 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -2079,10 +2079,12 @@ cudf::io::parse_options parsing_options(cudf::io::json_reader_options const& opt { auto parse_opts = cudf::io::parse_options{',', '\n', '\"', '.'}; - parse_opts.dayfirst = options.is_enabled_dayfirst(); - parse_opts.keepquotes = options.is_enabled_keep_quotes(); - parse_opts.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); - parse_opts.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); + parse_opts.dayfirst = options.is_enabled_dayfirst(); + parse_opts.keepquotes = options.is_enabled_keep_quotes(); + parse_opts.normalize_whitespace = options.is_enabled_normalize_whitespace(); + parse_opts.mixed_types_as_string = options.is_enabled_mixed_types_as_string(); + parse_opts.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); + parse_opts.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); std::vector na_values{"", "null"}; na_values.insert(na_values.end(), options.get_na_values().begin(), options.get_na_values().end()); parse_opts.trie_na = cudf::detail::create_serialized_trie(na_values, stream); diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index bd82b040359..99a5b17bce8 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -232,12 +232,6 @@ table_with_metadata read_batch(host_span> sources, normalize_single_quotes(bufview, stream, cudf::get_current_device_resource_ref()); } - // If input JSON buffer has unquoted spaces and tabs and option to normalize whitespaces is - // enabled, invoke pre-processing FST - if (reader_opts.is_enabled_normalize_whitespace()) { - normalize_whitespace(bufview, stream, cudf::get_current_device_resource_ref()); - } - auto buffer = cudf::device_span(reinterpret_cast(bufview.data()), bufview.size()); stream.synchronize(); diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index bc2722441d0..734067582f7 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ 
b/cpp/src/io/utilities/parsing_utils.cuh @@ -67,6 +67,8 @@ struct parse_options_view { bool doublequote; bool dayfirst; bool skipblanklines; + bool normalize_whitespace; + bool mixed_types_as_string; cudf::detail::trie_view trie_true; cudf::detail::trie_view trie_false; cudf::detail::trie_view trie_na; @@ -85,6 +87,8 @@ struct parse_options { bool doublequote; bool dayfirst; bool skipblanklines; + bool normalize_whitespace; + bool mixed_types_as_string; cudf::detail::optional_trie trie_true; cudf::detail::optional_trie trie_false; cudf::detail::optional_trie trie_na; @@ -111,6 +115,8 @@ struct parse_options { doublequote, dayfirst, skipblanklines, + normalize_whitespace, + mixed_types_as_string, cudf::detail::make_trie_view(trie_true), cudf::detail::make_trie_view(trie_false), cudf::detail::make_trie_view(trie_na), diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp index 960c19fce2e..48bc982d0e3 100644 --- a/cpp/tests/io/json/json_test.cpp +++ b/cpp/tests/io/json/json_test.cpp @@ -2856,4 +2856,47 @@ TEST_F(JsonReaderTest, JSONMixedTypeChildren) } } +TEST_F(JsonReaderTest, JsonDtypeSchema) +{ + std::string data = R"( + {"a": 1, "b": {"0": "abc", "1": ["a", "b"]}, "c": true} + {"a": 1, "b": {"0": "abc" }, "c": false} + {"a": 1, "b": {"0": "lolol "}, "c": true} + )"; + + std::map dtype_schema{{"c", {data_type{type_id::STRING}}}, + {"b", {data_type{type_id::STRING}}}, + {"a", {dtype()}}}; + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()}) + .dtypes(dtype_schema) + .prune_columns(true) + .lines(true); + + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + + EXPECT_EQ(result.tbl->num_columns(), 3); + EXPECT_EQ(result.tbl->num_rows(), 3); + + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::FLOAT64); + EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::STRING); + EXPECT_EQ(result.tbl->get_column(2).type().id(), cudf::type_id::STRING); + + EXPECT_EQ(result.metadata.schema_info[0].name, "a"); + EXPECT_EQ(result.metadata.schema_info[1].name, "b"); + EXPECT_EQ(result.metadata.schema_info[2].name, "c"); + + // cudf::column::contents contents = result.tbl->get_column(1).release(); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), float64_wrapper{{1, 1, 1}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + result.tbl->get_column(1), + cudf::test::strings_column_wrapper({"{\"0\": \"abc\", \"1\": [\"a\", \"b\"]}", + "{\"0\": \"abc\" }", + "{\"0\": \"lolol \"}"}), + cudf::test::debug_output_level::ALL_ERRORS); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(2), + cudf::test::strings_column_wrapper({"true", "false", "true"}), + cudf::test::debug_output_level::ALL_ERRORS); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/json/json_whitespace_normalization_test.cu b/cpp/tests/io/json/json_whitespace_normalization_test.cu index 6d79fdc98ef..6a3bd69de81 100644 --- a/cpp/tests/io/json/json_whitespace_normalization_test.cu +++ b/cpp/tests/io/json/json_whitespace_normalization_test.cu @@ -34,129 +34,127 @@ // Base test fixture for tests struct JsonWSNormalizationTest : public cudf::test::BaseFixture {}; -void run_test(std::string const& host_input, std::string const& expected_host_output) -{ - // Prepare cuda stream for data transfers & kernels - auto stream_view = cudf::test::get_default_stream(); - - auto device_input = rmm::device_buffer( - host_input.c_str(), host_input.size(), stream_view, cudf::get_current_device_resource_ref()); - - 
// Preprocessing FST - cudf::io::datasource::owning_buffer device_data(std::move(device_input)); - cudf::io::json::detail::normalize_whitespace( - device_data, stream_view, cudf::get_current_device_resource_ref()); - - std::string preprocessed_host_output(device_data.size(), 0); - CUDF_CUDA_TRY(cudaMemcpyAsync(preprocessed_host_output.data(), - device_data.data(), - preprocessed_host_output.size(), - cudaMemcpyDeviceToHost, - stream_view.value())); - - stream_view.synchronize(); - ASSERT_EQ(preprocessed_host_output.size(), expected_host_output.size()); - CUDF_TEST_EXPECT_VECTOR_EQUAL( - preprocessed_host_output, expected_host_output, preprocessed_host_output.size()); -} - -TEST_F(JsonWSNormalizationTest, GroundTruth_Spaces) +TEST_F(JsonWSNormalizationTest, ReadJsonOption) { - std::string input = R"({ "A" : "TEST" })"; - std::string output = R"({"A":"TEST"})"; - run_test(input, output); -} + // When mixed type fields are read as strings, the table read will differ depending the + // value of normalize_whitespace -TEST_F(JsonWSNormalizationTest, GroundTruth_MoreSpaces) -{ - std::string input = R"({"a": [1, 2, 3, 4, 5, 6, 7, 8], "b": {"c": "d"}})"; - std::string output = R"({"a":[1,2,3,4,5,6,7,8],"b":{"c":"d"}})"; - run_test(input, output); -} + // Test input + std::string const host_input = "{ \"a\" : {\"b\" :\t\"c\"}}"; + cudf::io::json_reader_options input_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{host_input.data(), host_input.size()}) + .lines(true) + .mixed_types_as_string(true) + .normalize_whitespace(true); -TEST_F(JsonWSNormalizationTest, GroundTruth_SpacesInString) -{ - std::string input = R"({" a ":50})"; - std::string output = R"({" a ":50})"; - run_test(input, output); -} + cudf::io::table_with_metadata processed_table = cudf::io::read_json(input_options); -TEST_F(JsonWSNormalizationTest, GroundTruth_NewlineInString) -{ - std::string input = "{\"a\" : \"x\ny\"}\n{\"a\" : \"x\\ny\"}"; - std::string output = "{\"a\":\"x\ny\"}\n{\"a\":\"x\\ny\"}"; - run_test(input, output); -} + // Expected table + std::string const expected_input = R"({ "a" : {"b":"c"}})"; + cudf::io::json_reader_options expected_input_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{expected_input.data(), expected_input.size()}) + .lines(true) + .mixed_types_as_string(true) + .normalize_whitespace(false); -TEST_F(JsonWSNormalizationTest, GroundTruth_Tabs) -{ - std::string input = "{\"a\":\t\"b\"}"; - std::string output = R"({"a":"b"})"; - run_test(input, output); + cudf::io::table_with_metadata expected_table = cudf::io::read_json(expected_input_options); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.tbl->view(), processed_table.tbl->view()); } -TEST_F(JsonWSNormalizationTest, GroundTruth_SpacesAndTabs) +TEST_F(JsonWSNormalizationTest, ReadJsonOption_InvalidRows) { - std::string input = "{\"A\" : \t\"TEST\" }"; - std::string output = R"({"A":"TEST"})"; - run_test(input, output); -} + // When mixed type fields are read as strings, the table read will differ depending the + // value of normalize_whitespace -TEST_F(JsonWSNormalizationTest, GroundTruth_MultilineJSONWithSpacesAndTabs) -{ - std::string input = - "{ \"foo rapids\": [1,2,3], \"bar\trapids\": 123 }\n\t{ \"foo rapids\": { \"a\": 1 }, " - "\"bar\trapids\": 456 }"; - std::string output = - "{\"foo rapids\":[1,2,3],\"bar\trapids\":123}\n{\"foo rapids\":{\"a\":1},\"bar\trapids\":456}"; - run_test(input, output); -} + // Test input + std::string const host_input = R"( + { "Root": { "Key": [ { "EE": tr 
ue } ] } } + { "Root": { "Key": "abc" } } + { "Root": { "Key": [ { "EE": 12 34 } ] } } + { "Root": { "Key": [{ "YY": 1}] } } + { "Root": { "Key": [ { "EE": 12. 34 } ] } } + { "Root": { "Key": [ { "EE": "efg" } ] } } + )"; + cudf::io::json_reader_options input_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{host_input.data(), host_input.size()}) + .lines(true) + .mixed_types_as_string(true) + .normalize_whitespace(true) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL); -TEST_F(JsonWSNormalizationTest, GroundTruth_PureJSONExample) -{ - std::string input = R"([{"a":50}, {"a" : 60}])"; - std::string output = R"([{"a":50},{"a":60}])"; - run_test(input, output); -} + cudf::io::table_with_metadata processed_table = cudf::io::read_json(input_options); -TEST_F(JsonWSNormalizationTest, GroundTruth_NoNormalizationRequired) -{ - std::string input = R"({"a\\n\r\a":50})"; - std::string output = R"({"a\\n\r\a":50})"; - run_test(input, output); -} + // Expected table + std::string const expected_input = R"( + { "Root": { "Key": [ { "EE": tr ue } ] } } + { "Root": { "Key": "abc" } } + { "Root": { "Key": [ { "EE": 12 34 } ] } } + { "Root": { "Key": [{"YY":1}] } } + { "Root": { "Key": [ { "EE": 12. 34 } ] } } + { "Root": { "Key": [{"EE":"efg"}] } } + )"; + cudf::io::json_reader_options expected_input_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{expected_input.data(), expected_input.size()}) + .lines(true) + .mixed_types_as_string(true) + .normalize_whitespace(false) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL); -TEST_F(JsonWSNormalizationTest, GroundTruth_InvalidInput) -{ - std::string input = "{\"a\" : \"b }\n{ \"c \" :\t\"d\"}"; - std::string output = "{\"a\":\"b }\n{\"c \":\"d\"}"; - run_test(input, output); + cudf::io::table_with_metadata expected_table = cudf::io::read_json(expected_input_options); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.tbl->view(), processed_table.tbl->view()); } -TEST_F(JsonWSNormalizationTest, ReadJsonOption) +TEST_F(JsonWSNormalizationTest, ReadJsonOption_InvalidRows_NoMixedType) { // When mixed type fields are read as strings, the table read will differ depending the // value of normalize_whitespace // Test input - std::string const host_input = "{ \"a\" : {\"b\" :\t\"c\"}}"; + std::string const host_input = R"( + { "Root": { "Key": [ { "EE": tr ue } ] } } + { "Root": { "Key": [ { "EE": 12 34 } ] } } + { "Root": { "Key": [{ "YY": 1}] } } + { "Root": { "Key": [ { "EE": 12. 34 } ] } } + { "Root": { "Key": [ { "EE": "efg" }, { "YY" : "abc" } ] } } + { "Root": { "Key": [ { "YY" : "abc" } ] } } + )"; + + std::map dtype_schema{ + {"Key", {cudf::data_type{cudf::type_id::STRING}}}}; + cudf::io::json_reader_options input_options = cudf::io::json_reader_options::builder( cudf::io::source_info{host_input.data(), host_input.size()}) + .dtypes(dtype_schema) .lines(true) - .mixed_types_as_string(true) - .normalize_whitespace(true); + .prune_columns(true) + .normalize_whitespace(true) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL); cudf::io::table_with_metadata processed_table = cudf::io::read_json(input_options); // Expected table - std::string const expected_input = R"({ "a" : {"b":"c"}})"; + std::string const expected_input = R"( + { "Root": { "Key": [ { "EE": tr ue } , { "YY" : 2 } ] } } + { "Root": { "Key": [ { "EE": 12 34 } ] } } + { "Root": { "Key": [{"YY":1}] } } + { "Root": { "Key": [ { "EE": 12. 
34 } ] } } + { "Root": { "Key": [{"EE":"efg"},{"YY":"abc"}] } } + { "Root": { "Key": [{"YY":"abc"}] } } + )"; + cudf::io::json_reader_options expected_input_options = cudf::io::json_reader_options::builder( cudf::io::source_info{expected_input.data(), expected_input.size()}) + .dtypes(dtype_schema) .lines(true) - .mixed_types_as_string(true) - .normalize_whitespace(false); + .prune_columns(true) + .normalize_whitespace(false) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL); cudf::io::table_with_metadata expected_table = cudf::io::read_json(expected_input_options); CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.tbl->view(), processed_table.tbl->view()); From dafb3e7559710d5af7118a206312f250eb671558 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 19 Sep 2024 12:06:53 -0500 Subject: [PATCH 23/32] Generate GPU vs CPU usage metrics per pytest file in pandas testsuite for `cudf.pandas` (#16739) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR introduces GPU and CPU usage reporting to the cudf.pandas pytest suite, and the generated metrics will be available for viewing in the existing pandas pytest summary page: https://github.com/rapidsai/cudf/actions/runs/10886370333/attempts/1#summary-30220192117 ![Screenshot 2024-09-16 at 2 39 07 PM](https://github.com/user-attachments/assets/6d31c7d2-8a27-4f02-bf9d-c1b40ad1d756) For each test file, CPU usage is the fraction of `_slow_function_call` invocations and GPU usage the fraction of `_fast_function_call` invocations; for example, 30 slow and 70 fast calls report as 30% and 70%. Note: I'm aware of cases where both GPU and CPU usage show 0%, which is due to various reasons that I'm working on addressing in a follow-up PR. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Murray (https://github.com/Matt711) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cudf/pull/16739 --- .../pandas-tests/job-summary.py | 14 ++++- python/cudf/cudf/pandas/fast_slow_proxy.py | 16 +++++ .../cudf/pandas/scripts/conftest-patch.py | 59 ++++++++++++++++++- .../cudf/pandas/scripts/run-pandas-tests.sh | 5 +- .../pandas/scripts/summarize-test-results.py | 40 +++++++++++++ 5 files changed, 128 insertions(+), 6 deletions(-) diff --git a/ci/cudf_pandas_scripts/pandas-tests/job-summary.py b/ci/cudf_pandas_scripts/pandas-tests/job-summary.py index 93a815838b7..7a12db927e5 100644 --- a/ci/cudf_pandas_scripts/pandas-tests/job-summary.py +++ b/ci/cudf_pandas_scripts/pandas-tests/job-summary.py @@ -68,8 +68,18 @@ def emoji_failed(x): pr_df = pd.DataFrame.from_dict(pr_results, orient="index").sort_index() main_df = pd.DataFrame.from_dict(main_results, orient="index").sort_index() diff_df = pr_df - main_df +total_usage = pr_df['_slow_function_call'] + pr_df['_fast_function_call'] +pr_df['CPU Usage'] = ((pr_df['_slow_function_call']/total_usage)*100.0).round(1) +pr_df['GPU Usage'] = ((pr_df['_fast_function_call']/total_usage)*100.0).round(1) -pr_df = pr_df[["total", "passed", "failed", "skipped"]] +cpu_usage_mean = pr_df['CPU Usage'].mean().round(2) +gpu_usage_mean = pr_df['GPU Usage'].mean().round(2) + +# Add '%' suffix to 'CPU Usage' and 'GPU Usage' columns +pr_df['CPU Usage'] = pr_df['CPU Usage'].fillna(0).astype(str) + '%' +pr_df['GPU Usage'] = pr_df['GPU Usage'].fillna(0).astype(str) + '%' + +pr_df = pr_df[["total", "passed", "failed", "skipped", 'CPU Usage', 'GPU Usage']] diff_df = diff_df[["total", "passed", "failed", "skipped"]] diff_df.columns = diff_df.columns + "_diff" diff_df["passed_diff"] = diff_df["passed_diff"].map(emoji_passed) @@ -95,6 +105,8 @@ def emoji_failed(x): print(comment) print() +print(f"Average CPU and GPU usage for the tests: 
{cpu_usage_mean}% and {gpu_usage_mean}%") +print() print("Here are the results of running the Pandas tests against this PR:") print() print(df.to_markdown()) diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index afa1ce5f86c..bf2ee6ae624 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -881,6 +881,20 @@ def _assert_fast_slow_eq(left, right): assert_eq(left, right) +def _fast_function_call(): + """ + Placeholder fast function for pytest profiling purposes. + """ + return None + + +def _slow_function_call(): + """ + Placeholder slow function for pytest profiling purposes. + """ + return None + + def _fast_slow_function_call( func: Callable, /, @@ -910,6 +924,7 @@ def _fast_slow_function_call( # try slow path raise Exception() fast = True + _fast_function_call() if _env_get_bool("CUDF_PANDAS_DEBUGGING", False): try: with nvtx.annotate( @@ -952,6 +967,7 @@ def _fast_slow_function_call( from ._logger import log_fallback log_fallback(slow_args, slow_kwargs, err) + _slow_function_call() with disable_module_accelerator(): result = func(*slow_args, **slow_kwargs) return _maybe_wrap_result(result, func, *args, **kwargs), fast diff --git a/python/cudf/cudf/pandas/scripts/conftest-patch.py b/python/cudf/cudf/pandas/scripts/conftest-patch.py index 505a40b0bfa..d12d2697729 100644 --- a/python/cudf/cudf/pandas/scripts/conftest-patch.py +++ b/python/cudf/cudf/pandas/scripts/conftest-patch.py @@ -1,10 +1,13 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. # SPDX-License-Identifier: Apache-2.0 import contextlib +import json import os import sys +import traceback +from collections import defaultdict from functools import wraps import pytest @@ -36,4 +39,58 @@ def patch_testing_functions(): pytest.raises = replace_kwargs({"match": None})(pytest.raises) +# Dictionary to store function call counts +function_call_counts = {} # type: ignore + +# The specific functions to track +FUNCTION_NAME = {"_slow_function_call", "_fast_function_call"} + + +def find_pytest_file(frame): + stack = traceback.extract_stack() + absolute_paths = [frame.filename for frame in stack] + for file in absolute_paths: + if "pandas-testing/pandas-tests/tests" in file and file.rsplit("/", 1)[ + -1 + ].startswith("test_"): + return str(file).rsplit("pandas-tests/", 1)[-1] + return None + + +def trace_calls(frame, event, arg): + if event != "call": + return + code = frame.f_code + func_name = code.co_name + + if func_name in FUNCTION_NAME: + filename = find_pytest_file(frame) + if filename is None: + return + if filename not in function_call_counts: + function_call_counts[filename] = defaultdict(int) + function_call_counts[filename][func_name] += 1 + + +def pytest_sessionstart(session): + # Set the profile function to trace calls + sys.setprofile(trace_calls) + + +def pytest_sessionfinish(session, exitstatus): + # Remove the profile function + sys.setprofile(None) + + +@pytest.hookimpl(trylast=True) +def pytest_unconfigure(config): + if hasattr(config, "workerinput"): + # Running in xdist worker, write the counts before exiting + worker_id = config.workerinput["workerid"] + output_file = f"function_call_counts_worker_{worker_id}.json" + with open(output_file, "w") as f: + json.dump(function_call_counts, f, indent=4) + print(f"Function call counts have been written to {output_file}") + + 
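The counting machinery in the conftest-patch.py changes above rests on `sys.setprofile`, which invokes a callback for every call event in the interpreter. A self-contained sketch of that pattern, minus the per-file bookkeeping and xdist handling; the two marker functions below stand in for the placeholders added to `fast_slow_proxy.py`:

```python
import sys
from collections import defaultdict

counts = defaultdict(int)
TRACKED = {"_slow_function_call", "_fast_function_call"}

def profiler(frame, event, arg):
    # Fired for 'call', 'return', 'c_call', etc.; count Python calls only.
    if event == "call" and frame.f_code.co_name in TRACKED:
        counts[frame.f_code.co_name] += 1

def _slow_function_call():  # CPU-fallback marker
    pass

def _fast_function_call():  # GPU-path marker
    pass

sys.setprofile(profiler)
_fast_function_call(); _fast_function_call(); _slow_function_call()
sys.setprofile(None)
print(dict(counts))  # {'_fast_function_call': 2, '_slow_function_call': 1}
```

With these tallies in hand, per-file CPU usage is slow / (slow + fast), which is exactly the percentage computed in job-summary.py above.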
sys.path.append(os.path.dirname(__file__)) diff --git a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh index 9c65b74d081..9b9ce026571 100755 --- a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh +++ b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh @@ -64,8 +64,6 @@ markers = [ "skip_ubsan: Tests known to fail UBSAN check", ] EOF - # append the contents of patch-confest.py to conftest.py - cat ../python/cudf/cudf/pandas/scripts/conftest-patch.py >> pandas-tests/conftest.py # Substitute `pandas.tests` with a relative import. # This will depend on the location of the test module relative to @@ -137,7 +135,7 @@ and not test_eof_states \ and not test_array_tz" # TODO: Remove "not db" once a postgres & mysql container is set up on the CI -PANDAS_CI="1" timeout 60m python -m pytest -p cudf.pandas \ +PANDAS_CI="1" timeout 90m python -m pytest -p cudf.pandas \ -v -m "not single_cpu and not db" \ -k "$TEST_THAT_NEED_MOTO_SERVER and $TEST_THAT_CRASH_PYTEST_WORKERS and not test_groupby_raises_category_on_category and not test_constructor_no_pandas_array and not test_is_monotonic_na and not test_index_contains and not test_index_contains and not test_frame_op_subclass_nonclass_constructor and not test_round_trip_current" \ --import-mode=importlib \ @@ -146,5 +144,4 @@ PANDAS_CI="1" timeout 60m python -m pytest -p cudf.pandas \ mv *.json .. cd .. - rm -rf pandas-testing/pandas-tests/ diff --git a/python/cudf/cudf/pandas/scripts/summarize-test-results.py b/python/cudf/cudf/pandas/scripts/summarize-test-results.py index ffd2abb960d..4ea0b3b4413 100644 --- a/python/cudf/cudf/pandas/scripts/summarize-test-results.py +++ b/python/cudf/cudf/pandas/scripts/summarize-test-results.py @@ -12,7 +12,9 @@ """ import argparse +import glob import json +import os from rich.console import Console from rich.table import Table @@ -57,6 +59,44 @@ def get_per_module_results(log_file_name): per_module_results[module_name].setdefault(outcome, 0) per_module_results[module_name]["total"] += 1 per_module_results[module_name][outcome] += 1 + + directory = os.path.dirname(log_file_name) + pattern = os.path.join(directory, "function_call_counts_worker_*.json") + matching_files = glob.glob(pattern) + function_call_counts = {} + + for file in matching_files: + with open(file) as f: + function_call_count = json.load(f) + if not function_call_counts: + function_call_counts.update(function_call_count) + else: + for key, value in function_call_count.items(): + if key not in function_call_counts: + function_call_counts[key] = value + else: + if "_slow_function_call" not in function_call_counts[key]: + function_call_counts[key]["_slow_function_call"] = 0 + if "_fast_function_call" not in function_call_counts[key]: + function_call_counts[key]["_fast_function_call"] = 0 + function_call_counts[key]["_slow_function_call"] += ( + value.get("_slow_function_call", 0) + ) + function_call_counts[key]["_fast_function_call"] += ( + value.get("_fast_function_call", 0) + ) + + for key, value in per_module_results.items(): + if key in function_call_counts: + per_module_results[key]["_slow_function_call"] = ( + function_call_counts[key].get("_slow_function_call", 0) + ) + per_module_results[key]["_fast_function_call"] = ( + function_call_counts[key].get("_fast_function_call", 0) + ) + else: + per_module_results[key]["_slow_function_call"] = 0 + per_module_results[key]["_fast_function_call"] = 0 return per_module_results From 8782a1d63e82ee20964e36ef885af6b36f75732c Mon Sep 17 00:00:00 
2001 From: Yunsong Wang Date: Thu, 19 Sep 2024 10:20:55 -0700 Subject: [PATCH 24/32] Improve aggregation documentation (#16822) This PR fixes several documentation issues uncovered while working on #16619. There are no actual code changes. Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - David Wendt (https://github.com/davidwendt) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cudf/pull/16822 --- cpp/include/cudf/detail/aggregation/aggregation.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index b257eef1e9e..4255faea702 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -1497,8 +1497,7 @@ AGG_KIND_MAPPING(aggregation::VARIANCE, var_aggregation); * * @tparam F Type of callable * @param k The `aggregation::Kind` value to dispatch - * aram f The callable that accepts an `aggregation::Kind` non-type template - * argument. + * @param f The callable that accepts an `aggregation::Kind` callable function object. * @param args Parameter pack forwarded to the `operator()` invocation * @return Forwards the return value of the callable. */ @@ -1626,6 +1625,7 @@ struct dispatch_source { * parameter of the callable `F` * @param k The `aggregation::Kind` used to dispatch an `aggregation::Kind` * non-type template parameter for the second template parameter of the callable + * @param f The callable that accepts `data_type` and `aggregation::Kind` function object. * @param args Parameter pack forwarded to the `operator()` invocation * `F`. */ @@ -1644,8 +1644,8 @@ CUDF_HOST_DEVICE inline constexpr decltype(auto) dispatch_type_and_aggregation(d * @brief Returns the target `data_type` for the specified aggregation k * performed on elements of type source_type. 
* - * aram source_type The element type to be aggregated - * aram k The aggregation + * @param source_type The element type to be aggregated + * @param k The aggregation kind * @return data_type The target_type of k performed on source_type * elements */ From e9b5b538d515219ea36ec62f31ff78424e1fcf89 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 19 Sep 2024 07:36:55 -1000 Subject: [PATCH 25/32] Add string.repeats API to pylibcudf (#16834) Contributes to https://github.com/rapidsai/cudf/issues/15162 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16834 --- .../api_docs/pylibcudf/strings/index.rst | 1 + .../api_docs/pylibcudf/strings/repeat.rst | 6 +++ python/cudf/cudf/_lib/strings/repeat.pyx | 40 +++++---------- .../pylibcudf/libcudf/strings/repeat.pxd | 8 +-- .../pylibcudf/strings/CMakeLists.txt | 2 +- .../pylibcudf/pylibcudf/strings/__init__.py | 1 + python/pylibcudf/pylibcudf/strings/repeat.pxd | 10 ++++ python/pylibcudf/pylibcudf/strings/repeat.pyx | 51 +++++++++++++++++++ .../pylibcudf/tests/test_string_repeat.py | 20 ++++++++ 9 files changed, 106 insertions(+), 33 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/repeat.rst create mode 100644 python/pylibcudf/pylibcudf/strings/repeat.pxd create mode 100644 python/pylibcudf/pylibcudf/strings/repeat.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_repeat.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst index 462a756a092..1200ecba5d9 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst @@ -10,5 +10,6 @@ strings find regex_flags regex_program + repeat replace slice diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/repeat.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/repeat.rst new file mode 100644 index 00000000000..0041fe4c3da --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/repeat.rst @@ -0,0 +1,6 @@ +====== +repeat +====== + +.. automodule:: pylibcudf.strings.repeat + :members: diff --git a/python/cudf/cudf/_lib/strings/repeat.pyx b/python/cudf/cudf/_lib/strings/repeat.pyx index 42fcfa5d94e..43649d4defe 100644 --- a/python/cudf/cudf/_lib/strings/repeat.pyx +++ b/python/cudf/cudf/_lib/strings/repeat.pyx @@ -1,17 +1,12 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. - -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings cimport repeat as cpp_repeat from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column +import pylibcudf as plc + @acquire_spill_lock() def repeat_scalar(Column source_strings, @@ -21,16 +16,11 @@ def repeat_scalar(Column source_strings, each string in `source_strings` `repeats` number of times. 
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_repeat.repeat_strings( - source_view, - repeats - )) - - return Column.from_unique_ptr(move(c_result)) + plc_result = plc.strings.repeat.repeat_strings( + source_strings.to_pylibcudf(mode="read"), + repeats + ) + return Column.from_pylibcudf(plc_result) @acquire_spill_lock() @@ -41,14 +31,8 @@ def repeat_sequence(Column source_strings, each string in `source_strings` `repeats` number of times. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef column_view repeats_view = repeats.view() - - with nogil: - c_result = move(cpp_repeat.repeat_strings( - source_view, - repeats_view - )) - - return Column.from_unique_ptr(move(c_result)) + plc_result = plc.strings.repeat.repeat_strings( + source_strings.to_pylibcudf(mode="read"), + repeats.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_result) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/repeat.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/repeat.pxd index 410ff58f299..59262820411 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/repeat.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/repeat.pxd @@ -10,9 +10,9 @@ cdef extern from "cudf/strings/repeat_strings.hpp" namespace "cudf::strings" \ nogil: cdef unique_ptr[column] repeat_strings( - column_view strings, - size_type repeat) except + + column_view input, + size_type repeat_times) except + cdef unique_ptr[column] repeat_strings( - column_view strings, - column_view repeats) except + + column_view input, + column_view repeat_times) except + diff --git a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt index b499a127541..457e462e3cf 100644 --- a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt @@ -13,7 +13,7 @@ # ============================================================================= set(cython_sources capitalize.pyx case.pyx char_types.pyx contains.pyx find.pyx regex_flags.pyx - regex_program.pyx replace.pyx slice.pyx + regex_program.pyx repeat.pyx replace.pyx slice.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py index ef102aff2af..250cefedf55 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/__init__.py @@ -8,6 +8,7 @@ find, regex_flags, regex_program, + repeat, replace, slice, ) diff --git a/python/pylibcudf/pylibcudf/strings/repeat.pxd b/python/pylibcudf/pylibcudf/strings/repeat.pxd new file mode 100644 index 00000000000..bc70926b6fa --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/repeat.pxd @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type + +ctypedef fused ColumnorSizeType: + Column + size_type + +cpdef Column repeat_strings(Column input, ColumnorSizeType repeat_times) diff --git a/python/pylibcudf/pylibcudf/strings/repeat.pyx b/python/pylibcudf/pylibcudf/strings/repeat.pyx new file mode 100644 index 00000000000..5f627218f6e --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/repeat.pyx @@ -0,0 +1,51 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings cimport repeat as cpp_repeat +from pylibcudf.libcudf.types cimport size_type + + +cpdef Column repeat_strings(Column input, ColumnorSizeType repeat_times): + """ + Repeat each string in the given strings column by the numbers + of times given in another numeric column. + + For details, see :cpp:func:`cudf::strings::repeat`. + + Parameters + ---------- + input : Column + The column containing strings to repeat. + repeat_times : Column or int + Number(s) of times that the corresponding input strings + for each row are repeated. + + Returns + ------- + Column + New column containing the repeated strings. + """ + cdef unique_ptr[column] c_result + + if ColumnorSizeType is Column: + with nogil: + c_result = move( + cpp_repeat.repeat_strings( + input.view(), + repeat_times.view() + ) + ) + elif ColumnorSizeType is size_type: + with nogil: + c_result = move( + cpp_repeat.repeat_strings( + input.view(), + repeat_times + ) + ) + else: + raise ValueError("repeat_times must be size_type or integer") + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_repeat.py b/python/pylibcudf/pylibcudf/tests/test_string_repeat.py new file mode 100644 index 00000000000..18b5d8bf4d0 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_repeat.py @@ -0,0 +1,20 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pyarrow.compute as pc +import pylibcudf as plc +import pytest + + +@pytest.mark.parametrize("repeats", [pa.array([2, 2]), 2]) +def test_repeat_strings(repeats): + arr = pa.array(["1", None]) + plc_result = plc.strings.repeat.repeat_strings( + plc.interop.from_arrow(arr), + plc.interop.from_arrow(repeats) + if not isinstance(repeats, int) + else repeats, + ) + result = plc.interop.to_arrow(plc_result) + expected = pa.chunked_array(pc.binary_repeat(arr, repeats)) + assert result.equals(expected) From 51c2dd6f05f9c9d07f6e07b0119906e1ea32fc2d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 19 Sep 2024 07:38:48 -1000 Subject: [PATCH 26/32] Add string.contains APIs to pylibcudf (#16814) Contributes to https://github.com/rapidsai/cudf/issues/15162 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16814 --- python/cudf/cudf/_lib/strings/contains.pyx | 80 ++--------- .../pylibcudf/libcudf/strings/contains.pxd | 7 +- .../pylibcudf/pylibcudf/strings/contains.pxd | 14 ++ .../pylibcudf/pylibcudf/strings/contains.pyx | 130 +++++++++++++++++- .../pylibcudf/tests/test_string_contains.py | 37 +++++ 5 files changed, 199 insertions(+), 69 deletions(-) diff --git a/python/cudf/cudf/_lib/strings/contains.pyx b/python/cudf/cudf/_lib/strings/contains.pyx index 82f5e06c547..03b4887f200 100644 --- a/python/cudf/cudf/_lib/strings/contains.pyx +++ b/python/cudf/cudf/_lib/strings/contains.pyx @@ -1,27 +1,10 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
-from cython.operator cimport dereference
 from libc.stdint cimport uint32_t

 from cudf.core.buffer import acquire_spill_lock

-from libcpp.memory cimport unique_ptr
-from libcpp.string cimport string
-from libcpp.utility cimport move
-
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from pylibcudf.libcudf.strings.contains cimport (
-    count_re as cpp_count_re,
-    like as cpp_like,
-    matches_re as cpp_matches_re,
-)
-from pylibcudf.libcudf.strings.regex_flags cimport regex_flags
-from pylibcudf.libcudf.strings.regex_program cimport regex_program
-
 from cudf._lib.column cimport Column
-from cudf._lib.scalar cimport DeviceScalar

 from pylibcudf.strings import contains
 from pylibcudf.strings.regex_program import RegexProgram
@@ -45,21 +28,10 @@ def count_re(Column source_strings, object reg_ex, uint32_t flags):
     Returns a Column with count of occurrences of `reg_ex` in
     each string of `source_strings`
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    cdef string reg_ex_string = str(reg_ex).encode()
-    cdef regex_flags c_flags = flags
-    cdef unique_ptr[regex_program] c_prog
-
-    with nogil:
-        c_prog = move(regex_program.create(reg_ex_string, c_flags))
-        c_result = move(cpp_count_re(
-            source_view,
-            dereference(c_prog)
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    prog = RegexProgram.create(str(reg_ex), flags)
+    return Column.from_pylibcudf(
+        contains.count_re(source_strings.to_pylibcudf(mode="read"), prog)
+    )


 @acquire_spill_lock()
@@ -68,21 +40,10 @@ def match_re(Column source_strings, object reg_ex, uint32_t flags):
     Returns a Column with each value True if the string matches `reg_ex`
     regular expression with each record of `source_strings`
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    cdef string reg_ex_string = str(reg_ex).encode()
-    cdef regex_flags c_flags = flags
-    cdef unique_ptr[regex_program] c_prog
-
-    with nogil:
-        c_prog = move(regex_program.create(reg_ex_string, c_flags))
-        c_result = move(cpp_matches_re(
-            source_view,
-            dereference(c_prog)
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    prog = RegexProgram.create(str(reg_ex), flags)
+    return Column.from_pylibcudf(
+        contains.matches_re(source_strings.to_pylibcudf(mode="read"), prog)
+    )


 @acquire_spill_lock()
@@ -91,24 +52,9 @@ def like(Column source_strings, object py_pattern, object py_escape):
     Returns a Column with each value True if the string matches the
     `py_pattern` like expression with each record of `source_strings`
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    cdef DeviceScalar pattern = py_pattern.device_value
-    cdef DeviceScalar escape = py_escape.device_value
-
-    cdef const string_scalar* scalar_ptn = <const string_scalar*>(
-        pattern.get_raw_ptr()
-    )
-    cdef const string_scalar* scalar_esc = <const string_scalar*>(
-        escape.get_raw_ptr()
+    plc_column = contains.like(
+        source_strings.to_pylibcudf(mode="read"),
+        py_pattern.device_value.c_value,
+        py_escape.device_value.c_value,
     )
-
-    with nogil:
-        c_result = move(cpp_like(
-            source_view,
-            scalar_ptn[0],
-            scalar_esc[0]
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    return Column.from_pylibcudf(plc_column)
diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd
index c2fb5f0dce4..eac0f748257 100644
--- a/python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd
@@ -24,4 +24,9 @@ cdef extern from "cudf/strings/contains.hpp" namespace "cudf::strings" nogil:
     cdef unique_ptr[column] like(
         column_view source_strings,
         string_scalar pattern,
-        string_scalar escape) except +
+        string_scalar escape_character) except +
+
+    cdef unique_ptr[column] like(
+        column_view source_strings,
+        column_view patterns,
+        string_scalar escape_character) except +
diff --git a/python/pylibcudf/pylibcudf/strings/contains.pxd b/python/pylibcudf/pylibcudf/strings/contains.pxd
index 2cd4891a0ea..6146a1119d6 100644
--- a/python/pylibcudf/pylibcudf/strings/contains.pxd
+++ b/python/pylibcudf/pylibcudf/strings/contains.pxd
@@ -1,7 +1,21 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.

 from pylibcudf.column cimport Column
+from pylibcudf.scalar cimport Scalar
 from pylibcudf.strings.regex_program cimport RegexProgram

+ctypedef fused ColumnOrScalar:
+    Column
+    Scalar

 cpdef Column contains_re(Column input, RegexProgram prog)
+
+cpdef Column count_re(Column input, RegexProgram prog)
+
+cpdef Column matches_re(Column input, RegexProgram prog)
+
+cpdef Column like(
+    Column input,
+    ColumnOrScalar pattern,
+    Scalar escape_character = *
+)
diff --git a/python/pylibcudf/pylibcudf/strings/contains.pyx b/python/pylibcudf/pylibcudf/strings/contains.pyx
index 1a2446f6e2c..82bd1fbea32 100644
--- a/python/pylibcudf/pylibcudf/strings/contains.pyx
+++ b/python/pylibcudf/pylibcudf/strings/contains.pyx
@@ -1,8 +1,14 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.

 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move

+from cython.operator import dereference
+
 from pylibcudf.column cimport Column
 from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.scalar.scalar_factories cimport (
+    make_string_scalar as cpp_make_string_scalar,
+)
 from pylibcudf.libcudf.strings cimport contains as cpp_contains
 from pylibcudf.strings.regex_program cimport RegexProgram

@@ -32,9 +38,131 @@ cpdef Column contains_re(
     cdef unique_ptr[column] result

     with nogil:
-        result = cpp_contains.contains_re(
+        result = move(cpp_contains.contains_re(
+            input.view(),
+            prog.c_obj.get()[0]
+        ))
+
+    return Column.from_libcudf(move(result))
+
+
+cpdef Column count_re(
+    Column input,
+    RegexProgram prog
+):
+    """Returns the number of times the given regex_program's pattern
+    matches in each string.
+
+    For details, see :cpp:func:`cudf::strings::count_re`.
+
+    Parameters
+    ----------
+    input : Column
+        The input strings
+    prog : RegexProgram
+        Regex program instance
+
+    Returns
+    -------
+    pylibcudf.Column
+        New column of match counts for each string
+    """
+
+    cdef unique_ptr[column] result
+
+    with nogil:
+        result = move(cpp_contains.count_re(
             input.view(),
             prog.c_obj.get()[0]
+        ))
+
+    return Column.from_libcudf(move(result))
+
+
+cpdef Column matches_re(
+    Column input,
+    RegexProgram prog
+):
+    """Returns a boolean column identifying rows which
+    match the given regex_program object, but only at
+    the beginning of the string.
+
+    For details, see :cpp:func:`cudf::strings::matches_re`.
+
+    Parameters
+    ----------
+    input : Column
+        The input strings
+    prog : RegexProgram
+        Regex program instance
+
+    Returns
+    -------
+    pylibcudf.Column
+        New column of boolean results for each string
+    """
+
+    cdef unique_ptr[column] result
+
+    with nogil:
+        result = move(cpp_contains.matches_re(
+            input.view(),
+            prog.c_obj.get()[0]
+        ))
+
+    return Column.from_libcudf(move(result))
+
+
+cpdef Column like(Column input, ColumnOrScalar pattern, Scalar escape_character=None):
+    """
+    Returns a boolean column identifying rows which
+    match the given like pattern.
+
+    For details, see :cpp:func:`cudf::strings::like`.
+
+    Parameters
+    ----------
+    input : Column
+        The input strings
+    pattern : Column or Scalar
+        Like patterns to match within each string
+    escape_character : Scalar
+        Optional character that specifies the escape prefix.
+        Default is no escape character.
+
+    Returns
+    -------
+    pylibcudf.Column
+        New column of boolean results for each string
+    """
+    cdef unique_ptr[column] result
+
+    if escape_character is None:
+        escape_character = Scalar.from_libcudf(
+            cpp_make_string_scalar("".encode())
         )
+    cdef const string_scalar* c_escape_character = <const string_scalar*>(
+        escape_character.c_obj.get()
+    )
+    cdef const string_scalar* c_pattern
+
+    if ColumnOrScalar is Column:
+        with nogil:
+            result = move(cpp_contains.like(
+                input.view(),
+                pattern.view(),
+                dereference(c_escape_character)
+            ))
+    elif ColumnOrScalar is Scalar:
+        c_pattern = <const string_scalar*>(pattern.c_obj.get())
+        with nogil:
+            result = move(cpp_contains.like(
+                input.view(),
+                dereference(c_pattern),
+                dereference(c_escape_character)
+            ))
+    else:
+        raise ValueError("pattern must be a Column or a Scalar")

     return Column.from_libcudf(move(result))
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_contains.py b/python/pylibcudf/pylibcudf/tests/test_string_contains.py
index 4f88e09183f..4e4dd7cbb00 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_contains.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_contains.py
@@ -48,3 +48,40 @@ def test_contains_re(target_col, pa_target_scalar, plc_target_pat):
         pa_target_col, pa_target_scalar.as_py()
     )
     assert_column_eq(got, expected)
+
+
+def test_count_re():
+    pattern = "[1-9][a-z]"
+    arr = pa.array(["A1a2A3a4", "A1A2A3", None])
+    result = plc.strings.contains.count_re(
+        plc.interop.from_arrow(arr),
+        plc.strings.regex_program.RegexProgram.create(
+            pattern, plc.strings.regex_flags.RegexFlags.DEFAULT
+        ),
+    )
+    expected = pc.count_substring_regex(arr, pattern)
+    assert_column_eq(result, expected)
+
+
+def test_match_re():
+    pattern = "[1-9][a-z]"
+    arr = pa.array(["1a2b", "b1a2", None])
+    result = plc.strings.contains.matches_re(
+        plc.interop.from_arrow(arr),
+        plc.strings.regex_program.RegexProgram.create(
+            pattern, plc.strings.regex_flags.RegexFlags.DEFAULT
+        ),
+    )
+    expected = pc.match_substring_regex(arr, f"^{pattern}")
+    assert_column_eq(result, expected)
+
+
+def test_like():
+    pattern = "%a"
+    arr = pa.array(["1a2aa3aaa"])
+    result = plc.strings.contains.like(
+        plc.interop.from_arrow(arr),
+        plc.interop.from_arrow(pa.array([pattern])),
+    )
+    expected = pc.match_like(arr, pattern)
+    assert_column_eq(result, expected)
From 7233da9c38e374ad6be6ebcc13ea8bd209c8a496 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 19 Sep 2024 07:59:03 -1000
Subject: [PATCH 27/32] Remove `MultiIndex._poplevel` inplace implementation.
 (#16767)

`MultiIndex._poplevel`, which backs `MultiIndex.droplevel`, operates by
dropping a given level inplace.
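As a minimal sketch of the user-facing `droplevel` behavior this refactor preserves (hypothetical data, shown purely for illustration — not part of the change):

```
import cudf

mi = cudf.MultiIndex.from_tuples(
    [(1, "red"), (2, "blue")], names=["number", "color"]
)
flat = mi.droplevel("number")  # returns a new single-level Index
assert flat.name == "color"
assert mi.nlevels == 2  # the original MultiIndex is not mutated
```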
There are 2 places where `._poplevel` is called, and both usages make a
shallow copy of the data first, presumably to work around side effects
of this inplace behavior.

This PR removes the `MultiIndex._poplevel` implementation and instead
implements the level-dropping behavior by returning a new object.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16767
---
 python/cudf/cudf/core/multiindex.py | 111 ++++++++++++----------------
 python/cudf/cudf/core/reshape.py    |  26 +++++--
 2 files changed, 65 insertions(+), 72 deletions(-)

diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index e00890ac5c3..b86ad38c944 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -36,7 +36,7 @@ from cudf.utils.utils import NotIterable, _external_only_api, _is_same_name

 if TYPE_CHECKING:
-    from collections.abc import Generator
+    from collections.abc import Generator, Hashable

     from typing_extensions import Self

@@ -1041,9 +1041,11 @@ def to_frame(
         )

     @_performance_tracking
-    def get_level_values(self, level) -> cudf.Index:
+    def _level_to_ca_label(self, level) -> tuple[Hashable, int]:
         """
-        Return the values at the requested level
+        Convert a level to a ColumnAccessor label and an integer position.
+
+        Useful if self._column_names != self.names.

         Parameters
         ----------
@@ -1051,10 +1053,13 @@ def get_level_values(self, level) -> cudf.Index:

         Returns
         -------
-        An Index containing the values at the requested level.
+        tuple[Hashable, int]
+            (ColumnAccessor label corresponding to level, integer position of the level)
         """
-        colnames = self._data.names
-        if level not in colnames:
+        colnames = self._column_names
+        try:
+            level_idx = colnames.index(level)
+        except ValueError:
             if isinstance(level, int):
                 if level < 0:
                     level = level + len(colnames)
@@ -1067,8 +1072,22 @@ def get_level_values(self, level) -> cudf.Index:
                 level = colnames[level_idx]
             else:
                 raise KeyError(f"Level not found: '{level}'")
-        else:
-            level_idx = colnames.index(level)
+        return level, level_idx
+
+    @_performance_tracking
+    def get_level_values(self, level) -> cudf.Index:
+        """
+        Return the values at the requested level
+
+        Parameters
+        ----------
+        level : int or label
+
+        Returns
+        -------
+        An Index containing the values at the requested level.
+        """
+        level, level_idx = self._level_to_ca_label(level)
         level_values = cudf.Index._from_column(
             self._data[level], name=self.names[level_idx]
         )
@@ -1420,57 +1439,6 @@ def from_arrays(
             codes=codes, levels=levels, sortorder=sortorder, names=names
         )

-    @_performance_tracking
-    def _poplevels(self, level) -> None | MultiIndex | cudf.Index:
-        """
-        Remove and return the specified levels from self.
-
-        Parameters
-        ----------
-        level : level name or index, list
-            One or more levels to remove
-
-        Returns
-        -------
-        Index composed of the removed levels. If only a single level
-        is removed, a flat index is returned. If no levels are specified
-        (empty list), None is returned.
- """ - if not pd.api.types.is_list_like(level): - level = (level,) - - ilevels = sorted(self._level_index_from_level(lev) for lev in level) - - if not ilevels: - return None - - popped_data = {} - popped_names = [] - names = list(self.names) - - # build the popped data and names - for i in ilevels: - n = self._data.names[i] - popped_data[n] = self._data[n] - popped_names.append(self.names[i]) - - # pop the levels out from self - # this must be done iterating backwards - for i in reversed(ilevels): - n = self._data.names[i] - names.pop(i) - popped_data[n] = self._data.pop(n) - - # construct the popped result - popped = cudf.core.index._index_from_data(popped_data) - popped.names = popped_names - - # update self - self.names = names - self._levels, self._codes = _compute_levels_and_codes(self._data) - - return popped - @_performance_tracking def swaplevel(self, i=-2, j=-1) -> Self: """ @@ -1523,7 +1491,7 @@ def swaplevel(self, i=-2, j=-1) -> Self: return midx @_performance_tracking - def droplevel(self, level=-1) -> MultiIndex | cudf.Index: + def droplevel(self, level=-1) -> Self | cudf.Index: """ Removes the specified levels from the MultiIndex. @@ -1578,11 +1546,24 @@ def droplevel(self, level=-1) -> MultiIndex | cudf.Index: >>> idx.droplevel(["first", "second"]) Index([0, 1, 2, 0, 1, 2], dtype='int64', name='third') """ - mi = self.copy(deep=False) - mi._poplevels(level) - if mi.nlevels == 1: - return mi.get_level_values(mi.names[0]) + if is_scalar(level): + level = (level,) + elif len(level) == 0: + return self + + new_names = list(self.names) + new_data = self._data.copy(deep=False) + for i in sorted( + (self._level_index_from_level(lev) for lev in level), reverse=True + ): + new_names.pop(i) + new_data.pop(self._data.names[i]) + + if len(new_data) == 1: + return cudf.core.index._index_from_data(new_data) else: + mi = MultiIndex._from_data(new_data) + mi.names = new_names return mi @_performance_tracking @@ -1886,7 +1867,7 @@ def __array_function__(self, func, types, args, kwargs): else: return NotImplemented - def _level_index_from_level(self, level): + def _level_index_from_level(self, level) -> int: """ Return level index from given level name or index """ diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index c026579b8b5..c951db00c9a 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -12,6 +12,7 @@ from cudf._lib.transform import one_hot_encode from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default +from cudf.api.types import is_scalar from cudf.core._compat import PANDAS_LT_300 from cudf.core.column import ColumnBase, as_column, column_empty_like from cudf.core.column_accessor import ColumnAccessor @@ -1227,13 +1228,24 @@ def unstack(df, level, fill_value=None, sort: bool = True): ) return res else: - df = df.copy(deep=False) - columns = df.index._poplevels(level) - index = df.index - result = _pivot(df, index, columns) - if result.index.nlevels == 1: - result.index = result.index.get_level_values(result.index.names[0]) - return result + index = df.index.droplevel(level) + if is_scalar(level): + columns = df.index.get_level_values(level) + else: + new_names = [] + ca_data = {} + for lev in level: + ca_level, level_idx = df.index._level_to_ca_label(lev) + new_names.append(df.index.names[level_idx]) + ca_data[ca_level] = df.index._data[ca_level] + columns = type(df.index)._from_data( + ColumnAccessor(ca_data, verify=False) + ) + columns.names = new_names + result = _pivot(df, index, 
columns) + if result.index.nlevels == 1: + result.index = result.index.get_level_values(result.index.names[0]) + return result def _get_unique(column: ColumnBase, dummy_na: bool) -> ColumnBase: From 272a70307017c95805d9a7ae77e66b836afccc7b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 19 Sep 2024 11:05:00 -1000 Subject: [PATCH 28/32] Add string.extract APIs to pylibcudf (#16823) Contributes to https://github.com/rapidsai/cudf/issues/15162 Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16823 --- .../api_docs/pylibcudf/strings/extract.rst | 6 ++ .../api_docs/pylibcudf/strings/index.rst | 1 + python/cudf/cudf/_lib/strings/extract.pyx | 34 ++------- python/cudf/cudf/core/column/string.py | 6 +- .../pylibcudf/libcudf/strings/extract.pxd | 8 +- .../pylibcudf/strings/CMakeLists.txt | 4 +- .../pylibcudf/pylibcudf/strings/__init__.pxd | 1 + .../pylibcudf/pylibcudf/strings/__init__.py | 1 + .../pylibcudf/pylibcudf/strings/extract.pxd | 10 +++ .../pylibcudf/pylibcudf/strings/extract.pyx | 76 +++++++++++++++++++ .../pylibcudf/tests/test_string_extract.py | 38 ++++++++++ 11 files changed, 149 insertions(+), 36 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/extract.rst create mode 100644 python/pylibcudf/pylibcudf/strings/extract.pxd create mode 100644 python/pylibcudf/pylibcudf/strings/extract.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_extract.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/extract.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/extract.rst new file mode 100644 index 00000000000..06f74a38709 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/extract.rst @@ -0,0 +1,6 @@ +======= +extract +======= + +.. automodule:: pylibcudf.strings.extract + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst index 1200ecba5d9..2518afc80a7 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst @@ -7,6 +7,7 @@ strings capitalize char_types contains + extract find regex_flags regex_program diff --git a/python/cudf/cudf/_lib/strings/extract.pyx b/python/cudf/cudf/_lib/strings/extract.pyx index 63f4d57e562..5bf336f4f3c 100644 --- a/python/cudf/cudf/_lib/strings/extract.pyx +++ b/python/cudf/cudf/_lib/strings/extract.pyx @@ -1,21 +1,12 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
-from cython.operator cimport dereference from libc.stdint cimport uint32_t -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.extract cimport extract as cpp_extract -from pylibcudf.libcudf.strings.regex_flags cimport regex_flags -from pylibcudf.libcudf.strings.regex_program cimport regex_program -from pylibcudf.libcudf.table.table cimport table - from cudf._lib.column cimport Column -from cudf._lib.utils cimport data_from_unique_ptr + +import pylibcudf as plc @acquire_spill_lock() @@ -26,21 +17,8 @@ def extract(Column source_strings, object pattern, uint32_t flags): The returning data contains one row for each subject string, and one column for each group. """ - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - - cdef string pattern_string = str(pattern).encode() - cdef regex_flags c_flags = flags - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_extract( - source_view, - dereference(c_prog) - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + prog = plc.strings.regex_program.RegexProgram.create(str(pattern), flags) + plc_result = plc.strings.extract.extract( + source_strings.to_pylibcudf(mode="read"), prog ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_result.columns())) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index e059917b0b8..4463e3280df 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -623,11 +623,9 @@ def extract( "unsupported value for `flags` parameter" ) - data, _ = libstrings.extract(self._column, pat, flags) + data = libstrings.extract(self._column, pat, flags) if len(data) == 1 and expand is False: - data = next(iter(data.values())) - else: - data = data + _, data = data.popitem() return self._return_or_inplace(data, expand=expand) def contains( diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/extract.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/extract.pxd index 12cd628fc1f..b7166167cfd 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/extract.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/extract.pxd @@ -10,5 +10,9 @@ from pylibcudf.libcudf.table.table cimport table cdef extern from "cudf/strings/extract.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[table] extract( - column_view source_strings, - regex_program) except + + column_view input, + regex_program prog) except + + + cdef unique_ptr[column] extract_all_record( + column_view input, + regex_program prog) except + diff --git a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt index 457e462e3cf..d3065cf8667 100644 --- a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt @@ -12,8 +12,8 @@ # the License. 
# =============================================================================

-set(cython_sources capitalize.pyx case.pyx char_types.pyx contains.pyx find.pyx regex_flags.pyx
-    regex_program.pyx repeat.pyx replace.pyx slice.pyx
+set(cython_sources capitalize.pyx case.pyx char_types.pyx contains.pyx extract.pyx find.pyx
+    regex_flags.pyx regex_program.pyx repeat.pyx replace.pyx slice.pyx
 )

 set(linked_libraries cudf::cudf)
diff --git a/python/pylibcudf/pylibcudf/strings/__init__.pxd b/python/pylibcudf/pylibcudf/strings/__init__.pxd
index d1f632d6d8e..6848c8e6e86 100644
--- a/python/pylibcudf/pylibcudf/strings/__init__.pxd
+++ b/python/pylibcudf/pylibcudf/strings/__init__.pxd
@@ -5,6 +5,7 @@ from . cimport (
     case,
     char_types,
     contains,
+    extract,
     find,
     regex_flags,
     regex_program,
diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py
index 250cefedf55..bba86e818cc 100644
--- a/python/pylibcudf/pylibcudf/strings/__init__.py
+++ b/python/pylibcudf/pylibcudf/strings/__init__.py
@@ -5,6 +5,7 @@
     case,
     char_types,
     contains,
+    extract,
    find,
     regex_flags,
     regex_program,
diff --git a/python/pylibcudf/pylibcudf/strings/extract.pxd b/python/pylibcudf/pylibcudf/strings/extract.pxd
new file mode 100644
index 00000000000..3871f5a0e4e
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/extract.pxd
@@ -0,0 +1,10 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column cimport Column
+from pylibcudf.strings.regex_program cimport RegexProgram
+from pylibcudf.table cimport Table
+
+
+cpdef Table extract(Column input, RegexProgram prog)
+
+cpdef Column extract_all_record(Column input, RegexProgram prog)
diff --git a/python/pylibcudf/pylibcudf/strings/extract.pyx b/python/pylibcudf/pylibcudf/strings/extract.pyx
new file mode 100644
index 00000000000..dcb11ca10ce
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/extract.pyx
@@ -0,0 +1,76 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.strings cimport extract as cpp_extract
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.strings.regex_program cimport RegexProgram
+from pylibcudf.table cimport Table
+
+
+cpdef Table extract(Column input, RegexProgram prog):
+    """
+    Returns a table of strings columns where each column
+    corresponds to the matching group specified in the given
+    regex_program object.
+
+    For details, see :cpp:func:`cudf::strings::extract`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings instance for this operation
+    prog : RegexProgram
+        Regex program instance
+
+    Returns
+    -------
+    Table
+        Columns of strings extracted from the input column.
+    """
+    cdef unique_ptr[table] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_extract.extract(
+                input.view(),
+                prog.c_obj.get()[0]
+            )
+        )
+
+    return Table.from_libcudf(move(c_result))
+
+
+cpdef Column extract_all_record(Column input, RegexProgram prog):
+    """
+    Returns a lists column of strings where each string column
+    row corresponds to the matching group specified in the given
+    regex_program object.
+
+    For details, see :cpp:func:`cudf::strings::extract_all_record`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings instance for this operation
+    prog : RegexProgram
+        Regex program instance
+
+    Returns
+    -------
+    Column
+        Lists column containing strings extracted from the input column
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_extract.extract_all_record(
+                input.view(),
+                prog.c_obj.get()[0]
+            )
+        )
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_extract.py b/python/pylibcudf/pylibcudf/tests/test_string_extract.py
new file mode 100644
index 00000000000..788b86423c4
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_string_extract.py
@@ -0,0 +1,38 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pyarrow.compute as pc
+import pylibcudf as plc
+
+
+def test_extract():
+    pattern = "([ab])(\\d)"
+    pa_pattern = "(?P<letter>[ab])(?P<digit>\\d)"
+    arr = pa.array(["a1", "b2", "c3"])
+    plc_result = plc.strings.extract.extract(
+        plc.interop.from_arrow(arr),
+        plc.strings.regex_program.RegexProgram.create(
+            pattern, plc.strings.regex_flags.RegexFlags.DEFAULT
+        ),
+    )
+    result = plc.interop.to_arrow(plc_result)
+    expected = pc.extract_regex(arr, pa_pattern)
+    for i, result_col in enumerate(result.itercolumns()):
+        expected_col = pa.chunked_array(expected.field(i))
+        assert result_col.fill_null("").equals(expected_col)
+
+
+def test_extract_all_record():
+    pattern = "([ab])(\\d)"
+    arr = pa.array(["a1", "b2", "c3"])
+    plc_result = plc.strings.extract.extract_all_record(
+        plc.interop.from_arrow(arr),
+        plc.strings.regex_program.RegexProgram.create(
+            pattern, plc.strings.regex_flags.RegexFlags.DEFAULT
+        ),
+    )
+    result = plc.interop.to_arrow(plc_result)
+    expected = pa.chunked_array(
+        [pa.array([["a", "1"], ["b", "2"], None], type=result.type)]
+    )
+    assert result.equals(expected)
From 8e1345faef8db194828feacd8f6446b358fc07ae Mon Sep 17 00:00:00 2001
From: Tianyu Liu
Date: Thu, 19 Sep 2024 18:08:42 -0400
Subject: [PATCH 29/32] Intentionally leak thread_local CUDA resources to
 avoid crash (part 1) (#16787)

The NVbench application `PARQUET_READER_NVBENCH` in libcudf currently
crashes with a segmentation fault. To reproduce:

```
./PARQUET_READER_NVBENCH -d 0 -b 1 --run-once -a io_type=FILEPATH -a compression_type=SNAPPY -a cardinality=0 -a run_length=1
```

The root cause is that some (1) `thread_local` objects on the main
thread in `libcudf` and (2) `static` objects in `kvikio` are destroyed
after `cudaDeviceReset()` in NVbench and upon program termination.
These objects should simply be leaked, since their destructors making
CUDA calls upon program termination constitutes UB in CUDA.

This simple PR is the cuDF side of the fix. The other part is done here
https://github.com/rapidsai/kvikio/pull/462.
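To make the workaround concrete, here is a minimal sketch of the intentional-leak pattern (illustrative only, with hypothetical names — not the exact cuDF code): the per-thread resource lives behind a raw pointer that is created once and never deleted, so no destructor issues CUDA calls during teardown.

```
#include <cuda_runtime.h>

cudaEvent_t event_for_current_thread()
{
  // Created once per thread and leaked on purpose: the OS reclaims it at
  // process exit, and no CUDA call runs from a destructor at teardown.
  thread_local cudaEvent_t* event = [] {
    auto* e = new cudaEvent_t;
    cudaEventCreateWithFlags(e, cudaEventDisableTiming);
    return e;
  }();
  return *event;
}
```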
closes #13229

Authors:
  - Tianyu Liu (https://github.com/kingcrimsontianyu)
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/16787
---
 cpp/src/utilities/stream_pool.cpp | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/cpp/src/utilities/stream_pool.cpp b/cpp/src/utilities/stream_pool.cpp
index 9d3a7ce5a4e..9824c472b20 100644
--- a/cpp/src/utilities/stream_pool.cpp
+++ b/cpp/src/utilities/stream_pool.cpp
@@ -132,6 +132,13 @@ struct cuda_event {
   cuda_event() { CUDF_CUDA_TRY(cudaEventCreateWithFlags(&e_, cudaEventDisableTiming)); }
   virtual ~cuda_event() { CUDF_ASSERT_CUDA_SUCCESS(cudaEventDestroy(e_)); }

+  // Moveable but not copyable.
+  cuda_event(const cuda_event&)            = delete;
+  cuda_event& operator=(const cuda_event&) = delete;
+
+  cuda_event(cuda_event&&)            = default;
+  cuda_event& operator=(cuda_event&&) = default;
+
   operator cudaEvent_t() { return e_; }

 private:
@@ -147,11 +154,12 @@ struct cuda_event {
  */
 cudaEvent_t event_for_thread()
 {
-  thread_local std::vector<std::unique_ptr<cuda_event>> thread_events(get_num_cuda_devices());
+  // The program may crash if this function is called from the main thread and user application
+  // subsequently calls cudaDeviceReset().
+  // As a workaround, here we intentionally disable RAII and leak cudaEvent_t.
+  thread_local std::vector<cuda_event*> thread_events(get_num_cuda_devices());
   auto const device_id = get_current_cuda_device();
-  if (not thread_events[device_id.value()]) {
-    thread_events[device_id.value()] = std::make_unique<cuda_event>();
-  }
+  if (not thread_events[device_id.value()]) { thread_events[device_id.value()] = new cuda_event(); }
   return *thread_events[device_id.value()];
 }

From d63ca6a90059a7c956de1eee0b60feba9059375e Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 19 Sep 2024 13:52:16 -1000
Subject: [PATCH 30/32] Access Frame attributes instead of ColumnAccessor
 attributes when available (#16652)

There are some places where a public object like `DataFrame` or `Index`
accesses a `ColumnAccessor` attribute when it's accessible in a shared
subclass attribute instead (like `Frame`).
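As a sketch of the substitution pattern (these are internal attributes, shown purely for illustration; assumes a post-PR cudf build):

```
import cudf

df = cudf.DataFrame({"a": [1, 2], "b": [3.0, 4.0]})

# The Frame-level property mirrors the ColumnAccessor attribute it wraps:
assert df._column_names == df._data.names

# Iterating labels and columns without reaching through ._data directly:
for name, col in df._column_labels_and_values:
    print(name, col.dtype)
```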
In an effort to access the `ColumnAccessor` less, this PR replaces
usages of `._data.attribute` with a `Frame`-specific attribute.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16652
---
 python/cudf/cudf/_lib/concat.pyx           |   4 +-
 python/cudf/cudf/_lib/copying.pyx          |   2 +-
 python/cudf/cudf/_lib/csv.pyx              |   2 +-
 python/cudf/cudf/_lib/io/utils.pyx         |   2 +-
 python/cudf/cudf/_lib/parquet.pyx          |  12 +--
 python/cudf/cudf/_lib/utils.pyx            |   6 +-
 python/cudf/cudf/core/_base_index.py       |   2 +-
 python/cudf/cudf/core/column_accessor.py   |  24 ++---
 python/cudf/cudf/core/dataframe.py         | 100 ++++++++++-----------
 python/cudf/cudf/core/frame.py             |  52 ++++++-----
 python/cudf/cudf/core/groupby/groupby.py   |  23 ++---
 python/cudf/cudf/core/index.py             |  20 ++++-
 python/cudf/cudf/core/indexed_frame.py     |  39 ++++----
 python/cudf/cudf/core/join/join.py         |  18 ++--
 python/cudf/cudf/core/multiindex.py        |  44 ++++-----
 python/cudf/cudf/core/reshape.py           |  22 ++---
 python/cudf/cudf/core/tools/datetimes.py   |   4 +-
 python/cudf/cudf/core/udf/groupby_utils.py |   2 +-
 python/cudf/cudf/core/udf/utils.py         |  18 ++--
 python/cudf/cudf/io/csv.py                 |  13 ++-
 python/cudf/cudf/io/dlpack.py              |   6 +-
 python/cudf/cudf/io/orc.py                 |   4 +-
 python/cudf/cudf/testing/testing.py        |   2 +-
 python/cudf/cudf/tests/test_multiindex.py  |  12 +--
 24 files changed, 223 insertions(+), 210 deletions(-)

diff --git a/python/cudf/cudf/_lib/concat.pyx b/python/cudf/cudf/_lib/concat.pyx
index e661059faa3..e6c2d136f0d 100644
--- a/python/cudf/cudf/_lib/concat.pyx
+++ b/python/cudf/cudf/_lib/concat.pyx
@@ -23,9 +23,9 @@ def concat_columns(object columns):
 def concat_tables(object tables, bool ignore_index=False):
     plc_tables = []
     for table in tables:
-        cols = table._data.columns
+        cols = table._columns
         if not ignore_index:
-            cols = table._index._data.columns + cols
+            cols = table._index._columns + cols
         plc_tables.append(pylibcudf.Table([c.to_pylibcudf(mode="read") for c in cols]))

     return data_from_pylibcudf_table(
diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx
index 16182e31c08..49714091f46 100644
--- a/python/cudf/cudf/_lib/copying.pyx
+++ b/python/cudf/cudf/_lib/copying.pyx
@@ -384,7 +384,7 @@ cdef class _CPackedColumns:
         p.column_names = input_table._column_names
         p.column_dtypes = {}

-        for name, col in input_table._data.items():
+        for name, col in input_table._column_labels_and_values:
             if isinstance(col.dtype, cudf.core.dtypes._BaseDtype):
                 p.column_dtypes[name] = col.dtype

diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx
index 058e884e08b..9ad96f610b3 100644
--- a/python/cudf/cudf/_lib/csv.pyx
+++ b/python/cudf/cudf/_lib/csv.pyx
@@ -273,7 +273,7 @@ def read_csv(
     elif isinstance(dtype, abc.Collection):
         for index, col_dtype in enumerate(dtype):
             if isinstance(cudf.dtype(col_dtype), cudf.CategoricalDtype):
-                col_name = df._data.names[index]
+                col_name = df._column_names[index]
                 df._data[col_name] = df._data[col_name].astype(col_dtype)

     if names is not None and len(names) and isinstance(names[0], int):
diff --git a/python/cudf/cudf/_lib/io/utils.pyx b/python/cudf/cudf/_lib/io/utils.pyx
index b1900138d94..564daefbae2 100644
--- a/python/cudf/cudf/_lib/io/utils.pyx
+++ b/python/cudf/cudf/_lib/io/utils.pyx
@@ -179,7 +179,7 @@ cdef update_struct_field_names(
 ):
     # Deprecated, remove in favor of add_col_struct_names
     # when a reader is ported to pylibcudf
-    for i, (name, col) in
enumerate(table._data.items()): + for i, (name, col) in enumerate(table._column_labels_and_values): table._data[name] = update_column_struct_field_names( col, schema_info[i] ) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index e6c9d60b05b..fa2690c7f21 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -235,16 +235,16 @@ cdef object _process_metadata(object df, df._index = idx elif set(index_col).issubset(names): index_data = df[index_col] - actual_index_names = list(index_col_names.values()) - if len(index_data._data) == 1: + actual_index_names = iter(index_col_names.values()) + if index_data._num_columns == 1: idx = cudf.Index._from_column( - index_data._data.columns[0], - name=actual_index_names[0] + index_data._columns[0], + name=next(actual_index_names) ) else: idx = cudf.MultiIndex.from_frame( index_data, - names=actual_index_names + names=list(actual_index_names) ) df.drop(columns=index_col, inplace=True) df._index = idx @@ -252,7 +252,7 @@ cdef object _process_metadata(object df, if use_pandas_metadata: df.index.names = index_col - if len(df._data.names) == 0 and column_index_type is not None: + if df._num_columns == 0 and column_index_type is not None: df._data.label_dtype = cudf.dtype(column_index_type) return df diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index cae28d02ef4..8660cca9322 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -49,9 +49,9 @@ cdef table_view table_view_from_table(tbl, ignore_index=False) except*: If True, don't include the index in the columns. """ return table_view_from_columns( - tbl._index._data.columns + tbl._data.columns + tbl._index._columns + tbl._columns if not ignore_index and tbl._index is not None - else tbl._data.columns + else tbl._columns ) @@ -62,7 +62,7 @@ cpdef generate_pandas_metadata(table, index): index_descriptors = [] columns_to_convert = list(table._columns) # Columns - for name, col in table._data.items(): + for name, col in table._column_labels_and_values: if cudf.get_option("mode.pandas_compatible"): # in pandas-compat mode, non-string column names are stringified. 
col_names.append(str(name)) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index ff114474aa4..a6abd63d042 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1951,7 +1951,7 @@ def drop_duplicates( return self._from_columns_like_self( drop_duplicates( list(self._columns), - keys=range(len(self._data)), + keys=range(len(self._columns)), keep=keep, nulls_are_equal=nulls_are_equal, ), diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 09b0f453692..bc093fdaa9a 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -151,9 +151,9 @@ def __setitem__(self, key: abc.Hashable, value: ColumnBase) -> None: self.set_by_label(key, value) def __delitem__(self, key: abc.Hashable) -> None: - old_ncols = len(self._data) + old_ncols = len(self) del self._data[key] - new_ncols = len(self._data) + new_ncols = len(self) self._clear_cache(old_ncols, new_ncols) def __len__(self) -> int: @@ -213,7 +213,7 @@ def level_names(self) -> tuple[abc.Hashable, ...]: @property def nlevels(self) -> int: - if len(self._data) == 0: + if len(self) == 0: return 0 if not self.multiindex: return 1 @@ -226,7 +226,7 @@ def name(self) -> abc.Hashable: @cached_property def nrows(self) -> int: - if len(self._data) == 0: + if len(self) == 0: return 0 else: return len(next(iter(self.values()))) @@ -257,9 +257,9 @@ def _clear_cache(self, old_ncols: int, new_ncols: int) -> None: Parameters ---------- old_ncols: int - len(self._data) before self._data was modified + len(self) before self._data was modified new_ncols: int - len(self._data) after self._data was modified + len(self) after self._data was modified """ cached_properties = ("columns", "names", "_grouped_data") for attr in cached_properties: @@ -335,7 +335,7 @@ def insert( if name in self._data: raise ValueError(f"Cannot insert '{name}', already exists") - old_ncols = len(self._data) + old_ncols = len(self) if loc == -1: loc = old_ncols elif not (0 <= loc <= old_ncols): @@ -414,7 +414,7 @@ def get_labels_by_index(self, index: Any) -> tuple: tuple """ if isinstance(index, slice): - start, stop, step = index.indices(len(self._data)) + start, stop, step = index.indices(len(self)) return self.names[start:stop:step] elif pd.api.types.is_integer(index): return (self.names[index],) @@ -526,9 +526,9 @@ def set_by_label(self, key: abc.Hashable, value: ColumnBase) -> None: if len(self) > 0 and len(value) != self.nrows: raise ValueError("All columns must be of equal length") - old_ncols = len(self._data) + old_ncols = len(self) self._data[key] = value - new_ncols = len(self._data) + new_ncols = len(self) self._clear_cache(old_ncols, new_ncols) def _select_by_label_list_like(self, key: tuple) -> Self: @@ -718,12 +718,12 @@ def droplevel(self, level: int) -> None: if level < 0: level += self.nlevels - old_ncols = len(self._data) + old_ncols = len(self) self._data = { _remove_key_level(key, level): value # type: ignore[arg-type] for key, value in self._data.items() } - new_ncols = len(self._data) + new_ncols = len(self) self._level_names = ( self._level_names[:level] + self._level_names[level + 1 :] ) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index d73ad8225ca..16b0aa95c35 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -176,7 +176,7 @@ def _can_downcast_to_series(self, df, arg): return False @_performance_tracking - def 
_downcast_to_series(self, df, arg): + def _downcast_to_series(self, df: DataFrame, arg): """ "Downcast" from a DataFrame to a Series based on Pandas indexing rules @@ -203,16 +203,16 @@ def _downcast_to_series(self, df, arg): # take series along the axis: if axis == 1: - return df[df._data.names[0]] + return df[df._column_names[0]] else: if df._num_columns > 0: dtypes = df.dtypes.values.tolist() normalized_dtype = np.result_type(*dtypes) - for name, col in df._data.items(): + for name, col in df._column_labels_and_values: df[name] = col.astype(normalized_dtype) sr = df.T - return sr[sr._data.names[0]] + return sr[sr._column_names[0]] class _DataFrameLocIndexer(_DataFrameIndexer): @@ -258,7 +258,7 @@ def _getitem_tuple_arg(self, arg): and len(arg) > 1 and is_scalar(arg[1]) ): - return result._data.columns[0].element_indexing(0) + return result._columns[0].element_indexing(0) return result else: if isinstance(arg[0], slice): @@ -310,7 +310,7 @@ def _getitem_tuple_arg(self, arg): else: tmp_col_name = str(uuid4()) cantor_name = "_" + "_".join( - map(str, columns_df._data.names) + map(str, columns_df._column_names) ) if columns_df._data.multiindex: # column names must be appropriate length tuples @@ -1412,7 +1412,7 @@ def __setitem__(self, arg, value): else column.column_empty_like( col, masked=True, newsize=length ) - for key, col in self._data.items() + for key, col in self._column_labels_and_values ) self._data = self._data._from_columns_like_self( new_columns, verify=False @@ -1494,8 +1494,8 @@ def __delitem__(self, name): @_performance_tracking def memory_usage(self, index=True, deep=False) -> cudf.Series: - mem_usage = [col.memory_usage for col in self._data.columns] - names = [str(name) for name in self._data.names] + mem_usage = [col.memory_usage for col in self._columns] + names = [str(name) for name in self._column_names] if index: mem_usage.append(self.index.memory_usage()) names.append("Index") @@ -1725,7 +1725,7 @@ def _concat( [] if are_all_range_index or (ignore_index and not empty_has_index) - else list(f.index._data.columns) + else list(f.index._columns) ) + [f._data[name] if name in f._data else None for name in names] for f in objs @@ -1808,7 +1808,7 @@ def _concat( out.index.dtype, cudf.CategoricalDtype ): out = out.set_index(out.index) - for name, col in out._data.items(): + for name, col in out._column_labels_and_values: out._data[name] = col._with_type_metadata( tables[0]._data[name].dtype ) @@ -1831,13 +1831,13 @@ def astype( errors: Literal["raise", "ignore"] = "raise", ): if is_dict_like(dtype): - if len(set(dtype.keys()) - set(self._data.names)) > 0: + if len(set(dtype.keys()) - set(self._column_names)) > 0: raise KeyError( "Only a column name can be used for the " "key in a dtype mappings argument." ) else: - dtype = {cc: dtype for cc in self._data.names} + dtype = {cc: dtype for cc in self._column_names} return super().astype(dtype, copy, errors) def _clean_renderable_dataframe(self, output): @@ -2601,7 +2601,7 @@ def equals(self, other) -> bool: # If all other checks matched, validate names. 
if ret: for self_name, other_name in zip( - self._data.names, other._data.names + self._column_names, other._column_names ): if self_name != other_name: ret = False @@ -2676,7 +2676,7 @@ def columns(self, columns): ) self._data = ColumnAccessor( - data=dict(zip(pd_columns, self._data.columns)), + data=dict(zip(pd_columns, self._columns)), multiindex=multiindex, level_names=level_names, label_dtype=label_dtype, @@ -2698,7 +2698,7 @@ def _set_columns_like(self, other: ColumnAccessor) -> None: f"got {len(self)} elements" ) self._data = ColumnAccessor( - data=dict(zip(other.names, self._data.columns)), + data=dict(zip(other.names, self._columns)), multiindex=other.multiindex, rangeindex=other.rangeindex, level_names=other.level_names, @@ -2983,7 +2983,7 @@ def set_index( elif isinstance(col, (MultiIndex, pd.MultiIndex)): if isinstance(col, pd.MultiIndex): col = MultiIndex.from_pandas(col) - data_to_add.extend(col._data.columns) + data_to_add.extend(col._columns) names.extend(col.names) elif isinstance( col, (cudf.Series, cudf.Index, pd.Series, pd.Index) @@ -3110,7 +3110,9 @@ def where(self, cond, other=None, inplace=False, axis=None, level=None): ) out = [] - for (name, col), other_col in zip(self._data.items(), other_cols): + for (name, col), other_col in zip( + self._column_labels_and_values, other_cols + ): source_col, other_col = _check_and_cast_columns_with_other( source_col=col, other=other_col, @@ -3314,7 +3316,7 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True): column.column_empty_like( col_data, masked=True, newsize=length ) - for col_data in self._data.values() + for col_data in self._columns ), verify=False, ) @@ -3664,7 +3666,7 @@ def rename( name: col.find_and_replace( to_replace, vals, is_all_na ) - for name, col in self.index._data.items() + for name, col in self.index._column_labels_and_values } ) except OverflowError: @@ -3686,9 +3688,7 @@ def add_prefix(self, prefix, axis=None): raise NotImplementedError("axis is currently not implemented.") # TODO: Change to deep=False when copy-on-write is default out = self.copy(deep=True) - out.columns = [ - prefix + col_name for col_name in list(self._data.keys()) - ] + out.columns = [prefix + col_name for col_name in self._column_names] return out @_performance_tracking @@ -3697,9 +3697,7 @@ def add_suffix(self, suffix, axis=None): raise NotImplementedError("axis is currently not implemented.") # TODO: Change to deep=False when copy-on-write is default out = self.copy(deep=True) - out.columns = [ - col_name + suffix for col_name in list(self._data.keys()) - ] + out.columns = [col_name + suffix for col_name in self._column_names] return out @_performance_tracking @@ -4805,7 +4803,7 @@ def _func(x): # pragma: no cover # TODO: naive implementation # this could be written as a single kernel result = {} - for name, col in self._data.items(): + for name, col in self._column_labels_and_values: apply_sr = Series._from_column(col) result[name] = apply_sr.apply(_func)._column @@ -5444,7 +5442,7 @@ def to_pandas( out_index = self.index.to_pandas() out_data = { i: col.to_pandas(nullable=nullable, arrow_type=arrow_type) - for i, col in enumerate(self._data.columns) + for i, col in enumerate(self._columns) } out_df = pd.DataFrame(out_data, index=out_index) @@ -5665,14 +5663,16 @@ def to_arrow(self, preserve_index=None) -> pa.Table: index = index._as_int_index() index.name = "__index_level_0__" if isinstance(index, MultiIndex): - index_descr = list(index._data.names) + index_descr = index._column_names index_levels = 
index.levels else: index_descr = ( index.names if index.name is not None else ("index",) ) data = data.copy(deep=False) - for gen_name, col_name in zip(index_descr, index._data.names): + for gen_name, col_name in zip( + index_descr, index._column_names + ): data._insert( data.shape[1], gen_name, @@ -5681,7 +5681,7 @@ def to_arrow(self, preserve_index=None) -> pa.Table: out = super(DataFrame, data).to_arrow() metadata = pa.pandas_compat.construct_metadata( - columns_to_convert=[self[col] for col in self._data.names], + columns_to_convert=[self[col] for col in self._column_names], df=self, column_names=out.schema.names, index_levels=index_levels, @@ -5724,12 +5724,12 @@ def to_records(self, index=True, column_dtypes=None, index_dtypes=None): "column_dtypes is currently not supported." ) members = [("index", self.index.dtype)] if index else [] - members += [(col, self[col].dtype) for col in self._data.names] + members += list(self._dtypes) dtype = np.dtype(members) ret = np.recarray(len(self), dtype=dtype) if index: ret["index"] = self.index.to_numpy() - for col in self._data.names: + for col in self._column_names: ret[col] = self[col].to_numpy() return ret @@ -6059,7 +6059,7 @@ def quantile( ) if columns is None: - columns = data_df._data.names + columns = set(data_df._column_names) if isinstance(q, numbers.Number): q_is_number = True @@ -6084,7 +6084,7 @@ def quantile( # Ensure that qs is non-scalar so that we always get a column back. interpolation = interpolation or "linear" result = {} - for k in data_df._data.names: + for k in data_df._column_names: if k in columns: ser = data_df[k] res = ser.quantile( @@ -6198,7 +6198,7 @@ def make_false_column_like_self(): if isinstance(values, DataFrame) else {name: values._column for name in self._data} ) - for col, self_col in self._data.items(): + for col, self_col in self._column_labels_and_values: if col in other_cols: other_col = other_cols[col] self_is_cat = isinstance(self_col, CategoricalColumn) @@ -6231,13 +6231,13 @@ def make_false_column_like_self(): else: result[col] = make_false_column_like_self() elif is_dict_like(values): - for name, col in self._data.items(): + for name, col in self._column_labels_and_values: if name in values: result[name] = col.isin(values[name]) else: result[name] = make_false_column_like_self() elif is_list_like(values): - for name, col in self._data.items(): + for name, col in self._column_labels_and_values: result[name] = col.isin(values) else: raise TypeError( @@ -6292,7 +6292,7 @@ def _prepare_for_rowwise_op(self, method, skipna, numeric_only): name: filtered._data[name]._get_mask_as_column() if filtered._data[name].nullable else as_column(True, length=len(filtered._data[name])) - for name in filtered._data.names + for name in filtered._column_names } ) mask = mask.all(axis=1) @@ -6342,7 +6342,7 @@ def count(self, axis=0, numeric_only=False): length = len(self) return Series._from_column( as_column([length - col.null_count for col in self._columns]), - index=cudf.Index(self._data.names), + index=cudf.Index(self._column_names), ) _SUPPORT_AXIS_LOOKUP = { @@ -6409,7 +6409,7 @@ def _reduce( return source._apply_cupy_method_axis_1(op, **kwargs) else: axis_0_results = [] - for col_label, col in source._data.items(): + for col_label, col in source._column_labels_and_values: try: axis_0_results.append(getattr(col, op)(**kwargs)) except AttributeError as err: @@ -6634,7 +6634,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): prepared, mask, common_dtype = self._prepare_for_rowwise_op( method, skipna, 
numeric_only ) - for col in prepared._data.names: + for col in prepared._column_names: if prepared._data[col].nullable: prepared._data[col] = ( prepared._data[col] @@ -6820,7 +6820,7 @@ def select_dtypes(self, include=None, exclude=None): # remove all exclude types inclusion = inclusion - exclude_subtypes - for k, col in self._data.items(): + for k, col in self._column_labels_and_values: infered_type = cudf_dtype_from_pydata_dtype(col.dtype) if infered_type in inclusion: df._insert(len(df._data), k, col) @@ -7192,7 +7192,7 @@ def stack(self, level=-1, dropna=no_default, future_stack=False): # Compute the column indices that serves as the input for # `interleave_columns` column_idx_df = pd.DataFrame( - data=range(len(self._data)), index=named_levels + data=range(self._num_columns), index=named_levels ) column_indices: list[list[int]] = [] @@ -7392,17 +7392,17 @@ def to_struct(self, name=None): ----- Note: a copy of the columns is made. """ - if not all(isinstance(name, str) for name in self._data.names): + if not all(isinstance(name, str) for name in self._column_names): warnings.warn( "DataFrame contains non-string column name(s). Struct column " "requires field name to be string. Non-string column names " "will be casted to string as the field name." ) - fields = {str(name): col.dtype for name, col in self._data.items()} + fields = {str(name): dtype for name, dtype in self._dtypes} col = StructColumn( data=None, dtype=cudf.StructDtype(fields=fields), - children=tuple(col.copy(deep=True) for col in self._data.columns), + children=tuple(col.copy(deep=True) for col in self._columns), size=len(self), offset=0, ) @@ -7984,7 +7984,7 @@ def value_counts( diff = set(subset) - set(self._data) if len(diff) != 0: raise KeyError(f"columns {diff} do not exist") - columns = list(self._data.names) if subset is None else subset + columns = list(self._column_names) if subset is None else subset result = ( self.groupby( by=columns, @@ -8105,7 +8105,7 @@ def func(left, right, output): right._column_names ) elif _is_scalar_or_zero_d_array(right): - for name, col in output._data.items(): + for name, col in output._column_labels_and_values: output._data[name] = col.fillna(value) return output else: @@ -8387,7 +8387,7 @@ def extract_col(df, col): and col not in df.index._data and not isinstance(df.index, MultiIndex) ): - return df.index._data.columns[0] + return df.index._column return df.index._data[col] diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 7b2bc85b13b..98af006f6e5 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -75,8 +75,15 @@ def _columns(self) -> tuple[ColumnBase, ...]: return self._data.columns @property - def _dtypes(self) -> abc.Iterable: - return zip(self._data.names, (col.dtype for col in self._data.columns)) + def _column_labels_and_values( + self, + ) -> abc.Iterable[tuple[abc.Hashable, ColumnBase]]: + return zip(self._column_names, self._columns) + + @property + def _dtypes(self) -> abc.Generator[tuple[abc.Hashable, Dtype], None, None]: + for label, col in self._column_labels_and_values: + yield label, col.dtype @property def ndim(self) -> int: @@ -87,7 +94,7 @@ def serialize(self): # TODO: See if self._data can be serialized outright header = { "type-serialized": pickle.dumps(type(self)), - "column_names": pickle.dumps(tuple(self._data.names)), + "column_names": pickle.dumps(self._column_names), "column_rangeindex": pickle.dumps(self._data.rangeindex), "column_multiindex": pickle.dumps(self._data.multiindex), 
"column_label_dtype": pickle.dumps(self._data.label_dtype), @@ -156,7 +163,7 @@ def _mimic_inplace( self, result: Self, inplace: bool = False ) -> Self | None: if inplace: - for col in self._data: + for col in self._column_names: if col in result._data: self._data[col]._mimic_inplace( result._data[col], inplace=True @@ -267,7 +274,7 @@ def __len__(self) -> int: def astype(self, dtype: dict[Any, Dtype], copy: bool = False) -> Self: casted = ( col.astype(dtype.get(col_name, col.dtype), copy=copy) - for col_name, col in self._data.items() + for col_name, col in self._column_labels_and_values ) ca = self._data._from_columns_like_self(casted, verify=False) return self._from_data_like_self(ca) @@ -338,9 +345,7 @@ def equals(self, other) -> bool: return all( self_col.equals(other_col, check_dtypes=True) - for self_col, other_col in zip( - self._data.values(), other._data.values() - ) + for self_col, other_col in zip(self._columns, other._columns) ) @_performance_tracking @@ -434,11 +439,9 @@ def to_array( if dtype is None: if ncol == 1: - dtype = next(iter(self._data.values())).dtype + dtype = next(self._dtypes)[1] else: - dtype = find_common_type( - [col.dtype for col in self._data.values()] - ) + dtype = find_common_type([dtype for _, dtype in self._dtypes]) if not isinstance(dtype, numpy.dtype): raise NotImplementedError( @@ -446,12 +449,12 @@ def to_array( ) if self.ndim == 1: - return to_array(self._data.columns[0], dtype) + return to_array(self._columns[0], dtype) else: matrix = module.empty( shape=(len(self), ncol), dtype=dtype, order="F" ) - for i, col in enumerate(self._data.values()): + for i, col in enumerate(self._columns): # TODO: col.values may fail if there is nullable data or an # unsupported dtype. We may want to catch and provide a more # suitable error. @@ -751,7 +754,7 @@ def fillna( filled_columns = [ col.fillna(value[name], method) if name in value else col.copy() - for name, col in self._data.items() + for name, col in self._column_labels_and_values ] return self._mimic_inplace( @@ -988,7 +991,10 @@ def to_arrow(self): index: [[1,2,3]] """ return pa.Table.from_pydict( - {str(name): col.to_arrow() for name, col in self._data.items()} + { + str(name): col.to_arrow() + for name, col in self._column_labels_and_values + } ) @_performance_tracking @@ -1012,7 +1018,9 @@ def _copy_type_metadata(self: Self, other: Self) -> Self: See `ColumnBase._with_type_metadata` for more information. 
""" - for (name, col), (_, dtype) in zip(self._data.items(), other._dtypes): + for (name, col), (_, dtype) in zip( + self._column_labels_and_values, other._dtypes + ): self._data.set_by_label(name, col._with_type_metadata(dtype)) return self @@ -1422,7 +1430,7 @@ def _split(self, splits): """ return [ self._from_columns_like_self( - libcudf.copying.columns_split([*self._data.columns], splits)[ + libcudf.copying.columns_split(list(self._columns), splits)[ split_idx ], self._column_names, @@ -1432,7 +1440,7 @@ def _split(self, splits): @_performance_tracking def _encode(self): - columns, indices = libcudf.transform.table_encode([*self._columns]) + columns, indices = libcudf.transform.table_encode(list(self._columns)) keys = self._from_columns_like_self(columns) return keys, indices @@ -1578,7 +1586,7 @@ def __neg__(self): col.unary_operator("not") if col.dtype.kind == "b" else -1 * col - for col in self._data.columns + for col in self._columns ) ) ) @@ -1840,9 +1848,7 @@ def __copy__(self): def __invert__(self): """Bitwise invert (~) for integral dtypes, logical NOT for bools.""" return self._from_data_like_self( - self._data._from_columns_like_self( - (~col for col in self._data.columns) - ) + self._data._from_columns_like_self((~col for col in self._columns)) ) @_performance_tracking diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 6424c8af877..cb8cd0cd28b 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -751,10 +751,8 @@ def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): ) and not libgroupby._is_all_scan_aggregate(normalized_aggs): # Even with `sort=False`, pandas guarantees that # groupby preserves the order of rows within each group. - left_cols = list( - self.grouping.keys.drop_duplicates()._data.columns - ) - right_cols = list(result_index._data.columns) + left_cols = list(self.grouping.keys.drop_duplicates()._columns) + right_cols = list(result_index._columns) join_keys = [ _match_join_keys(lcol, rcol, "left") for lcol, rcol in zip(left_cols, right_cols) @@ -1483,7 +1481,7 @@ def _post_process_chunk_results( # the column name should be, especially if we applied # a nameless UDF. result = result.to_frame( - name=grouped_values._data.names[0] + name=grouped_values._column_names[0] ) else: index_data = group_keys._data.copy(deep=True) @@ -1632,7 +1630,7 @@ def mult(df): if func in {"sum", "product"}: # For `sum` & `product`, boolean types # will need to result in `int64` type. 
- for name, col in res._data.items(): + for name, col in res._column_labels_and_values: if col.dtype.kind == "b": res._data[name] = col.astype("int") return res @@ -2715,11 +2713,8 @@ class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin): def _reduce_numeric_only(self, op: str): columns = list( name - for name in self.obj._data.names - if ( - is_numeric_dtype(self.obj._data[name].dtype) - and name not in self.grouping.names - ) + for name, dtype in self.obj._dtypes + if (is_numeric_dtype(dtype) and name not in self.grouping.names) ) return self[columns].agg(op) @@ -3209,7 +3204,7 @@ def values(self) -> cudf.core.frame.Frame: """ # If the key columns are in `obj`, filter them out value_column_names = [ - x for x in self._obj._data.names if x not in self._named_columns + x for x in self._obj._column_names if x not in self._named_columns ] value_columns = self._obj._data.select_by_label(value_column_names) return self._obj.__class__._from_data(value_columns) @@ -3224,8 +3219,8 @@ def _handle_series(self, by): self.names.append(by.name) def _handle_index(self, by): - self._key_columns.extend(by._data.columns) - self.names.extend(by._data.names) + self._key_columns.extend(by._columns) + self.names.extend(by._column_names) def _handle_mapping(self, by): by = cudf.Series(by.values(), index=by.keys()) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index b2bd20c4982..cd07c58c5d9 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -122,13 +122,13 @@ def _lexsorted_equal_range( sort_inds = None sort_vals = idx lower_bound = search_sorted( - [*sort_vals._data.columns], + list(sort_vals._columns), keys, side="left", ascending=sort_vals.is_monotonic_increasing, ).element_indexing(0) upper_bound = search_sorted( - [*sort_vals._data.columns], + list(sort_vals._columns), keys, side="right", ascending=sort_vals.is_monotonic_increasing, @@ -286,6 +286,20 @@ def name(self): def name(self, value): self._name = value + @property + @_performance_tracking + def _column_names(self) -> tuple[Any]: + return (self.name,) + + @property + @_performance_tracking + def _columns(self) -> tuple[ColumnBase]: + return (self._values,) + + @property + def _column_labels_and_values(self) -> Iterable: + return zip(self._column_names, self._columns) + @property # type: ignore @_performance_tracking def start(self) -> int: @@ -1068,7 +1082,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): else: inputs = { name: (col, None, False, None) - for name, col in self._data.items() + for name, col in self._column_labels_and_values } data = self._apply_cupy_ufunc_to_operands( diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index fd6bf37f0e6..810d4ad74e7 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -294,7 +294,7 @@ def _num_rows(self) -> int: @property def _index_names(self) -> tuple[Any, ...]: # TODO: Tuple[str]? 
- return self.index._data.names + return self.index._column_names @classmethod def _from_data( @@ -307,6 +307,7 @@ def _from_data( raise ValueError( f"index must be None or a cudf.Index not {type(index).__name__}" ) + # out._num_rows requires .index to be defined out._index = RangeIndex(out._data.nrows) if index is None else index return out @@ -882,7 +883,7 @@ def replace( columns_dtype_map=dict(self._dtypes), ) copy_data = [] - for name, col in self._data.items(): + for name, col in self._column_labels_and_values: try: replaced = col.find_and_replace( to_replace_per_column[name], @@ -2703,11 +2704,11 @@ def sort_index( by.extend( filter( lambda n: n not in handled, - self.index._data.names, + self.index._column_names, ) ) else: - by = list(idx._data.names) + by = list(idx._column_names) inds = idx._get_sorted_inds( by=by, ascending=ascending, na_position=na_position @@ -3013,7 +3014,7 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self: columns_to_slice = [ *( - self.index._data.columns + self.index._columns if keep_index and not has_range_index else [] ), @@ -3210,7 +3211,7 @@ def _empty_like(self, keep_index=True) -> Self: result = self._from_columns_like_self( libcudf.copying.columns_empty_like( [ - *(self.index._data.columns if keep_index else ()), + *(self.index._columns if keep_index else ()), *self._columns, ] ), @@ -3227,7 +3228,7 @@ def _split(self, splits, keep_index=True): columns_split = libcudf.copying.columns_split( [ - *(self.index._data.columns if keep_index else []), + *(self.index._columns if keep_index else []), *self._columns, ], splits, @@ -3763,8 +3764,8 @@ def _reindex( idx_dtype_match = (df.index.nlevels == index.nlevels) and all( _is_same_dtype(left_dtype, right_dtype) for left_dtype, right_dtype in zip( - (col.dtype for col in df.index._data.columns), - (col.dtype for col in index._data.columns), + (dtype for _, dtype in df.index._dtypes), + (dtype for _, dtype in index._dtypes), ) ) @@ -3783,7 +3784,7 @@ def _reindex( (name or 0) if isinstance(self, cudf.Series) else name: col - for name, col in df._data.items() + for name, col in df._column_labels_and_values }, index=df.index, ) @@ -3794,7 +3795,7 @@ def _reindex( index = index if index is not None else df.index if column_names is None: - names = list(df._data.names) + names = list(df._column_names) level_names = self._data.level_names multiindex = self._data.multiindex rangeindex = self._data.rangeindex @@ -3948,7 +3949,7 @@ def round(self, decimals=0, how="half_even"): col.round(decimals[name], how=how) if name in decimals and col.dtype.kind in "fiu" else col.copy(deep=True) - for name, col in self._data.items() + for name, col in self._column_labels_and_values ) return self._from_data_like_self( self._data._from_columns_like_self(cols) @@ -4270,7 +4271,7 @@ def _drop_na_columns(self, how="any", subset=None, thresh=None): else: thresh = len(df) - for name, col in df._data.items(): + for name, col in df._column_labels_and_values: check_col = col.nans_to_nulls() no_threshold_valid_count = ( len(col) - check_col.null_count @@ -4305,7 +4306,7 @@ def _drop_na_rows(self, how="any", subset=None, thresh=None): return self._from_columns_like_self( libcudf.stream_compaction.drop_nulls( - [*self.index._data.columns, *data_columns], + [*self.index._columns, *data_columns], how=how, keys=self._positions_from_column_names( subset, offset_by_index_columns=True @@ -4853,7 +4854,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): # This works for Index too inputs = { name: (col, None, False, None) 
- for name, col in self._data.items() + for name, col in self._column_labels_and_values } index = self.index @@ -4933,7 +4934,7 @@ def repeat(self, repeats, axis=None): """ res = self._from_columns_like_self( Frame._repeat( - [*self.index._data.columns, *self._columns], repeats, axis + [*self.index._columns, *self._columns], repeats, axis ), self._column_names, self._index_names, @@ -6224,7 +6225,7 @@ def _preprocess_subset(self, subset): not np.iterable(subset) or isinstance(subset, str) or isinstance(subset, tuple) - and subset in self._data.names + and subset in self._column_names ): subset = (subset,) diff = set(subset) - set(self._data) @@ -6306,8 +6307,8 @@ def rank( ) numeric_cols = ( name - for name in self._data.names - if _is_non_decimal_numeric_dtype(self._data[name]) + for name, dtype in self._dtypes + if _is_non_decimal_numeric_dtype(dtype) ) source = self._get_columns_by_label(numeric_cols) if source.empty: diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index b65bc7af832..cfeaca00888 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -140,11 +140,15 @@ def __init__( # right_on. self._using_left_index = bool(left_index) left_on = ( - lhs.index._data.names if left_index else left_on if left_on else on + lhs.index._column_names + if left_index + else left_on + if left_on + else on ) self._using_right_index = bool(right_index) right_on = ( - rhs.index._data.names + rhs.index._column_names if right_index else right_on if right_on @@ -334,18 +338,18 @@ def _merge_results( # All columns from the left table make it into the output. Non-key # columns that share a name with a column in the right table are # suffixed with the provided suffix. - common_names = set(left_result._data.names) & set( - right_result._data.names + common_names = set(left_result._column_names) & set( + right_result._column_names ) cols_to_suffix = common_names - self._key_columns_with_same_name data = { (f"{name}{self.lsuffix}" if name in cols_to_suffix else name): col - for name, col in left_result._data.items() + for name, col in left_result._column_labels_and_values } # The right table follows the same rule as the left table except that # key columns from the right table are removed. - for name, col in right_result._data.items(): + for name, col in right_result._column_labels_and_values: if name in common_names: if name not in self._key_columns_with_same_name: data[f"{name}{self.rsuffix}"] = col @@ -399,7 +403,7 @@ def _sort_result(self, result: cudf.DataFrame) -> cudf.DataFrame: # producing the input result. by: list[Any] = [] if self._using_left_index and self._using_right_index: - by.extend(result.index._data.columns) + by.extend(result.index._columns) if not self._using_left_index: by.extend([result._data[col.name] for col in self._left_keys]) if not self._using_right_index: diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index b86ad38c944..6de3981ba66 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -233,8 +233,8 @@ def names(self, value): # to unexpected behavior in some cases. This is # definitely buggy, but we can't disallow non-unique # names either... 
- self._data = self._data.__class__( - dict(zip(value, self._data.values())), + self._data = type(self._data)( + dict(zip(value, self._columns)), level_names=self._data.level_names, verify=False, ) @@ -693,19 +693,25 @@ def where(self, cond, other=None, inplace=False): @_performance_tracking def _compute_validity_mask(self, index, row_tuple, max_length): """Computes the valid set of indices of values in the lookup""" - lookup = cudf.DataFrame() + lookup_dict = {} for i, row in enumerate(row_tuple): if isinstance(row, slice) and row == slice(None): continue - lookup[i] = cudf.Series(row) - frame = cudf.DataFrame(dict(enumerate(index._data.columns))) + lookup_dict[i] = row + lookup = cudf.DataFrame(lookup_dict) + frame = cudf.DataFrame._from_data( + ColumnAccessor(dict(enumerate(index._columns)), verify=False) + ) with warnings.catch_warnings(): warnings.simplefilter("ignore", FutureWarning) data_table = cudf.concat( [ frame, cudf.DataFrame._from_data( - {"idx": column.as_column(range(len(frame)))} + ColumnAccessor( + {"idx": column.as_column(range(len(frame)))}, + verify=False, + ) ), ], axis=1, @@ -716,7 +722,7 @@ def _compute_validity_mask(self, index, row_tuple, max_length): # TODO: Remove this after merge/join # obtain deterministic ordering. if cudf.get_option("mode.pandas_compatible"): - lookup_order = "_" + "_".join(map(str, lookup._data.names)) + lookup_order = "_" + "_".join(map(str, lookup._column_names)) lookup[lookup_order] = column.as_column(range(len(lookup))) postprocess = operator.methodcaller( "sort_values", by=[lookup_order, "idx"] @@ -784,7 +790,7 @@ def _index_and_downcast(self, result, index, index_key): out_index.insert( out_index._num_columns, k, - cudf.Series._from_column(index._data.columns[k]), + cudf.Series._from_column(index._columns[k]), ) # determine if we should downcast from a DataFrame to a Series @@ -800,19 +806,19 @@ def _index_and_downcast(self, result, index, index_key): ) if need_downcast: result = result.T - return result[result._data.names[0]] + return result[result._column_names[0]] if len(result) == 0 and not slice_access: # Pandas returns an empty Series with a tuple as name # the one expected result column result = cudf.Series._from_data( - {}, name=tuple(col[0] for col in index._data.columns) + {}, name=tuple(col[0] for col in index._columns) ) elif out_index._num_columns == 1: # If there's only one column remaining in the output index, convert # it into an Index and name the final index values according # to that column's name. 
- *_, last_column = index._data.columns + last_column = index._columns[-1] out_index = cudf.Index._from_column( last_column, name=index.names[-1] ) @@ -894,7 +900,7 @@ def __eq__(self, other): [ self_col.equals(other_col) for self_col, other_col in zip( - self._data.values(), other._data.values() + self._columns, other._columns ) ] ) @@ -1475,10 +1481,10 @@ def swaplevel(self, i=-2, j=-1) -> Self: ('aa', 'b')], ) """ - name_i = self._data.names[i] if isinstance(i, int) else i - name_j = self._data.names[j] if isinstance(j, int) else j + name_i = self._column_names[i] if isinstance(i, int) else i + name_j = self._column_names[j] if isinstance(j, int) else j new_data = {} - for k, v in self._data.items(): + for k, v in self._column_labels_and_values: if k not in (name_i, name_j): new_data[k] = v elif k == name_i: @@ -1916,7 +1922,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): join_keys = [ _match_join_keys(lcol, rcol, "inner") - for lcol, rcol in zip(target._data.columns, self._data.columns) + for lcol, rcol in zip(target._columns, self._columns) ] join_keys = map(list, zip(*join_keys)) scatter_map, indices = libcudf.join.join( @@ -2113,7 +2119,7 @@ def _split_columns_by_levels( lv if isinstance(lv, int) else level_names.index(lv) for lv in levels } - for i, (name, col) in enumerate(zip(self.names, self._data.columns)): + for i, (name, col) in enumerate(zip(self.names, self._columns)): if in_levels and i in level_indices: name = f"level_{i}" if name is None else name yield name, col @@ -2154,9 +2160,7 @@ def _columns_for_reset_index( ) -> Generator[tuple[Any, column.ColumnBase], None, None]: """Return the columns and column names for .reset_index""" if levels is None: - for i, (col, name) in enumerate( - zip(self._data.columns, self.names) - ): + for i, (col, name) in enumerate(zip(self._columns, self.names)): yield f"level_{i}" if name is None else name, col else: yield from self._split_columns_by_levels(levels, in_levels=True) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index c951db00c9a..401fef67ee6 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -410,7 +410,7 @@ def concat( result_columns = None if keys_objs is None: for o in objs: - for name, col in o._data.items(): + for name, col in o._column_labels_and_values: if name in result_data: raise NotImplementedError( f"A Column with duplicate name found: {name}, cuDF " @@ -438,7 +438,7 @@ def concat( else: # All levels in the multiindex label must have the same type has_multiple_level_types = ( - len({type(name) for o in objs for name in o._data.keys()}) > 1 + len({type(name) for o in objs for name in o._column_names}) > 1 ) if has_multiple_level_types: raise NotImplementedError( @@ -447,7 +447,7 @@ def concat( "the labels to the same type." 
) for k, o in zip(keys_objs, objs): - for name, col in o._data.items(): + for name, col in o._column_labels_and_values: # if only series, then only keep keys_objs as column labels # if the existing column is multiindex, prepend it # to handle cases where dfs and srs are concatenated @@ -843,7 +843,7 @@ def get_dummies( else: result_data = { col_name: col - for col_name, col in data._data.items() + for col_name, col in data._column_labels_and_values if col_name not in columns } @@ -943,7 +943,7 @@ def _merge_sorted( columns = [ [ - *(obj.index._data.columns if not ignore_index else ()), + *(obj.index._columns if not ignore_index else ()), *obj._columns, ] for obj in objs @@ -985,7 +985,7 @@ def as_tuple(x): return x if isinstance(x, tuple) else (x,) nrows = len(index_labels) - for col_label, col in df._data.items(): + for col_label, col in df._column_labels_and_values: names = [ as_tuple(col_label) + as_tuple(name) for name in column_labels ] @@ -1009,7 +1009,7 @@ def as_tuple(x): ca = ColumnAccessor( result, multiindex=True, - level_names=(None,) + columns._data.names, + level_names=(None,) + columns._column_names, verify=False, ) return cudf.DataFrame._from_data( @@ -1087,11 +1087,7 @@ def pivot(data, columns=None, index=no_default, values=no_default): # Create a DataFrame composed of columns from both # columns and index ca = ColumnAccessor( - dict( - enumerate( - itertools.chain(index._data.columns, columns._data.columns) - ) - ), + dict(enumerate(itertools.chain(index._columns, columns._columns))), verify=False, ) columns_index = cudf.DataFrame._from_data(ca) @@ -1560,7 +1556,7 @@ def pivot_table( if values_passed and not values_multi and table._data.multiindex: column_names = table._data.level_names[1:] table_columns = tuple( - map(lambda column: column[1:], table._data.names) + map(lambda column: column[1:], table._column_names) ) table.columns = pd.MultiIndex.from_tuples( tuples=table_columns, names=column_names diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 7197560b5a4..68f34fa28ff 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -186,7 +186,7 @@ def to_datetime( if isinstance(arg, cudf.DataFrame): # we require at least Ymd required = ["year", "month", "day"] - req = list(set(required) - set(arg._data.names)) + req = list(set(required) - set(arg._column_names)) if len(req): err_req = ",".join(req) raise ValueError( @@ -196,7 +196,7 @@ def to_datetime( ) # replace passed column name with values in _unit_map - got_units = {k: get_units(k) for k in arg._data.names} + got_units = {k: get_units(k) for k in arg._column_names} unit_rev = {v: k for k, v in got_units.items()} # keys we don't recognize diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py index 265b87350ae..3af662b62ea 100644 --- a/python/cudf/cudf/core/udf/groupby_utils.py +++ b/python/cudf/cudf/core/udf/groupby_utils.py @@ -210,7 +210,7 @@ def _can_be_jitted(frame, func, args): # See https://github.com/numba/numba/issues/4587 return False - if any(col.has_nulls() for col in frame._data.values()): + if any(col.has_nulls() for col in frame._columns): return False np_field_types = np.dtype( list( diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index 6d7362952c9..bfe716f0afc 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -126,25 +126,23 @@ def _get_udf_return_type(argty, func: Callable, 
args=()): def _all_dtypes_from_frame(frame, supported_types=JIT_SUPPORTED_TYPES): return { - colname: col.dtype - if str(col.dtype) in supported_types - else np.dtype("O") - for colname, col in frame._data.items() + colname: dtype if str(dtype) in supported_types else np.dtype("O") + for colname, dtype in frame._dtypes } def _supported_dtypes_from_frame(frame, supported_types=JIT_SUPPORTED_TYPES): return { - colname: col.dtype - for colname, col in frame._data.items() - if str(col.dtype) in supported_types + colname: dtype + for colname, dtype in frame._dtypes + if str(dtype) in supported_types } def _supported_cols_from_frame(frame, supported_types=JIT_SUPPORTED_TYPES): return { colname: col - for colname, col in frame._data.items() + for colname, col in frame._column_labels_and_values if str(col.dtype) in supported_types } @@ -232,8 +230,8 @@ def _generate_cache_key(frame, func: Callable, args, suffix="__APPLY_UDF"): *cudautils.make_cache_key( func, tuple(_all_dtypes_from_frame(frame).values()) ), - *(col.mask is None for col in frame._data.values()), - *frame._data.keys(), + *(col.mask is None for col in frame._columns), + *frame._column_names, scalar_argtypes, suffix, ) diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index a9c20150930..3dc8915bfd1 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -186,13 +186,13 @@ def to_csv( "Dataframe doesn't have the labels provided in columns" ) - for col in df._data.columns: - if isinstance(col, cudf.core.column.ListColumn): + for _, dtype in df._dtypes: + if isinstance(dtype, cudf.ListDtype): raise NotImplementedError( "Writing to csv format is not yet supported with " "list columns." ) - elif isinstance(col, cudf.core.column.StructColumn): + elif isinstance(dtype, cudf.StructDtype): raise NotImplementedError( "Writing to csv format is not yet supported with " "Struct columns." 
@@ -203,12 +203,11 @@ def to_csv( # workaround once following issue is fixed: # https://github.com/rapidsai/cudf/issues/6661 if any( - isinstance(col, cudf.core.column.CategoricalColumn) - for col in df._data.columns + isinstance(dtype, cudf.CategoricalDtype) for _, dtype in df._dtypes ) or isinstance(df.index, cudf.CategoricalIndex): df = df.copy(deep=False) - for col_name, col in df._data.items(): - if isinstance(col, cudf.core.column.CategoricalColumn): + for col_name, col in df._column_labels_and_values: + if isinstance(col.dtype, cudf.CategoricalDtype): df._data[col_name] = col.astype(col.categories.dtype) if isinstance(df.index, cudf.CategoricalIndex): diff --git a/python/cudf/cudf/io/dlpack.py b/python/cudf/cudf/io/dlpack.py index 1347b2cc38f..fe8e446f9c0 100644 --- a/python/cudf/cudf/io/dlpack.py +++ b/python/cudf/cudf/io/dlpack.py @@ -79,13 +79,13 @@ def to_dlpack(cudf_obj): ) if any( - not cudf.api.types._is_non_decimal_numeric_dtype(col.dtype) - for col in gdf._data.columns + not cudf.api.types._is_non_decimal_numeric_dtype(dtype) + for _, dtype in gdf._dtypes ): raise TypeError("non-numeric data not yet supported") dtype = cudf.utils.dtypes.find_common_type( - [col.dtype for col in gdf._data.columns] + [dtype for _, dtype in gdf._dtypes] ) gdf = gdf.astype(dtype) diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index fd246c6215f..c54293badbe 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -396,8 +396,8 @@ def to_orc( ): """{docstring}""" - for col in df._data.columns: - if isinstance(col, cudf.core.column.CategoricalColumn): + for _, dtype in df._dtypes: + if isinstance(dtype, cudf.CategoricalDtype): raise NotImplementedError( "Writing to ORC format is not yet supported with " "Categorical columns." 
diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index 31ad24a4664..668e7a77454 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -676,7 +676,7 @@ def assert_frame_equal( if check_like: left, right = left.reindex(index=right.index), right - right = right[list(left._data.names)] + right = right[list(left._column_names)] # index comparison assert_index_equal( diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index b1e095e8853..c41be3e4428 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -813,8 +813,8 @@ def test_multiindex_copy_deep(data, copy_on_write, deep): mi1 = gdf.groupby(["Date", "Symbol"]).mean().index mi2 = mi1.copy(deep=deep) - lchildren = [col.children for _, col in mi1._data.items()] - rchildren = [col.children for _, col in mi2._data.items()] + lchildren = [col.children for col in mi1._columns] + rchildren = [col.children for col in mi2._columns] # Flatten lchildren = reduce(operator.add, lchildren) @@ -849,12 +849,8 @@ def test_multiindex_copy_deep(data, copy_on_write, deep): assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) # Assert ._data identity - lptrs = [ - d.base_data.get_ptr(mode="read") for _, d in mi1._data.items() - ] - rptrs = [ - d.base_data.get_ptr(mode="read") for _, d in mi2._data.items() - ] + lptrs = [d.base_data.get_ptr(mode="read") for d in mi1._columns] + rptrs = [d.base_data.get_ptr(mode="read") for d in mi2._columns] assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) cudf.set_option("copy_on_write", original_cow_setting) From dc57c1b1284816d0e5ed7493e6b661590c305511 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 19 Sep 2024 18:00:30 -0700 Subject: [PATCH 31/32] Revert "Refactor mixed_semi_join using cuco::static_set" (#16855) Reverting rapidsai/cudf#16230 as this PR leads to https://github.com/rapidsai/cudf/issues/16852. 
Authors: - Muhammad Haseeb (https://github.com/mhaseeb123) Approvers: - Nghia Truong (https://github.com/ttnghia) - Yunsong Wang (https://github.com/PointKernel) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16855 --- cpp/src/join/join_common_utils.hpp | 6 ++ cpp/src/join/mixed_join_common_utils.cuh | 33 --------- cpp/src/join/mixed_join_kernels_semi.cu | 35 +++++---- cpp/src/join/mixed_join_kernels_semi.cuh | 6 +- cpp/src/join/mixed_join_semi.cu | 90 +++++++++++++++++------- cpp/tests/join/mixed_join_tests.cu | 30 -------- 6 files changed, 91 insertions(+), 109 deletions(-) diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp index 573101cefd9..86402a0e7de 100644 --- a/cpp/src/join/join_common_utils.hpp +++ b/cpp/src/join/join_common_utils.hpp @@ -22,6 +22,7 @@ #include #include +#include #include #include @@ -50,6 +51,11 @@ using mixed_multimap_type = cudf::detail::cuco_allocator, cuco::legacy::double_hashing<1, hash_type, hash_type>>; +using semi_map_type = cuco::legacy::static_map>; + using row_hash_legacy = cudf::row_hasher; diff --git a/cpp/src/join/mixed_join_common_utils.cuh b/cpp/src/join/mixed_join_common_utils.cuh index 89c13285cfe..19701816867 100644 --- a/cpp/src/join/mixed_join_common_utils.cuh +++ b/cpp/src/join/mixed_join_common_utils.cuh @@ -25,7 +25,6 @@ #include #include -#include namespace cudf { namespace detail { @@ -161,38 +160,6 @@ struct pair_expression_equality : public expression_equality { } }; -/** - * @brief Equality comparator that composes two row_equality comparators. - */ -struct double_row_equality_comparator { - row_equality const equality_comparator; - row_equality const conditional_comparator; - - __device__ bool operator()(size_type lhs_row_index, size_type rhs_row_index) const noexcept - { - using experimental::row::lhs_index_type; - using experimental::row::rhs_index_type; - - return equality_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index}) && - conditional_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index}); - } -}; - -// A CUDA Cooperative Group of 4 threads for the hash set. -auto constexpr DEFAULT_MIXED_JOIN_CG_SIZE = 4; - -// The hash set type used by mixed_semi_join with the build_table. -using hash_set_type = cuco::static_set, - cuda::thread_scope_device, - double_row_equality_comparator, - cuco::linear_probing, - cudf::detail::cuco_allocator, - cuco::storage<1>>; - -// The hash_set_ref_type used by mixed_semi_join kernels for probing. 
-using hash_set_ref_type = hash_set_type::ref_type; - } // namespace detail } // namespace cudf diff --git a/cpp/src/join/mixed_join_kernels_semi.cu b/cpp/src/join/mixed_join_kernels_semi.cu index f2c5ff13638..7459ac3e99c 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cu +++ b/cpp/src/join/mixed_join_kernels_semi.cu @@ -38,16 +38,12 @@ CUDF_KERNEL void __launch_bounds__(block_size) table_device_view right_table, table_device_view probe, table_device_view build, + row_hash const hash_probe, row_equality const equality_probe, - hash_set_ref_type set_ref, + cudf::detail::semi_map_type::device_view hash_table_view, cudf::device_span left_table_keep_mask, cudf::ast::detail::expression_device_view device_expression_data) { - auto constexpr cg_size = hash_set_ref_type::cg_size; - - auto const tile = - cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); - // Normally the casting of a shared memory array is used to create multiple // arrays of different types from the shared memory buffer, but here it is // used to circumvent conflicts between arrays of different types between @@ -56,24 +52,24 @@ CUDF_KERNEL void __launch_bounds__(block_size) cudf::ast::detail::IntermediateDataType* intermediate_storage = reinterpret_cast*>(raw_intermediate_storage); auto thread_intermediate_storage = - &intermediate_storage[tile.meta_group_rank() * device_expression_data.num_intermediates]; + &intermediate_storage[threadIdx.x * device_expression_data.num_intermediates]; + + cudf::size_type const left_num_rows = left_table.num_rows(); + cudf::size_type const right_num_rows = right_table.num_rows(); + auto const outer_num_rows = left_num_rows; - cudf::size_type const outer_num_rows = left_table.num_rows(); - auto const outer_row_index = cudf::detail::grid_1d::global_thread_id() / cg_size; + cudf::size_type outer_row_index = threadIdx.x + blockIdx.x * block_size; auto evaluator = cudf::ast::detail::expression_evaluator( left_table, right_table, device_expression_data); if (outer_row_index < outer_num_rows) { - // Make sure to swap_tables here as hash_set will use probe table as the left one. - auto constexpr swap_tables = true; // Figure out the number of elements for this key. 
auto equality = single_expression_equality{ - evaluator, thread_intermediate_storage, swap_tables, equality_probe}; + evaluator, thread_intermediate_storage, false, equality_probe}; - auto const set_ref_equality = set_ref.with_key_eq(equality); - auto const result = set_ref_equality.contains(tile, outer_row_index); - if (tile.thread_rank() == 0) left_table_keep_mask[outer_row_index] = result; + left_table_keep_mask[outer_row_index] = + hash_table_view.contains(outer_row_index, hash_probe, equality); } } @@ -82,8 +78,9 @@ void launch_mixed_join_semi(bool has_nulls, table_device_view right_table, table_device_view probe, table_device_view build, + row_hash const hash_probe, row_equality const equality_probe, - hash_set_ref_type set_ref, + cudf::detail::semi_map_type::device_view hash_table_view, cudf::device_span left_table_keep_mask, cudf::ast::detail::expression_device_view device_expression_data, detail::grid_1d const config, @@ -97,8 +94,9 @@ void launch_mixed_join_semi(bool has_nulls, right_table, probe, build, + hash_probe, equality_probe, - set_ref, + hash_table_view, left_table_keep_mask, device_expression_data); } else { @@ -108,8 +106,9 @@ void launch_mixed_join_semi(bool has_nulls, right_table, probe, build, + hash_probe, equality_probe, - set_ref, + hash_table_view, left_table_keep_mask, device_expression_data); } diff --git a/cpp/src/join/mixed_join_kernels_semi.cuh b/cpp/src/join/mixed_join_kernels_semi.cuh index b08298e64e4..43714ffb36a 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cuh +++ b/cpp/src/join/mixed_join_kernels_semi.cuh @@ -45,8 +45,9 @@ namespace detail { * @param[in] right_table The right table * @param[in] probe The table with which to probe the hash table for matches. * @param[in] build The table with which the hash table was built. + * @param[in] hash_probe The hasher used for the probe table. * @param[in] equality_probe The equality comparator used when probing the hash table. - * @param[in] set_ref The hash table device view built from `build`. + * @param[in] hash_table_view The hash table built from `build`. * @param[out] left_table_keep_mask The result of the join operation with "true" element indicating * the corresponding index from left table is present in output * @param[in] device_expression_data Container of device data required to evaluate the desired @@ -57,8 +58,9 @@ void launch_mixed_join_semi(bool has_nulls, table_device_view right_table, table_device_view probe, table_device_view build, + row_hash const hash_probe, row_equality const equality_probe, - hash_set_ref_type set_ref, + cudf::detail::semi_map_type::device_view hash_table_view, cudf::device_span left_table_keep_mask, cudf::ast::detail::expression_device_view device_expression_data, detail::grid_1d const config, diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu index 719b1d47105..cfb785e242c 100644 --- a/cpp/src/join/mixed_join_semi.cu +++ b/cpp/src/join/mixed_join_semi.cu @@ -46,6 +46,45 @@ namespace cudf { namespace detail { +namespace { +/** + * @brief Device functor to create a pair of hash value and index for a given row. + */ +struct make_pair_function_semi { + __device__ __forceinline__ cudf::detail::pair_type operator()(size_type i) const noexcept + { + // The value is irrelevant since we only ever use the hash map to check for + // membership of a particular row index. + return cuco::make_pair(static_cast(i), 0); + } +}; + +/** + * @brief Equality comparator that composes two row_equality comparators. 
+ */ +class double_row_equality { + public: + double_row_equality(row_equality equality_comparator, row_equality conditional_comparator) + : _equality_comparator{equality_comparator}, _conditional_comparator{conditional_comparator} + { + } + + __device__ bool operator()(size_type lhs_row_index, size_type rhs_row_index) const noexcept + { + using experimental::row::lhs_index_type; + using experimental::row::rhs_index_type; + + return _equality_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index}) && + _conditional_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index}); + } + + private: + row_equality _equality_comparator; + row_equality _conditional_comparator; +}; + +} // namespace + std::unique_ptr> mixed_join_semi( table_view const& left_equality, table_view const& right_equality, @@ -57,7 +96,7 @@ std::unique_ptr> mixed_join_semi( rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_EXPECTS((join_type != join_kind::INNER_JOIN) and (join_type != join_kind::LEFT_JOIN) and + CUDF_EXPECTS((join_type != join_kind::INNER_JOIN) && (join_type != join_kind::LEFT_JOIN) && (join_type != join_kind::FULL_JOIN), "Inner, left, and full joins should use mixed_join."); @@ -98,7 +137,7 @@ std::unique_ptr> mixed_join_semi( // output column and follow the null-supporting expression evaluation code // path. auto const has_nulls = cudf::nullate::DYNAMIC{ - cudf::has_nulls(left_equality) or cudf::has_nulls(right_equality) or + cudf::has_nulls(left_equality) || cudf::has_nulls(right_equality) || binary_predicate.may_evaluate_null(left_conditional, right_conditional, stream)}; auto const parser = ast::detail::expression_parser{ @@ -117,20 +156,27 @@ std::unique_ptr> mixed_join_semi( auto right_conditional_view = table_device_view::create(right_conditional, stream); auto const preprocessed_build = - cudf::experimental::row::equality::preprocessed_table::create(build, stream); + experimental::row::equality::preprocessed_table::create(build, stream); auto const preprocessed_probe = - cudf::experimental::row::equality::preprocessed_table::create(probe, stream); + experimental::row::equality::preprocessed_table::create(probe, stream); auto const row_comparator = - cudf::experimental::row::equality::two_table_comparator{preprocessed_build, preprocessed_probe}; + cudf::experimental::row::equality::two_table_comparator{preprocessed_probe, preprocessed_build}; auto const equality_probe = row_comparator.equal_to(has_nulls, compare_nulls); + semi_map_type hash_table{ + compute_hash_table_size(build.num_rows()), + cuco::empty_key{std::numeric_limits::max()}, + cuco::empty_value{cudf::detail::JoinNoneValue}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + stream.value()}; + // Create hash table containing all keys found in right table // TODO: To add support for nested columns we will need to flatten in many // places. However, this probably isn't worth adding any time soon since we // won't be able to support AST conditions for those types anyway. auto const build_nulls = cudf::nullate::DYNAMIC{cudf::has_nulls(build)}; auto const row_hash_build = cudf::experimental::row::hash::row_hasher{preprocessed_build}; - + auto const hash_build = row_hash_build.device_hasher(build_nulls); // Since we may see multiple rows that are identical in the equality tables // but differ in the conditional tables, the equality comparator used for // insertion must account for both sets of tables. 
An alternative solution @@ -145,28 +191,20 @@ std::unique_ptr> mixed_join_semi( auto const equality_build_equality = row_comparator_build.equal_to(build_nulls, compare_nulls); auto const preprocessed_build_condtional = - cudf::experimental::row::equality::preprocessed_table::create(right_conditional, stream); + experimental::row::equality::preprocessed_table::create(right_conditional, stream); auto const row_comparator_conditional_build = cudf::experimental::row::equality::two_table_comparator{preprocessed_build_condtional, preprocessed_build_condtional}; auto const equality_build_conditional = row_comparator_conditional_build.equal_to(build_nulls, compare_nulls); + double_row_equality equality_build{equality_build_equality, equality_build_conditional}; + make_pair_function_semi pair_func_build{}; - hash_set_type row_set{ - {compute_hash_table_size(build.num_rows())}, - cuco::empty_key{JoinNoneValue}, - {equality_build_equality, equality_build_conditional}, - {row_hash_build.device_hasher(build_nulls)}, - {}, - {}, - cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, - {stream.value()}}; - - auto iter = thrust::make_counting_iterator(0); + auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func_build); // skip rows that are null here. if ((compare_nulls == null_equality::EQUAL) or (not nullable(build))) { - row_set.insert(iter, iter + right_num_rows, stream.value()); + hash_table.insert(iter, iter + right_num_rows, hash_build, equality_build, stream.value()); } else { thrust::counting_iterator stencil(0); auto const [row_bitmask, _] = @@ -174,19 +212,18 @@ std::unique_ptr> mixed_join_semi( row_is_valid pred{static_cast(row_bitmask.data())}; // insert valid rows - row_set.insert_if(iter, iter + right_num_rows, stencil, pred, stream.value()); + hash_table.insert_if( + iter, iter + right_num_rows, stencil, pred, hash_build, equality_build, stream.value()); } + auto hash_table_view = hash_table.get_device_view(); + detail::grid_1d const config(outer_num_rows, DEFAULT_JOIN_BLOCK_SIZE); - auto const shmem_size_per_block = - parser.shmem_per_thread * - cuco::detail::int_div_ceil(config.num_threads_per_block, hash_set_type::cg_size); + auto const shmem_size_per_block = parser.shmem_per_thread * config.num_threads_per_block; auto const row_hash = cudf::experimental::row::hash::row_hasher{preprocessed_probe}; auto const hash_probe = row_hash.device_hasher(has_nulls); - hash_set_ref_type const row_set_ref = row_set.ref(cuco::contains).with_hash_function(hash_probe); - // Vector used to indicate indices from left/probe table which are present in output auto left_table_keep_mask = rmm::device_uvector(probe.num_rows(), stream); @@ -195,8 +232,9 @@ std::unique_ptr> mixed_join_semi( *right_conditional_view, *probe_view, *build_view, + hash_probe, equality_probe, - row_set_ref, + hash_table_view, cudf::device_span(left_table_keep_mask), parser.device_expression_data, config, diff --git a/cpp/tests/join/mixed_join_tests.cu b/cpp/tests/join/mixed_join_tests.cu index 08a0136700d..6c147c8a128 100644 --- a/cpp/tests/join/mixed_join_tests.cu +++ b/cpp/tests/join/mixed_join_tests.cu @@ -778,21 +778,6 @@ TYPED_TEST(MixedLeftSemiJoinTest, BasicEquality) {1}); } -TYPED_TEST(MixedLeftSemiJoinTest, MixedLeftSemiJoinGatherMap) -{ - auto const col_ref_left_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::LEFT); - auto const col_ref_right_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT); - auto left_one_greater_right_one = - 
cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_left_1, col_ref_right_1); - - this->test({{2, 3, 9, 0, 1, 7, 4, 6, 5, 8}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 0}}, - {{6, 5, 9, 8, 10, 32}, {0, 1, 2, 3, 4, 5}, {7, 8, 9, 0, 1, 2}}, - {0}, - {1}, - left_one_greater_right_one, - {2, 7, 8}); -} - TYPED_TEST(MixedLeftSemiJoinTest, BasicEqualityDuplicates) { this->test({{0, 1, 2, 1}, {3, 4, 5, 6}, {10, 20, 30, 40}}, @@ -915,18 +900,3 @@ TYPED_TEST(MixedLeftAntiJoinTest, AsymmetricLeftLargerEquality) left_zero_eq_right_zero, {0, 1, 3}); } - -TYPED_TEST(MixedLeftAntiJoinTest, MixedLeftAntiJoinGatherMap) -{ - auto const col_ref_left_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::LEFT); - auto const col_ref_right_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT); - auto left_one_greater_right_one = - cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_left_1, col_ref_right_1); - - this->test({{2, 3, 9, 0, 1, 7, 4, 6, 5, 8}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 0}}, - {{6, 5, 9, 8, 10, 32}, {0, 1, 2, 3, 4, 5}, {7, 8, 9, 0, 1, 2}}, - {0}, - {1}, - left_one_greater_right_one, - {0, 1, 3, 4, 5, 6, 9}); -} From 267692490ba245404bf09c526bd61375ba72493b Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 19 Sep 2024 20:52:08 -0500 Subject: [PATCH 32/32] Switch to using native `traceback` (#16851) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR switches the pytest traceback style to `native` instead of the prettified pytest traceback, which takes longer to render and also prints the source code of the file where the error occurs; that extra output is not needed given the time savings. With pytest traceback: [screenshots of the prettified pytest traceback output omitted] With `native` traceback: [screenshot of the native traceback output omitted] Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) - Richard (Rick) Zamora (https://github.com/rjzamora) URL: https://github.com/rapidsai/cudf/pull/16851 --- ci/test_wheel_cudf.sh | 1 + ci/test_wheel_dask_cudf.sh | 2 ++ python/cudf/benchmarks/pytest.ini | 1 + python/cudf/cudf/tests/pytest.ini | 1 + .../third_party_integration_tests/tests/pytest.ini | 3 +++ python/cudf_kafka/cudf_kafka/tests/pytest.ini | 4 ++++ python/cudf_polars/tests/pytest.ini | 4 ++++ python/custreamz/custreamz/tests/pytest.ini | 4 ++++ python/dask_cudf/dask_cudf/tests/pytest.ini | 4 ++++ python/pylibcudf/pylibcudf/tests/pytest.ini | 1 + 10 files changed, 25 insertions(+) diff --git a/ci/test_wheel_cudf.sh b/ci/test_wheel_cudf.sh index 28ded2f8e0f..a701bfe15e0 100755 --- a/ci/test_wheel_cudf.sh +++ b/ci/test_wheel_cudf.sh @@ -39,6 +39,7 @@ rapids-logger "pytest pylibcudf" pushd python/pylibcudf/pylibcudf/tests python -m pytest \ --cache-clear \ + --numprocesses=8 \ --dist=worksteal \ . popd diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh index 0d39807d56c..361a42ccda9 100755 --- a/ci/test_wheel_dask_cudf.sh +++ b/ci/test_wheel_dask_cudf.sh @@ -41,6 +41,7 @@ pushd python/dask_cudf/dask_cudf DASK_DATAFRAME__QUERY_PLANNING=True python -m pytest \ --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf.xml" \ --numprocesses=8 \ + --dist=worksteal \ . 
popd @@ -50,5 +51,6 @@ pushd python/dask_cudf/dask_cudf DASK_DATAFRAME__QUERY_PLANNING=False python -m pytest \ --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-legacy.xml" \ --numprocesses=8 \ + --dist=worksteal \ . popd diff --git a/python/cudf/benchmarks/pytest.ini b/python/cudf/benchmarks/pytest.ini index db24415ef9e..187d91996b2 100644 --- a/python/cudf/benchmarks/pytest.ini +++ b/python/cudf/benchmarks/pytest.ini @@ -6,3 +6,4 @@ python_classes = Bench python_functions = bench_* markers = pandas_incompatible: mark a benchmark that cannot be run with pandas +addopts = --tb=native diff --git a/python/cudf/cudf/tests/pytest.ini b/python/cudf/cudf/tests/pytest.ini index 2136bca0e28..8a594794fac 100644 --- a/python/cudf/cudf/tests/pytest.ini +++ b/python/cudf/cudf/tests/pytest.ini @@ -14,3 +14,4 @@ filterwarnings = ignore:Passing a BlockManager to DataFrame is deprecated:DeprecationWarning # PerformanceWarning from cupy warming up the JIT cache ignore:Jitify is performing a one-time only warm-up to populate the persistent cache:cupy._util.PerformanceWarning +addopts = --tb=native diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/pytest.ini b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/pytest.ini index 817d98e6ba2..98459035298 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/pytest.ini +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/pytest.ini @@ -1,3 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + [pytest] xfail_strict=true markers= @@ -5,3 +7,4 @@ markers= xfail_gold: this test is expected to fail in the gold pass xfail_cudf_pandas: this test is expected to fail in the cudf_pandas pass xfail_compare: this test is expected to fail in the comparison pass +addopts = --tb=native diff --git a/python/cudf_kafka/cudf_kafka/tests/pytest.ini b/python/cudf_kafka/cudf_kafka/tests/pytest.ini new file mode 100644 index 00000000000..7b0a9f29fb1 --- /dev/null +++ b/python/cudf_kafka/cudf_kafka/tests/pytest.ini @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +[pytest] +addopts = --tb=native diff --git a/python/cudf_polars/tests/pytest.ini b/python/cudf_polars/tests/pytest.ini new file mode 100644 index 00000000000..7b0a9f29fb1 --- /dev/null +++ b/python/cudf_polars/tests/pytest.ini @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +[pytest] +addopts = --tb=native diff --git a/python/custreamz/custreamz/tests/pytest.ini b/python/custreamz/custreamz/tests/pytest.ini new file mode 100644 index 00000000000..7b0a9f29fb1 --- /dev/null +++ b/python/custreamz/custreamz/tests/pytest.ini @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +[pytest] +addopts = --tb=native diff --git a/python/dask_cudf/dask_cudf/tests/pytest.ini b/python/dask_cudf/dask_cudf/tests/pytest.ini new file mode 100644 index 00000000000..7b0a9f29fb1 --- /dev/null +++ b/python/dask_cudf/dask_cudf/tests/pytest.ini @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +[pytest] +addopts = --tb=native diff --git a/python/pylibcudf/pylibcudf/tests/pytest.ini b/python/pylibcudf/pylibcudf/tests/pytest.ini index 1761c0f011c..f572f85ca49 100644 --- a/python/pylibcudf/pylibcudf/tests/pytest.ini +++ b/python/pylibcudf/pylibcudf/tests/pytest.ini @@ -6,3 +6,4 @@ filterwarnings = error ignore:::.*xdist.* ignore:::.*pytest.* +addopts = --tb=native
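
A usage note on the pytest flags exercised by this last patch: `--tb=native` switches pytest to Python's standard traceback formatting, while `--numprocesses` and `--dist=worksteal` come from the `pytest-xdist` plugin and enable work-stealing parallel test scheduling. The snippet below is a minimal sketch of an equivalent one-off local invocation; the test path is illustrative and not part of the patch above:

```
# Run a test tree with native tracebacks and 8 work-stealing workers.
# Assumes pytest plus pytest-xdist >= 3.2 (which introduced --dist=worksteal).
python -m pytest --tb=native --numprocesses=8 --dist=worksteal python/cudf/cudf/tests
```

Putting `addopts = --tb=native` in each project's `pytest.ini`, as the patch does, makes the setting persistent so individual CI scripts and local runs do not have to pass the flag explicitly.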