From 7d5e527ad2812a9f1d3334dd1d8d4686c494df18 Mon Sep 17 00:00:00 2001
From: Bradley Dice
Date: Thu, 21 Sep 2023 11:27:41 -0500
Subject: [PATCH 1/2] Use stream pool for gather.

---
 cpp/include/cudf/detail/gather.cuh            | 22 +++++++++++++++----
 .../cudf/detail/utilities/stream_pool.hpp     |  2 +-
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh
index 955f9914632..5b4d1243c49 100644
--- a/cpp/include/cudf/detail/gather.cuh
+++ b/cpp/include/cudf/detail/gather.cuh
@@ -20,6 +20,7 @@
 #include
 #include
 #include
+#include <cudf/detail/utilities/stream_pool.hpp>
 #include
 #include
 #include
@@ -658,10 +659,19 @@ std::unique_ptr<table> gather(table_view const& source_table,
 {
   std::vector<std::unique_ptr<column>> destination_columns;

-  // TODO: Could be beneficial to use streams internally here
+  // The data gather for n columns will be executed over n streams. If there is
+  // only a single column, the fork/join overhead should be avoided.
+  auto const num_columns = source_table.num_columns();
+  auto streams = std::vector<rmm::cuda_stream_view>{};
+  if (num_columns > 1) {
+    streams = cudf::detail::fork_streams(stream, num_columns);
+  } else {
+    streams.push_back(stream);
+  }

-  for (auto const& source_column : source_table) {
-    // The data gather for n columns will be put on the first n streams
+  for (auto i = 0; i < num_columns; ++i) {
+    CUDF_FUNC_RANGE();
+    auto const& source_column = source_table.column(i);
     destination_columns.push_back(
       cudf::type_dispatcher(source_column.type(),
                             column_gatherer{},
                             source_column,
@@ -669,7 +679,7 @@ std::unique_ptr<table> gather(table_view const& source_table,
                             gather_map_begin,
                             gather_map_end,
                             bounds_policy == out_of_bounds_policy::NULLIFY,
-                            stream,
+                            streams[i],
                             mr));
   }

@@ -683,6 +693,10 @@ std::unique_ptr<table> gather(table_view const& source_table,
     gather_bitmask(source_table, gather_map_begin, destination_columns, op, stream, mr);
   }

+  // Join streams as late as possible so that the gather_bitmask can run on its
+  // own stream while other streams are gathering. Skip joining if only one
+  // column, since it used the passed in stream rather than forking.
+  if (num_columns > 1) { cudf::detail::join_streams(streams, stream); }
   return std::make_unique<table>(std::move(destination_columns));
 }

diff --git a/cpp/include/cudf/detail/utilities/stream_pool.hpp b/cpp/include/cudf/detail/utilities/stream_pool.hpp
index 95384a9d73e..70014d18fb8 100644
--- a/cpp/include/cudf/detail/utilities/stream_pool.hpp
+++ b/cpp/include/cudf/detail/utilities/stream_pool.hpp
@@ -40,7 +40,7 @@ namespace cudf::detail {
  * auto const num_streams = 2;
  * // do work on stream
  * // allocate streams and wait for an event on stream before executing on any of streams
- * auto streams = cudf::detail::fork_stream(stream, num_streams);
+ * auto streams = cudf::detail::fork_streams(stream, num_streams);
 * // do work on streams[0] and streams[1]
 * // wait for event on streams before continuing to do work on stream
 * cudf::detail::join_streams(streams, stream);
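Both patches follow the fork/join idiom documented in the stream_pool.hpp comment fixed above. For reference, here is a minimal sketch of that idiom on its own, assuming only the cudf::detail::fork_streams / cudf::detail::join_streams calls shown in that doc comment; process_columns and do_column_work are hypothetical stand-ins for the per-column gather/scatter work, not cuDF APIs:

    #include <cudf/detail/utilities/stream_pool.hpp>
    #include <rmm/cuda_stream_view.hpp>

    #include <cstddef>
    #include <vector>

    // Hypothetical placeholder for per-column work (e.g. launching a gather kernel).
    void do_column_work(std::size_t /*column_index*/, rmm::cuda_stream_view /*stream*/) {}

    // Sketch of the fork/join idiom adopted by both patches (not a cuDF API).
    void process_columns(std::size_t num_columns, rmm::cuda_stream_view stream)
    {
      // Fork only when there is real parallelism; a single column reuses the
      // caller's stream and skips the fork/join overhead.
      auto streams = std::vector<rmm::cuda_stream_view>{};
      if (num_columns > 1) {
        // Worker streams wait on an event recorded on `stream` before running.
        streams = cudf::detail::fork_streams(stream, num_columns);
      } else {
        streams.push_back(stream);
      }

      // Issue each column's work on its own stream.
      for (std::size_t i = 0; i < num_columns; ++i) {
        do_column_work(i, streams[i]);
      }

      // Work that may overlap with the per-column streams can be issued on `stream` here.

      // Make `stream` wait for all worker streams before the caller continues.
      if (num_columns > 1) { cudf::detail::join_streams(streams, stream); }
    }

Skipping the fork for a single column mirrors the comment in the patch: the extra event synchronization only pays off when at least two columns can make progress concurrently.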
From 988f4d0f69e067a9cc2ffe8d13acf16f1da4a850 Mon Sep 17 00:00:00 2001
From: Bradley Dice
Date: Thu, 21 Sep 2023 14:57:40 -0500
Subject: [PATCH 2/2] Use stream pool for scatter.

---
 cpp/include/cudf/detail/gather.cuh  | 42 +++++++++++++------------
 cpp/include/cudf/detail/scatter.cuh | 49 +++++++++++++++++++----------
 2 files changed, 55 insertions(+), 36 deletions(-)

diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh
index 5b4d1243c49..cebcd7fc3ac 100644
--- a/cpp/include/cudf/detail/gather.cuh
+++ b/cpp/include/cudf/detail/gather.cuh
@@ -657,31 +657,32 @@ std::unique_ptr<table> gather(table_view const& source_table,
                               rmm::cuda_stream_view stream,
                               rmm::mr::device_memory_resource* mr)
 {
-  std::vector<std::unique_ptr<column>> destination_columns;
+  auto const num_columns = source_table.num_columns();
+  auto result = std::vector<std::unique_ptr<column>>(num_columns);

   // The data gather for n columns will be executed over n streams. If there is
   // only a single column, the fork/join overhead should be avoided.
-  auto const num_columns = source_table.num_columns();
-  auto streams = std::vector<rmm::cuda_stream_view>{};
+  auto streams = std::vector<rmm::cuda_stream_view>{};
   if (num_columns > 1) {
     streams = cudf::detail::fork_streams(stream, num_columns);
   } else {
     streams.push_back(stream);
   }

-  for (auto i = 0; i < num_columns; ++i) {
-    CUDF_FUNC_RANGE();
+  auto it = thrust::make_counting_iterator(0);
+
+  std::transform(it, it + num_columns, result.begin(), [&](size_type i) {
     auto const& source_column = source_table.column(i);
-    destination_columns.push_back(
-      cudf::type_dispatcher(source_column.type(),
-                            column_gatherer{},
-                            source_column,
-                            gather_map_begin,
-                            gather_map_end,
-                            bounds_policy == out_of_bounds_policy::NULLIFY,
-                            streams[i],
-                            mr));
-  }
+    return cudf::type_dispatcher(
+      source_column.type(),
+      column_gatherer{},
+      source_column,
+      gather_map_begin,
+      gather_map_end,
+      bounds_policy == out_of_bounds_policy::NULLIFY,
+      streams[i],
+      mr);
+  });

   auto const nullable = bounds_policy == out_of_bounds_policy::NULLIFY ||
                         std::any_of(source_table.begin(), source_table.end(), [](auto const& col) {
@@ -690,14 +691,15 @@ std::unique_ptr<table> gather(table_view const& source_table,
   if (nullable) {
     auto const op = bounds_policy == out_of_bounds_policy::NULLIFY ? gather_bitmask_op::NULLIFY
                                                                    : gather_bitmask_op::DONT_CHECK;
-    gather_bitmask(source_table, gather_map_begin, destination_columns, op, stream, mr);
+    gather_bitmask(source_table, gather_map_begin, result, op, stream, mr);
   }

-  // Join streams as late as possible so that the gather_bitmask can run on its
-  // own stream while other streams are gathering. Skip joining if only one
-  // column, since it used the passed in stream rather than forking.
+  // Join streams as late as possible so that null mask computations can run on
+  // the passed in stream while other streams are gathering. Skip joining if
+  // only one column, since it used the passed in stream rather than forking.
   if (num_columns > 1) { cudf::detail::join_streams(streams, stream); }
-  return std::make_unique<table>(std::move(destination_columns));
+
+  return std::make_unique<table>(std::move(result));
 }

 }  // namespace detail
diff --git a/cpp/include/cudf/detail/scatter.cuh b/cpp/include/cudf/detail/scatter.cuh
index dbf7bfa9527..844988036ca 100644
--- a/cpp/include/cudf/detail/scatter.cuh
+++ b/cpp/include/cudf/detail/scatter.cuh
@@ -21,6 +21,7 @@
 #include
 #include
 #include
+#include <cudf/detail/utilities/stream_pool.hpp>
 #include
 #include
 #include
@@ -405,22 +406,32 @@ std::unique_ptr<table> scatter(table_view const& source,
     thrust::make_transform_iterator(scatter_map_begin, index_converter{target.num_rows()});
   auto updated_scatter_map_end =
     thrust::make_transform_iterator(scatter_map_end, index_converter{target.num_rows()});
-  auto result = std::vector<std::unique_ptr<column>>(target.num_columns());
-
-  std::transform(source.begin(),
-                 source.end(),
-                 target.begin(),
-                 result.begin(),
-                 [=](auto const& source_col, auto const& target_col) {
-                   return type_dispatcher(source_col.type(),
-                                          column_scatterer{},
-                                          source_col,
-                                          updated_scatter_map_begin,
-                                          updated_scatter_map_end,
-                                          target_col,
-                                          stream,
-                                          mr);
-                 });
+
+  auto const num_columns = target.num_columns();
+  auto result = std::vector<std::unique_ptr<column>>(num_columns);
+
+  // The data scatter for n columns will be executed over n streams. If there is
+  // only a single column, the fork/join overhead should be avoided.
+  auto streams = std::vector<rmm::cuda_stream_view>{};
+  if (num_columns > 1) {
+    streams = cudf::detail::fork_streams(stream, num_columns);
+  } else {
+    streams.push_back(stream);
+  }
+
+  auto it = thrust::make_counting_iterator(0);
+
+  std::transform(it, it + num_columns, result.begin(), [&](size_type i) {
+    auto const& source_col = source.column(i);
+    return type_dispatcher(source_col.type(),
+                           column_scatterer{},
+                           source_col,
+                           updated_scatter_map_begin,
+                           updated_scatter_map_end,
+                           target.column(i),
+                           streams[i],
+                           mr);
+  });

   // We still need to call `gather_bitmask` even when the source columns are not nullable,
   // as if the target has null_mask, that null_mask needs to be updated after scattering.
@@ -451,6 +462,12 @@ std::unique_ptr<table> scatter(table_view const& source,
       }
     });
   }
+
+  // Join streams as late as possible so that null mask computations can run on
+  // the passed in stream while other streams are scattering. Skip joining if
+  // only one column, since it used the passed in stream rather than forking.
+  if (num_columns > 1) { cudf::detail::join_streams(streams, stream); }
+
   return std::make_unique<table>(std::move(result));
 }

 }  // namespace detail
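Both patches join the forked streams as late as possible so that the null mask work issued on the passed in stream (gather_bitmask, and the scatter null mask update) can overlap with the per-column work on the worker streams. That overlap comes from the event-based semantics of fork/join. The sketch below illustrates those semantics with raw CUDA streams and events; it is an illustration only and assumes nothing about the actual stream_pool implementation, which draws streams from a pool rather than creating them and includes error checking omitted here:

    #include <cuda_runtime.h>

    #include <vector>

    // Fork: every worker stream waits for the work already enqueued on `main_stream`.
    std::vector<cudaStream_t> fork(cudaStream_t main_stream, int n)
    {
      cudaEvent_t ev;
      cudaEventCreateWithFlags(&ev, cudaEventDisableTiming);
      cudaEventRecord(ev, main_stream);

      std::vector<cudaStream_t> workers(n);
      for (auto& s : workers) {
        cudaStreamCreate(&s);
        cudaStreamWaitEvent(s, ev, 0);  // the dependency is captured at enqueue time
      }
      cudaEventDestroy(ev);
      return workers;
    }

    // Join: `main_stream` waits for everything enqueued on the worker streams so far.
    // Anything enqueued on `main_stream` between fork() and join() can run
    // concurrently with the workers, which is why the patches join only after the
    // bitmask computations have been issued.
    void join(std::vector<cudaStream_t> const& workers, cudaStream_t main_stream)
    {
      for (auto s : workers) {
        cudaEvent_t ev;
        cudaEventCreateWithFlags(&ev, cudaEventDisableTiming);
        cudaEventRecord(ev, s);
        cudaStreamWaitEvent(main_stream, ev, 0);
        cudaEventDestroy(ev);
      }
      // A real implementation would return the worker streams to a pool (or
      // destroy them) once the join has been enqueued.
    }

Because the join only enqueues stream-ordered waits rather than blocking the host, deferring cudf::detail::join_streams lets the null mask work overlap with the per-column gathers and scatters, as the comments in both patches describe.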