From 8fb86fba90e23f657751d372c6e08055754f907a Mon Sep 17 00:00:00 2001
From: Naim <naim@uib.no>
Date: Sat, 9 Mar 2024 02:14:49 +0100
Subject: [PATCH 1/6] Make vertex and edge shuffling function public

---
 cpp/include/cugraph/graph_partition_utils.cuh | 71 +++++++++++++++++++
 cpp/src/community/detail/common_methods.cuh   |  3 +-
 cpp/src/community/detail/refine_impl.cuh      |  6 +-
 cpp/src/detail/collect_local_vertex_values.cu |  3 +-
 cpp/src/detail/graph_partition_utils.cuh      | 45 ------------
 cpp/src/detail/shuffle_vertex_pairs.cu        |  3 +-
 cpp/src/detail/shuffle_vertices.cu            | 12 ++--
 ...m_reduce_dst_key_aggregated_outgoing_e.cuh |  5 +-
 .../transform_reduce_e_by_src_dst_key.cuh     |  3 +-
 .../create_graph_from_edgelist_impl.cuh       |  5 +-
 cpp/src/structure/relabel_impl.cuh            |  3 +-
 cpp/src/structure/renumber_edgelist_impl.cuh  | 13 ++--
 cpp/src/structure/renumber_utils_impl.cuh     |  3 +-
 cpp/tests/utilities/csv_file_utilities.cu     |  3 +-
 .../utilities/matrix_market_file_utilities.cu |  5 +-
 15 files changed, 110 insertions(+), 73 deletions(-)
 create mode 100644 cpp/include/cugraph/graph_partition_utils.cuh
diff --git a/cpp/include/cugraph/graph_partition_utils.cuh b/cpp/include/cugraph/graph_partition_utils.cuh
new file mode 100644
index 0000000000..792858f92f
--- /dev/null
+++ b/cpp/include/cugraph/graph_partition_utils.cuh
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cugraph/partition_manager.hpp>
+
+#include <raft/core/device_span.hpp>
+
+#include <cuco/hash_functions.cuh>
+
+namespace cugraph {
+
+template <typename vertex_t>
+struct compute_gpu_id_from_ext_vertex_t {
+  int comm_size{0};
+  int major_comm_size{0};
+  int minor_comm_size{0};
+
+  __host__ __device__ int operator()(vertex_t v) const
+  {
+    cuco::detail::MurmurHash3_32<vertex_t> hash_func{};
+    auto vertex_partition_id = static_cast<int>(hash_func(v) % comm_size);
+    return partition_manager::compute_global_comm_rank_from_vertex_partition_id(
+      major_comm_size, minor_comm_size, vertex_partition_id);
+  }
+};
+
+template <typename vertex_t>
+struct compute_gpu_id_from_ext_edge_endpoints_t {
+  int comm_size{0};
+  int major_comm_size{0};
+  int minor_comm_size{0};
+
+  __host__ __device__ int operator()(vertex_t major, vertex_t minor) const
+  {
+    cuco::detail::MurmurHash3_32<vertex_t> hash_func{};
+    auto major_vertex_partition_id = static_cast<int>(hash_func(major) % comm_size);
+    auto minor_vertex_partition_id = static_cast<int>(hash_func(minor) % comm_size);
+    auto major_comm_rank           = major_vertex_partition_id % major_comm_size;
+    auto minor_comm_rank           = minor_vertex_partition_id / major_comm_size;
+    return partition_manager::compute_global_comm_rank_from_graph_subcomm_ranks(
+      major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank);
+  }
+
+  __host__ __device__ int operator()(
+    thrust::tuple<vertex_t, vertex_t> pair /* major, minor */) const
+  {
+    cuco::detail::MurmurHash3_32<vertex_t> hash_func{};
+    auto major_vertex_partition_id = static_cast<int>(hash_func(thrust::get<0>(pair)) % comm_size);
+    auto minor_vertex_partition_id = static_cast<int>(hash_func(thrust::get<1>(pair)) % comm_size);
+    auto major_comm_rank           = major_vertex_partition_id % major_comm_size;
+    auto minor_comm_rank           = minor_vertex_partition_id / major_comm_size;
+    return partition_manager::compute_global_comm_rank_from_graph_subcomm_ranks(
+      major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank);
+  }
+};
+
+}  // namespace cugraph
diff --git a/cpp/src/community/detail/common_methods.cuh b/cpp/src/community/detail/common_methods.cuh
index fe0a415db3..34c5ffcb75 100644
--- a/cpp/src/community/detail/common_methods.cuh
+++ b/cpp/src/community/detail/common_methods.cuh
@@ -28,6 +28,7 @@
 
 #include <cugraph/detail/utility_wrappers.hpp>
 #include <cugraph/graph_functions.hpp>
+#include <cugraph/graph_partition_utils.cuh>
 
 #include <thrust/binary_search.h>
 #include <thrust/execution_policy.h>
@@ -278,7 +279,7 @@ rmm::device_uvector<vertex_t> update_clustering_by_delta_modularity(
     auto& minor_comm           = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
     auto const minor_comm_size = minor_comm.get_size();
 
-    cugraph::detail::compute_gpu_id_from_ext_vertex_t<vertex_t> vertex_to_gpu_id_op{
+    cugraph::compute_gpu_id_from_ext_vertex_t<vertex_t> vertex_to_gpu_id_op{
       comm_size, major_comm_size, minor_comm_size};
 
     kv_store_t<vertex_t, weight_t, false> cluster_key_weight_map(
diff --git a/cpp/src/community/detail/refine_impl.cuh b/cpp/src/community/detail/refine_impl.cuh
index b767ce7d8b..862c6cde12 100644
--- a/cpp/src/community/detail/refine_impl.cuh
+++ b/cpp/src/community/detail/refine_impl.cuh
@@ -178,7 +178,7 @@ refine_clustering(
     auto& minor_comm           = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
     auto const minor_comm_size = minor_comm.get_size();
 
-    cugraph::detail::compute_gpu_id_from_ext_vertex_t<vertex_t> vertex_to_gpu_id_op{
+    cugraph::compute_gpu_id_from_ext_vertex_t<vertex_t> vertex_to_gpu_id_op{
       comm_size, major_comm_size, minor_comm_size};
 
     vertex_louvain_cluster_weights =
@@ -460,7 +460,7 @@ refine_clustering(
         major_comm_size,
         minor_comm_size};
 
-      // cugraph::detail::compute_gpu_id_from_ext_vertex_t<vertex_t> vertex_to_gpu_id_op{
+      // cugraph::compute_gpu_id_from_ext_vertex_t<vertex_t> vertex_to_gpu_id_op{
       //   comm_size, major_comm_size, minor_comm_size};
 
       louvain_of_leiden_keys_used_in_edge_reduction =
@@ -850,7 +850,7 @@ refine_clustering(
       major_comm_size,
       minor_comm_size};
 
-    // cugraph::detail::compute_gpu_id_from_ext_vertex_t<vertex_t> vertex_to_gpu_id_op{
+    // cugraph::compute_gpu_id_from_ext_vertex_t<vertex_t> vertex_to_gpu_id_op{
     //   comm_size, major_comm_size, minor_comm_size};
 
     lovain_of_leiden_cluster_keys =
diff --git a/cpp/src/detail/collect_local_vertex_values.cu b/cpp/src/detail/collect_local_vertex_values.cu
index 2726e33b92..511346552a 100644
--- a/cpp/src/detail/collect_local_vertex_values.cu
+++ b/cpp/src/detail/collect_local_vertex_values.cu
@@ -17,6 +17,7 @@
 #include "detail/graph_partition_utils.cuh"
 
 #include <cugraph/graph_functions.hpp>
+#include <cugraph/graph_partition_utils.cuh>
 #include <cugraph/utilities/shuffle_comm.cuh>
 
 #include <cuda/functional>
@@ -50,7 +51,7 @@ rmm::device_uvector<value_t> collect_local_vertex_values_from_ext_vertex_value_p
       d_vertices.begin(),
       d_vertices.end(),
       d_values.begin(),
-      cugraph::detail::compute_gpu_id_from_ext_vertex_t<vertex_t>{
+      cugraph::compute_gpu_id_from_ext_vertex_t<vertex_t>{
         comm_size, major_comm_size, minor_comm_size},
       handle.get_stream());
   }
diff --git a/cpp/src/detail/graph_partition_utils.cuh b/cpp/src/detail/graph_partition_utils.cuh
index 957436459c..423e9383e9 100644
--- a/cpp/src/detail/graph_partition_utils.cuh
+++ b/cpp/src/detail/graph_partition_utils.cuh
@@ -35,21 +35,6 @@
 namespace cugraph {
 namespace detail {
 
-template <typename vertex_t>
-struct compute_gpu_id_from_ext_vertex_t {
-  int comm_size{0};
-  int major_comm_size{0};
-  int minor_comm_size{0};
-
-  __host__ __device__ int operator()(vertex_t v) const
-  {
-    cuco::detail::MurmurHash3_32<vertex_t> hash_func{};
-    auto vertex_partition_id = static_cast<int>(hash_func(v) % comm_size);
-    return partition_manager::compute_global_comm_rank_from_vertex_partition_id(
-      major_comm_size, minor_comm_size, vertex_partition_id);
-  }
-};
-
 template <typename vertex_t>
 struct compute_gpu_id_from_int_vertex_t {
   raft::device_span<vertex_t const> vertex_partition_range_lasts{};
@@ -91,36 +76,6 @@ struct compute_vertex_partition_id_from_int_vertex_t {
   }
 };
 
-template <typename vertex_t>
-struct compute_gpu_id_from_ext_edge_endpoints_t {
-  int comm_size{0};
-  int major_comm_size{0};
-  int minor_comm_size{0};
-
-  __host__ __device__ int operator()(vertex_t major, vertex_t minor) const
-  {
-    cuco::detail::MurmurHash3_32<vertex_t> hash_func{};
-    auto major_vertex_partition_id = static_cast<int>(hash_func(major) % comm_size);
-    auto minor_vertex_partition_id = static_cast<int>(hash_func(minor) % comm_size);
-    auto major_comm_rank           = major_vertex_partition_id % major_comm_size;
-    auto minor_comm_rank           = minor_vertex_partition_id / major_comm_size;
-    return partition_manager::compute_global_comm_rank_from_graph_subcomm_ranks(
-      major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank);
-  }
-
-  __host__ __device__ int operator()(
-    thrust::tuple<vertex_t, vertex_t> pair /* major, minor */) const
-  {
-    cuco::detail::MurmurHash3_32<vertex_t> hash_func{};
-    auto major_vertex_partition_id = static_cast<int>(hash_func(thrust::get<0>(pair)) % comm_size);
-    auto minor_vertex_partition_id = static_cast<int>(hash_func(thrust::get<1>(pair)) % comm_size);
-    auto major_comm_rank           = major_vertex_partition_id % major_comm_size;
-    auto minor_comm_rank           = minor_vertex_partition_id / major_comm_size;
-    return partition_manager::compute_global_comm_rank_from_graph_subcomm_ranks(
-      major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank);
-  }
-};
-
 template <typename vertex_t>
 struct compute_gpu_id_from_int_edge_endpoints_t {
   raft::device_span<vertex_t const> vertex_partition_range_lasts{};
diff --git a/cpp/src/detail/shuffle_vertex_pairs.cu b/cpp/src/detail/shuffle_vertex_pairs.cu
index 33a7834f5f..74036f890b 100644
--- a/cpp/src/detail/shuffle_vertex_pairs.cu
+++ b/cpp/src/detail/shuffle_vertex_pairs.cu
@@ -17,6 +17,7 @@
 
 #include <cugraph/detail/shuffle_wrappers.hpp>
 #include <cugraph/graph_functions.hpp>
+#include <cugraph/graph_partition_utils.cuh>
 #include <cugraph/partition_manager.hpp>
 #include <cugraph/utilities/host_scalar_comm.hpp>
 #include <cugraph/utilities/shuffle_comm.cuh>
@@ -308,7 +309,7 @@ shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
     std::move(weights),
     std::move(edge_ids),
     std::move(edge_types),
-    cugraph::detail::compute_gpu_id_from_ext_edge_endpoints_t<vertex_t>{
+    cugraph::compute_gpu_id_from_ext_edge_endpoints_t<vertex_t>{
       comm_size, major_comm_size, minor_comm_size});
 }
 
diff --git a/cpp/src/detail/shuffle_vertices.cu b/cpp/src/detail/shuffle_vertices.cu
index be6875f107..5bc056eb52 100644
--- a/cpp/src/detail/shuffle_vertices.cu
+++ b/cpp/src/detail/shuffle_vertices.cu
@@ -16,6 +16,7 @@
 #include "detail/graph_partition_utils.cuh"
 
 #include <cugraph/detail/shuffle_wrappers.hpp>
+#include <cugraph/graph_partition_utils.cuh>
 #include <cugraph/utilities/shuffle_comm.cuh>
 
 #include <thrust/tuple.h>
@@ -73,11 +74,10 @@ rmm::device_uvector<vertex_t> shuffle_ext_vertices_to_local_gpu_by_vertex_partit
   auto& minor_comm           = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
   auto const minor_comm_size = minor_comm.get_size();
 
-  return shuffle_vertices_by_gpu_id_impl(
-    handle,
-    std::move(vertices),
-    cugraph::detail::compute_gpu_id_from_ext_vertex_t<vertex_t>{
-      comm_size, major_comm_size, minor_comm_size});
+  return shuffle_vertices_by_gpu_id_impl(handle,
+                                         std::move(vertices),
+                                         cugraph::compute_gpu_id_from_ext_vertex_t<vertex_t>{
+                                           comm_size, major_comm_size, minor_comm_size});
 }
 
 template <typename vertex_t, typename value_t>
@@ -97,7 +97,7 @@ shuffle_ext_vertex_value_pairs_to_local_gpu_by_vertex_partitioning(
     handle,
     std::move(vertices),
     std::move(values),
-    cugraph::detail::compute_gpu_id_from_ext_vertex_t<vertex_t>{
+    cugraph::compute_gpu_id_from_ext_vertex_t<vertex_t>{
       comm_size, major_comm_size, minor_comm_size});
 }
 
diff --git a/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh b/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh
index 3b25ae5077..5b6331b0f2 100644
--- a/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh
+++ b/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh
@@ -23,6 +23,7 @@
 #include <cugraph/edge_partition_device_view.cuh>
 #include <cugraph/edge_partition_endpoint_property_device_view.cuh>
 #include <cugraph/edge_src_dst_property.hpp>
+#include <cugraph/graph_partition_utils.cuh>
 #include <cugraph/graph_view.hpp>
 #include <cugraph/utilities/dataframe_buffer.hpp>
 #include <cugraph/utilities/device_functors.cuh>
@@ -205,7 +206,7 @@ struct reduce_with_init_t {
  * destinations assigned to this process in multi-GPU). Use  cugraph::edge_dst_property_t::view().
  * Use update_edge_dst_property to fill the wrapper.
  * @param map_unique_key_first Iterator pointing to the first (inclusive) key in (key, value) pairs
- * (assigned to this process in multi-GPU, `cugraph::detail::compute_gpu_id_from_ext_vertex_t` is
+ * (assigned to this process in multi-GPU, `cugraph::compute_gpu_id_from_ext_vertex_t` is
  * used to map keys to processes). (Key, value) pairs may be provided by
  * transform_reduce_by_src_key_e() or transform_reduce_by_dst_key_e().
  * @param map_unique_key_last Iterator pointing to the last (exclusive) key in (key, value) pairs
@@ -730,7 +731,7 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e(
         collect_values_for_unique_keys(handle,
                                        kv_store_view,
                                        std::move(unique_minor_keys),
-                                       cugraph::detail::compute_gpu_id_from_ext_vertex_t<vertex_t>{
+                                       cugraph::compute_gpu_id_from_ext_vertex_t<vertex_t>{
                                          comm_size, major_comm_size, minor_comm_size});
 
       if constexpr (KVStoreViewType::binary_search) {
diff --git a/cpp/src/prims/transform_reduce_e_by_src_dst_key.cuh b/cpp/src/prims/transform_reduce_e_by_src_dst_key.cuh
index 4220308507..60da4d1f87 100644
--- a/cpp/src/prims/transform_reduce_e_by_src_dst_key.cuh
+++ b/cpp/src/prims/transform_reduce_e_by_src_dst_key.cuh
@@ -21,6 +21,7 @@
 #include <cugraph/edge_partition_device_view.cuh>
 #include <cugraph/edge_partition_endpoint_property_device_view.cuh>
 #include <cugraph/edge_src_dst_property.hpp>
+#include <cugraph/graph_partition_utils.cuh>
 #include <cugraph/graph_view.hpp>
 #include <cugraph/partition_manager.hpp>
 #include <cugraph/utilities/dataframe_buffer.hpp>
@@ -564,7 +565,7 @@ transform_reduce_e_by_src_dst_key(raft::handle_t const& handle,
           tmp_keys.end(),
           get_dataframe_buffer_begin(tmp_value_buffer),
           [key_func =
-             detail::compute_gpu_id_from_ext_vertex_t<vertex_t>{
+             cugraph::compute_gpu_id_from_ext_vertex_t<vertex_t>{
                comm_size, major_comm_size, minor_comm_size}] __device__(auto val) {
             return key_func(val);
           },
diff --git a/cpp/src/structure/create_graph_from_edgelist_impl.cuh b/cpp/src/structure/create_graph_from_edgelist_impl.cuh
index 1c15842982..fb409f8433 100644
--- a/cpp/src/structure/create_graph_from_edgelist_impl.cuh
+++ b/cpp/src/structure/create_graph_from_edgelist_impl.cuh
@@ -22,6 +22,7 @@
 #include <cugraph/detail/utility_wrappers.hpp>
 #include <cugraph/graph.hpp>
 #include <cugraph/graph_functions.hpp>
+#include <cugraph/graph_partition_utils.cuh>
 #include <cugraph/graph_view.hpp>
 #include <cugraph/partition_manager.hpp>
 #include <cugraph/utilities/device_comm.hpp>
@@ -148,7 +149,7 @@ void expensive_check_edgelist(raft::handle_t const& handle,
                          (*vertices).end(),
                          [comm_rank,
                           key_func =
-                            detail::compute_gpu_id_from_ext_vertex_t<vertex_t>{
+                            cugraph::compute_gpu_id_from_ext_vertex_t<vertex_t>{
                               comm_size, major_comm_size, minor_comm_size}] __device__(auto val) {
                            return key_func(val) != comm_rank;
                          }) == 0,
@@ -163,7 +164,7 @@ void expensive_check_edgelist(raft::handle_t const& handle,
                        edge_first + edgelist_majors.size(),
                        [comm_rank,
                         gpu_id_key_func =
-                          detail::compute_gpu_id_from_ext_edge_endpoints_t<vertex_t>{
+                          cugraph::compute_gpu_id_from_ext_edge_endpoints_t<vertex_t>{
                             comm_size, major_comm_size, minor_comm_size}] __device__(auto e) {
                          return (gpu_id_key_func(e) != comm_rank);
                        }) == 0,
diff --git a/cpp/src/structure/relabel_impl.cuh b/cpp/src/structure/relabel_impl.cuh
index ce18ec893d..d265871b0a 100644
--- a/cpp/src/structure/relabel_impl.cuh
+++ b/cpp/src/structure/relabel_impl.cuh
@@ -20,6 +20,7 @@
 
 #include <cugraph/graph.hpp>
 #include <cugraph/graph_functions.hpp>
+#include <cugraph/graph_partition_utils.cuh>
 #include <cugraph/graph_view.hpp>
 #include <cugraph/utilities/error.hpp>
 #include <cugraph/utilities/shuffle_comm.cuh>
@@ -67,7 +68,7 @@ void relabel(raft::handle_t const& handle,
     auto& minor_comm           = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
     auto const minor_comm_size = minor_comm.get_size();
 
-    auto key_func = detail::compute_gpu_id_from_ext_vertex_t<vertex_t>{
+    auto key_func = cugraph::compute_gpu_id_from_ext_vertex_t<vertex_t>{
       comm_size, major_comm_size, minor_comm_size};
 
     // find unique old labels (to be relabeled)
diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh
index 41f81d72ab..b133ac5fe5 100644
--- a/cpp/src/structure/renumber_edgelist_impl.cuh
+++ b/cpp/src/structure/renumber_edgelist_impl.cuh
@@ -20,6 +20,7 @@
 
 #include <cugraph/detail/shuffle_wrappers.hpp>
 #include <cugraph/graph_functions.hpp>
+#include <cugraph/graph_partition_utils.cuh>
 #include <cugraph/graph_view.hpp>
 #include <cugraph/utilities/device_comm.hpp>
 #include <cugraph/utilities/device_functors.cuh>
@@ -78,7 +79,7 @@ template <typename vertex_t>
 struct find_unused_id_t {
   raft::device_span<vertex_t const> sorted_local_vertices{};
   size_t num_workers{};
-  compute_gpu_id_from_ext_vertex_t<vertex_t> gpu_id_op{};
+  cugraph::compute_gpu_id_from_ext_vertex_t<vertex_t> gpu_id_op{};
   int comm_rank{};
   vertex_t invalid_id{};
 
@@ -199,7 +200,7 @@ std::optional<vertex_t> find_locally_unused_ext_vertex_id(
   auto num_workers =
     std::min(static_cast<size_t>(handle.get_device_properties().multiProcessorCount) * size_t{1024},
              sorted_local_vertices.size() + size_t{1});
-  auto gpu_id_op = compute_gpu_id_from_ext_vertex_t<vertex_t>{int{1}, int{1}, int{1}};
+  auto gpu_id_op = cugraph::compute_gpu_id_from_ext_vertex_t<vertex_t>{int{1}, int{1}, int{1}};
   if (multi_gpu && (handle.get_comms().get_size() > int{1})) {
     auto& comm                 = handle.get_comms();
     auto const comm_size       = comm.get_size();
@@ -207,8 +208,8 @@ std::optional<vertex_t> find_locally_unused_ext_vertex_id(
     auto const major_comm_size = major_comm.get_size();
     auto& minor_comm           = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
     auto const minor_comm_size = minor_comm.get_size();
-    gpu_id_op =
-      compute_gpu_id_from_ext_vertex_t<vertex_t>{comm_size, major_comm_size, minor_comm_size};
+    gpu_id_op                  = cugraph::compute_gpu_id_from_ext_vertex_t<vertex_t>{
+      comm_size, major_comm_size, minor_comm_size};
   }
   auto unused_id = thrust::transform_reduce(
     handle.get_thrust_policy(),
@@ -665,7 +666,7 @@ void expensive_check_edgelist(
            minor_comm_rank,
            i,
            gpu_id_key_func =
-             detail::compute_gpu_id_from_ext_edge_endpoints_t<vertex_t>{
+             cugraph::compute_gpu_id_from_ext_edge_endpoints_t<vertex_t>{
                comm_size, major_comm_size, minor_comm_size},
            local_edge_partition_id_key_func =
              detail::compute_local_edge_partition_id_from_ext_edge_endpoints_t<vertex_t>{
@@ -708,7 +709,7 @@ void expensive_check_edgelist(
                          (*sorted_local_vertices).end(),
                          [comm_rank,
                           key_func =
-                            detail::compute_gpu_id_from_ext_vertex_t<vertex_t>{
+                            cugraph::compute_gpu_id_from_ext_vertex_t<vertex_t>{
                               comm_size, major_comm_size, minor_comm_size}] __device__(auto val) {
                            return key_func(val) != comm_rank;
                          }) == 0,
diff --git a/cpp/src/structure/renumber_utils_impl.cuh b/cpp/src/structure/renumber_utils_impl.cuh
index 3efa58d963..90f3e3dff6 100644
--- a/cpp/src/structure/renumber_utils_impl.cuh
+++ b/cpp/src/structure/renumber_utils_impl.cuh
@@ -21,6 +21,7 @@
 
 #include <cugraph/graph.hpp>
 #include <cugraph/graph_functions.hpp>
+#include <cugraph/graph_partition_utils.cuh>
 #include <cugraph/utilities/error.hpp>
 #include <cugraph/utilities/host_scalar_comm.hpp>
 #include <cugraph/utilities/shuffle_comm.cuh>
@@ -405,7 +406,7 @@ void renumber_ext_vertices(raft::handle_t const& handle,
       collect_values_for_unique_keys(handle,
                                      local_renumber_map.view(),
                                      std::move(sorted_unique_ext_vertices),
-                                     detail::compute_gpu_id_from_ext_vertex_t<vertex_t>{
+                                     cugraph::compute_gpu_id_from_ext_vertex_t<vertex_t>{
                                        comm_size, major_comm_size, minor_comm_size});
 
     renumber_map_ptr = std::make_unique<kv_store_t<vertex_t, vertex_t, false>>(
diff --git a/cpp/tests/utilities/csv_file_utilities.cu b/cpp/tests/utilities/csv_file_utilities.cu
index d801b18cf0..5afa4e3920 100644
--- a/cpp/tests/utilities/csv_file_utilities.cu
+++ b/cpp/tests/utilities/csv_file_utilities.cu
@@ -18,6 +18,7 @@
 #include "utilities/test_utilities.hpp"
 
 #include <cugraph/graph_functions.hpp>
+#include <cugraph/graph_partition_utils.cuh>
 #include <cugraph/utilities/error.hpp>
 
 #include <raft/core/handle.hpp>
@@ -219,7 +220,7 @@ read_edgelist_from_csv_file(raft::handle_t const& handle,
     auto& minor_comm           = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
     auto const minor_comm_size = minor_comm.get_size();
 
-    auto edge_key_func = cugraph::detail::compute_gpu_id_from_ext_edge_endpoints_t<vertex_t>{
+    auto edge_key_func = cugraph::compute_gpu_id_from_ext_edge_endpoints_t<vertex_t>{
       comm_size, major_comm_size, minor_comm_size};
     size_t number_of_local_edges{};
     if (d_edgelist_weights) {
diff --git a/cpp/tests/utilities/matrix_market_file_utilities.cu b/cpp/tests/utilities/matrix_market_file_utilities.cu
index b76e9fdf8c..b901dda5fd 100644
--- a/cpp/tests/utilities/matrix_market_file_utilities.cu
+++ b/cpp/tests/utilities/matrix_market_file_utilities.cu
@@ -18,6 +18,7 @@
 #include "utilities/test_utilities.hpp"
 
 #include <cugraph/graph_functions.hpp>
+#include <cugraph/graph_partition_utils.cuh>
 #include <cugraph/legacy/functions.hpp>
 #include <cugraph/utilities/error.hpp>
 
@@ -330,7 +331,7 @@ read_edgelist_from_matrix_market_file(raft::handle_t const& handle,
     auto& minor_comm           = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
     auto const minor_comm_size = minor_comm.get_size();
 
-    auto vertex_key_func = cugraph::detail::compute_gpu_id_from_ext_vertex_t<vertex_t>{
+    auto vertex_key_func = cugraph::compute_gpu_id_from_ext_vertex_t<vertex_t>{
       comm_size, major_comm_size, minor_comm_size};
     d_vertices.resize(
       thrust::distance(d_vertices.begin(),
@@ -342,7 +343,7 @@ read_edgelist_from_matrix_market_file(raft::handle_t const& handle,
       handle.get_stream());
     d_vertices.shrink_to_fit(handle.get_stream());
 
-    auto edge_key_func = cugraph::detail::compute_gpu_id_from_ext_edge_endpoints_t<vertex_t>{
+    auto edge_key_func = cugraph::compute_gpu_id_from_ext_edge_endpoints_t<vertex_t>{
       comm_size, major_comm_size, minor_comm_size};
     size_t number_of_local_edges{};
     if (d_edgelist_weights) {

From a934252f611b352801b076746c695d9aa756c333 Mon Sep 17 00:00:00 2001
From: Naim <naim@uib.no>
Date: Sun, 10 Mar 2024 16:07:52 +0100
Subject: [PATCH 2/6] Revert "Make vertex and edge shuffling function public"

This reverts commit 8fb86fba90e23f657751d372c6e08055754f907a.
---
 cpp/include/cugraph/graph_partition_utils.cuh | 71 -------------------
 cpp/src/community/detail/common_methods.cuh   |  3 +-
 cpp/src/community/detail/refine_impl.cuh      |  6 +-
 cpp/src/detail/collect_local_vertex_values.cu |  3 +-
 cpp/src/detail/graph_partition_utils.cuh      | 45 ++++++++++++
 cpp/src/detail/shuffle_vertex_pairs.cu        |  3 +-
 cpp/src/detail/shuffle_vertices.cu            | 12 ++--
 ...m_reduce_dst_key_aggregated_outgoing_e.cuh |  5 +-
 .../transform_reduce_e_by_src_dst_key.cuh     |  3 +-
 .../create_graph_from_edgelist_impl.cuh       |  5 +-
 cpp/src/structure/relabel_impl.cuh            |  3 +-
 cpp/src/structure/renumber_edgelist_impl.cuh  | 13 ++--
 cpp/src/structure/renumber_utils_impl.cuh     |  3 +-
 cpp/tests/utilities/csv_file_utilities.cu     |  3 +-
 .../utilities/matrix_market_file_utilities.cu |  5 +-
 15 files changed, 73 insertions(+), 110 deletions(-)
 delete mode 100644 cpp/include/cugraph/graph_partition_utils.cuh

diff --git a/cpp/include/cugraph/graph_partition_utils.cuh b/cpp/include/cugraph/graph_partition_utils.cuh
deleted file mode 100644
index 792858f92f..0000000000
--- a/cpp/include/cugraph/graph_partition_utils.cuh
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include <cugraph/partition_manager.hpp>
-
-#include <raft/core/device_span.hpp>
-
-#include <cuco/hash_functions.cuh>
-
-namespace cugraph {
-
-template <typename vertex_t>
-struct compute_gpu_id_from_ext_vertex_t {
-  int comm_size{0};
-  int major_comm_size{0};
-  int minor_comm_size{0};
-
-  __host__ __device__ int operator()(vertex_t v) const
-  {
-    cuco::detail::MurmurHash3_32<vertex_t> hash_func{};
-    auto vertex_partition_id = static_cast<int>(hash_func(v) % comm_size);
-    return partition_manager::compute_global_comm_rank_from_vertex_partition_id(
-      major_comm_size, minor_comm_size, vertex_partition_id);
-  }
-};
-
-template <typename vertex_t>
-struct compute_gpu_id_from_ext_edge_endpoints_t {
-  int comm_size{0};
-  int major_comm_size{0};
-  int minor_comm_size{0};
-
-  __host__ __device__ int operator()(vertex_t major, vertex_t minor) const
-  {
-    cuco::detail::MurmurHash3_32<vertex_t> hash_func{};
-    auto major_vertex_partition_id = static_cast<int>(hash_func(major) % comm_size);
-    auto minor_vertex_partition_id = static_cast<int>(hash_func(minor) % comm_size);
-    auto major_comm_rank           = major_vertex_partition_id % major_comm_size;
-    auto minor_comm_rank           = minor_vertex_partition_id / major_comm_size;
-    return partition_manager::compute_global_comm_rank_from_graph_subcomm_ranks(
-      major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank);
-  }
-
-  __host__ __device__ int operator()(
-    thrust::tuple<vertex_t, vertex_t> pair /* major, minor */) const
-  {
-    cuco::detail::MurmurHash3_32<vertex_t> hash_func{};
-    auto major_vertex_partition_id = static_cast<int>(hash_func(thrust::get<0>(pair)) % comm_size);
-    auto minor_vertex_partition_id = static_cast<int>(hash_func(thrust::get<1>(pair)) % comm_size);
-    auto major_comm_rank           = major_vertex_partition_id % major_comm_size;
-    auto minor_comm_rank           = minor_vertex_partition_id / major_comm_size;
-    return partition_manager::compute_global_comm_rank_from_graph_subcomm_ranks(
-      major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank);
-  }
-};
-
-}  // namespace cugraph
diff --git a/cpp/src/community/detail/common_methods.cuh b/cpp/src/community/detail/common_methods.cuh
index 34c5ffcb75..fe0a415db3 100644
--- a/cpp/src/community/detail/common_methods.cuh
+++ b/cpp/src/community/detail/common_methods.cuh
@@ -28,7 +28,6 @@
 
 #include <cugraph/detail/utility_wrappers.hpp>
 #include <cugraph/graph_functions.hpp>
-#include <cugraph/graph_partition_utils.cuh>
 
 #include <thrust/binary_search.h>
 #include <thrust/execution_policy.h>
@@ -279,7 +278,7 @@ rmm::device_uvector<vertex_t> update_clustering_by_delta_modularity(
     auto& minor_comm           = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
     auto const minor_comm_size = minor_comm.get_size();
 
-    cugraph::compute_gpu_id_from_ext_vertex_t<vertex_t> vertex_to_gpu_id_op{
+    cugraph::detail::compute_gpu_id_from_ext_vertex_t<vertex_t> vertex_to_gpu_id_op{
       comm_size, major_comm_size, minor_comm_size};
 
     kv_store_t<vertex_t, weight_t, false> cluster_key_weight_map(
diff --git a/cpp/src/community/detail/refine_impl.cuh b/cpp/src/community/detail/refine_impl.cuh
index 862c6cde12..b767ce7d8b 100644
--- a/cpp/src/community/detail/refine_impl.cuh
+++ b/cpp/src/community/detail/refine_impl.cuh
@@ -178,7 +178,7 @@ refine_clustering(
     auto& minor_comm           = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
     auto const minor_comm_size = minor_comm.get_size();
 
-    cugraph::compute_gpu_id_from_ext_vertex_t<vertex_t> vertex_to_gpu_id_op{
+    cugraph::detail::compute_gpu_id_from_ext_vertex_t<vertex_t> vertex_to_gpu_id_op{
       comm_size, major_comm_size, minor_comm_size};
 
     vertex_louvain_cluster_weights =
@@ -460,7 +460,7 @@ refine_clustering(
         major_comm_size,
         minor_comm_size};
 
-      // cugraph::compute_gpu_id_from_ext_vertex_t<vertex_t> vertex_to_gpu_id_op{
+      // cugraph::detail::compute_gpu_id_from_ext_vertex_t<vertex_t> vertex_to_gpu_id_op{
       //   comm_size, major_comm_size, minor_comm_size};
 
       louvain_of_leiden_keys_used_in_edge_reduction =
@@ -850,7 +850,7 @@ refine_clustering(
       major_comm_size,
       minor_comm_size};
 
-    // cugraph::compute_gpu_id_from_ext_vertex_t<vertex_t> vertex_to_gpu_id_op{
+    // cugraph::detail::compute_gpu_id_from_ext_vertex_t<vertex_t> vertex_to_gpu_id_op{
     //   comm_size, major_comm_size, minor_comm_size};
 
     lovain_of_leiden_cluster_keys =
diff --git a/cpp/src/detail/collect_local_vertex_values.cu b/cpp/src/detail/collect_local_vertex_values.cu
index 511346552a..2726e33b92 100644
--- a/cpp/src/detail/collect_local_vertex_values.cu
+++ b/cpp/src/detail/collect_local_vertex_values.cu
@@ -17,7 +17,6 @@
 #include "detail/graph_partition_utils.cuh"
 
 #include <cugraph/graph_functions.hpp>
-#include <cugraph/graph_partition_utils.cuh>
 #include <cugraph/utilities/shuffle_comm.cuh>
 
 #include <cuda/functional>
@@ -51,7 +50,7 @@ rmm::device_uvector<value_t> collect_local_vertex_values_from_ext_vertex_value_p
       d_vertices.begin(),
       d_vertices.end(),
       d_values.begin(),
-      cugraph::compute_gpu_id_from_ext_vertex_t<vertex_t>{
+      cugraph::detail::compute_gpu_id_from_ext_vertex_t<vertex_t>{
         comm_size, major_comm_size, minor_comm_size},
       handle.get_stream());
   }
diff --git a/cpp/src/detail/graph_partition_utils.cuh b/cpp/src/detail/graph_partition_utils.cuh
index 423e9383e9..957436459c 100644
--- a/cpp/src/detail/graph_partition_utils.cuh
+++ b/cpp/src/detail/graph_partition_utils.cuh
@@ -35,6 +35,21 @@
 namespace cugraph {
 namespace detail {
 
+template <typename vertex_t>
+struct compute_gpu_id_from_ext_vertex_t {
+  int comm_size{0};
+  int major_comm_size{0};
+  int minor_comm_size{0};
+
+  __host__ __device__ int operator()(vertex_t v) const
+  {
+    cuco::detail::MurmurHash3_32<vertex_t> hash_func{};
+    auto vertex_partition_id = static_cast<int>(hash_func(v) % comm_size);
+    return partition_manager::compute_global_comm_rank_from_vertex_partition_id(
+      major_comm_size, minor_comm_size, vertex_partition_id);
+  }
+};
+
 template <typename vertex_t>
 struct compute_gpu_id_from_int_vertex_t {
   raft::device_span<vertex_t const> vertex_partition_range_lasts{};
@@ -76,6 +91,36 @@ struct compute_vertex_partition_id_from_int_vertex_t {
   }
 };
 
+template <typename vertex_t>
+struct compute_gpu_id_from_ext_edge_endpoints_t {
+  int comm_size{0};
+  int major_comm_size{0};
+  int minor_comm_size{0};
+
+  __host__ __device__ int operator()(vertex_t major, vertex_t minor) const
+  {
+    cuco::detail::MurmurHash3_32<vertex_t> hash_func{};
+    auto major_vertex_partition_id = static_cast<int>(hash_func(major) % comm_size);
+    auto minor_vertex_partition_id = static_cast<int>(hash_func(minor) % comm_size);
+    auto major_comm_rank           = major_vertex_partition_id % major_comm_size;
+    auto minor_comm_rank           = minor_vertex_partition_id / major_comm_size;
+    return partition_manager::compute_global_comm_rank_from_graph_subcomm_ranks(
+      major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank);
+  }
+
+  __host__ __device__ int operator()(
+    thrust::tuple<vertex_t, vertex_t> pair /* major, minor */) const
+  {
+    cuco::detail::MurmurHash3_32<vertex_t> hash_func{};
+    auto major_vertex_partition_id = static_cast<int>(hash_func(thrust::get<0>(pair)) % comm_size);
+    auto minor_vertex_partition_id = static_cast<int>(hash_func(thrust::get<1>(pair)) % comm_size);
+    auto major_comm_rank           = major_vertex_partition_id % major_comm_size;
+    auto minor_comm_rank           = minor_vertex_partition_id / major_comm_size;
+    return partition_manager::compute_global_comm_rank_from_graph_subcomm_ranks(
+      major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank);
+  }
+};
+
 template <typename vertex_t>
 struct compute_gpu_id_from_int_edge_endpoints_t {
   raft::device_span<vertex_t const> vertex_partition_range_lasts{};
diff --git a/cpp/src/detail/shuffle_vertex_pairs.cu b/cpp/src/detail/shuffle_vertex_pairs.cu
index 74036f890b..33a7834f5f 100644
--- a/cpp/src/detail/shuffle_vertex_pairs.cu
+++ b/cpp/src/detail/shuffle_vertex_pairs.cu
@@ -17,7 +17,6 @@
 
 #include <cugraph/detail/shuffle_wrappers.hpp>
 #include <cugraph/graph_functions.hpp>
-#include <cugraph/graph_partition_utils.cuh>
 #include <cugraph/partition_manager.hpp>
 #include <cugraph/utilities/host_scalar_comm.hpp>
 #include <cugraph/utilities/shuffle_comm.cuh>
@@ -309,7 +308,7 @@ shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
     std::move(weights),
     std::move(edge_ids),
     std::move(edge_types),
-    cugraph::compute_gpu_id_from_ext_edge_endpoints_t<vertex_t>{
+    cugraph::detail::compute_gpu_id_from_ext_edge_endpoints_t<vertex_t>{
       comm_size, major_comm_size, minor_comm_size});
 }
 
diff --git a/cpp/src/detail/shuffle_vertices.cu b/cpp/src/detail/shuffle_vertices.cu
index 5bc056eb52..be6875f107 100644
--- a/cpp/src/detail/shuffle_vertices.cu
+++ b/cpp/src/detail/shuffle_vertices.cu
@@ -16,7 +16,6 @@
 #include "detail/graph_partition_utils.cuh"
 
 #include <cugraph/detail/shuffle_wrappers.hpp>
-#include <cugraph/graph_partition_utils.cuh>
 #include <cugraph/utilities/shuffle_comm.cuh>
 
 #include <thrust/tuple.h>
@@ -74,10 +73,11 @@ rmm::device_uvector<vertex_t> shuffle_ext_vertices_to_local_gpu_by_vertex_partit
   auto& minor_comm           = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
   auto const minor_comm_size = minor_comm.get_size();
 
-  return shuffle_vertices_by_gpu_id_impl(handle,
-                                         std::move(vertices),
-                                         cugraph::compute_gpu_id_from_ext_vertex_t<vertex_t>{
-                                           comm_size, major_comm_size, minor_comm_size});
+  return shuffle_vertices_by_gpu_id_impl(
+    handle,
+    std::move(vertices),
+    cugraph::detail::compute_gpu_id_from_ext_vertex_t<vertex_t>{
+      comm_size, major_comm_size, minor_comm_size});
 }
 
 template <typename vertex_t, typename value_t>
@@ -97,7 +97,7 @@ shuffle_ext_vertex_value_pairs_to_local_gpu_by_vertex_partitioning(
     handle,
     std::move(vertices),
     std::move(values),
-    cugraph::compute_gpu_id_from_ext_vertex_t<vertex_t>{
+    cugraph::detail::compute_gpu_id_from_ext_vertex_t<vertex_t>{
       comm_size, major_comm_size, minor_comm_size});
 }
 
diff --git a/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh b/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh
index 5b6331b0f2..3b25ae5077 100644
--- a/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh
+++ b/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh
@@ -23,7 +23,6 @@
 #include <cugraph/edge_partition_device_view.cuh>
 #include <cugraph/edge_partition_endpoint_property_device_view.cuh>
 #include <cugraph/edge_src_dst_property.hpp>
-#include <cugraph/graph_partition_utils.cuh>
 #include <cugraph/graph_view.hpp>
 #include <cugraph/utilities/dataframe_buffer.hpp>
 #include <cugraph/utilities/device_functors.cuh>
@@ -206,7 +205,7 @@ struct reduce_with_init_t {
  * destinations assigned to this process in multi-GPU). Use  cugraph::edge_dst_property_t::view().
  * Use update_edge_dst_property to fill the wrapper.
  * @param map_unique_key_first Iterator pointing to the first (inclusive) key in (key, value) pairs
- * (assigned to this process in multi-GPU, `cugraph::compute_gpu_id_from_ext_vertex_t` is
+ * (assigned to this process in multi-GPU, `cugraph::detail::compute_gpu_id_from_ext_vertex_t` is
  * used to map keys to processes). (Key, value) pairs may be provided by
  * transform_reduce_by_src_key_e() or transform_reduce_by_dst_key_e().
  * @param map_unique_key_last Iterator pointing to the last (exclusive) key in (key, value) pairs
@@ -731,7 +730,7 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e(
         collect_values_for_unique_keys(handle,
                                        kv_store_view,
                                        std::move(unique_minor_keys),
-                                       cugraph::compute_gpu_id_from_ext_vertex_t<vertex_t>{
+                                       cugraph::detail::compute_gpu_id_from_ext_vertex_t<vertex_t>{
                                          comm_size, major_comm_size, minor_comm_size});
 
       if constexpr (KVStoreViewType::binary_search) {
diff --git a/cpp/src/prims/transform_reduce_e_by_src_dst_key.cuh b/cpp/src/prims/transform_reduce_e_by_src_dst_key.cuh
index 60da4d1f87..4220308507 100644
--- a/cpp/src/prims/transform_reduce_e_by_src_dst_key.cuh
+++ b/cpp/src/prims/transform_reduce_e_by_src_dst_key.cuh
@@ -21,7 +21,6 @@
 #include <cugraph/edge_partition_device_view.cuh>
 #include <cugraph/edge_partition_endpoint_property_device_view.cuh>
 #include <cugraph/edge_src_dst_property.hpp>
-#include <cugraph/graph_partition_utils.cuh>
 #include <cugraph/graph_view.hpp>
 #include <cugraph/partition_manager.hpp>
 #include <cugraph/utilities/dataframe_buffer.hpp>
@@ -565,7 +564,7 @@ transform_reduce_e_by_src_dst_key(raft::handle_t const& handle,
           tmp_keys.end(),
           get_dataframe_buffer_begin(tmp_value_buffer),
           [key_func =
-             cugraph::compute_gpu_id_from_ext_vertex_t<vertex_t>{
+             detail::compute_gpu_id_from_ext_vertex_t<vertex_t>{
                comm_size, major_comm_size, minor_comm_size}] __device__(auto val) {
             return key_func(val);
           },
diff --git a/cpp/src/structure/create_graph_from_edgelist_impl.cuh b/cpp/src/structure/create_graph_from_edgelist_impl.cuh
index fb409f8433..1c15842982 100644
--- a/cpp/src/structure/create_graph_from_edgelist_impl.cuh
+++ b/cpp/src/structure/create_graph_from_edgelist_impl.cuh
@@ -22,7 +22,6 @@
 #include <cugraph/detail/utility_wrappers.hpp>
 #include <cugraph/graph.hpp>
 #include <cugraph/graph_functions.hpp>
-#include <cugraph/graph_partition_utils.cuh>
 #include <cugraph/graph_view.hpp>
 #include <cugraph/partition_manager.hpp>
 #include <cugraph/utilities/device_comm.hpp>
@@ -149,7 +148,7 @@ void expensive_check_edgelist(raft::handle_t const& handle,
                          (*vertices).end(),
                          [comm_rank,
                           key_func =
-                            cugraph::compute_gpu_id_from_ext_vertex_t<vertex_t>{
+                            detail::compute_gpu_id_from_ext_vertex_t<vertex_t>{
                               comm_size, major_comm_size, minor_comm_size}] __device__(auto val) {
                            return key_func(val) != comm_rank;
                          }) == 0,
@@ -164,7 +163,7 @@ void expensive_check_edgelist(raft::handle_t const& handle,
                        edge_first + edgelist_majors.size(),
                        [comm_rank,
                         gpu_id_key_func =
-                          cugraph::compute_gpu_id_from_ext_edge_endpoints_t<vertex_t>{
+                          detail::compute_gpu_id_from_ext_edge_endpoints_t<vertex_t>{
                             comm_size, major_comm_size, minor_comm_size}] __device__(auto e) {
                          return (gpu_id_key_func(e) != comm_rank);
                        }) == 0,
diff --git a/cpp/src/structure/relabel_impl.cuh b/cpp/src/structure/relabel_impl.cuh
index d265871b0a..ce18ec893d 100644
--- a/cpp/src/structure/relabel_impl.cuh
+++ b/cpp/src/structure/relabel_impl.cuh
@@ -20,7 +20,6 @@
 
 #include <cugraph/graph.hpp>
 #include <cugraph/graph_functions.hpp>
-#include <cugraph/graph_partition_utils.cuh>
 #include <cugraph/graph_view.hpp>
 #include <cugraph/utilities/error.hpp>
 #include <cugraph/utilities/shuffle_comm.cuh>
@@ -68,7 +67,7 @@ void relabel(raft::handle_t const& handle,
     auto& minor_comm           = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
     auto const minor_comm_size = minor_comm.get_size();
 
-    auto key_func = cugraph::compute_gpu_id_from_ext_vertex_t<vertex_t>{
+    auto key_func = detail::compute_gpu_id_from_ext_vertex_t<vertex_t>{
       comm_size, major_comm_size, minor_comm_size};
 
     // find unique old labels (to be relabeled)
diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh
index b133ac5fe5..41f81d72ab 100644
--- a/cpp/src/structure/renumber_edgelist_impl.cuh
+++ b/cpp/src/structure/renumber_edgelist_impl.cuh
@@ -20,7 +20,6 @@
 
 #include <cugraph/detail/shuffle_wrappers.hpp>
 #include <cugraph/graph_functions.hpp>
-#include <cugraph/graph_partition_utils.cuh>
 #include <cugraph/graph_view.hpp>
 #include <cugraph/utilities/device_comm.hpp>
 #include <cugraph/utilities/device_functors.cuh>
@@ -79,7 +78,7 @@ template <typename vertex_t>
 struct find_unused_id_t {
   raft::device_span<vertex_t const> sorted_local_vertices{};
   size_t num_workers{};
-  cugraph::compute_gpu_id_from_ext_vertex_t<vertex_t> gpu_id_op{};
+  compute_gpu_id_from_ext_vertex_t<vertex_t> gpu_id_op{};
   int comm_rank{};
   vertex_t invalid_id{};
 
@@ -200,7 +199,7 @@ std::optional<vertex_t> find_locally_unused_ext_vertex_id(
   auto num_workers =
     std::min(static_cast<size_t>(handle.get_device_properties().multiProcessorCount) * size_t{1024},
              sorted_local_vertices.size() + size_t{1});
-  auto gpu_id_op = cugraph::compute_gpu_id_from_ext_vertex_t<vertex_t>{int{1}, int{1}, int{1}};
+  auto gpu_id_op = compute_gpu_id_from_ext_vertex_t<vertex_t>{int{1}, int{1}, int{1}};
   if (multi_gpu && (handle.get_comms().get_size() > int{1})) {
     auto& comm                 = handle.get_comms();
     auto const comm_size       = comm.get_size();
@@ -208,8 +207,8 @@ std::optional<vertex_t> find_locally_unused_ext_vertex_id(
     auto const major_comm_size = major_comm.get_size();
     auto& minor_comm           = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
     auto const minor_comm_size = minor_comm.get_size();
-    gpu_id_op                  = cugraph::compute_gpu_id_from_ext_vertex_t<vertex_t>{
-      comm_size, major_comm_size, minor_comm_size};
+    gpu_id_op =
+      compute_gpu_id_from_ext_vertex_t<vertex_t>{comm_size, major_comm_size, minor_comm_size};
   }
   auto unused_id = thrust::transform_reduce(
     handle.get_thrust_policy(),
@@ -666,7 +665,7 @@ void expensive_check_edgelist(
            minor_comm_rank,
            i,
            gpu_id_key_func =
-             cugraph::compute_gpu_id_from_ext_edge_endpoints_t<vertex_t>{
+             detail::compute_gpu_id_from_ext_edge_endpoints_t<vertex_t>{
                comm_size, major_comm_size, minor_comm_size},
            local_edge_partition_id_key_func =
              detail::compute_local_edge_partition_id_from_ext_edge_endpoints_t<vertex_t>{
@@ -709,7 +708,7 @@ void expensive_check_edgelist(
                          (*sorted_local_vertices).end(),
                          [comm_rank,
                           key_func =
-                            cugraph::compute_gpu_id_from_ext_vertex_t<vertex_t>{
+                            detail::compute_gpu_id_from_ext_vertex_t<vertex_t>{
                               comm_size, major_comm_size, minor_comm_size}] __device__(auto val) {
                            return key_func(val) != comm_rank;
                          }) == 0,
diff --git a/cpp/src/structure/renumber_utils_impl.cuh b/cpp/src/structure/renumber_utils_impl.cuh
index 90f3e3dff6..3efa58d963 100644
--- a/cpp/src/structure/renumber_utils_impl.cuh
+++ b/cpp/src/structure/renumber_utils_impl.cuh
@@ -21,7 +21,6 @@
 
 #include <cugraph/graph.hpp>
 #include <cugraph/graph_functions.hpp>
-#include <cugraph/graph_partition_utils.cuh>
 #include <cugraph/utilities/error.hpp>
 #include <cugraph/utilities/host_scalar_comm.hpp>
 #include <cugraph/utilities/shuffle_comm.cuh>
@@ -406,7 +405,7 @@ void renumber_ext_vertices(raft::handle_t const& handle,
       collect_values_for_unique_keys(handle,
                                      local_renumber_map.view(),
                                      std::move(sorted_unique_ext_vertices),
-                                     cugraph::compute_gpu_id_from_ext_vertex_t<vertex_t>{
+                                     detail::compute_gpu_id_from_ext_vertex_t<vertex_t>{
                                        comm_size, major_comm_size, minor_comm_size});
 
     renumber_map_ptr = std::make_unique<kv_store_t<vertex_t, vertex_t, false>>(
diff --git a/cpp/tests/utilities/csv_file_utilities.cu b/cpp/tests/utilities/csv_file_utilities.cu
index 5afa4e3920..d801b18cf0 100644
--- a/cpp/tests/utilities/csv_file_utilities.cu
+++ b/cpp/tests/utilities/csv_file_utilities.cu
@@ -18,7 +18,6 @@
 #include "utilities/test_utilities.hpp"
 
 #include <cugraph/graph_functions.hpp>
-#include <cugraph/graph_partition_utils.cuh>
 #include <cugraph/utilities/error.hpp>
 
 #include <raft/core/handle.hpp>
@@ -220,7 +219,7 @@ read_edgelist_from_csv_file(raft::handle_t const& handle,
     auto& minor_comm           = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
     auto const minor_comm_size = minor_comm.get_size();
 
-    auto edge_key_func = cugraph::compute_gpu_id_from_ext_edge_endpoints_t<vertex_t>{
+    auto edge_key_func = cugraph::detail::compute_gpu_id_from_ext_edge_endpoints_t<vertex_t>{
       comm_size, major_comm_size, minor_comm_size};
     size_t number_of_local_edges{};
     if (d_edgelist_weights) {
diff --git a/cpp/tests/utilities/matrix_market_file_utilities.cu b/cpp/tests/utilities/matrix_market_file_utilities.cu
index b901dda5fd..b76e9fdf8c 100644
--- a/cpp/tests/utilities/matrix_market_file_utilities.cu
+++ b/cpp/tests/utilities/matrix_market_file_utilities.cu
@@ -18,7 +18,6 @@
 #include "utilities/test_utilities.hpp"
 
 #include <cugraph/graph_functions.hpp>
-#include <cugraph/graph_partition_utils.cuh>
 #include <cugraph/legacy/functions.hpp>
 #include <cugraph/utilities/error.hpp>
 
@@ -331,7 +330,7 @@ read_edgelist_from_matrix_market_file(raft::handle_t const& handle,
     auto& minor_comm           = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
     auto const minor_comm_size = minor_comm.get_size();
 
-    auto vertex_key_func = cugraph::compute_gpu_id_from_ext_vertex_t<vertex_t>{
+    auto vertex_key_func = cugraph::detail::compute_gpu_id_from_ext_vertex_t<vertex_t>{
       comm_size, major_comm_size, minor_comm_size};
     d_vertices.resize(
       thrust::distance(d_vertices.begin(),
@@ -343,7 +342,7 @@ read_edgelist_from_matrix_market_file(raft::handle_t const& handle,
       handle.get_stream());
     d_vertices.shrink_to_fit(handle.get_stream());
 
-    auto edge_key_func = cugraph::compute_gpu_id_from_ext_edge_endpoints_t<vertex_t>{
+    auto edge_key_func = cugraph::detail::compute_gpu_id_from_ext_edge_endpoints_t<vertex_t>{
       comm_size, major_comm_size, minor_comm_size};
     size_t number_of_local_edges{};
     if (d_edgelist_weights) {

From 29e71b05e0582a12b2cb61f9afd4b4e394c2bdd1 Mon Sep 17 00:00:00 2001
From: Naim <naim@uib.no>
Date: Mon, 11 Mar 2024 04:21:20 +0100
Subject: [PATCH 3/6] Make external vertex and edge shuffling functions public

---
 cpp/include/cugraph/graph_functions.hpp | 56 ++++++++++++++++++++
 cpp/src/detail/shuffle_vertex_pairs.cu  | 65 +++++++++++++++++++++++
 cpp/src/detail/shuffle_vertices.cu      | 70 +++++++++++++++++++++++++
 3 files changed, 191 insertions(+)

diff --git a/cpp/include/cugraph/graph_functions.hpp b/cpp/include/cugraph/graph_functions.hpp
index 90425f86be..6fd87f1192 100644
--- a/cpp/include/cugraph/graph_functions.hpp
+++ b/cpp/include/cugraph/graph_functions.hpp
@@ -1052,4 +1052,60 @@ remove_multi_edges(raft::handle_t const& handle,
                    std::optional<rmm::device_uvector<edge_type_t>>&& edgelist_edge_types,
                    bool keep_min_value_edge = false);
 
+/**
+ * @brief Shuffle external vertex ids and values to the proper GPU.
+ * *
+ * @tparam vertex_t    Type of vertex identifiers. Needs to be an integral type.
+ * @tparam value_t     Type of values. Needs to an integral or floating point type.
+ *
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param vertices  List of vertex ids
+ * @param values List of values
+ * @return Tuple of vectors storing vertex ids and values mapped to this GPU.
+ */
+template <typename vertex_t, typename value_t>
+std::tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<value_t>>
+shuffle_external_vertices_and_values(raft::handle_t const& handle,
+                                     rmm::device_uvector<vertex_t>&& vertices,
+                                     rmm::device_uvector<value_t>&& values);
+
+/**
+ * @brief Shuffle external vertex ids to the proper GPU.
+ * *
+ * @tparam vertex_t    Type of vertex identifiers. Needs to be an integral type.
+ *
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param vertices  List of vertex ids
+ * @return Vector of vertex ids mapped to this GPU.
+ */
+template <typename vertex_t>
+rmm::device_uvector<vertex_t> shuffle_external_vertices(raft::handle_t const& handle,
+                                                        rmm::device_uvector<vertex_t>&& vertices);
+
+/**
+ * @brief Shuffle external edges to the proper GPU.
+ * *
+ * @tparam vertex_t    Type of vertex identifiers. Needs to be an integral type.
+ * @tparam edge_t      Type of edge identifiers. Needs to be an integral type.
+ * @tparam weight_t    Type of edge weight. Currently float and double are supported.
+ *
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param edge_srcs  List of source vertex ids
+ * @param edge_dsts  List of destination vertex ids
+ * @param edge_weights  Optional list of edge weights
+ * @return Tuple of vectors storing edge sources, destinations and optional weights mapped to this
+ * GPU.
+ */
+template <typename vertex_t, typename weight_t>
+std::tuple<rmm::device_uvector<vertex_t>,
+           rmm::device_uvector<vertex_t>,
+           std::optional<rmm::device_uvector<weight_t>>>
+shuffle_external_edges(raft::handle_t const& handle,
+                       rmm::device_uvector<vertex_t>&& edge_srcs,
+                       rmm::device_uvector<vertex_t>&& edge_dsts,
+                       std::optional<rmm::device_uvector<weight_t>>&& edge_weights);
+
 }  // namespace cugraph
diff --git a/cpp/src/detail/shuffle_vertex_pairs.cu b/cpp/src/detail/shuffle_vertex_pairs.cu
index 33a7834f5f..62e09ad93d 100644
--- a/cpp/src/detail/shuffle_vertex_pairs.cu
+++ b/cpp/src/detail/shuffle_vertex_pairs.cu
@@ -519,4 +519,69 @@ shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
   std::vector<int64_t> const& vertex_partition_range_lasts);
 
 }  // namespace detail
+
+template <typename vertex_t, typename weight_t>
+std::tuple<rmm::device_uvector<vertex_t>,
+           rmm::device_uvector<vertex_t>,
+           std::optional<rmm::device_uvector<weight_t>>>
+shuffle_external_edges(raft::handle_t const& handle,
+                       rmm::device_uvector<vertex_t>&& edge_srcs,
+                       rmm::device_uvector<vertex_t>&& edge_dsts,
+                       std::optional<rmm::device_uvector<weight_t>>&& edge_weights)
+{
+  auto& comm                 = handle.get_comms();
+  auto const comm_size       = comm.get_size();
+  auto& major_comm           = handle.get_subcomm(cugraph::partition_manager::major_comm_name());
+  auto const major_comm_size = major_comm.get_size();
+  auto& minor_comm           = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
+
+  auto const minor_comm_size = minor_comm.get_size();
+
+  std::tie(edge_srcs, edge_dsts, edge_weights, std::ignore, std::ignore) =
+    cugraph::detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning<
+      vertex_t,
+      vertex_t,
+      weight_t,
+      int32_t>(handle,
+               std::move(edge_srcs),
+               std::move(edge_dsts),
+               std::move(edge_weights),
+               std::nullopt,
+               std::nullopt);
+
+  return std::make_tuple(std::move(edge_srcs), std::move(edge_dsts), std::move(edge_weights));
+}
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<float>>>
+shuffle_external_edges(raft::handle_t const& handle,
+                       rmm::device_uvector<int32_t>&& edge_srcs,
+                       rmm::device_uvector<int32_t>&& edge_dsts,
+                       std::optional<rmm::device_uvector<float>>&& edge_wgts);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<double>>>
+shuffle_external_edges(raft::handle_t const& handle,
+                       rmm::device_uvector<int32_t>&& edge_srcs,
+                       rmm::device_uvector<int32_t>&& edge_dsts,
+                       std::optional<rmm::device_uvector<double>>&& edge_wgts);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<float>>>
+shuffle_external_edges(raft::handle_t const& handle,
+                       rmm::device_uvector<int64_t>&& edge_srcs,
+                       rmm::device_uvector<int64_t>&& edge_dsts,
+                       std::optional<rmm::device_uvector<float>>&& edge_wgts);
+
+template std::tuple<rmm::device_uvector<int64_t>,
+                    rmm::device_uvector<int64_t>,
+                    std::optional<rmm::device_uvector<double>>>
+shuffle_external_edges(raft::handle_t const& handle,
+                       rmm::device_uvector<int64_t>&& edge_srcs,
+                       rmm::device_uvector<int64_t>&& edge_dsts,
+                       std::optional<rmm::device_uvector<double>>&& edge_wgts);
+
 }  // namespace cugraph
diff --git a/cpp/src/detail/shuffle_vertices.cu b/cpp/src/detail/shuffle_vertices.cu
index be6875f107..c01251b134 100644
--- a/cpp/src/detail/shuffle_vertices.cu
+++ b/cpp/src/detail/shuffle_vertices.cu
@@ -249,4 +249,74 @@ shuffle_ext_vertex_value_pairs_to_local_gpu_by_vertex_partitioning(
   rmm::device_uvector<double>&& values);
 
 }  // namespace detail
+
+template <typename vertex_t, typename value_t>
+std::tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<value_t>>
+shuffle_external_vertices_and_values(raft::handle_t const& handle,
+                                     rmm::device_uvector<vertex_t>&& vertices,
+                                     rmm::device_uvector<value_t>&& values)
+{
+  return detail::shuffle_ext_vertex_value_pairs_to_local_gpu_by_vertex_partitioning(
+    handle, std::move(vertices), std::move(values));
+}
+
+template std::tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<int32_t>>
+shuffle_external_vertices_and_values(raft::handle_t const& handle,
+                                     rmm::device_uvector<int32_t>&& vertices,
+                                     rmm::device_uvector<int32_t>&& values);
+
+template std::tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<size_t>>
+shuffle_external_vertices_and_values(raft::handle_t const& handle,
+                                     rmm::device_uvector<int32_t>&& vertices,
+                                     rmm::device_uvector<size_t>&& values);
+
+template std::tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<float>>
+shuffle_external_vertices_and_values(raft::handle_t const& handle,
+                                     rmm::device_uvector<int32_t>&& vertices,
+                                     rmm::device_uvector<float>&& values);
+
+template std::tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<double>>
+shuffle_external_vertices_and_values(raft::handle_t const& handle,
+                                     rmm::device_uvector<int32_t>&& vertices,
+                                     rmm::device_uvector<double>&& values);
+
+template std::tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<int32_t>>
+shuffle_external_vertices_and_values(raft::handle_t const& handle,
+                                     rmm::device_uvector<int64_t>&& vertices,
+                                     rmm::device_uvector<int32_t>&& values);
+
+template std::tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<int64_t>>
+shuffle_external_vertices_and_values(raft::handle_t const& handle,
+                                     rmm::device_uvector<int64_t>&& vertices,
+                                     rmm::device_uvector<int64_t>&& values);
+
+template std::tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<size_t>>
+shuffle_external_vertices_and_values(raft::handle_t const& handle,
+                                     rmm::device_uvector<int64_t>&& vertices,
+                                     rmm::device_uvector<size_t>&& values);
+
+template std::tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<float>>
+shuffle_external_vertices_and_values(raft::handle_t const& handle,
+                                     rmm::device_uvector<int64_t>&& vertices,
+                                     rmm::device_uvector<float>&& values);
+
+template std::tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<double>>
+shuffle_external_vertices_and_values(raft::handle_t const& handle,
+                                     rmm::device_uvector<int64_t>&& vertices,
+                                     rmm::device_uvector<double>&& values);
+
+template <typename vertex_t>
+rmm::device_uvector<vertex_t> shuffle_external_vertices(raft::handle_t const& handle,
+                                                        rmm::device_uvector<vertex_t>&& vertices)
+{
+  return detail::shuffle_ext_vertices_to_local_gpu_by_vertex_partitioning(handle,
+                                                                          std::move(vertices));
+}
+
+template rmm::device_uvector<int32_t> shuffle_external_vertices(
+  raft::handle_t const& handle, rmm::device_uvector<int32_t>&& d_vertices);
+
+template rmm::device_uvector<int64_t> shuffle_external_vertices(
+  raft::handle_t const& handle, rmm::device_uvector<int64_t>&& d_vertices);
+
 }  // namespace cugraph

From 4a9ad9977c2243948c4401b658c63c2d31c599b5 Mon Sep 17 00:00:00 2001
From: Naim <naim@uib.no>
Date: Mon, 11 Mar 2024 21:52:09 +0100
Subject: [PATCH 4/6] Address PR comments part-1

---
 cpp/include/cugraph/graph_functions.hpp |  44 +++++-----
 cpp/src/detail/shuffle_vertex_pairs.cu  | 103 ++++++++++++++++--------
 cpp/src/detail/shuffle_vertices.cu      |  60 +++++++-------
 3 files changed, 125 insertions(+), 82 deletions(-)

diff --git a/cpp/include/cugraph/graph_functions.hpp b/cpp/include/cugraph/graph_functions.hpp
index 6fd87f1192..5229e536dd 100644
--- a/cpp/include/cugraph/graph_functions.hpp
+++ b/cpp/include/cugraph/graph_functions.hpp
@@ -1053,36 +1053,36 @@ remove_multi_edges(raft::handle_t const& handle,
                    bool keep_min_value_edge = false);
 
 /**
- * @brief Shuffle external vertex ids and values to the proper GPU.
+ * @brief Shuffle external vertex ids to the proper GPU.
  * *
  * @tparam vertex_t    Type of vertex identifiers. Needs to be an integral type.
- * @tparam value_t     Type of values. Needs to an integral or floating point type.
  *
  * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
  * handles to various CUDA libraries) to run graph algorithms.
  * @param vertices  List of vertex ids
- * @param values List of values
- * @return Tuple of vectors storing vertex ids and values mapped to this GPU.
+ * @return Vector of vertex ids mapped to this GPU.
  */
-template <typename vertex_t, typename value_t>
-std::tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<value_t>>
-shuffle_external_vertices_and_values(raft::handle_t const& handle,
-                                     rmm::device_uvector<vertex_t>&& vertices,
-                                     rmm::device_uvector<value_t>&& values);
+template <typename vertex_t>
+rmm::device_uvector<vertex_t> shuffle_external_vertices(raft::handle_t const& handle,
+                                                        rmm::device_uvector<vertex_t>&& vertices);
 
 /**
- * @brief Shuffle external vertex ids to the proper GPU.
+ * @brief Shuffle external vertex ids and values to the proper GPU.
  * *
  * @tparam vertex_t    Type of vertex identifiers. Needs to be an integral type.
+ * @tparam value_t     Type of values. Needs to an integral or floating point type.
  *
  * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
  * handles to various CUDA libraries) to run graph algorithms.
  * @param vertices  List of vertex ids
- * @return Vector of vertex ids mapped to this GPU.
+ * @param values List of values
+ * @return Tuple of vectors storing vertex ids and values mapped to this GPU.
  */
-template <typename vertex_t>
-rmm::device_uvector<vertex_t> shuffle_external_vertices(raft::handle_t const& handle,
-                                                        rmm::device_uvector<vertex_t>&& vertices);
+template <typename vertex_t, typename value_t>
+std::tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<value_t>>
+shuffle_external_vertex_value_pairs(raft::handle_t const& handle,
+                                    rmm::device_uvector<vertex_t>&& vertices,
+                                    rmm::device_uvector<value_t>&& values);
 
 /**
  * @brief Shuffle external edges to the proper GPU.
@@ -1096,16 +1096,22 @@ rmm::device_uvector<vertex_t> shuffle_external_vertices(raft::handle_t const& ha
  * @param edge_srcs  List of source vertex ids
  * @param edge_dsts  List of destination vertex ids
  * @param edge_weights  Optional list of edge weights
- * @return Tuple of vectors storing edge sources, destinations and optional weights mapped to this
- * GPU.
+ * @param edge_ids  Optional list of edge ids
+ * @param edge_types Optional list of edge types
+ * @return Tuple of vectors storing edge sources, destinations, optional weights,
+ *          optional edge ids, and optional edge types mapped to this GPU.
  */
-template <typename vertex_t, typename weight_t>
+template <typename vertex_t, typename edge_t, typename weight_t, typename edge_type_t>
 std::tuple<rmm::device_uvector<vertex_t>,
            rmm::device_uvector<vertex_t>,
-           std::optional<rmm::device_uvector<weight_t>>>
+           std::optional<rmm::device_uvector<weight_t>>,
+           std::optional<rmm::device_uvector<edge_t>>,
+           std::optional<rmm::device_uvector<edge_type_t>>>
 shuffle_external_edges(raft::handle_t const& handle,
                        rmm::device_uvector<vertex_t>&& edge_srcs,
                        rmm::device_uvector<vertex_t>&& edge_dsts,
-                       std::optional<rmm::device_uvector<weight_t>>&& edge_weights);
+                       std::optional<rmm::device_uvector<weight_t>>&& edge_weights,
+                       std::optional<rmm::device_uvector<edge_t>>&& edge_ids,
+                       std::optional<rmm::device_uvector<edge_type_t>>&& edge_types);
 
 }  // namespace cugraph
diff --git a/cpp/src/detail/shuffle_vertex_pairs.cu b/cpp/src/detail/shuffle_vertex_pairs.cu
index 62e09ad93d..b473796aa9 100644
--- a/cpp/src/detail/shuffle_vertex_pairs.cu
+++ b/cpp/src/detail/shuffle_vertex_pairs.cu
@@ -520,68 +520,105 @@ shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
 
 }  // namespace detail
 
-template <typename vertex_t, typename weight_t>
+template <typename vertex_t, typename edge_t, typename weight_t, typename edge_type_t>
 std::tuple<rmm::device_uvector<vertex_t>,
            rmm::device_uvector<vertex_t>,
-           std::optional<rmm::device_uvector<weight_t>>>
+           std::optional<rmm::device_uvector<weight_t>>,
+           std::optional<rmm::device_uvector<edge_t>>,
+           std::optional<rmm::device_uvector<edge_type_t>>>
 shuffle_external_edges(raft::handle_t const& handle,
                        rmm::device_uvector<vertex_t>&& edge_srcs,
                        rmm::device_uvector<vertex_t>&& edge_dsts,
-                       std::optional<rmm::device_uvector<weight_t>>&& edge_weights)
+                       std::optional<rmm::device_uvector<weight_t>>&& edge_weights,
+                       std::optional<rmm::device_uvector<edge_t>>&& edge_ids,
+                       std::optional<rmm::device_uvector<edge_type_t>>&& edge_types)
 {
   auto& comm                 = handle.get_comms();
   auto const comm_size       = comm.get_size();
   auto& major_comm           = handle.get_subcomm(cugraph::partition_manager::major_comm_name());
   auto const major_comm_size = major_comm.get_size();
   auto& minor_comm           = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
-
   auto const minor_comm_size = minor_comm.get_size();
 
-  std::tie(edge_srcs, edge_dsts, edge_weights, std::ignore, std::ignore) =
-    cugraph::detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning<
-      vertex_t,
-      vertex_t,
-      weight_t,
-      int32_t>(handle,
-               std::move(edge_srcs),
-               std::move(edge_dsts),
-               std::move(edge_weights),
-               std::nullopt,
-               std::nullopt);
-
-  return std::make_tuple(std::move(edge_srcs), std::move(edge_dsts), std::move(edge_weights));
+  return detail::shuffle_ext_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning(
+    handle,
+    std::move(edge_srcs),
+    std::move(edge_dsts),
+    std::move(edge_weights),
+    std::move(edge_ids),
+    std::move(edge_types));
 }
 
 template std::tuple<rmm::device_uvector<int32_t>,
                     rmm::device_uvector<int32_t>,
-                    std::optional<rmm::device_uvector<float>>>
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>>
 shuffle_external_edges(raft::handle_t const& handle,
-                       rmm::device_uvector<int32_t>&& edge_srcs,
-                       rmm::device_uvector<int32_t>&& edge_dsts,
-                       std::optional<rmm::device_uvector<float>>&& edge_wgts);
+                       rmm::device_uvector<int32_t>&& majors,
+                       rmm::device_uvector<int32_t>&& minors,
+                       std::optional<rmm::device_uvector<float>>&& weights,
+                       std::optional<rmm::device_uvector<int32_t>>&& edge_ids,
+                       std::optional<rmm::device_uvector<int32_t>>&& edge_types);
 
 template std::tuple<rmm::device_uvector<int32_t>,
                     rmm::device_uvector<int32_t>,
-                    std::optional<rmm::device_uvector<double>>>
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int32_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>>
 shuffle_external_edges(raft::handle_t const& handle,
-                       rmm::device_uvector<int32_t>&& edge_srcs,
-                       rmm::device_uvector<int32_t>&& edge_dsts,
-                       std::optional<rmm::device_uvector<double>>&& edge_wgts);
+                       rmm::device_uvector<int32_t>&& majors,
+                       rmm::device_uvector<int32_t>&& minors,
+                       std::optional<rmm::device_uvector<double>>&& weights,
+                       std::optional<rmm::device_uvector<int32_t>>&& edge_ids,
+                       std::optional<rmm::device_uvector<int32_t>>&& edge_types);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>>
+shuffle_external_edges(raft::handle_t const& handle,
+                       rmm::device_uvector<int32_t>&& majors,
+                       rmm::device_uvector<int32_t>&& minors,
+                       std::optional<rmm::device_uvector<float>>&& weights,
+                       std::optional<rmm::device_uvector<int64_t>>&& edge_ids,
+                       std::optional<rmm::device_uvector<int32_t>>&& edge_types);
+
+template std::tuple<rmm::device_uvector<int32_t>,
+                    rmm::device_uvector<int32_t>,
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>>
+shuffle_external_edges(raft::handle_t const& handle,
+                       rmm::device_uvector<int32_t>&& majors,
+                       rmm::device_uvector<int32_t>&& minors,
+                       std::optional<rmm::device_uvector<double>>&& weights,
+                       std::optional<rmm::device_uvector<int64_t>>&& edge_ids,
+                       std::optional<rmm::device_uvector<int32_t>>&& edge_types);
 
 template std::tuple<rmm::device_uvector<int64_t>,
                     rmm::device_uvector<int64_t>,
-                    std::optional<rmm::device_uvector<float>>>
+                    std::optional<rmm::device_uvector<float>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>>
 shuffle_external_edges(raft::handle_t const& handle,
-                       rmm::device_uvector<int64_t>&& edge_srcs,
-                       rmm::device_uvector<int64_t>&& edge_dsts,
-                       std::optional<rmm::device_uvector<float>>&& edge_wgts);
+                       rmm::device_uvector<int64_t>&& majors,
+                       rmm::device_uvector<int64_t>&& minors,
+                       std::optional<rmm::device_uvector<float>>&& weights,
+                       std::optional<rmm::device_uvector<int64_t>>&& edge_ids,
+                       std::optional<rmm::device_uvector<int32_t>>&& edge_types);
 
 template std::tuple<rmm::device_uvector<int64_t>,
                     rmm::device_uvector<int64_t>,
-                    std::optional<rmm::device_uvector<double>>>
+                    std::optional<rmm::device_uvector<double>>,
+                    std::optional<rmm::device_uvector<int64_t>>,
+                    std::optional<rmm::device_uvector<int32_t>>>
 shuffle_external_edges(raft::handle_t const& handle,
-                       rmm::device_uvector<int64_t>&& edge_srcs,
-                       rmm::device_uvector<int64_t>&& edge_dsts,
-                       std::optional<rmm::device_uvector<double>>&& edge_wgts);
+                       rmm::device_uvector<int64_t>&& majors,
+                       rmm::device_uvector<int64_t>&& minors,
+                       std::optional<rmm::device_uvector<double>>&& weights,
+                       std::optional<rmm::device_uvector<int64_t>>&& edge_ids,
+                       std::optional<rmm::device_uvector<int32_t>>&& edge_types);
 
 }  // namespace cugraph
diff --git a/cpp/src/detail/shuffle_vertices.cu b/cpp/src/detail/shuffle_vertices.cu
index c01251b134..b396201f50 100644
--- a/cpp/src/detail/shuffle_vertices.cu
+++ b/cpp/src/detail/shuffle_vertices.cu
@@ -252,58 +252,58 @@ shuffle_ext_vertex_value_pairs_to_local_gpu_by_vertex_partitioning(
 
 template <typename vertex_t, typename value_t>
 std::tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<value_t>>
-shuffle_external_vertices_and_values(raft::handle_t const& handle,
-                                     rmm::device_uvector<vertex_t>&& vertices,
-                                     rmm::device_uvector<value_t>&& values)
+shuffle_external_vertex_value_pairs(raft::handle_t const& handle,
+                                    rmm::device_uvector<vertex_t>&& vertices,
+                                    rmm::device_uvector<value_t>&& values)
 {
   return detail::shuffle_ext_vertex_value_pairs_to_local_gpu_by_vertex_partitioning(
     handle, std::move(vertices), std::move(values));
 }
 
 template std::tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<int32_t>>
-shuffle_external_vertices_and_values(raft::handle_t const& handle,
-                                     rmm::device_uvector<int32_t>&& vertices,
-                                     rmm::device_uvector<int32_t>&& values);
+shuffle_external_vertex_value_pairs(raft::handle_t const& handle,
+                                    rmm::device_uvector<int32_t>&& vertices,
+                                    rmm::device_uvector<int32_t>&& values);
 
 template std::tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<size_t>>
-shuffle_external_vertices_and_values(raft::handle_t const& handle,
-                                     rmm::device_uvector<int32_t>&& vertices,
-                                     rmm::device_uvector<size_t>&& values);
+shuffle_external_vertex_value_pairs(raft::handle_t const& handle,
+                                    rmm::device_uvector<int32_t>&& vertices,
+                                    rmm::device_uvector<size_t>&& values);
 
 template std::tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<float>>
-shuffle_external_vertices_and_values(raft::handle_t const& handle,
-                                     rmm::device_uvector<int32_t>&& vertices,
-                                     rmm::device_uvector<float>&& values);
+shuffle_external_vertex_value_pairs(raft::handle_t const& handle,
+                                    rmm::device_uvector<int32_t>&& vertices,
+                                    rmm::device_uvector<float>&& values);
 
 template std::tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<double>>
-shuffle_external_vertices_and_values(raft::handle_t const& handle,
-                                     rmm::device_uvector<int32_t>&& vertices,
-                                     rmm::device_uvector<double>&& values);
+shuffle_external_vertex_value_pairs(raft::handle_t const& handle,
+                                    rmm::device_uvector<int32_t>&& vertices,
+                                    rmm::device_uvector<double>&& values);
 
 template std::tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<int32_t>>
-shuffle_external_vertices_and_values(raft::handle_t const& handle,
-                                     rmm::device_uvector<int64_t>&& vertices,
-                                     rmm::device_uvector<int32_t>&& values);
+shuffle_external_vertex_value_pairs(raft::handle_t const& handle,
+                                    rmm::device_uvector<int64_t>&& vertices,
+                                    rmm::device_uvector<int32_t>&& values);
 
 template std::tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<int64_t>>
-shuffle_external_vertices_and_values(raft::handle_t const& handle,
-                                     rmm::device_uvector<int64_t>&& vertices,
-                                     rmm::device_uvector<int64_t>&& values);
+shuffle_external_vertex_value_pairs(raft::handle_t const& handle,
+                                    rmm::device_uvector<int64_t>&& vertices,
+                                    rmm::device_uvector<int64_t>&& values);
 
 template std::tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<size_t>>
-shuffle_external_vertices_and_values(raft::handle_t const& handle,
-                                     rmm::device_uvector<int64_t>&& vertices,
-                                     rmm::device_uvector<size_t>&& values);
+shuffle_external_vertex_value_pairs(raft::handle_t const& handle,
+                                    rmm::device_uvector<int64_t>&& vertices,
+                                    rmm::device_uvector<size_t>&& values);
 
 template std::tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<float>>
-shuffle_external_vertices_and_values(raft::handle_t const& handle,
-                                     rmm::device_uvector<int64_t>&& vertices,
-                                     rmm::device_uvector<float>&& values);
+shuffle_external_vertex_value_pairs(raft::handle_t const& handle,
+                                    rmm::device_uvector<int64_t>&& vertices,
+                                    rmm::device_uvector<float>&& values);
 
 template std::tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<double>>
-shuffle_external_vertices_and_values(raft::handle_t const& handle,
-                                     rmm::device_uvector<int64_t>&& vertices,
-                                     rmm::device_uvector<double>&& values);
+shuffle_external_vertex_value_pairs(raft::handle_t const& handle,
+                                    rmm::device_uvector<int64_t>&& vertices,
+                                    rmm::device_uvector<double>&& values);
 
 template <typename vertex_t>
 rmm::device_uvector<vertex_t> shuffle_external_vertices(raft::handle_t const& handle,

From 289bcbadabe1108b044cc1d9528ee85e92489cb9 Mon Sep 17 00:00:00 2001
From: Naim <naim@uib.no>
Date: Mon, 11 Mar 2024 22:13:58 +0100
Subject: [PATCH 5/6] Move shuffle_vertex_pairs.cu and shuffle_vertices.cu to
 utilities

---
 cpp/CMakeLists.txt                                    | 4 ++--
 cpp/src/{detail => utilities}/shuffle_vertex_pairs.cu | 0
 cpp/src/{detail => utilities}/shuffle_vertices.cu     | 0
 3 files changed, 2 insertions(+), 2 deletions(-)
 rename cpp/src/{detail => utilities}/shuffle_vertex_pairs.cu (100%)
 rename cpp/src/{detail => utilities}/shuffle_vertices.cu (100%)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index a3392627fb..3131404712 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -186,9 +186,9 @@ endif()
 #       which should give us a better parallel schedule.
 
 set(CUGRAPH_SOURCES
-    src/detail/shuffle_vertices.cu
+    src/utilities/shuffle_vertices.cu
     src/detail/permute_range.cu
-    src/detail/shuffle_vertex_pairs.cu
+    src/utilities/shuffle_vertex_pairs.cu
     src/detail/collect_local_vertex_values.cu
     src/detail/groupby_and_count.cu
     src/detail/collect_comm_wrapper.cu
diff --git a/cpp/src/detail/shuffle_vertex_pairs.cu b/cpp/src/utilities/shuffle_vertex_pairs.cu
similarity index 100%
rename from cpp/src/detail/shuffle_vertex_pairs.cu
rename to cpp/src/utilities/shuffle_vertex_pairs.cu
diff --git a/cpp/src/detail/shuffle_vertices.cu b/cpp/src/utilities/shuffle_vertices.cu
similarity index 100%
rename from cpp/src/detail/shuffle_vertices.cu
rename to cpp/src/utilities/shuffle_vertices.cu

From aaba770445351ed1a409142de33cfa8d4abe9f25 Mon Sep 17 00:00:00 2001
From: Naim <naim@uib.no>
Date: Mon, 11 Mar 2024 22:28:38 +0100
Subject: [PATCH 6/6] Fix doc string

---
 cpp/include/cugraph/graph_functions.hpp | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/cpp/include/cugraph/graph_functions.hpp b/cpp/include/cugraph/graph_functions.hpp
index 5229e536dd..6d4470e825 100644
--- a/cpp/include/cugraph/graph_functions.hpp
+++ b/cpp/include/cugraph/graph_functions.hpp
@@ -1054,7 +1054,7 @@ remove_multi_edges(raft::handle_t const& handle,
 
 /**
  * @brief Shuffle external vertex ids to the proper GPU.
- * *
+ *
  * @tparam vertex_t    Type of vertex identifiers. Needs to be an integral type.
  *
  * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
@@ -1068,9 +1068,10 @@ rmm::device_uvector<vertex_t> shuffle_external_vertices(raft::handle_t const& ha
 
 /**
  * @brief Shuffle external vertex ids and values to the proper GPU.
- * *
- * @tparam vertex_t    Type of vertex identifiers. Needs to be an integral type.
- * @tparam value_t     Type of values. Needs to an integral or floating point type.
+ *
+ * @tparam vertex_t   Type of vertex identifiers. Needs to be an integral type.
+ * @tparam value_t    Type of values. Currently supported types are int32_t,
+ * int64_t, size_t, float and double.
  *
  * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
  * handles to various CUDA libraries) to run graph algorithms.
@@ -1086,7 +1087,7 @@ shuffle_external_vertex_value_pairs(raft::handle_t const& handle,
 
 /**
  * @brief Shuffle external edges to the proper GPU.
- * *
+ *
  * @tparam vertex_t    Type of vertex identifiers. Needs to be an integral type.
  * @tparam edge_t      Type of edge identifiers. Needs to be an integral type.
  * @tparam weight_t    Type of edge weight. Currently float and double are supported.