Merge remote-tracking branch 'upstream/branch-25.02' into cudf/_lib/types/pt1
rapidsai · Dec 19, 2024 · 4a47a33 · 4a47a33
2 parents 23f7991 + 989fac4
commit 4a47a33
Show file tree

Hide file tree

Showing 75 changed files with 1,694 additions and 969 deletions.
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
@@ -13,6 +13,7 @@ jobs:
   # Please keep pr-builder as the top job here
   pr-builder:
     needs:
+      - check-nightly-ci
       - changed-files
       - checks
       - conda-cpp-build
@@ -54,6 +55,18 @@ jobs:
       - name: Telemetry setup
         if: ${{ vars.TELEMETRY_ENABLED == 'true' }}
         uses: rapidsai/shared-actions/telemetry-dispatch-stash-base-env-vars@main
+  check-nightly-ci:
+    # Switch to ubuntu-latest once it defaults to a version of Ubuntu that
+    # provides at least Python 3.11 (see
+    # https://docs.python.org/3/library/datetime.html#datetime.date.fromisoformat)
+    runs-on: ubuntu-24.04
+    env:
+      RAPIDS_GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+    steps:
+      - name: Check if nightly CI is passing
+        uses: rapidsai/shared-actions/check_nightly_success/dispatch@main
+        with:
+          repo: cudf
   changed-files:
     secrets: inherit
     needs: telemetry-setup
@@ -328,16 +341,11 @@ jobs:
         run_script: "ci/cudf_pandas_scripts/pandas-tests/diff.sh"
 
   telemetry-summarize:
-    runs-on: ubuntu-latest
+    # This job must use a self-hosted runner to record telemetry traces.
+    runs-on: linux-amd64-cpu4
     needs: pr-builder
     if: ${{ vars.TELEMETRY_ENABLED == 'true' && !cancelled() }}
     continue-on-error: true
     steps:
-      - name: Load stashed telemetry env vars
-        uses: rapidsai/shared-actions/telemetry-dispatch-load-base-env-vars@main
-        with:
-            load_service_name: true
       - name: Telemetry summarize
-        uses: rapidsai/shared-actions/telemetry-dispatch-write-summary@main
-        with:
-          cert_concat: "${{ secrets.OTEL_EXPORTER_OTLP_CA_CERTIFICATE }};${{ secrets.OTEL_EXPORTER_OTLP_CLIENT_CERTIFICATE }};${{ secrets.OTEL_EXPORTER_OTLP_CLIENT_KEY }}"
+        uses: rapidsai/shared-actions/telemetry-dispatch-summarize@main
diff --git a/cpp/cmake/thirdparty/get_nanoarrow.cmake b/cpp/cmake/thirdparty/get_nanoarrow.cmake
@@ -14,11 +14,6 @@
 
 # This function finds nanoarrow and sets any additional necessary environment variables.
 function(find_and_configure_nanoarrow)
-  include(${rapids-cmake-dir}/cpm/package_override.cmake)
-
-  set(cudf_patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches")
-  rapids_cpm_package_override("${cudf_patch_dir}/nanoarrow_override.json")
-
   if(NOT BUILD_SHARED_LIBS)
     set(_exclude_from_all EXCLUDE_FROM_ALL FALSE)
   else()
@@ -31,6 +26,9 @@ function(find_and_configure_nanoarrow)
     nanoarrow 0.6.0.dev
     GLOBAL_TARGETS nanoarrow
     CPM_ARGS
+    GIT_REPOSITORY https://github.com/apache/arrow-nanoarrow.git
+    GIT_TAG 1e2664a70ec14907409cadcceb14d79b9670bcdb
+    GIT_SHALLOW FALSE
     OPTIONS "BUILD_SHARED_LIBS OFF" "NANOARROW_NAMESPACE cudf" ${_exclude_from_all}
   )
   set_target_properties(nanoarrow PROPERTIES POSITION_INDEPENDENT_CODE ON)

diff --git a/cpp/cmake/thirdparty/patches/nanoarrow_clang_tidy_compliance.diff b/cpp/cmake/thirdparty/patches/nanoarrow_clang_tidy_compliance.diff
diff --git a/cpp/cmake/thirdparty/patches/nanoarrow_override.json b/cpp/cmake/thirdparty/patches/nanoarrow_override.json
diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh
@@ -23,6 +23,7 @@
 #include <cudf/detail/utilities/assert.cuh>
 #include <cudf/detail/utilities/device_atomics.cuh>
 #include <cudf/table/table_view.hpp>
+#include <cudf/utilities/span.hpp>
 #include <cudf/utilities/traits.cuh>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -31,7 +32,6 @@
 #include <thrust/fill.h>
 
 #include <type_traits>
-#include <vector>
 
 namespace cudf {
 namespace detail {
@@ -216,12 +216,12 @@ struct identity_initializer {
  * @throw cudf::logic_error if column type is not fixed-width
  *
  * @param table The table of columns to initialize.
- * @param aggs A vector of aggregation operations corresponding to the table
+ * @param aggs A span of aggregation operations corresponding to the table
  * columns. The aggregations determine the identity value for each column.
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 void initialize_with_identity(mutable_table_view& table,
-                              std::vector<aggregation::Kind> const& aggs,
+                              host_span<cudf::aggregation::Kind const> aggs,
                               rmm::cuda_stream_view stream);
 
 }  // namespace detail

diff --git a/cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh b/cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh
@@ -57,62 +57,71 @@ struct MurmurHash3_x86_32 {
 };
 
 template <>
-hash_value_type __device__ inline MurmurHash3_x86_32<bool>::operator()(bool const& key) const
+MurmurHash3_x86_32<bool>::result_type __device__ inline MurmurHash3_x86_32<bool>::operator()(
+  bool const& key) const
 {
   return this->compute(static_cast<uint8_t>(key));
 }
 
 template <>
-hash_value_type __device__ inline MurmurHash3_x86_32<float>::operator()(float const& key) const
+MurmurHash3_x86_32<float>::result_type __device__ inline MurmurHash3_x86_32<float>::operator()(
+  float const& key) const
 {
   return this->compute(normalize_nans_and_zeros(key));
 }
 
 template <>
-hash_value_type __device__ inline MurmurHash3_x86_32<double>::operator()(double const& key) const
+MurmurHash3_x86_32<double>::result_type __device__ inline MurmurHash3_x86_32<double>::operator()(
+  double const& key) const
 {
   return this->compute(normalize_nans_and_zeros(key));
 }
 
 template <>
-hash_value_type __device__ inline MurmurHash3_x86_32<cudf::string_view>::operator()(
-  cudf::string_view const& key) const
+MurmurHash3_x86_32<cudf::string_view>::result_type
+  __device__ inline MurmurHash3_x86_32<cudf::string_view>::operator()(
+    cudf::string_view const& key) const
 {
   return this->compute_bytes(reinterpret_cast<cuda::std::byte const*>(key.data()),
                              key.size_bytes());
 }
 
 template <>
-hash_value_type __device__ inline MurmurHash3_x86_32<numeric::decimal32>::operator()(
-  numeric::decimal32 const& key) const
+MurmurHash3_x86_32<numeric::decimal32>::result_type
+  __device__ inline MurmurHash3_x86_32<numeric::decimal32>::operator()(
+    numeric::decimal32 const& key) const
 {
   return this->compute(key.value());
 }
 
 template <>
-hash_value_type __device__ inline MurmurHash3_x86_32<numeric::decimal64>::operator()(
-  numeric::decimal64 const& key) const
+MurmurHash3_x86_32<numeric::decimal64>::result_type
+  __device__ inline MurmurHash3_x86_32<numeric::decimal64>::operator()(
+    numeric::decimal64 const& key) const
 {
   return this->compute(key.value());
 }
 
 template <>
-hash_value_type __device__ inline MurmurHash3_x86_32<numeric::decimal128>::operator()(
-  numeric::decimal128 const& key) const
+MurmurHash3_x86_32<numeric::decimal128>::result_type
+  __device__ inline MurmurHash3_x86_32<numeric::decimal128>::operator()(
+    numeric::decimal128 const& key) const
 {
   return this->compute(key.value());
 }
 
 template <>
-hash_value_type __device__ inline MurmurHash3_x86_32<cudf::list_view>::operator()(
-  cudf::list_view const& key) const
+MurmurHash3_x86_32<cudf::list_view>::result_type
+  __device__ inline MurmurHash3_x86_32<cudf::list_view>::operator()(
+    cudf::list_view const& key) const
 {
   CUDF_UNREACHABLE("List column hashing is not supported");
 }
 
 template <>
-hash_value_type __device__ inline MurmurHash3_x86_32<cudf::struct_view>::operator()(
-  cudf::struct_view const& key) const
+MurmurHash3_x86_32<cudf::struct_view>::result_type
+  __device__ inline MurmurHash3_x86_32<cudf::struct_view>::operator()(
+    cudf::struct_view const& key) const
 {
   CUDF_UNREACHABLE("Direct hashing of struct_view is not supported");
 }

diff --git a/cpp/src/aggregation/aggregation.cu b/cpp/src/aggregation/aggregation.cu
@@ -17,15 +17,14 @@
 #include <cudf/detail/aggregation/aggregation.cuh>
 #include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/table/table_view.hpp>
+#include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 
-#include <vector>
-
 namespace cudf {
 namespace detail {
 void initialize_with_identity(mutable_table_view& table,
-                              std::vector<aggregation::Kind> const& aggs,
+                              host_span<cudf::aggregation::Kind const> aggs,
                               rmm::cuda_stream_view stream)
 {
   // TODO: Initialize all the columns in a single kernel instead of invoking one

diff --git a/cpp/src/groupby/hash/compute_aggregations.cuh b/cpp/src/groupby/hash/compute_aggregations.cuh
@@ -60,7 +60,7 @@ rmm::device_uvector<cudf::size_type> compute_aggregations(
   rmm::cuda_stream_view stream)
 {
   // flatten the aggs to a table that can be operated on by aggregate_row
-  auto [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests);
+  auto [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests, stream);
   auto const d_agg_kinds                   = cudf::detail::make_device_uvector_async(
     agg_kinds, stream, rmm::mr::get_current_device_resource());
 

diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.cu b/cpp/src/groupby/hash/compute_global_memory_aggs.cu
@@ -24,7 +24,7 @@ template rmm::device_uvector<cudf::size_type> compute_global_memory_aggs<global_
   bitmask_type const* row_bitmask,
   cudf::table_view const& flattened_values,
   cudf::aggregation::Kind const* d_agg_kinds,
-  std::vector<cudf::aggregation::Kind> const& agg_kinds,
+  host_span<cudf::aggregation::Kind const> agg_kinds,
   global_set_t& global_set,
   std::vector<std::unique_ptr<aggregation>>& aggregations,
   cudf::detail::result_cache* sparse_results,

diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.cuh b/cpp/src/groupby/hash/compute_global_memory_aggs.cuh
@@ -25,6 +25,7 @@
 #include <cudf/groupby.hpp>
 #include <cudf/table/table_device_view.cuh>
 #include <cudf/types.hpp>
+#include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
@@ -44,7 +45,7 @@ rmm::device_uvector<cudf::size_type> compute_global_memory_aggs(
   bitmask_type const* row_bitmask,
   cudf::table_view const& flattened_values,
   cudf::aggregation::Kind const* d_agg_kinds,
-  std::vector<cudf::aggregation::Kind> const& agg_kinds,
+  host_span<cudf::aggregation::Kind const> agg_kinds,
   SetType& global_set,
   std::vector<std::unique_ptr<aggregation>>& aggregations,
   cudf::detail::result_cache* sparse_results,

diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.hpp b/cpp/src/groupby/hash/compute_global_memory_aggs.hpp
@@ -19,6 +19,7 @@
 #include <cudf/groupby.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
@@ -34,7 +35,7 @@ rmm::device_uvector<cudf::size_type> compute_global_memory_aggs(
   bitmask_type const* row_bitmask,
   cudf::table_view const& flattened_values,
   cudf::aggregation::Kind const* d_agg_kinds,
-  std::vector<cudf::aggregation::Kind> const& agg_kinds,
+  host_span<cudf::aggregation::Kind const> agg_kinds,
   SetType& global_set,
   std::vector<std::unique_ptr<aggregation>>& aggregations,
   cudf::detail::result_cache* sparse_results,

diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu b/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu
@@ -24,7 +24,7 @@ template rmm::device_uvector<cudf::size_type> compute_global_memory_aggs<nullabl
   bitmask_type const* row_bitmask,
   cudf::table_view const& flattened_values,
   cudf::aggregation::Kind const* d_agg_kinds,
-  std::vector<cudf::aggregation::Kind> const& agg_kinds,
+  host_span<cudf::aggregation::Kind const> agg_kinds,
   nullable_global_set_t& global_set,
   std::vector<std::unique_ptr<aggregation>>& aggregations,
   cudf::detail::result_cache* sparse_results,

diff --git a/cpp/src/groupby/hash/compute_groupby.cu b/cpp/src/groupby/hash/compute_groupby.cu
@@ -61,7 +61,7 @@ std::unique_ptr<table> compute_groupby(table_view const& keys,
     d_row_equal,
     probing_scheme_t{d_row_hash},
     cuco::thread_scope_device,
-    cuco::storage<GROUPBY_WINDOW_SIZE>{},
+    cuco::storage<GROUPBY_BUCKET_SIZE>{},
     cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
     stream.value()};
 

diff --git a/cpp/src/groupby/hash/compute_mapping_indices.cuh b/cpp/src/groupby/hash/compute_mapping_indices.cuh
@@ -106,15 +106,15 @@ CUDF_KERNEL void mapping_indices_kernel(cudf::size_type num_input_rows,
   __shared__ cudf::size_type shared_set_indices[GROUPBY_SHM_MAX_ELEMENTS];
 
   // Shared set initialization
-  __shared__ cuco::window<cudf::size_type, GROUPBY_WINDOW_SIZE> windows[window_extent.value()];
+  __shared__ cuco::bucket<cudf::size_type, GROUPBY_BUCKET_SIZE> buckets[bucket_extent.value()];
 
   auto raw_set = cuco::static_set_ref{
     cuco::empty_key<cudf::size_type>{cudf::detail::CUDF_SIZE_TYPE_SENTINEL},
     global_set.key_eq(),
     probing_scheme_t{global_set.hash_function()},
     cuco::thread_scope_block,
-    cuco::aow_storage_ref<cudf::size_type, GROUPBY_WINDOW_SIZE, decltype(window_extent)>{
-      window_extent, windows}};
+    cuco::bucket_storage_ref<cudf::size_type, GROUPBY_BUCKET_SIZE, decltype(bucket_extent)>{
+      bucket_extent, buckets}};
   auto shared_set = raw_set.rebind_operators(cuco::insert_and_find);
 
   auto const block = cooperative_groups::this_thread_block();

diff --git a/cpp/src/groupby/hash/create_sparse_results_table.cu b/cpp/src/groupby/hash/create_sparse_results_table.cu
@@ -23,6 +23,7 @@
 #include <cudf/detail/aggregation/aggregation.cuh>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
@@ -48,7 +49,7 @@ void extract_populated_keys(SetType const& key_set,
 template <typename GlobalSetType>
 cudf::table create_sparse_results_table(cudf::table_view const& flattened_values,
                                         cudf::aggregation::Kind const* d_agg_kinds,
-                                        std::vector<cudf::aggregation::Kind> agg_kinds,
+                                        host_span<cudf::aggregation::Kind const> agg_kinds,
                                         bool direct_aggregations,
                                         GlobalSetType const& global_set,
                                         rmm::device_uvector<cudf::size_type>& populated_keys,
@@ -107,7 +108,7 @@ template void extract_populated_keys<nullable_global_set_t>(
 template cudf::table create_sparse_results_table<global_set_t>(
   cudf::table_view const& flattened_values,
   cudf::aggregation::Kind const* d_agg_kinds,
-  std::vector<cudf::aggregation::Kind> agg_kinds,
+  host_span<cudf::aggregation::Kind const> agg_kinds,
   bool direct_aggregations,
   global_set_t const& global_set,
   rmm::device_uvector<cudf::size_type>& populated_keys,
@@ -116,7 +117,7 @@ template cudf::table create_sparse_results_table<global_set_t>(
 template cudf::table create_sparse_results_table<nullable_global_set_t>(
   cudf::table_view const& flattened_values,
   cudf::aggregation::Kind const* d_agg_kinds,
-  std::vector<cudf::aggregation::Kind> agg_kinds,
+  host_span<cudf::aggregation::Kind const> agg_kinds,
   bool direct_aggregations,
   nullable_global_set_t const& global_set,
   rmm::device_uvector<cudf::size_type>& populated_keys,