merge conflict and address review

rapidsai · Dec 4, 2024 · b4b76e4 · b4b76e4
2 parents 68f36a6 + 1b82963
commit b4b76e4
Show file tree

Hide file tree

Showing 64 changed files with 465 additions and 328 deletions.
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -19,7 +19,7 @@ dependencies:
 - cramjam
 - cubinlinker
 - cuda-nvtx=11.8
-- cuda-python>=11.7.1,<12.0a0,<=11.8.3
+- cuda-python>=11.7.1,<12.0a0
 - cuda-sanitizer-api=11.8.86
 - cuda-version=11.8
 - cudatoolkit

diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -21,7 +21,7 @@ dependencies:
 - cuda-nvcc
 - cuda-nvrtc-dev
 - cuda-nvtx-dev
-- cuda-python>=12.0,<13.0a0,<=12.6.0
+- cuda-python>=12.0,<13.0a0
 - cuda-sanitizer-api
 - cuda-version=12.5
 - cupy>=12.0.0

diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
@@ -91,7 +91,7 @@ requirements:
     - cudatoolkit
     - ptxcompiler >=0.7.0
     - cubinlinker  # CUDA enhanced compatibility.
-    - cuda-python >=11.7.1,<12.0a0,<=11.8.3
+    - cuda-python >=11.7.1,<12.0a0
     {% else %}
     - cuda-cudart
     - libcufile  # [linux64]
@@ -100,7 +100,7 @@ requirements:
     # TODO: Add nvjitlink here
     # xref: https://github.com/rapidsai/cudf/issues/12822
     - cuda-nvrtc
-    - cuda-python >=12.0,<13.0a0,<=12.6.0
+    - cuda-python >=12.0,<13.0a0
     - pynvjitlink
     {% endif %}
     - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}

diff --git a/conda/recipes/pylibcudf/meta.yaml b/conda/recipes/pylibcudf/meta.yaml
@@ -83,9 +83,9 @@ requirements:
     - {{ pin_compatible('rmm', max_pin='x.x') }}
     - fsspec >=0.6.0
     {% if cuda_major == "11" %}
-    - cuda-python >=11.7.1,<12.0a0,<=11.8.3
+    - cuda-python >=11.7.1,<12.0a0
     {% else %}
-    - cuda-python >=12.0,<13.0a0,<=12.6.0
+    - cuda-python >=12.0,<13.0a0
     {% endif %}
     - nvtx >=0.2.1
     - packaging

diff --git a/cpp/cmake/thirdparty/patches/cccl_override.json b/cpp/cmake/thirdparty/patches/cccl_override.json
@@ -3,11 +3,6 @@
   "packages" : {
     "CCCL" : {
       "patches" : [
-        {
-          "file" : "${current_json_dir}/cccl_symbol_visibility.diff",
-          "issue" : "Correct symbol visibility issues in libcudacxx [https://github.com/NVIDIA/cccl/pull/1832/]",
-          "fixed_in" : "2.6"
-        },
         {
           "file" : "${current_json_dir}/thrust_disable_64bit_dispatching.diff",
           "issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]",

diff --git a/cpp/cmake/thirdparty/patches/cccl_symbol_visibility.diff b/cpp/cmake/thirdparty/patches/cccl_symbol_visibility.diff
diff --git a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff
@@ -1,25 +1,59 @@
 diff --git a/thrust/thrust/system/cuda/detail/dispatch.h b/thrust/thrust/system/cuda/detail/dispatch.h
-index 2a3cc4e33..8fb337b26 100644
+index 971b93d62..0d6b25b07 100644
 --- a/thrust/thrust/system/cuda/detail/dispatch.h
 +++ b/thrust/thrust/system/cuda/detail/dispatch.h
-@@ -44,8 +44,7 @@
-   }                                                                                   \
-   else                                                                                \
-   {                                                                                   \
--    auto THRUST_PP_CAT2(count, _fixed) = static_cast<thrust::detail::int64_t>(count); \
--    status                             = call arguments;                              \
+@@ -36,16 +36,15 @@
+  * that callables for both branches consist of the same tokens, and is intended to be used with Thrust-style dispatch
+  * interfaces, that always deduce the size type from the arguments.
+  */
+-#define THRUST_INDEX_TYPE_DISPATCH(status, call, count, arguments)         \
+-  if (count <= thrust::detail::integer_traits<std::int32_t>::const_max)    \
+-  {                                                                        \
+-    auto THRUST_PP_CAT2(count, _fixed) = static_cast<std::int32_t>(count); \
+-    status                             = call arguments;                   \
+-  }                                                                        \
+-  else                                                                     \
+-  {                                                                        \
+-    auto THRUST_PP_CAT2(count, _fixed) = static_cast<std::int64_t>(count); \
+-    status                             = call arguments;                   \
++#define THRUST_INDEX_TYPE_DISPATCH(status, call, count, arguments)                                 \
++  if (count <= thrust::detail::integer_traits<std::int32_t>::const_max)                            \
++  {                                                                                                \
++    auto THRUST_PP_CAT2(count, _fixed) = static_cast<std::int32_t>(count);                         \
++    status                             = call arguments;                                           \
++  }                                                                                                \
++  else                                                                                             \
++  {                                                                                                \
 +    throw std::runtime_error("THRUST_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \
    }
- 
+
  /**
-@@ -66,9 +65,7 @@
-   }                                                                                          \
-   else                                                                                       \
-   {                                                                                          \
--    auto THRUST_PP_CAT2(count1, _fixed) = static_cast<thrust::detail::int64_t>(count1);      \
--    auto THRUST_PP_CAT2(count2, _fixed) = static_cast<thrust::detail::int64_t>(count2);      \
--    status                              = call arguments;                                    \
+@@ -55,18 +54,16 @@
+  *
+  * This version of the macro supports providing two count variables, which is necessary for set algorithms.
+  */
+-#define THRUST_DOUBLE_INDEX_TYPE_DISPATCH(status, call, count1, count2, arguments) \
+-  if (count1 + count2 <= thrust::detail::integer_traits<std::int32_t>::const_max)  \
+-  {                                                                                \
+-    auto THRUST_PP_CAT2(count1, _fixed) = static_cast<std::int32_t>(count1);       \
+-    auto THRUST_PP_CAT2(count2, _fixed) = static_cast<std::int32_t>(count2);       \
+-    status                              = call arguments;                          \
+-  }                                                                                \
+-  else                                                                             \
+-  {                                                                                \
+-    auto THRUST_PP_CAT2(count1, _fixed) = static_cast<std::int64_t>(count1);       \
+-    auto THRUST_PP_CAT2(count2, _fixed) = static_cast<std::int64_t>(count2);       \
+-    status                              = call arguments;                          \
++#define THRUST_DOUBLE_INDEX_TYPE_DISPATCH(status, call, count1, count2, arguments)                        \
++  if (count1 + count2 <= thrust::detail::integer_traits<std::int32_t>::const_max)                         \
++  {                                                                                                       \
++    auto THRUST_PP_CAT2(count1, _fixed) = static_cast<std::int32_t>(count1);                              \
++    auto THRUST_PP_CAT2(count2, _fixed) = static_cast<std::int32_t>(count2);                              \
++    status                              = call arguments;                                                 \
++  }                                                                                                       \
++  else                                                                                                    \
++  {                                                                                                       \
 +    throw std::runtime_error("THRUST_DOUBLE_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \
    }
+
  /**
-  * Dispatch between 32-bit and 64-bit index based versions of the same algorithm
diff --git a/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff b/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff
@@ -1,20 +1,20 @@
 diff --git a/cub/cub/block/block_merge_sort.cuh b/cub/cub/block/block_merge_sort.cuh
-index eb76ebb0b..c6c529a50 100644
+index 29510db5e..cf57e5786 100644
 --- a/cub/cub/block/block_merge_sort.cuh
 +++ b/cub/cub/block/block_merge_sort.cuh
 @@ -95,7 +95,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void SerialMerge(
    KeyT key1 = keys_shared[keys1_beg];
    KeyT key2 = keys_shared[keys2_beg];
- 
+
 -#pragma unroll
 +#pragma unroll 1
    for (int item = 0; item < ITEMS_PER_THREAD; ++item)
    {
-     bool p = (keys2_beg < keys2_end) && ((keys1_beg >= keys1_end) || compare_op(key2, key1));
-@@ -376,7 +376,7 @@ public:
+     const bool p  = (keys2_beg < keys2_end) && ((keys1_beg >= keys1_end) || compare_op(key2, key1));
+@@ -374,7 +374,7 @@ public:
        //
        KeyT max_key = oob_default;
- 
+
 -#pragma unroll
 +#pragma unroll 1
        for (int item = 1; item < ITEMS_PER_THREAD; ++item)
@@ -27,7 +27,7 @@ index 7d9e8622f..da5627306 100644
 @@ -87,10 +87,10 @@ StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&items)[ITEMS_PER_THRE
  {
    constexpr bool KEYS_ONLY = ::cuda::std::is_same<ValueT, NullType>::value;
- 
+
 -#pragma unroll
 +#pragma unroll 1
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)

diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh
@@ -460,18 +460,18 @@ class alignas(16) column_device_view : public detail::column_device_view_base {
    */
   struct index_element_fn {
     template <typename IndexType,
-              CUDF_ENABLE_IF(is_index_type<IndexType>() and std::is_unsigned_v<IndexType>)>
+              CUDF_ENABLE_IF(is_index_type<IndexType>() and std::is_signed_v<IndexType>)>
     __device__ size_type operator()(column_device_view const& indices, size_type index)
     {
       return static_cast<size_type>(indices.element<IndexType>(index));
     }
 
     template <typename IndexType,
               typename... Args,
-              CUDF_ENABLE_IF(not(is_index_type<IndexType>() and std::is_unsigned_v<IndexType>))>
+              CUDF_ENABLE_IF(not(is_index_type<IndexType>() and std::is_signed_v<IndexType>))>
     __device__ size_type operator()(Args&&... args)
     {
-      CUDF_UNREACHABLE("dictionary indices must be an unsigned integral type");
+      CUDF_UNREACHABLE("dictionary indices must be a signed integral type");
     }
   };
 

diff --git a/cpp/include/cudf/detail/copy_range.cuh b/cpp/include/cudf/detail/copy_range.cuh
@@ -56,15 +56,15 @@ CUDF_KERNEL void copy_range_kernel(SourceValueIterator source_value_begin,
   constexpr cudf::size_type leader_lane{0};
   int const lane_id = threadIdx.x % warp_size;
 
-  cudf::size_type const tid = threadIdx.x + blockIdx.x * blockDim.x;
-  int const warp_id         = tid / warp_size;
+  auto const tid     = cudf::detail::grid_1d::global_thread_id();
+  auto const warp_id = tid / warp_size;
 
   cudf::size_type const offset         = target.offset();
   cudf::size_type const begin_mask_idx = cudf::word_index(offset + target_begin);
   cudf::size_type const end_mask_idx   = cudf::word_index(offset + target_end);
 
   cudf::size_type mask_idx             = begin_mask_idx + warp_id;
-  cudf::size_type const masks_per_grid = gridDim.x * blockDim.x / warp_size;
+  cudf::size_type const masks_per_grid = cudf::detail::grid_1d::grid_stride() / warp_size;
 
   cudf::size_type target_offset = begin_mask_idx * warp_size - (offset + target_begin);
   cudf::size_type source_idx    = tid + target_offset;
@@ -92,7 +92,7 @@ CUDF_KERNEL void copy_range_kernel(SourceValueIterator source_value_begin,
       }
     }
 
-    source_idx += blockDim.x * gridDim.x;
+    source_idx += cudf::detail::grid_1d::grid_stride();
     mask_idx += masks_per_grid;
   }
 

diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh
@@ -67,15 +67,15 @@ CUDF_KERNEL void offset_bitmask_binop(Binop op,
                                       size_type source_size_bits,
                                       size_type* count_ptr)
 {
-  auto const tid = threadIdx.x + blockIdx.x * blockDim.x;
+  auto const tid = cudf::detail::grid_1d::global_thread_id();
 
   auto const last_bit_index  = source_size_bits - 1;
   auto const last_word_index = cudf::word_index(last_bit_index);
 
   size_type thread_count = 0;
 
   for (size_type destination_word_index = tid; destination_word_index < destination.size();
-       destination_word_index += blockDim.x * gridDim.x) {
+       destination_word_index += cudf::detail::grid_1d::grid_stride()) {
     bitmask_type destination_word =
       detail::get_mask_offset_word(source[0],
                                    destination_word_index,
@@ -214,8 +214,7 @@ CUDF_KERNEL void subtract_set_bits_range_boundaries_kernel(bitmask_type const* b
 {
   constexpr size_type const word_size_in_bits{detail::size_in_bits<bitmask_type>()};
 
-  size_type const tid = threadIdx.x + blockIdx.x * blockDim.x;
-  size_type range_id  = tid;
+  auto range_id = cudf::detail::grid_1d::global_thread_id();
 
   while (range_id < num_ranges) {
     size_type const first_bit_index = *(first_bit_indices + range_id);
@@ -243,7 +242,7 @@ CUDF_KERNEL void subtract_set_bits_range_boundaries_kernel(bitmask_type const* b
     // Update the null count with the computed delta.
     size_type updated_null_count = *(null_counts + range_id) + delta;
     *(null_counts + range_id)    = updated_null_count;
-    range_id += blockDim.x * gridDim.x;
+    range_id += cudf::detail::grid_1d::grid_stride();
   }
 }
 

diff --git a/cpp/include/cudf/dictionary/encode.hpp b/cpp/include/cudf/dictionary/encode.hpp
@@ -41,7 +41,7 @@ namespace dictionary {
  *
  * The null mask and null count are copied from the input column to the output column.
  *
- * @throw cudf::logic_error if indices type is not an unsigned integer type
+ * @throw cudf::logic_error if indices type is not a signed integer type
  * @throw cudf::logic_error if the column to encode is already a DICTIONARY type
  *
  * @code{.pseudo}
@@ -58,7 +58,7 @@ namespace dictionary {
  */
 std::unique_ptr<column> encode(
   column_view const& column,
-  data_type indices_type            = data_type{type_id::UINT32},
+  data_type indices_type            = data_type{type_id::INT32},
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 

diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp
@@ -217,6 +217,29 @@ constexpr inline bool is_index_type()
  */
 bool is_index_type(data_type type);
 
+/**
+ * @brief Indicates whether the type `T` is a signed numeric type.
+ *
+ * @tparam T  The type to verify
+ * @return true `T` is signed numeric
+ */
+template <typename T>
+constexpr inline bool is_signed()
+{
+  return std::is_signed_v<T>;
+}
+
+/**
+ * @brief Indicates whether `type` is a signed numeric `data_type`.
+ *
+ * "Signed Numeric" types include fundamental integral types such as `INT*`
+ * but can also be `FLOAT*` types.
+ *
+ * @param type The `data_type` to verify
+ * @return true `type` is signed numeric
+ */
+bool is_signed(data_type type);
+
 /**
  * @brief Indicates whether the type `T` is a unsigned numeric type.
  *

diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp
@@ -974,7 +974,7 @@ class dictionary_column_wrapper : public detail::column_wrapper {
   {
     wrapped =
       cudf::dictionary::encode(fixed_width_column_wrapper<KeyElementTo, SourceElementT>(begin, end),
-                               cudf::data_type{type_id::UINT32},
+                               cudf::data_type{type_id::INT32},
                                cudf::test::get_default_stream());
   }
 
@@ -1009,7 +1009,7 @@ class dictionary_column_wrapper : public detail::column_wrapper {
   {
     wrapped = cudf::dictionary::encode(
       fixed_width_column_wrapper<KeyElementTo, SourceElementT>(begin, end, v),
-      cudf::data_type{type_id::UINT32},
+      cudf::data_type{type_id::INT32},
       cudf::test::get_default_stream());
   }
 
@@ -1173,7 +1173,7 @@ class dictionary_column_wrapper<std::string> : public detail::column_wrapper {
   dictionary_column_wrapper(StringsIterator begin, StringsIterator end) : column_wrapper{}
   {
     wrapped = cudf::dictionary::encode(strings_column_wrapper(begin, end),
-                                       cudf::data_type{type_id::UINT32},
+                                       cudf::data_type{type_id::INT32},
                                        cudf::test::get_default_stream());
   }
 
@@ -1210,7 +1210,7 @@ class dictionary_column_wrapper<std::string> : public detail::column_wrapper {
     : column_wrapper{}
   {
     wrapped = cudf::dictionary::encode(strings_column_wrapper(begin, end, v),
-                                       cudf::data_type{type_id::UINT32},
+                                       cudf::data_type{type_id::INT32},
                                        cudf::test::get_default_stream());
   }
 

diff --git a/cpp/include/nvtext/byte_pair_encoding.hpp b/cpp/include/nvtext/byte_pair_encoding.hpp
@@ -122,13 +122,15 @@ std::unique_ptr<bpe_merge_pairs> load_merge_pairs(
  * @param merges_pairs Created by a call to @ref nvtext::load_merge_pairs.
  * @param separator String used to build the output after encoding.
  *                  Default is a space.
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Memory resource to allocate any returned objects.
  * @return An encoded column of strings.
  */
 std::unique_ptr<cudf::column> byte_pair_encoding(
   cudf::strings_column_view const& input,
   bpe_merge_pairs const& merges_pairs,
   cudf::string_scalar const& separator = cudf::string_scalar(" "),
+  rmm::cuda_stream_view stream         = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr    = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group

diff --git a/cpp/src/column/column_factories.cpp b/cpp/src/column/column_factories.cpp
@@ -178,7 +178,7 @@ std::unique_ptr<column> make_dictionary_from_scalar(scalar const& s,
   CUDF_EXPECTS(s.is_valid(stream), "cannot create a dictionary with a null key");
   return make_dictionary_column(
     make_column_from_scalar(s, 1, stream, mr),
-    make_column_from_scalar(numeric_scalar<uint32_t>(0, true, stream), size, stream, mr),
+    make_column_from_scalar(numeric_scalar<int32_t>(0, true, stream), size, stream, mr),
     rmm::device_buffer{0, stream, mr},
     0);
 }