Split up scan_inclusive.cu to improve its compile time (#14358)

Splits out the `strings` and `struct` specializations in `scan_inclusive.cu` into separate source files to improve compile time. Each specialization is unique code with limited aggregation types. No functional changes. Just code moved around. Found while working on #14234 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) URL: #14358
rapidsai · Nov 9, 2023 · 7da0336 · 7da0336
1 parent c4e6c09
commit 7da0336
Show file tree

Hide file tree

Showing 6 changed files with 319 additions and 108 deletions.
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
@@ -582,6 +582,7 @@ add_library(
   src/strings/replace/replace.cu
   src/strings/replace/replace_re.cu
   src/strings/reverse.cu
+  src/strings/scan/scan_inclusive.cu
   src/strings/search/findall.cu
   src/strings/search/find.cu
   src/strings/search/find_multiple.cu
@@ -598,6 +599,7 @@ add_library(
   src/strings/utilities.cu
   src/strings/wrap.cu
   src/structs/copying/concatenate.cu
+  src/structs/scan/scan_inclusive.cu
   src/structs/structs_column_factories.cu
   src/structs/structs_column_view.cpp
   src/structs/utilities.cpp

diff --git a/cpp/include/cudf/strings/detail/scan.hpp b/cpp/include/cudf/strings/detail/scan.hpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/column/column_view.hpp>
+#include <cudf/utilities/default_stream.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+namespace cudf {
+namespace strings {
+namespace detail {
+/**
+ * @brief Scan function for strings
+ *
+ * Called by cudf::scan() with only min and max aggregates.
+ *
+ * @tparam Op Either DeviceMin or DeviceMax operations
+ *
+ * @param input Input strings column
+ * @param mask Mask for scan
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New strings column
+ */
+template <typename Op>
+std::unique_ptr<column> scan_inclusive(column_view const& input,
+                                       bitmask_type const* mask,
+                                       rmm::cuda_stream_view stream,
+                                       rmm::mr::device_memory_resource* mr);
+
+}  // namespace detail
+}  // namespace strings
+}  // namespace cudf
diff --git a/cpp/include/cudf/structs/detail/scan.hpp b/cpp/include/cudf/structs/detail/scan.hpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/column/column_view.hpp>
+#include <cudf/utilities/default_stream.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+namespace cudf {
+namespace structs {
+namespace detail {
+/**
+ * @brief Scan function for struct column type
+ *
+ * Called by cudf::scan() with only min and max aggregates.
+ *
+ * @tparam Op Either DeviceMin or DeviceMax operations
+ *
+ * @param input Input column
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New struct column
+ */
+template <typename Op>
+std::unique_ptr<column> scan_inclusive(column_view const& input,
+                                       rmm::cuda_stream_view stream,
+                                       rmm::mr::device_memory_resource* mr);
+
+}  // namespace detail
+}  // namespace structs
+}  // namespace cudf
diff --git a/cpp/src/reductions/scan/scan_inclusive.cu b/cpp/src/reductions/scan/scan_inclusive.cu
@@ -14,7 +14,6 @@
  * limitations under the License.
  */
 
-#include <reductions/nested_type_minmax_util.cuh>
 #include <reductions/scan/scan.cuh>
 
 #include <cudf/column/column_device_view.cuh>
@@ -25,9 +24,10 @@
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/structs/utilities.hpp>
 #include <cudf/reduction.hpp>
+#include <cudf/strings/detail/scan.hpp>
+#include <cudf/structs/detail/scan.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
 
 #include <thrust/find.h>
@@ -68,43 +68,6 @@ std::pair<rmm::device_buffer, size_type> mask_scan(column_view const& input_view
 
 namespace {
 
-/**
- * @brief Min/Max inclusive scan operator
- *
- * This operator will accept index values, check them and then
- * run the `Op` operation on the individual element objects.
- * The returned result is the appropriate index value.
- *
- * This was specifically created to workaround a thrust issue
- * https://github.com/NVIDIA/thrust/issues/1479
- * where invalid values are passed to the operator.
- */
-template <typename Element, typename Op>
-struct min_max_scan_operator {
-  column_device_view const col;      ///< strings column device view
-  Element const null_replacement{};  ///< value used when element is null
-  bool const has_nulls;              ///< true if col has null elements
-
-  min_max_scan_operator(column_device_view const& col, bool has_nulls = true)
-    : col{col}, null_replacement{Op::template identity<Element>()}, has_nulls{has_nulls}
-  {
-    // verify validity bitmask is non-null, otherwise, is_null_nocheck() will crash
-    if (has_nulls) CUDF_EXPECTS(col.nullable(), "column with nulls must have a validity bitmask");
-  }
-
-  __device__ inline size_type operator()(size_type lhs, size_type rhs) const
-  {
-    // thrust::inclusive_scan may pass us garbage values so we need to protect ourselves;
-    // in these cases the return value does not matter since the result is not used
-    if (lhs < 0 || rhs < 0 || lhs >= col.size() || rhs >= col.size()) return 0;
-    Element d_lhs =
-      has_nulls && col.is_null_nocheck(lhs) ? null_replacement : col.element<Element>(lhs);
-    Element d_rhs =
-      has_nulls && col.is_null_nocheck(rhs) ? null_replacement : col.element<Element>(rhs);
-    return Op{}(d_lhs, d_rhs) == d_lhs ? lhs : rhs;
-  }
-};
-
 template <typename Op, typename T>
 struct scan_functor {
   static std::unique_ptr<column> invoke(column_view const& input_view,
@@ -127,50 +90,14 @@ struct scan_functor {
   }
 };
 
-struct null_iterator {
-  bitmask_type const* mask;
-  __device__ bool operator()(size_type idx) const { return !bit_is_set(mask, idx); }
-};
-
 template <typename Op>
 struct scan_functor<Op, cudf::string_view> {
   static std::unique_ptr<column> invoke(column_view const& input_view,
                                         bitmask_type const* mask,
                                         rmm::cuda_stream_view stream,
                                         rmm::mr::device_memory_resource* mr)
   {
-    auto d_input = column_device_view::create(input_view, stream);
-
-    // build indices of the scan operation results
-    rmm::device_uvector<size_type> result_map(input_view.size(), stream);
-    thrust::inclusive_scan(
-      rmm::exec_policy(stream),
-      thrust::counting_iterator<size_type>(0),
-      thrust::counting_iterator<size_type>(input_view.size()),
-      result_map.begin(),
-      min_max_scan_operator<cudf::string_view, Op>{*d_input, input_view.has_nulls()});
-
-    if (input_view.has_nulls()) {
-      // fill the null rows with out-of-bounds values so gather records them as null;
-      // this prevents un-sanitized null entries in the output
-      auto null_itr = detail::make_counting_transform_iterator(0, null_iterator{mask});
-      auto oob_val  = thrust::constant_iterator<size_type>(input_view.size());
-      thrust::scatter_if(rmm::exec_policy(stream),
-                         oob_val,
-                         oob_val + input_view.size(),
-                         thrust::counting_iterator<size_type>(0),
-                         null_itr,
-                         result_map.data());
-    }
-
-    // call gather using the indices to build the output column
-    auto result_table = cudf::detail::gather(cudf::table_view({input_view}),
-                                             result_map,
-                                             out_of_bounds_policy::NULLIFY,
-                                             negative_index_policy::NOT_ALLOWED,
-                                             stream,
-                                             mr);
-    return std::move(result_table->release().front());
+    return cudf::strings::detail::scan_inclusive<Op>(input_view, mask, stream, mr);
   }
 };
 
@@ -181,38 +108,7 @@ struct scan_functor<Op, cudf::struct_view> {
                                         rmm::cuda_stream_view stream,
                                         rmm::mr::device_memory_resource* mr)
   {
-    // Create a gather map containing indices of the prefix min/max elements.
-    auto gather_map = rmm::device_uvector<size_type>(input.size(), stream);
-    auto const binop_generator =
-      cudf::reduction::detail::comparison_binop_generator::create<Op>(input, stream);
-    thrust::inclusive_scan(rmm::exec_policy(stream),
-                           thrust::counting_iterator<size_type>(0),
-                           thrust::counting_iterator<size_type>(input.size()),
-                           gather_map.begin(),
-                           binop_generator.binop());
-
-    // Gather the children columns of the input column. Must use `get_sliced_child` to properly
-    // handle input in case it is a sliced view.
-    auto const input_children = [&] {
-      auto const it = cudf::detail::make_counting_transform_iterator(
-        0, [structs_view = structs_column_view{input}, &stream](auto const child_idx) {
-          return structs_view.get_sliced_child(child_idx, stream);
-        });
-      return std::vector<column_view>(it, it + input.num_children());
-    }();
-
-    // Gather the children elements of the prefix min/max struct elements for the output.
-    auto scanned_children = cudf::detail::gather(table_view{input_children},
-                                                 gather_map,
-                                                 out_of_bounds_policy::DONT_CHECK,
-                                                 negative_index_policy::NOT_ALLOWED,
-                                                 stream,
-                                                 mr)
-                              ->release();
-
-    // Don't need to set a null mask because that will be handled at the caller.
-    return make_structs_column(
-      input.size(), std::move(scanned_children), 0, rmm::device_buffer{0, stream, mr}, stream, mr);
+    return cudf::structs::detail::scan_inclusive<Op>(input, stream, mr);
   }
 };
 

diff --git a/cpp/src/strings/scan/scan_inclusive.cu b/cpp/src/strings/scan/scan_inclusive.cu
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/column/column_device_view.cuh>
+#include <cudf/column/column_factories.hpp>
+#include <cudf/detail/gather.hpp>
+#include <cudf/detail/iterator.cuh>
+#include <cudf/detail/null_mask.hpp>
+#include <cudf/detail/utilities/device_operators.cuh>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/scan.h>
+#include <thrust/scatter.h>
+
+namespace cudf {
+namespace strings {
+namespace detail {
+namespace {
+
+/**
+ * @brief Min/Max inclusive scan operator
+ *
+ * This operator will accept index values, check them and then
+ * run the `Op` operation on the individual element objects.
+ * The returned result is the appropriate index value.
+ *
+ * This was specifically created to workaround a thrust issue
+ * https://github.com/NVIDIA/thrust/issues/1479
+ * where invalid values are passed to the operator.
+ */
+template <typename Element, typename Op>
+struct min_max_scan_operator {
+  column_device_view const col;      ///< strings column device view
+  Element const null_replacement{};  ///< value used when element is null
+  bool const has_nulls;              ///< true if col has null elements
+
+  min_max_scan_operator(column_device_view const& col, bool has_nulls = true)
+    : col{col}, null_replacement{Op::template identity<Element>()}, has_nulls{has_nulls}
+  {
+    // verify validity bitmask is non-null, otherwise, is_null_nocheck() will crash
+    if (has_nulls) CUDF_EXPECTS(col.nullable(), "column with nulls must have a validity bitmask");
+  }
+
+  __device__ inline size_type operator()(size_type lhs, size_type rhs) const
+  {
+    // thrust::inclusive_scan may pass us garbage values so we need to protect ourselves;
+    // in these cases the return value does not matter since the result is not used
+    if (lhs < 0 || rhs < 0 || lhs >= col.size() || rhs >= col.size()) return 0;
+    Element d_lhs =
+      has_nulls && col.is_null_nocheck(lhs) ? null_replacement : col.element<Element>(lhs);
+    Element d_rhs =
+      has_nulls && col.is_null_nocheck(rhs) ? null_replacement : col.element<Element>(rhs);
+    return Op{}(d_lhs, d_rhs) == d_lhs ? lhs : rhs;
+  }
+};
+
+struct null_iterator {
+  bitmask_type const* mask;
+  __device__ bool operator()(size_type idx) const { return !bit_is_set(mask, idx); }
+};
+
+}  // namespace
+
+template <typename Op>
+std::unique_ptr<column> scan_inclusive(column_view const& input,
+                                       bitmask_type const* mask,
+                                       rmm::cuda_stream_view stream,
+                                       rmm::mr::device_memory_resource* mr)
+{
+  auto d_input = column_device_view::create(input, stream);
+
+  // build indices of the scan operation results
+  rmm::device_uvector<size_type> result_map(input.size(), stream);
+  thrust::inclusive_scan(rmm::exec_policy(stream),
+                         thrust::counting_iterator<size_type>(0),
+                         thrust::counting_iterator<size_type>(input.size()),
+                         result_map.begin(),
+                         min_max_scan_operator<cudf::string_view, Op>{*d_input, input.has_nulls()});
+
+  if (input.has_nulls()) {
+    // fill the null rows with out-of-bounds values so gather records them as null;
+    // this prevents un-sanitized null entries in the output
+    auto null_itr = cudf::detail::make_counting_transform_iterator(0, null_iterator{mask});
+    auto oob_val  = thrust::constant_iterator<size_type>(input.size());
+    thrust::scatter_if(rmm::exec_policy(stream),
+                       oob_val,
+                       oob_val + input.size(),
+                       thrust::counting_iterator<size_type>(0),
+                       null_itr,
+                       result_map.data());
+  }
+
+  // call gather using the indices to build the output column
+  auto result_table = cudf::detail::gather(cudf::table_view({input}),
+                                           result_map,
+                                           cudf::out_of_bounds_policy::NULLIFY,
+                                           cudf::detail::negative_index_policy::NOT_ALLOWED,
+                                           stream,
+                                           mr);
+  return std::move(result_table->release().front());
+}
+
+template std::unique_ptr<column> scan_inclusive<DeviceMin>(column_view const& input,
+                                                           bitmask_type const* mask,
+                                                           rmm::cuda_stream_view stream,
+                                                           rmm::mr::device_memory_resource* mr);
+
+template std::unique_ptr<column> scan_inclusive<DeviceMax>(column_view const& input,
+                                                           bitmask_type const* mask,
+                                                           rmm::cuda_stream_view stream,
+                                                           rmm::mr::device_memory_resource* mr);
+
+}  // namespace detail
+}  // namespace strings
+}  // namespace cudf