Skip to content

Commit

Permalink
Split up scan_inclusive.cu to improve its compile time (#14358)
Browse files Browse the repository at this point in the history
Splits out the `strings` and `struct` specializations in `scan_inclusive.cu` into separate source files to improve compile time.
Each specialization is unique code with limited aggregation types.
No functional changes. Just code moved around.
Found while working on #14234

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Nghia Truong (https://github.com/ttnghia)

URL: #14358
  • Loading branch information
davidwendt authored Nov 9, 2023
1 parent c4e6c09 commit 7da0336
Show file tree
Hide file tree
Showing 6 changed files with 319 additions and 108 deletions.
2 changes: 2 additions & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -582,6 +582,7 @@ add_library(
src/strings/replace/replace.cu
src/strings/replace/replace_re.cu
src/strings/reverse.cu
src/strings/scan/scan_inclusive.cu
src/strings/search/findall.cu
src/strings/search/find.cu
src/strings/search/find_multiple.cu
Expand All @@ -598,6 +599,7 @@ add_library(
src/strings/utilities.cu
src/strings/wrap.cu
src/structs/copying/concatenate.cu
src/structs/scan/scan_inclusive.cu
src/structs/structs_column_factories.cu
src/structs/structs_column_view.cpp
src/structs/utilities.cpp
Expand Down
47 changes: 47 additions & 0 deletions cpp/include/cudf/strings/detail/scan.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <cudf/column/column_view.hpp>
#include <cudf/utilities/default_stream.hpp>

#include <rmm/cuda_stream_view.hpp>

namespace cudf {
namespace strings {
namespace detail {
/**
* @brief Scan function for strings
*
* Called by cudf::scan() with only min and max aggregates.
*
* @tparam Op Either DeviceMin or DeviceMax operations
*
* @param input Input strings column
* @param mask Mask for scan
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New strings column
*/
template <typename Op>
std::unique_ptr<column> scan_inclusive(column_view const& input,
bitmask_type const* mask,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);

} // namespace detail
} // namespace strings
} // namespace cudf
45 changes: 45 additions & 0 deletions cpp/include/cudf/structs/detail/scan.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <cudf/column/column_view.hpp>
#include <cudf/utilities/default_stream.hpp>

#include <rmm/cuda_stream_view.hpp>

namespace cudf {
namespace structs {
namespace detail {
/**
* @brief Scan function for struct column type
*
* Called by cudf::scan() with only min and max aggregates.
*
* @tparam Op Either DeviceMin or DeviceMax operations
*
* @param input Input column
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New struct column
*/
template <typename Op>
std::unique_ptr<column> scan_inclusive(column_view const& input,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);

} // namespace detail
} // namespace structs
} // namespace cudf
112 changes: 4 additions & 108 deletions cpp/src/reductions/scan/scan_inclusive.cu
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
* limitations under the License.
*/

#include <reductions/nested_type_minmax_util.cuh>
#include <reductions/scan/scan.cuh>

#include <cudf/column/column_device_view.cuh>
Expand All @@ -25,9 +24,10 @@
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/structs/utilities.hpp>
#include <cudf/reduction.hpp>
#include <cudf/strings/detail/scan.hpp>
#include <cudf/structs/detail/scan.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/find.h>
Expand Down Expand Up @@ -68,43 +68,6 @@ std::pair<rmm::device_buffer, size_type> mask_scan(column_view const& input_view

namespace {

/**
* @brief Min/Max inclusive scan operator
*
* This operator will accept index values, check them and then
* run the `Op` operation on the individual element objects.
* The returned result is the appropriate index value.
*
* This was specifically created to workaround a thrust issue
* https://github.com/NVIDIA/thrust/issues/1479
* where invalid values are passed to the operator.
*/
template <typename Element, typename Op>
struct min_max_scan_operator {
column_device_view const col; ///< strings column device view
Element const null_replacement{}; ///< value used when element is null
bool const has_nulls; ///< true if col has null elements

min_max_scan_operator(column_device_view const& col, bool has_nulls = true)
: col{col}, null_replacement{Op::template identity<Element>()}, has_nulls{has_nulls}
{
// verify validity bitmask is non-null, otherwise, is_null_nocheck() will crash
if (has_nulls) CUDF_EXPECTS(col.nullable(), "column with nulls must have a validity bitmask");
}

__device__ inline size_type operator()(size_type lhs, size_type rhs) const
{
// thrust::inclusive_scan may pass us garbage values so we need to protect ourselves;
// in these cases the return value does not matter since the result is not used
if (lhs < 0 || rhs < 0 || lhs >= col.size() || rhs >= col.size()) return 0;
Element d_lhs =
has_nulls && col.is_null_nocheck(lhs) ? null_replacement : col.element<Element>(lhs);
Element d_rhs =
has_nulls && col.is_null_nocheck(rhs) ? null_replacement : col.element<Element>(rhs);
return Op{}(d_lhs, d_rhs) == d_lhs ? lhs : rhs;
}
};

template <typename Op, typename T>
struct scan_functor {
static std::unique_ptr<column> invoke(column_view const& input_view,
Expand All @@ -127,50 +90,14 @@ struct scan_functor {
}
};

struct null_iterator {
bitmask_type const* mask;
__device__ bool operator()(size_type idx) const { return !bit_is_set(mask, idx); }
};

template <typename Op>
struct scan_functor<Op, cudf::string_view> {
static std::unique_ptr<column> invoke(column_view const& input_view,
bitmask_type const* mask,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
auto d_input = column_device_view::create(input_view, stream);

// build indices of the scan operation results
rmm::device_uvector<size_type> result_map(input_view.size(), stream);
thrust::inclusive_scan(
rmm::exec_policy(stream),
thrust::counting_iterator<size_type>(0),
thrust::counting_iterator<size_type>(input_view.size()),
result_map.begin(),
min_max_scan_operator<cudf::string_view, Op>{*d_input, input_view.has_nulls()});

if (input_view.has_nulls()) {
// fill the null rows with out-of-bounds values so gather records them as null;
// this prevents un-sanitized null entries in the output
auto null_itr = detail::make_counting_transform_iterator(0, null_iterator{mask});
auto oob_val = thrust::constant_iterator<size_type>(input_view.size());
thrust::scatter_if(rmm::exec_policy(stream),
oob_val,
oob_val + input_view.size(),
thrust::counting_iterator<size_type>(0),
null_itr,
result_map.data());
}

// call gather using the indices to build the output column
auto result_table = cudf::detail::gather(cudf::table_view({input_view}),
result_map,
out_of_bounds_policy::NULLIFY,
negative_index_policy::NOT_ALLOWED,
stream,
mr);
return std::move(result_table->release().front());
return cudf::strings::detail::scan_inclusive<Op>(input_view, mask, stream, mr);
}
};

Expand All @@ -181,38 +108,7 @@ struct scan_functor<Op, cudf::struct_view> {
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
// Create a gather map containing indices of the prefix min/max elements.
auto gather_map = rmm::device_uvector<size_type>(input.size(), stream);
auto const binop_generator =
cudf::reduction::detail::comparison_binop_generator::create<Op>(input, stream);
thrust::inclusive_scan(rmm::exec_policy(stream),
thrust::counting_iterator<size_type>(0),
thrust::counting_iterator<size_type>(input.size()),
gather_map.begin(),
binop_generator.binop());

// Gather the children columns of the input column. Must use `get_sliced_child` to properly
// handle input in case it is a sliced view.
auto const input_children = [&] {
auto const it = cudf::detail::make_counting_transform_iterator(
0, [structs_view = structs_column_view{input}, &stream](auto const child_idx) {
return structs_view.get_sliced_child(child_idx, stream);
});
return std::vector<column_view>(it, it + input.num_children());
}();

// Gather the children elements of the prefix min/max struct elements for the output.
auto scanned_children = cudf::detail::gather(table_view{input_children},
gather_map,
out_of_bounds_policy::DONT_CHECK,
negative_index_policy::NOT_ALLOWED,
stream,
mr)
->release();

// Don't need to set a null mask because that will be handled at the caller.
return make_structs_column(
input.size(), std::move(scanned_children), 0, rmm::device_buffer{0, stream, mr}, stream, mr);
return cudf::structs::detail::scan_inclusive<Op>(input, stream, mr);
}
};

Expand Down
132 changes: 132 additions & 0 deletions cpp/src/strings/scan/scan_inclusive.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/gather.hpp>
#include <cudf/detail/iterator.cuh>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/utilities/device_operators.cuh>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/iterator/counting_iterator.h>
#include <thrust/scan.h>
#include <thrust/scatter.h>

namespace cudf {
namespace strings {
namespace detail {
namespace {

/**
* @brief Min/Max inclusive scan operator
*
* This operator will accept index values, check them and then
* run the `Op` operation on the individual element objects.
* The returned result is the appropriate index value.
*
* This was specifically created to workaround a thrust issue
* https://github.com/NVIDIA/thrust/issues/1479
* where invalid values are passed to the operator.
*/
template <typename Element, typename Op>
struct min_max_scan_operator {
column_device_view const col; ///< strings column device view
Element const null_replacement{}; ///< value used when element is null
bool const has_nulls; ///< true if col has null elements

min_max_scan_operator(column_device_view const& col, bool has_nulls = true)
: col{col}, null_replacement{Op::template identity<Element>()}, has_nulls{has_nulls}
{
// verify validity bitmask is non-null, otherwise, is_null_nocheck() will crash
if (has_nulls) CUDF_EXPECTS(col.nullable(), "column with nulls must have a validity bitmask");
}

__device__ inline size_type operator()(size_type lhs, size_type rhs) const
{
// thrust::inclusive_scan may pass us garbage values so we need to protect ourselves;
// in these cases the return value does not matter since the result is not used
if (lhs < 0 || rhs < 0 || lhs >= col.size() || rhs >= col.size()) return 0;
Element d_lhs =
has_nulls && col.is_null_nocheck(lhs) ? null_replacement : col.element<Element>(lhs);
Element d_rhs =
has_nulls && col.is_null_nocheck(rhs) ? null_replacement : col.element<Element>(rhs);
return Op{}(d_lhs, d_rhs) == d_lhs ? lhs : rhs;
}
};

struct null_iterator {
bitmask_type const* mask;
__device__ bool operator()(size_type idx) const { return !bit_is_set(mask, idx); }
};

} // namespace

template <typename Op>
std::unique_ptr<column> scan_inclusive(column_view const& input,
bitmask_type const* mask,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
auto d_input = column_device_view::create(input, stream);

// build indices of the scan operation results
rmm::device_uvector<size_type> result_map(input.size(), stream);
thrust::inclusive_scan(rmm::exec_policy(stream),
thrust::counting_iterator<size_type>(0),
thrust::counting_iterator<size_type>(input.size()),
result_map.begin(),
min_max_scan_operator<cudf::string_view, Op>{*d_input, input.has_nulls()});

if (input.has_nulls()) {
// fill the null rows with out-of-bounds values so gather records them as null;
// this prevents un-sanitized null entries in the output
auto null_itr = cudf::detail::make_counting_transform_iterator(0, null_iterator{mask});
auto oob_val = thrust::constant_iterator<size_type>(input.size());
thrust::scatter_if(rmm::exec_policy(stream),
oob_val,
oob_val + input.size(),
thrust::counting_iterator<size_type>(0),
null_itr,
result_map.data());
}

// call gather using the indices to build the output column
auto result_table = cudf::detail::gather(cudf::table_view({input}),
result_map,
cudf::out_of_bounds_policy::NULLIFY,
cudf::detail::negative_index_policy::NOT_ALLOWED,
stream,
mr);
return std::move(result_table->release().front());
}

template std::unique_ptr<column> scan_inclusive<DeviceMin>(column_view const& input,
bitmask_type const* mask,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);

template std::unique_ptr<column> scan_inclusive<DeviceMax>(column_view const& input,
bitmask_type const* mask,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);

} // namespace detail
} // namespace strings
} // namespace cudf
Loading

0 comments on commit 7da0336

Please sign in to comment.