diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 32a753c9f40..e4b9cbf8921 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -368,8 +368,13 @@ add_library( src/filling/repeat.cu src/filling/sequence.cu src/groupby/groupby.cu + src/groupby/hash/compute_groupby.cu + src/groupby/hash/compute_single_pass_aggs.cu + src/groupby/hash/create_sparse_results_table.cu src/groupby/hash/flatten_single_pass_aggs.cpp src/groupby/hash/groupby.cu + src/groupby/hash/hash_compound_agg_finalizer.cu + src/groupby/hash/sparse_to_dense_results.cu src/groupby/sort/aggregate.cpp src/groupby/sort/group_argmax.cu src/groupby/sort/group_argmin.cu diff --git a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md index fce8adb4c06..311539efbfc 100644 --- a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md +++ b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md @@ -370,7 +370,7 @@ any type that cudf supports. For example, a `list_scalar` representing a list of |Value type|Scalar class|Notes| |-|-|-| |fixed-width|`fixed_width_scalar`| `T` can be any fixed-width type| -|numeric|`numeric_scalar` | `T` can be `int8_t`, `int16_t`, `int32_t`, `int_64_t`, `float` or `double`| +|numeric|`numeric_scalar` | `T` can be `int8_t`, `int16_t`, `int32_t`, `int64_t`, `float` or `double`| |fixed-point|`fixed_point_scalar` | `T` can be `numeric::decimal32` or `numeric::decimal64`| |timestamp|`timestamp_scalar` | `T` can be `timestamp_D`, `timestamp_s`, etc.| |duration|`duration_scalar` | `T` can be `duration_D`, `duration_s`, etc.| diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh index dfb646c66c4..4159e324472 100644 --- a/cpp/include/cudf/detail/copy_if.cuh +++ b/cpp/include/cudf/detail/copy_if.cuh @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -36,7 +37,6 @@ #include #include -#include #include #include @@ -256,7 +256,7 @@ struct scatter_gather_functor { cudf::detail::grid_1d grid{input.size(), block_size, per_thread}; - rmm::device_scalar null_count{0, stream}; + cudf::detail::device_scalar null_count{0, stream}; if (output.nullable()) { // Have to initialize the output mask to all zeros because we may update // it with atomicOr(). diff --git a/cpp/include/cudf/detail/copy_if_else.cuh b/cpp/include/cudf/detail/copy_if_else.cuh index a70cd5a0661..5dc75b1a3fb 100644 --- a/cpp/include/cudf/detail/copy_if_else.cuh +++ b/cpp/include/cudf/detail/copy_if_else.cuh @@ -19,12 +19,11 @@ #include #include #include +#include #include #include #include -#include - #include #include @@ -171,7 +170,7 @@ std::unique_ptr copy_if_else(bool nullable, // if we have validity in the output if (nullable) { - rmm::device_scalar valid_count{0, stream}; + cudf::detail::device_scalar valid_count{0, stream}; // call the kernel copy_if_else_kernel diff --git a/cpp/include/cudf/detail/copy_range.cuh b/cpp/include/cudf/detail/copy_range.cuh index 3aa136d630b..fcb80fe45f7 100644 --- a/cpp/include/cudf/detail/copy_range.cuh +++ b/cpp/include/cudf/detail/copy_range.cuh @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -27,7 +28,6 @@ #include #include -#include #include #include @@ -154,7 +154,7 @@ void copy_range(SourceValueIterator source_value_begin, auto grid = cudf::detail::grid_1d{num_items, block_size, 1}; if (target.nullable()) { - rmm::device_scalar null_count(target.null_count(), stream); + cudf::detail::device_scalar null_count(target.null_count(), stream); auto kernel = copy_range_kernel; diff --git a/cpp/include/cudf/detail/device_scalar.hpp b/cpp/include/cudf/detail/device_scalar.hpp new file mode 100644 index 00000000000..16ca06c6561 --- /dev/null +++ b/cpp/include/cudf/detail/device_scalar.hpp @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include +#include +#include + +namespace CUDF_EXPORT cudf { +namespace detail { + +template +class device_scalar : public rmm::device_scalar { + public: +#ifdef __CUDACC__ +#pragma nv_exec_check_disable +#endif + ~device_scalar() = default; + +// Implementation is the same as what compiler should generate +// Could not use default move constructor as 11.8 compiler fails to generate it +#ifdef __CUDACC__ +#pragma nv_exec_check_disable +#endif + device_scalar(device_scalar&& other) noexcept + : rmm::device_scalar{std::move(other)}, bounce_buffer{std::move(other.bounce_buffer)} + { + } + device_scalar& operator=(device_scalar&&) noexcept = default; + + device_scalar(device_scalar const&) = delete; + device_scalar& operator=(device_scalar const&) = delete; + + device_scalar() = delete; + + explicit device_scalar( + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) + : rmm::device_scalar(stream, mr), bounce_buffer{make_host_vector(1, stream)} + { + } + + explicit device_scalar( + T const& initial_value, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) + : rmm::device_scalar(stream, mr), bounce_buffer{make_host_vector(1, stream)} + { + bounce_buffer[0] = initial_value; + cuda_memcpy_async(device_span{this->data(), 1}, bounce_buffer, stream); + } + + device_scalar(device_scalar const& other, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) + : rmm::device_scalar(other, stream, mr), bounce_buffer{make_host_vector(1, stream)} + { + } + + [[nodiscard]] T value(rmm::cuda_stream_view stream) const + { + cuda_memcpy(bounce_buffer, device_span{this->data(), 1}, stream); + return bounce_buffer[0]; + } + + void set_value_async(T const& value, rmm::cuda_stream_view stream) + { + bounce_buffer[0] = value; + cuda_memcpy_async(device_span{this->data(), 1}, bounce_buffer, stream); + } + + void set_value_async(T&& value, rmm::cuda_stream_view stream) + { + bounce_buffer[0] = std::move(value); + cuda_memcpy_async(device_span{this->data(), 1}, bounce_buffer, stream); + } + + void set_value_to_zero_async(rmm::cuda_stream_view stream) { set_value_async(T{}, stream); } + + private: + mutable cudf::detail::host_vector bounce_buffer; +}; + +} // namespace detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index 327c732716c..482265d633e 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -16,6 +16,7 @@ #pragma once #include +#include #include #include #include @@ -25,7 +26,6 @@ #include #include -#include #include #include @@ -165,7 +165,7 @@ size_type inplace_bitmask_binop(Binop op, "Mask pointer cannot be null"); rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref(); - rmm::device_scalar d_counter{0, stream, mr}; + cudf::detail::device_scalar d_counter{0, stream, mr}; rmm::device_uvector d_masks(masks.size(), stream, mr); rmm::device_uvector d_begin_bits(masks_begin_bits.size(), stream, mr); diff --git a/cpp/include/cudf/detail/valid_if.cuh b/cpp/include/cudf/detail/valid_if.cuh index cfb2e70bfed..af182b69c3a 100644 --- a/cpp/include/cudf/detail/valid_if.cuh +++ b/cpp/include/cudf/detail/valid_if.cuh @@ -16,6 +16,7 @@ #pragma once +#include #include #include #include @@ -25,7 +26,6 @@ #include #include -#include #include @@ -101,7 +101,7 @@ std::pair valid_if(InputIterator begin, size_type null_count{0}; if (size > 0) { - rmm::device_scalar valid_count{0, stream}; + cudf::detail::device_scalar valid_count{0, stream}; constexpr size_type block_size{256}; grid_1d grid{size, block_size}; diff --git a/cpp/include/cudf/scalar/scalar.hpp b/cpp/include/cudf/scalar/scalar.hpp index 66be2a12fbe..360dde11fc0 100644 --- a/cpp/include/cudf/scalar/scalar.hpp +++ b/cpp/include/cudf/scalar/scalar.hpp @@ -16,6 +16,7 @@ #pragma once #include +#include #include #include #include @@ -94,8 +95,8 @@ class scalar { [[nodiscard]] bool const* validity_data() const; protected: - data_type _type{type_id::EMPTY}; ///< Logical type of value in the scalar - rmm::device_scalar _is_valid; ///< Device bool signifying validity + data_type _type{type_id::EMPTY}; ///< Logical type of value in the scalar + cudf::detail::device_scalar _is_valid; ///< Device bool signifying validity /** * @brief Move constructor for scalar. diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu index 4ca05f9c335..e6659f76c7c 100644 --- a/cpp/src/bitmask/null_mask.cu +++ b/cpp/src/bitmask/null_mask.cu @@ -15,6 +15,7 @@ */ #include +#include #include #include #include @@ -32,7 +33,6 @@ #include #include -#include #include #include @@ -329,7 +329,7 @@ cudf::size_type count_set_bits(bitmask_type const* bitmask, cudf::detail::grid_1d grid(num_words, block_size); - rmm::device_scalar non_zero_count(0, stream); + cudf::detail::device_scalar non_zero_count(0, stream); count_set_bits_kernel <<>>( diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index b8e140f1fa5..d8419760120 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -162,7 +163,7 @@ size_type concatenate_masks(device_span d_views, size_type output_size, rmm::cuda_stream_view stream) { - rmm::device_scalar d_valid_count(0, stream); + cudf::detail::device_scalar d_valid_count(0, stream); constexpr size_type block_size{256}; cudf::detail::grid_1d config(output_size, block_size); concatenate_masks_kernel @@ -265,7 +266,7 @@ std::unique_ptr fused_concatenate(host_span views, auto out_view = out_col->mutable_view(); auto d_out_view = mutable_column_device_view::create(out_view, stream); - rmm::device_scalar d_valid_count(0, stream); + cudf::detail::device_scalar d_valid_count(0, stream); // Launch kernel constexpr size_type block_size{256}; diff --git a/cpp/src/copying/get_element.cu b/cpp/src/copying/get_element.cu index 29a28f81d1a..80b0bd5242f 100644 --- a/cpp/src/copying/get_element.cu +++ b/cpp/src/copying/get_element.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -71,7 +72,7 @@ struct get_element_functor { auto device_col = column_device_view::create(input, stream); rmm::device_scalar temp_data(stream, mr); - rmm::device_scalar temp_valid(stream, mr); + cudf::detail::device_scalar temp_valid(stream, mr); device_single_thread( [buffer = temp_data.data(), @@ -155,8 +156,8 @@ struct get_element_functor { auto device_col = column_device_view::create(input, stream); - rmm::device_scalar temp_data(stream, mr); - rmm::device_scalar temp_valid(stream, mr); + cudf::detail::device_scalar temp_data(stream, mr); + cudf::detail::device_scalar temp_valid(stream, mr); device_single_thread( [buffer = temp_data.data(), diff --git a/cpp/src/groupby/hash/compute_groupby.cu b/cpp/src/groupby/hash/compute_groupby.cu new file mode 100644 index 00000000000..59457bea694 --- /dev/null +++ b/cpp/src/groupby/hash/compute_groupby.cu @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "compute_groupby.hpp" +#include "compute_single_pass_aggs.hpp" +#include "helpers.cuh" +#include "sparse_to_dense_results.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include +#include + +namespace cudf::groupby::detail::hash { +template +rmm::device_uvector extract_populated_keys(SetType const& key_set, + size_type num_keys, + rmm::cuda_stream_view stream) +{ + rmm::device_uvector populated_keys(num_keys, stream); + auto const keys_end = key_set.retrieve_all(populated_keys.begin(), stream.value()); + + populated_keys.resize(std::distance(populated_keys.begin(), keys_end), stream); + return populated_keys; +} + +template +std::unique_ptr compute_groupby(table_view const& keys, + host_span requests, + bool skip_rows_with_nulls, + Equal const& d_row_equal, + Hash const& d_row_hash, + cudf::detail::result_cache* cache, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + // convert to int64_t to avoid potential overflow with large `keys` + auto const num_keys = static_cast(keys.num_rows()); + + // Cache of sparse results where the location of aggregate value in each + // column is indexed by the hash set + cudf::detail::result_cache sparse_results(requests.size()); + + auto const set = cuco::static_set{ + num_keys, + cudf::detail::CUCO_DESIRED_LOAD_FACTOR, // 50% load factor + cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, + d_row_equal, + probing_scheme_t{d_row_hash}, + cuco::thread_scope_device, + cuco::storage{}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + stream.value()}; + + auto row_bitmask = + skip_rows_with_nulls + ? cudf::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first + : rmm::device_buffer{}; + + // Compute all single pass aggs first + compute_single_pass_aggs(num_keys, + skip_rows_with_nulls, + static_cast(row_bitmask.data()), + set.ref(cuco::insert_and_find), + requests, + &sparse_results, + stream); + + // Extract the populated indices from the hash set and create a gather map. + // Gathering using this map from sparse results will give dense results. + auto gather_map = extract_populated_keys(set, keys.num_rows(), stream); + + // Compact all results from sparse_results and insert into cache + sparse_to_dense_results(requests, + &sparse_results, + cache, + gather_map, + set.ref(cuco::find), + static_cast(row_bitmask.data()), + stream, + mr); + + return cudf::detail::gather(keys, + gather_map, + out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); +} + +template rmm::device_uvector extract_populated_keys( + global_set_t const& key_set, size_type num_keys, rmm::cuda_stream_view stream); + +template rmm::device_uvector extract_populated_keys( + nullable_global_set_t const& key_set, size_type num_keys, rmm::cuda_stream_view stream); + +template std::unique_ptr
compute_groupby( + table_view const& keys, + host_span requests, + bool skip_rows_with_nulls, + row_comparator_t const& d_row_equal, + row_hash_t const& d_row_hash, + cudf::detail::result_cache* cache, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + +template std::unique_ptr
compute_groupby( + table_view const& keys, + host_span requests, + bool skip_rows_with_nulls, + nullable_row_comparator_t const& d_row_equal, + row_hash_t const& d_row_hash, + cudf::detail::result_cache* cache, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_groupby.hpp b/cpp/src/groupby/hash/compute_groupby.hpp new file mode 100644 index 00000000000..7bb3a60ff07 --- /dev/null +++ b/cpp/src/groupby/hash/compute_groupby.hpp @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +namespace cudf::groupby::detail::hash { +/** + * @brief Computes and returns a device vector containing all populated keys in + * `key_set`. + * + * @tparam SetType Type of key hash set + * + * @param key_set Key hash set + * @param num_keys Number of input keys + * @param stream CUDA stream used for device memory operations and kernel launches + * @return An array of unique keys contained in `key_set` + */ +template +rmm::device_uvector extract_populated_keys(SetType const& key_set, + size_type num_keys, + rmm::cuda_stream_view stream); + +/** + * @brief Computes groupby using hash table. + * + * First, we create a hash table that stores the indices of unique rows in + * `keys`. The upper limit on the number of values in this map is the number + * of rows in `keys`. + * + * To store the results of aggregations, we create temporary sparse columns + * which have the same size as input value columns. Using the hash map, we + * determine the location within the sparse column to write the result of the + * aggregation into. + * + * The sparse column results of all aggregations are stored into the cache + * `sparse_results`. This enables the use of previously calculated results in + * other aggregations. + * + * All the aggregations which can be computed in a single pass are computed + * first, in a combined kernel. Then using these results, aggregations that + * require multiple passes, will be computed. + * + * Finally, using the hash map, we generate a vector of indices of populated + * values in sparse result columns. Then, for each aggregation originally + * requested in `requests`, we gather sparse results into a column of dense + * results using the aforementioned index vector. Dense results are stored into + * the in/out parameter `cache`. + * + * @tparam Equal Device row comparator type + * @tparam Hash Device row hasher type + * + * @param keys Table whose rows act as the groupby keys + * @param requests The set of columns to aggregate and the aggregations to perform + * @param skip_rows_with_nulls Flag indicating whether to ignore nulls or not + * @param d_row_equal Device row comparator + * @param d_row_hash Device row hasher + * @param cache Dense aggregation results + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned table + * @return Table of unique keys + */ +template +std::unique_ptr compute_groupby(table_view const& keys, + host_span requests, + bool skip_rows_with_nulls, + Equal const& d_row_equal, + Hash const& d_row_hash, + cudf::detail::result_cache* cache, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_aggs.cu new file mode 100644 index 00000000000..e292543e6e9 --- /dev/null +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cu @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "compute_single_pass_aggs.hpp" +#include "create_sparse_results_table.hpp" +#include "flatten_single_pass_aggs.hpp" +#include "helpers.cuh" +#include "single_pass_functors.cuh" +#include "var_hash_functor.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +namespace cudf::groupby::detail::hash { +/** + * @brief Computes all aggregations from `requests` that require a single pass + * over the data and stores the results in `sparse_results` + */ +template +void compute_single_pass_aggs(int64_t num_keys, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + SetType set, + host_span requests, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream) +{ + // flatten the aggs to a table that can be operated on by aggregate_row + auto const [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests); + + // make table that will hold sparse results + table sparse_table = create_sparse_results_table(flattened_values, agg_kinds, stream); + // prepare to launch kernel to do the actual aggregation + auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); + auto d_values = table_device_view::create(flattened_values, stream); + auto const d_aggs = cudf::detail::make_device_uvector_async( + agg_kinds, stream, cudf::get_current_device_resource_ref()); + + thrust::for_each_n( + rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + num_keys, + hash::compute_single_pass_aggs_fn{ + set, *d_values, *d_sparse_table, d_aggs.data(), row_bitmask, skip_rows_with_nulls}); + // Add results back to sparse_results cache + auto sparse_result_cols = sparse_table.release(); + for (size_t i = 0; i < aggs.size(); i++) { + // Note that the cache will make a copy of this temporary aggregation + sparse_results->add_result( + flattened_values.column(i), *aggs[i], std::move(sparse_result_cols[i])); + } +} + +template void compute_single_pass_aggs>( + int64_t num_keys, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + hash_set_ref_t set, + host_span requests, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream); + +template void compute_single_pass_aggs>( + int64_t num_keys, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + nullable_hash_set_ref_t set, + host_span requests, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.hpp b/cpp/src/groupby/hash/compute_single_pass_aggs.hpp new file mode 100644 index 00000000000..a7434bdf61a --- /dev/null +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.hpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +#include + +namespace cudf::groupby::detail::hash { +/** + * @brief Computes all aggregations from `requests` that require a single pass + * over the data and stores the results in `sparse_results` + */ +template +void compute_single_pass_aggs(int64_t num_keys, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + SetType set, + cudf::host_span requests, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/create_sparse_results_table.cu b/cpp/src/groupby/hash/create_sparse_results_table.cu new file mode 100644 index 00000000000..22fa4fc584c --- /dev/null +++ b/cpp/src/groupby/hash/create_sparse_results_table.cu @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "create_sparse_results_table.hpp" + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +namespace cudf::groupby::detail::hash { +// make table that will hold sparse results +cudf::table create_sparse_results_table(table_view const& flattened_values, + std::vector aggs, + rmm::cuda_stream_view stream) +{ + // TODO single allocation - room for performance improvement + std::vector> sparse_columns; + sparse_columns.reserve(flattened_values.num_columns()); + std::transform( + flattened_values.begin(), + flattened_values.end(), + aggs.begin(), + std::back_inserter(sparse_columns), + [stream](auto const& col, auto const& agg) { + bool nullable = + (agg == aggregation::COUNT_VALID or agg == aggregation::COUNT_ALL) + ? false + : (col.has_nulls() or agg == aggregation::VARIANCE or agg == aggregation::STD); + auto mask_flag = (nullable) ? mask_state::ALL_NULL : mask_state::UNALLOCATED; + + auto col_type = cudf::is_dictionary(col.type()) + ? cudf::dictionary_column_view(col).keys().type() + : col.type(); + + return make_fixed_width_column( + cudf::detail::target_type(col_type, agg), col.size(), mask_flag, stream); + }); + + table sparse_table(std::move(sparse_columns)); + mutable_table_view table_view = sparse_table.mutable_view(); + cudf::detail::initialize_with_identity(table_view, aggs, stream); + return sparse_table; +} +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/create_sparse_results_table.hpp b/cpp/src/groupby/hash/create_sparse_results_table.hpp new file mode 100644 index 00000000000..c1d4e0d3f20 --- /dev/null +++ b/cpp/src/groupby/hash/create_sparse_results_table.hpp @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +#include + +#include + +namespace cudf::groupby::detail::hash { +// make table that will hold sparse results +cudf::table create_sparse_results_table(table_view const& flattened_values, + std::vector aggs_kinds, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 0432b9d120a..30e1d52fdbf 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -14,60 +14,32 @@ * limitations under the License. */ -#include "flatten_single_pass_aggs.hpp" +#include "compute_groupby.hpp" #include "groupby/common/utils.hpp" -#include "groupby_kernels.cuh" -#include "var_hash_functor.cuh" +#include "helpers.cuh" #include -#include -#include -#include -#include -#include #include -#include -#include -#include #include -#include -#include -#include +#include #include #include -#include #include #include -#include #include #include -#include #include #include #include -#include -#include -#include - +#include #include -#include #include +#include -namespace cudf { -namespace groupby { -namespace detail { -namespace hash { +namespace cudf::groupby::detail::hash { namespace { - -// TODO: similar to `contains_table`, using larger CG size like 2 or 4 for nested -// types and `cg_size = 1`for flat data to improve performance -using probing_scheme_type = cuco::linear_probing< - 1, ///< Number of threads used to handle each input key - cudf::experimental::row::hash::device_row_hasher>; - /** * @brief List of aggregation operations that can be computed with a hash-based * implementation. @@ -112,413 +84,33 @@ bool constexpr is_hash_aggregation(aggregation::Kind t) return array_contains(hash_aggregations, t); } -template -class hash_compound_agg_finalizer final : public cudf::detail::aggregation_finalizer { - column_view col; - data_type result_type; - cudf::detail::result_cache* sparse_results; - cudf::detail::result_cache* dense_results; - device_span gather_map; - SetType set; - bitmask_type const* __restrict__ row_bitmask; - rmm::cuda_stream_view stream; - rmm::device_async_resource_ref mr; - - public: - using cudf::detail::aggregation_finalizer::visit; - - hash_compound_agg_finalizer(column_view col, - cudf::detail::result_cache* sparse_results, - cudf::detail::result_cache* dense_results, - device_span gather_map, - SetType set, - bitmask_type const* row_bitmask, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) - : col(col), - sparse_results(sparse_results), - dense_results(dense_results), - gather_map(gather_map), - set(set), - row_bitmask(row_bitmask), - stream(stream), - mr(mr) - { - result_type = cudf::is_dictionary(col.type()) ? cudf::dictionary_column_view(col).keys().type() - : col.type(); - } - - auto to_dense_agg_result(cudf::aggregation const& agg) - { - auto s = sparse_results->get_result(col, agg); - auto dense_result_table = cudf::detail::gather(table_view({std::move(s)}), - gather_map, - out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); - return std::move(dense_result_table->release()[0]); - } - - // Enables conversion of ARGMIN/ARGMAX into MIN/MAX - auto gather_argminmax(aggregation const& agg) - { - auto arg_result = to_dense_agg_result(agg); - // We make a view of ARG(MIN/MAX) result without a null mask and gather - // using this map. The values in data buffer of ARG(MIN/MAX) result - // corresponding to null values was initialized to ARG(MIN/MAX)_SENTINEL - // which is an out of bounds index value (-1) and causes the gathered - // value to be null. - column_view null_removed_map( - data_type(type_to_id()), - arg_result->size(), - static_cast(arg_result->view().template data()), - nullptr, - 0); - auto gather_argminmax = - cudf::detail::gather(table_view({col}), - null_removed_map, - arg_result->nullable() ? cudf::out_of_bounds_policy::NULLIFY - : cudf::out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); - return std::move(gather_argminmax->release()[0]); - } - - // Declare overloads for each kind of aggregation to dispatch - void visit(cudf::aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - dense_results->add_result(col, agg, to_dense_agg_result(agg)); - } - - void visit(cudf::detail::min_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - if (result_type.id() == type_id::STRING) { - auto transformed_agg = make_argmin_aggregation(); - dense_results->add_result(col, agg, gather_argminmax(*transformed_agg)); - } else { - dense_results->add_result(col, agg, to_dense_agg_result(agg)); - } - } - - void visit(cudf::detail::max_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - - if (result_type.id() == type_id::STRING) { - auto transformed_agg = make_argmax_aggregation(); - dense_results->add_result(col, agg, gather_argminmax(*transformed_agg)); - } else { - dense_results->add_result(col, agg, to_dense_agg_result(agg)); - } - } - - void visit(cudf::detail::mean_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - - auto sum_agg = make_sum_aggregation(); - auto count_agg = make_count_aggregation(); - this->visit(*sum_agg); - this->visit(*count_agg); - column_view sum_result = dense_results->get_result(col, *sum_agg); - column_view count_result = dense_results->get_result(col, *count_agg); - - auto result = - cudf::detail::binary_operation(sum_result, - count_result, - binary_operator::DIV, - cudf::detail::target_type(result_type, aggregation::MEAN), - stream, - mr); - dense_results->add_result(col, agg, std::move(result)); - } - - void visit(cudf::detail::var_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - - auto sum_agg = make_sum_aggregation(); - auto count_agg = make_count_aggregation(); - this->visit(*sum_agg); - this->visit(*count_agg); - column_view sum_result = sparse_results->get_result(col, *sum_agg); - column_view count_result = sparse_results->get_result(col, *count_agg); - - auto values_view = column_device_view::create(col, stream); - auto sum_view = column_device_view::create(sum_result, stream); - auto count_view = column_device_view::create(count_result, stream); - - auto var_result = make_fixed_width_column( - cudf::detail::target_type(result_type, agg.kind), col.size(), mask_state::ALL_NULL, stream); - auto var_result_view = mutable_column_device_view::create(var_result->mutable_view(), stream); - mutable_table_view var_table_view{{var_result->mutable_view()}}; - cudf::detail::initialize_with_identity(var_table_view, {agg.kind}, stream); - - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - col.size(), - var_hash_functor{ - set, row_bitmask, *var_result_view, *values_view, *sum_view, *count_view, agg._ddof}); - sparse_results->add_result(col, agg, std::move(var_result)); - dense_results->add_result(col, agg, to_dense_agg_result(agg)); - } - - void visit(cudf::detail::std_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - auto var_agg = make_variance_aggregation(agg._ddof); - this->visit(*dynamic_cast(var_agg.get())); - column_view variance = dense_results->get_result(col, *var_agg); - - auto result = cudf::detail::unary_operation(variance, unary_operator::SQRT, stream, mr); - dense_results->add_result(col, agg, std::move(result)); - } -}; - -/** - * @brief Gather sparse results into dense using `gather_map` and add to - * `dense_cache` - * - * @see groupby_null_templated() - */ -template -void sparse_to_dense_results(table_view const& keys, - host_span requests, - cudf::detail::result_cache* sparse_results, - cudf::detail::result_cache* dense_results, - device_span gather_map, - SetType set, - bool keys_have_nulls, - null_policy include_null_keys, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - auto row_bitmask = - cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first; - bool skip_key_rows_with_nulls = keys_have_nulls and include_null_keys == null_policy::EXCLUDE; - bitmask_type const* row_bitmask_ptr = - skip_key_rows_with_nulls ? static_cast(row_bitmask.data()) : nullptr; - - for (auto const& request : requests) { - auto const& agg_v = request.aggregations; - auto const& col = request.values; - - // Given an aggregation, this will get the result from sparse_results and - // convert and return dense, compacted result - auto finalizer = hash_compound_agg_finalizer( - col, sparse_results, dense_results, gather_map, set, row_bitmask_ptr, stream, mr); - for (auto&& agg : agg_v) { - agg->finalize(finalizer); - } - } -} - -// make table that will hold sparse results -auto create_sparse_results_table(table_view const& flattened_values, - std::vector aggs, - rmm::cuda_stream_view stream) -{ - // TODO single allocation - room for performance improvement - std::vector> sparse_columns; - std::transform( - flattened_values.begin(), - flattened_values.end(), - aggs.begin(), - std::back_inserter(sparse_columns), - [stream](auto const& col, auto const& agg) { - bool nullable = - (agg == aggregation::COUNT_VALID or agg == aggregation::COUNT_ALL) - ? false - : (col.has_nulls() or agg == aggregation::VARIANCE or agg == aggregation::STD); - auto mask_flag = (nullable) ? mask_state::ALL_NULL : mask_state::UNALLOCATED; - - auto col_type = cudf::is_dictionary(col.type()) - ? cudf::dictionary_column_view(col).keys().type() - : col.type(); - - return make_fixed_width_column( - cudf::detail::target_type(col_type, agg), col.size(), mask_flag, stream); - }); - - table sparse_table(std::move(sparse_columns)); - mutable_table_view table_view = sparse_table.mutable_view(); - cudf::detail::initialize_with_identity(table_view, aggs, stream); - return sparse_table; -} - -/** - * @brief Computes all aggregations from `requests` that require a single pass - * over the data and stores the results in `sparse_results` - */ -template -void compute_single_pass_aggs(table_view const& keys, - host_span requests, - cudf::detail::result_cache* sparse_results, - SetType set, - bool keys_have_nulls, - null_policy include_null_keys, - rmm::cuda_stream_view stream) -{ - // flatten the aggs to a table that can be operated on by aggregate_row - auto const [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests); - - // make table that will hold sparse results - table sparse_table = create_sparse_results_table(flattened_values, agg_kinds, stream); - // prepare to launch kernel to do the actual aggregation - auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); - auto d_values = table_device_view::create(flattened_values, stream); - auto const d_aggs = cudf::detail::make_device_uvector_async( - agg_kinds, stream, cudf::get_current_device_resource_ref()); - auto const skip_key_rows_with_nulls = - keys_have_nulls and include_null_keys == null_policy::EXCLUDE; - - auto row_bitmask = - skip_key_rows_with_nulls - ? cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first - : rmm::device_buffer{}; - - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - keys.num_rows(), - hash::compute_single_pass_aggs_fn{set, - *d_values, - *d_sparse_table, - d_aggs.data(), - static_cast(row_bitmask.data()), - skip_key_rows_with_nulls}); - // Add results back to sparse_results cache - auto sparse_result_cols = sparse_table.release(); - for (size_t i = 0; i < aggs.size(); i++) { - // Note that the cache will make a copy of this temporary aggregation - sparse_results->add_result( - flattened_values.column(i), *aggs[i], std::move(sparse_result_cols[i])); - } -} - -/** - * @brief Computes and returns a device vector containing all populated keys in - * `map`. - */ -template -rmm::device_uvector extract_populated_keys(SetType const& key_set, - size_type num_keys, - rmm::cuda_stream_view stream) -{ - rmm::device_uvector populated_keys(num_keys, stream); - auto const keys_end = key_set.retrieve_all(populated_keys.begin(), stream.value()); - - populated_keys.resize(std::distance(populated_keys.begin(), keys_end), stream); - return populated_keys; -} - -/** - * @brief Computes groupby using hash table. - * - * First, we create a hash table that stores the indices of unique rows in - * `keys`. The upper limit on the number of values in this map is the number - * of rows in `keys`. - * - * To store the results of aggregations, we create temporary sparse columns - * which have the same size as input value columns. Using the hash map, we - * determine the location within the sparse column to write the result of the - * aggregation into. - * - * The sparse column results of all aggregations are stored into the cache - * `sparse_results`. This enables the use of previously calculated results in - * other aggregations. - * - * All the aggregations which can be computed in a single pass are computed - * first, in a combined kernel. Then using these results, aggregations that - * require multiple passes, will be computed. - * - * Finally, using the hash map, we generate a vector of indices of populated - * values in sparse result columns. Then, for each aggregation originally - * requested in `requests`, we gather sparse results into a column of dense - * results using the aforementioned index vector. Dense results are stored into - * the in/out parameter `cache`. - */ -std::unique_ptr
groupby(table_view const& keys, - host_span requests, - cudf::detail::result_cache* cache, - bool const keys_have_nulls, - null_policy const include_null_keys, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +std::unique_ptr
dispatch_groupby(table_view const& keys, + host_span requests, + cudf::detail::result_cache* cache, + bool const keys_have_nulls, + null_policy const include_null_keys, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { - // convert to int64_t to avoid potential overflow with large `keys` - auto const num_keys = static_cast(keys.num_rows()); - auto const null_keys_are_equal = null_equality::EQUAL; - auto const has_null = nullate::DYNAMIC{cudf::has_nested_nulls(keys)}; + auto const null_keys_are_equal = null_equality::EQUAL; + auto const has_null = nullate::DYNAMIC{cudf::has_nested_nulls(keys)}; + auto const skip_rows_with_nulls = keys_have_nulls and include_null_keys == null_policy::EXCLUDE; auto preprocessed_keys = cudf::experimental::row::hash::preprocessed_table::create(keys, stream); auto const comparator = cudf::experimental::row::equality::self_comparator{preprocessed_keys}; auto const row_hash = cudf::experimental::row::hash::row_hasher{std::move(preprocessed_keys)}; auto const d_row_hash = row_hash.device_hasher(has_null); - // Cache of sparse results where the location of aggregate value in each - // column is indexed by the hash set - cudf::detail::result_cache sparse_results(requests.size()); - - auto const comparator_helper = [&](auto const d_key_equal) { - auto const set = cuco::static_set{ - num_keys, - 0.5, // desired load factor - cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, - d_key_equal, - probing_scheme_type{d_row_hash}, - cuco::thread_scope_device, - cuco::storage<1>{}, - cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, - stream.value()}; - - // Compute all single pass aggs first - compute_single_pass_aggs(keys, - requests, - &sparse_results, - set.ref(cuco::insert_and_find), - keys_have_nulls, - include_null_keys, - stream); - - // Extract the populated indices from the hash set and create a gather map. - // Gathering using this map from sparse results will give dense results. - auto gather_map = extract_populated_keys(set, keys.num_rows(), stream); - - // Compact all results from sparse_results and insert into cache - sparse_to_dense_results(keys, - requests, - &sparse_results, - cache, - gather_map, - set.ref(cuco::find), - keys_have_nulls, - include_null_keys, - stream, - mr); - - return cudf::detail::gather(keys, - gather_map, - out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); - }; - if (cudf::detail::has_nested_columns(keys)) { - auto const d_key_equal = comparator.equal_to(has_null, null_keys_are_equal); - return comparator_helper(d_key_equal); + auto const d_row_equal = comparator.equal_to(has_null, null_keys_are_equal); + return compute_groupby( + keys, requests, skip_rows_with_nulls, d_row_equal, d_row_hash, cache, stream, mr); } else { - auto const d_key_equal = comparator.equal_to(has_null, null_keys_are_equal); - return comparator_helper(d_key_equal); + auto const d_row_equal = comparator.equal_to(has_null, null_keys_are_equal); + return compute_groupby( + keys, requests, skip_rows_with_nulls, d_row_equal, d_row_hash, cache, stream, mr); } } - } // namespace /** @@ -559,11 +151,8 @@ std::pair, std::vector> groupby( cudf::detail::result_cache cache(requests.size()); std::unique_ptr
unique_keys = - groupby(keys, requests, &cache, cudf::has_nulls(keys), include_null_keys, stream, mr); + dispatch_groupby(keys, requests, &cache, cudf::has_nulls(keys), include_null_keys, stream, mr); return std::pair(std::move(unique_keys), extract_results(requests, cache, stream, mr)); } -} // namespace hash -} // namespace detail -} // namespace groupby -} // namespace cudf +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu new file mode 100644 index 00000000000..37a61c1a22c --- /dev/null +++ b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "hash_compound_agg_finalizer.hpp" +#include "helpers.cuh" +#include "var_hash_functor.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +namespace cudf::groupby::detail::hash { +template +hash_compound_agg_finalizer::hash_compound_agg_finalizer( + column_view col, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + SetType set, + bitmask_type const* row_bitmask, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) + : col(col), + sparse_results(sparse_results), + dense_results(dense_results), + gather_map(gather_map), + set(set), + row_bitmask(row_bitmask), + stream(stream), + mr(mr) +{ + result_type = + cudf::is_dictionary(col.type()) ? cudf::dictionary_column_view(col).keys().type() : col.type(); +} + +template +auto hash_compound_agg_finalizer::to_dense_agg_result(cudf::aggregation const& agg) +{ + auto s = sparse_results->get_result(col, agg); + auto dense_result_table = cudf::detail::gather(table_view({std::move(s)}), + gather_map, + out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); + return std::move(dense_result_table->release()[0]); +} + +template +auto hash_compound_agg_finalizer::gather_argminmax(aggregation const& agg) +{ + auto arg_result = to_dense_agg_result(agg); + // We make a view of ARG(MIN/MAX) result without a null mask and gather + // using this map. The values in data buffer of ARG(MIN/MAX) result + // corresponding to null values was initialized to ARG(MIN/MAX)_SENTINEL + // which is an out of bounds index value (-1) and causes the gathered + // value to be null. + column_view null_removed_map( + data_type(type_to_id()), + arg_result->size(), + static_cast(arg_result->view().template data()), + nullptr, + 0); + auto gather_argminmax = + cudf::detail::gather(table_view({col}), + null_removed_map, + arg_result->nullable() ? cudf::out_of_bounds_policy::NULLIFY + : cudf::out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); + return std::move(gather_argminmax->release()[0]); +} + +template +void hash_compound_agg_finalizer::visit(cudf::aggregation const& agg) +{ + if (dense_results->has_result(col, agg)) return; + dense_results->add_result(col, agg, to_dense_agg_result(agg)); +} + +template +void hash_compound_agg_finalizer::visit(cudf::detail::min_aggregation const& agg) +{ + if (dense_results->has_result(col, agg)) return; + if (result_type.id() == type_id::STRING) { + auto transformed_agg = make_argmin_aggregation(); + dense_results->add_result(col, agg, gather_argminmax(*transformed_agg)); + } else { + dense_results->add_result(col, agg, to_dense_agg_result(agg)); + } +} + +template +void hash_compound_agg_finalizer::visit(cudf::detail::max_aggregation const& agg) +{ + if (dense_results->has_result(col, agg)) return; + + if (result_type.id() == type_id::STRING) { + auto transformed_agg = make_argmax_aggregation(); + dense_results->add_result(col, agg, gather_argminmax(*transformed_agg)); + } else { + dense_results->add_result(col, agg, to_dense_agg_result(agg)); + } +} + +template +void hash_compound_agg_finalizer::visit(cudf::detail::mean_aggregation const& agg) +{ + if (dense_results->has_result(col, agg)) return; + + auto sum_agg = make_sum_aggregation(); + auto count_agg = make_count_aggregation(); + this->visit(*sum_agg); + this->visit(*count_agg); + column_view sum_result = dense_results->get_result(col, *sum_agg); + column_view count_result = dense_results->get_result(col, *count_agg); + + auto result = + cudf::detail::binary_operation(sum_result, + count_result, + binary_operator::DIV, + cudf::detail::target_type(result_type, aggregation::MEAN), + stream, + mr); + dense_results->add_result(col, agg, std::move(result)); +} + +template +void hash_compound_agg_finalizer::visit(cudf::detail::var_aggregation const& agg) +{ + if (dense_results->has_result(col, agg)) return; + + auto sum_agg = make_sum_aggregation(); + auto count_agg = make_count_aggregation(); + this->visit(*sum_agg); + this->visit(*count_agg); + column_view sum_result = sparse_results->get_result(col, *sum_agg); + column_view count_result = sparse_results->get_result(col, *count_agg); + + auto values_view = column_device_view::create(col, stream); + auto sum_view = column_device_view::create(sum_result, stream); + auto count_view = column_device_view::create(count_result, stream); + + auto var_result = make_fixed_width_column( + cudf::detail::target_type(result_type, agg.kind), col.size(), mask_state::ALL_NULL, stream); + auto var_result_view = mutable_column_device_view::create(var_result->mutable_view(), stream); + mutable_table_view var_table_view{{var_result->mutable_view()}}; + cudf::detail::initialize_with_identity(var_table_view, {agg.kind}, stream); + + thrust::for_each_n( + rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + col.size(), + var_hash_functor{ + set, row_bitmask, *var_result_view, *values_view, *sum_view, *count_view, agg._ddof}); + sparse_results->add_result(col, agg, std::move(var_result)); + dense_results->add_result(col, agg, to_dense_agg_result(agg)); +} + +template +void hash_compound_agg_finalizer::visit(cudf::detail::std_aggregation const& agg) +{ + if (dense_results->has_result(col, agg)) return; + auto var_agg = make_variance_aggregation(agg._ddof); + this->visit(*dynamic_cast(var_agg.get())); + column_view variance = dense_results->get_result(col, *var_agg); + + auto result = cudf::detail::unary_operation(variance, unary_operator::SQRT, stream, mr); + dense_results->add_result(col, agg, std::move(result)); +} + +template class hash_compound_agg_finalizer>; +template class hash_compound_agg_finalizer>; +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/hash_compound_agg_finalizer.hpp b/cpp/src/groupby/hash/hash_compound_agg_finalizer.hpp new file mode 100644 index 00000000000..8bee1a92c40 --- /dev/null +++ b/cpp/src/groupby/hash/hash_compound_agg_finalizer.hpp @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +#include +#include + +namespace cudf::groupby::detail::hash { +template +class hash_compound_agg_finalizer final : public cudf::detail::aggregation_finalizer { + column_view col; + data_type result_type; + cudf::detail::result_cache* sparse_results; + cudf::detail::result_cache* dense_results; + device_span gather_map; + SetType set; + bitmask_type const* __restrict__ row_bitmask; + rmm::cuda_stream_view stream; + rmm::device_async_resource_ref mr; + + public: + using cudf::detail::aggregation_finalizer::visit; + + hash_compound_agg_finalizer(column_view col, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + SetType set, + bitmask_type const* row_bitmask, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + + auto to_dense_agg_result(cudf::aggregation const& agg); + + // Enables conversion of ARGMIN/ARGMAX into MIN/MAX + auto gather_argminmax(cudf::aggregation const& agg); + + // Declare overloads for each kind of aggregation to dispatch + void visit(cudf::aggregation const& agg) override; + + void visit(cudf::detail::min_aggregation const& agg) override; + + void visit(cudf::detail::max_aggregation const& agg) override; + + void visit(cudf::detail::mean_aggregation const& agg) override; + + void visit(cudf::detail::var_aggregation const& agg) override; + + void visit(cudf::detail::std_aggregation const& agg) override; +}; +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/helpers.cuh b/cpp/src/groupby/hash/helpers.cuh new file mode 100644 index 00000000000..0d117ca35b3 --- /dev/null +++ b/cpp/src/groupby/hash/helpers.cuh @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +#include + +namespace cudf::groupby::detail::hash { +// TODO: similar to `contains_table`, using larger CG size like 2 or 4 for nested +// types and `cg_size = 1`for flat data to improve performance +/// Number of threads to handle each input element +CUDF_HOST_DEVICE auto constexpr GROUPBY_CG_SIZE = 1; + +/// Number of slots per thread +CUDF_HOST_DEVICE auto constexpr GROUPBY_WINDOW_SIZE = 1; + +/// Thread block size +CUDF_HOST_DEVICE auto constexpr GROUPBY_BLOCK_SIZE = 128; + +/// Threshold cardinality to switch between shared memory aggregations and global memory +/// aggregations +CUDF_HOST_DEVICE auto constexpr GROUPBY_CARDINALITY_THRESHOLD = 128; + +// We add additional `block_size`, because after the number of elements in the local hash set +// exceeds the threshold, all threads in the thread block can still insert one more element. +/// The maximum number of elements handled per block +CUDF_HOST_DEVICE auto constexpr GROUPBY_SHM_MAX_ELEMENTS = + GROUPBY_CARDINALITY_THRESHOLD + GROUPBY_BLOCK_SIZE; + +// GROUPBY_SHM_MAX_ELEMENTS with 0.7 occupancy +/// Shared memory hash set extent type +using shmem_extent_t = + cuco::extent(static_cast(GROUPBY_SHM_MAX_ELEMENTS) * 1.43)>; + +/// Number of windows needed by each shared memory hash set +CUDF_HOST_DEVICE auto constexpr window_extent = + cuco::make_window_extent(shmem_extent_t{}); + +/** + * @brief Returns the smallest multiple of 8 that is greater than or equal to the given integer. + */ +CUDF_HOST_DEVICE constexpr std::size_t round_to_multiple_of_8(std::size_t num) +{ + std::size_t constexpr base = 8; + return cudf::util::div_rounding_up_safe(num, base) * base; +} + +using row_hash_t = + cudf::experimental::row::hash::device_row_hasher; + +/// Probing scheme type used by groupby hash table +using probing_scheme_t = cuco::linear_probing; + +using row_comparator_t = cudf::experimental::row::equality::device_row_comparator< + false, + cudf::nullate::DYNAMIC, + cudf::experimental::row::equality::nan_equal_physical_equality_comparator>; + +using nullable_row_comparator_t = cudf::experimental::row::equality::device_row_comparator< + true, + cudf::nullate::DYNAMIC, + cudf::experimental::row::equality::nan_equal_physical_equality_comparator>; + +using global_set_t = cuco::static_set, + cuda::thread_scope_device, + row_comparator_t, + probing_scheme_t, + cudf::detail::cuco_allocator, + cuco::storage>; + +using nullable_global_set_t = cuco::static_set, + cuda::thread_scope_device, + nullable_row_comparator_t, + probing_scheme_t, + cudf::detail::cuco_allocator, + cuco::storage>; + +template +using hash_set_ref_t = cuco::static_set_ref< + cudf::size_type, + cuda::thread_scope_device, + row_comparator_t, + probing_scheme_t, + cuco::aow_storage_ref>, + Op>; + +template +using nullable_hash_set_ref_t = cuco::static_set_ref< + cudf::size_type, + cuda::thread_scope_device, + nullable_row_comparator_t, + probing_scheme_t, + cuco::aow_storage_ref>, + Op>; +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/groupby_kernels.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh similarity index 95% rename from cpp/src/groupby/hash/groupby_kernels.cuh rename to cpp/src/groupby/hash/single_pass_functors.cuh index 86f4d76487f..73791b3aa71 100644 --- a/cpp/src/groupby/hash/groupby_kernels.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh @@ -13,7 +13,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - #pragma once #include @@ -21,12 +20,9 @@ #include #include -#include +#include -namespace cudf { -namespace groupby { -namespace detail { -namespace hash { +namespace cudf::groupby::detail::hash { /** * @brief Computes single-pass aggregations and store results into a sparse `output_values` table, * and populate `set` with indices of unique keys @@ -102,8 +98,4 @@ struct compute_single_pass_aggs_fn { } } }; - -} // namespace hash -} // namespace detail -} // namespace groupby -} // namespace cudf +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/sparse_to_dense_results.cu b/cpp/src/groupby/hash/sparse_to_dense_results.cu new file mode 100644 index 00000000000..e1c2cd22309 --- /dev/null +++ b/cpp/src/groupby/hash/sparse_to_dense_results.cu @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "hash_compound_agg_finalizer.hpp" +#include "helpers.cuh" + +#include +#include +#include +#include + +#include +#include + +namespace cudf::groupby::detail::hash { +template +void sparse_to_dense_results(host_span requests, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + SetRef set, + bitmask_type const* row_bitmask, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + for (auto const& request : requests) { + auto const& agg_v = request.aggregations; + auto const& col = request.values; + + // Given an aggregation, this will get the result from sparse_results and + // convert and return dense, compacted result + auto finalizer = hash_compound_agg_finalizer( + col, sparse_results, dense_results, gather_map, set, row_bitmask, stream, mr); + for (auto&& agg : agg_v) { + agg->finalize(finalizer); + } + } +} + +template void sparse_to_dense_results>( + host_span requests, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + hash_set_ref_t set, + bitmask_type const* row_bitmask, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + +template void sparse_to_dense_results>( + host_span requests, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + nullable_hash_set_ref_t set, + bitmask_type const* row_bitmask, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/sparse_to_dense_results.hpp b/cpp/src/groupby/hash/sparse_to_dense_results.hpp new file mode 100644 index 00000000000..3a2b3090b99 --- /dev/null +++ b/cpp/src/groupby/hash/sparse_to_dense_results.hpp @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +#include +#include + +namespace cudf::groupby::detail::hash { +/** + * @brief Gather sparse aggregation results into dense using `gather_map` and add to + * `dense_results` + * + * @tparam SetRef Device hash set ref type + * + * @param[in] requests The set of columns to aggregate and the aggregations to perform + * @param[in] sparse_results Sparse aggregation results + * @param[out] dense_results Dense aggregation results + * @param[in] gather_map Gather map indicating valid elements in `sparse_results` + * @param[in] set Device hash set ref + * @param[in] row_bitmask Bitmask indicating the validity of input keys + * @param[in] stream CUDA stream used for device memory operations and kernel launches + * @param[in] mr Device memory resource used to allocate the returned table + */ +template +void sparse_to_dense_results(host_span requests, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + SetRef set, + bitmask_type const* row_bitmask, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/sort/group_quantiles.cu b/cpp/src/groupby/sort/group_quantiles.cu index 82d557b9f7e..d6c900fb689 100644 --- a/cpp/src/groupby/sort/group_quantiles.cu +++ b/cpp/src/groupby/sort/group_quantiles.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -108,7 +109,7 @@ struct quantiles_functor { auto values_view = column_device_view::create(values, stream); auto group_size_view = column_device_view::create(group_sizes, stream); auto result_view = mutable_column_device_view::create(result->mutable_view(), stream); - auto null_count = rmm::device_scalar(0, stream, mr); + auto null_count = cudf::detail::device_scalar(0, stream, mr); // For each group, calculate quantile if (!cudf::is_dictionary(values.type())) { diff --git a/cpp/src/groupby/sort/group_std.cu b/cpp/src/groupby/sort/group_std.cu index 86ee20dbbe2..c3dfac46502 100644 --- a/cpp/src/groupby/sort/group_std.cu +++ b/cpp/src/groupby/sort/group_std.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -27,7 +28,6 @@ #include #include -#include #include #include @@ -134,7 +134,7 @@ struct var_functor { // set nulls auto result_view = mutable_column_device_view::create(*result, stream); - auto null_count = rmm::device_scalar(0, stream, mr); + auto null_count = cudf::detail::device_scalar(0, stream, mr); auto d_null_count = null_count.data(); thrust::for_each_n( rmm::exec_policy(stream), diff --git a/cpp/src/interop/to_arrow_device.cu b/cpp/src/interop/to_arrow_device.cu index a2874b46b06..fc1b0226a48 100644 --- a/cpp/src/interop/to_arrow_device.cu +++ b/cpp/src/interop/to_arrow_device.cu @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -35,7 +36,6 @@ #include #include -#include #include #include @@ -60,7 +60,7 @@ template struct is_device_scalar : public std::false_type {}; template -struct is_device_scalar> : public std::true_type {}; +struct is_device_scalar> : public std::true_type {}; template struct is_device_uvector : public std::false_type {}; @@ -232,10 +232,10 @@ int dispatch_to_arrow_device::operator()(cudf::column&& colum // in the offsets buffer. While some arrow implementations may accept a zero-sized // offsets buffer, best practices would be to allocate the buffer with the single value. if (nanoarrow_type == NANOARROW_TYPE_STRING) { - auto zero = std::make_unique>(0, stream, mr); + auto zero = std::make_unique>(0, stream, mr); NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get())); } else { - auto zero = std::make_unique>(0, stream, mr); + auto zero = std::make_unique>(0, stream, mr); NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get())); } @@ -466,10 +466,10 @@ int dispatch_to_arrow_device_view::operator()(ArrowArray* out if (column.size() == 0) { // https://github.com/rapidsai/cudf/pull/15047#discussion_r1546528552 if (nanoarrow_type == NANOARROW_TYPE_LARGE_STRING) { - auto zero = std::make_unique>(0, stream, mr); + auto zero = std::make_unique>(0, stream, mr); NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get())); } else { - auto zero = std::make_unique>(0, stream, mr); + auto zero = std::make_unique>(0, stream, mr); NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get())); } diff --git a/cpp/src/io/json/host_tree_algorithms.cu b/cpp/src/io/json/host_tree_algorithms.cu index f7e8134b68d..7ee652e0239 100644 --- a/cpp/src/io/json/host_tree_algorithms.cu +++ b/cpp/src/io/json/host_tree_algorithms.cu @@ -134,12 +134,13 @@ std::vector copy_strings_to_host_sync( // build std::string vector from chars and offsets std::vector host_data; host_data.reserve(col.size()); - std::transform( - std::begin(h_offsets), - std::end(h_offsets) - 1, - std::begin(h_offsets) + 1, - std::back_inserter(host_data), - [&](auto start, auto end) { return std::string(h_chars.data() + start, end - start); }); + std::transform(std::begin(h_offsets), + std::end(h_offsets) - 1, + std::begin(h_offsets) + 1, + std::back_inserter(host_data), + [&h_chars](auto start, auto end) { + return std::string(h_chars.data() + start, end - start); + }); return host_data; }; return to_host(d_column_names->view()); @@ -173,633 +174,79 @@ rmm::device_uvector is_all_nulls_each_column(device_span auto parse_opt = parsing_options(options, stream); thrust::for_each_n( rmm::exec_policy_nosync(stream), - thrust::counting_iterator(0), - num_nodes, - [options = parse_opt.view(), - data = input.data(), - column_categories = d_column_tree.node_categories.begin(), - col_ids = col_ids.begin(), - range_begin = tree.node_range_begin.begin(), - range_end = tree.node_range_end.begin(), - is_all_nulls = is_all_nulls.begin()] __device__(size_type i) { - auto const node_category = column_categories[col_ids[i]]; - if (node_category == NC_STR or node_category == NC_VAL) { - auto const is_null_literal = serialized_trie_contains( - options.trie_na, - {data + range_begin[i], static_cast(range_end[i] - range_begin[i])}); - if (!is_null_literal) is_all_nulls[col_ids[i]] = false; - } - }); - return is_all_nulls; -} - -NodeIndexT get_row_array_parent_col_id(device_span col_ids, - bool is_enabled_lines, - rmm::cuda_stream_view stream) -{ - NodeIndexT value = parent_node_sentinel; - if (!col_ids.empty()) { - auto const list_node_index = is_enabled_lines ? 0 : 1; - CUDF_CUDA_TRY(cudaMemcpyAsync(&value, - col_ids.data() + list_node_index, - sizeof(NodeIndexT), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); - } - return value; -} -/** - * @brief Holds member data pointers of `d_json_column` - * - */ -struct json_column_data { - using row_offset_t = json_column::row_offset_t; - row_offset_t* string_offsets; - row_offset_t* string_lengths; - row_offset_t* child_offsets; - bitmask_type* validity; -}; - -using hashmap_of_device_columns = - std::unordered_map>; - -std::pair, hashmap_of_device_columns> build_tree( - device_json_column& root, - host_span is_str_column_all_nulls, - tree_meta_t& d_column_tree, - device_span d_unique_col_ids, - device_span d_max_row_offsets, - std::vector const& column_names, - NodeIndexT row_array_parent_col_id, - bool is_array_of_arrays, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); -void scatter_offsets(tree_meta_t const& tree, - device_span col_ids, - device_span row_offsets, - device_span node_ids, - device_span sorted_col_ids, // Reuse this for parent_col_ids - tree_meta_t const& d_column_tree, - host_span ignore_vals, - hashmap_of_device_columns const& columns, - rmm::cuda_stream_view stream); - -/** - * @brief Constructs `d_json_column` from node tree representation - * Newly constructed columns are inserted into `root`'s children. - * `root` must be a list type. - * - * @param input Input JSON string device data - * @param tree Node tree representation of the JSON string - * @param col_ids Column ids of the nodes in the tree - * @param row_offsets Row offsets of the nodes in the tree - * @param root Root node of the `d_json_column` tree - * @param is_array_of_arrays Whether the tree is an array of arrays - * @param options Parsing options specifying the parsing behaviour - * options affecting behaviour are - * is_enabled_lines: Whether the input is a line-delimited JSON - * is_enabled_mixed_types_as_string: Whether to enable reading mixed types as string - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the device memory - * of child_offets and validity members of `d_json_column` - */ -void make_device_json_column(device_span input, - tree_meta_t const& tree, - device_span col_ids, - device_span row_offsets, - device_json_column& root, - bool is_array_of_arrays, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - bool const is_enabled_lines = options.is_enabled_lines(); - bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string(); - // make a copy - auto sorted_col_ids = cudf::detail::make_device_uvector_async( - col_ids, stream, cudf::get_current_device_resource_ref()); - - // sort by {col_id} on {node_ids} stable - rmm::device_uvector node_ids(col_ids.size(), stream); - thrust::sequence(rmm::exec_policy_nosync(stream), node_ids.begin(), node_ids.end()); - thrust::stable_sort_by_key(rmm::exec_policy_nosync(stream), - sorted_col_ids.begin(), - sorted_col_ids.end(), - node_ids.begin()); - - NodeIndexT const row_array_parent_col_id = - get_row_array_parent_col_id(col_ids, is_enabled_lines, stream); - - // 1. gather column information. - auto [d_column_tree, d_unique_col_ids, d_max_row_offsets] = - reduce_to_column_tree(tree, - col_ids, - sorted_col_ids, - node_ids, - row_offsets, - is_array_of_arrays, - row_array_parent_col_id, - stream); - auto num_columns = d_unique_col_ids.size(); - std::vector column_names = copy_strings_to_host_sync( - input, d_column_tree.node_range_begin, d_column_tree.node_range_end, stream); - // array of arrays column names - if (is_array_of_arrays) { - auto const unique_col_ids = cudf::detail::make_host_vector_async(d_unique_col_ids, stream); - auto const column_parent_ids = - cudf::detail::make_host_vector_async(d_column_tree.parent_node_ids, stream); - TreeDepthT const row_array_children_level = is_enabled_lines ? 1 : 2; - auto values_column_indices = - get_values_column_indices(row_array_children_level, tree, col_ids, num_columns, stream); - auto h_values_column_indices = - cudf::detail::make_host_vector_sync(values_column_indices, stream); - std::transform(unique_col_ids.begin(), - unique_col_ids.end(), - column_names.cbegin(), - column_names.begin(), - [&h_values_column_indices, &column_parent_ids, row_array_parent_col_id]( - auto col_id, auto name) mutable { - return column_parent_ids[col_id] == row_array_parent_col_id - ? std::to_string(h_values_column_indices[col_id]) - : name; - }); - } - - auto const is_str_column_all_nulls = [&, &column_tree = d_column_tree]() { - if (is_enabled_mixed_types_as_string) { - return cudf::detail::make_std_vector_sync( - is_all_nulls_each_column(input, column_tree, tree, col_ids, options, stream), stream); - } - return std::vector(); - }(); - auto const [ignore_vals, columns] = build_tree(root, - is_str_column_all_nulls, - d_column_tree, - d_unique_col_ids, - d_max_row_offsets, - column_names, - row_array_parent_col_id, - is_array_of_arrays, - options, - stream, - mr); - - scatter_offsets(tree, - col_ids, - row_offsets, - node_ids, - sorted_col_ids, - d_column_tree, - ignore_vals, - columns, - stream); -} - -std::pair, hashmap_of_device_columns> build_tree( - device_json_column& root, - host_span is_str_column_all_nulls, - tree_meta_t& d_column_tree, - device_span d_unique_col_ids, - device_span d_max_row_offsets, - std::vector const& column_names, - NodeIndexT row_array_parent_col_id, - bool is_array_of_arrays, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string(); - auto unique_col_ids = cudf::detail::make_host_vector_async(d_unique_col_ids, stream); - auto column_categories = - cudf::detail::make_host_vector_async(d_column_tree.node_categories, stream); - auto const column_parent_ids = - cudf::detail::make_host_vector_async(d_column_tree.parent_node_ids, stream); - auto column_range_beg = - cudf::detail::make_host_vector_async(d_column_tree.node_range_begin, stream); - auto const max_row_offsets = cudf::detail::make_host_vector_async(d_max_row_offsets, stream); - auto num_columns = d_unique_col_ids.size(); - stream.synchronize(); - - auto to_json_col_type = [](auto category) { - switch (category) { - case NC_STRUCT: return json_col_t::StructColumn; - case NC_LIST: return json_col_t::ListColumn; - case NC_STR: [[fallthrough]]; - case NC_VAL: return json_col_t::StringColumn; - default: return json_col_t::Unknown; - } - }; - auto init_to_zero = [stream](auto& v) { - thrust::uninitialized_fill(rmm::exec_policy_nosync(stream), v.begin(), v.end(), 0); - }; - - auto initialize_json_columns = [&](auto i, auto& col, auto column_category) { - if (column_category == NC_ERR || column_category == NC_FN) { - return; - } else if (column_category == NC_VAL || column_category == NC_STR) { - col.string_offsets.resize(max_row_offsets[i] + 1, stream); - col.string_lengths.resize(max_row_offsets[i] + 1, stream); - init_to_zero(col.string_offsets); - init_to_zero(col.string_lengths); - } else if (column_category == NC_LIST) { - col.child_offsets.resize(max_row_offsets[i] + 2, stream); - init_to_zero(col.child_offsets); - } - col.num_rows = max_row_offsets[i] + 1; - col.validity = - cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr); - col.type = to_json_col_type(column_category); - }; - - auto reinitialize_as_string = [&](auto i, auto& col) { - col.string_offsets.resize(max_row_offsets[i] + 1, stream); - col.string_lengths.resize(max_row_offsets[i] + 1, stream); - init_to_zero(col.string_offsets); - init_to_zero(col.string_lengths); - col.num_rows = max_row_offsets[i] + 1; - col.validity = - cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr); - col.type = json_col_t::StringColumn; - // destroy references of all child columns after this step, by calling remove_child_columns - }; - - path_from_tree tree_path{column_categories, - column_parent_ids, - column_names, - is_array_of_arrays, - row_array_parent_col_id}; - - // 2. generate nested columns tree and its device_memory - // reorder unique_col_ids w.r.t. column_range_begin for order of column to be in field order. - auto h_range_col_id_it = - thrust::make_zip_iterator(column_range_beg.begin(), unique_col_ids.begin()); - std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { - return thrust::get<0>(a) < thrust::get<0>(b); - }); - - // use hash map because we may skip field name's col_ids - hashmap_of_device_columns columns; - // map{parent_col_id, child_col_name}> = child_col_id, used for null value column tracking - std::map, NodeIndexT> mapped_columns; - // find column_ids which are values, but should be ignored in validity - auto ignore_vals = cudf::detail::make_host_vector(num_columns, stream); - std::fill(ignore_vals.begin(), ignore_vals.end(), false); - std::vector is_mixed_type_column(num_columns, 0); - std::vector is_pruned(num_columns, 0); - // for columns that are not mixed type but have been forced as string - std::vector forced_as_string_column(num_columns); - columns.try_emplace(parent_node_sentinel, std::ref(root)); - - std::function remove_child_columns = - [&](NodeIndexT this_col_id, device_json_column& col) { - for (auto const& col_name : col.column_order) { - auto child_id = mapped_columns[{this_col_id, col_name}]; - is_mixed_type_column[child_id] = 1; - remove_child_columns(child_id, col.child_columns.at(col_name)); - mapped_columns.erase({this_col_id, col_name}); - columns.erase(child_id); - } - col.child_columns.clear(); // their references are deleted above. - col.column_order.clear(); - }; - - auto name_and_parent_index = [&is_array_of_arrays, - &row_array_parent_col_id, - &column_parent_ids, - &column_categories, - &column_names](auto this_col_id) { - std::string name = ""; - auto parent_col_id = column_parent_ids[this_col_id]; - if (parent_col_id == parent_node_sentinel || column_categories[parent_col_id] == NC_LIST) { - if (is_array_of_arrays && parent_col_id == row_array_parent_col_id) { - name = column_names[this_col_id]; - } else { - name = list_child_name; - } - } else if (column_categories[parent_col_id] == NC_FN) { - auto field_name_col_id = parent_col_id; - parent_col_id = column_parent_ids[parent_col_id]; - name = column_names[field_name_col_id]; - } else { - CUDF_FAIL("Unexpected parent column category"); - } - return std::pair{name, parent_col_id}; - }; - - // Prune columns that are not required to be parsed. - if (options.is_enabled_prune_columns()) { - for (auto const this_col_id : unique_col_ids) { - if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) { - continue; - } - // Struct, List, String, Value - auto [name, parent_col_id] = name_and_parent_index(this_col_id); - // get path of this column, and get its dtype if present in options - auto const nt = tree_path.get_path(this_col_id); - std::optional const user_dtype = get_path_data_type(nt, options); - if (!user_dtype.has_value() and parent_col_id != parent_node_sentinel) { - is_pruned[this_col_id] = 1; - continue; - } else { - // make sure all its parents are not pruned. - while (parent_col_id != parent_node_sentinel and is_pruned[parent_col_id] == 1) { - is_pruned[parent_col_id] = 0; - parent_col_id = column_parent_ids[parent_col_id]; - } - } - } - } - - // Build the column tree, also, handles mixed types. - for (auto const this_col_id : unique_col_ids) { - if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) { - continue; - } - // Struct, List, String, Value - auto [name, parent_col_id] = name_and_parent_index(this_col_id); - - // if parent is mixed type column or this column is pruned or if parent - // has been forced as string, ignore this column. - if (parent_col_id != parent_node_sentinel && - (is_mixed_type_column[parent_col_id] || is_pruned[this_col_id]) || - forced_as_string_column[parent_col_id]) { - ignore_vals[this_col_id] = true; - if (is_mixed_type_column[parent_col_id]) { is_mixed_type_column[this_col_id] = 1; } - if (forced_as_string_column[parent_col_id]) { forced_as_string_column[this_col_id] = true; } - continue; - } - - // If the child is already found, - // replace if this column is a nested column and the existing was a value column - // ignore this column if this column is a value column and the existing was a nested column - auto it = columns.find(parent_col_id); - CUDF_EXPECTS(it != columns.end(), "Parent column not found"); - auto& parent_col = it->second.get(); - bool replaced = false; - if (mapped_columns.count({parent_col_id, name}) > 0) { - auto const old_col_id = mapped_columns[{parent_col_id, name}]; - // If mixed type as string is enabled, make both of them strings and merge them. - // All child columns will be ignored when parsing. - if (is_enabled_mixed_types_as_string) { - bool const is_mixed_type = [&]() { - // If new or old is STR and they are all not null, make it mixed type, else ignore. - if (column_categories[this_col_id] == NC_VAL || - column_categories[this_col_id] == NC_STR) { - if (is_str_column_all_nulls[this_col_id]) return false; - } - if (column_categories[old_col_id] == NC_VAL || column_categories[old_col_id] == NC_STR) { - if (is_str_column_all_nulls[old_col_id]) return false; - } - return true; - }(); - if (is_mixed_type) { - is_mixed_type_column[this_col_id] = 1; - is_mixed_type_column[old_col_id] = 1; - // if old col type (not cat) is list or struct, replace with string. - auto& col = columns.at(old_col_id).get(); - if (col.type == json_col_t::ListColumn or col.type == json_col_t::StructColumn) { - reinitialize_as_string(old_col_id, col); - remove_child_columns(old_col_id, col); - // all its children (which are already inserted) are ignored later. - } - col.forced_as_string_column = true; - columns.try_emplace(this_col_id, columns.at(old_col_id)); - continue; - } - } - - if (column_categories[this_col_id] == NC_VAL || column_categories[this_col_id] == NC_STR) { - ignore_vals[this_col_id] = true; - continue; - } - if (column_categories[old_col_id] == NC_VAL || column_categories[old_col_id] == NC_STR) { - // remap - ignore_vals[old_col_id] = true; - mapped_columns.erase({parent_col_id, name}); - columns.erase(old_col_id); - parent_col.child_columns.erase(name); - replaced = true; // to skip duplicate name in column_order - } else { - // If this is a nested column but we're trying to insert either (a) a list node into a - // struct column or (b) a struct node into a list column, we fail - CUDF_EXPECTS(not((column_categories[old_col_id] == NC_LIST and - column_categories[this_col_id] == NC_STRUCT) or - (column_categories[old_col_id] == NC_STRUCT and - column_categories[this_col_id] == NC_LIST)), - "A mix of lists and structs within the same column is not supported"); - } - } - - auto this_column_category = column_categories[this_col_id]; - // get path of this column, check if it is a struct/list forced as string, and enforce it - auto const nt = tree_path.get_path(this_col_id); - std::optional const user_dtype = get_path_data_type(nt, options); - if ((column_categories[this_col_id] == NC_STRUCT or - column_categories[this_col_id] == NC_LIST) and - user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) { - this_column_category = NC_STR; - } - - CUDF_EXPECTS(parent_col.child_columns.count(name) == 0, "duplicate column name: " + name); - // move into parent - device_json_column col(stream, mr); - initialize_json_columns(this_col_id, col, this_column_category); - if ((column_categories[this_col_id] == NC_STRUCT or - column_categories[this_col_id] == NC_LIST) and - user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) { - col.forced_as_string_column = true; - forced_as_string_column[this_col_id] = true; - } - - auto inserted = parent_col.child_columns.try_emplace(name, std::move(col)).second; - CUDF_EXPECTS(inserted, "child column insertion failed, duplicate column name in the parent"); - if (not replaced) parent_col.column_order.push_back(name); - columns.try_emplace(this_col_id, std::ref(parent_col.child_columns.at(name))); - mapped_columns.try_emplace(std::make_pair(parent_col_id, name), this_col_id); - } - - if (is_enabled_mixed_types_as_string) { - // ignore all children of mixed type columns - for (auto const this_col_id : unique_col_ids) { - auto parent_col_id = column_parent_ids[this_col_id]; - if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 1) { - is_mixed_type_column[this_col_id] = 1; - ignore_vals[this_col_id] = true; - columns.erase(this_col_id); - } - // Convert only mixed type columns as string (so to copy), but not its children - if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 0 and - is_mixed_type_column[this_col_id] == 1) - column_categories[this_col_id] = NC_STR; - } - cudf::detail::cuda_memcpy_async( - d_column_tree.node_categories, column_categories, stream); - } - - // ignore all children of columns forced as string - for (auto const this_col_id : unique_col_ids) { - auto parent_col_id = column_parent_ids[this_col_id]; - if (parent_col_id != parent_node_sentinel and forced_as_string_column[parent_col_id]) { - forced_as_string_column[this_col_id] = true; - ignore_vals[this_col_id] = true; - } - // Convert only mixed type columns as string (so to copy), but not its children - if (parent_col_id != parent_node_sentinel and not forced_as_string_column[parent_col_id] and - forced_as_string_column[this_col_id]) - column_categories[this_col_id] = NC_STR; - } - cudf::detail::cuda_memcpy_async(d_column_tree.node_categories, column_categories, stream); - - // restore unique_col_ids order - std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { - return thrust::get<1>(a) < thrust::get<1>(b); - }); - return {ignore_vals, columns}; -} - -void scatter_offsets(tree_meta_t const& tree, - device_span col_ids, - device_span row_offsets, - device_span node_ids, - device_span sorted_col_ids, // Reuse this for parent_col_ids - tree_meta_t const& d_column_tree, - host_span ignore_vals, - hashmap_of_device_columns const& columns, - rmm::cuda_stream_view stream) -{ - auto const num_nodes = col_ids.size(); - auto const num_columns = d_column_tree.node_categories.size(); - // move columns data to device. - auto columns_data = cudf::detail::make_host_vector(num_columns, stream); - for (auto& [col_id, col_ref] : columns) { - if (col_id == parent_node_sentinel) continue; - auto& col = col_ref.get(); - columns_data[col_id] = json_column_data{col.string_offsets.data(), - col.string_lengths.data(), - col.child_offsets.data(), - static_cast(col.validity.data())}; - } - - auto d_ignore_vals = cudf::detail::make_device_uvector_async( - ignore_vals, stream, cudf::get_current_device_resource_ref()); - auto d_columns_data = cudf::detail::make_device_uvector_async( - columns_data, stream, cudf::get_current_device_resource_ref()); - - // 3. scatter string offsets to respective columns, set validity bits - thrust::for_each_n( - rmm::exec_policy_nosync(stream), - thrust::counting_iterator(0), - num_nodes, - [column_categories = d_column_tree.node_categories.begin(), - col_ids = col_ids.begin(), - row_offsets = row_offsets.begin(), - range_begin = tree.node_range_begin.begin(), - range_end = tree.node_range_end.begin(), - d_ignore_vals = d_ignore_vals.begin(), - d_columns_data = d_columns_data.begin()] __device__(size_type i) { - if (d_ignore_vals[col_ids[i]]) return; - auto const node_category = column_categories[col_ids[i]]; - switch (node_category) { - case NC_STRUCT: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; - case NC_LIST: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; - case NC_STR: [[fallthrough]]; - case NC_VAL: - if (d_ignore_vals[col_ids[i]]) break; - set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); - d_columns_data[col_ids[i]].string_offsets[row_offsets[i]] = range_begin[i]; - d_columns_data[col_ids[i]].string_lengths[row_offsets[i]] = range_end[i] - range_begin[i]; - break; - default: break; - } - }); - - // 4. scatter List offset - // copy_if only node's whose parent is list, (node_id, parent_col_id) - // stable_sort by parent_col_id of {node_id}. - // For all unique parent_node_id of (i==0, i-1!=i), write start offset. - // (i==last, i+1!=i), write end offset. - // unique_copy_by_key {parent_node_id} {row_offset} to - // col[parent_col_id].child_offsets[row_offset[parent_node_id]] - - auto& parent_col_ids = sorted_col_ids; // reuse sorted_col_ids - auto parent_col_id = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - cuda::proclaim_return_type( - [col_ids = col_ids.begin(), - parent_node_ids = tree.parent_node_ids.begin()] __device__(size_type node_id) { - return parent_node_ids[node_id] == parent_node_sentinel ? parent_node_sentinel - : col_ids[parent_node_ids[node_id]]; - })); - auto const list_children_end = thrust::copy_if( - rmm::exec_policy_nosync(stream), - thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id), - thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id) + - num_nodes, - thrust::make_counting_iterator(0), - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()), - [d_ignore_vals = d_ignore_vals.begin(), - parent_node_ids = tree.parent_node_ids.begin(), - column_categories = d_column_tree.node_categories.begin(), - col_ids = col_ids.begin()] __device__(size_type node_id) { - auto parent_node_id = parent_node_ids[node_id]; - return parent_node_id != parent_node_sentinel and - column_categories[col_ids[parent_node_id]] == NC_LIST and - (!d_ignore_vals[col_ids[parent_node_id]]); - }); - - auto const num_list_children = - list_children_end - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()); - thrust::stable_sort_by_key(rmm::exec_policy_nosync(stream), - parent_col_ids.begin(), - parent_col_ids.begin() + num_list_children, - node_ids.begin()); - thrust::for_each_n( - rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(0), - num_list_children, - [node_ids = node_ids.begin(), - parent_node_ids = tree.parent_node_ids.begin(), - parent_col_ids = parent_col_ids.begin(), - row_offsets = row_offsets.begin(), - d_columns_data = d_columns_data.begin(), - num_list_children] __device__(size_type i) { - auto const node_id = node_ids[i]; - auto const parent_node_id = parent_node_ids[node_id]; - // scatter to list_offset - if (i == 0 or parent_node_ids[node_ids[i - 1]] != parent_node_id) { - d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id]] = - row_offsets[node_id]; - } - // last value of list child_offset is its size. - if (i == num_list_children - 1 or parent_node_ids[node_ids[i + 1]] != parent_node_id) { - d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id] + 1] = - row_offsets[node_id] + 1; + thrust::counting_iterator(0), + num_nodes, + [options = parse_opt.view(), + data = input.data(), + column_categories = d_column_tree.node_categories.begin(), + col_ids = col_ids.begin(), + range_begin = tree.node_range_begin.begin(), + range_end = tree.node_range_end.begin(), + is_all_nulls = is_all_nulls.begin()] __device__(size_type i) { + auto const node_category = column_categories[col_ids[i]]; + if (node_category == NC_STR or node_category == NC_VAL) { + auto const is_null_literal = serialized_trie_contains( + options.trie_na, + {data + range_begin[i], static_cast(range_end[i] - range_begin[i])}); + if (!is_null_literal) is_all_nulls[col_ids[i]] = false; } }); + return is_all_nulls; +} - // 5. scan on offsets. - for (auto& [id, col_ref] : columns) { - auto& col = col_ref.get(); - if (col.type == json_col_t::StringColumn) { - thrust::inclusive_scan(rmm::exec_policy_nosync(stream), - col.string_offsets.begin(), - col.string_offsets.end(), - col.string_offsets.begin(), - thrust::maximum{}); - } else if (col.type == json_col_t::ListColumn) { - thrust::inclusive_scan(rmm::exec_policy_nosync(stream), - col.child_offsets.begin(), - col.child_offsets.end(), - col.child_offsets.begin(), - thrust::maximum{}); - } +NodeIndexT get_row_array_parent_col_id(device_span col_ids, + bool is_enabled_lines, + rmm::cuda_stream_view stream) +{ + NodeIndexT value = parent_node_sentinel; + if (!col_ids.empty()) { + auto const list_node_index = is_enabled_lines ? 0 : 1; + CUDF_CUDA_TRY(cudaMemcpyAsync(&value, + col_ids.data() + list_node_index, + sizeof(NodeIndexT), + cudaMemcpyDefault, + stream.value())); + stream.synchronize(); } - stream.synchronize(); + return value; } +/** + * @brief Holds member data pointers of `d_json_column` + * + */ +struct json_column_data { + using row_offset_t = json_column::row_offset_t; + row_offset_t* string_offsets; + row_offset_t* string_lengths; + row_offset_t* child_offsets; + bitmask_type* validity; +}; + +using hashmap_of_device_columns = + std::unordered_map>; + +std::pair, hashmap_of_device_columns> build_tree( + device_json_column& root, + host_span is_str_column_all_nulls, + tree_meta_t& d_column_tree, + device_span d_unique_col_ids, + device_span d_max_row_offsets, + std::vector const& column_names, + NodeIndexT row_array_parent_col_id, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); -namespace experimental { +void scatter_offsets(tree_meta_t const& tree, + device_span col_ids, + device_span row_offsets, + device_span node_ids, + device_span sorted_col_ids, // Reuse this for parent_col_ids + tree_meta_t const& d_column_tree, + host_span ignore_vals, + hashmap_of_device_columns const& columns, + rmm::cuda_stream_view stream); std::map unified_schema(cudf::io::json_reader_options const& options) { @@ -829,19 +276,6 @@ std::map unified_schema(cudf::io::json_reader_optio options.get_dtypes()); } -std::pair, hashmap_of_device_columns> build_tree( - device_json_column& root, - host_span is_str_column_all_nulls, - tree_meta_t& d_column_tree, - device_span d_unique_col_ids, - device_span d_max_row_offsets, - std::vector const& column_names, - NodeIndexT row_array_parent_col_id, - bool is_array_of_arrays, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); - /** * @brief Constructs `d_json_column` from node tree representation * Newly constructed columns are inserted into `root`'s children. @@ -1033,7 +467,7 @@ std::pair, hashmap_of_device_columns> build_tree std::fill_n(is_pruned.begin(), num_columns, options.is_enabled_prune_columns()); // prune all children of a column, but not self. - auto ignore_all_children = [&](auto parent_col_id) { + auto ignore_all_children = [&adj, &is_pruned](auto parent_col_id) { std::deque offspring; if (adj.count(parent_col_id)) { for (auto const& child : adj[parent_col_id]) { @@ -1392,6 +826,145 @@ std::pair, hashmap_of_device_columns> build_tree return {is_pruned, columns}; } -} // namespace experimental + +void scatter_offsets(tree_meta_t const& tree, + device_span col_ids, + device_span row_offsets, + device_span node_ids, + device_span sorted_col_ids, // Reuse this for parent_col_ids + tree_meta_t const& d_column_tree, + host_span ignore_vals, + hashmap_of_device_columns const& columns, + rmm::cuda_stream_view stream) +{ + auto const num_nodes = col_ids.size(); + auto const num_columns = d_column_tree.node_categories.size(); + // move columns data to device. + auto columns_data = cudf::detail::make_host_vector(num_columns, stream); + for (auto& [col_id, col_ref] : columns) { + if (col_id == parent_node_sentinel) continue; + auto& col = col_ref.get(); + columns_data[col_id] = json_column_data{col.string_offsets.data(), + col.string_lengths.data(), + col.child_offsets.data(), + static_cast(col.validity.data())}; + } + + auto d_ignore_vals = cudf::detail::make_device_uvector_async( + ignore_vals, stream, cudf::get_current_device_resource_ref()); + auto d_columns_data = cudf::detail::make_device_uvector_async( + columns_data, stream, cudf::get_current_device_resource_ref()); + + // 3. scatter string offsets to respective columns, set validity bits + thrust::for_each_n( + rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + num_nodes, + [column_categories = d_column_tree.node_categories.begin(), + col_ids = col_ids.begin(), + row_offsets = row_offsets.begin(), + range_begin = tree.node_range_begin.begin(), + range_end = tree.node_range_end.begin(), + d_ignore_vals = d_ignore_vals.begin(), + d_columns_data = d_columns_data.begin()] __device__(size_type i) { + if (d_ignore_vals[col_ids[i]]) return; + auto const node_category = column_categories[col_ids[i]]; + switch (node_category) { + case NC_STRUCT: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; + case NC_LIST: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; + case NC_STR: [[fallthrough]]; + case NC_VAL: + if (d_ignore_vals[col_ids[i]]) break; + set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); + d_columns_data[col_ids[i]].string_offsets[row_offsets[i]] = range_begin[i]; + d_columns_data[col_ids[i]].string_lengths[row_offsets[i]] = range_end[i] - range_begin[i]; + break; + default: break; + } + }); + + // 4. scatter List offset + // copy_if only node's whose parent is list, (node_id, parent_col_id) + // stable_sort by parent_col_id of {node_id}. + // For all unique parent_node_id of (i==0, i-1!=i), write start offset. + // (i==last, i+1!=i), write end offset. + // unique_copy_by_key {parent_node_id} {row_offset} to + // col[parent_col_id].child_offsets[row_offset[parent_node_id]] + + auto& parent_col_ids = sorted_col_ids; // reuse sorted_col_ids + auto parent_col_id = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type( + [col_ids = col_ids.begin(), + parent_node_ids = tree.parent_node_ids.begin()] __device__(size_type node_id) { + return parent_node_ids[node_id] == parent_node_sentinel ? parent_node_sentinel + : col_ids[parent_node_ids[node_id]]; + })); + auto const list_children_end = thrust::copy_if( + rmm::exec_policy_nosync(stream), + thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id), + thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id) + + num_nodes, + thrust::make_counting_iterator(0), + thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()), + [d_ignore_vals = d_ignore_vals.begin(), + parent_node_ids = tree.parent_node_ids.begin(), + column_categories = d_column_tree.node_categories.begin(), + col_ids = col_ids.begin()] __device__(size_type node_id) { + auto parent_node_id = parent_node_ids[node_id]; + return parent_node_id != parent_node_sentinel and + column_categories[col_ids[parent_node_id]] == NC_LIST and + (!d_ignore_vals[col_ids[parent_node_id]]); + }); + + auto const num_list_children = + list_children_end - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()); + thrust::stable_sort_by_key(rmm::exec_policy_nosync(stream), + parent_col_ids.begin(), + parent_col_ids.begin() + num_list_children, + node_ids.begin()); + thrust::for_each_n( + rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + num_list_children, + [node_ids = node_ids.begin(), + parent_node_ids = tree.parent_node_ids.begin(), + parent_col_ids = parent_col_ids.begin(), + row_offsets = row_offsets.begin(), + d_columns_data = d_columns_data.begin(), + num_list_children] __device__(size_type i) { + auto const node_id = node_ids[i]; + auto const parent_node_id = parent_node_ids[node_id]; + // scatter to list_offset + if (i == 0 or parent_node_ids[node_ids[i - 1]] != parent_node_id) { + d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id]] = + row_offsets[node_id]; + } + // last value of list child_offset is its size. + if (i == num_list_children - 1 or parent_node_ids[node_ids[i + 1]] != parent_node_id) { + d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id] + 1] = + row_offsets[node_id] + 1; + } + }); + + // 5. scan on offsets. + for (auto& [id, col_ref] : columns) { + auto& col = col_ref.get(); + if (col.type == json_col_t::StringColumn) { + thrust::inclusive_scan(rmm::exec_policy_nosync(stream), + col.string_offsets.begin(), + col.string_offsets.end(), + col.string_offsets.begin(), + thrust::maximum{}); + } else if (col.type == json_col_t::ListColumn) { + thrust::inclusive_scan(rmm::exec_policy_nosync(stream), + col.child_offsets.begin(), + col.child_offsets.end(), + col.child_offsets.begin(), + thrust::maximum{}); + } + } + stream.synchronize(); +} } // namespace cudf::io::json::detail diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 912e93d52ae..4584f71775f 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -485,16 +485,6 @@ std::pair, std::vector> device_json_co } } -template -auto make_device_json_column_dispatch(bool experimental, Args&&... args) -{ - if (experimental) { - return experimental::make_device_json_column(std::forward(args)...); - } else { - return make_device_json_column(std::forward(args)...); - } -} - table_with_metadata device_parse_nested_json(device_span d_input, cudf::io::json_reader_options const& options, rmm::cuda_stream_view stream, @@ -553,16 +543,15 @@ table_with_metadata device_parse_nested_json(device_span d_input, 0); // Get internal JSON column - make_device_json_column_dispatch(options.is_enabled_experimental(), - d_input, - gpu_tree, - gpu_col_id, - gpu_row_offsets, - root_column, - is_array_of_arrays, - options, - stream, - mr); + make_device_json_column(d_input, + gpu_tree, + gpu_col_id, + gpu_row_offsets, + root_column, + is_array_of_arrays, + options, + stream, + mr); // data_root refers to the root column of the data represented by the given JSON string auto& data_root = diff --git a/cpp/src/io/json/json_normalization.cu b/cpp/src/io/json/json_normalization.cu index 2d435dc8e1a..34a87918e57 100644 --- a/cpp/src/io/json/json_normalization.cu +++ b/cpp/src/io/json/json_normalization.cu @@ -16,6 +16,7 @@ #include "io/fst/lookup_tables.cuh" +#include #include #include #include @@ -24,7 +25,6 @@ #include #include -#include #include #include @@ -316,7 +316,7 @@ void normalize_single_quotes(datasource::owning_buffer& inda stream); rmm::device_buffer outbuf(indata.size() * 2, stream, mr); - rmm::device_scalar outbuf_size(stream, mr); + cudf::detail::device_scalar outbuf_size(stream, mr); parser.Transduce(reinterpret_cast(indata.data()), static_cast(indata.size()), static_cast(outbuf.data()), @@ -401,7 +401,7 @@ std:: stream); rmm::device_uvector outbuf_indices(inbuf.size(), stream, mr); - rmm::device_scalar outbuf_indices_size(stream, mr); + cudf::detail::device_scalar outbuf_indices_size(stream, mr); parser.Transduce(inbuf.data(), static_cast(inbuf.size()), thrust::make_discard_iterator(), diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 3d9a51833e0..f6be4539d7f 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -405,21 +405,6 @@ void make_device_json_column(device_span input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); -namespace experimental { -/** - * @copydoc cudf::io::json::detail::make_device_json_column - */ -void make_device_json_column(device_span input, - tree_meta_t const& tree, - device_span col_ids, - device_span row_offsets, - device_json_column& root, - bool is_array_of_arrays, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); -} // namespace experimental - /** * @brief Retrieves the parse_options to be used for type inference and type casting * diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 69a51fab5dc..534b30a6089 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -21,6 +21,7 @@ #include "nested_json.hpp" #include +#include #include #include #include @@ -34,7 +35,6 @@ #include #include -#include #include #include @@ -1446,7 +1446,7 @@ void get_stack_context(device_span json_in, constexpr StackSymbolT read_symbol = 'x'; // Number of stack operations in the input (i.e., number of '{', '}', '[', ']' outside of quotes) - rmm::device_scalar d_num_stack_ops(stream); + cudf::detail::device_scalar d_num_stack_ops(stream); // Sequence of stack symbols and their position in the original input (sparse representation) rmm::device_uvector stack_ops{json_in.size(), stream}; @@ -1519,7 +1519,7 @@ std::pair, rmm::device_uvector> pr stream); auto const mr = cudf::get_current_device_resource_ref(); - rmm::device_scalar d_num_selected_tokens(stream, mr); + cudf::detail::device_scalar d_num_selected_tokens(stream, mr); rmm::device_uvector filtered_tokens_out{tokens.size(), stream, mr}; rmm::device_uvector filtered_token_indices_out{tokens.size(), stream, mr}; @@ -1638,7 +1638,7 @@ std::pair, rmm::device_uvector> ge std::size_t constexpr max_tokens_per_struct = 6; auto const max_token_out_count = cudf::util::div_rounding_up_safe(json_in.size(), min_chars_per_struct) * max_tokens_per_struct; - rmm::device_scalar num_written_tokens{stream}; + cudf::detail::device_scalar num_written_tokens{stream}; // In case we're recovering on invalid JSON lines, post-processing the token stream requires to // see a JSON-line delimiter as the very first item SymbolOffsetT const delimiter_offset = diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index a1e4aa65dcf..c42348a165f 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -22,6 +22,7 @@ #include "io/utilities/hostdevice_span.hpp" #include +#include #include #include #include @@ -32,7 +33,6 @@ #include #include -#include #include #include @@ -451,7 +451,7 @@ void decode_stream_data(int64_t num_dicts, update_null_mask(chunks, out_buffers, stream, mr); } - rmm::device_scalar error_count(0, stream); + cudf::detail::device_scalar error_count(0, stream); gpu::DecodeOrcColumnData(chunks.base_device_ptr(), global_dict.data(), row_groups, diff --git a/cpp/src/io/parquet/error.hpp b/cpp/src/io/parquet/error.hpp index f0fc9fab3ab..8b3d1d7a6c3 100644 --- a/cpp/src/io/parquet/error.hpp +++ b/cpp/src/io/parquet/error.hpp @@ -26,7 +26,7 @@ namespace cudf::io::parquet { /** - * @brief Wrapper around a `rmm::device_scalar` for use in reporting errors that occur in + * @brief Specialized device scalar for use in reporting errors that occur in * kernel calls. * * The `kernel_error` object is created with a `rmm::cuda_stream_view` which is used throughout diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 4f6d41a97da..be502b581af 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -22,14 +22,13 @@ #include "io/parquet/parquet_common.hpp" #include "io/statistics/statistics.cuh" #include "io/utilities/column_buffer.hpp" -#include "io/utilities/hostdevice_vector.hpp" +#include #include #include #include #include -#include #include #include diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index f70171eef68..0c49b2e5d78 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -800,7 +801,7 @@ template static std::unique_ptr parse_string(string_view_pair_it str_tuples, size_type col_size, rmm::device_buffer&& null_mask, - rmm::device_scalar& d_null_count, + cudf::detail::device_scalar& d_null_count, cudf::io::parse_options_view const& options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) @@ -930,7 +931,7 @@ std::unique_ptr parse_data( CUDF_FUNC_RANGE(); if (col_size == 0) { return make_empty_column(col_type); } - auto d_null_count = rmm::device_scalar(null_count, stream); + auto d_null_count = cudf::detail::device_scalar(null_count, stream); auto null_count_data = d_null_count.data(); if (null_mask.is_empty()) { null_mask = cudf::create_null_mask(col_size, mask_state::ALL_VALID, stream, mr); diff --git a/cpp/src/io/utilities/type_inference.cu b/cpp/src/io/utilities/type_inference.cu index 43dc38c4ac6..af32b207d20 100644 --- a/cpp/src/io/utilities/type_inference.cu +++ b/cpp/src/io/utilities/type_inference.cu @@ -18,11 +18,10 @@ #include "io/utilities/string_parsing.hpp" #include "io/utilities/trie.cuh" +#include #include #include -#include - #include #include @@ -242,7 +241,7 @@ cudf::io::column_type_histogram infer_column_type(OptionsView const& options, constexpr int block_size = 128; auto const grid_size = (size + block_size - 1) / block_size; - auto d_column_info = rmm::device_scalar(stream); + auto d_column_info = cudf::detail::device_scalar(stream); CUDF_CUDA_TRY(cudaMemsetAsync( d_column_info.data(), 0, sizeof(cudf::io::column_type_histogram), stream.value())); diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu index 2ec23e0dc6d..40d1c925889 100644 --- a/cpp/src/join/conditional_join.cu +++ b/cpp/src/join/conditional_join.cu @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -81,7 +82,7 @@ std::unique_ptr> conditional_join_anti_semi( join_size = *output_size; } else { // Allocate storage for the counter used to get the size of the join output - rmm::device_scalar size(0, stream, mr); + cudf::detail::device_scalar size(0, stream, mr); if (has_nulls) { compute_conditional_join_output_size <<>>( @@ -94,7 +95,7 @@ std::unique_ptr> conditional_join_anti_semi( join_size = size.value(stream); } - rmm::device_scalar write_index(0, stream); + cudf::detail::device_scalar write_index(0, stream); auto left_indices = std::make_unique>(join_size, stream, mr); @@ -197,7 +198,7 @@ conditional_join(table_view const& left, join_size = *output_size; } else { // Allocate storage for the counter used to get the size of the join output - rmm::device_scalar size(0, stream, mr); + cudf::detail::device_scalar size(0, stream, mr); if (has_nulls) { compute_conditional_join_output_size <<>>( @@ -231,7 +232,7 @@ conditional_join(table_view const& left, std::make_unique>(0, stream, mr)); } - rmm::device_scalar write_index(0, stream); + cudf::detail::device_scalar write_index(0, stream); auto left_indices = std::make_unique>(join_size, stream, mr); auto right_indices = std::make_unique>(join_size, stream, mr); @@ -342,7 +343,7 @@ std::size_t compute_conditional_join_output_size(table_view const& left, auto const shmem_size_per_block = parser.shmem_per_thread * config.num_threads_per_block; // Allocate storage for the counter used to get the size of the join output - rmm::device_scalar size(0, stream, mr); + cudf::detail::device_scalar size(0, stream, mr); // Determine number of output rows without actually building the output to simply // find what the size of the output will be. diff --git a/cpp/src/join/distinct_hash_join.cu b/cpp/src/join/distinct_hash_join.cu index c7294152982..515d28201e8 100644 --- a/cpp/src/join/distinct_hash_join.cu +++ b/cpp/src/join/distinct_hash_join.cu @@ -27,7 +27,6 @@ #include #include -#include #include #include diff --git a/cpp/src/join/mixed_join_size_kernel.cuh b/cpp/src/join/mixed_join_size_kernel.cuh index 84e9be45030..4049ccf35e1 100644 --- a/cpp/src/join/mixed_join_size_kernel.cuh +++ b/cpp/src/join/mixed_join_size_kernel.cuh @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -122,7 +123,7 @@ std::size_t launch_compute_mixed_join_output_size( rmm::device_async_resource_ref mr) { // Allocate storage for the counter used to get the size of the join output - rmm::device_scalar size(0, stream, mr); + cudf::detail::device_scalar size(0, stream, mr); compute_mixed_join_output_size <<>>( diff --git a/cpp/src/json/json_path.cu b/cpp/src/json/json_path.cu index 59fdbedf089..fb5cf66dd60 100644 --- a/cpp/src/json/json_path.cu +++ b/cpp/src/json/json_path.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -1031,7 +1032,7 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c cudf::detail::create_null_mask(col.size(), mask_state::UNINITIALIZED, stream, mr); // compute results - rmm::device_scalar d_valid_count{0, stream}; + cudf::detail::device_scalar d_valid_count{0, stream}; get_json_object_kernel <<>>( diff --git a/cpp/src/reductions/all.cu b/cpp/src/reductions/all.cu index 67ea29a2cb1..890625830a5 100644 --- a/cpp/src/reductions/all.cu +++ b/cpp/src/reductions/all.cu @@ -16,6 +16,7 @@ #include "simple.cuh" +#include #include #include #include @@ -65,7 +66,8 @@ struct all_fn { cudf::dictionary::detail::make_dictionary_pair_iterator(*d_dict, input.has_nulls()); return thrust::make_transform_iterator(pair_iter, null_iter); }(); - auto d_result = rmm::device_scalar(1, stream, cudf::get_current_device_resource_ref()); + auto d_result = + cudf::detail::device_scalar(1, stream, cudf::get_current_device_resource_ref()); thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), input.size(), diff --git a/cpp/src/reductions/any.cu b/cpp/src/reductions/any.cu index 057f038c622..d70da369d72 100644 --- a/cpp/src/reductions/any.cu +++ b/cpp/src/reductions/any.cu @@ -16,6 +16,7 @@ #include "simple.cuh" +#include #include #include #include @@ -65,7 +66,8 @@ struct any_fn { cudf::dictionary::detail::make_dictionary_pair_iterator(*d_dict, input.has_nulls()); return thrust::make_transform_iterator(pair_iter, null_iter); }(); - auto d_result = rmm::device_scalar(0, stream, cudf::get_current_device_resource_ref()); + auto d_result = + cudf::detail::device_scalar(0, stream, cudf::get_current_device_resource_ref()); thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), input.size(), diff --git a/cpp/src/reductions/minmax.cu b/cpp/src/reductions/minmax.cu index 139de068050..4f6eb23ce5b 100644 --- a/cpp/src/reductions/minmax.cu +++ b/cpp/src/reductions/minmax.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -69,18 +70,18 @@ struct minmax_pair { * @param num_items number of items to reduce * @param binary_op binary operator used to reduce * @param stream CUDA stream to run kernels on. - * @return rmm::device_scalar + * @return cudf::detail::device_scalar */ template ::type> -rmm::device_scalar reduce_device(InputIterator d_in, - size_type num_items, - Op binary_op, - rmm::cuda_stream_view stream) +auto reduce_device(InputIterator d_in, + size_type num_items, + Op binary_op, + rmm::cuda_stream_view stream) { OutputType identity{}; - rmm::device_scalar result{identity, stream}; + cudf::detail::device_scalar result{identity, stream}; // Allocate temporary storage size_t storage_bytes = 0; diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu index 1df1549432f..d0e3358cc34 100644 --- a/cpp/src/replace/nulls.cu +++ b/cpp/src/replace/nulls.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -137,7 +138,7 @@ struct replace_nulls_column_kernel_forwarder { auto device_out = cudf::mutable_column_device_view::create(output_view, stream); auto device_replacement = cudf::column_device_view::create(replacement, stream); - rmm::device_scalar valid_counter(0, stream); + cudf::detail::device_scalar valid_counter(0, stream); cudf::size_type* valid_count = valid_counter.data(); replace<<>>( diff --git a/cpp/src/replace/replace.cu b/cpp/src/replace/replace.cu index 86ec8cfc91e..0cc97ca05e0 100644 --- a/cpp/src/replace/replace.cu +++ b/cpp/src/replace/replace.cu @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -53,7 +54,6 @@ #include #include -#include #include #include @@ -182,7 +182,7 @@ struct replace_kernel_forwarder { rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - rmm::device_scalar valid_counter(0, stream); + cudf::detail::device_scalar valid_counter(0, stream); cudf::size_type* valid_count = valid_counter.data(); auto replace = [&] { diff --git a/cpp/src/rolling/detail/rolling.cuh b/cpp/src/rolling/detail/rolling.cuh index 528700137bf..bc0ee2eb519 100644 --- a/cpp/src/rolling/detail/rolling.cuh +++ b/cpp/src/rolling/detail/rolling.cuh @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -49,7 +50,6 @@ #include #include -#include #include #include @@ -1105,7 +1105,7 @@ struct rolling_window_launcher { auto const d_inp_ptr = column_device_view::create(input, stream); auto const d_default_out_ptr = column_device_view::create(default_outputs, stream); auto const d_out_ptr = mutable_column_device_view::create(output->mutable_view(), stream); - auto d_valid_count = rmm::device_scalar{0, stream}; + auto d_valid_count = cudf::detail::device_scalar{0, stream}; auto constexpr block_size = 256; auto const grid = cudf::detail::grid_1d(input.size(), block_size); @@ -1271,7 +1271,7 @@ std::unique_ptr rolling_window_udf(column_view const& input, udf_agg._output_type, input.size(), cudf::mask_state::UNINITIALIZED, stream, mr); auto output_view = output->mutable_view(); - rmm::device_scalar device_valid_count{0, stream}; + cudf::detail::device_scalar device_valid_count{0, stream}; std::string kernel_name = jitify2::reflection::Template("cudf::rolling::jit::gpu_rolling_new") // diff --git a/cpp/src/strings/case.cu b/cpp/src/strings/case.cu index 4c015f3cbed..6a7c8ea45e9 100644 --- a/cpp/src/strings/case.cu +++ b/cpp/src/strings/case.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -348,7 +349,7 @@ std::unique_ptr convert_case(strings_column_view const& input, // This check incurs ~20% performance hit for smaller strings and so we only use it // after the threshold check above. The check makes very little impact for long strings // but results in a large performance gain when the input contains only single-byte characters. - rmm::device_scalar mb_count(0, stream); + cudf::detail::device_scalar mb_count(0, stream); // cudf::detail::grid_1d is limited to size_type elements auto const num_blocks = util::div_rounding_up_safe(chars_size / bytes_per_thread, block_size); // we only need to check every other byte since either will contain high bit diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu index 1d9d12686eb..9e4ef47ff79 100644 --- a/cpp/src/strings/copying/concatenate.cu +++ b/cpp/src/strings/copying/concatenate.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -27,7 +28,6 @@ #include #include -#include #include #include @@ -242,7 +242,7 @@ std::unique_ptr concatenate(host_span columns, } { // Copy offsets columns with single kernel launch - rmm::device_scalar d_valid_count(0, stream); + cudf::detail::device_scalar d_valid_count(0, stream); constexpr size_type block_size{256}; cudf::detail::grid_1d config(offsets_count, block_size); diff --git a/cpp/src/strings/replace/find_replace.cu b/cpp/src/strings/replace/find_replace.cu index 8a8001dd81a..957075017ba 100644 --- a/cpp/src/strings/replace/find_replace.cu +++ b/cpp/src/strings/replace/find_replace.cu @@ -14,6 +14,7 @@ * limitations under the License. */ #include +#include #include #include #include @@ -21,7 +22,6 @@ #include #include -#include #include #include diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu index 352d883bdc5..88f343926c9 100644 --- a/cpp/src/strings/replace/multi.cu +++ b/cpp/src/strings/replace/multi.cu @@ -334,7 +334,7 @@ std::unique_ptr replace_character_parallel(strings_column_view const& in // Count the number of targets in the entire column. // Note this may over-count in the case where a target spans adjacent strings. - rmm::device_scalar d_count(0, stream); + cudf::detail::device_scalar d_count(0, stream); auto const num_blocks = util::div_rounding_up_safe( util::div_rounding_up_safe(chars_bytes, static_cast(bytes_per_thread)), block_size); count_targets<<>>(fn, chars_bytes, d_count.data()); diff --git a/cpp/src/strings/replace/replace.cu b/cpp/src/strings/replace/replace.cu index 16df0dbabdf..52ddef76c1a 100644 --- a/cpp/src/strings/replace/replace.cu +++ b/cpp/src/strings/replace/replace.cu @@ -285,7 +285,7 @@ std::unique_ptr replace_character_parallel(strings_column_view const& in // Count the number of targets in the entire column. // Note this may over-count in the case where a target spans adjacent strings. - rmm::device_scalar d_target_count(0, stream); + cudf::detail::device_scalar d_target_count(0, stream); constexpr int64_t block_size = 512; constexpr size_type bytes_per_thread = 4; auto const num_blocks = util::div_rounding_up_safe( diff --git a/cpp/src/strings/split/split.cuh b/cpp/src/strings/split/split.cuh index 81aca001d53..4b777be9d5b 100644 --- a/cpp/src/strings/split/split.cuh +++ b/cpp/src/strings/split/split.cuh @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -361,7 +362,7 @@ std::pair, rmm::device_uvector> split cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); // count the number of delimiters in the entire column - rmm::device_scalar d_count(0, stream); + cudf::detail::device_scalar d_count(0, stream); if (chars_bytes > 0) { constexpr int64_t block_size = 512; constexpr size_type bytes_per_thread = 4; diff --git a/cpp/src/text/tokenize.cu b/cpp/src/text/tokenize.cu index df25950e6d5..89ca8a089d6 100644 --- a/cpp/src/text/tokenize.cu +++ b/cpp/src/text/tokenize.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -221,7 +222,7 @@ std::unique_ptr character_tokenize(cudf::strings_column_view const // To minimize memory, count the number of characters so we can // build the output offsets without an intermediate buffer. // In the worst case each byte is a character so the output is 4x the input. - rmm::device_scalar d_count(0, stream); + cudf::detail::device_scalar d_count(0, stream); auto const num_blocks = cudf::util::div_rounding_up_safe( cudf::util::div_rounding_up_safe(chars_bytes, static_cast(bytes_per_thread)), block_size);