From 22a96df3a85acbb76f1564dee86ce452c1ee1960 Mon Sep 17 00:00:00 2001
From: Dave Baranec
Date: Tue, 15 Mar 2022 15:22:43 -0500
Subject: [PATCH 01/10] Grid stride loop index overflow fix.

---
 cpp/include/cudf/detail/valid_if.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/cudf/detail/valid_if.cuh b/cpp/include/cudf/detail/valid_if.cuh
index ee9e4b2c687..28e5ad9a8ad 100644
--- a/cpp/include/cudf/detail/valid_if.cuh
+++ b/cpp/include/cudf/detail/valid_if.cuh
@@ -48,7 +48,7 @@ __global__ void valid_if_kernel(
 {
   constexpr size_type leader_lane{0};
   auto const lane_id{threadIdx.x % warp_size};
-  size_type i = threadIdx.x + blockIdx.x * blockDim.x;
+  std::size_t i = threadIdx.x + blockIdx.x * blockDim.x;
   size_type warp_valid_count{0};
 
   auto active_mask = __ballot_sync(0xFFFF'FFFF, i < size);

From 797c723a9dcc5834eb2ea28fa86d29fa480d8675 Mon Sep 17 00:00:00 2001
From: Dave Baranec
Date: Wed, 16 Mar 2022 11:25:39 -0500
Subject: [PATCH 02/10] Fix 2 size_type overflows in scalar cudf::scatter()

---
 cpp/src/copying/scatter.cu | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu
index 9a364451b3b..4eba43dca19 100644
--- a/cpp/src/copying/scatter.cu
+++ b/cpp/src/copying/scatter.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -47,7 +47,7 @@ __global__ void marking_bitmask_kernel(mutable_column_device_view destination,
                                        MapIterator scatter_map,
                                        size_type num_scatter_rows)
 {
-  size_type row = threadIdx.x + blockIdx.x * blockDim.x;
+  std::size_t row = threadIdx.x + blockIdx.x * blockDim.x;
 
   while (row < num_scatter_rows) {
     size_type const output_row = scatter_map[row];
@@ -351,8 +351,10 @@ std::unique_ptr<table> scatter(std::vector<std::reference_wrapper<const scalar>
 
   // Transform negative indices to index + target size
   auto scatter_rows = indices.size();
+  // note: the intermediate ((in % n_rows) + n_rows) will overflow a size_type for any value of `in` > (2^31)/2,
+  // but the end result after the final (% n_rows) will fit. so we'll do the computation using a signed 64 bit value.
   auto scatter_iter = thrust::make_transform_iterator(
-    map_begin, [n_rows] __device__(size_type in) { return ((in % n_rows) + n_rows) % n_rows; });
+    map_begin, [n_rows = static_cast<int64_t>(n_rows)] __device__(size_type in) -> size_type { return ((static_cast<int64_t>(in) % n_rows) + n_rows) % n_rows; });
 
   // Dispatch over data type per column
   auto result = std::vector<std::unique_ptr<column>>(target.num_columns());
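Why the comment in PATCH 02 talks about (2^31)/2: `in % n_rows` can be as large as n_rows - 1, so the intermediate `(in % n_rows) + n_rows` approaches 2 * n_rows and exceeds INT32_MAX once n_rows passes 2^30. A standalone host-side sketch of the failure and of the 64-bit fix (the numbers are hypothetical; this is an illustration, not code from the series):

  #include <cstdint>
  #include <cstdio>

  int main()
  {
    int32_t const n_rows = (1 << 30) + 1;  // past the 2^30 threshold named in the comment
    int32_t const in     = 1 << 30;        // a legal positive index

    // 32-bit: (in % n_rows) + n_rows == 2^31 + 1, past INT32_MAX -> signed
    // overflow, undefined behavior, even though the final % n_rows would fit.
    // int32_t bad = ((in % n_rows) + n_rows) % n_rows;

    // 64-bit, as the patch does: every intermediate fits comfortably.
    int64_t const n = static_cast<int64_t>(n_rows);
    auto const wrapped = static_cast<int32_t>(((static_cast<int64_t>(in) % n) + n) % n);

    printf("wrapped = %d\n", wrapped);  // positive indices map to themselves: 2^30
    return 0;
  }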
From 34cf254dfa437684c9c1801ebf77afb62d5d2029 Mon Sep 17 00:00:00 2001
From: Dave Baranec
Date: Wed, 16 Mar 2022 16:38:10 -0500
Subject: [PATCH 03/10] Fix grid stride overflow in gpu_rolling.

---
 cpp/src/rolling/rolling_detail.cuh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cpp/src/rolling/rolling_detail.cuh b/cpp/src/rolling/rolling_detail.cuh
index a121e247258..36952be31a6 100644
--- a/cpp/src/rolling/rolling_detail.cuh
+++ b/cpp/src/rolling/rolling_detail.cuh
@@ -1008,7 +1008,7 @@ __launch_bounds__(block_size) __global__
                       PrecedingWindowIterator preceding_window_begin,
                       FollowingWindowIterator following_window_begin)
 {
-  size_type i      = blockIdx.x * block_size + threadIdx.x;
+  std::size_t i = blockIdx.x * block_size + threadIdx.x;
   size_type stride = block_size * gridDim.x;
 
   size_type warp_valid_count{0};
@@ -1021,9 +1021,9 @@ __launch_bounds__(block_size) __global__
 
     // compute bounds
     auto start = static_cast<size_type>(
-      min(static_cast<int64_t>(input.size()), max(0L, i - preceding_window + 1)));
+      min(static_cast<int64_t>(input.size()), max(0L, static_cast<int64_t>(i) - preceding_window + 1)));
     auto end = static_cast<size_type>(
-      min(static_cast<int64_t>(input.size()), max(0L, i + following_window + 1)));
+      min(static_cast<int64_t>(input.size()), max(0L, static_cast<int64_t>(i) + following_window + 1)));
     size_type start_index = min(start, end);
     size_type end_index   = max(start, end);

From 67aa4e3295524ed791ad673d2012bf01eb557fcb Mon Sep 17 00:00:00 2001
From: Dave Baranec
Date: Wed, 16 Mar 2022 17:40:21 -0500
Subject: [PATCH 04/10] Fix grid stride overflow issues in replace_nulls() and
 replace_nulls_strings() kernels.

---
 cpp/src/replace/nulls.cu | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu
index d41bdb6ca5a..4ee259da608 100644
--- a/cpp/src/replace/nulls.cu
+++ b/cpp/src/replace/nulls.cu
@@ -63,7 +63,7 @@ __global__ void replace_nulls_strings(cudf::column_device_view input,
                                       cudf::size_type* valid_counter)
 {
   cudf::size_type nrows = input.size();
-  cudf::size_type i     = blockIdx.x * blockDim.x + threadIdx.x;
+  std::size_t i = blockIdx.x * blockDim.x + threadIdx.x;
 
   uint32_t active_mask = 0xffffffff;
   active_mask          = __ballot_sync(active_mask, i < nrows);
@@ -115,7 +115,7 @@ __global__ void replace_nulls(cudf::column_device_view input,
                              cudf::size_type* output_valid_count)
 {
   cudf::size_type nrows = input.size();
-  cudf::size_type i     = blockIdx.x * blockDim.x + threadIdx.x;
+  std::size_t i = blockIdx.x * blockDim.x + threadIdx.x;
 
   uint32_t active_mask = 0xffffffff;
   active_mask          = __ballot_sync(active_mask, i < nrows);
@@ -247,6 +247,7 @@ std::unique_ptr<cudf::column> replace_nulls_column_kernel_forwarder::operator()<
   std::unique_ptr<cudf::column> offsets = cudf::strings::detail::make_offsets_child_column(
     sizes_view.begin<cudf::size_type>(), sizes_view.end<cudf::size_type>(), stream, mr);
+
   auto offsets_view = offsets->mutable_view();
 
   auto const bytes =
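Patches 01, 03, and 04 are all instances of the same grid-stride-loop bug: the loop index is declared as a 32-bit `size_type`, so on inputs approaching 2^31 rows the index arithmetic overflows. A minimal sketch of the pattern and the fix (hypothetical kernel, not from the series):

  #include <cstddef>

  __global__ void copy_kernel(int const* in, int* out, std::size_t size)
  {
    // widen before multiplying: blockIdx.x * blockDim.x is otherwise computed
    // in 32-bit unsigned arithmetic and can wrap on very large grids
    std::size_t i            = threadIdx.x + blockIdx.x * static_cast<std::size_t>(blockDim.x);
    std::size_t const stride = static_cast<std::size_t>(blockDim.x) * gridDim.x;
    while (i < size) {
      out[i] = in[i];
      i += stride;
    }
  }

With a 32-bit index, a thread near the end of a ~2^31-row column steps past INT32_MAX on its final `i += stride`, which is signed overflow before the loop condition ever gets a chance to reject it.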
From e5d12b41cc0d16fe2dff205ed86ba5864602e9cf Mon Sep 17 00:00:00 2001
From: Dave Baranec
Date: Wed, 16 Mar 2022 18:22:07 -0500
Subject: [PATCH 05/10] Fix grid stride overflow issue in compute_column_kernel.

---
 cpp/src/transform/compute_column.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/transform/compute_column.cu b/cpp/src/transform/compute_column.cu
index bf109dbe1e5..70a61036392 100644
--- a/cpp/src/transform/compute_column.cu
+++ b/cpp/src/transform/compute_column.cu
@@ -72,7 +72,7 @@ __launch_bounds__(max_block_size) __global__
   auto evaluator =
     cudf::ast::detail::expression_evaluator<has_nulls>(table, device_expression_data);
 
-  for (cudf::size_type row_index = start_idx; row_index < table.num_rows(); row_index += stride) {
+  for (std::size_t row_index = start_idx; row_index < table.num_rows(); row_index += stride) {
     auto output_dest = ast::detail::mutable_column_expression_result<has_nulls>(output_column);
     evaluator.evaluate(output_dest, row_index, thread_intermediate_storage);
   }

From 11b4c611f4d18d152060a2336c907332cfee8649 Mon Sep 17 00:00:00 2001
From: Dave Baranec
Date: Wed, 16 Mar 2022 19:38:54 -0500
Subject: [PATCH 06/10] Fix grid stride overflow issue in gpu_rolling_new.

---
 cpp/src/rolling/jit/kernel.cu | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cpp/src/rolling/jit/kernel.cu b/cpp/src/rolling/jit/kernel.cu
index 52e397b9351..b5dc8359a4b 100644
--- a/cpp/src/rolling/jit/kernel.cu
+++ b/cpp/src/rolling/jit/kernel.cu
@@ -51,7 +51,7 @@ __global__ void gpu_rolling_new(cudf::size_type nrows,
                                 FollowingWindowType following_window_begin,
                                 cudf::size_type min_periods)
 {
-  cudf::size_type i      = blockIdx.x * blockDim.x + threadIdx.x;
+  std::size_t i          = blockIdx.x * blockDim.x + threadIdx.x;
   cudf::size_type stride = blockDim.x * gridDim.x;
 
   cudf::size_type warp_valid_count{0};
@@ -66,8 +66,8 @@ __global__ void gpu_rolling_new(cudf::size_type nrows,
     cudf::size_type following_window = get_window(following_window_begin, i);
 
     // compute bounds
-    cudf::size_type start       = min(nrows, max(0, i - preceding_window + 1));
-    cudf::size_type end         = min(nrows, max(0, i + following_window + 1));
+    cudf::size_type start       = min(nrows, max(0, static_cast<cudf::size_type>(i) - preceding_window + 1));
+    cudf::size_type end         = min(nrows, max(0, static_cast<cudf::size_type>(i) + following_window + 1));
     cudf::size_type start_index = min(start, end);
     cudf::size_type end_index   = max(start, end);
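A subtlety in patches 03 and 06: once `i` becomes unsigned (`std::size_t`), an expression like `i - preceding_window + 1` is computed in unsigned arithmetic and wraps instead of going negative, which would defeat the `max(0, ...)` clamp. That is what the `static_cast`s back to a signed type are for. A host-side sketch with hypothetical numbers:

  #include <algorithm>
  #include <cstdint>
  #include <cstdio>

  int main()
  {
    std::size_t const i     = 0;    // first row
    int64_t const preceding = 5;    // window reaches 4 rows before i
    int64_t const nrows     = 100;

    // BAD: with unsigned i the subtraction wraps to ~2^64, so the clamp never fires:
    // auto bad = std::max<std::size_t>(0, i - preceding + 1);

    // As in the patches: pull i back into signed 64-bit first, then clamp.
    int64_t const start =
      std::min(nrows, std::max(int64_t{0}, static_cast<int64_t>(i) - preceding + 1));
    printf("start = %lld\n", static_cast<long long>(start));  // 0, as intended
    return 0;
  }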
From 75f01bd8beaa88ef3df399b7e2e67e0b7d4976f8 Mon Sep 17 00:00:00 2001
From: Dave Baranec
Date: Wed, 16 Mar 2022 19:44:34 -0500
Subject: [PATCH 07/10] Code formatting.

---
 cpp/src/copying/scatter.cu         |  9 ++++++---
 cpp/src/replace/nulls.cu           |  4 ++--
 cpp/src/rolling/jit/kernel.cu      |  4 ++--
 cpp/src/rolling/rolling_detail.cuh | 12 +++++++-----
 4 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu
index 4eba43dca19..45e338f904e 100644
--- a/cpp/src/copying/scatter.cu
+++ b/cpp/src/copying/scatter.cu
@@ -351,10 +351,13 @@ std::unique_ptr<table> scatter(std::vector<std::reference_wrapper<const scalar>
 
   // Transform negative indices to index + target size
   auto scatter_rows = indices.size();
-  // note: the intermediate ((in % n_rows) + n_rows) will overflow a size_type for any value of `in` > (2^31)/2,
-  // but the end result after the final (% n_rows) will fit. so we'll do the computation using a signed 64 bit value.
+  // note: the intermediate ((in % n_rows) + n_rows) will overflow a size_type for any value of `in`
+  // > (2^31)/2, but the end result after the final (% n_rows) will fit. so we'll do the computation
+  // using a signed 64 bit value.
   auto scatter_iter = thrust::make_transform_iterator(
-    map_begin, [n_rows = static_cast<int64_t>(n_rows)] __device__(size_type in) -> size_type { return ((static_cast<int64_t>(in) % n_rows) + n_rows) % n_rows; });
+    map_begin, [n_rows = static_cast<int64_t>(n_rows)] __device__(size_type in) -> size_type {
+      return ((static_cast<int64_t>(in) % n_rows) + n_rows) % n_rows;
+    });
 
   // Dispatch over data type per column
   auto result = std::vector<std::unique_ptr<column>>(target.num_columns());
diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu
index 4ee259da608..b9c5a8a319c 100644
--- a/cpp/src/replace/nulls.cu
+++ b/cpp/src/replace/nulls.cu
@@ -63,7 +63,7 @@ __global__ void replace_nulls_strings(cudf::column_device_view input,
                                       cudf::size_type* valid_counter)
 {
   cudf::size_type nrows = input.size();
-  std::size_t i = blockIdx.x * blockDim.x + threadIdx.x;
+  std::size_t i         = blockIdx.x * blockDim.x + threadIdx.x;
 
   uint32_t active_mask = 0xffffffff;
   active_mask          = __ballot_sync(active_mask, i < nrows);
@@ -115,7 +115,7 @@ __global__ void replace_nulls(cudf::column_device_view input,
                              cudf::size_type* output_valid_count)
 {
   cudf::size_type nrows = input.size();
-  std::size_t i = blockIdx.x * blockDim.x + threadIdx.x;
+  std::size_t i         = blockIdx.x * blockDim.x + threadIdx.x;
 
   uint32_t active_mask = 0xffffffff;
   active_mask          = __ballot_sync(active_mask, i < nrows);
diff --git a/cpp/src/rolling/jit/kernel.cu b/cpp/src/rolling/jit/kernel.cu
index b5dc8359a4b..0f140ef1a79 100644
--- a/cpp/src/rolling/jit/kernel.cu
+++ b/cpp/src/rolling/jit/kernel.cu
@@ -66,8 +66,8 @@ __global__ void gpu_rolling_new(cudf::size_type nrows,
     cudf::size_type following_window = get_window(following_window_begin, i);
 
     // compute bounds
-    cudf::size_type start       = min(nrows, max(0, static_cast<cudf::size_type>(i) - preceding_window + 1));
-    cudf::size_type end         = min(nrows, max(0, static_cast<cudf::size_type>(i) + following_window + 1));
+    cudf::size_type start = min(nrows, max(0, static_cast<cudf::size_type>(i) - preceding_window + 1));
+    cudf::size_type end   = min(nrows, max(0, static_cast<cudf::size_type>(i) + following_window + 1));
     cudf::size_type start_index = min(start, end);
     cudf::size_type end_index   = max(start, end);
diff --git a/cpp/src/rolling/rolling_detail.cuh b/cpp/src/rolling/rolling_detail.cuh
index 36952be31a6..2c8890ab860 100644
--- a/cpp/src/rolling/rolling_detail.cuh
+++ b/cpp/src/rolling/rolling_detail.cuh
@@ -1008,7 +1008,7 @@ __launch_bounds__(block_size) __global__
                       PrecedingWindowIterator preceding_window_begin,
                       FollowingWindowIterator following_window_begin)
 {
-  std::size_t i = blockIdx.x * block_size + threadIdx.x;
+  std::size_t i    = blockIdx.x * block_size + threadIdx.x;
   size_type stride = block_size * gridDim.x;
 
   size_type warp_valid_count{0};
@@ -1020,10 +1020,12 @@ __launch_bounds__(block_size) __global__
     int64_t following_window = following_window_begin[i];
 
     // compute bounds
-    auto start = static_cast<size_type>(
-      min(static_cast<int64_t>(input.size()), max(0L, static_cast<int64_t>(i) - preceding_window + 1)));
-    auto end = static_cast<size_type>(
-      min(static_cast<int64_t>(input.size()), max(0L, static_cast<int64_t>(i) + following_window + 1)));
+    auto start =
+      static_cast<size_type>(min(static_cast<int64_t>(input.size()),
+                                 max(0L, static_cast<int64_t>(i) - preceding_window + 1)));
+    auto end =
+      static_cast<size_type>(min(static_cast<int64_t>(input.size()),
+                                 max(0L, static_cast<int64_t>(i) + following_window + 1)));
     size_type start_index = min(start, end);
     size_type end_index   = max(start, end);
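The formatting pass closes out the one-off fixes; the next commit then replaces the ad-hoc `std::size_t` declarations with a named alias. A condensed sketch of where the series is headed (the kernel below is hypothetical; only the alias itself comes from the patch):

  #include <cstdint>

  namespace cudf {
  using thread_index_type = int64_t;  // the alias PATCH 08 adds to cudf/types.hpp
  }

  __global__ void example_kernel(float const* in, float* out, cudf::thread_index_type size)
  {
    cudf::thread_index_type i =
      threadIdx.x + blockIdx.x * static_cast<cudf::thread_index_type>(blockDim.x);
    cudf::thread_index_type const stride =
      static_cast<cudf::thread_index_type>(blockDim.x) * gridDim.x;
    for (; i < size; i += stride) {
      out[i] = 2.0f * in[i];
    }
  }

A signed 64-bit type (rather than `std::size_t`) keeps window arithmetic such as `i - preceding_window + 1` well behaved without the re-casting shown earlier, and hoisting `stride` into a const local spells the loop the same way everywhere.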
From 3de4b512a156c1d70cdf55da11b97fea6113c5f0 Mon Sep 17 00:00:00 2001
From: Dave Baranec
Date: Mon, 21 Mar 2022 10:57:42 -0500
Subject: [PATCH 08/10] Add thread_index_type for grid stride loops.

---
 cpp/include/cudf/detail/valid_if.cuh |  5 +++--
 cpp/include/cudf/types.hpp           |  9 +++++----
 cpp/src/copying/scatter.cu           |  5 +++--
 cpp/src/replace/nulls.cu             | 14 ++++++++------
 cpp/src/rolling/jit/kernel.cu        |  6 +++---
 cpp/src/rolling/rolling_detail.cuh   |  4 ++--
 cpp/src/transform/compute_column.cu  |  9 +++++----
 7 files changed, 29 insertions(+), 23 deletions(-)

diff --git a/cpp/include/cudf/detail/valid_if.cuh b/cpp/include/cudf/detail/valid_if.cuh
index 28e5ad9a8ad..aa4421bb4ed 100644
--- a/cpp/include/cudf/detail/valid_if.cuh
+++ b/cpp/include/cudf/detail/valid_if.cuh
@@ -48,7 +48,8 @@ __global__ void valid_if_kernel(
 {
   constexpr size_type leader_lane{0};
   auto const lane_id{threadIdx.x % warp_size};
-  std::size_t i = threadIdx.x + blockIdx.x * blockDim.x;
+  thread_index_type i            = threadIdx.x + blockIdx.x * blockDim.x;
+  thread_index_type const stride = blockDim.x * gridDim.x;
   size_type warp_valid_count{0};
 
   auto active_mask = __ballot_sync(0xFFFF'FFFF, i < size);
@@ -58,7 +59,7 @@ __global__ void valid_if_kernel(
       output[cudf::word_index(i)] = ballot;
       warp_valid_count += __popc(ballot);
     }
-    i += blockDim.x * gridDim.x;
+    i += stride;
     active_mask = __ballot_sync(active_mask, i < size);
   }

diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp
index 76e2589a5a9..f6496980f17 100644
--- a/cpp/include/cudf/types.hpp
+++ b/cpp/include/cudf/types.hpp
@@ -83,10 +83,11 @@ class mutable_table_view;
 * @file
 */

-using size_type    = int32_t;
-using bitmask_type = uint32_t;
-using valid_type   = uint8_t;
-using offset_type  = int32_t;
+using size_type         = int32_t;
+using bitmask_type      = uint32_t;
+using valid_type        = uint8_t;
+using offset_type       = int32_t;
+using thread_index_type = int64_t;

diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu
index 45e338f904e..98a90518bcb 100644
--- a/cpp/src/copying/scatter.cu
+++ b/cpp/src/copying/scatter.cu
@@ -47,7 +47,8 @@ __global__ void marking_bitmask_kernel(mutable_column_device_view destination,
                                        MapIterator scatter_map,
                                        size_type num_scatter_rows)
 {
-  std::size_t row = threadIdx.x + blockIdx.x * blockDim.x;
+  thread_index_type row          = threadIdx.x + blockIdx.x * blockDim.x;
+  thread_index_type const stride = blockDim.x * gridDim.x;
 
   while (row < num_scatter_rows) {
     size_type const output_row = scatter_map[row];
@@ -58,7 +59,7 @@ __global__ void marking_bitmask_kernel(mutable_column_device_view destination,
       destination.set_null(output_row);
     }
 
-    row += blockDim.x * gridDim.x;
+    row += stride;
   }
 }
diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu
index b9c5a8a319c..8707a89d9c9 100644
--- a/cpp/src/replace/nulls.cu
+++ b/cpp/src/replace/nulls.cu
@@ -62,8 +62,9 @@ __global__ void replace_nulls_strings(cudf::column_device_view input,
                                       char* chars,
                                       cudf::size_type* valid_counter)
 {
-  cudf::size_type nrows = input.size();
-  std::size_t i         = blockIdx.x * blockDim.x + threadIdx.x;
+  cudf::size_type nrows                = input.size();
+  cudf::thread_index_type i            = blockIdx.x * blockDim.x + threadIdx.x;
+  cudf::thread_index_type const stride = blockDim.x * gridDim.x;
 
   uint32_t active_mask = 0xffffffff;
   active_mask          = __ballot_sync(active_mask, i < nrows);
@@ -98,7 +99,7 @@ __global__ void replace_nulls_strings(cudf::column_device_view input,
       if (nonzero_output) std::memcpy(chars + offsets[i], out.data(), out.size_bytes());
     }
 
-    i += blockDim.x * gridDim.x;
+    i += stride;
     active_mask = __ballot_sync(active_mask, i < nrows);
   }
 
@@ -114,8 +115,9 @@ __global__ void replace_nulls(cudf::column_device_view input,
                              cudf::mutable_column_device_view output,
                              cudf::size_type* output_valid_count)
 {
-  cudf::size_type nrows = input.size();
-  std::size_t i         = blockIdx.x * blockDim.x + threadIdx.x;
+  cudf::size_type nrows                = input.size();
+  cudf::thread_index_type i            = blockIdx.x * blockDim.x + threadIdx.x;
+  cudf::thread_index_type const stride = blockDim.x * gridDim.x;
 
   uint32_t active_mask = 0xffffffff;
   active_mask          = __ballot_sync(active_mask, i < nrows);
@@ -141,7 +143,7 @@ __global__ void replace_nulls(cudf::column_device_view input,
       }
     }
 
-    i += blockDim.x * gridDim.x;
+    i += stride;
     active_mask = __ballot_sync(active_mask, i < nrows);
   }
   if (replacement_has_nulls) {
diff --git a/cpp/src/rolling/jit/kernel.cu b/cpp/src/rolling/jit/kernel.cu
index 0f140ef1a79..1b8ff5c6a95 100644
--- a/cpp/src/rolling/jit/kernel.cu
+++ b/cpp/src/rolling/jit/kernel.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -51,8 +51,8 @@ __global__ void gpu_rolling_new(cudf::size_type nrows,
                                 FollowingWindowType following_window_begin,
                                 cudf::size_type min_periods)
 {
-  std::size_t i          = blockIdx.x * blockDim.x + threadIdx.x;
-  cudf::size_type stride = blockDim.x * gridDim.x;
+  thread_index_type i            = blockIdx.x * blockDim.x + threadIdx.x;
+  thread_index_type const stride = blockDim.x * gridDim.x;
 
   cudf::size_type warp_valid_count{0};
diff --git a/cpp/src/rolling/rolling_detail.cuh b/cpp/src/rolling/rolling_detail.cuh
index 2c8890ab860..f7528c6c930 100644
--- a/cpp/src/rolling/rolling_detail.cuh
+++ b/cpp/src/rolling/rolling_detail.cuh
@@ -1008,8 +1008,8 @@ __launch_bounds__(block_size) __global__
                       PrecedingWindowIterator preceding_window_begin,
                       FollowingWindowIterator following_window_begin)
 {
-  std::size_t i    = blockIdx.x * block_size + threadIdx.x;
-  size_type stride = block_size * gridDim.x;
+  thread_index_type i            = blockIdx.x * block_size + threadIdx.x;
+  thread_index_type const stride = block_size * gridDim.x;
 
   size_type warp_valid_count{0};
diff --git a/cpp/src/transform/compute_column.cu b/cpp/src/transform/compute_column.cu
index 70a61036392..bc3678380be 100644
--- a/cpp/src/transform/compute_column.cu
+++ b/cpp/src/transform/compute_column.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
  *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -67,12 +67,13 @@ __launch_bounds__(max_block_size) __global__
   auto thread_intermediate_storage =
     &intermediate_storage[threadIdx.x * device_expression_data.num_intermediates];
 
-  auto const start_idx = static_cast<cudf::size_type>(threadIdx.x + blockIdx.x * blockDim.x);
-  auto const stride    = static_cast<cudf::size_type>(blockDim.x * gridDim.x);
+  auto const start_idx =
+    static_cast<cudf::thread_index_type>(threadIdx.x + blockIdx.x * blockDim.x);
+  auto const stride = static_cast<cudf::thread_index_type>(blockDim.x * gridDim.x);
   auto evaluator =
     cudf::ast::detail::expression_evaluator<has_nulls>(table, device_expression_data);
 
-  for (std::size_t row_index = start_idx; row_index < table.num_rows(); row_index += stride) {
+  for (thread_index_type row_index = start_idx; row_index < table.num_rows(); row_index += stride) {
     auto output_dest = ast::detail::mutable_column_expression_result<has_nulls>(output_column);
     evaluator.evaluate(output_dest, row_index, thread_intermediate_storage);
   }
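Several of the kernels touched above (valid_if_kernel, both replace_nulls kernels, gpu_rolling) share the same warp-ballot bookkeeping around the grid-stride loop, which is why each fix has to re-ballot after `i += stride`. A reduced sketch of that pattern (hypothetical kernel and predicate; assumes blockDim.x is a multiple of the 32-thread warp size so each warp owns one bitmask word):

  __global__ void validity_sketch(float const* in,
                                  unsigned int* mask_words,
                                  int* valid_count,
                                  long long size)
  {
    long long i            = threadIdx.x + blockIdx.x * static_cast<long long>(blockDim.x);
    long long const stride = static_cast<long long>(blockDim.x) * gridDim.x;
    int warp_valid_count   = 0;

    // every thread that might still be in bounds has to keep voting, hence the
    // ballot both before the loop and again after each stride
    unsigned int active_mask = __ballot_sync(0xffffffffu, i < size);
    while (i < size) {
      unsigned int const ballot = __ballot_sync(active_mask, in[i] != 0.0f);
      if (threadIdx.x % 32 == 0) {  // leader lane writes the warp's word
        mask_words[i / 32] = ballot;
        warp_valid_count += __popc(ballot);
      }
      i += stride;
      active_mask = __ballot_sync(active_mask, i < size);
    }
    if (threadIdx.x % 32 == 0 && warp_valid_count > 0) { atomicAdd(valid_count, warp_valid_count); }
  }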
From b167b07a28f0c861b5b21eb9069b50a2d9811333 Mon Sep 17 00:00:00 2001
From: Dave Baranec
Date: Mon, 21 Mar 2022 11:14:07 -0500
Subject: [PATCH 09/10] PR review comments.

---
 cpp/src/rolling/rolling_detail.cuh | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/cpp/src/rolling/rolling_detail.cuh b/cpp/src/rolling/rolling_detail.cuh
index f7528c6c930..917ce4b6663 100644
--- a/cpp/src/rolling/rolling_detail.cuh
+++ b/cpp/src/rolling/rolling_detail.cuh
@@ -1020,12 +1020,10 @@ __launch_bounds__(block_size) __global__
     int64_t following_window = following_window_begin[i];
 
     // compute bounds
-    auto start =
-      static_cast<size_type>(min(static_cast<int64_t>(input.size()),
-                                 max(0L, static_cast<int64_t>(i) - preceding_window + 1)));
-    auto end =
-      static_cast<size_type>(min(static_cast<int64_t>(input.size()),
-                                 max(0L, static_cast<int64_t>(i) + following_window + 1)));
+    auto start = static_cast<size_type>(min(
+      static_cast<int64_t>(input.size()), max(0L, static_cast<int64_t>(i) - preceding_window + 1)));
+    auto end   = static_cast<size_type>(min(static_cast<int64_t>(input.size()),
+                                            max(0L, static_cast<int64_t>(i) + following_window + 1)));
     size_type start_index = min(start, end);
     size_type end_index   = max(start, end);
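One cleanup in the final commit below is worth calling out: `max(0L, ...)` becomes `max(int64_t{0}, ...)`. Our reading (an inference, not stated in the commit message): `long` is 32 bits on LLP64 platforms such as 64-bit Windows, so `0L` does not reliably match an `int64_t` operand, while a braced `int64_t` literal does on every platform:

  #include <algorithm>
  #include <cstdint>

  int64_t clamp_to_zero(int64_t x)
  {
    // return std::max(0L, x);      // fails to deduce on LLP64: long vs int64_t
    return std::max(int64_t{0}, x);  // portable: both arguments are int64_t
  }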
From e1bdea4379443a1be3a35fb1e621da4250475933 Mon Sep 17 00:00:00 2001
From: Dave Baranec
Date: Tue, 22 Mar 2022 13:29:06 -0500
Subject: [PATCH 10/10] Clean up some declarations and casting.

---
 cpp/src/rolling/jit/kernel.cu      | 24 +++++++++++++-----------
 cpp/src/rolling/rolling_detail.cuh | 18 +++++++++---------
 2 files changed, 22 insertions(+), 20 deletions(-)
diff --git a/cpp/src/rolling/jit/kernel.cu b/cpp/src/rolling/jit/kernel.cu
index 1b8ff5c6a95..4122f3a56a2 100644
--- a/cpp/src/rolling/jit/kernel.cu
+++ b/cpp/src/rolling/jit/kernel.cu
@@ -25,13 +25,13 @@ namespace rolling {
 namespace jit {
 
 template <typename WindowType>
-cudf::size_type __device__ get_window(WindowType window, cudf::size_type index)
+cudf::size_type __device__ get_window(WindowType window, cudf::thread_index_type index)
 {
   return window[index];
 }
 
 template <>
-cudf::size_type __device__ get_window(cudf::size_type window, cudf::size_type index)
+cudf::size_type __device__ get_window(cudf::size_type window, cudf::thread_index_type index)
 {
   return window;
 }
@@ -51,8 +51,8 @@ __global__ void gpu_rolling_new(cudf::size_type nrows,
                                 FollowingWindowType following_window_begin,
                                 cudf::size_type min_periods)
 {
-  thread_index_type i            = blockIdx.x * blockDim.x + threadIdx.x;
-  thread_index_type const stride = blockDim.x * gridDim.x;
+  cudf::thread_index_type i            = blockIdx.x * blockDim.x + threadIdx.x;
+  cudf::thread_index_type const stride = blockDim.x * gridDim.x;
 
   cudf::size_type warp_valid_count{0};
 
@@ -62,14 +62,16 @@ __global__ void gpu_rolling_new(cudf::size_type nrows,
     // for CUDA 10.0 and below (fixed in CUDA 10.1)
     volatile cudf::size_type count = 0;
 
-    cudf::size_type preceding_window = get_window(preceding_window_begin, i);
-    cudf::size_type following_window = get_window(following_window_begin, i);
+    int64_t const preceding_window = get_window(preceding_window_begin, i);
+    int64_t const following_window = get_window(following_window_begin, i);
 
     // compute bounds
-    cudf::size_type start = min(nrows, max(0, static_cast<cudf::size_type>(i) - preceding_window + 1));
-    cudf::size_type end   = min(nrows, max(0, static_cast<cudf::size_type>(i) + following_window + 1));
-    cudf::size_type start_index = min(start, end);
-    cudf::size_type end_index   = max(start, end);
+    auto const start = static_cast<cudf::size_type>(
+      min(static_cast<int64_t>(nrows), max(int64_t{0}, i - preceding_window + 1)));
+    auto const end = static_cast<cudf::size_type>(
+      min(static_cast<int64_t>(nrows), max(int64_t{0}, i + following_window + 1)));
+    auto const start_index = min(start, end);
+    auto const end_index   = max(start, end);
 
     // aggregate
     // TODO: We should explore using shared memory to avoid redundant loads.
@@ -79,7 +81,7 @@ __global__ void gpu_rolling_new(cudf::size_type nrows,
     OutType val = agg_op::template operate<OutType, InType>(in_col, start_index, count);
 
     // check if we have enough input samples
-    bool output_is_valid = (count >= min_periods);
+    bool const output_is_valid = (count >= min_periods);
 
     // set the mask
     const unsigned int result_mask = __ballot_sync(active_threads, output_is_valid);
diff --git a/cpp/src/rolling/rolling_detail.cuh b/cpp/src/rolling/rolling_detail.cuh
index 917ce4b6663..0ab8fff9a88 100644
--- a/cpp/src/rolling/rolling_detail.cuh
+++ b/cpp/src/rolling/rolling_detail.cuh
@@ -1016,16 +1016,16 @@ __launch_bounds__(block_size) __global__
   auto active_threads = __ballot_sync(0xffffffff, i < input.size());
   while (i < input.size()) {
     // to prevent overflow issues when computing bounds use int64_t
-    int64_t preceding_window = preceding_window_begin[i];
-    int64_t following_window = following_window_begin[i];
+    int64_t const preceding_window = preceding_window_begin[i];
+    int64_t const following_window = following_window_begin[i];
 
     // compute bounds
-    auto start = static_cast<size_type>(min(
-      static_cast<int64_t>(input.size()), max(0L, static_cast<int64_t>(i) - preceding_window + 1)));
-    auto end   = static_cast<size_type>(min(static_cast<int64_t>(input.size()),
-                                            max(0L, static_cast<int64_t>(i) + following_window + 1)));
-    size_type start_index = min(start, end);
-    size_type end_index   = max(start, end);
+    auto const start = static_cast<size_type>(
+      min(static_cast<int64_t>(input.size()), max(int64_t{0}, i - preceding_window + 1)));
+    auto const end = static_cast<size_type>(
+      min(static_cast<int64_t>(input.size()), max(int64_t{0}, i + following_window + 1)));
+    auto const start_index = min(start, end);
+    auto const end_index   = max(start, end);
 
     // aggregate
     // TODO: We should explore using shared memory to avoid redundant loads.
@@ -1037,7 +1037,7 @@ __launch_bounds__(block_size) __global__
         input, default_outputs, output, start_index, end_index, i);
 
     // set the mask
-    cudf::bitmask_type result_mask{__ballot_sync(active_threads, output_is_valid)};
+    cudf::bitmask_type const result_mask{__ballot_sync(active_threads, output_is_valid)};
 
     // only one thread writes the mask
     if (0 == threadIdx.x % cudf::detail::warp_size) {
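To close, worked numbers (hypothetical launch, illustrative only) for the failure mode this series removes: even when the row count is a perfectly legal `size_type`, the last `i += stride` of the old 32-bit loops could step past INT32_MAX.

  #include <cstdint>
  #include <cstdio>

  int main()
  {
    int32_t const num_rows = INT32_MAX;        // 2^31 - 1: the largest legal column
    int64_t const stride   = 256LL * 1048576;  // blockDim.x * gridDim.x for a large launch
    int64_t i              = num_rows - 1;     // a thread on its final iteration

    i += stride;  // the old kernels did this in 32 bits: signed overflow, UB
    printf("next index %lld exceeds INT32_MAX (%d)\n", static_cast<long long>(i), num_rows);
    return 0;
  }

With the 64-bit `thread_index_type`, the increment is well defined and the `i < size` test simply fails, ending the loop.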