Batch of fixes for index overflows in grid stride loops. #10448

Merged (10 commits), Mar 23, 2022
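Before the per-file hunks, a minimal sketch of the pattern these commits apply, using illustrative names (`copy_kernel`, `in`, `out`) that are not part of the PR: the per-thread index and the stride are held in the new 64-bit `cudf::thread_index_type` rather than the 32-bit `size_type`, so accumulating `i += stride` across loop iterations can no longer wrap past `INT32_MAX` on inputs approaching 2^31 rows.

```cuda
#include <cstdint>

// Illustrative stand-ins for cudf::size_type and the new cudf::thread_index_type.
using size_type         = int32_t;
using thread_index_type = int64_t;

// Minimal sketch (not taken from the PR) of the corrected grid-stride pattern:
// the thread index and stride live in a 64-bit type, so `i += stride` cannot
// wrap to a negative value while walking a column with close to 2^31 rows.
__global__ void copy_kernel(int const* in, int* out, size_type size)
{
  thread_index_type i            = threadIdx.x + blockIdx.x * blockDim.x;
  thread_index_type const stride = blockDim.x * gridDim.x;

  while (i < size) {
    out[i] = in[i];  // i < size, so i also fits in size_type at this point
    i += stride;     // 64-bit accumulation; with a 32-bit index this could overflow
  }
}
```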
5 changes: 3 additions & 2 deletions cpp/include/cudf/detail/valid_if.cuh
@@ -48,7 +48,8 @@ __global__ void valid_if_kernel(
{
constexpr size_type leader_lane{0};
auto const lane_id{threadIdx.x % warp_size};
size_type i = threadIdx.x + blockIdx.x * blockDim.x;
thread_index_type i = threadIdx.x + blockIdx.x * blockDim.x;
thread_index_type const stride = blockDim.x * gridDim.x;
size_type warp_valid_count{0};

auto active_mask = __ballot_sync(0xFFFF'FFFF, i < size);
@@ -58,7 +59,7 @@
output[cudf::word_index(i)] = ballot;
warp_valid_count += __popc(ballot);
}
i += blockDim.x * gridDim.x;
i += stride;
active_mask = __ballot_sync(active_mask, i < size);
}

9 changes: 5 additions & 4 deletions cpp/include/cudf/types.hpp
@@ -83,10 +83,11 @@ class mutable_table_view;
* @file
*/

using size_type = int32_t;
using bitmask_type = uint32_t;
using valid_type = uint8_t;
using offset_type = int32_t;
using size_type = int32_t;
using bitmask_type = uint32_t;
using valid_type = uint8_t;
using offset_type = int32_t;
using thread_index_type = int64_t;

/**
* @brief Similar to `std::distance` but returns `cudf::size_type` and performs `static_cast`
14 changes: 10 additions & 4 deletions cpp/src/copying/scatter.cu
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -47,7 +48,8 @@ __global__ void marking_bitmask_kernel(mutable_column_device_view destination,
MapIterator scatter_map,
size_type num_scatter_rows)
{
size_type row = threadIdx.x + blockIdx.x * blockDim.x;
thread_index_type row = threadIdx.x + blockIdx.x * blockDim.x;
thread_index_type const stride = blockDim.x * gridDim.x;

while (row < num_scatter_rows) {
size_type const output_row = scatter_map[row];
@@ -58,7 +59,7 @@ __global__ void marking_bitmask_kernel(mutable_column_device_view destination,
destination.set_null(output_row);
}

row += blockDim.x * gridDim.x;
row += stride;
}
}

@@ -351,8 +352,13 @@ std::unique_ptr<table> scatter(std::vector<std::reference_wrapper<const scalar>>

// Transform negative indices to index + target size
auto scatter_rows = indices.size();
// Note: the intermediate ((in % n_rows) + n_rows) will overflow a size_type for any value of `in`
// > (2^31)/2, but the end result after the final (% n_rows) will fit, so the computation is done
// using a signed 64-bit value.
auto scatter_iter = thrust::make_transform_iterator(
map_begin, [n_rows] __device__(size_type in) { return ((in % n_rows) + n_rows) % n_rows; });
map_begin, [n_rows = static_cast<int64_t>(n_rows)] __device__(size_type in) -> size_type {
return ((static_cast<int64_t>(in) % n_rows) + n_rows) % n_rows;
});

// Dispatch over data type per column
auto result = std::vector<std::unique_ptr<column>>(target.num_columns());
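As a hedged, host-side illustration of the comment above (hypothetical values, not code from the PR), the snippet below shows why the intermediate `(in % n_rows) + n_rows` needs 64-bit arithmetic even though the final wrapped index always fits back into `size_type`:

```cpp
#include <cstdint>
#include <iostream>

// Map a possibly negative scatter index `in` into [0, n_rows), doing the
// intermediate arithmetic in int64_t as the PR does.
int32_t wrap_index(int32_t in, int32_t n_rows_32)
{
  int64_t const n_rows = n_rows_32;
  return static_cast<int32_t>(((static_cast<int64_t>(in) % n_rows) + n_rows) % n_rows);
}

int main()
{
  int32_t const n_rows = 2'000'000'000;  // close to the size_type limit
  // in = 1'500'000'000: (in % n_rows) + n_rows = 3'500'000'000, which exceeds
  // INT32_MAX but is representable in int64_t; the final % n_rows brings it
  // back to 1'500'000'000, which fits in size_type.
  std::cout << wrap_index(1'500'000'000, n_rows) << '\n';  // prints 1500000000
  // in = -1 wraps to n_rows - 1.
  std::cout << wrap_index(-1, n_rows) << '\n';             // prints 1999999999
  return 0;
}
```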
15 changes: 9 additions & 6 deletions cpp/src/replace/nulls.cu
@@ -62,8 +62,9 @@ __global__ void replace_nulls_strings(cudf::column_device_view input,
char* chars,
cudf::size_type* valid_counter)
{
cudf::size_type nrows = input.size();
cudf::size_type i = blockIdx.x * blockDim.x + threadIdx.x;
cudf::size_type nrows = input.size();
cudf::thread_index_type i = blockIdx.x * blockDim.x + threadIdx.x;
cudf::thread_index_type const stride = blockDim.x * gridDim.x;

uint32_t active_mask = 0xffffffff;
active_mask = __ballot_sync(active_mask, i < nrows);
@@ -98,7 +99,7 @@ __global__ void replace_nulls_strings(cudf::column_device_view input,
if (nonzero_output) std::memcpy(chars + offsets[i], out.data(), out.size_bytes());
}

i += blockDim.x * gridDim.x;
i += stride;
active_mask = __ballot_sync(active_mask, i < nrows);
}

@@ -114,8 +115,9 @@ __global__ void replace_nulls(cudf::column_device_view input,
cudf::mutable_column_device_view output,
cudf::size_type* output_valid_count)
{
cudf::size_type nrows = input.size();
cudf::size_type i = blockIdx.x * blockDim.x + threadIdx.x;
cudf::size_type nrows = input.size();
cudf::thread_index_type i = blockIdx.x * blockDim.x + threadIdx.x;
cudf::thread_index_type const stride = blockDim.x * gridDim.x;

uint32_t active_mask = 0xffffffff;
active_mask = __ballot_sync(active_mask, i < nrows);
@@ -141,7 +143,7 @@ }
}
}

i += blockDim.x * gridDim.x;
i += stride;
active_mask = __ballot_sync(active_mask, i < nrows);
}
if (replacement_has_nulls) {
@@ -247,6 +249,7 @@ std::unique_ptr<cudf::column> replace_nulls_column_kernel_forwarder::operator()<

std::unique_ptr<cudf::column> offsets = cudf::strings::detail::make_offsets_child_column(
sizes_view.begin<int32_t>(), sizes_view.end<int32_t>(), stream, mr);

auto offsets_view = offsets->mutable_view();

auto const bytes =
26 changes: 14 additions & 12 deletions cpp/src/rolling/jit/kernel.cu
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -25,13 +25,13 @@ namespace rolling {
namespace jit {

template <typename WindowType>
cudf::size_type __device__ get_window(WindowType window, cudf::size_type index)
cudf::size_type __device__ get_window(WindowType window, cudf::thread_index_type index)
{
return window[index];
}

template <>
cudf::size_type __device__ get_window(cudf::size_type window, cudf::size_type index)
cudf::size_type __device__ get_window(cudf::size_type window, cudf::thread_index_type index)
{
return window;
}
@@ -51,8 +51,8 @@ __global__ void gpu_rolling_new(cudf::size_type nrows,
FollowingWindowType following_window_begin,
cudf::size_type min_periods)
{
cudf::size_type i = blockIdx.x * blockDim.x + threadIdx.x;
cudf::size_type stride = blockDim.x * gridDim.x;
cudf::thread_index_type i = blockIdx.x * blockDim.x + threadIdx.x;
cudf::thread_index_type const stride = blockDim.x * gridDim.x;

cudf::size_type warp_valid_count{0};

@@ -62,14 +62,16 @@
// for CUDA 10.0 and below (fixed in CUDA 10.1)
volatile cudf::size_type count = 0;

cudf::size_type preceding_window = get_window(preceding_window_begin, i);
cudf::size_type following_window = get_window(following_window_begin, i);
int64_t const preceding_window = get_window(preceding_window_begin, i);
int64_t const following_window = get_window(following_window_begin, i);

// compute bounds
cudf::size_type start = min(nrows, max(0, i - preceding_window + 1));
cudf::size_type end = min(nrows, max(0, i + following_window + 1));
cudf::size_type start_index = min(start, end);
cudf::size_type end_index = max(start, end);
auto const start = static_cast<cudf::size_type>(
min(static_cast<int64_t>(nrows), max(int64_t{0}, i - preceding_window + 1)));
auto const end = static_cast<cudf::size_type>(
min(static_cast<int64_t>(nrows), max(int64_t{0}, i + following_window + 1)));
auto const start_index = min(start, end);
auto const end_index = max(start, end);

// aggregate
// TODO: We should explore using shared memory to avoid redundant loads.
@@ -79,7 +81,7 @@
OutType val = agg_op::template operate<OutType, InType>(in_col, start_index, count);

// check if we have enough input samples
bool output_is_valid = (count >= min_periods);
bool const output_is_valid = (count >= min_periods);

// set the mask
const unsigned int result_mask = __ballot_sync(active_threads, output_is_valid);
22 changes: 11 additions & 11 deletions cpp/src/rolling/rolling_detail.cuh
@@ -1008,24 +1008,24 @@ __launch_bounds__(block_size) __global__
PrecedingWindowIterator preceding_window_begin,
FollowingWindowIterator following_window_begin)
{
size_type i = blockIdx.x * block_size + threadIdx.x;
size_type stride = block_size * gridDim.x;
thread_index_type i = blockIdx.x * block_size + threadIdx.x;
thread_index_type const stride = block_size * gridDim.x;

size_type warp_valid_count{0};

auto active_threads = __ballot_sync(0xffffffff, i < input.size());
while (i < input.size()) {
// to prevent overflow issues when computing bounds use int64_t
int64_t preceding_window = preceding_window_begin[i];
int64_t following_window = following_window_begin[i];
int64_t const preceding_window = preceding_window_begin[i];
int64_t const following_window = following_window_begin[i];

// compute bounds
auto start = static_cast<size_type>(
min(static_cast<int64_t>(input.size()), max(0L, i - preceding_window + 1)));
auto end = static_cast<size_type>(
min(static_cast<int64_t>(input.size()), max(0L, i + following_window + 1)));
size_type start_index = min(start, end);
size_type end_index = max(start, end);
auto const start = static_cast<size_type>(
min(static_cast<int64_t>(input.size()), max(int64_t{0}, i - preceding_window + 1)));
auto const end = static_cast<size_type>(
min(static_cast<int64_t>(input.size()), max(int64_t{0}, i + following_window + 1)));
auto const start_index = min(start, end);
auto const end_index = max(start, end);

// aggregate
// TODO: We should explore using shared memory to avoid redundant loads.
@@ -1037,7 +1037,7 @@ __launch_bounds__(block_size) __global__
input, default_outputs, output, start_index, end_index, i);

// set the mask
cudf::bitmask_type result_mask{__ballot_sync(active_threads, output_is_valid)};
cudf::bitmask_type const result_mask{__ballot_sync(active_threads, output_is_valid)};

// only one thread writes the mask
if (0 == threadIdx.x % cudf::detail::warp_size) {
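For the two rolling kernels above, a small sketch (an assumed helper, not code from the PR) of the same idea applied to the window bounds: intermediates such as `i + following_window + 1` are computed in `int64_t`, and only the clamped result is narrowed back to `size_type`:

```cpp
#include <algorithm>
#include <cstdint>

using size_type = int32_t;  // stand-in for cudf::size_type

// Clamp the exclusive end of a rolling window to [0, nrows]. For a row index
// near 2^31 - 1, `i + following_window + 1` would overflow a 32-bit int, so
// the arithmetic is done in int64_t before narrowing, mirroring the kernels above.
inline size_type clamp_window_end(int64_t i, int64_t following_window, size_type nrows)
{
  return static_cast<size_type>(
    std::min(static_cast<int64_t>(nrows), std::max(int64_t{0}, i + following_window + 1)));
}
```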
9 changes: 5 additions & 4 deletions cpp/src/transform/compute_column.cu
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -67,12 +67,13 @@ __launch_bounds__(max_block_size) __global__

auto thread_intermediate_storage =
&intermediate_storage[threadIdx.x * device_expression_data.num_intermediates];
auto const start_idx = static_cast<cudf::size_type>(threadIdx.x + blockIdx.x * blockDim.x);
auto const stride = static_cast<cudf::size_type>(blockDim.x * gridDim.x);
auto const start_idx =
static_cast<cudf::thread_index_type>(threadIdx.x + blockIdx.x * blockDim.x);
auto const stride = static_cast<cudf::thread_index_type>(blockDim.x * gridDim.x);
auto evaluator =
cudf::ast::detail::expression_evaluator<has_nulls>(table, device_expression_data);

for (cudf::size_type row_index = start_idx; row_index < table.num_rows(); row_index += stride) {
for (thread_index_type row_index = start_idx; row_index < table.num_rows(); row_index += stride) {
auto output_dest = ast::detail::mutable_column_expression_result<has_nulls>(output_column);
evaluator.evaluate(output_dest, row_index, thread_intermediate_storage);
}