You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
#include<cudf/binaryop.hpp>
#include<cudf/column/column_factories.hpp>
#include<cudf/scalar/scalar.hpp>
#include<cudf/scalar/scalar_factories.hpp>
#include<cudf/types.hpp>
#include<vector>intmain(void) {
cudf::size_type num_rows = 1799989091;
auto s = cudf::make_default_constructed_scalar(
cudf::data_type(cudf::type_id::TIMESTAMP_DAYS));
auto a = cudf::make_column_from_scalar(*s, num_rows);
auto b = cudf::make_column_from_scalar(*s, num_rows);
auto mask = cudf::binary_operation(a->view(), b->view(),
cudf::binary_operator::GREATER,
cudf::data_type(cudf::type_id::BOOL8));
}
Run under compute-sanitizer, this produces:
========= Invalid __global__ read of size 4 bytes
========= at void cudf::binops::compiled::for_each_kernel<cudf::binops::compiled::binary_op_device_dispatcher<cudf::binops::compiled::ops::Greater>>(int, T1)+0x220d0
========= by thread (739,0,0) in block (452470,0,0)
========= Address 0x77685800338c is out of bounds
========= and is 6475992180 bytes before the nearest allocation at 0x7769da000000 of size 1799989104 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x2f26ef]
========= in /usr/lib/x86_64-linux-gnu/libcuda.so.1
========= Host Frame: [0x15a13]
========= in /home/coder/.conda/envs/rapids/lib/libcudart.so.12
========= Host Frame:cudaLaunchKernel [0x75750]
========= in /home/coder/.conda/envs/rapids/lib/libcudart.so.12
========= Host Frame:void cudf::binops::compiled::for_each<cudf::binops::compiled::binary_op_device_dispatcher<cudf::binops::compiled::ops::Greater> >(rmm::cuda_stream_view, int, cudf::binops::compiled::binary_op_device_dispatcher<cudf::binops::compiled::ops::Greater>) [0x4385b9]
========= in /home/coder/cudf/cpp/build/conda/cuda-12.5/release/libcudf.so
========= Host Frame:void cudf::binops::compiled::apply_binary_op<cudf::binops::compiled::ops::Greater>(cudf::mutable_column_view&, cudf::column_view const&, cudf::column_view const&, bool, bool, rmm::cuda_stream_view) [0x43946b]
========= in /home/coder/cudf/cpp/build/conda/cuda-12.5/release/libcudf.so
The relevant part of the binops code is:
template <typename Functor>
CUDF_KERNEL voidfor_each_kernel(cudf::size_type size, Functor f)
{
int tid = threadIdx.x;
int blkid = blockIdx.x;
int blksz = blockDim.x;
int gridsz = gridDim.x;
int start = tid + blkid * blksz;
int step = blksz * gridsz;
#pragma unroll
for (cudf::size_type i = start; i < size; i += step) {
f(i);
}
}
/** * @brief Launches Simplified for_each kernel with maximum occupancy grid dimensions. * * @tparam Functor * @param stream CUDA stream used for device memory operations and kernel launches. * @param size number of elements to process. * @param f Functor object to call for each element.*/template <typename Functor>
voidfor_each(rmm::cuda_stream_view stream, cudf::size_type size, Functor f)
{
int block_size;
int min_grid_size;
CUDF_CUDA_TRY(
cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, for_each_kernel<decltype(f)>));
// 2 elements per thread.intconst grid_size = util::div_rounding_up_safe(size, 2 * block_size);
for_each_kernel<<<grid_size, block_size, 0, stream.value()>>>(size, std::forward<Functor&&>(f));
}
On my system, assuming the for each kernel specializes to f := out[i] = left[i] > right[i];, then I get:
block_size = 768;
grid_size = 1171868;
and
// This is used as the step in the grid-stride loop
step = block_size * grid_size = 899994624;
But, num_rows + step > std::numeric_limits<size_type>::max(), so the grid-stride loop overflows in the increment step.
The text was updated successfully, but these errors were encountered:
For large columns, the computed stride might end up overflowing
size_type. To fix this, use the grid_1d helper. See also rapidsai#10368.
- Closesrapidsai#17353
Consider:
Run under compute-sanitizer, this produces:
The relevant part of the binops code is:
On my system, assuming the for each kernel specializes to
f := out[i] = left[i] > right[i];
, then I get:and
But,
num_rows + step > std::numeric_limits<size_type>::max()
, so the grid-stride loop overflows in the increment step.The text was updated successfully, but these errors were encountered: