Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prevent use of invalid grid sizes in ORC reader and writer #17709

Merged
merged 9 commits into from
Jan 22, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 17 additions & 20 deletions cpp/src/io/orc/dict_enc.cu
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,10 @@ CUDF_KERNEL void rowgroup_char_counts_kernel(device_2dspan<size_type> char_count
device_span<uint32_t const> str_col_indexes)
{
// Index of the column in the `str_col_indexes` array
auto const str_col_idx = blockIdx.y;
auto const str_col_idx = blockIdx.x % str_col_indexes.size();
// Index of the column in the `orc_columns` array
auto const col_idx = str_col_indexes[str_col_idx];
auto const row_group_idx = blockIdx.x * blockDim.x + threadIdx.x;
auto const row_group_idx = (blockIdx.x / str_col_indexes.size()) * blockDim.x + threadIdx.x;
if (row_group_idx >= rowgroup_bounds.size().first) { return; }

auto const& str_col = orc_columns[col_idx];
Expand All @@ -63,18 +63,17 @@ void rowgroup_char_counts(device_2dspan<size_type> counts,
if (rowgroup_bounds.count() == 0) { return; }

auto const num_rowgroups = rowgroup_bounds.size().first;
auto const num_str_cols = str_col_indexes.size();
if (num_str_cols == 0) { return; }
if (str_col_indexes.empty()) { return; }

int block_size = 0; // suggested thread count to use
int min_grid_size = 0; // minimum block count required
CUDF_CUDA_TRY(
cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, rowgroup_char_counts_kernel));
auto const grid_size =
dim3(cudf::util::div_rounding_up_unsafe<unsigned int>(num_rowgroups, block_size),
static_cast<unsigned int>(num_str_cols));
auto const num_blocks =
cudf::util::div_rounding_up_unsafe<unsigned int>(num_rowgroups, block_size) *
str_col_indexes.size();

rowgroup_char_counts_kernel<<<grid_size, block_size, 0, stream.value()>>>(
rowgroup_char_counts_kernel<<<num_blocks, block_size, 0, stream.value()>>>(
counts, orc_columns, rowgroup_bounds, str_col_indexes);
}

Expand Down Expand Up @@ -104,8 +103,8 @@ CUDF_KERNEL void __launch_bounds__(block_size)
populate_dictionary_hash_maps_kernel(device_2dspan<stripe_dictionary> dictionaries,
device_span<orc_column_device_view const> columns)
{
auto const col_idx = blockIdx.x;
auto const stripe_idx = blockIdx.y;
auto const col_idx = blockIdx.x / dictionaries.size().second;
auto const stripe_idx = blockIdx.x % dictionaries.size().second;
auto const t = threadIdx.x;
auto& dict = dictionaries[col_idx][stripe_idx];
auto const& col = columns[dict.column_idx];
Expand Down Expand Up @@ -166,8 +165,8 @@ template <int block_size>
CUDF_KERNEL void __launch_bounds__(block_size)
collect_map_entries_kernel(device_2dspan<stripe_dictionary> dictionaries)
{
auto const col_idx = blockIdx.x;
auto const stripe_idx = blockIdx.y;
auto const col_idx = blockIdx.x / dictionaries.size().second;
auto const stripe_idx = blockIdx.x % dictionaries.size().second;
auto const& dict = dictionaries[col_idx][stripe_idx];

if (not dict.is_enabled) { return; }
Expand Down Expand Up @@ -200,8 +199,8 @@ CUDF_KERNEL void __launch_bounds__(block_size)
get_dictionary_indices_kernel(device_2dspan<stripe_dictionary> dictionaries,
device_span<orc_column_device_view const> columns)
{
auto const col_idx = blockIdx.x;
auto const stripe_idx = blockIdx.y;
auto const col_idx = blockIdx.x / dictionaries.size().second;
auto const stripe_idx = blockIdx.x % dictionaries.size().second;
auto const t = threadIdx.x;
auto const& dict = dictionaries[col_idx][stripe_idx];
auto const& col = columns[dict.column_idx];
Expand Down Expand Up @@ -244,18 +243,17 @@ void populate_dictionary_hash_maps(device_2dspan<stripe_dictionary> dictionaries
{
if (dictionaries.count() == 0) { return; }
constexpr int block_size = 256;
dim3 const dim_grid(dictionaries.size().first, dictionaries.size().second);
populate_dictionary_hash_maps_kernel<block_size>
<<<dim_grid, block_size, 0, stream.value()>>>(dictionaries, columns);
<<<dictionaries.count(), block_size, 0, stream.value()>>>(dictionaries, columns);
}

void collect_map_entries(device_2dspan<stripe_dictionary> dictionaries,
rmm::cuda_stream_view stream)
{
if (dictionaries.count() == 0) { return; }
constexpr int block_size = 1024;
dim3 const dim_grid(dictionaries.size().first, dictionaries.size().second);
collect_map_entries_kernel<block_size><<<dim_grid, block_size, 0, stream.value()>>>(dictionaries);
collect_map_entries_kernel<block_size>
<<<dictionaries.count(), block_size, 0, stream.value()>>>(dictionaries);
}

void get_dictionary_indices(device_2dspan<stripe_dictionary> dictionaries,
Expand All @@ -264,9 +262,8 @@ void get_dictionary_indices(device_2dspan<stripe_dictionary> dictionaries,
{
if (dictionaries.count() == 0) { return; }
constexpr int block_size = 1024;
dim3 const dim_grid(dictionaries.size().first, dictionaries.size().second);
get_dictionary_indices_kernel<block_size>
<<<dim_grid, block_size, 0, stream.value()>>>(dictionaries, columns);
<<<dictionaries.count(), block_size, 0, stream.value()>>>(dictionaries, columns);
}

} // namespace cudf::io::orc::gpu
21 changes: 12 additions & 9 deletions cpp/src/io/orc/stats_enc.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
* Copyright (c) 2020-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -14,9 +14,9 @@
* limitations under the License.
*/

#include "io/utilities/block_utils.cuh"
#include "orc_gpu.hpp"

#include <cudf/detail/utilities/integer_utils.hpp>
#include <cudf/io/orc_types.hpp>
#include <cudf/strings/detail/convert/fixed_point_to_string.cuh>

Expand All @@ -40,8 +40,9 @@ CUDF_KERNEL void __launch_bounds__(init_threads_per_block)
device_2dspan<rowgroup_rows const> rowgroup_bounds)
{
__shared__ __align__(4) statistics_group group_g[init_groups_per_block];
auto const col_id = blockIdx.y;
auto const chunk_id = (blockIdx.x * init_groups_per_block) + threadIdx.y;
auto const col_id = blockIdx.x % rowgroup_bounds.size().second;
auto const chunk_id =
(blockIdx.x / rowgroup_bounds.size().second * init_groups_per_block) + threadIdx.y;
auto const t = threadIdx.x;
auto const num_rowgroups = rowgroup_bounds.size().first;
statistics_group* group = &group_g[threadIdx.y];
Expand Down Expand Up @@ -452,10 +453,12 @@ void orc_init_statistics_groups(statistics_group* groups,
device_2dspan<rowgroup_rows const> rowgroup_bounds,
rmm::cuda_stream_view stream)
{
dim3 dim_grid((rowgroup_bounds.size().first + init_groups_per_block - 1) / init_groups_per_block,
rowgroup_bounds.size().second);
auto const num_blocks =
cudf::util::div_rounding_up_safe<size_t>(rowgroup_bounds.size().first, init_groups_per_block) *
rowgroup_bounds.size().second;

dim3 dim_block(init_threads_per_group, init_groups_per_block);
gpu_init_statistics_groups<<<dim_grid, dim_block, 0, stream.value()>>>(
gpu_init_statistics_groups<<<num_blocks, dim_block, 0, stream.value()>>>(
groups, cols, rowgroup_bounds);
}

Expand Down Expand Up @@ -490,8 +493,8 @@ void orc_encode_statistics(uint8_t* blob_bfr,
uint32_t statistics_count,
rmm::cuda_stream_view stream)
{
unsigned int num_blocks =
(statistics_count + encode_chunks_per_block - 1) / encode_chunks_per_block;
auto const num_blocks =
cudf::util::div_rounding_up_safe(statistics_count, encode_chunks_per_block);
dim3 dim_block(encode_threads_per_chunk, encode_chunks_per_block);
gpu_encode_statistics<<<num_blocks, dim_block, 0, stream.value()>>>(
blob_bfr, groups, chunks, statistics_count);
Expand Down
37 changes: 21 additions & 16 deletions cpp/src/io/orc/stripe_data.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1288,16 +1288,17 @@ CUDF_KERNEL void __launch_bounds__(block_size)
} temp_storage;

orcdec_state_s* const s = &state_g;
bool const is_nulldec = (blockIdx.y >= num_stripes);
uint32_t const column = blockIdx.x;
uint32_t const stripe = (is_nulldec) ? blockIdx.y - num_stripes : blockIdx.y;
// Need the modulo because we have twice as many threads as columns*stripes
uint32_t const column = blockIdx.x / num_stripes;
uint32_t const stripe = blockIdx.x % num_stripes;
uint32_t const chunk_id = stripe * num_columns + column;
int t = threadIdx.x;

if (t == 0) s->chunk = chunks[chunk_id];
__syncthreads();
size_t const max_num_rows = s->chunk.column_num_rows - s->chunk.parent_validity_info.null_count;

bool const is_nulldec = (blockIdx.y == 0);
if (is_nulldec) {
uint32_t null_count = 0;
// Decode NULLs
Expand Down Expand Up @@ -1576,7 +1577,11 @@ CUDF_KERNEL void __launch_bounds__(block_size)
auto num_rowgroups = row_groups.size().first;

if (num_rowgroups > 0) {
if (t == 0) { s->top.data.index = row_groups[blockIdx.y][blockIdx.x]; }
if (t == 0) {
auto const rg_idx = blockIdx.x % num_rowgroups;
shrshi marked this conversation as resolved.
Show resolved Hide resolved
auto const col_idx = blockIdx.x / num_rowgroups;
s->top.data.index = row_groups[rg_idx][col_idx];
}
__syncthreads();
chunk_id = s->top.data.index.chunk_id;
} else {
Expand All @@ -1601,10 +1606,11 @@ CUDF_KERNEL void __launch_bounds__(block_size)
if (s->top.data.index.strm_offset[1] > s->chunk.strm_len[CI_DATA2]) {
atomicAdd(error_count, 1);
}
auto const ofs0 = min(s->top.data.index.strm_offset[0], s->chunk.strm_len[CI_DATA]);
auto const ofs1 = min(s->top.data.index.strm_offset[1], s->chunk.strm_len[CI_DATA2]);
auto const ofs0 = min(s->top.data.index.strm_offset[0], s->chunk.strm_len[CI_DATA]);
auto const ofs1 = min(s->top.data.index.strm_offset[1], s->chunk.strm_len[CI_DATA2]);
uint32_t const rg_idx = blockIdx.x % num_rowgroups;
uint32_t rowgroup_rowofs =
(level == 0) ? (blockIdx.y - min(s->chunk.rowgroup_id, blockIdx.y)) * rowidx_stride
(level == 0) ? (rg_idx - cuda::std::min(s->chunk.rowgroup_id, rg_idx)) * rowidx_stride
: s->top.data.index.start_row;
;
s->chunk.streams[CI_DATA] += ofs0;
Expand Down Expand Up @@ -2025,7 +2031,9 @@ CUDF_KERNEL void __launch_bounds__(block_size)
}
if (t == 0 and (s->chunk.type_kind == LIST or s->chunk.type_kind == MAP)) {
if (num_rowgroups > 0) {
row_groups[blockIdx.y][blockIdx.x].num_child_rows = s->num_child_rows;
auto const rg_idx = blockIdx.x % num_rowgroups;
auto const col_idx = blockIdx.x / num_rowgroups;
row_groups[rg_idx][col_idx].num_child_rows = s->num_child_rows;
}
cuda::atomic_ref<int64_t, cuda::thread_scope_device> ref{chunks[chunk_id].num_child_rows};
ref.fetch_add(s->num_child_rows, cuda::std::memory_order_relaxed);
Expand All @@ -2049,9 +2057,9 @@ void __host__ DecodeNullsAndStringDictionaries(ColumnDesc* chunks,
int64_t first_row,
rmm::cuda_stream_view stream)
{
dim3 dim_block(block_size, 1);
dim3 dim_grid(num_columns, num_stripes * 2); // 1024 threads per chunk
gpuDecodeNullsAndStringDictionaries<block_size><<<dim_grid, dim_block, 0, stream.value()>>>(
dim3 dim_grid(num_columns * num_stripes, 2);

gpuDecodeNullsAndStringDictionaries<block_size><<<dim_grid, block_size, 0, stream.value()>>>(
chunks, global_dictionary, num_columns, num_stripes, first_row);
}

Expand Down Expand Up @@ -2083,11 +2091,8 @@ void __host__ DecodeOrcColumnData(ColumnDesc* chunks,
size_type* error_count,
rmm::cuda_stream_view stream)
{
auto const num_chunks = num_columns * num_stripes;
dim3 dim_block(block_size, 1); // 1024 threads per chunk
dim3 dim_grid((num_rowgroups > 0) ? num_columns : num_chunks,
(num_rowgroups > 0) ? num_rowgroups : 1);
gpuDecodeOrcColumnData<block_size><<<dim_grid, dim_block, 0, stream.value()>>>(
auto const num_blocks = num_columns * (num_rowgroups > 0 ? num_rowgroups : num_stripes);
gpuDecodeOrcColumnData<block_size><<<num_blocks, block_size, 0, stream.value()>>>(
chunks, global_dictionary, tz_table, row_groups, first_row, rowidx_stride, level, error_count);
}

Expand Down
Loading
Loading