Skip to content

Commit

Permalink
Use 64-bit offsets only if the current strings column output chunk size exceeds threshold (#17693)
Browse files Browse the repository at this point in the history

This PR improves on #17207: 64-bit offsets are now used only if the current output chunk of a strings column exceeds the large-strings threshold, instead of comparing cumulative strings column sizes accumulated at the `pass` or `row group` level against that threshold.

Authors:
  - Muhammad Haseeb (https://github.com/mhaseeb123)

Approvers:
  - Karthikeyan (https://github.com/karthikeyann)
  - David Wendt (https://github.com/davidwendt)
  - Yunsong Wang (https://github.com/PointKernel)

URL: #17693
  • Loading branch information
mhaseeb123 authored Jan 10, 2025
1 parent a8a4197 commit 559cda2
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 36 deletions.
48 changes: 16 additions & 32 deletions cpp/src/io/parquet/reader_impl.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
* Copyright (c) 2019-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -97,38 +97,24 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num
_stream);
}

// Compute column string sizes (using page string offsets) for this subpass
// Compute column string sizes (using page string offsets) for this output table chunk
col_string_sizes = calculate_page_string_offsets();

// ensure cumulative column string sizes have been initialized
if (pass.cumulative_col_string_sizes.empty()) {
pass.cumulative_col_string_sizes.resize(_input_columns.size(), 0);
}

// Add to the cumulative column string sizes of this pass
std::transform(pass.cumulative_col_string_sizes.begin(),
pass.cumulative_col_string_sizes.end(),
col_string_sizes.begin(),
pass.cumulative_col_string_sizes.begin(),
std::plus<>{});

// Check for overflow in cumulative column string sizes of this pass so that the page string
// offsets of overflowing (large) string columns are treated as 64-bit.
auto const threshold = static_cast<size_t>(strings::detail::get_offset64_threshold());
auto const has_large_strings = std::any_of(pass.cumulative_col_string_sizes.cbegin(),
pass.cumulative_col_string_sizes.cend(),
auto const has_large_strings = std::any_of(col_string_sizes.cbegin(),
col_string_sizes.cend(),
[=](std::size_t sz) { return sz > threshold; });
if (has_large_strings and not strings::detail::is_large_strings_enabled()) {
CUDF_FAIL("String column exceeds the column size limit", std::overflow_error);
}

// Mark any chunks for which the cumulative column string size has exceeded the
// large strings threshold
if (has_large_strings) {
for (auto& chunk : pass.chunks) {
auto const idx = chunk.src_col_index;
if (pass.cumulative_col_string_sizes[idx] > threshold) { chunk.is_large_string_col = true; }
}
// Mark/unmark column-chunk descriptors depending on the string sizes of corresponding output
// column chunks and the large strings threshold.
for (auto& chunk : pass.chunks) {
auto const idx = chunk.src_col_index;
chunk.is_large_string_col = (col_string_sizes[idx] > threshold);
}
}

Expand Down Expand Up @@ -210,11 +196,9 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num
// only do string buffer for leaf
if (idx == max_depth - 1 and out_buf.string_size() == 0 and
col_string_sizes[pass.chunks[c].src_col_index] > 0) {
out_buf.create_string_data(
col_string_sizes[pass.chunks[c].src_col_index],
pass.cumulative_col_string_sizes[pass.chunks[c].src_col_index] >
static_cast<size_t>(strings::detail::get_offset64_threshold()),
_stream);
out_buf.create_string_data(col_string_sizes[pass.chunks[c].src_col_index],
pass.chunks[c].is_large_string_col,
_stream);
}
if (has_strings) { str_data[idx] = out_buf.string_data(); }
out_buf.user_data |=
Expand Down Expand Up @@ -416,11 +400,11 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num
final_offsets.emplace_back(offset);
out_buf.user_data |= PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED;
} else if (out_buf.type.id() == type_id::STRING) {
// need to cap off the string offsets column
auto const sz = static_cast<size_type>(col_string_sizes[idx]);
if (sz <= strings::detail::get_offset64_threshold()) {
// only if it is not a large strings column
if (col_string_sizes[idx] <=
static_cast<size_t>(strings::detail::get_offset64_threshold())) {
out_buffers.emplace_back(static_cast<size_type*>(out_buf.data()) + out_buf.size);
final_offsets.emplace_back(sz);
final_offsets.emplace_back(static_cast<size_type>(col_string_sizes[idx]));
}
}
}
Expand Down
5 changes: 1 addition & 4 deletions cpp/src/io/parquet/reader_impl_chunking.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2023-2024, NVIDIA CORPORATION.
* Copyright (c) 2023-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -130,9 +130,6 @@ struct pass_intermediate_data {
rmm::device_buffer decomp_dict_data{0, cudf::get_default_stream()};
rmm::device_uvector<string_index_pair> str_dict_index{0, cudf::get_default_stream()};

// cumulative strings column sizes.
std::vector<size_t> cumulative_col_string_sizes{};

int level_type_size{0};

// skip_rows / num_rows for this pass.
Expand Down

0 comments on commit 559cda2

Please sign in to comment.