Run get_json_object_multiple_paths with thread-parallel kernel based on number of paths #2256

Closed
32 changes: 12 additions & 20 deletions src/main/cpp/src/get_json_object.cu
@@ -908,8 +908,8 @@ void launch_kernel(bool exec_thread_parallel,
   // the performance is really bad. This essentially tells NVCC to prefer using lots
   // of registers over spilling.
   if (exec_thread_parallel) {
-    constexpr int block_size = 256;
-    constexpr int min_block_per_sm = 1;
+    constexpr int block_size = 128;
+    constexpr int min_block_per_sm = 8;
     auto const num_blocks =
       cudf::util::div_rounding_up_safe(static_cast<std::size_t>(input.size()) * path_data.size(),
                                        static_cast<std::size_t>(block_size));
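
For reference on the grid sizing above: the thread-parallel kernel assigns one thread per (row, path) work item, so the grid must cover `input.size() * path_data.size()` elements. Below is a minimal standalone sketch of that launch configuration; the kernel name, the `rows`/`paths` values, and the plain ceil-divide are illustrative stand-ins (the real code uses `cudf::util::div_rounding_up_safe` and launches on the caller's stream).

```cpp
#include <cuda_runtime.h>

#include <cstddef>
#include <cstdio>

// Same launch configuration as the updated kernel above.
constexpr int block_size       = 128;
constexpr int min_block_per_sm = 8;

// One thread per (row, path) work item; the body is a placeholder.
// __launch_bounds__(block_size, min_block_per_sm) gives NVCC a register budget
// sized so that min_block_per_sm blocks of block_size threads fit on one SM.
__global__ void __launch_bounds__(block_size, min_block_per_sm)
  thread_parallel_sketch(std::size_t num_items)
{
  auto const tid = static_cast<std::size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (tid >= num_items) { return; }
  // ... map tid back to a (row, path) pair and evaluate that path on that row ...
}

int main()
{
  std::size_t const rows = 1'000'000, paths = 10;

  // The grid must cover rows * paths items; a plain ceil-divide stands in for
  // cudf::util::div_rounding_up_safe.
  auto const num_items  = rows * paths;
  auto const num_blocks = (num_items + block_size - 1) / block_size;
  thread_parallel_sketch<<<static_cast<unsigned int>(num_blocks), block_size>>>(num_items);
  cudaDeviceSynchronize();
  std::printf("launched %zu blocks of %d threads\n", num_blocks, block_size);
  return 0;
}
```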
@@ -1027,21 +1027,14 @@ std::vector<std::unique_ptr<cudf::column>> get_json_object(
   auto const [d_json_paths, h_json_paths, d_inst_names, h_inst_names] =
     construct_path_commands(json_paths, stream);

-  auto const [max_row_size, sum_row_size] =
-    thrust::transform_reduce(rmm::exec_policy(stream),
-      thrust::make_counting_iterator(0),
-      thrust::make_counting_iterator(input.size()),
-      cuda::proclaim_return_type<thrust::pair<int64_t, int64_t>>(
-        [in_offsets] __device__(auto const idx) {
-          auto const size = in_offsets[idx + 1] - in_offsets[idx];
-          return thrust::pair<int64_t, int64_t>{size, size};
-        }),
-      thrust::pair<int64_t, int64_t>{0, 0},
-      cuda::proclaim_return_type<thrust::pair<int64_t, int64_t>>(
-        [] __device__(auto const& lhs, auto const& rhs) {
-          return thrust::pair<int64_t, int64_t>{
-            std::max(lhs.first, rhs.first), lhs.second + rhs.second};
-        }));
+  auto const max_row_size = thrust::transform_reduce(
+    rmm::exec_policy(stream),
+    thrust::make_counting_iterator(0),
+    thrust::make_counting_iterator(input.size()),
+    cuda::proclaim_return_type<int64_t>(
+      [in_offsets] __device__(auto const idx) { return in_offsets[idx + 1] - in_offsets[idx]; }),
+    int64_t{0},
+    thrust::maximum{});

   // We will use scratch buffers to store the output strings without knowing their sizes.
   // Since we do not know their sizes, we need to allocate the buffer a bit larger than the input
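
As a self-contained illustration of the reduction above: each row's byte length is computed from adjacent offsets, and the lengths are reduced with max. The example offsets, the `thrust::device` policy, and the explicit `thrust::maximum<int64_t>` are stand-ins for this sketch; the real code reads the strings column's device offsets and runs on the stream via `rmm::exec_policy(stream)`.

```cpp
// Compile with: nvcc --extended-lambda (device lambdas in host code need the flag).
#include <cuda/functional>  // cuda::proclaim_return_type
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/transform_reduce.h>

#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
  // Offsets of 4 example rows with byte sizes 5, 0, 42, 7.
  std::vector<int64_t> h_offsets{0, 5, 5, 47, 54};
  thrust::device_vector<int64_t> offsets(h_offsets.begin(), h_offsets.end());
  auto const* in_offsets = thrust::raw_pointer_cast(offsets.data());
  auto const num_rows    = static_cast<int>(offsets.size()) - 1;

  // Transform each row index into its byte length, then reduce with max.
  auto const max_row_size = thrust::transform_reduce(
    thrust::device,  // the real code uses rmm::exec_policy(stream) instead
    thrust::make_counting_iterator(0),
    thrust::make_counting_iterator(num_rows),
    cuda::proclaim_return_type<int64_t>(
      [in_offsets] __device__(int idx) { return in_offsets[idx + 1] - in_offsets[idx]; }),
    int64_t{0},
    thrust::maximum<int64_t>{});

  std::printf("max row size = %lld bytes\n", static_cast<long long>(max_row_size));
  return 0;
}
```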
@@ -1082,9 +1075,8 @@ std::vector<std::unique_ptr<cudf::column>> get_json_object(
     rmm::exec_policy(stream), d_has_out_of_bound.begin(), d_has_out_of_bound.end(), 0);

   // Threshold to decide on using thread parallel or warp parallel algorithms.
-  constexpr int64_t AVG_CHAR_BYTES_THRESHOLD = 256;
-  auto const exec_thread_parallel =
-    (sum_row_size / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD;
+  constexpr std::size_t PATH_SIZE_THRESHOLD = 8;
+  auto const exec_thread_parallel = json_paths.size() >= PATH_SIZE_THRESHOLD;
   launch_kernel(exec_thread_parallel, *d_input_ptr, d_path_data, stream);

   // Do not use parallel check since we do not have many elements.
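
The dispatch change in isolation, as a sketch: the selection now depends only on the number of JSON paths rather than on average row size, presumably because many paths multiply the number of (row, path) work items. The launcher functions below are placeholders; only the `PATH_SIZE_THRESHOLD` comparison mirrors the code above.

```cpp
#include <cstddef>
#include <cstdio>

// Placeholder stand-ins for the two launch strategies in get_json_object.cu.
void launch_thread_parallel(std::size_t rows, std::size_t paths)
{
  std::printf("thread-parallel kernel: %zu rows x %zu paths\n", rows, paths);
}
void launch_warp_parallel(std::size_t rows, std::size_t paths)
{
  std::printf("warp-parallel kernel: %zu rows, %zu paths\n", rows, paths);
}

// Dispatch mirroring the updated heuristic: pick the thread-parallel kernel
// once the number of JSON paths reaches the threshold.
void launch(std::size_t rows, std::size_t num_paths)
{
  constexpr std::size_t PATH_SIZE_THRESHOLD = 8;
  bool const exec_thread_parallel = num_paths >= PATH_SIZE_THRESHOLD;
  if (exec_thread_parallel) {
    launch_thread_parallel(rows, num_paths);
  } else {
    launch_warp_parallel(rows, num_paths);
  }
}

int main()
{
  launch(1'000'000, 3);   // few paths  -> warp-parallel kernel
  launch(1'000'000, 12);  // many paths -> thread-parallel kernel
  return 0;
}
```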