Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize Hive hash computation for nested types #2720

Open
wants to merge 44 commits into
base: branch-25.02
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
92921f1
Optimize Hive hash computation for nested types
ustcfy Dec 23, 2024
a77dff5
Fix minor details
ustcfy Dec 23, 2024
97b9644
Fix minor details
ustcfy Dec 23, 2024
1857bdd
Fix minor details
ustcfy Dec 23, 2024
ad84406
Fix minor details
ustcfy Dec 23, 2024
a3fe1c7
Minor update
ustcfy Dec 24, 2024
9b43b03
Fix: Resolve string bug
ustcfy Dec 24, 2024
71f7df9
Update src/main/cpp/src/hive_hash.cu
ustcfy Dec 25, 2024
9408c5d
Update src/main/cpp/src/hive_hash.cu
ustcfy Dec 30, 2024
33f78ba
Fix minor details
ustcfy Dec 30, 2024
c926fd1
Code format
ustcfy Dec 30, 2024
b5f8fe9
Update col_stack_frame array elements in place
ustcfy Dec 30, 2024
3e0e81a
Update src/main/cpp/src/hive_hash.cu
ustcfy Dec 30, 2024
9c64ebe
Update src/main/cpp/src/hive_hash.cu
ustcfy Dec 30, 2024
5411808
Update benchmark
ustcfy Dec 30, 2024
e33eca1
Update src/main/cpp/src/hive_hash.cu
ustcfy Dec 31, 2024
6200bc2
Update src/main/cpp/benchmarks/hash.cu
ustcfy Dec 31, 2024
0784d7a
Update src/main/cpp/benchmarks/hash.cu
ustcfy Dec 31, 2024
786a2d9
Update src/main/cpp/src/hive_hash.cu
ustcfy Dec 31, 2024
f9f037e
Update src/main/cpp/src/hive_hash.cu
ustcfy Dec 31, 2024
d9dbf2f
Remove unused timer in benchmark
ustcfy Dec 31, 2024
d49ac7c
Update src/main/cpp/src/hive_hash.cu
ustcfy Dec 31, 2024
45b6922
Update src/main/cpp/src/hive_hash.cu
ustcfy Dec 31, 2024
f0a3f7e
Address some comments
ustcfy Dec 31, 2024
a6ba2d3
Address some comments
ustcfy Dec 31, 2024
6f8c1f6
Use pre-order traversal
ustcfy Jan 2, 2025
eebf0f6
Update license
ustcfy Jan 2, 2025
1acea7d
Update src/main/cpp/src/hive_hash.cu
ustcfy Jan 3, 2025
244215f
Update src/main/cpp/src/hive_hash.cu
ttnghia Jan 3, 2025
51c194e
Update src/main/cpp/src/hive_hash.cu
ustcfy Jan 3, 2025
f3084c9
Update src/main/cpp/src/hive_hash.cu
ustcfy Jan 3, 2025
6499308
Code format
ustcfy Jan 3, 2025
b785cc9
Update src/main/cpp/src/hive_hash.cu
ustcfy Jan 6, 2025
6c73126
Address some comments
ustcfy Jan 6, 2025
47b6d57
Remove class element_hasher_adapter
ustcfy Jan 7, 2025
7cc37c6
Update src/main/cpp/src/hive_hash.cu
ustcfy Jan 7, 2025
9688edd
Update src/main/cpp/src/hive_hash.cu
ustcfy Jan 7, 2025
25fd602
Add test for corner cases
ustcfy Jan 7, 2025
73aef29
Update license
ustcfy Jan 7, 2025
57854b1
Add test for corner cases
ustcfy Jan 7, 2025
5f50d70
Fix corner case when struct's num_children or list's num_element is 0
ustcfy Jan 8, 2025
ef50293
Make the code more readable
ustcfy Jan 9, 2025
90426b6
Add constructor for col_info
ustcfy Jan 14, 2025
b8744cd
Combine check_nested_depth and flatten_table
ustcfy Jan 14, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion src/main/cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#=============================================================================
# Copyright (c) 2022-2024, NVIDIA CORPORATION.
# Copyright (c) 2022-2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -81,5 +81,8 @@ ConfigureBench(BLOOM_FILTER_BENCH
ConfigureBench(GET_JSON_OBJECT_BENCH
get_json_object.cu)

ConfigureBench(HASH_BENCH
hash.cu)

ConfigureBench(PARSE_URI_BENCH
parse_uri.cpp)
69 changes: 69 additions & 0 deletions src/main/cpp/benchmarks/hash.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
/*
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>

#include <cudf_test/column_utilities.hpp>

#include <cudf/io/types.hpp>

#include <hash.hpp>
#include <nvbench/nvbench.cuh>

// Lower and upper bounds handed to the NORMAL distributions that `hash()` configures for
// STRING and LIST columns in its list-benchmark data profile. Both are currently 10.
// NOTE(review): presumably these bound string lengths / list sizes -- confirm against
// the `data_profile_builder::distribution` contract.
constexpr auto min_width = 10;
constexpr auto max_width = 10;
ustcfy marked this conversation as resolved.
Show resolved Hide resolved

/**
 * @brief Benchmarks `spark_rapids_jni::hive_hash` on a randomly generated nested table.
 *
 * Reads the `size_bytes` and `max_depth` axes from `state`, generates a random table of
 * roughly `size_bytes` bytes whose column type is STRUCT (or LIST when `bench_structs`
 * is switched to false), and measures hashing that table.
 *
 * @param state nvbench state that supplies the axis values and collects the measurements
 */
static void hash(nvbench::state& state)
{
  auto const size_bytes = static_cast<std::size_t>(state.get_int64("size_bytes"));
  auto const max_depth  = static_cast<cudf::size_type>(state.get_int64("max_depth"));

  // Manual toggle: true benchmarks nested structs, false benchmarks nested lists of strings.
  auto const bench_structs = true;
  auto const input_table   = [&] {
    if (bench_structs) {
      // Structs nested `max_depth` deep with INT32, FLOAT32 and STRING members, no nulls.
      data_profile const table_profile =
        data_profile_builder().no_validity().struct_depth(max_depth).struct_types(
          std::vector<cudf::type_id>{
            cudf::type_id::INT32, cudf::type_id::FLOAT32, cudf::type_id::STRING});
      return create_random_table(
        {cudf::type_id::STRUCT}, table_size_bytes{size_bytes}, table_profile);
    } else {
      // Lists nested `max_depth` deep with STRING leaves, no nulls.
      data_profile const table_profile =
        data_profile_builder()
          .no_validity()
          .distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width)
          .distribution(cudf::type_id::LIST, distribution_id::NORMAL, min_width, max_width)
          .list_depth(max_depth)
          .list_type(cudf::type_id::STRING);
      return create_random_table(
        {cudf::type_id::LIST}, table_size_bytes{size_bytes}, table_profile);
    }
  }();

  auto const stream = cudf::get_default_stream();
  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));

  // Register the amount of data read BEFORE running the measurement: nvbench builds its
  // throughput summary as part of `exec`, so a byte count added afterwards would not be
  // reflected in the reported results.
  state.add_global_memory_reads<nvbench::int8_t>(size_bytes);

  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) {
    // `hive_hash` can be substituted with other hash functions
    spark_rapids_jni::hive_hash(*input_table);
  });
}

// Registers `hash` as an nvbench benchmark named "hash" with two int64 axes:
// `size_bytes` (requested size of the generated input table) and `max_depth`
// (nesting depth passed to the struct/list data profile). Both are read back in
// `hash()` through `state.get_int64`.
NVBENCH_BENCH(hash)
.set_name("hash")
.add_int64_axis("size_bytes",
{50'000'000, 100'000'000, 500'000'000, 1'000'000'000}) // 50MB, 100MB, 500MB, 1GB
.add_int64_axis("max_depth", {1, 2, 4, 8});
Loading
Loading