-
Notifications
You must be signed in to change notification settings - Fork 916
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Contributes to #17531 This PR introduces the xxhash_32 hasher to libcudf as a preparatory step for evaluating the impact of replacing murmurhash3_x86_32 with xxhash_32 as the default hash. Authors: - Yunsong Wang (https://github.com/PointKernel) - Bradley Dice (https://github.com/bdice) Approvers: - Bradley Dice (https://github.com/bdice) - David Wendt (https://github.com/davidwendt) - Lawrence Mitchell (https://github.com/wence-) URL: #17533
- Loading branch information
1 parent
f308122
commit caf97ef
Showing
17 changed files
with
473 additions
and
35 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,118 @@ | ||
/* | ||
* Copyright (c) 2025, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#pragma once | ||
|
||
#include <cudf/fixed_point/fixed_point.hpp> | ||
#include <cudf/hashing.hpp> | ||
#include <cudf/hashing/detail/hash_functions.cuh> | ||
#include <cudf/lists/list_view.hpp> | ||
#include <cudf/strings/string_view.cuh> | ||
#include <cudf/structs/struct_view.hpp> | ||
#include <cudf/types.hpp> | ||
|
||
#include <cuco/hash_functions.cuh> | ||
#include <cuda/std/cstddef> | ||
|
||
namespace cudf::hashing::detail { | ||
|
||
template <typename Key> | ||
struct XXHash_32 { | ||
using result_type = std::uint32_t; | ||
|
||
CUDF_HOST_DEVICE constexpr XXHash_32(uint32_t seed = cudf::DEFAULT_HASH_SEED) : _impl{seed} {} | ||
|
||
__device__ constexpr result_type operator()(Key const& key) const { return this->_impl(key); } | ||
|
||
__device__ constexpr result_type compute_bytes(cuda::std::byte const* bytes, | ||
std::uint64_t size) const | ||
{ | ||
return this->_impl.compute_hash(bytes, size); | ||
} | ||
|
||
private: | ||
template <typename T> | ||
__device__ constexpr result_type compute(T const& key) const | ||
{ | ||
return this->compute_bytes(reinterpret_cast<cuda::std::byte const*>(&key), sizeof(T)); | ||
} | ||
|
||
cuco::xxhash_32<Key> _impl; | ||
}; | ||
|
||
template <> | ||
XXHash_32<bool>::result_type __device__ inline XXHash_32<bool>::operator()(bool const& key) const | ||
{ | ||
return this->compute(static_cast<uint8_t>(key)); | ||
} | ||
|
||
template <> | ||
XXHash_32<float>::result_type __device__ inline XXHash_32<float>::operator()(float const& key) const | ||
{ | ||
return this->compute(normalize_nans_and_zeros(key)); | ||
} | ||
|
||
template <> | ||
XXHash_32<double>::result_type __device__ inline XXHash_32<double>::operator()( | ||
double const& key) const | ||
{ | ||
return this->compute(normalize_nans_and_zeros(key)); | ||
} | ||
|
||
template <> | ||
XXHash_32<cudf::string_view>::result_type | ||
__device__ inline XXHash_32<cudf::string_view>::operator()(cudf::string_view const& key) const | ||
{ | ||
return this->compute_bytes(reinterpret_cast<cuda::std::byte const*>(key.data()), | ||
key.size_bytes()); | ||
} | ||
|
||
template <> | ||
XXHash_32<numeric::decimal32>::result_type | ||
__device__ inline XXHash_32<numeric::decimal32>::operator()(numeric::decimal32 const& key) const | ||
{ | ||
return this->compute(key.value()); | ||
} | ||
|
||
template <> | ||
XXHash_32<numeric::decimal64>::result_type | ||
__device__ inline XXHash_32<numeric::decimal64>::operator()(numeric::decimal64 const& key) const | ||
{ | ||
return this->compute(key.value()); | ||
} | ||
|
||
template <> | ||
XXHash_32<numeric::decimal128>::result_type | ||
__device__ inline XXHash_32<numeric::decimal128>::operator()(numeric::decimal128 const& key) const | ||
{ | ||
return this->compute(key.value()); | ||
} | ||
|
||
template <> | ||
XXHash_32<cudf::list_view>::result_type __device__ inline XXHash_32<cudf::list_view>::operator()( | ||
cudf::list_view const& key) const | ||
{ | ||
CUDF_UNREACHABLE("List column hashing is not supported"); | ||
} | ||
|
||
template <> | ||
XXHash_32<cudf::struct_view>::result_type | ||
__device__ inline XXHash_32<cudf::struct_view>::operator()(cudf::struct_view const& key) const | ||
{ | ||
CUDF_UNREACHABLE("Direct hashing of struct_view is not supported"); | ||
} | ||
|
||
} // namespace cudf::hashing::detail |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,136 @@ | ||
/* | ||
* Copyright (c) 2025, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
#include <cudf/column/column_factories.hpp> | ||
#include <cudf/detail/nvtx/ranges.hpp> | ||
#include <cudf/detail/utilities/algorithm.cuh> | ||
#include <cudf/hashing/detail/hashing.hpp> | ||
#include <cudf/hashing/detail/xxhash_32.cuh> | ||
#include <cudf/table/table_device_view.cuh> | ||
#include <cudf/utilities/memory_resource.hpp> | ||
#include <cudf/utilities/span.hpp> | ||
|
||
#include <rmm/cuda_stream_view.hpp> | ||
#include <rmm/exec_policy.hpp> | ||
|
||
#include <cuda/std/limits> | ||
#include <thrust/tabulate.h> | ||
|
||
namespace cudf { | ||
namespace hashing { | ||
namespace detail { | ||
|
||
namespace { | ||
|
||
/** | ||
* @brief Computes the hash value of a row in the given table. | ||
* | ||
* @tparam Nullate A cudf::nullate type describing whether to check for nulls. | ||
*/ | ||
template <typename Nullate> | ||
class device_row_hasher { | ||
public: | ||
device_row_hasher(Nullate nulls, table_device_view const& t, hash_value_type seed) | ||
: _check_nulls(nulls), _table(t), _seed(seed) | ||
{ | ||
} | ||
|
||
__device__ auto operator()(size_type row_index) const noexcept | ||
{ | ||
return cudf::detail::accumulate( | ||
_table.begin(), | ||
_table.end(), | ||
_seed, | ||
[row_index, nulls = _check_nulls] __device__(auto hash, auto column) { | ||
return cudf::type_dispatcher( | ||
column.type(), element_hasher_adapter{}, column, row_index, nulls, hash); | ||
}); | ||
} | ||
|
||
/** | ||
* @brief Computes the hash value of an element in the given column. | ||
*/ | ||
class element_hasher_adapter { | ||
public: | ||
template <typename T, CUDF_ENABLE_IF(column_device_view::has_element_accessor<T>())> | ||
__device__ hash_value_type operator()(column_device_view const& col, | ||
size_type const row_index, | ||
Nullate const _check_nulls, | ||
hash_value_type const _seed) const noexcept | ||
{ | ||
if (_check_nulls && col.is_null(row_index)) { | ||
return cuda::std::numeric_limits<hash_value_type>::max(); | ||
} | ||
auto const hasher = XXHash_32<T>{_seed}; | ||
return hasher(col.element<T>(row_index)); | ||
} | ||
|
||
template <typename T, CUDF_ENABLE_IF(not column_device_view::has_element_accessor<T>())> | ||
__device__ hash_value_type operator()(column_device_view const&, | ||
size_type const, | ||
Nullate const, | ||
hash_value_type const) const noexcept | ||
{ | ||
CUDF_UNREACHABLE("Unsupported type for XXHash_32"); | ||
} | ||
}; | ||
|
||
Nullate const _check_nulls; | ||
table_device_view const _table; | ||
hash_value_type const _seed; | ||
}; | ||
|
||
} // namespace | ||
|
||
std::unique_ptr<column> xxhash_32(table_view const& input, | ||
uint32_t seed, | ||
rmm::cuda_stream_view stream, | ||
rmm::device_async_resource_ref mr) | ||
{ | ||
auto output = make_numeric_column(data_type(type_to_id<hash_value_type>()), | ||
input.num_rows(), | ||
mask_state::UNALLOCATED, | ||
stream, | ||
mr); | ||
|
||
// Return early if there's nothing to hash | ||
if (input.num_columns() == 0 || input.num_rows() == 0) { return output; } | ||
|
||
bool const nullable = has_nulls(input); | ||
auto const input_view = table_device_view::create(input, stream); | ||
auto output_view = output->mutable_view(); | ||
|
||
// Compute the hash value for each row | ||
thrust::tabulate(rmm::exec_policy(stream), | ||
output_view.begin<hash_value_type>(), | ||
output_view.end<hash_value_type>(), | ||
device_row_hasher(nullable, *input_view, seed)); | ||
|
||
return output; | ||
} | ||
|
||
} // namespace detail | ||
|
||
std::unique_ptr<column> xxhash_32(table_view const& input, | ||
uint32_t seed, | ||
rmm::cuda_stream_view stream, | ||
rmm::device_async_resource_ref mr) | ||
{ | ||
CUDF_FUNC_RANGE(); | ||
return detail::xxhash_32(input, seed, stream, mr); | ||
} | ||
|
||
} // namespace hashing | ||
} // namespace cudf |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.