Skip to content

Commit

Permalink
Add XXHash_32 hasher (#17533)
Browse files Browse the repository at this point in the history
Contributes to #17531

This PR introduces the xxhash_32 hasher to libcudf as a preparatory step for evaluating the impact of replacing murmurhash3_x86_32 with xxhash_32 as the default hash.

Authors:
  - Yunsong Wang (https://github.com/PointKernel)
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - David Wendt (https://github.com/davidwendt)
  - Lawrence Mitchell (https://github.com/wence-)

URL: #17533
  • Loading branch information
PointKernel authored Jan 7, 2025
1 parent f308122 commit caf97ef
Show file tree
Hide file tree
Showing 17 changed files with 473 additions and 35 deletions.
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -461,6 +461,7 @@ add_library(
src/hash/sha256_hash.cu
src/hash/sha384_hash.cu
src/hash/sha512_hash.cu
src/hash/xxhash_32.cu
src/hash/xxhash_64.cu
src/interop/dlpack.cpp
src/interop/arrow_utilities.cpp
Expand Down
22 changes: 21 additions & 1 deletion cpp/include/cudf/hashing.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
* Copyright (c) 2019-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -166,6 +166,26 @@ std::unique_ptr<column> sha512(
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief Computes the XXHash_32 hash value of each row in the given table
*
* This function computes the hash of each column using the `seed` for the first column
* and the resulting hash as a seed for the next column and so on.
* The result is a uint32 value for each row.
*
* @param input The table of columns to hash
* @param seed Optional seed value to use for the hash function
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
*
* @returns A column where each row is the hash of a row from the input
*/
std::unique_ptr<column> xxhash_32(
table_view const& input,
uint32_t seed = DEFAULT_HASH_SEED,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief Computes the XXHash_64 hash value of each row in the given table
*
Expand Down
7 changes: 6 additions & 1 deletion cpp/include/cudf/hashing/detail/hashing.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
* Copyright (c) 2019-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -61,6 +61,11 @@ std::unique_ptr<column> sha512(table_view const& input,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);

std::unique_ptr<column> xxhash_32(table_view const& input,
uint64_t seed,
rmm::cuda_stream_view,
rmm::device_async_resource_ref mr);

std::unique_ptr<column> xxhash_64(table_view const& input,
uint64_t seed,
rmm::cuda_stream_view,
Expand Down
118 changes: 118 additions & 0 deletions cpp/include/cudf/hashing/detail/xxhash_32.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cudf/fixed_point/fixed_point.hpp>
#include <cudf/hashing.hpp>
#include <cudf/hashing/detail/hash_functions.cuh>
#include <cudf/lists/list_view.hpp>
#include <cudf/strings/string_view.cuh>
#include <cudf/structs/struct_view.hpp>
#include <cudf/types.hpp>

#include <cuco/hash_functions.cuh>
#include <cuda/std/cstddef>

namespace cudf::hashing::detail {

template <typename Key>
struct XXHash_32 {
using result_type = std::uint32_t;

CUDF_HOST_DEVICE constexpr XXHash_32(uint32_t seed = cudf::DEFAULT_HASH_SEED) : _impl{seed} {}

__device__ constexpr result_type operator()(Key const& key) const { return this->_impl(key); }

__device__ constexpr result_type compute_bytes(cuda::std::byte const* bytes,
std::uint64_t size) const
{
return this->_impl.compute_hash(bytes, size);
}

private:
template <typename T>
__device__ constexpr result_type compute(T const& key) const
{
return this->compute_bytes(reinterpret_cast<cuda::std::byte const*>(&key), sizeof(T));
}

cuco::xxhash_32<Key> _impl;
};

template <>
XXHash_32<bool>::result_type __device__ inline XXHash_32<bool>::operator()(bool const& key) const
{
return this->compute(static_cast<uint8_t>(key));
}

template <>
XXHash_32<float>::result_type __device__ inline XXHash_32<float>::operator()(float const& key) const
{
return this->compute(normalize_nans_and_zeros(key));
}

template <>
XXHash_32<double>::result_type __device__ inline XXHash_32<double>::operator()(
double const& key) const
{
return this->compute(normalize_nans_and_zeros(key));
}

template <>
XXHash_32<cudf::string_view>::result_type
__device__ inline XXHash_32<cudf::string_view>::operator()(cudf::string_view const& key) const
{
return this->compute_bytes(reinterpret_cast<cuda::std::byte const*>(key.data()),
key.size_bytes());
}

template <>
XXHash_32<numeric::decimal32>::result_type
__device__ inline XXHash_32<numeric::decimal32>::operator()(numeric::decimal32 const& key) const
{
return this->compute(key.value());
}

template <>
XXHash_32<numeric::decimal64>::result_type
__device__ inline XXHash_32<numeric::decimal64>::operator()(numeric::decimal64 const& key) const
{
return this->compute(key.value());
}

template <>
XXHash_32<numeric::decimal128>::result_type
__device__ inline XXHash_32<numeric::decimal128>::operator()(numeric::decimal128 const& key) const
{
return this->compute(key.value());
}

template <>
XXHash_32<cudf::list_view>::result_type __device__ inline XXHash_32<cudf::list_view>::operator()(
cudf::list_view const& key) const
{
CUDF_UNREACHABLE("List column hashing is not supported");
}

template <>
XXHash_32<cudf::struct_view>::result_type
__device__ inline XXHash_32<cudf::struct_view>::operator()(cudf::struct_view const& key) const
{
CUDF_UNREACHABLE("Direct hashing of struct_view is not supported");
}

} // namespace cudf::hashing::detail
136 changes: 136 additions & 0 deletions cpp/src/hash/xxhash_32.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/utilities/algorithm.cuh>
#include <cudf/hashing/detail/hashing.hpp>
#include <cudf/hashing/detail/xxhash_32.cuh>
#include <cudf/table/table_device_view.cuh>
#include <cudf/utilities/memory_resource.hpp>
#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>

#include <cuda/std/limits>
#include <thrust/tabulate.h>

namespace cudf {
namespace hashing {
namespace detail {

namespace {

/**
* @brief Computes the hash value of a row in the given table.
*
* @tparam Nullate A cudf::nullate type describing whether to check for nulls.
*/
template <typename Nullate>
class device_row_hasher {
public:
device_row_hasher(Nullate nulls, table_device_view const& t, hash_value_type seed)
: _check_nulls(nulls), _table(t), _seed(seed)
{
}

__device__ auto operator()(size_type row_index) const noexcept
{
return cudf::detail::accumulate(
_table.begin(),
_table.end(),
_seed,
[row_index, nulls = _check_nulls] __device__(auto hash, auto column) {
return cudf::type_dispatcher(
column.type(), element_hasher_adapter{}, column, row_index, nulls, hash);
});
}

/**
* @brief Computes the hash value of an element in the given column.
*/
class element_hasher_adapter {
public:
template <typename T, CUDF_ENABLE_IF(column_device_view::has_element_accessor<T>())>
__device__ hash_value_type operator()(column_device_view const& col,
size_type const row_index,
Nullate const _check_nulls,
hash_value_type const _seed) const noexcept
{
if (_check_nulls && col.is_null(row_index)) {
return cuda::std::numeric_limits<hash_value_type>::max();
}
auto const hasher = XXHash_32<T>{_seed};
return hasher(col.element<T>(row_index));
}

template <typename T, CUDF_ENABLE_IF(not column_device_view::has_element_accessor<T>())>
__device__ hash_value_type operator()(column_device_view const&,
size_type const,
Nullate const,
hash_value_type const) const noexcept
{
CUDF_UNREACHABLE("Unsupported type for XXHash_32");
}
};

Nullate const _check_nulls;
table_device_view const _table;
hash_value_type const _seed;
};

} // namespace

std::unique_ptr<column> xxhash_32(table_view const& input,
uint32_t seed,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
auto output = make_numeric_column(data_type(type_to_id<hash_value_type>()),
input.num_rows(),
mask_state::UNALLOCATED,
stream,
mr);

// Return early if there's nothing to hash
if (input.num_columns() == 0 || input.num_rows() == 0) { return output; }

bool const nullable = has_nulls(input);
auto const input_view = table_device_view::create(input, stream);
auto output_view = output->mutable_view();

// Compute the hash value for each row
thrust::tabulate(rmm::exec_policy(stream),
output_view.begin<hash_value_type>(),
output_view.end<hash_value_type>(),
device_row_hasher(nullable, *input_view, seed));

return output;
}

} // namespace detail

std::unique_ptr<column> xxhash_32(table_view const& input,
uint32_t seed,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
return detail::xxhash_32(input, seed, stream, mr);
}

} // namespace hashing
} // namespace cudf
3 changes: 2 additions & 1 deletion cpp/src/io/orc/dict_enc.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
* Copyright (c) 2019-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -18,6 +18,7 @@

#include <cudf/detail/offsets_iterator.cuh>
#include <cudf/detail/utilities/integer_utils.hpp>
#include <cudf/hashing/detail/murmurhash3_x86_32.cuh>
#include <cudf/io/orc_types.hpp>
#include <cudf/table/experimental/row_operators.cuh>

Expand Down
3 changes: 2 additions & 1 deletion cpp/src/io/parquet/chunk_dict.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
* Copyright (c) 2021-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -18,6 +18,7 @@

#include <cudf/detail/iterator.cuh>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/hashing/detail/murmurhash3_x86_32.cuh>
#include <cudf/table/experimental/row_operators.cuh>

#include <rmm/exec_policy.hpp>
Expand Down
3 changes: 2 additions & 1 deletion cpp/src/join/join_common_utils.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
* Copyright (c) 2021-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -20,6 +20,7 @@
#include <cudf/detail/iterator.cuh>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/hashing/detail/murmurhash3_x86_32.cuh>
#include <cudf/table/experimental/row_operators.cuh>
#include <cudf/utilities/memory_resource.hpp>

Expand Down
3 changes: 2 additions & 1 deletion cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# =============================================================================
# Copyright (c) 2018-2024, NVIDIA CORPORATION.
# Copyright (c) 2018-2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
Expand Down Expand Up @@ -192,6 +192,7 @@ ConfigureTest(
hashing/sha256_test.cpp
hashing/sha384_test.cpp
hashing/sha512_test.cpp
hashing/xxhash_32_test.cpp
hashing/xxhash_64_test.cpp
)

Expand Down
Loading

0 comments on commit caf97ef

Please sign in to comment.