-
Notifications
You must be signed in to change notification settings - Fork 918
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Do not Review] Support hyper log log plus plus(HLL++) #17133
Closed
Closed
Changes from all commits
Commits
Show all changes
11 commits
Select commit
Hold shift + click to select a range
06512e9
Support hyper log log plus plus(HLL++)
abb4cad
Improve: use shared memory
77ea21c
Reduction for hllpp
d3b6066
Improve reduction
57efaff
Refine code; Add comments
57be29e
Adjust configs to get better performance
51ead98
Update code comments; Minor changes
8e0ff01
Format code
884efe0
Use has_nested_nulls; fix compile error
41f4ea2
update xxhash64 for hllpp
59ee807
Merge branch 'branch-25.02' into hll-new
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
46 changes: 46 additions & 0 deletions
46
cpp/include/cudf/detail/hyper_log_log_plus_plus/hyper_log_log_plus_plus.hpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
/* | ||
* Copyright (c) 2021-2024, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#pragma once | ||
|
||
#include <cudf/types.hpp> | ||
#include <cudf/utilities/memory_resource.hpp> | ||
|
||
#include <rmm/cuda_stream_view.hpp> | ||
|
||
namespace cudf { | ||
namespace groupby::detail { | ||
|
||
/** | ||
* @brief Computes the hashes of the input column, then generates a scalar that is a sketch in | ||
* long array format. | ||
* | ||
* NOTE(review): only the declaration is visible here; the parameter notes below are inferred | ||
* from the names and types and should be confirmed against the implementation. | ||
* | ||
* @param input Column whose values are hashed to build the sketch | ||
* @param precision Presumably the HyperLogLog++ precision setting -- TODO confirm valid range | ||
* @param stream CUDA stream view, presumably used for device work -- TODO confirm | ||
* @param mr Device memory resource, presumably used to allocate the result -- TODO confirm | ||
* @return A scalar holding the sketch in long array format | ||
*/ | ||
std::unique_ptr<scalar> reduce_hyper_log_log_plus_plus(column_view const& input, | ||
int64_t const precision, | ||
rmm::cuda_stream_view stream, | ||
rmm::device_async_resource_ref mr); | ||
|
||
/** | ||
* @brief Merges sketches in long array format and computes the estimated distinct count | ||
* (a long value). | ||
* | ||
* The input is a struct column with multiple long columns, which is consistent with Spark. | ||
* | ||
* NOTE(review): only the declaration is visible here; the parameter notes below are inferred | ||
* from the names and types and should be confirmed against the implementation. | ||
* | ||
* @param input Struct column of long columns holding the sketches to merge | ||
* @param precision Presumably the HyperLogLog++ precision setting -- TODO confirm valid range | ||
* @param stream CUDA stream view, presumably used for device work -- TODO confirm | ||
* @param mr Device memory resource, presumably used to allocate the result -- TODO confirm | ||
* @return A scalar holding the estimated distinct value | ||
*/ | ||
std::unique_ptr<scalar> reduce_merge_hyper_log_log_plus_plus(column_view const& input, | ||
int64_t const precision, | ||
rmm::cuda_stream_view stream, | ||
rmm::device_async_resource_ref mr); | ||
|
||
} // namespace groupby::detail | ||
} // namespace cudf |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
/* | ||
* Copyright (c) 2023-2024, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
#include <cudf/column/column_factories.hpp> | ||
#include <cudf/detail/nvtx/ranges.hpp> | ||
#include <cudf/detail/utilities/algorithm.cuh> | ||
#include <cudf/hashing/detail/hashing.hpp> | ||
#include <cudf/hashing/detail/xxhash_64.cuh> | ||
#include <cudf/table/table_device_view.cuh> | ||
#include <cudf/utilities/memory_resource.hpp> | ||
#include <cudf/utilities/span.hpp> | ||
|
||
#include <rmm/cuda_stream_view.hpp> | ||
#include <rmm/exec_policy.hpp> | ||
|
||
#include <thrust/tabulate.h> | ||
|
||
/** | ||
* This file is for HyperLogLogPlusPlus; the hasher returns the seed when the input is null. | ||
* This is a temporary file. TODO: use xxhash_64 in the JNI repo to handle NaN and Inf like Spark does. | ||
*/ | ||
namespace cudf::hashing::detail { | ||
|
||
// 64-bit unsigned type used for both the seed and the resulting hash values in this file. | ||
using hash_value_type = uint64_t; | ||
|
||
/** | ||
* @brief Computes the hash value of a row in the given table. | ||
* | ||
* The row hash is built by passing a running hash through every column of the table, starting | ||
* from the seed given at construction. For each column, the running hash is used as the seed | ||
* of that column's element hash. When null checking is enabled and the element is null, the | ||
* running hash is returned unchanged. | ||
* | ||
* @tparam Nullate A cudf::nullate type describing whether to check for nulls. | ||
*/ | ||
template <typename Nullate> | ||
class xxhash_64_hllpp_row_hasher { | ||
public: | ||
/** | ||
* @brief Stores the null-check policy, the table view and the initial seed. | ||
* | ||
* @param nulls Null-check policy; evaluated as a boolean before each element is read | ||
* @param t Device view of the table whose rows are hashed | ||
* @param seed Initial hash value for every row | ||
*/ | ||
xxhash_64_hllpp_row_hasher(Nullate nulls, table_device_view const& t, hash_value_type seed) | ||
: _check_nulls(nulls), _table(t), _seed(seed) | ||
{ | ||
} | ||
|
||
/** | ||
* @brief Hashes one row by accumulating over all columns of the table. | ||
* | ||
* @param row_index Index of the row to hash | ||
* @return The accumulated hash of the row | ||
*/ | ||
__device__ auto operator()(size_type row_index) const noexcept | ||
{ | ||
// NOTE(review): cudf::detail::accumulate is presumably a left fold like std::accumulate, | ||
// so `hash` is the result for the previous columns and `column` is the current one. | ||
return cudf::detail::accumulate( | ||
_table.begin(), | ||
_table.end(), | ||
_seed, | ||
[row_index, nulls = _check_nulls] __device__(auto hash, auto column) { | ||
// Dispatch on the column's type; the running hash is passed as the element seed. | ||
return cudf::type_dispatcher( | ||
column.type(), element_hasher_adapter{}, column, row_index, nulls, hash); | ||
}); | ||
} | ||
|
||
/** | ||
* @brief Computes the hash value of an element in the given column. | ||
* | ||
* The two call operators are constrained on column_device_view::has_element_accessor<T>(): | ||
* the first hashes the element, the second marks the type as unsupported. | ||
*/ | ||
class element_hasher_adapter { | ||
public: | ||
/** | ||
* @brief Hashes the element at `row_index` using XXHash_64 seeded with `_seed`. | ||
* | ||
* @param col Column containing the element | ||
* @param row_index Index of the element within `col` | ||
* @param _check_nulls Null-check policy; when true, a null element is not hashed | ||
* @param _seed Seed for the hash; returned as-is for a null element | ||
* @return The element hash, or `_seed` when the element is null and nulls are checked | ||
*/ | ||
template <typename T, CUDF_ENABLE_IF(column_device_view::has_element_accessor<T>())> | ||
__device__ hash_value_type operator()(column_device_view const& col, | ||
size_type const row_index, | ||
Nullate const _check_nulls, | ||
hash_value_type const _seed) const noexcept | ||
{ | ||
// A null element leaves the incoming hash untouched. | ||
if (_check_nulls && col.is_null(row_index)) { return _seed; } | ||
auto const hasher = XXHash_64<T>{_seed}; | ||
return hasher(col.element<T>(row_index)); | ||
} | ||
|
||
/** | ||
* @brief Overload for types without an element accessor; marked unreachable. | ||
*/ | ||
template <typename T, CUDF_ENABLE_IF(not column_device_view::has_element_accessor<T>())> | ||
__device__ hash_value_type operator()(column_device_view const&, | ||
size_type const, | ||
Nullate const, | ||
hash_value_type const) const noexcept | ||
{ | ||
CUDF_UNREACHABLE("Unsupported type for XXHash_64"); | ||
} | ||
}; | ||
|
||
// Null-check policy captured by the per-column lambda in operator(). | ||
Nullate const _check_nulls; | ||
// Device view of the table whose columns are iterated for each row. | ||
table_device_view const _table; | ||
// Initial value of the running hash for every row. | ||
hash_value_type const _seed; | ||
}; | ||
|
||
} // namespace cudf::hashing::detail |
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit:
hashs
-->hashes