Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support reading bloom filters from Parquet files and filter row groups using them #17289

Open
wants to merge 102 commits into
base: branch-25.02
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 87 commits
Commits
Show all changes
102 commits
Select commit Hold shift + click to select a range
95fe8e8
Initial stuff for reading bloom filter from PQ files
mhaseeb123 Nov 9, 2024
4f0e7ab
Minor bug fix
mhaseeb123 Nov 9, 2024
48a50c4
Apply style fix
mhaseeb123 Nov 9, 2024
9a85d08
Merge branch 'branch-24.12' into fea/extract-pq-bloom-filter-data
mhaseeb123 Nov 14, 2024
b71cf9b
Merge branch 'branch-24.12' into fea/extract-pq-bloom-filter-data
mhaseeb123 Nov 15, 2024
68be24f
Some updates
mhaseeb123 Nov 16, 2024
f848251
Move contents to a separate file
mhaseeb123 Nov 16, 2024
0b65233
Revert erroneous changes
mhaseeb123 Nov 16, 2024
cf7d762
Style and doc fix
mhaseeb123 Nov 16, 2024
81efad2
Get equality predicate col indices
mhaseeb123 Nov 19, 2024
088377b
Enable `arrow_filter_policy` and `span` types in bloom filter.
mhaseeb123 Nov 20, 2024
0435bff
Merge branch 'branch-24.12' into fea/extract-pq-bloom-filter-data
mhaseeb123 Nov 20, 2024
3dff590
Successfully search bloom filter
mhaseeb123 Nov 21, 2024
71e1d33
style fix
mhaseeb123 Nov 21, 2024
aa65a2b
Code cleanup
mhaseeb123 Nov 22, 2024
c52821b
add tests
mhaseeb123 Nov 25, 2024
3a20a98
Initial stuff for reading bloom filter from PQ files
mhaseeb123 Nov 9, 2024
d67e4b5
Minor bug fix
mhaseeb123 Nov 9, 2024
10471d4
Apply style fix
mhaseeb123 Nov 9, 2024
1e12662
Some updates
mhaseeb123 Nov 16, 2024
ee7217c
Move contents to a separate file
mhaseeb123 Nov 16, 2024
f8e6159
Revert erroneous changes
mhaseeb123 Nov 16, 2024
1886cab
Style and doc fix
mhaseeb123 Nov 16, 2024
be228b3
Get equality predicate col indices
mhaseeb123 Nov 19, 2024
aaf355e
Enable `arrow_filter_policy` and `span` types in bloom filter.
mhaseeb123 Nov 20, 2024
e92324e
Successfully search bloom filter
mhaseeb123 Nov 21, 2024
0b1719d
style fix
mhaseeb123 Nov 21, 2024
ef3a262
Code cleanup
mhaseeb123 Nov 22, 2024
051be2d
add tests
mhaseeb123 Nov 25, 2024
a12c90e
Merge branch 'fea/extract-pq-bloom-filter-data' of https://github.com…
mhaseeb123 Nov 25, 2024
fb55c3f
Major cleanups
mhaseeb123 Nov 26, 2024
b477d2d
Significant code refactoring
mhaseeb123 Nov 26, 2024
f9f1746
minor style fix
mhaseeb123 Nov 26, 2024
bad484f
refactoring
mhaseeb123 Nov 26, 2024
ce09d43
Minor refactoring
mhaseeb123 Nov 26, 2024
dddee6c
Minor improvements
mhaseeb123 Nov 26, 2024
0cfeb80
Add gtest
mhaseeb123 Nov 26, 2024
9137585
Improvements
mhaseeb123 Nov 26, 2024
77152b4
Support int96 in bloom filter
mhaseeb123 Nov 27, 2024
3984291
Cleanup
mhaseeb123 Nov 27, 2024
9a39aa4
Minor improvements
mhaseeb123 Nov 27, 2024
1def801
Fix minor bug
mhaseeb123 Nov 27, 2024
6edc248
MInor bug fixing
mhaseeb123 Nov 28, 2024
2925f1e
Add python tests
mhaseeb123 Nov 28, 2024
efc6ec0
Correct parquet files
mhaseeb123 Nov 28, 2024
df84aca
Merge branch 'branch-25.02' into fea/extract-pq-bloom-filter-data
mhaseeb123 Nov 28, 2024
a2fa784
minor spelling fix
mhaseeb123 Dec 2, 2024
1f5da37
Merge branch 'branch-25.02' into fea/extract-pq-bloom-filter-data
mhaseeb123 Dec 2, 2024
fa0cec8
Apply suggestions from code review
mhaseeb123 Dec 2, 2024
7a309c6
Minor bug fix
mhaseeb123 Dec 2, 2024
bcc68c0
Convert to enum class
mhaseeb123 Dec 2, 2024
2dce9b1
Apply suggestion from code review
mhaseeb123 Dec 3, 2024
e03bea0
Suggestions from code reviews
mhaseeb123 Dec 3, 2024
059a9d8
Merge branch 'branch-25.02' into fea/extract-pq-bloom-filter-data
mhaseeb123 Dec 3, 2024
4b0b5ed
Apply suggestions from code reviews
mhaseeb123 Dec 4, 2024
c1256b1
Refactor into single table for cudf::compute_column
mhaseeb123 Dec 4, 2024
88bf491
Minor, add const
mhaseeb123 Dec 4, 2024
9ca42c6
Move bloom filter test to parquet test
mhaseeb123 Dec 4, 2024
84c24c1
Minor updates
mhaseeb123 Dec 4, 2024
0c05031
Minor
mhaseeb123 Dec 4, 2024
09560c5
Logical and between bloom filter and stats
mhaseeb123 Dec 4, 2024
21f4412
Revert merging converted AST tables.
mhaseeb123 Dec 4, 2024
442de80
Revert an extra eol
mhaseeb123 Dec 4, 2024
f7952d4
Revert extra eol
mhaseeb123 Dec 4, 2024
4d0c570
Read bloom filter data sync
mhaseeb123 Dec 4, 2024
67c6247
Update cpp/src/io/parquet/bloom_filter_reader.cu
mhaseeb123 Dec 4, 2024
40c80b7
strong type for int96 timestamp
mhaseeb123 Dec 4, 2024
690c165
Merge branch 'fea/extract-pq-bloom-filter-data' of https://github.com…
mhaseeb123 Dec 4, 2024
c5f8150
Remove unused header
mhaseeb123 Dec 4, 2024
7a21a6e
Merge branch 'branch-25.02' into fea/extract-pq-bloom-filter-data
mhaseeb123 Dec 6, 2024
4465277
Apply suggestions from code review
mhaseeb123 Dec 9, 2024
3888732
Apply suggestions
mhaseeb123 Dec 9, 2024
8bc8927
Update cpp/src/io/parquet/reader_impl_helpers.hpp
mhaseeb123 Dec 9, 2024
d719e65
Update cpp/src/io/parquet/reader_impl_helpers.hpp
mhaseeb123 Dec 9, 2024
03cf07f
Move equality_literals instead of copying
mhaseeb123 Dec 9, 2024
de94168
Merge branch 'fea/extract-pq-bloom-filter-data' of https://github.com…
mhaseeb123 Dec 9, 2024
c92d326
Minor
mhaseeb123 Dec 9, 2024
82083f9
Use spans instead of passing around vectors
mhaseeb123 Dec 10, 2024
6918a40
Minor
mhaseeb123 Dec 10, 2024
85cdc00
Make `get_equality_literals()` safe again
mhaseeb123 Dec 10, 2024
aa1a909
Merge branch 'branch-25.02' into fea/extract-pq-bloom-filter-data
mhaseeb123 Dec 10, 2024
fdf8fc8
Update counting_iterator
mhaseeb123 Dec 10, 2024
10a8f5a
Minor changes
mhaseeb123 Dec 10, 2024
d46504f
Minor
mhaseeb123 Dec 10, 2024
c94ce86
Sync arrow filter policy with cuco
mhaseeb123 Dec 10, 2024
69aa685
Merge branch 'branch-25.02' into fea/extract-pq-bloom-filter-data
mhaseeb123 Dec 11, 2024
d95a178
Address partial reviewer comments and fix new logger header
mhaseeb123 Dec 12, 2024
840c6e7
Revert to direct dtype check until I find a way to get scalar from li…
mhaseeb123 Dec 12, 2024
9d8c071
Create a dummy scalar of type T and compare with dtype
mhaseeb123 Dec 12, 2024
3b8aea0
Use a temporary scalar
mhaseeb123 Dec 12, 2024
0c859db
Merge branch 'branch-25.02' into fea/extract-pq-bloom-filter-data
mhaseeb123 Dec 12, 2024
c385537
Recalculate `total_row_groups` in apply_bloom_filter
mhaseeb123 Dec 13, 2024
3693ad1
Simplify bloom filter expression with ast::tree and handle non-equali…
mhaseeb123 Dec 13, 2024
c2de9fb
Apply suggestions from code review
mhaseeb123 Dec 13, 2024
344851c
Minor optimization: Set `have_bloom_filters` while populating `bloom_…
mhaseeb123 Dec 14, 2024
96fb7c2
Merge branch 'branch-25.02' into fea/extract-pq-bloom-filter-data
mhaseeb123 Dec 14, 2024
4522afa
Add pytest to test logical or with non == expr
mhaseeb123 Dec 16, 2024
ed66593
Merge branch 'branch-25.02' into fea/extract-pq-bloom-filter-data
mhaseeb123 Dec 17, 2024
f509148
Remove temporary arrow_filter_policy.cuh and use cuco directly.
mhaseeb123 Dec 17, 2024
c8cd646
Merge branch 'branch-25.02' into fea/extract-pq-bloom-filter-data
mhaseeb123 Dec 17, 2024
4194d30
MInor style fix
mhaseeb123 Dec 17, 2024
8b7baff
Merge branch 'branch-25.02' into fea/extract-pq-bloom-filter-data
mhaseeb123 Dec 18, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -516,6 +516,7 @@ add_library(
src/datetime/timezone.cpp
src/io/orc/writer_impl.cu
src/io/parquet/arrow_schema_writer.cpp
src/io/parquet/bloom_filter_reader.cu
src/io/parquet/compact_protocol_reader.cpp
src/io/parquet/compact_protocol_writer.cpp
src/io/parquet/decode_preprocess.cu
Expand Down
182 changes: 182 additions & 0 deletions cpp/src/io/parquet/arrow_filter_policy.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
/*
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No need to review this file since it has been copied over from cuco as is. Will be deleted once we get a cuco bump in libcudf.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To be removed once rapidsai/rapids-cmake#723 is merged

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This can be removed now as the corresponding rapids-cmake PR has been merged.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Cheers! Let me update the PR!

* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cuco/hash_functions.cuh>
#include <cuda/functional>
#include <cuda/std/array>
#include <cuda/std/bit>
#include <cuda/std/limits>

#include <cstdint>
#include <nv/target>

namespace cuco {

/**
* @brief A policy that defines how Arrow Block-Split Bloom Filter generates and stores a key's
* fingerprint.
*
* Reference:
* https://github.com/apache/arrow/blob/be1dcdb96b030639c0b56955c4c62f9d6b03f473/cpp/src/parquet/bloom_filter.cc#L219-L230
*
* Example:
* @code{.cpp}
* template <typename KeyType, std::uint32_t NUM_FILTER_BLOCKS>
* void bulk_insert_and_eval_arrow_policy_bloom_filter(device_vector<KeyType> const& positive_keys,
* device_vector<KeyType> const& negative_keys)
* {
* using policy_type = cuco::arrow_filter_policy<KeyType, cuco::xxhash_64>;
*
* // Fail compilation if the number of filter blocks exceeds the maximum used by Arrow policy.
* static_assert(NUM_FILTER_BLOCKS <= policy_type::max_filter_blocks,
* "NUM_FILTER_BLOCKS must be in range: [1, 4194304]");
*
* // Create a bloom filter with Arrow policy
* cuco::bloom_filter<KeyType, cuco::extent<size_t>,
* cuda::thread_scope_device, policy_type> filter{NUM_FILTER_BLOCKS};
*
* // Add positive keys to the bloom filter
* filter.add(positive_keys.begin(), positive_keys.end());
*
* auto const num_tp = positive_keys.size();
* auto const num_tn = negative_keys.size();
*
* // Vectors to store query results.
* thrust::device_vector<bool> true_positive_result(num_tp, false);
* thrust::device_vector<bool> true_negative_result(num_tn, false);
*
* // Query the bloom filter for the inserted keys.
* filter.contains(positive_keys.begin(), positive_keys.end(), true_positive_result.begin());
*
* // We should see a true-positive rate of 1.
* float true_positive_rate = float(thrust::count(thrust::device,
* true_positive_result.begin(), true_positive_result.end(), true)) / float(num_tp);
*
* // Query the bloom filter for the non-inserted keys.
* filter.contains(negative_keys.begin(), negative_keys.end(), true_negative_result.begin());
*
* // We may see a false-positive rate > 0 depending on the number of bits in the
* // filter and the number of hashes used per key.
* float false_positive_rate = float(thrust::count(thrust::device,
* true_negative_result.begin(), true_negative_result.end(), true)) / float(num_tn);
* }
* @endcode
*
* @tparam Key The type of the values to generate a fingerprint for.
* @tparam XXHash64 64-bit XXHash hasher implementation for fingerprint generation.
*/
template <class Key, template <typename> class XXHash64>
class arrow_filter_policy {
 public:
  using hasher          = XXHash64<Key>;  ///< 64-bit XXHash hasher for Arrow bloom filter policy
  using word_type       = std::uint32_t;  ///< uint32_t for Arrow bloom filter policy
  using key_type        = Key;            ///< Hash function input type
  using hash_value_type = std::uint64_t;  ///< hash function output type

  static constexpr std::uint32_t bits_set_per_block =
    8;  ///< hardcoded bits set per Arrow filter block
  static constexpr std::uint32_t words_per_block = 8;  ///< hardcoded words per Arrow filter block

  static constexpr std::uint32_t bytes_per_filter_block =
    32;  ///< Number of bytes in one Arrow filter block
  static constexpr std::uint32_t max_arrow_filter_bytes =
    128 * 1024 * 1024;  ///< Max bytes in Arrow bloom filter
  static constexpr std::uint32_t max_filter_blocks =
    (max_arrow_filter_bytes /
     bytes_per_filter_block);  ///< Max sub-filter blocks allowed in Arrow bloom filter

  // The byte size of a block is hard-coded above; make sure it stays in sync with the word layout.
  static_assert(bytes_per_filter_block == words_per_block * sizeof(word_type),
                "An Arrow filter block must consist of exactly words_per_block words");

 private:
  /**
   * @brief Returns the SALT values used to derive the bit to set in each word of a filter block.
   *
   * Arrow's block-based bloom filter algorithm needs these eight odd SALT values to calculate
   * eight indexes of bit to set, one bit in each 32-bit (uint32_t) word.
   *
   * @return Array holding one SALT value per word of a filter block
   */
  __device__ static constexpr cuda::std::array<word_type, words_per_block> SALT()
  {
    return {0x47b6137bU,
            0x44974d91U,
            0x8824ad5bU,
            0xa2b7289dU,
            0x705495c7U,
            0x2df1424bU,
            0x9efc4947U,
            0x5c6bfb31U};
  }

 public:
  /**
   * @brief Constructs the `arrow_filter_policy` object.
   *
   * @note The number of filter blocks with Arrow policy must be in the
   * range of [1, 4194304]. If the bloom filter is constructed with a larger
   * number of blocks, only the first 4194304 (128MB) blocks will be used.
   *
   * @param hash Hash function used to generate a key's fingerprint
   */
  __host__ __device__ constexpr arrow_filter_policy(hasher hash = {}) : hash_{hash} {}

  /**
   * @brief Generates the hash value for a given key.
   *
   * @param key The key to hash
   *
   * @return The hash value of the key
   */
  __device__ constexpr hash_value_type hash(key_type const& key) const { return hash_(key); }

  /**
   * @brief Determines the filter block a key is added into.
   *
   * The upper 32 bits of the hash are multiplied by the (clamped) number of blocks and the
   * product is shifted right by 32 bits to obtain the block index.
   *
   * @note The number of filter blocks with Arrow policy must be in the
   * range of [1, 4194304]. Passing a larger `num_blocks` will still
   * upperbound the number of blocks used to the mentioned range.
   *
   * @note `num_blocks` must not be zero: the clamped block count is used as a divisor.
   *
   * @tparam Extent Size type that is used to determine the number of blocks in the filter
   *
   * @param hash Hash value of the key
   * @param num_blocks Number of block in the filter
   *
   * @return The block index for the given key's hash value
   */
  template <class Extent>
  __device__ constexpr auto block_index(hash_value_type hash, Extent num_blocks) const
  {
    constexpr auto hash_bits = cuda::std::numeric_limits<word_type>::digits;
    // TODO: assert if num_blocks > max_filter_blocks
    auto const max_blocks = cuda::std::min<Extent>(num_blocks, max_filter_blocks);
    // Make sure we are only contained within the `max_filter_blocks` blocks
    return static_cast<word_type>(((hash >> hash_bits) * max_blocks) >> hash_bits) % max_blocks;
  }

  /**
   * @brief Determines the fingerprint pattern for a word/segment within the filter block for a
   * given key's hash value.
   *
   * The lower 32 bits of the hash are multiplied by the SALT value of the target word; the top
   * five bits of the 32-bit product select which single bit of the word is set.
   *
   * @note `word_index` must be less than `words_per_block`; it indexes the SALT array unchecked.
   *
   * @param hash Hash value of the key
   * @param word_index Target word/segment within the filter block
   *
   * @return The bit pattern for the word/segment in the filter block
   */
  __device__ constexpr word_type word_pattern(hash_value_type hash, std::uint32_t word_index) const
  {
    // Addressing one bit of a 32-bit word takes 5 bits, so keep only the top 5 bits of the
    // 32-bit product by shifting out the remaining 27.
    constexpr int bit_index_shift = cuda::std::numeric_limits<word_type>::digits - 5;
    // SALT array to calculate bit indexes for the current word
    auto constexpr salt = SALT();
    // Only the lower 32 bits of the hash take part in the pattern
    word_type const key = static_cast<word_type>(hash);
    return word_type{1} << ((key * salt[word_index]) >> bit_index_shift);
  }

 private:
  hasher hash_;  ///< Hash function used to generate a key's fingerprint
};

} // namespace cuco
Loading
Loading