From a927b1aa8a3d02d8e3632621c8f8eeb83eb0a131 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 29 Oct 2024 20:25:03 +0000 Subject: [PATCH] Remove Arrow policy example and add a @code docstring to demo the policy --- README.md | 3 +- examples/CMakeLists.txt | 1 - examples/bloom_filter/arrow_policy_example.cu | 94 ------------------- .../bloom_filter/arrow_filter_policy.cuh | 41 ++++++++ 4 files changed, 42 insertions(+), 97 deletions(-) delete mode 100644 examples/bloom_filter/arrow_policy_example.cu diff --git a/README.md b/README.md index ea58d09d2..bbc6fc18b 100644 --- a/README.md +++ b/README.md @@ -254,5 +254,4 @@ We plan to add many GPU-accelerated, concurrent data structures to `cuCollection `cuco::bloom_filter` implements a Blocked Bloom Filter for approximate set membership queries. #### Examples: -- [Host-bulk APIs (Default fingerprinting policy)](https://github.com/NVIDIA/cuCollections/blob/dev/examples/bloom_filter/host_bulk_example.cu) (see [live example in 
godbolt](https://godbolt.org/clientstate/eJydVmtvGjkU_StXsx8WmuEVbVUJQiSapLtoK5IF2qpaVsjj8TBWBnvqBwRF-e977ZmBgZBqtVRqwL6Pc889vvZzoJnWXAod9P9-Dngc9HthkBGxsmTFgn5AbUyCMNDSKup-d94tBLyDG5nvFF-lBhq0CZfdy99CmHwd345HcHM_fbifjubj-0nb2Xr7z5wyoVkMVsRMgUkZjHJC8U-5E8JXphwQuGx3oeEMFkG5twiaAx9lJy2syQ6ENGA1wzBcQ8IzBuyJstwAF0DlOs84EZTBlpvUpyrjeDjwvQwiI0PQnqBHjr-SuiUQs4fuPqkxeb_T2W63beJht6VadbLCWHc-j2_uJrO7FkLfu30RGTILiv2wXGHh0Q5IjsgoiRBvRrYgFZCVYrhnpEO-VdxwsQpBy8RsiWI-Tsy1UTyy5oi8CifWXzdA-ohA4kYzGM8WAXwczcaz0Mf5Np7_cf9lDt9G0-loMh_fzeB-is2a3I5dq_DXJxhNvsOf48ltCAypw1TsKVeuCoTKHa0sLjicMXYEI5EFLJ0zyhNOoVIQrOSGKYFlQc7UmhdaQ5Cxj5PxNTfE-LVXxflUnYVYiF-4oJmNGVxRS2UnyqRcL7Hvhqk2ten1sY1JldWmQ6UVpu02X23FbIMplhtGjVTnTdgTo9YBW-YSm7Y7b6Wxuwyl1j7FwCU2hZG1X-bCoOK4aGwkj5sL8YyFgVukWLdxHIOw6-Uj22kntiH0ur92u90B7D-dTucKfmeCKWJYuQ3O_nwkkxduw0Pcd9Btvx-UkcbIrjKe64QrbSAlWeLjuWCy3PD0vpFAvErQKjMPXntoG5Xd0uhx6SvzOPCr22xVmygR8M0tszvynKmDBmaXF0I7tgCcBE5eaLL0JkOXfbB3neVk60553auYDDFLiM0MFA12mjwF5Kt3kuv365q7qnJdl_GeaxW-lKm1ift91KCBqys8kR9t9ojAPO-e47fxJFgOU7lyJK6Knru5WMDsY6yFgNoHw_tcTMRZmbtQZ79_pPMaatewRtU5P1v3LpWiG26rHbEVyrYZeo82ZnDfe80yDbFOK_nSWyHvdZdB3QAdvVj2thd1sRRWYh-mcDjaqgIcYBwafF7M5Tz3Ds6wlDOJ40aFIiwzNX_KWiRldu0scRBidxoF8BASkml2zN1ZR1F3PDoptRhlKX9Zpna107efqziFN1xane1KDeGs35eGjnN3EepU2iyGIh34q80oy1q51HixbBj44YHEzB-mw16dFTyq7i7Ur6gJD4VXjf0vcH2ZJiUG8Bbzt7W7RpjwqE6Gizui3N3W9QOhvWOuZEQyvN7wQomJIahzZamxGCushSmjsKeUR9yga8HrSd2fHqbXXTxjOZbl5oUskGArIkSOrHhfXtVWq8oNhmPTlOiUafcEif3MwnrP0yn2dIqSTvEGnUkmkS7HtgM7LI64X2xU8vIXWuNYbGcaVF8qT6yTQbMJnTJgIb9Cu0Xe5H_kPa2kvvSTvKJ5fkA6UeKLBb9XJPhl17dyPTmsnw48xVAVArr48wVfq-4NiK9CdXjUBmJDae_yve3htsxN8eINWhhoSC8ueh-gRRRNh3q9_NCFVgvvLYP_GczB4lZG1pF_Bmc8qsWklGa4uCkerriA9YrH4CWs9vHmONpH7oKXf_y_fwHeCexw)) -- [Host-bulk APIs (Arrow fingerprinting policy)](https://github.com/NVIDIA/cuCollections/blob/dev/examples/bloom_filter/arrow_policy_example.cu) (see [live example in 
godbolt](https://godbolt.org/clientstate/eJydV2tzGjcU_Ssa8qGQAAtOnIzxY0IebZlm7NR2k8mUDtFqtaDJIm21WmOayX_vuVotLJhkOsUfDKv7PPfce7VfW4UsCmV00Rr9-bWlktZo2G1lXM9LPpetUUuUCW91W4UpraDf0eOpZo_Za5OvrZovHGuLDjsaHD3rsssPkzeTMXt9df3-6np8O7m67JOsl3-nhNSFTFipE2mZW0g2zrnAv3DSZR-kpUDYUX_A2iQwbYWzaatz6q2sTcmWfM20cawsJMyogqUqk0zeC5k7pjQTZplnimsh2Uq5hXcV7Phw2KdgxMSOQ55DI8evtCnJuNuETp-Fc_koilarVZ_7sPvGzqOsEi6id5PXby9v3vYQ-kbtD50BWWbl36WySDxeM54jMsFjxJvxFTOW8bmVOHOGIl9Z5ZSed1lhUrfiVno7iSqcVXHpdsCr40T-TQHAxzWAG9-wyc20xV6NbyY3XW_n4-T216s_btnH8fX1-PJ28vaGXV2jWJdvJlQq_PqZjS8_sd8ml2-6TAI6uJL3uaUsEKoiWGVSYXgj5U4YqanCKnIpVKoEqxnE5uZOWo20WC7tUlVcQ5CJt5OppXLc-WcPkvOuoqme6kdKi6xMJDsTpTBRnBmznKHuTtq-KBcXuzJuYcvCRcKU2vXp8MFRIu_gYnYnhTP2sIi8l6KkwGa5QdHWh6UKVFeCav39GJRBUSRf-sfR44oSLz1VubVmFazO5D0HrhJZVBKxVTJlb-QSiDjLnQQuBeEIfqKyXrdHFAdoxCMPBaug2FBvAsEkURQ9UYsgTWTKy8yxyi3jagm2cMfAZQUwUJ5QPiJRirqgtb0tVA3FXfp-InPeFqHXZZ9F-dpkGUCk-n2GXooWpjA_VzlWYYVUP3tzbkFOs8ysKDESR4CSFypbMwHAHPDZSarSWIInovDeY7ngd8r4bOuu3VFAqkojDXRUGDFjimbkFaiRC3TyHPwu4z6QjKqGjnzIRK04iuUwEUl88jwePB08f3oiBvHx85PjY_FMPD9KTxJ6nj578TQSeR4VVkQ5t-CB2yOmePTuaHjSe3f0dLAdhC8xuyS7pcEVak89TP2TbIpOdctLmxvMFoCerfts4kiM5p7STqJT_NAoFmblAViCbrvlcmyFGQeZpCJA7Yxnc4M5s1huuwsWMRCVbt8ZlXSm-itOyA0GI-KhEcB0uZx9keuCZuE5Gw5-GgwGp2zziaLojP0itSTGhmNG8octubxSO9_afcwG_ePTYGmC5rfOh50qWzi24Fnq7ZExEw48yN9xoB846AXPpw81ijIONSugceQz83HgKx326kOAu9dwZCyKKDTm1nk1B_daEjSn9oLIzIuck_fTjepHWlNZFtZZ4GpN5dCt-1bxT8-lzS3lMa9gB2e2zsJwCf5oZo5GB1ryrA7q4jRE82qn80jd79ADQW2dBZOVs6lHvvLY7IaNq244lPegsTsr1D9y5i7oacJHI4xVyZNZgZUsZ_WcaWRzsQXuJucrvT8sfLQ-082k08l-HT1pmlFX3782ePAt-ClcMhphkTh2doa1-qrMvqB8np2eiQ-d11AdKlCIaQRLU80aHxj3ntDYWfBcLZjRaGdVbQvmu6Fds9tfjzYq9VJq01E_lnO0dqfrNfrwQN-HneCGl9RP-cxLgStNldOmABR9Q21knzQbqpLSGzOVws5RbWAbxraWhxs-DHevsC1ZH6utXUfRDZ46P0QtNia7IEncZbAE21XgXZbyrJC72B1U1E3FnWnSsBFS-b2Udt2YUJurES5SWFtlgUVXMQgjfJMaFP1KwEAvs4RV7pi_nTpbyh5WAdb5nWR-wAKY2_fX58MmKhhndJ0tHkDT3SZeF_a_hOvTrLa1lX7x0E0QLUtR7Q1g6kZFF4RmOxReMbcm5hluqLgTJtxx8NyWwpWw1W2YCVbk_ULFykG1wnUv75_fX18MsCdzpEWTJ9x
HUIoYkQMVr6vq3BpZ0RTYFV3wYuEvVygDzXXkexhOvYFTBzj1d-BMMwO4CG0KNoxC_7Bd08vfSdu7ZDtQoOaj0LFEg06HRcFgRb-Ku5Xf9H_43c-k-egHfnXn8HgkUuKlA99rEPxjqlt4nm6f7w88K8EKzQb4-W2q8cpJt1zcl-32zbSl74QYHh2XQxyb3FWvra0eTJ2LJ0-GL1iPW7E4L5azFwPW62G7u55fNLgv9TK-jP27bKbihk0hRIaHd9XbJx4gY_2l9a1bn2Nd7ZwDvda3v_zfv3UOROY=)) \ No newline at end of file +- [Host-bulk APIs (Default fingerprinting policy)](https://github.com/NVIDIA/cuCollections/blob/dev/examples/bloom_filter/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/clientstate/eJydVmtvGjkU_StXsx8WmuEVbVUJQiSapLtoK5IF2qpaVsjj8TBWBnvqBwRF-e977ZmBgZBqtVRqwL6Pc889vvZzoJnWXAod9P9-Dngc9HthkBGxsmTFgn5AbUyCMNDSKup-d94tBLyDG5nvFF-lBhq0CZfdy99CmHwd345HcHM_fbifjubj-0nb2Xr7z5wyoVkMVsRMgUkZjHJC8U-5E8JXphwQuGx3oeEMFkG5twiaAx9lJy2syQ6ENGA1wzBcQ8IzBuyJstwAF0DlOs84EZTBlpvUpyrjeDjwvQwiI0PQnqBHjr-SuiUQs4fuPqkxeb_T2W63beJht6VadbLCWHc-j2_uJrO7FkLfu30RGTILiv2wXGHh0Q5IjsgoiRBvRrYgFZCVYrhnpEO-VdxwsQpBy8RsiWI-Tsy1UTyy5oi8CifWXzdA-ohA4kYzGM8WAXwczcaz0Mf5Np7_cf9lDt9G0-loMh_fzeB-is2a3I5dq_DXJxhNvsOf48ltCAypw1TsKVeuCoTKHa0sLjicMXYEI5EFLJ0zyhNOoVIQrOSGKYFlQc7UmhdaQ5Cxj5PxNTfE-LVXxflUnYVYiF-4oJmNGVxRS2UnyqRcL7Hvhqk2ten1sY1JldWmQ6UVpu02X23FbIMplhtGjVTnTdgTo9YBW-YSm7Y7b6Wxuwyl1j7FwCU2hZG1X-bCoOK4aGwkj5sL8YyFgVukWLdxHIOw6-Uj22kntiH0ur92u90B7D-dTucKfmeCKWJYuQ3O_nwkkxduw0Pcd9Btvx-UkcbIrjKe64QrbSAlWeLjuWCy3PD0vpFAvErQKjMPXntoG5Xd0uhx6SvzOPCr22xVmygR8M0tszvynKmDBmaXF0I7tgCcBE5eaLL0JkOXfbB3neVk60553auYDDFLiM0MFA12mjwF5Kt3kuv365q7qnJdl_GeaxW-lKm1ift91KCBqys8kR9t9ojAPO-e47fxJFgOU7lyJK6Knru5WMDsY6yFgNoHw_tcTMRZmbtQZ79_pPMaatewRtU5P1v3LpWiG26rHbEVyrYZeo82ZnDfe80yDbFOK_nSWyHvdZdB3QAdvVj2thd1sRRWYh-mcDjaqgIcYBwafF7M5Tz3Ds6wlDOJ40aFIiwzNX_KWiRldu0scRBidxoF8BASkml2zN1ZR1F3PDoptRhlKX9Zpna107efqziFN1xane1KDeGs35eGjnN3EepU2iyGIh34q80oy1q51HixbBj44YHEzB-mw16dFTyq7i7Ur6gJD4VXjf0vcH2ZJiUG8Bbzt7W7RpjwqE6Gizui3N3W9QOhvWOuZEQyvN7wQomJIahzZamxGCushSmjsKeUR9yga8HrSd2fHqbXXTxjOZbl5oUskGArIkSOrHhfXtVWq8oNhmPTlOiUafcEif3MwnrP0yn2dIqSTvEGnUkmkS7HtgM7LI64X2xU8vIXWuNYbGcaVF8qT6yT
QbMJnTJgIb9Cu0Xe5H_kPa2kvvSTvKJ5fkA6UeKLBb9XJPhl17dyPTmsnw48xVAVArr48wVfq-4NiK9CdXjUBmJDae_yve3htsxN8eINWhhoSC8ueh-gRRRNh3q9_NCFVgvvLYP_GczB4lZG1pF_Bmc8qsWklGa4uCkerriA9YrH4CWs9vHmONpH7oKXf_y_fwHeCexw)) \ No newline at end of file diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 36a5964a8..8e7be947a 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -47,4 +47,3 @@ ConfigureExample(STATIC_MULTIMAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/ ConfigureExample(HYPERLOGLOG_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/hyperloglog/host_bulk_example.cu") ConfigureExample(HYPERLOGLOG_DEVICE_REF_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/hyperloglog/device_ref_example.cu") ConfigureExample(BLOOM_FILTER_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/bloom_filter/host_bulk_example.cu") -ConfigureExample(BLOOM_FILTER_ARROW_POLICY_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/bloom_filter/arrow_policy_example.cu") diff --git a/examples/bloom_filter/arrow_policy_example.cu b/examples/bloom_filter/arrow_policy_example.cu deleted file mode 100644 index ce3c2a720..000000000 --- a/examples/bloom_filter/arrow_policy_example.cu +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -#include -#include -#include -#include - -#include - -/** - * @file arrow_policy_example.cu - * @brief Demonstrates usage of an arrow-compatible bloom filter - * - * In addition to the default policy aimed at achieving the speed of light - * performance on the device, `cuCollections` offers an `arrow_filter_policy` - * that allows users to easily create a bloom filter that mimics the behavior - * of the bloom filter defined in Apache Arrow: - * https://github.com/apache/arrow/blob/be1dcdb96b030639c0b56955c4c62f9d6b03f473/cpp/src/parquet/bloom_filter.cc#L219-L230. - * - * @note This example is for demonstration purposes only. It is not intended to show the most - * performant way to do the example algorithm. - */ - -int main(void) -{ - int constexpr num_keys = 10'000; ///< Generate 10'000 keys - int constexpr num_tp = num_keys * 0.5; ///< Insert the first half keys into the filter. - int constexpr num_tn = num_keys - num_tp; - int constexpr sub_filters = 200; ///< 200 sub-filters per bloom filter - - // key type for bloom filter - using key_type = int; - - // We will use the Arrow filter policy for bloom filter fingerprint generation - using policy_type = cuco::arrow_filter_policy; - // Bloom filter type with Arrow filter policy - using filter_type = - cuco::bloom_filter, cuda::thread_scope_device, policy_type>; - - // Spawn a bloom filter with arrow policy and 200 sub-filters. - filter_type filter{sub_filters}; - - std::cout << "Bulk insert into bloom filter with Arrow fingerprint generation policy: " - << std::endl; - - thrust::device_vector keys(num_keys); - thrust::sequence(keys.begin(), keys.end(), 1); - - auto tp_begin = keys.begin(); - auto tp_end = tp_begin + num_tp; - auto tn_begin = tp_end; - auto tn_end = keys.end(); - - // Insert the first half of the keys. 
- filter.add(tp_begin, tp_end); - - thrust::device_vector tp_result(num_tp, false); - thrust::device_vector tn_result(num_keys - num_tp, false); - - // Query the filter for the previously inserted keys. - // This should result in a true-positive rate of TPR=1. - filter.contains(tp_begin, tp_end, tp_result.begin()); - - // Query the filter for the keys that are not present in the filter. - // Since bloom filters are probalistic data structures, the filter - // exhibits a false-positive rate FPR>0 depending on the number of bits in - // the filter and the number of hashes used per key. - filter.contains(tn_begin, tn_end, tn_result.begin()); - - float tp_rate = - float(thrust::count(thrust::device, tp_result.begin(), tp_result.end(), true)) / float(num_tp); - float fp_rate = - float(thrust::count(thrust::device, tn_result.begin(), tn_result.end(), true)) / float(num_tn); - - std::cout << "TPR=" << tp_rate << " FPR=" << fp_rate << std::endl; - - return 0; -} diff --git a/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh b/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh index 4fc5efc29..bb8ff307a 100644 --- a/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh +++ b/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh @@ -34,6 +34,47 @@ namespace cuco::detail { * Reference: * https://github.com/apache/arrow/blob/be1dcdb96b030639c0b56955c4c62f9d6b03f473/cpp/src/parquet/bloom_filter.cc#L219-L230 * + * Example: + * @code{.cpp} + * template + * void bulk_insert_and_eval_arrow_policy_bloom_filter(device_vector const& positive_keys, + * device_vector const& negative_keys) + * { + * // Arrow filter policy type + * using policy_type = cuco::arrow_filter_policy; + * + * // Create a bloom filter with arrow_filter_policy + * cuco::bloom_filter, + * cuda::thread_scope_device, policy_type> filter{NUM_FILTER_BLOCKS}; + * + * // Add positive keys to the bloom filter + * filter.add(positive_keys.begin(), positive_keys.end()); + * + * // Number of true positives 
and true negatives + * auto const num_tp = positive_keys.size(); + * auto const num_tn = negative_keys.size(); + * + * // Vectors to store true positive and true negative filter query results. + * thrust::device_vector true_positive_result(num_tp, false); + * thrust::device_vector true_negative_result(num_tn, false); + * + * // Query the bloom filter for the inserted positive keys. + * filter.contains(positive_keys.begin(), positive_keys.end(), true_positive_result.begin()); + * + * // Query the bloom filter for the non-inserted true negative_keys. + * filter.contains(negative_keys.begin(), negative_keys.end(), true_negative_result.begin()); + * + * // We should see a true-positive rate of 1. + * float true_positive_rate = float(thrust::count(thrust::device, + * true_positive_result.begin(), true_positive_result.end(), true)) / float(num_tp); + * + * // Since bloom filters are probabilistic data structures, we may see a false-positive rate > 0 + * // depending on the number of bits in the filter and the number of hashes used per key. + * float false_positive_rate = float(thrust::count(thrust::device, + * true_negative_result.begin(), true_negative_result.end(), true)) / float(num_tn); + * } + * @endcode + * * @tparam Key The type of the values to generate a fingerprint for. */ template