Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor mixed_semi_join using cuco::static_set #16230

Merged
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
caea465
Refactor mixed_semi_join using cuco::static_set
srinivasyadav18 Jul 3, 2024
913aed7
Merge branch 'branch-24.08' into mixed_semi_join_refactor
srinivasyadav18 Jul 11, 2024
be82efb
Swap build and probe table
srinivasyadav18 Jul 12, 2024
a8922c5
use size_type as key type for static_set
srinivasyadav18 Jul 12, 2024
dc7b7d1
move hash_probe to host
srinivasyadav18 Jul 12, 2024
5cc0c62
remove load factor for hash set
srinivasyadav18 Jul 12, 2024
6f73ed8
minor optimizations
srinivasyadav18 Jul 12, 2024
08ad9cb
Add cg tuning
srinivasyadav18 Jul 12, 2024
1e7054d
Fix formatting
srinivasyadav18 Jul 18, 2024
c9f3e29
Merge branch 'branch-24.08' into mixed_semi_join_refactor
srinivasyadav18 Jul 18, 2024
cd18674
Merge branch 'branch-24.08' into mixed_semi_join_refactor
srinivasyadav18 Aug 16, 2024
bd2d930
Merge remote-tracking branch 'origin/branch-24.10' into mixed_semi_jo…
srinivasyadav18 Aug 16, 2024
c7f960e
Revert "Ensure managed memory is supported in cudf.pandas. (#16552)"
srinivasyadav18 Aug 16, 2024
b62a786
Merge branch 'branch-24.10' into mixed-semi-join-refactor
mhaseeb123 Sep 5, 2024
8b10e65
Resolve merge conflicts
mhaseeb123 Sep 5, 2024
d49131d
Remove erroneous changelog
mhaseeb123 Sep 5, 2024
b8bf218
Address reviewer comments.
mhaseeb123 Sep 6, 2024
51021ae
Merge branch 'branch-24.10' into mixed_semi_join_refactor
mhaseeb123 Sep 6, 2024
c83e4a6
Re-add CHANGELOG.md
mhaseeb123 Sep 6, 2024
b5e8c79
Fix dependencies and styling
mhaseeb123 Sep 6, 2024
f6d8a8a
Remove magic number for cg size
mhaseeb123 Sep 6, 2024
8359ec7
Minor refactoring
mhaseeb123 Sep 6, 2024
8e9ab66
Merge branch 'branch-24.10' into mixed_semi_join_refactor
mhaseeb123 Sep 6, 2024
18d652f
Merge branch 'mixed_semi_join_refactor' of https://github.com/sriniva…
mhaseeb123 Sep 6, 2024
e458d39
Merge branch 'branch-24.10' into mixed_semi_join_refactor
mhaseeb123 Sep 9, 2024
ee0e606
Merge branch 'branch-24.10' into mixed_semi_join_refactor
mhaseeb123 Sep 9, 2024
ce2cb57
Fix for Java tests and replicate the test in C++
mhaseeb123 Sep 10, 2024
47c0938
Merge branch 'branch-24.10' into mixed_semi_join_refactor
mhaseeb123 Sep 10, 2024
08410dd
Port a previously-failing anti-join test from java to C++
mhaseeb123 Sep 10, 2024
bdc13e2
Merge branch 'mixed_semi_join_refactor' of https://github.com/sriniva…
mhaseeb123 Sep 10, 2024
d860e86
Merge branch 'branch-24.10' into mixed_semi_join_refactor
mhaseeb123 Sep 10, 2024
964bf6a
Minor changes.
mhaseeb123 Sep 10, 2024
1ba2bad
Revert the stale files
mhaseeb123 Sep 10, 2024
7b640d6
Apply suggestion from codde review
mhaseeb123 Sep 16, 2024
0e88b41
Merge branch 'branch-24.10' into mixed_semi_join_refactor
mhaseeb123 Sep 16, 2024
9189c2c
Minor style fix
mhaseeb123 Sep 16, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Fix for Java tests and replicate the test in C++
Signed-off-by: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
  • Loading branch information
mhaseeb123 committed Sep 10, 2024
commit ce2cb5740e7e689ac622b4b6c6055dff385d3cf3
4 changes: 3 additions & 1 deletion cpp/src/join/mixed_join_kernels_semi.cu
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,11 @@ CUDF_KERNEL void __launch_bounds__(block_size)
left_table, right_table, device_expression_data);

if (outer_row_index < outer_num_rows) {
// Make sure to swap_tables here as hash_set will use probe table as the left one.
auto constexpr swap_tables = true;
// Figure out the number of elements for this key.
auto equality = single_expression_equality<has_nulls>{
evaluator, thread_intermediate_storage, false, equality_probe};
evaluator, thread_intermediate_storage, swap_tables, equality_probe};

auto const set_ref_equality = set_ref.with_key_eq(equality);
auto const result = set_ref_equality.contains(tile, outer_row_index);
Expand Down
19 changes: 8 additions & 11 deletions cpp/src/join/mixed_join_semi.cu
Original file line number Diff line number Diff line change
Expand Up @@ -151,20 +151,16 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_join_semi(
preprocessed_build_condtional};
auto const equality_build_conditional =
row_comparator_conditional_build.equal_to<false>(build_nulls, compare_nulls);
double_row_equality_comparator equality_build{equality_build_equality,
equality_build_conditional};

auto const build_num_rows = compute_hash_table_size(build.num_rows());

hash_set_type row_set{
build_num_rows,
{compute_hash_table_size(build.num_rows())},
cuco::empty_key{JoinNoneValue},
equality_build,
{equality_build_equality, equality_build_conditional},
{row_hash_build.device_hasher(build_nulls)},
{},
{},
cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
stream.value()};
{stream.value()}};

auto iter = thrust::make_counting_iterator(0);

Expand All @@ -183,12 +179,13 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_join_semi(

detail::grid_1d const config(outer_num_rows, DEFAULT_JOIN_BLOCK_SIZE);
auto const shmem_size_per_block =
(parser.shmem_per_thread / hash_set_type::cg_size) * config.num_threads_per_block;
parser.shmem_per_thread *
cuco::detail::int_div_ceil(config.num_threads_per_block, hash_set_type::cg_size);

auto const row_hash = cudf::experimental::row::hash::row_hasher{preprocessed_probe};
auto const row_hash = cudf::experimental::row::hash::row_hasher{preprocessed_probe};
auto const hash_probe = row_hash.device_hasher(has_nulls);

hash_set_ref_type const row_set_ref =
row_set.ref(cuco::contains).with_hash_function(row_hash.device_hasher(has_nulls));
hash_set_ref_type const row_set_ref = row_set.ref(cuco::contains).with_hash_function(hash_probe);

// Vector used to indicate indices from left/probe table which are present in output
auto left_table_keep_mask = rmm::device_uvector<bool>(probe.num_rows(), stream);
Expand Down
15 changes: 15 additions & 0 deletions cpp/tests/join/mixed_join_tests.cu
Original file line number Diff line number Diff line change
Expand Up @@ -778,6 +778,21 @@ TYPED_TEST(MixedLeftSemiJoinTest, BasicEquality)
{1});
}

TYPED_TEST(MixedLeftSemiJoinTest, MixedLeftSemiJoinGatherMap)
{
  // Left-semi mixed join whose conditional predicate is `LEFT column 0 > RIGHT column 0`.
  // The gather map expected back from the fixture is {2, 7, 8}.
  // NOTE(review): presumably `{0}` selects the equality columns and `{1}` the conditional
  // columns that the AST column references index into -- confirm against the fixture's test().
  namespace ast = cudf::ast;

  // Both references address column 0 of their respective (left / right) table.
  ast::column_reference const lhs_ref(0, ast::table_reference::LEFT);
  ast::column_reference const rhs_ref(0, ast::table_reference::RIGHT);

  // Predicate: lhs_ref > rhs_ref.
  ast::operation lhs_greater_than_rhs(ast::ast_operator::GREATER, lhs_ref, rhs_ref);

  this->test({{2, 3, 9, 0, 1, 7, 4, 6, 5, 8}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 0}},
             {{6, 5, 9, 8, 10, 32}, {0, 1, 2, 3, 4, 5}, {7, 8, 9, 0, 1, 2}},
             {0},
             {1},
             lhs_greater_than_rhs,
             {2, 7, 8});
}

TYPED_TEST(MixedLeftSemiJoinTest, BasicEqualityDuplicates)
{
this->test({{0, 1, 2, 1}, {3, 4, 5, 6}, {10, 20, 30, 40}},
Expand Down
Loading