|
38 | 38 | #endif
|
39 | 39 |
|
40 | 40 | #include <tbb/parallel_for.h>
|
| 41 | +#include <tbb/parallel_reduce.h> |
41 | 42 |
|
42 | 43 | #include <future>
|
43 | 44 | #endif
|
@@ -263,9 +264,11 @@ DEVICE auto fill_hash_join_buff_impl(int32_t* buff,
|
263 | 264 | #endif
|
264 | 265 | JoinColumnTyped col{&join_column, &type_info};
|
265 | 266 | for (auto item : col.slice(start, step)) {
|
| 267 | + // LOG(ERROR) << "items: " << item.index; |
266 | 268 | const size_t index = item.index;
|
267 | 269 | int64_t elem = item.element;
|
268 | 270 | if (elem == type_info.null_val) {
|
| 271 | + // LOG(ERROR) << "null val"; |
269 | 272 | if (type_info.uses_bw_eq) {
|
270 | 273 | elem = type_info.translated_null_val;
|
271 | 274 | } else {
|
@@ -323,6 +326,267 @@ DEVICE int SUFFIX(fill_hash_join_buff_bucketized)(
|
323 | 326 | hashtable_filling_func);
|
324 | 327 | }
|
325 | 328 |
|
| 329 | +#ifndef __CUDACC__ |
| 330 | + |
| 331 | +namespace { |
| 332 | + |
| 333 | +template <ColumnType T> |
| 334 | +inline int64_t getElem(const int8_t* chunk_mem_ptr, size_t elem_size, size_t elem_ind) { |
| 335 | + UNREACHABLE(); |
| 336 | + return 0; |
| 337 | +}; |
| 338 | + |
| 339 | +template <> |
| 340 | +inline int64_t getElem<ColumnType::SmallDate>(const int8_t* chunk_mem_ptr, |
| 341 | + size_t elem_size, |
| 342 | + size_t elem_ind) { |
| 343 | + return fixed_width_small_date_decode_noinline(chunk_mem_ptr, |
| 344 | + elem_size, |
| 345 | + elem_size == 4 ? NULL_INT : NULL_SMALLINT, |
| 346 | + elem_size == 4 ? NULL_INT : NULL_SMALLINT, |
| 347 | + elem_ind); |
| 348 | +} |
| 349 | + |
| 350 | +template <> |
| 351 | +inline int64_t getElem<ColumnType::Signed>(const int8_t* chunk_mem_ptr, |
| 352 | + size_t elem_size, |
| 353 | + size_t elem_ind) { |
| 354 | + return fixed_width_int_decode_noinline(chunk_mem_ptr, elem_size, elem_ind); |
| 355 | +} |
| 356 | + |
| 357 | +template <> |
| 358 | +inline int64_t getElem<ColumnType::Unsigned>(const int8_t* chunk_mem_ptr, |
| 359 | + size_t elem_size, |
| 360 | + size_t elem_ind) { |
| 361 | + return fixed_width_unsigned_decode_noinline(chunk_mem_ptr, elem_size, elem_ind); |
| 362 | +} |
| 363 | + |
| 364 | +template <> |
| 365 | +inline int64_t getElem<ColumnType::Double>(const int8_t* chunk_mem_ptr, |
| 366 | + size_t elem_size, |
| 367 | + size_t elem_ind) { |
| 368 | + return fixed_width_double_decode_noinline(chunk_mem_ptr, elem_ind); |
| 369 | +} |
| 370 | + |
| 371 | +template <ColumnType T, size_t Elem> |
| 372 | +inline int64_t getElem(const int8_t* chunk_mem_ptr, size_t elem_ind) { |
| 373 | + return getElem<T>(chunk_mem_ptr, Elem, elem_ind); |
| 374 | +} |
| 375 | + |
| 376 | +template <typename HASHTABLE_FILLING_FUNC, ColumnType T, size_t Elem> |
| 377 | +inline int raw_func_impl(const tbb::blocked_range<size_t>& elems_range, |
| 378 | + const int8_t* chunk_mem_ptr, |
| 379 | + size_t curr_chunk_row_offset, |
| 380 | + const JoinColumnTypeInfo& type_info, |
| 381 | + const int32_t* sd_inner_to_outer_translation_map, |
| 382 | + const int32_t min_inner_elem, |
| 383 | + HASHTABLE_FILLING_FUNC hashtable_filling_func) { |
| 384 | + // DEBUG_TIMER("fill_hash_join_buff_bucketized_cpu raw_func"); |
| 385 | + // INJECT_TIMER(raw_func); |
| 386 | + // LOG(ERROR) << " num_elems threaded: " << elems_range.size(); |
| 387 | + for (size_t elem_i = elems_range.begin(); elem_i != elems_range.end(); elem_i++) { |
| 388 | + int64_t elem = getElem<T, Elem>(chunk_mem_ptr, elem_i); |
| 389 | + |
| 390 | + if (elem == type_info.null_val) { |
| 391 | + if (!type_info.uses_bw_eq) { |
| 392 | + continue; |
| 393 | + } |
| 394 | + elem = type_info.translated_null_val; |
| 395 | + } |
| 396 | + |
| 397 | + if (sd_inner_to_outer_translation_map && |
| 398 | + (!type_info.uses_bw_eq || elem != type_info.translated_null_val)) { |
| 399 | + const auto outer_id = map_str_id_to_outer_dict(elem, |
| 400 | + min_inner_elem, |
| 401 | + type_info.min_val, |
| 402 | + type_info.max_val, |
| 403 | + sd_inner_to_outer_translation_map); |
| 404 | + if (outer_id == StringDictionary::INVALID_STR_ID) { |
| 405 | + continue; |
| 406 | + } |
| 407 | + elem = outer_id; |
| 408 | + } |
| 409 | + |
| 410 | + if (hashtable_filling_func(elem, curr_chunk_row_offset + elem_i)) { |
| 411 | + return -1; |
| 412 | + } |
| 413 | + } |
| 414 | + return 0; |
| 415 | +} |
| 416 | + |
| 417 | +template <typename HASHTABLE_FILLING_FUNC, ColumnType T> |
| 418 | +inline int raw_func(const tbb::blocked_range<size_t>& elems_range, |
| 419 | + const int8_t* chunk_mem_ptr, |
| 420 | + size_t curr_chunk_row_offset, |
| 421 | + const JoinColumnTypeInfo& type_info, |
| 422 | + const int32_t* sd_inner_to_outer_translation_map, |
| 423 | + const int32_t min_inner_elem, |
| 424 | + HASHTABLE_FILLING_FUNC hashtable_filling_func) { |
| 425 | + switch (type_info.elem_sz) { |
| 426 | + case 1: |
| 427 | + return raw_func_impl<HASHTABLE_FILLING_FUNC, T, 1>( |
| 428 | + elems_range, |
| 429 | + chunk_mem_ptr, |
| 430 | + curr_chunk_row_offset, |
| 431 | + type_info, |
| 432 | + sd_inner_to_outer_translation_map, |
| 433 | + min_inner_elem, |
| 434 | + hashtable_filling_func); |
| 435 | + case 2: |
| 436 | + return raw_func_impl<HASHTABLE_FILLING_FUNC, T, 2>( |
| 437 | + elems_range, |
| 438 | + chunk_mem_ptr, |
| 439 | + curr_chunk_row_offset, |
| 440 | + type_info, |
| 441 | + sd_inner_to_outer_translation_map, |
| 442 | + min_inner_elem, |
| 443 | + hashtable_filling_func); |
| 444 | + case 4: |
| 445 | + return raw_func_impl<HASHTABLE_FILLING_FUNC, T, 4>( |
| 446 | + elems_range, |
| 447 | + chunk_mem_ptr, |
| 448 | + curr_chunk_row_offset, |
| 449 | + type_info, |
| 450 | + sd_inner_to_outer_translation_map, |
| 451 | + min_inner_elem, |
| 452 | + hashtable_filling_func); |
| 453 | + case 8: |
| 454 | + return raw_func_impl<HASHTABLE_FILLING_FUNC, T, 8>( |
| 455 | + elems_range, |
| 456 | + chunk_mem_ptr, |
| 457 | + curr_chunk_row_offset, |
| 458 | + type_info, |
| 459 | + sd_inner_to_outer_translation_map, |
| 460 | + min_inner_elem, |
| 461 | + hashtable_filling_func); |
| 462 | + default: |
| 463 | + break; |
| 464 | + } |
| 465 | + UNREACHABLE(); |
| 466 | + return 0; |
| 467 | +} |
| 468 | + |
| 469 | +template <typename HASHTABLE_FILLING_FUNC> |
| 470 | +inline int raw_func(const tbb::blocked_range<size_t>& elems_range, |
| 471 | + const int8_t* chunk_mem_ptr, |
| 472 | + size_t curr_chunk_row_offset, |
| 473 | + const JoinColumnTypeInfo& type_info, |
| 474 | + const int32_t* sd_inner_to_outer_translation_map, |
| 475 | + const int32_t min_inner_elem, |
| 476 | + HASHTABLE_FILLING_FUNC hashtable_filling_func) { |
| 477 | + switch (type_info.column_type) { |
| 478 | + case SmallDate: |
| 479 | + return raw_func<HASHTABLE_FILLING_FUNC, SmallDate>( |
| 480 | + elems_range, |
| 481 | + chunk_mem_ptr, |
| 482 | + curr_chunk_row_offset, |
| 483 | + type_info, |
| 484 | + sd_inner_to_outer_translation_map, |
| 485 | + min_inner_elem, |
| 486 | + hashtable_filling_func); |
| 487 | + case Signed: |
| 488 | + return raw_func<HASHTABLE_FILLING_FUNC, Signed>(elems_range, |
| 489 | + chunk_mem_ptr, |
| 490 | + curr_chunk_row_offset, |
| 491 | + type_info, |
| 492 | + sd_inner_to_outer_translation_map, |
| 493 | + min_inner_elem, |
| 494 | + hashtable_filling_func); |
| 495 | + case Unsigned: |
| 496 | + return raw_func<HASHTABLE_FILLING_FUNC, Unsigned>(elems_range, |
| 497 | + chunk_mem_ptr, |
| 498 | + curr_chunk_row_offset, |
| 499 | + type_info, |
| 500 | + sd_inner_to_outer_translation_map, |
| 501 | + min_inner_elem, |
| 502 | + hashtable_filling_func); |
| 503 | + case Double: |
| 504 | + return raw_func<HASHTABLE_FILLING_FUNC, Double>(elems_range, |
| 505 | + chunk_mem_ptr, |
| 506 | + curr_chunk_row_offset, |
| 507 | + type_info, |
| 508 | + sd_inner_to_outer_translation_map, |
| 509 | + min_inner_elem, |
| 510 | + hashtable_filling_func); |
| 511 | + default: |
| 512 | + break; |
| 513 | + } |
| 514 | + UNREACHABLE(); |
| 515 | + return 0; |
| 516 | +} |
| 517 | + |
| 518 | +} // namespace |
| 519 | + |
| 520 | +DEVICE int SUFFIX(fill_hash_join_buff_bucketized_cpu)( |
| 521 | + int32_t* cpu_hash_table_buff, |
| 522 | + const int32_t hash_join_invalid_val, |
| 523 | + const bool for_semi_join, |
| 524 | + const JoinColumn& join_column, |
| 525 | + const JoinColumnTypeInfo& type_info, |
| 526 | + const int32_t* sd_inner_to_outer_translation_map, |
| 527 | + const int32_t min_inner_elem, |
| 528 | + const int64_t bucket_normalization) { |
| 529 | + auto filling_func = for_semi_join ? SUFFIX(fill_hashtable_for_semi_join) |
| 530 | + : SUFFIX(fill_one_to_one_hashtable); |
| 531 | + auto hashtable_filling_func = [&](int64_t elem, size_t index) { |
| 532 | + auto entry_ptr = SUFFIX(get_bucketized_hash_slot)( |
| 533 | + cpu_hash_table_buff, elem, type_info.min_val, bucket_normalization); |
| 534 | + return filling_func(index, entry_ptr, hash_join_invalid_val); |
| 535 | + }; |
| 536 | + |
| 537 | + // for some stupid reason int8* ptr is actually JoinChunk* Why? |
| 538 | + auto join_chunk_array = |
| 539 | + reinterpret_cast<const struct JoinChunk*>(join_column.col_chunks_buff); |
| 540 | + // BTW it's vector with sz: |
| 541 | + // join_column.num_chunks |
| 542 | + // const int8_t* chunk_mem_ptr = join_chunk_array->col_buff; |
| 543 | + |
| 544 | + // wtf 1 chunk, but 0 elements. |
| 545 | + if (join_column.num_elems == 0) { |
| 546 | + return 0; |
| 547 | + } |
| 548 | + |
| 549 | + // This value is tuned to make range of elemnts |
| 550 | + // handled in each thread spend about 10ms according to timers. |
| 551 | + size_t data_to_handle_sz = 512 * 1024; |
| 552 | + size_t granularity = data_to_handle_sz / type_info.elem_sz; |
| 553 | + |
| 554 | + std::atomic<int> err{0}; |
| 555 | + // LOG(ERROR) << "Num chunks: " << join_column.num_chunks; |
| 556 | + tbb::parallel_for( |
| 557 | + tbb::blocked_range<size_t>(0, join_column.num_chunks), |
| 558 | + [&](const tbb::blocked_range<size_t>& join_chunks_range) { |
| 559 | + DEBUG_TIMER("fill_hash_join_buff_bucketized_cpu chunk"); |
| 560 | + for (size_t chunk_i = join_chunks_range.begin(); |
| 561 | + chunk_i != join_chunks_range.end(); |
| 562 | + chunk_i++) { |
| 563 | + auto curr_chunk = join_chunk_array[chunk_i]; |
| 564 | + // LOG(ERROR) << " num elems: " << curr_chunk.num_elems; |
| 565 | + |
| 566 | + tbb::parallel_for( |
| 567 | + tbb::blocked_range<size_t>(0, curr_chunk.num_elems, granularity), |
| 568 | + [&](const tbb::blocked_range<size_t>& curr_chnunk_elems_range) { |
| 569 | + auto ret = raw_func(curr_chnunk_elems_range, |
| 570 | + curr_chunk.col_buff, |
| 571 | + curr_chunk.row_id, |
| 572 | + type_info, |
| 573 | + sd_inner_to_outer_translation_map, |
| 574 | + min_inner_elem, |
| 575 | + hashtable_filling_func); |
| 576 | + if (ret != 0) { |
| 577 | + int zero{0}; |
| 578 | + err.compare_exchange_strong(zero, ret); |
| 579 | + } |
| 580 | + }); |
| 581 | + } |
| 582 | + }); |
| 583 | + if (err) { |
| 584 | + return -1; |
| 585 | + } |
| 586 | + return 0; |
| 587 | +} |
| 588 | +#endif |
| 589 | + |
326 | 590 | DEVICE int SUFFIX(fill_hash_join_buff)(int32_t* buff,
|
327 | 591 | const int32_t invalid_slot_val,
|
328 | 592 | const bool for_semi_join,
|
|
0 commit comments