Skip to content

Commit

Permalink
Fix minor details
Browse files Browse the repository at this point in the history
Signed-off-by: Yan Feng <[email protected]>
  • Loading branch information
ustcfy committed Nov 22, 2024
1 parent 90a4aae commit ccdd64b
Showing 1 changed file with 22 additions and 19 deletions.
41 changes: 22 additions & 19 deletions src/main/cpp/src/hive_hash.cu
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ using hive_hash_value_t = int32_t;
constexpr hive_hash_value_t HIVE_HASH_FACTOR = 31;
constexpr hive_hash_value_t HIVE_INIT_HASH = 0;

constexpr int MAX_NESTED_DEPTH = 8;

hive_hash_value_t __device__ inline compute_int(int32_t key) { return key; }

hive_hash_value_t __device__ inline compute_long(int64_t key)
Expand Down Expand Up @@ -157,7 +159,7 @@ hive_hash_value_t __device__ inline hive_hash_function<cudf::timestamp_us>::oper
* @tparam hash_function Hash functor to use for hashing elements. Must be hive_hash_function.
* @tparam Nullate A cudf::nullate type describing whether to check for nulls.
*/
template <template <typename> class hash_function, typename Nullate, int MAX_NESTED_DEPTH>
template <template <typename> class hash_function, typename Nullate>
class hive_device_row_hasher {
public:
CUDF_HOST_DEVICE hive_device_row_hasher(Nullate check_nulls, cudf::table_device_view t) noexcept
Expand Down Expand Up @@ -221,8 +223,8 @@ class hive_device_row_hasher {
__device__ col_stack_frame() =
delete; // Because the default constructor of `cudf::column_device_view` is deleted

__device__ col_stack_frame(cudf::column_device_view element)
: _element(element), _idx_to_process(0), _cur_hash(HIVE_INIT_HASH)
__device__ col_stack_frame(cudf::column_device_view col)
: _column(col), _idx_to_process(0), _cur_hash(HIVE_INIT_HASH)
{
}

Expand All @@ -240,7 +242,6 @@ class hive_device_row_hasher {
__device__ cudf::column_device_view get_column() { return _column; }
};


/**
* @brief Functor to compute the hive hash value for a nested column.
*
Expand All @@ -255,8 +256,8 @@ class hive_device_row_hasher {
* return hash;
* }
*
* In the cases of lists, the hash value is computed by a similar way but we iterate through the list
* elements instead of through the child columns' elements.
* In the cases of lists, the hash value is computed by a similar way but we iterate through the
* list elements instead of through the child columns' elements.
*
* This functor uses a stack to simulate the recursive process of the above pseudocode.
* When an element is popped from the stack, it means that the hash value of it has been
Expand All @@ -271,7 +272,8 @@ class hive_device_row_hasher {
* i. If all child columns are processed, pop the element and update `cur_hash` of its
* parent column.
* ii. Otherwise, push the next child column into the stack.
* c. If the column is a lists column, process it by a similar way as structs column but iterating through the list elements instead of child columns' elements.
* c. If the column is a lists column, process it by a similar way as structs column but
* iterating through the list elements instead of child columns' elements.
* d. If the column is a primitive column, compute the hash value, pop the element,
* and update `cur_hash` of its parent column.
* 3. Return the hash value of the root column.
Expand All @@ -296,8 +298,10 @@ class hive_device_row_hasher {
* As lists columns have a different interface from structs columns, we need to handle them
* separately.
*
* For example, consider the following nested column: `List<List<int>>`
* [[1, 0], null, [2, null]]
* For example, consider that the input column is of type `List<List<int>>`.
* Assume that the element at `row_index` is: [[1, 0], null, [2, null]]
* Since the stack_frame should contain a column that consists of only one row, the input column
* should be sliced. The sliced column is noted as L1 here.
*
* L1
* |
Expand All @@ -323,14 +327,14 @@ class hive_device_row_hasher {
* null_mask: 101
*
* Int level i:
* |Index|Int |
* |Index|int |
* |-----|----|
* |0 |1 |
* |1 |0 |
* |2 |2 |
* |3 |null|
* length: 4
* null_mask: 0111
* null_mask: 1110
*
* Since the underlying data loses the null information of the top-level list column, computing
* hash values using the underlying data merely can yield different results compared to Spark.
Expand Down Expand Up @@ -371,13 +375,13 @@ class hive_device_row_hasher {
// Instead leverage the byte array to create the col_stack_frame array.
alignas(col_stack_frame) char stack_wrapper[sizeof(col_stack_frame) * MAX_NESTED_DEPTH];
auto col_stack = reinterpret_cast<col_stack_frame*>(stack_wrapper);
int stack_size = 0;
int stack_size = 0;

col_stack[stack_size++] = col_stack_frame(curr_col);

while (stack_size > 0) {
col_stack_frame& top = col_stack[stack_size - 1];
curr_col = top.get_element();
curr_col = top.get_column();
// Do not pop it until it is processed. The definition of `processed` is:
// - For structs, it is when all child columns are processed.
// - For lists, it is when all elements in the list are processed.
Expand Down Expand Up @@ -437,7 +441,7 @@ class hive_device_row_hasher {
cudf::table_device_view const _table;
};

void check_nested_depth(cudf::table_view const& input, int max_nested_depth)
void check_nested_depth(cudf::table_view const& input)
{
using column_checker_fn_t = std::function<int(cudf::column_view const&)>;

Expand All @@ -462,11 +466,11 @@ void check_nested_depth(cudf::table_view const& input, int max_nested_depth)

for (auto i = 0; i < input.num_columns(); i++) {
cudf::column_view const& col = input.column(i);
CUDF_EXPECTS(get_nested_depth(col) <= max_nested_depth,
CUDF_EXPECTS(get_nested_depth(col) <= MAX_NESTED_DEPTH,
"The " + std::to_string(i) +
"-th column exceeds the maximum allowed nested depth. " +
"Current depth: " + std::to_string(get_nested_depth(col)) + ", " +
"Maximum allowed depth: " + std::to_string(max_nested_depth));
"Maximum allowed depth: " + std::to_string(MAX_NESTED_DEPTH));
}
}

Expand All @@ -485,8 +489,7 @@ std::unique_ptr<cudf::column> hive_hash(cudf::table_view const& input,
// Return early if there's nothing to hash
if (input.num_columns() == 0 || input.num_rows() == 0) { return output; }

constexpr int MAX_NESTED_DEPTH = 8;
check_nested_depth(input, max_nested_depth);
check_nested_depth(input);

bool const nullable = has_nested_nulls(input);
auto const input_view = cudf::table_device_view::create(input, stream);
Expand All @@ -497,7 +500,7 @@ std::unique_ptr<cudf::column> hive_hash(cudf::table_view const& input,
rmm::exec_policy(stream),
output_view.begin<hive_hash_value_t>(),
output_view.end<hive_hash_value_t>(),
hive_device_row_hasher<hive_hash_function, bool, max_nested_depth>(nullable, *input_view));
hive_device_row_hasher<hive_hash_function, bool>(nullable, *input_view));

return output;
}
Expand Down

0 comments on commit ccdd64b

Please sign in to comment.