From 2837a45e05f97f0a63c8ca45e57057c4d2a52ee2 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 19 Dec 2024 19:08:21 -0800 Subject: [PATCH] Simplify expression transformer in Parquet predicate pushdown with `ast::tree` (#17587) This PR simplifies the StatsAST expression transformer in Parquet reader's predicate pushdown using `ast::tree` from (https://github.com/rapidsai/cudf/pull/17156). This PR is a follow up to @bdice's comment at https://github.com/rapidsai/cudf/pull/17289#discussion_r1876966752. Similar changes for the `BloomfilterAST` expression converter have been incorporated in the PR https://github.com/rapidsai/cudf/pull/17289. Related to https://github.com/rapidsai/cudf/issues/17164 Authors: - Muhammad Haseeb (https://github.com/mhaseeb123) Approvers: - Karthikeyan (https://github.com/karthikeyann) - Vukasin Milovanovic (https://github.com/vuule) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17587 --- cpp/src/io/parquet/predicate_pushdown.cpp | 51 +++++++++++------------ 1 file changed, 24 insertions(+), 27 deletions(-) diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index b0cbabf1c12..9047ff9169b 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -265,7 +265,6 @@ class stats_expression_converter : public ast::detail::expression_transformer { */ std::reference_wrapper visit(ast::literal const& expr) override { - _stats_expr = std::reference_wrapper(expr); return expr; } @@ -278,7 +277,6 @@ class stats_expression_converter : public ast::detail::expression_transformer { "Statistics AST supports only left table"); CUDF_EXPECTS(expr.get_column_index() < _num_columns, "Column index cannot be more than number of columns in the table"); - _stats_expr = std::reference_wrapper(expr); return expr; } @@ -307,6 +305,9 @@ class stats_expression_converter : public ast::detail::expression_transformer { CUDF_EXPECTS(dynamic_cast(&operands[1].get()) != nullptr, "Second operand of binary operation with column reference must be a literal"); v->accept(*this); + // Push literal into the ast::tree + auto const& literal = + _stats_expr.push(*dynamic_cast(&operands[1].get())); auto const col_index = v->get_column_index(); switch (op) { /* transform to stats conditions. op(col, literal) @@ -318,34 +319,33 @@ class stats_expression_converter : public ast::detail::expression_transformer { col1 <= val --> vmin <= val */ case ast_operator::EQUAL: { - auto const& vmin = _col_ref.emplace_back(col_index * 2); - auto const& vmax = _col_ref.emplace_back(col_index * 2 + 1); - auto const& op1 = - _operators.emplace_back(ast_operator::LESS_EQUAL, vmin, operands[1].get()); - auto const& op2 = - _operators.emplace_back(ast_operator::GREATER_EQUAL, vmax, operands[1].get()); - _operators.emplace_back(ast::ast_operator::LOGICAL_AND, op1, op2); + auto const& vmin = _stats_expr.push(ast::column_reference{col_index * 2}); + auto const& vmax = _stats_expr.push(ast::column_reference{col_index * 2 + 1}); + _stats_expr.push(ast::operation{ + ast::ast_operator::LOGICAL_AND, + _stats_expr.push(ast::operation{ast_operator::GREATER_EQUAL, vmax, literal}), + _stats_expr.push(ast::operation{ast_operator::LESS_EQUAL, vmin, literal})}); break; } case ast_operator::NOT_EQUAL: { - auto const& vmin = _col_ref.emplace_back(col_index * 2); - auto const& vmax = _col_ref.emplace_back(col_index * 2 + 1); - auto const& op1 = _operators.emplace_back(ast_operator::NOT_EQUAL, vmin, vmax); - auto const& op2 = - _operators.emplace_back(ast_operator::NOT_EQUAL, vmax, operands[1].get()); - _operators.emplace_back(ast_operator::LOGICAL_OR, op1, op2); + auto const& vmin = _stats_expr.push(ast::column_reference{col_index * 2}); + auto const& vmax = _stats_expr.push(ast::column_reference{col_index * 2 + 1}); + _stats_expr.push(ast::operation{ + ast_operator::LOGICAL_OR, + _stats_expr.push(ast::operation{ast_operator::NOT_EQUAL, vmin, vmax}), + _stats_expr.push(ast::operation{ast_operator::NOT_EQUAL, vmax, literal})}); break; } case ast_operator::LESS: [[fallthrough]]; case ast_operator::LESS_EQUAL: { - auto const& vmin = _col_ref.emplace_back(col_index * 2); - _operators.emplace_back(op, vmin, operands[1].get()); + auto const& vmin = _stats_expr.push(ast::column_reference{col_index * 2}); + _stats_expr.push(ast::operation{op, vmin, literal}); break; } case ast_operator::GREATER: [[fallthrough]]; case ast_operator::GREATER_EQUAL: { - auto const& vmax = _col_ref.emplace_back(col_index * 2 + 1); - _operators.emplace_back(op, vmax, operands[1].get()); + auto const& vmax = _stats_expr.push(ast::column_reference{col_index * 2 + 1}); + _stats_expr.push(ast::operation{op, vmax, literal}); break; } default: CUDF_FAIL("Unsupported operation in Statistics AST"); @@ -353,13 +353,12 @@ class stats_expression_converter : public ast::detail::expression_transformer { } else { auto new_operands = visit_operands(operands); if (cudf::ast::detail::ast_operator_arity(op) == 2) { - _operators.emplace_back(op, new_operands.front(), new_operands.back()); + _stats_expr.push(ast::operation{op, new_operands.front(), new_operands.back()}); } else if (cudf::ast::detail::ast_operator_arity(op) == 1) { - _operators.emplace_back(op, new_operands.front()); + _stats_expr.push(ast::operation{op, new_operands.front()}); } } - _stats_expr = std::reference_wrapper(_operators.back()); - return std::reference_wrapper(_operators.back()); + return _stats_expr.back(); } /** @@ -369,7 +368,7 @@ class stats_expression_converter : public ast::detail::expression_transformer { */ [[nodiscard]] std::reference_wrapper get_stats_expr() const { - return _stats_expr.value().get(); + return _stats_expr.back(); } private: @@ -383,10 +382,8 @@ class stats_expression_converter : public ast::detail::expression_transformer { } return transformed_operands; } - std::optional> _stats_expr; + ast::tree _stats_expr; size_type _num_columns; - std::list _col_ref; - std::list _operators; }; } // namespace