From b7a443a16420a5fb400297d05ee59c2122c9de6f Mon Sep 17 00:00:00 2001 From: kamille Date: Sun, 1 Sep 2024 22:19:01 +0800 Subject: [PATCH] improve docs. --- .../physical-plan/src/aggregates/row_hash.rs | 57 +++++++------------ 1 file changed, 20 insertions(+), 37 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/row_hash.rs b/datafusion/physical-plan/src/aggregates/row_hash.rs index 91325fc9bb79..edb15ae3cb81 100644 --- a/datafusion/physical-plan/src/aggregates/row_hash.rs +++ b/datafusion/physical-plan/src/aggregates/row_hash.rs @@ -274,28 +274,6 @@ impl SkipAggregationProbe { /// The accumulator state is not managed by this operator (e.g in the /// hash table). /// -/// An important optimization for [`group_values`] and [`accumulators`] -/// is to manage values using the blocked approach. -/// -/// In the original method, values are managed within a single large block -/// (can think of it as a Vec). As this block grows, it often triggers numerous -/// copies, resulting in poor performance. -/// -/// In contrast, the blocked approach allocates capacity for the block -/// based on a predefined block size firstly. -/// And when the block reaches its limit, we allocate a new block -/// (also with the same predefined block size based capacity) -/// instead of expanding the current one and copying the data. -/// This method eliminates unnecessary copies and significantly improves performance. -/// For a nice introduction to the blocked approach, maybe you can see [#7065]. -/// -/// The conditions that trigger the blocked mode can be found in -/// [`maybe_enable_blocked_group_states`]. 
-/// -/// [`group_values`]: Self::group_values -/// [`accumulators`]: Self::accumulators -/// [#7065]: https://github.com/apache/datafusion/issues/7065 -/// /// # Partial Aggregate and multi-phase grouping /// /// As described on [`Accumulator::state`], this operator is used in the context @@ -364,24 +342,29 @@ impl SkipAggregationProbe { /// │ 2 │ 2 │ 3.0 │ │ 2 │ 2 │ 3.0 │ └────────────┘ /// └─────────────────┘ └─────────────────┘ /// ``` +/// +/// # Blocked approach for intermediate values +/// An important optimization for [`group_values`] and [`accumulators`] +/// is to manage values using the blocked approach. /// -/// # Partial Aggregate and multi-phase grouping -/// -/// As described on [`Accumulator::state`], this operator is used in the context -/// "multi-phase" grouping when the mode is [`AggregateMode::Partial`]. -/// -/// An important optimization for multi-phase partial aggregation is to skip -/// partial aggregation when it is not effective enough to warrant the memory or -/// CPU cost, as is often the case for queries many distinct groups (high -/// cardinality group by). Memory is particularly important because each Partial -/// aggregator must store the intermediate state for each group. +/// In the original method, values are managed within a single large block +/// (think of it as a `Vec`). As this block grows, it often triggers numerous +/// copies, resulting in poor performance. /// -/// If the ratio of the number of groups to the number of input rows exceeds a -/// threshold, and [`GroupsAccumulator::supports_convert_to_state`] is -/// supported, this operator will stop applying Partial aggregation and directly -/// pass the input rows to the next aggregation phase. +/// In contrast, the blocked approach allocates capacity for the block +/// based on a predefined block size up front. 
+/// When the block reaches its limit, we allocate a new block +/// (with capacity based on the same predefined block size) +/// instead of expanding the current one and copying the data. +/// This method eliminates unnecessary copies and significantly improves performance. +/// For a nice introduction to the blocked approach, see [#7065]. /// -/// [`Accumulator::state`]: datafusion_expr::Accumulator::state +/// The conditions that trigger the blocked mode can be found in +/// [`maybe_enable_blocked_group_states`]. +/// +/// [`group_values`]: Self::group_values +/// [`accumulators`]: Self::accumulators +/// [#7065]: https://github.com/apache/datafusion/issues/7065 pub(crate) struct GroupedHashAggregateStream { // ======================================================================== // PROPERTIES: