From a9ec65aa3b2871a695ea6b52b8929552d09b2304 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 14 Nov 2023 15:32:30 -0500 Subject: [PATCH 1/3] Minor: add apply_filter to Precision --- datafusion/common/src/stats.rs | 7 ++++++ datafusion/physical-plan/src/filter.rs | 31 ++++++-------------------- 2 files changed, 14 insertions(+), 24 deletions(-) diff --git a/datafusion/common/src/stats.rs b/datafusion/common/src/stats.rs index 1c7a4fd4d553..d04365385f6d 100644 --- a/datafusion/common/src/stats.rs +++ b/datafusion/common/src/stats.rs @@ -151,6 +151,13 @@ impl Precision { (_, _) => Precision::Absent, } } + + /// Return the estimate of applying a filter of selectivity `selectivity` to + /// this Precision. A selectivity of `1.0` means that all rows are selected. + /// A selectivity of `0.5` means half the rows are selected. + pub fn apply_filter(self, selectivity: f64) -> Self { + self.map(|v| ((v as f64 * selectivity).ceil()) as usize) + } } impl Precision { diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs index 822ddfdf3eb0..acfbf37ce0af 100644 --- a/datafusion/physical-plan/src/filter.rs +++ b/datafusion/physical-plan/src/filter.rs @@ -200,15 +200,10 @@ impl ExecutionPlan for FilterExec { // assume filter selects 20% of rows if we cannot do anything smarter // tracking issue for making this configurable: // https://github.com/apache/arrow-datafusion/issues/8133 - let selectivity = 0.2_f32; + let selectivity = 0.2_f64; let mut stats = input_stats.clone().into_inexact(); - if let Precision::Inexact(n) = stats.num_rows { - stats.num_rows = Precision::Inexact((selectivity * n as f32) as usize); - } - if let Precision::Inexact(n) = stats.total_byte_size { - stats.total_byte_size = - Precision::Inexact((selectivity * n as f32) as usize); - } + stats.num_rows = stats.num_rows.apply_filter(selectivity); + stats.total_byte_size = stats.total_byte_size.apply_filter(selectivity); return Ok(stats); } @@ -222,14 +217,8 @@ impl ExecutionPlan for FilterExec { // Estimate (inexact) selectivity of predicate let selectivity = analysis_ctx.selectivity.unwrap_or(1.0); - let num_rows = match num_rows.get_value() { - Some(nr) => Precision::Inexact((*nr as f64 * selectivity).ceil() as usize), - None => Precision::Absent, - }; - let total_byte_size = match total_byte_size.get_value() { - Some(tbs) => Precision::Inexact((*tbs as f64 * selectivity).ceil() as usize), - None => Precision::Absent, - }; + let num_rows = num_rows.apply_filter(selectivity); + let total_byte_size = total_byte_size.apply_filter(selectivity); let column_statistics = collect_new_statistics( &input_stats.column_statistics, @@ -277,16 +266,10 @@ fn collect_new_statistics( ) }; ColumnStatistics { - null_count: match input_column_stats[idx].null_count.get_value() { - Some(nc) => Precision::Inexact(*nc), - None => Precision::Absent, - }, + null_count: input_column_stats[idx].null_count.clone().to_inexact(), max_value, min_value, - distinct_count: match distinct_count.get_value() { - Some(dc) => Precision::Inexact(*dc), - None => Precision::Absent, - }, + distinct_count: distinct_count.to_inexact(), } }, ) From 6f04551546ebe1869c8595c0469d350f2d7d1b15 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 14 Nov 2023 16:23:20 -0500 Subject: [PATCH 2/3] fix: use inexact --- datafusion/common/src/stats.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/datafusion/common/src/stats.rs b/datafusion/common/src/stats.rs index d04365385f6d..e00ec9703056 100644 --- a/datafusion/common/src/stats.rs +++ b/datafusion/common/src/stats.rs @@ -152,11 +152,13 @@ impl Precision { } } - /// Return the estimate of applying a filter of selectivity `selectivity` to - /// this Precision. A selectivity of `1.0` means that all rows are selected. - /// A selectivity of `0.5` means half the rows are selected. + /// Return the estimate of applying a filter with estimated selectivity + /// `selectivity` to this Precision. A selectivity of `1.0` means that all + /// rows are selected. A selectivity of `0.5` means half the rows are + /// selected. Will always return inexact statistics. pub fn apply_filter(self, selectivity: f64) -> Self { self.map(|v| ((v as f64 * selectivity).ceil()) as usize) + .to_inexact() } } From 434dbbf0d8afcc130b8cf5a82e6a831ae73cada8 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 16 Nov 2023 09:11:49 -0500 Subject: [PATCH 3/3] Rename to with_estimated_selectivity --- datafusion/common/src/stats.rs | 2 +- datafusion/physical-plan/src/filter.rs | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/datafusion/common/src/stats.rs b/datafusion/common/src/stats.rs index e00ec9703056..7ad8992ca9ae 100644 --- a/datafusion/common/src/stats.rs +++ b/datafusion/common/src/stats.rs @@ -156,7 +156,7 @@ impl Precision { /// `selectivity` to this Precision. A selectivity of `1.0` means that all /// rows are selected. A selectivity of `0.5` means half the rows are /// selected. Will always return inexact statistics. - pub fn apply_filter(self, selectivity: f64) -> Self { + pub fn with_estimated_selectivity(self, selectivity: f64) -> Self { self.map(|v| ((v as f64 * selectivity).ceil()) as usize) .to_inexact() } diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs index acfbf37ce0af..107c95eff7f1 100644 --- a/datafusion/physical-plan/src/filter.rs +++ b/datafusion/physical-plan/src/filter.rs @@ -202,8 +202,10 @@ impl ExecutionPlan for FilterExec { // https://github.com/apache/arrow-datafusion/issues/8133 let selectivity = 0.2_f64; let mut stats = input_stats.clone().into_inexact(); - stats.num_rows = stats.num_rows.apply_filter(selectivity); - stats.total_byte_size = stats.total_byte_size.apply_filter(selectivity); + stats.num_rows = stats.num_rows.with_estimated_selectivity(selectivity); + stats.total_byte_size = stats + .total_byte_size + .with_estimated_selectivity(selectivity); return Ok(stats); } @@ -217,8 +219,8 @@ impl ExecutionPlan for FilterExec { // Estimate (inexact) selectivity of predicate let selectivity = analysis_ctx.selectivity.unwrap_or(1.0); - let num_rows = num_rows.apply_filter(selectivity); - let total_byte_size = total_byte_size.apply_filter(selectivity); + let num_rows = num_rows.with_estimated_selectivity(selectivity); + let total_byte_size = total_byte_size.with_estimated_selectivity(selectivity); let column_statistics = collect_new_statistics( &input_stats.column_statistics,