From 8aaea5d6eb8e918bb876b3b2374e9f847087ce6d Mon Sep 17 00:00:00 2001 From: Matthew Turner Date: Thu, 15 Feb 2024 11:30:59 -0500 Subject: [PATCH] Dont call multiunzip when no stats (#9220) * Dont call multiunzip when no stats * Update docstring --- datafusion/core/benches/sql_planner.rs | 2 +- datafusion/core/src/datasource/listing/table.rs | 9 +++++++-- datafusion/core/src/datasource/statistics.rs | 10 ++++++++-- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/datafusion/core/benches/sql_planner.rs b/datafusion/core/benches/sql_planner.rs index 4615d0a0f55c..6f54054530da 100644 --- a/datafusion/core/benches/sql_planner.rs +++ b/datafusion/core/benches/sql_planner.rs @@ -234,7 +234,7 @@ fn criterion_benchmark(c: &mut Criterion) { let sql = std::fs::read_to_string(format!("../../benchmarks/queries/{}.sql", q)) .unwrap(); c.bench_function(&format!("physical_plan_tpch_{}", q), |b| { - b.iter(|| logical_plan(&ctx, &sql)) + b.iter(|| physical_plan(&ctx, &sql)) }); } diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index 094b26bfbd99..56e64f556c12 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -880,8 +880,13 @@ impl ListingTable { .boxed() .buffered(ctx.config_options().execution.meta_fetch_concurrency); - let (files, statistics) = - get_statistics_with_limit(files, self.schema(), limit).await?; + let (files, statistics) = get_statistics_with_limit( + files, + self.schema(), + limit, + self.options.collect_stat, + ) + .await?; Ok(( split_files(files, self.options.target_partitions), diff --git a/datafusion/core/src/datasource/statistics.rs b/datafusion/core/src/datasource/statistics.rs index 73896f8eb7c1..c67227f966a2 100644 --- a/datafusion/core/src/datasource/statistics.rs +++ b/datafusion/core/src/datasource/statistics.rs @@ -29,12 +29,15 @@ use itertools::izip; use itertools::multiunzip; /// Get all files as well as the file level summary statistics (no statistic for partition columns). -/// If the optional `limit` is provided, includes only sufficient files. -/// Needed to read up to `limit` number of rows. +/// If the optional `limit` is provided, includes only sufficient files. Needed to read up to +/// `limit` number of rows. `collect_stats` is passed down from the configuration parameter on +/// `ListingTable`. If it is false we only construct bare statistics and skip a potentially expensive +/// call to `multiunzip` for constructing file level summary statistics. pub async fn get_statistics_with_limit( all_files: impl Stream>, file_schema: SchemaRef, limit: Option, + collect_stats: bool, ) -> Result<(Vec, Statistics)> { let mut result_files = vec![]; // These statistics can be calculated as long as at least one file provides @@ -78,6 +81,9 @@ pub async fn get_statistics_with_limit( while let Some(current) = all_files.next().await { let (file, file_stats) = current?; result_files.push(file); + if !collect_stats { + continue; + } // We accumulate the number of rows, total byte size and null // counts across all the files in question. If any file does not