From 11542740a982d3aba36f43d93967a58bd9ab8d9b Mon Sep 17 00:00:00 2001 From: jokercurry <982458633@qq.com> Date: Tue, 12 Dec 2023 21:23:08 +0800 Subject: [PATCH 01/22] Add `today` alias for `current_date` (#8423) * fix conflict * add md * add test * add test * addr comments * Update datafusion/sqllogictest/test_files/timestamps.slt Co-authored-by: Alex Huang --------- Co-authored-by: zhongjingxiong Co-authored-by: Alex Huang --- datafusion/expr/src/built_in_function.rs | 2 +- .../sqllogictest/test_files/timestamps.slt | 24 +++++++++++++++++++ .../source/user-guide/sql/scalar_functions.md | 9 +++++++ 3 files changed, 34 insertions(+), 1 deletion(-) diff --git a/datafusion/expr/src/built_in_function.rs b/datafusion/expr/src/built_in_function.rs index 5a903a73adc6..fd899289ac82 100644 --- a/datafusion/expr/src/built_in_function.rs +++ b/datafusion/expr/src/built_in_function.rs @@ -1532,7 +1532,7 @@ impl BuiltinScalarFunction { // time/date functions BuiltinScalarFunction::Now => &["now"], - BuiltinScalarFunction::CurrentDate => &["current_date"], + BuiltinScalarFunction::CurrentDate => &["current_date", "today"], BuiltinScalarFunction::CurrentTime => &["current_time"], BuiltinScalarFunction::DateBin => &["date_bin"], BuiltinScalarFunction::DateTrunc => &["date_trunc", "datetrunc"], diff --git a/datafusion/sqllogictest/test_files/timestamps.slt b/datafusion/sqllogictest/test_files/timestamps.slt index 71b6ddf33f39..f956d59b1da0 100644 --- a/datafusion/sqllogictest/test_files/timestamps.slt +++ b/datafusion/sqllogictest/test_files/timestamps.slt @@ -46,6 +46,30 @@ statement ok create table ts_data_secs as select arrow_cast(ts / 1000000000, 'Timestamp(Second, None)') as ts, value from ts_data; +########## +## Current date Tests +########## + +query B +select cast(now() as date) = current_date(); +---- +true + +query B +select now() = current_date(); +---- +false + +query B +select current_date() = today(); +---- +true + +query B +select cast(now() as date) = today(); +---- +true + ########## ## Timestamp Handling Tests diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md index 9a9bec9df77b..ad4c6ed083bf 100644 --- a/docs/source/user-guide/sql/scalar_functions.md +++ b/docs/source/user-guide/sql/scalar_functions.md @@ -1280,6 +1280,7 @@ regexp_replace(str, regexp, replacement, flags) - [datepart](#datepart) - [extract](#extract) - [to_timestamp](#to_timestamp) +- [today](#today) - [to_timestamp_millis](#to_timestamp_millis) - [to_timestamp_micros](#to_timestamp_micros) - [to_timestamp_seconds](#to_timestamp_seconds) @@ -1308,6 +1309,14 @@ no matter when in the query plan the function executes. current_date() ``` +#### Aliases + +- today + +### `today` + +_Alias of [current_date](#current_date)._ + ### `current_time` Returns the current UTC time. 
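Note: with the alias above in place, `today()` is accepted anywhere `current_date()` is.
A minimal illustration (assuming a session with default settings), mirroring the
sqllogictest cases in this patch:

    SELECT current_date() = today();
    -- true: both resolve to the same built-in function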
From 2102275f64012ec6eff4dd9c72eb87cc76921d74 Mon Sep 17 00:00:00 2001
From: Alex Huang 
Date: Tue, 12 Dec 2023 14:30:29 +0100
Subject: [PATCH 02/22] remove useless clone (#8495)

---
 datafusion/physical-expr/src/array_expressions.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datafusion/physical-expr/src/array_expressions.rs b/datafusion/physical-expr/src/array_expressions.rs
index c2dc88b10773..7fa97dad7aa6 100644
--- a/datafusion/physical-expr/src/array_expressions.rs
+++ b/datafusion/physical-expr/src/array_expressions.rs
@@ -2138,7 +2138,7 @@ pub fn general_array_distinct(
     let mut offsets = Vec::with_capacity(array.len());
     offsets.push(OffsetSize::usize_as(0));
     let mut new_arrays = Vec::with_capacity(array.len());
-    let converter = RowConverter::new(vec![SortField::new(dt.clone())])?;
+    let converter = RowConverter::new(vec![SortField::new(dt)])?;
     // distinct for each list in ListArray
     for arr in array.iter().flatten() {
         let values = converter.convert_columns(&[arr])?;

From 7f312c8a1d82b19f03c640e4a5d7d6437b2ee835 Mon Sep 17 00:00:00 2001
From: Huaijin 
Date: Tue, 12 Dec 2023 21:36:23 +0800
Subject: [PATCH 03/22] fix: incorrect set preserve_partitioning in SortExec (#8485)

* fix: incorrect set preserve_partitioning in SortExec

* add more comment
---
 .../physical_optimizer/projection_pushdown.rs |  3 +-
 datafusion/sqllogictest/test_files/join.slt   | 37 +++++++++++++++++++
 2 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/datafusion/core/src/physical_optimizer/projection_pushdown.rs b/datafusion/core/src/physical_optimizer/projection_pushdown.rs
index 67a2eaf0d9b3..664afbe822ff 100644
--- a/datafusion/core/src/physical_optimizer/projection_pushdown.rs
+++ b/datafusion/core/src/physical_optimizer/projection_pushdown.rs
@@ -454,7 +454,8 @@ fn try_swapping_with_sort(

     Ok(Some(Arc::new(
         SortExec::new(updated_exprs, make_with_child(projection, sort.input())?)
-            .with_fetch(sort.fetch()),
+            .with_fetch(sort.fetch())
+            .with_preserve_partitioning(sort.preserve_partitioning()),
     )))
 }

diff --git a/datafusion/sqllogictest/test_files/join.slt b/datafusion/sqllogictest/test_files/join.slt
index 386ffe766b19..c9dd7ca604ad 100644
--- a/datafusion/sqllogictest/test_files/join.slt
+++ b/datafusion/sqllogictest/test_files/join.slt
@@ -594,3 +594,40 @@ drop table IF EXISTS full_join_test;
 # batch size
 statement ok
 set datafusion.execution.batch_size = 8192;
+
+# related to: https://github.com/apache/arrow-datafusion/issues/8374
+statement ok
+CREATE TABLE t1(a text, b int) AS VALUES ('Alice', 50), ('Alice', 100);
+
+statement ok
+CREATE TABLE t2(a text, b int) AS VALUES ('Alice', 2), ('Alice', 1);
+
+# the current query results are incorrect, because the query was incorrectly rewritten as:
+# SELECT t1.a, t1.b FROM t1 JOIN t2 ON t1.a = t2.a ORDER BY t1.a, t1.b;
+# the difference is that the ORDER BY clause was rewritten from t2.b to t1.b, which is incorrect.
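+# (ordering by t2.b, the joined rows should come out as the b=1 pair followed by
+# the b=2 pair, i.e. t1.b values 50, 100, 50, 100; ordering by t1.b instead
+# collapses them to 50, 50, 100, 100, which is what the first query below returns)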
+# after https://github.com/apache/arrow-datafusion/issues/8374 is fixed, the correct result should be:
+# Alice 50
+# Alice 100
+# Alice 50
+# Alice 100
+query TI
+SELECT t1.a, t1.b FROM t1 JOIN t2 ON t1.a = t2.a ORDER BY t1.a, t2.b;
+----
+Alice 50
+Alice 50
+Alice 100
+Alice 100
+
+query TITI
+SELECT t1.a, t1.b, t2.a, t2.b FROM t1 JOIN t2 ON t1.a = t2.a ORDER BY t1.a, t2.b;
+----
+Alice 50 Alice 1
+Alice 100 Alice 1
+Alice 50 Alice 2
+Alice 100 Alice 2
+
+statement ok
+DROP TABLE t1;
+
+statement ok
+DROP TABLE t2;

From 2919e32b8ce9e594f83a0a1268cc13a121a7963c Mon Sep 17 00:00:00 2001
From: Dennis Liu 
Date: Wed, 13 Dec 2023 05:01:07 +0800
Subject: [PATCH 04/22] Explicitly mark parquet for tests in datafusion-common (#8497)

* Add cfg for test and reference requiring parquet

* Check datafusion-common can be compiled without parquet

* Update rust.yml

* Remove unnecessary space
---
 .github/workflows/rust.yml                      | 3 +++
 datafusion/common/src/file_options/file_type.rs | 1 +
 datafusion/common/src/file_options/mod.rs       | 8 ++++++++
 datafusion/common/src/test_util.rs              | 1 +
 4 files changed, 13 insertions(+)

diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index 099aab061435..a541091e3a2b 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -68,6 +68,9 @@ jobs:
       - name: Check workspace without default features
         run: cargo check --no-default-features -p datafusion

+      - name: Check datafusion-common without default features
+        run: cargo check --tests --no-default-features -p datafusion-common
+
       - name: Check workspace in debug mode
         run: cargo check

diff --git a/datafusion/common/src/file_options/file_type.rs b/datafusion/common/src/file_options/file_type.rs
index a07f2e0cb847..b1d61b1a2567 100644
--- a/datafusion/common/src/file_options/file_type.rs
+++ b/datafusion/common/src/file_options/file_type.rs
@@ -109,6 +109,7 @@ mod tests {
     use std::str::FromStr;

     #[test]
+    #[cfg(feature = "parquet")]
     fn from_str() {
         for (ext, file_type) in [
             ("csv", FileType::CSV),
diff --git a/datafusion/common/src/file_options/mod.rs b/datafusion/common/src/file_options/mod.rs
index b7c1341e3046..f0e49dd85597 100644
--- a/datafusion/common/src/file_options/mod.rs
+++ b/datafusion/common/src/file_options/mod.rs
@@ -299,6 +299,7 @@ impl Display for FileTypeWriterOptions {
 mod tests {
     use std::collections::HashMap;

+    #[cfg(feature = "parquet")]
     use parquet::{
         basic::{Compression, Encoding, ZstdLevel},
         file::properties::{EnabledStatistics, WriterVersion},
@@ -313,9 +314,11 @@ mod tests {

     use crate::Result;

+    #[cfg(feature = "parquet")]
     use super::{parquet_writer::ParquetWriterOptions, StatementOptions};

     #[test]
+    #[cfg(feature = "parquet")]
     fn test_writeroptions_parquet_from_statement_options() -> Result<()> {
         let mut option_map: HashMap<String, String> = HashMap::new();
         option_map.insert("max_row_group_size".to_owned(), "123".to_owned());
@@ -386,6 +389,7 @@ mod tests {
     }

     #[test]
+    #[cfg(feature = "parquet")]
     fn test_writeroptions_parquet_column_specific() -> Result<()> {
         let mut option_map: HashMap<String, String> = HashMap::new();

@@ -506,6 +510,8 @@ mod tests {
     }

     #[test]
+    // for StatementOptions
+    #[cfg(feature = "parquet")]
     fn test_writeroptions_csv_from_statement_options() -> Result<()> {
         let mut option_map: HashMap<String, String> = HashMap::new();
         option_map.insert("header".to_owned(), "true".to_owned());
@@ -533,6 +539,8 @@ mod tests {
     }

     #[test]
+    // for StatementOptions
+    #[cfg(feature = "parquet")]
     fn test_writeroptions_json_from_statement_options() -> Result<()> {
         let mut option_map: HashMap<String, String> = HashMap::new();
option_map.insert("compression".to_owned(), "gzip".to_owned()); diff --git a/datafusion/common/src/test_util.rs b/datafusion/common/src/test_util.rs index 9a4433782157..eeace97eebfa 100644 --- a/datafusion/common/src/test_util.rs +++ b/datafusion/common/src/test_util.rs @@ -285,6 +285,7 @@ mod tests { } #[test] + #[cfg(feature = "parquet")] fn test_happy() { let res = arrow_test_data(); assert!(PathBuf::from(res).is_dir()); From 861cc36eb66243079e0171637e65adccd5956c94 Mon Sep 17 00:00:00 2001 From: Devin D'Angelo Date: Tue, 12 Dec 2023 16:02:53 -0500 Subject: [PATCH 05/22] Minor/Doc: Clarify DataFrame::write_table Documentation (#8519) * update write_table doc * fmt --- datafusion/core/src/dataframe/mod.rs | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index c40dd522a457..3e286110f7f9 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -1013,11 +1013,16 @@ impl DataFrame { )) } - /// Write this DataFrame to the referenced table + /// Write this DataFrame to the referenced table by name. /// This method uses on the same underlying implementation - /// as the SQL Insert Into statement. - /// Unlike most other DataFrame methods, this method executes - /// eagerly, writing data, and returning the count of rows written. + /// as the SQL Insert Into statement. Unlike most other DataFrame methods, + /// this method executes eagerly. Data is written to the table using an + /// execution plan returned by the [TableProvider]'s insert_into method. + /// Refer to the documentation of the specific [TableProvider] to determine + /// the expected data returned by the insert_into plan via this method. + /// For the built in ListingTable provider, a single [RecordBatch] containing + /// a single column and row representing the count of total rows written + /// is returned. 
    pub async fn write_table(
        self,
        table_name: &str,

From 500ab40888a855bd021c302af9b27319a98ebc54 Mon Sep 17 00:00:00 2001
From: Vaibhav Rabber 
Date: Wed, 13 Dec 2023 03:37:53 +0530
Subject: [PATCH 06/22] fix: Pull stats in `IndentVisitor`/`GraphvizVisitor` only when requested (#8514)

Signed-off-by: Vaibhav 
---
 datafusion/physical-plan/src/display.rs | 127 +++++++++++++++++++++++-
 1 file changed, 125 insertions(+), 2 deletions(-)

diff --git a/datafusion/physical-plan/src/display.rs b/datafusion/physical-plan/src/display.rs
index 612e164be0e2..19c2847b09dc 100644
--- a/datafusion/physical-plan/src/display.rs
+++ b/datafusion/physical-plan/src/display.rs
@@ -260,8 +260,8 @@ impl<'a, 'b> ExecutionPlanVisitor for IndentVisitor<'a, 'b> {
                 }
             }
         }
-        let stats = plan.statistics().map_err(|_e| fmt::Error)?;
         if self.show_statistics {
+            let stats = plan.statistics().map_err(|_e| fmt::Error)?;
             write!(self.f, ", statistics=[{}]", stats)?;
         }
         writeln!(self.f)?;
@@ -341,8 +341,8 @@ impl ExecutionPlanVisitor for GraphvizVisitor<'_, '_> {
             }
         };

-        let stats = plan.statistics().map_err(|_e| fmt::Error)?;
        let statistics = if self.show_statistics {
+            let stats = plan.statistics().map_err(|_e| fmt::Error)?;
            format!("statistics=[{}]", stats)
        } else {
            "".to_string()
@@ -436,3 +436,126 @@ impl<'a> fmt::Display for OutputOrderingDisplay<'a> {
         write!(f, "]")
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use std::fmt::Write;
+    use std::sync::Arc;
+
+    use datafusion_common::DataFusionError;
+
+    use crate::{DisplayAs, ExecutionPlan};
+
+    use super::DisplayableExecutionPlan;
+
+    #[derive(Debug, Clone, Copy)]
+    enum TestStatsExecPlan {
+        Panic,
+        Error,
+        Ok,
+    }
+
+    impl DisplayAs for TestStatsExecPlan {
+        fn fmt_as(
+            &self,
+            _t: crate::DisplayFormatType,
+            f: &mut std::fmt::Formatter,
+        ) -> std::fmt::Result {
+            write!(f, "TestStatsExecPlan")
+        }
+    }
+
+    impl ExecutionPlan for TestStatsExecPlan {
+        fn as_any(&self) -> &dyn std::any::Any {
+            self
+        }
+
+        fn schema(&self) -> arrow_schema::SchemaRef {
+            Arc::new(arrow_schema::Schema::empty())
+        }
+
+        fn output_partitioning(&self) -> datafusion_physical_expr::Partitioning {
+            datafusion_physical_expr::Partitioning::UnknownPartitioning(1)
+        }
+
+        fn output_ordering(
+            &self,
+        ) -> Option<&[datafusion_physical_expr::PhysicalSortExpr]> {
+            None
+        }
+
+        fn children(&self) -> Vec<Arc<dyn ExecutionPlan>> {
+            vec![]
+        }
+
+        fn with_new_children(
+            self: Arc<Self>,
+            _: Vec<Arc<dyn ExecutionPlan>>,
+        ) -> datafusion_common::Result<Arc<dyn ExecutionPlan>> {
+            unimplemented!()
+        }
+
+        fn execute(
+            &self,
+            _: usize,
+            _: Arc<datafusion_execution::TaskContext>,
+        ) -> datafusion_common::Result<datafusion_execution::SendableRecordBatchStream>
+        {
+            todo!()
+        }
+
+        fn statistics(&self) -> datafusion_common::Result<datafusion_common::Statistics> {
+            match self {
+                Self::Panic => panic!("expected panic"),
+                Self::Error => {
+                    Err(DataFusionError::Internal("expected error".to_string()))
+                }
+                Self::Ok => Ok(datafusion_common::Statistics::new_unknown(
+                    self.schema().as_ref(),
+                )),
+            }
+        }
+    }
+
+    fn test_stats_display(exec: TestStatsExecPlan, show_stats: bool) {
+        let display =
+            DisplayableExecutionPlan::new(&exec).set_show_statistics(show_stats);
+
+        let mut buf = String::new();
+        write!(&mut buf, "{}", display.one_line()).unwrap();
+        let buf = buf.trim();
+        if show_stats {
+            // with statistics requested, the stats are rendered after the name
+            assert!(buf.starts_with("TestStatsExecPlan, statistics=["));
+        } else {
+            assert_eq!(buf, "TestStatsExecPlan");
+        }
+    }
+
+    #[test]
+    fn test_display_when_stats_panic_with_no_show_stats() {
+        test_stats_display(TestStatsExecPlan::Panic, false);
+    }
+
+    #[test]
+    fn test_display_when_stats_error_with_no_show_stats() {
+        test_stats_display(TestStatsExecPlan::Error, false);
+    }
+
+    #[test]
+    fn test_display_when_stats_ok_with_no_show_stats() {
+        test_stats_display(TestStatsExecPlan::Ok, false);
+    }
+
+    #[test]
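+    // the three tests below pass show_stats = true, so the rendered plan
+    // actually invokes `statistics()` and any panic or error raised there
+    // is expected to surface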
+    #[should_panic(expected = "expected panic")]
+    fn test_display_when_stats_panic_with_show_stats() {
+        test_stats_display(TestStatsExecPlan::Panic, true);
+    }
+
+    #[test]
+    #[should_panic(expected = "Error")] // fmt::Error
+    fn test_display_when_stats_error_with_show_stats() {
+        test_stats_display(TestStatsExecPlan::Error, true);
+    }
+
+    #[test]
+    fn test_display_when_stats_ok_with_show_stats() {
+        test_stats_display(TestStatsExecPlan::Ok, true);
+    }
+}

From 4578f3daeefd84305d3f055c243827856e2c036c Mon Sep 17 00:00:00 2001
From: Jacob Ogle <123908271+JacobOgle@users.noreply.github.com>
Date: Wed, 13 Dec 2023 09:11:29 -0500
Subject: [PATCH 07/22] Change display of RepartitionExec from SortPreservingRepartitionExec to RepartitionExec preserve_order=true (#8521)

* Change display of RepartitionExec from SortPreservingRepartitionExec to RepartitionExec preserve_order=true #8129

* fix mod.rs with cargo fmt

* test fix for repartition::test::test_preserve_order
---
 .../enforce_distribution.rs                   | 38 +++++++++----------
 .../src/physical_optimizer/enforce_sorting.rs |  6 +--
 .../replace_with_order_preserving_variants.rs | 22 +++++------
 .../physical-plan/src/repartition/mod.rs      | 12 +++---
 .../sqllogictest/test_files/groupby.slt       |  2 +-
 datafusion/sqllogictest/test_files/joins.slt  |  2 +-
 datafusion/sqllogictest/test_files/window.slt | 10 ++---
 7 files changed, 46 insertions(+), 46 deletions(-)

diff --git a/datafusion/core/src/physical_optimizer/enforce_distribution.rs b/datafusion/core/src/physical_optimizer/enforce_distribution.rs
index 3aed6555f305..93cdbf858367 100644
--- a/datafusion/core/src/physical_optimizer/enforce_distribution.rs
+++ b/datafusion/core/src/physical_optimizer/enforce_distribution.rs
@@ -1129,8 +1129,8 @@ fn replace_order_preserving_variants(
 /// Assume that following plan is given:
 /// ```text
 /// "SortPreservingMergeExec: \[a@0 ASC]"
-/// "  SortPreservingRepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10",
-/// "    SortPreservingRepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2",
+/// "  RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10, preserve_order=true",
+/// "    RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2, preserve_order=true",
 /// "      ParquetExec: file_groups={2 groups: \[\[x], \[y]]}, projection=\[a, b, c, d, e], output_ordering=\[a@0 ASC]",
 /// ```
 ///
@@ -3015,16 +3015,16 @@ pub(crate) mod tests {
            vec![
                top_join_plan.as_str(),
                join_plan.as_str(),
                "RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10, preserve_order=true, sort_exprs=a@0 ASC",
                "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
                "SortExec: expr=[a@0 ASC]",
                "ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]",
                "RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10, preserve_order=true, sort_exprs=b1@1 ASC",
                "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
                "SortExec: expr=[b1@1 ASC]",
                "ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]",
                "ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]",
                "RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10,
preserve_order=true, sort_exprs=c@2 ASC", "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", "SortExec: expr=[c@2 ASC]", "ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]", @@ -3041,21 +3041,21 @@ pub(crate) mod tests { _ => vec![ top_join_plan.as_str(), // Below 4 operators are differences introduced, when join mode is changed - "SortPreservingRepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10, sort_exprs=a@0 ASC", + "RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10, preserve_order=true, sort_exprs=a@0 ASC", "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", "SortExec: expr=[a@0 ASC]", "CoalescePartitionsExec", join_plan.as_str(), - "SortPreservingRepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10, sort_exprs=a@0 ASC", + "RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10, preserve_order=true, sort_exprs=a@0 ASC", "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", "SortExec: expr=[a@0 ASC]", "ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]", - "SortPreservingRepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10, sort_exprs=b1@1 ASC", + "RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10, preserve_order=true, sort_exprs=b1@1 ASC", "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", "SortExec: expr=[b1@1 ASC]", "ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", "ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]", - "SortPreservingRepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10, sort_exprs=c@2 ASC", + "RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10, preserve_order=true, sort_exprs=c@2 ASC", "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", "SortExec: expr=[c@2 ASC]", "ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]", @@ -3129,16 +3129,16 @@ pub(crate) mod tests { JoinType::Inner | JoinType::Right => vec![ top_join_plan.as_str(), join_plan.as_str(), - "SortPreservingRepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10, sort_exprs=a@0 ASC", + "RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10, preserve_order=true, sort_exprs=a@0 ASC", "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", "SortExec: expr=[a@0 ASC]", "ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]", - "SortPreservingRepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10, sort_exprs=b1@1 ASC", + "RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10, preserve_order=true, sort_exprs=b1@1 ASC", "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", "SortExec: expr=[b1@1 ASC]", "ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", "ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]", - "SortPreservingRepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10, sort_exprs=c@2 ASC", + "RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10, preserve_order=true, sort_exprs=c@2 ASC", "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", "SortExec: expr=[c@2 ASC]", "ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]", @@ -3146,21 +3146,21 @@ pub(crate) mod tests { // Should include 8 RepartitionExecs (4 of them preserves order) and 4 SortExecs JoinType::Left | 
JoinType::Full => vec![ top_join_plan.as_str(), - "SortPreservingRepartitionExec: partitioning=Hash([b1@6], 10), input_partitions=10, sort_exprs=b1@6 ASC", + "RepartitionExec: partitioning=Hash([b1@6], 10), input_partitions=10, preserve_order=true, sort_exprs=b1@6 ASC", "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", "SortExec: expr=[b1@6 ASC]", "CoalescePartitionsExec", join_plan.as_str(), - "SortPreservingRepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10, sort_exprs=a@0 ASC", + "RepartitionExec: partitioning=Hash([a@0], 10), input_partitions=10, preserve_order=true, sort_exprs=a@0 ASC", "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", "SortExec: expr=[a@0 ASC]", "ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]", - "SortPreservingRepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10, sort_exprs=b1@1 ASC", + "RepartitionExec: partitioning=Hash([b1@1], 10), input_partitions=10, preserve_order=true, sort_exprs=b1@1 ASC", "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", "SortExec: expr=[b1@1 ASC]", "ProjectionExec: expr=[a@0 as a1, b@1 as b1, c@2 as c1, d@3 as d1, e@4 as e1]", "ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]", - "SortPreservingRepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10, sort_exprs=c@2 ASC", + "RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10, preserve_order=true, sort_exprs=c@2 ASC", "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", "SortExec: expr=[c@2 ASC]", "ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]", @@ -3251,7 +3251,7 @@ pub(crate) mod tests { let expected_first_sort_enforcement = &[ "SortMergeJoin: join_type=Inner, on=[(b3@1, b2@1), (a3@0, a2@0)]", - "SortPreservingRepartitionExec: partitioning=Hash([b3@1, a3@0], 10), input_partitions=10, sort_exprs=b3@1 ASC,a3@0 ASC", + "RepartitionExec: partitioning=Hash([b3@1, a3@0], 10), input_partitions=10, preserve_order=true, sort_exprs=b3@1 ASC,a3@0 ASC", "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", "SortExec: expr=[b3@1 ASC,a3@0 ASC]", "CoalescePartitionsExec", @@ -3262,7 +3262,7 @@ pub(crate) mod tests { "AggregateExec: mode=Partial, gby=[b@1 as b1, a@0 as a1], aggr=[]", "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", "ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]", - "SortPreservingRepartitionExec: partitioning=Hash([b2@1, a2@0], 10), input_partitions=10, sort_exprs=b2@1 ASC,a2@0 ASC", + "RepartitionExec: partitioning=Hash([b2@1, a2@0], 10), input_partitions=10, preserve_order=true, sort_exprs=b2@1 ASC,a2@0 ASC", "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", "SortExec: expr=[b2@1 ASC,a2@0 ASC]", "CoalescePartitionsExec", @@ -4347,7 +4347,7 @@ pub(crate) mod tests { let expected = &[ "SortPreservingMergeExec: [c@2 ASC]", "FilterExec: c@2 = 0", - "SortPreservingRepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2, sort_exprs=c@2 ASC", + "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2, preserve_order=true, sort_exprs=c@2 ASC", "ParquetExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC]", ]; diff --git a/datafusion/core/src/physical_optimizer/enforce_sorting.rs b/datafusion/core/src/physical_optimizer/enforce_sorting.rs index 14715ede500a..277404b301c4 100644 --- 
a/datafusion/core/src/physical_optimizer/enforce_sorting.rs +++ b/datafusion/core/src/physical_optimizer/enforce_sorting.rs @@ -2162,7 +2162,7 @@ mod tests { ]; let expected_optimized = [ "SortPreservingMergeExec: [a@0 ASC]", - " SortPreservingRepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10, sort_exprs=a@0 ASC", + " RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10, preserve_order=true, sort_exprs=a@0 ASC", " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", " CsvExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], infinite_source=true, output_ordering=[a@0 ASC], has_header=false", ]; @@ -2191,7 +2191,7 @@ mod tests { ]; let expected_optimized = [ "SortPreservingMergeExec: [a@0 ASC]", - " SortPreservingRepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10, sort_exprs=a@0 ASC", + " RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10, preserve_order=true, sort_exprs=a@0 ASC", " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", " CsvExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], infinite_source=true, output_ordering=[a@0 ASC], has_header=false", ]; @@ -2263,7 +2263,7 @@ mod tests { let expected_input = [ "BoundedWindowAggExec: wdw=[count: Ok(Field { name: \"count\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(NULL), end_bound: CurrentRow }], mode=[Sorted]", " SortPreservingMergeExec: [a@0 ASC,b@1 ASC]", - " SortPreservingRepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10, sort_exprs=a@0 ASC,b@1 ASC", + " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=10, preserve_order=true, sort_exprs=a@0 ASC,b@1 ASC", " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", " SortExec: expr=[a@0 ASC,b@1 ASC]", " CsvExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], has_header=false", diff --git a/datafusion/core/src/physical_optimizer/replace_with_order_preserving_variants.rs b/datafusion/core/src/physical_optimizer/replace_with_order_preserving_variants.rs index 09274938cbce..af45df7d8474 100644 --- a/datafusion/core/src/physical_optimizer/replace_with_order_preserving_variants.rs +++ b/datafusion/core/src/physical_optimizer/replace_with_order_preserving_variants.rs @@ -366,7 +366,7 @@ mod tests { ]; let expected_optimized = [ "SortPreservingMergeExec: [a@0 ASC NULLS LAST]", - " SortPreservingRepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, sort_exprs=a@0 ASC NULLS LAST", + " RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", " CsvExec: file_groups={1 group: [[file_path]]}, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST], has_header=true", ]; @@ -414,10 +414,10 @@ mod tests { let expected_optimized = [ "SortPreservingMergeExec: [a@0 ASC]", " FilterExec: c@1 > 3", - " SortPreservingRepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, sort_exprs=a@0 ASC", + " RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", " SortPreservingMergeExec: [a@0 ASC]", - " SortPreservingRepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, sort_exprs=a@0 ASC", + " RepartitionExec: 
partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", " CsvExec: file_groups={1 group: [[file_path]]}, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC], has_header=true", ]; @@ -448,7 +448,7 @@ mod tests { ]; let expected_optimized = [ "SortPreservingMergeExec: [a@0 ASC NULLS LAST]", - " SortPreservingRepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, sort_exprs=a@0 ASC NULLS LAST", + " RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST", " FilterExec: c@1 > 3", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", " CsvExec: file_groups={1 group: [[file_path]]}, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST], has_header=true", @@ -484,7 +484,7 @@ mod tests { "SortPreservingMergeExec: [a@0 ASC NULLS LAST]", " CoalesceBatchesExec: target_batch_size=8192", " FilterExec: c@1 > 3", - " SortPreservingRepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, sort_exprs=a@0 ASC NULLS LAST", + " RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", " CsvExec: file_groups={1 group: [[file_path]]}, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST], has_header=true", ]; @@ -522,7 +522,7 @@ mod tests { "SortPreservingMergeExec: [a@0 ASC NULLS LAST]", " CoalesceBatchesExec: target_batch_size=8192", " FilterExec: c@1 > 3", - " SortPreservingRepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, sort_exprs=a@0 ASC NULLS LAST", + " RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST", " CoalesceBatchesExec: target_batch_size=8192", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", " CsvExec: file_groups={1 group: [[file_path]]}, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST], has_header=true", @@ -591,10 +591,10 @@ mod tests { ]; let expected_optimized = [ "SortPreservingMergeExec: [a@0 ASC NULLS LAST]", - " SortPreservingRepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, sort_exprs=a@0 ASC NULLS LAST", + " RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST", " CoalesceBatchesExec: target_batch_size=8192", " FilterExec: c@1 > 3", - " SortPreservingRepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, sort_exprs=a@0 ASC NULLS LAST", + " RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", " CsvExec: file_groups={1 group: [[file_path]]}, projection=[a, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST], has_header=true", ]; @@ -658,7 +658,7 @@ mod tests { ]; let expected_optimized = [ "SortPreservingMergeExec: [a@0 ASC NULLS LAST]", - " SortPreservingRepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, sort_exprs=a@0 ASC NULLS LAST", + " RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", " CsvExec: file_groups={1 group: [[file_path]]}, projection=[a, 
c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST], has_header=true", ]; @@ -706,7 +706,7 @@ mod tests { let expected_optimized = [ "SortPreservingMergeExec: [c@1 ASC]", " FilterExec: c@1 > 3", - " SortPreservingRepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, sort_exprs=c@1 ASC", + " RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=c@1 ASC", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", " SortExec: expr=[c@1 ASC]", " CoalescePartitionsExec", @@ -800,7 +800,7 @@ mod tests { ]; let expected_optimized = [ "SortPreservingMergeExec: [a@0 ASC NULLS LAST]", - " SortPreservingRepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, sort_exprs=a@0 ASC NULLS LAST", + " RepartitionExec: partitioning=Hash([c@1], 8), input_partitions=8, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST", " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1", " CsvExec: file_groups={1 group: [[file_path]]}, projection=[a, c, d], output_ordering=[a@0 ASC NULLS LAST], has_header=true", ]; diff --git a/datafusion/physical-plan/src/repartition/mod.rs b/datafusion/physical-plan/src/repartition/mod.rs index 24f227d8a535..769dc5e0e197 100644 --- a/datafusion/physical-plan/src/repartition/mod.rs +++ b/datafusion/physical-plan/src/repartition/mod.rs @@ -370,11 +370,7 @@ impl RepartitionExec { /// Get name used to display this Exec pub fn name(&self) -> &str { - if self.preserve_order { - "SortPreservingRepartitionExec" - } else { - "RepartitionExec" - } + "RepartitionExec" } } @@ -394,6 +390,10 @@ impl DisplayAs for RepartitionExec { self.input.output_partitioning().partition_count() )?; + if self.preserve_order { + write!(f, ", preserve_order=true")?; + } + if let Some(sort_exprs) = self.sort_exprs() { write!( f, @@ -1491,7 +1491,7 @@ mod test { // Repartition should preserve order let expected_plan = [ - "SortPreservingRepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2, sort_exprs=c0@0 ASC", + "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2, preserve_order=true, sort_exprs=c0@0 ASC", " UnionExec", " MemoryExec: partitions=1, partition_sizes=[0], output_ordering=c0@0 ASC", " MemoryExec: partitions=1, partition_sizes=[0], output_ordering=c0@0 ASC", diff --git a/datafusion/sqllogictest/test_files/groupby.slt b/datafusion/sqllogictest/test_files/groupby.slt index b7be4d78b583..8ed5245ef09b 100644 --- a/datafusion/sqllogictest/test_files/groupby.slt +++ b/datafusion/sqllogictest/test_files/groupby.slt @@ -4073,7 +4073,7 @@ GlobalLimitExec: skip=0, fetch=5 ----ProjectionExec: expr=[date_bin(Utf8("15 minutes"),unbounded_csv_with_timestamps.ts)@0 as time_chunks] ------AggregateExec: mode=FinalPartitioned, gby=[date_bin(Utf8("15 minutes"),unbounded_csv_with_timestamps.ts)@0 as date_bin(Utf8("15 minutes"),unbounded_csv_with_timestamps.ts)], aggr=[], ordering_mode=Sorted --------CoalesceBatchesExec: target_batch_size=2 -----------SortPreservingRepartitionExec: partitioning=Hash([date_bin(Utf8("15 minutes"),unbounded_csv_with_timestamps.ts)@0], 8), input_partitions=8, sort_exprs=date_bin(Utf8("15 minutes"),unbounded_csv_with_timestamps.ts)@0 DESC +----------RepartitionExec: partitioning=Hash([date_bin(Utf8("15 minutes"),unbounded_csv_with_timestamps.ts)@0], 8), input_partitions=8, preserve_order=true, sort_exprs=date_bin(Utf8("15 minutes"),unbounded_csv_with_timestamps.ts)@0 DESC ------------AggregateExec: mode=Partial, gby=[date_bin(900000000000, ts@0) as 
date_bin(Utf8("15 minutes"),unbounded_csv_with_timestamps.ts)], aggr=[], ordering_mode=Sorted --------------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 ----------------StreamingTableExec: partition_sizes=1, projection=[ts], infinite_source=true, output_ordering=[ts@0 DESC] diff --git a/datafusion/sqllogictest/test_files/joins.slt b/datafusion/sqllogictest/test_files/joins.slt index 0fea8da5a342..67e3750113da 100644 --- a/datafusion/sqllogictest/test_files/joins.slt +++ b/datafusion/sqllogictest/test_files/joins.slt @@ -3464,7 +3464,7 @@ SortPreservingMergeExec: [a@0 ASC] ----------------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 ------------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b, c], output_ordering=[a@0 ASC, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST], has_header=true ------------------CoalesceBatchesExec: target_batch_size=2 ---------------------SortPreservingRepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2, sort_exprs=a@0 ASC,b@1 ASC NULLS LAST +--------------------RepartitionExec: partitioning=Hash([a@0], 2), input_partitions=2, preserve_order=true, sort_exprs=a@0 ASC,b@1 ASC NULLS LAST ----------------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 ------------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b], output_ordering=[a@0 ASC, b@1 ASC NULLS LAST], has_header=true diff --git a/datafusion/sqllogictest/test_files/window.slt b/datafusion/sqllogictest/test_files/window.slt index f3de5b54fc8b..7b628f9b6f14 100644 --- a/datafusion/sqllogictest/test_files/window.slt +++ b/datafusion/sqllogictest/test_files/window.slt @@ -3245,17 +3245,17 @@ physical_plan ProjectionExec: expr=[SUM(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@2 as sum1, SUM(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@4 as sum2, SUM(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as sum3, SUM(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@5 as sum4] --BoundedWindowAggExec: wdw=[SUM(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "SUM(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: CurrentRow }], mode=[Linear] ----CoalesceBatchesExec: target_batch_size=4096 -------SortPreservingRepartitionExec: partitioning=Hash([d@1], 2), input_partitions=2, sort_exprs=a@0 ASC NULLS LAST +------RepartitionExec: partitioning=Hash([d@1], 2), input_partitions=2, 
preserve_order=true, sort_exprs=a@0 ASC NULLS LAST --------ProjectionExec: expr=[a@0 as a, d@3 as d, SUM(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@4 as SUM(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, SUM(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@5 as SUM(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, SUM(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@6 as SUM(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW] ----------BoundedWindowAggExec: wdw=[SUM(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "SUM(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: CurrentRow }], mode=[Sorted] ------------CoalesceBatchesExec: target_batch_size=4096 ---------------SortPreservingRepartitionExec: partitioning=Hash([b@1, a@0], 2), input_partitions=2, sort_exprs=a@0 ASC NULLS LAST,b@1 ASC NULLS LAST,c@2 ASC NULLS LAST +--------------RepartitionExec: partitioning=Hash([b@1, a@0], 2), input_partitions=2, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST,b@1 ASC NULLS LAST,c@2 ASC NULLS LAST ----------------BoundedWindowAggExec: wdw=[SUM(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "SUM(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: CurrentRow }], mode=[PartiallySorted([0])] ------------------CoalesceBatchesExec: target_batch_size=4096 ---------------------SortPreservingRepartitionExec: partitioning=Hash([a@0, d@3], 2), input_partitions=2, sort_exprs=a@0 ASC NULLS LAST,b@1 ASC NULLS LAST,c@2 ASC NULLS LAST +--------------------RepartitionExec: partitioning=Hash([a@0, d@3], 2), input_partitions=2, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST,b@1 ASC NULLS LAST,c@2 ASC NULLS LAST 
----------------------BoundedWindowAggExec: wdw=[SUM(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "SUM(annotated_data_infinite2.a) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(NULL)), end_bound: CurrentRow }], mode=[Sorted] ------------------------CoalesceBatchesExec: target_batch_size=4096 ---------------------------SortPreservingRepartitionExec: partitioning=Hash([a@0, b@1], 2), input_partitions=2, sort_exprs=a@0 ASC NULLS LAST,b@1 ASC NULLS LAST,c@2 ASC NULLS LAST +--------------------------RepartitionExec: partitioning=Hash([a@0, b@1], 2), input_partitions=2, preserve_order=true, sort_exprs=a@0 ASC NULLS LAST,b@1 ASC NULLS LAST,c@2 ASC NULLS LAST ----------------------------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 ------------------------------StreamingTableExec: partition_sizes=1, projection=[a, b, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST] @@ -3571,7 +3571,7 @@ SortPreservingMergeExec: [c@3 ASC NULLS LAST] --ProjectionExec: expr=[a0@0 as a0, a@1 as a, b@2 as b, c@3 as c, d@4 as d, AVG(multiple_ordered_table_inf.d) PARTITION BY [multiple_ordered_table_inf.d] ORDER BY [multiple_ordered_table_inf.a ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND CURRENT ROW@5 as avg_d] ----BoundedWindowAggExec: wdw=[AVG(multiple_ordered_table_inf.d) PARTITION BY [multiple_ordered_table_inf.d] ORDER BY [multiple_ordered_table_inf.a ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND CURRENT ROW: Ok(Field { name: "AVG(multiple_ordered_table_inf.d) PARTITION BY [multiple_ordered_table_inf.d] ORDER BY [multiple_ordered_table_inf.a ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND CURRENT ROW", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(10)), end_bound: CurrentRow }], mode=[Linear] ------CoalesceBatchesExec: target_batch_size=4096 ---------SortPreservingRepartitionExec: partitioning=Hash([d@4], 2), input_partitions=2, sort_exprs=a@1 ASC NULLS LAST,b@2 ASC NULLS LAST +--------RepartitionExec: partitioning=Hash([d@4], 2), input_partitions=2, preserve_order=true, sort_exprs=a@1 ASC NULLS LAST,b@2 ASC NULLS LAST ----------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 ------------StreamingTableExec: partition_sizes=1, projection=[a0, a, b, c, d], infinite_source=true, output_ordering=[a@1 ASC NULLS LAST, b@2 ASC NULLS LAST] From 2bc67ef5a9f1e9393e3fa7396ee3a53fa08ca415 Mon Sep 17 00:00:00 2001 From: Asura7969 <1402357969@qq.com> Date: Thu, 14 Dec 2023 02:54:55 +0800 Subject: [PATCH 08/22] Fix `DataFrame::cache` errors with `Plan("Mismatch between schema and batches")` (#8510) * type cast * add test * use physical plan * logic optimization --- datafusion/core/src/dataframe/mod.rs | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index 3e286110f7f9..4b8a9c5b7d79 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ 
b/datafusion/core/src/dataframe/mod.rs
@@ -1276,11 +1276,12 @@ impl DataFrame {
     /// ```
     pub async fn cache(self) -> Result<DataFrame> {
         let context = SessionContext::new_with_state(self.session_state.clone());
-        let mem_table = MemTable::try_new(
-            SchemaRef::from(self.schema().clone()),
-            self.collect_partitioned().await?,
-        )?;
-
+        // The schema is consistent with the output
+        let plan = self.clone().create_physical_plan().await?;
+        let schema = plan.schema();
+        let task_ctx = Arc::new(self.task_ctx());
+        let partitions = collect_partitioned(plan, task_ctx).await?;
+        let mem_table = MemTable::try_new(schema, partitions)?;
         context.read_table(Arc::new(mem_table))
     }
 }
@@ -2638,6 +2639,17 @@ mod tests {
         Ok(())
     }

+    #[tokio::test]
+    async fn test_cache_mismatch() -> Result<()> {
+        let ctx = SessionContext::new();
+        let df = ctx
+            .sql("SELECT CASE WHEN true THEN NULL ELSE 1 END")
+            .await?;
+        let cache_df = df.cache().await;
+        assert!(cache_df.is_ok());
+        Ok(())
+    }
+
     #[tokio::test]
     async fn cache_test() -> Result<()> {
         let df = test_table()

From 0678a6978528d0e2e1c85a1018b68ed974eb45cf Mon Sep 17 00:00:00 2001
From: Andrew Lamb 
Date: Wed, 13 Dec 2023 13:55:13 -0500
Subject: [PATCH 09/22] Minor: update pbjson_dependency (#8470)

---
 datafusion/proto/Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datafusion/proto/Cargo.toml b/datafusion/proto/Cargo.toml
index 4dda689fff4c..fbd412aedaa5 100644
--- a/datafusion/proto/Cargo.toml
+++ b/datafusion/proto/Cargo.toml
@@ -47,7 +47,7 @@ datafusion = { path = "../core", version = "33.0.0" }
 datafusion-common = { workspace = true }
 datafusion-expr = { workspace = true }
 object_store = { workspace = true }
-pbjson = { version = "0.5", optional = true }
+pbjson = { version = "0.6.0", optional = true }
 prost = "0.12.0"
 serde = { version = "1.0", optional = true }
 serde_json = { workspace = true, optional = true }

From 2e93f079461fc3603df07af94ce616a5317af370 Mon Sep 17 00:00:00 2001
From: Andrew Lamb 
Date: Wed, 13 Dec 2023 13:55:43 -0500
Subject: [PATCH 10/22] Minor: Update prost-derive dependency (#8471)

---
 datafusion-examples/Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datafusion-examples/Cargo.toml b/datafusion-examples/Cargo.toml
index 676b4aaa78c0..59580bcb6a05 100644
--- a/datafusion-examples/Cargo.toml
+++ b/datafusion-examples/Cargo.toml
@@ -48,7 +48,7 @@ mimalloc = { version = "0.1", default-features = false }
 num_cpus = { workspace = true }
 object_store = { workspace = true, features = ["aws", "http"] }
 prost = { version = "0.12", default-features = false }
-prost-derive = { version = "0.11", default-features = false }
+prost-derive = { version = "0.12", default-features = false }
 serde = { version = "1.0.136", features = ["derive"] }
 serde_json = { workspace = true }
 tempfile = { workspace = true }

From 9a322c8bdd5d62630936ac348bb935207675b99d Mon Sep 17 00:00:00 2001
From: Devin D'Angelo 
Date: Wed, 13 Dec 2023 15:39:37 -0500
Subject: [PATCH 11/22] Add write_table to dataframe actions in user guide (#8527)

---
 docs/source/user-guide/dataframe.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/source/user-guide/dataframe.md b/docs/source/user-guide/dataframe.md
index 4484b2c51019..c0210200a246 100644
--- a/docs/source/user-guide/dataframe.md
+++ b/docs/source/user-guide/dataframe.md
@@ -95,6 +95,7 @@ These methods execute the logical plan represented by the DataFrame and either c
 | write_csv | Execute this DataFrame and write the results to disk in CSV format.
|
 | write_json | Execute this DataFrame and write the results to disk in JSON format. |
 | write_parquet | Execute this DataFrame and write the results to disk in Parquet format. |
+| write_table | Execute this DataFrame and write the results via the insert_into method of the registered TableProvider |

 ## Other DataFrame Methods

From 5bf80d655d545622d0272bc8e4bcf89dacaacf36 Mon Sep 17 00:00:00 2001
From: Andrew Lamb 
Date: Wed, 13 Dec 2023 16:07:01 -0500
Subject: [PATCH 12/22] Minor: Add repartition_file.slt end to end test for repartitioning files, and supporting tweaks (#8505)

* Minor: Add repartition_file.slt end to end test for repartitioning files, and supporting tweaks

* Sort files, update tests
---
 .../core/src/datasource/listing/helpers.rs    |   8 +-
 datafusion/core/src/datasource/listing/mod.rs |   5 +
 datafusion/core/src/datasource/listing/url.rs |   4 +
 datafusion/sqllogictest/bin/sqllogictests.rs  |   2 +
 .../test_files/repartition_scan.slt           | 268 ++++++++++++++++++
 5 files changed, 286 insertions(+), 1 deletion(-)
 create mode 100644 datafusion/sqllogictest/test_files/repartition_scan.slt

diff --git a/datafusion/core/src/datasource/listing/helpers.rs b/datafusion/core/src/datasource/listing/helpers.rs
index 3536c098bd76..be74afa1f4d6 100644
--- a/datafusion/core/src/datasource/listing/helpers.rs
+++ b/datafusion/core/src/datasource/listing/helpers.rs
@@ -141,12 +141,18 @@ const CONCURRENCY_LIMIT: usize = 100;

 /// Partition the list of files into `n` groups
 pub fn split_files(
-    partitioned_files: Vec<PartitionedFile>,
+    mut partitioned_files: Vec<PartitionedFile>,
     n: usize,
 ) -> Vec<Vec<PartitionedFile>> {
     if partitioned_files.is_empty() {
         return vec![];
     }
+
+    // ObjectStore::list does not guarantee any consistent order and for some
+    // implementations such as LocalFileSystem, it may be inconsistent. Thus,
+    // sort files by path to ensure consistent plans when run more than once.
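+    // (lexicographic comparison of the object store paths via `PartitionedFile::path`)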
+    partitioned_files.sort_by(|a, b| a.path().cmp(b.path()));
+
     // effectively this is div with rounding up instead of truncating
     let chunk_size = (partitioned_files.len() + n - 1) / n;
     partitioned_files
diff --git a/datafusion/core/src/datasource/listing/mod.rs b/datafusion/core/src/datasource/listing/mod.rs
index 87c1663ae718..5e5b96f6ba8c 100644
--- a/datafusion/core/src/datasource/listing/mod.rs
+++ b/datafusion/core/src/datasource/listing/mod.rs
@@ -109,6 +109,11 @@ impl PartitionedFile {
         let size = std::fs::metadata(path.clone())?.len();
         Ok(Self::new(path, size))
     }
+
+    /// Return the path of this partitioned file
+    pub fn path(&self) -> &Path {
+        &self.object_meta.location
+    }
 }

 impl From<ObjectMeta> for PartitionedFile {
diff --git a/datafusion/core/src/datasource/listing/url.rs b/datafusion/core/src/datasource/listing/url.rs
index 9e9fb9210071..979ed9e975c4 100644
--- a/datafusion/core/src/datasource/listing/url.rs
+++ b/datafusion/core/src/datasource/listing/url.rs
@@ -131,6 +131,10 @@ impl ListingTableUrl {
             if is_directory {
                 fs::create_dir_all(path)?;
             } else {
+                // ensure parent directory exists
+                if let Some(parent) = path.parent() {
+                    fs::create_dir_all(parent)?;
+                }
                 fs::File::create(path)?;
             }
         }
diff --git a/datafusion/sqllogictest/bin/sqllogictests.rs b/datafusion/sqllogictest/bin/sqllogictests.rs
index 618e3106c629..484677d58e79 100644
--- a/datafusion/sqllogictest/bin/sqllogictests.rs
+++ b/datafusion/sqllogictest/bin/sqllogictests.rs
@@ -159,6 +159,7 @@ async fn run_test_file_with_postgres(test_file: TestFile) -> Result<()> {
         relative_path,
     } = test_file;
     info!("Running with Postgres runner: {}", path.display());
+    setup_scratch_dir(&relative_path)?;
     let mut runner =
         sqllogictest::Runner::new(|| Postgres::connect(relative_path.clone()));
     runner.with_column_validator(strict_column_validator);
@@ -188,6 +189,7 @@ async fn run_complete_file(test_file: TestFile) -> Result<()> {
         info!("Skipping: {}", path.display());
         return Ok(());
     };
+    setup_scratch_dir(&relative_path)?;
     let mut runner = sqllogictest::Runner::new(|| async {
         Ok(DataFusion::new(
             test_ctx.session_ctx().clone(),
diff --git a/datafusion/sqllogictest/test_files/repartition_scan.slt b/datafusion/sqllogictest/test_files/repartition_scan.slt
new file mode 100644
index 000000000000..551d6d9ed48a
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/repartition_scan.slt
@@ -0,0 +1,268 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at

+# http://www.apache.org/licenses/LICENSE-2.0

+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
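+
+# Note: the expected plans below show each file split into `target_partitions`
+# byte ranges of roughly equal size; for example, the 403 byte parquet file
+# created first is read as the ranges 0..101, 101..202, 202..303 and 303..403.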
+ +########## +# Tests for automatically reading files in parallel during scan +########## + +# Set 4 partitions for deterministic output plans +statement ok +set datafusion.execution.target_partitions = 4; + +# automatically partition all files over 1 byte +statement ok +set datafusion.optimizer.repartition_file_min_size = 1; + +################### +### Parquet tests +################### + +# create a single parquet file +# Note filename 2.parquet to test sorting (on local file systems it is often listed before 1.parquet) +statement ok +COPY (VALUES (1), (2), (3), (4), (5)) TO 'test_files/scratch/repartition_scan/parquet_table/2.parquet' +(FORMAT PARQUET, SINGLE_FILE_OUTPUT true); + +statement ok +CREATE EXTERNAL TABLE parquet_table(column1 int) +STORED AS PARQUET +LOCATION 'test_files/scratch/repartition_scan/parquet_table/'; + +query I +select * from parquet_table; +---- +1 +2 +3 +4 +5 + +## Expect to see the scan read the file as "4" groups with even sizes (offsets) +query TT +EXPLAIN SELECT column1 FROM parquet_table WHERE column1 <> 42; +---- +logical_plan +Filter: parquet_table.column1 != Int32(42) +--TableScan: parquet_table projection=[column1], partial_filters=[parquet_table.column1 != Int32(42)] +physical_plan +CoalesceBatchesExec: target_batch_size=8192 +--FilterExec: column1@0 != 42 +----ParquetExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..101], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:101..202], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:202..303], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:303..403]]}, projection=[column1], predicate=column1@0 != 42, pruning_predicate=column1_min@0 != 42 OR 42 != column1_max@1 + +# create a second parquet file +statement ok +COPY (VALUES (100), (200)) TO 'test_files/scratch/repartition_scan/parquet_table/1.parquet' +(FORMAT PARQUET, SINGLE_FILE_OUTPUT true); + +## Still expect to see the scan read the file as "4" groups with even sizes. One group should read +## parts of both files. 
+query TT +EXPLAIN SELECT column1 FROM parquet_table WHERE column1 <> 42 ORDER BY column1; +---- +logical_plan +Sort: parquet_table.column1 ASC NULLS LAST +--Filter: parquet_table.column1 != Int32(42) +----TableScan: parquet_table projection=[column1], partial_filters=[parquet_table.column1 != Int32(42)] +physical_plan +SortPreservingMergeExec: [column1@0 ASC NULLS LAST] +--SortExec: expr=[column1@0 ASC NULLS LAST] +----CoalesceBatchesExec: target_batch_size=8192 +------FilterExec: column1@0 != 42 +--------ParquetExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:0..200], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:200..394, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..6], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:6..206], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:206..403]]}, projection=[column1], predicate=column1@0 != 42, pruning_predicate=column1_min@0 != 42 OR 42 != column1_max@1 + + +## Read the files as though they are ordered + +statement ok +CREATE EXTERNAL TABLE parquet_table_with_order(column1 int) +STORED AS PARQUET +LOCATION 'test_files/scratch/repartition_scan/parquet_table' +WITH ORDER (column1 ASC); + +# output should be ordered +query I +SELECT column1 FROM parquet_table_with_order WHERE column1 <> 42 ORDER BY column1; +---- +1 +2 +3 +4 +5 +100 +200 + +# explain should not have any groups with more than one file +# https://github.com/apache/arrow-datafusion/issues/8451 +query TT +EXPLAIN SELECT column1 FROM parquet_table_with_order WHERE column1 <> 42 ORDER BY column1; +---- +logical_plan +Sort: parquet_table_with_order.column1 ASC NULLS LAST +--Filter: parquet_table_with_order.column1 != Int32(42) +----TableScan: parquet_table_with_order projection=[column1], partial_filters=[parquet_table_with_order.column1 != Int32(42)] +physical_plan +SortPreservingMergeExec: [column1@0 ASC NULLS LAST] +--CoalesceBatchesExec: target_batch_size=8192 +----FilterExec: column1@0 != 42 +------ParquetExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:0..200], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:200..394, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..6], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:6..206], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:206..403]]}, projection=[column1], predicate=column1@0 != 42, pruning_predicate=column1_min@0 != 42 OR 42 != column1_max@1 + +# Cleanup +statement ok +DROP TABLE parquet_table; + +statement ok +DROP TABLE parquet_table_with_order; + + +################### +### CSV tests +################### + +# Since parquet and CSV share most of the same implementation, this test checks +# that the basics are connected properly + +# create a single csv file +statement ok +COPY (VALUES (1), (2), (3), (4), (5)) TO 'test_files/scratch/repartition_scan/csv_table/1.csv' +(FORMAT csv, SINGLE_FILE_OUTPUT true, HEADER true); + +statement ok +CREATE EXTERNAL TABLE csv_table(column1 int) +STORED AS csv +WITH HEADER ROW +LOCATION 
'test_files/scratch/repartition_scan/csv_table/';
+
+query I
+select * from csv_table;
+----
+1
+2
+3
+4
+5
+
+## Expect to see the scan read the file as "4" groups with even sizes (offsets)
+query TT
+EXPLAIN SELECT column1 FROM csv_table WHERE column1 <> 42;
+----
+logical_plan
+Filter: csv_table.column1 != Int32(42)
+--TableScan: csv_table projection=[column1], partial_filters=[csv_table.column1 != Int32(42)]
+physical_plan
+CoalesceBatchesExec: target_batch_size=8192
+--FilterExec: column1@0 != 42
+----CsvExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/csv_table/1.csv:0..5], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/csv_table/1.csv:5..10], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/csv_table/1.csv:10..15], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/csv_table/1.csv:15..18]]}, projection=[column1], has_header=true
+
+# Cleanup
+statement ok
+DROP TABLE csv_table;
+
+
+###################
+### JSON tests
+###################
+
+# Since parquet and json share most of the same implementation, this test checks
+# that the basics are connected properly
+
+# create a single json file
+statement ok
+COPY (VALUES (1), (2), (3), (4), (5)) TO 'test_files/scratch/repartition_scan/json_table/1.json'
+(FORMAT json, SINGLE_FILE_OUTPUT true);
+
+statement ok
+CREATE EXTERNAL TABLE json_table(column1 int)
+STORED AS json
+LOCATION 'test_files/scratch/repartition_scan/json_table/';
+
+query I
+select * from json_table;
+----
+1
+2
+3
+4
+5
+
+## In the future it would be cool to see the file read as "4" groups with even sizes (offsets)
+## but for now it is just one group
+## https://github.com/apache/arrow-datafusion/issues/8502
+query TT
+EXPLAIN SELECT column1 FROM json_table WHERE column1 <> 42;
+----
+logical_plan
+Filter: json_table.column1 != Int32(42)
+--TableScan: json_table projection=[column1], partial_filters=[json_table.column1 != Int32(42)]
+physical_plan
+CoalesceBatchesExec: target_batch_size=8192
+--FilterExec: column1@0 != 42
+----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+------JsonExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/json_table/1.json]]}, projection=[column1]
+
+
+# Cleanup
+statement ok
+DROP TABLE json_table;
+
+
+###################
+### Arrow File tests
+###################
+
+## Use pre-existing files; we don't have a way to create arrow files yet
+## (https://github.com/apache/arrow-datafusion/issues/8504)
+statement ok
+CREATE EXTERNAL TABLE arrow_table
+STORED AS ARROW
+LOCATION '../core/tests/data/example.arrow';
+
+
+# It would be great to see the file read as "4" groups with even sizes (offsets) eventually
+# https://github.com/apache/arrow-datafusion/issues/8503
+query TT
+EXPLAIN SELECT * FROM arrow_table
+----
+logical_plan TableScan: arrow_table projection=[f0, f1, f2]
+physical_plan ArrowExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/example.arrow]]}, projection=[f0, f1, f2]
+
+# Cleanup
+statement ok
+DROP TABLE arrow_table;
+
+###################
+### Avro File tests
+###################
+
+## Use pre-existing files; we don't have a way to create avro files yet
+
+statement ok
+CREATE EXTERNAL TABLE avro_table
+STORED AS AVRO
+WITH HEADER ROW
+LOCATION '../../testing/data/avro/simple_enum.avro'
+
+
+# It would be great to see the file read as "4" groups with even sizes (offsets) eventually
+query 
TT +EXPLAIN SELECT * FROM avro_table +---- +logical_plan TableScan: avro_table projection=[f1, f2, f3] +physical_plan AvroExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/avro/simple_enum.avro]]}, projection=[f1, f2, f3] + +# Cleanup +statement ok +DROP TABLE avro_table; From 898911b9952679a163247b88b2a79c47be2e226c Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 13 Dec 2023 14:22:24 -0700 Subject: [PATCH 13/22] Prepare version 34.0.0 (#8508) * bump version * changelog * changelog * Revert change --- Cargo.toml | 24 +-- benchmarks/Cargo.toml | 8 +- datafusion-cli/Cargo.lock | 102 ++++++------ datafusion-cli/Cargo.toml | 4 +- datafusion/CHANGELOG.md | 1 + datafusion/core/Cargo.toml | 6 +- datafusion/optimizer/Cargo.toml | 4 +- datafusion/proto/Cargo.toml | 2 +- datafusion/sqllogictest/Cargo.toml | 2 +- dev/changelog/34.0.0.md | 247 +++++++++++++++++++++++++++++ docs/Cargo.toml | 2 +- docs/source/user-guide/configs.md | 2 +- 12 files changed, 326 insertions(+), 78 deletions(-) create mode 100644 dev/changelog/34.0.0.md diff --git a/Cargo.toml b/Cargo.toml index 2bcbe059ab25..023dc6c6fc4f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -46,7 +46,7 @@ license = "Apache-2.0" readme = "README.md" repository = "https://github.com/apache/arrow-datafusion" rust-version = "1.70" -version = "33.0.0" +version = "34.0.0" [workspace.dependencies] arrow = { version = "49.0.0", features = ["prettyprint"] } @@ -59,17 +59,17 @@ async-trait = "0.1.73" bigdecimal = "0.4.1" bytes = "1.4" ctor = "0.2.0" -datafusion = { path = "datafusion/core", version = "33.0.0" } -datafusion-common = { path = "datafusion/common", version = "33.0.0" } -datafusion-expr = { path = "datafusion/expr", version = "33.0.0" } -datafusion-sql = { path = "datafusion/sql", version = "33.0.0" } -datafusion-optimizer = { path = "datafusion/optimizer", version = "33.0.0" } -datafusion-physical-expr = { path = "datafusion/physical-expr", version = "33.0.0" } -datafusion-physical-plan = { path = "datafusion/physical-plan", version = "33.0.0" } -datafusion-execution = { path = "datafusion/execution", version = "33.0.0" } -datafusion-proto = { path = "datafusion/proto", version = "33.0.0" } -datafusion-sqllogictest = { path = "datafusion/sqllogictest", version = "33.0.0" } -datafusion-substrait = { path = "datafusion/substrait", version = "33.0.0" } +datafusion = { path = "datafusion/core", version = "34.0.0" } +datafusion-common = { path = "datafusion/common", version = "34.0.0" } +datafusion-expr = { path = "datafusion/expr", version = "34.0.0" } +datafusion-sql = { path = "datafusion/sql", version = "34.0.0" } +datafusion-optimizer = { path = "datafusion/optimizer", version = "34.0.0" } +datafusion-physical-expr = { path = "datafusion/physical-expr", version = "34.0.0" } +datafusion-physical-plan = { path = "datafusion/physical-plan", version = "34.0.0" } +datafusion-execution = { path = "datafusion/execution", version = "34.0.0" } +datafusion-proto = { path = "datafusion/proto", version = "34.0.0" } +datafusion-sqllogictest = { path = "datafusion/sqllogictest", version = "34.0.0" } +datafusion-substrait = { path = "datafusion/substrait", version = "34.0.0" } dashmap = "5.4.0" doc-comment = "0.3" env_logger = "0.10" diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index c5a24a0a5cf9..4ce46968e1f4 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "datafusion-benchmarks" description = "DataFusion Benchmarks" -version = "33.0.0" +version = "34.0.0" edition = { workspace = true 
} authors = ["Apache Arrow "] homepage = "https://github.com/apache/arrow-datafusion" @@ -34,8 +34,8 @@ snmalloc = ["snmalloc-rs"] [dependencies] arrow = { workspace = true } -datafusion = { path = "../datafusion/core", version = "33.0.0" } -datafusion-common = { path = "../datafusion/common", version = "33.0.0" } +datafusion = { path = "../datafusion/core", version = "34.0.0" } +datafusion-common = { path = "../datafusion/common", version = "34.0.0" } env_logger = { workspace = true } futures = { workspace = true } log = { workspace = true } @@ -50,4 +50,4 @@ test-utils = { path = "../test-utils/", version = "0.1.0" } tokio = { version = "^1.0", features = ["macros", "rt", "rt-multi-thread", "parking_lot"] } [dev-dependencies] -datafusion-proto = { path = "../datafusion/proto", version = "33.0.0" } +datafusion-proto = { path = "../datafusion/proto", version = "34.0.0" } diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 76be04d5ef67..19ad6709362d 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -384,7 +384,7 @@ checksum = "a66537f1bb974b254c98ed142ff995236e81b9d0fe4db0575f46612cb15eb0f9" dependencies = [ "proc-macro2", "quote", - "syn 2.0.39", + "syn 2.0.40", ] [[package]] @@ -1074,7 +1074,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37e366bff8cd32dd8754b0991fb66b279dc48f598c3a18914852a6673deef583" dependencies = [ "quote", - "syn 2.0.39", + "syn 2.0.40", ] [[package]] @@ -1098,7 +1098,7 @@ dependencies = [ [[package]] name = "datafusion" -version = "33.0.0" +version = "34.0.0" dependencies = [ "ahash", "apache-avro", @@ -1145,7 +1145,7 @@ dependencies = [ [[package]] name = "datafusion-cli" -version = "33.0.0" +version = "34.0.0" dependencies = [ "arrow", "assert_cmd", @@ -1172,7 +1172,7 @@ dependencies = [ [[package]] name = "datafusion-common" -version = "33.0.0" +version = "34.0.0" dependencies = [ "ahash", "apache-avro", @@ -1191,7 +1191,7 @@ dependencies = [ [[package]] name = "datafusion-execution" -version = "33.0.0" +version = "34.0.0" dependencies = [ "arrow", "chrono", @@ -1210,7 +1210,7 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "33.0.0" +version = "34.0.0" dependencies = [ "ahash", "arrow", @@ -1224,7 +1224,7 @@ dependencies = [ [[package]] name = "datafusion-optimizer" -version = "33.0.0" +version = "34.0.0" dependencies = [ "arrow", "async-trait", @@ -1240,7 +1240,7 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" -version = "33.0.0" +version = "34.0.0" dependencies = [ "ahash", "arrow", @@ -1272,7 +1272,7 @@ dependencies = [ [[package]] name = "datafusion-physical-plan" -version = "33.0.0" +version = "34.0.0" dependencies = [ "ahash", "arrow", @@ -1301,7 +1301,7 @@ dependencies = [ [[package]] name = "datafusion-sql" -version = "33.0.0" +version = "34.0.0" dependencies = [ "arrow", "arrow-schema", @@ -1576,7 +1576,7 @@ checksum = "53b153fd91e4b0147f4aced87be237c98248656bb01050b96bf3ee89220a8ddb" dependencies = [ "proc-macro2", "quote", - "syn 2.0.39", + "syn 2.0.40", ] [[package]] @@ -1752,9 +1752,9 @@ dependencies = [ [[package]] name = "http-body" -version = "0.4.5" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d5f38f16d184e36f2408a55281cd658ecbd3ca05cce6d6510a176eca393e26d1" +checksum = "7ceab25649e9960c0311ea418d17bee82c0dcec1bd053b5f9a66e265a693bed2" dependencies = [ "bytes", "http", @@ -1827,7 +1827,7 @@ dependencies = [ "futures-util", "http", "hyper", - "rustls 0.21.9", + "rustls 0.21.10", 
"tokio", "tokio-rustls 0.24.1", ] @@ -1926,9 +1926,9 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.9" +version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38" +checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" [[package]] name = "jobserver" @@ -2020,9 +2020,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.150" +version = "0.2.151" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89d92a4743f9a61002fae18374ed11e7973f530cb3a3255fb354818118b2203c" +checksum = "302d7ab3130588088d277783b1e2d2e10c9e9e4a16dd9050e6ec93fb3e7048f4" [[package]] name = "libflate" @@ -2322,9 +2322,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.18.0" +version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" [[package]] name = "openssl-probe" @@ -2496,7 +2496,7 @@ checksum = "4359fd9c9171ec6e8c62926d6faaf553a8dc3f64e1507e76da7911b4f6a04405" dependencies = [ "proc-macro2", "quote", - "syn 2.0.39", + "syn 2.0.40", ] [[package]] @@ -2736,7 +2736,7 @@ dependencies = [ "once_cell", "percent-encoding", "pin-project-lite", - "rustls 0.21.9", + "rustls 0.21.10", "rustls-pemfile", "serde", "serde_json", @@ -2833,9 +2833,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.26" +version = "0.38.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9470c4bf8246c8daf25f9598dca807fb6510347b1e1cfa55749113850c79d88a" +checksum = "72e572a5e8ca657d7366229cdde4bd14c4eb5499a9573d4d366fe1b599daa316" dependencies = [ "bitflags 2.4.1", "errno", @@ -2858,9 +2858,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.21.9" +version = "0.21.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "629648aced5775d558af50b2b4c7b02983a04b312126d45eeead26e7caa498b9" +checksum = "f9d5a6813c0759e4609cd494e8e725babae6a2ca7b62a5536a13daaec6fcb7ba" dependencies = [ "log", "ring 0.17.7", @@ -2930,9 +2930,9 @@ dependencies = [ [[package]] name = "ryu" -version = "1.0.15" +version = "1.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ad4cc8da4ef723ed60bced201181d83791ad433213d8c24efffda1eec85d741" +checksum = "f98d2aa92eebf49b69786be48e4477826b256916e84a57ff2a4f21923b48eb4c" [[package]] name = "same-file" @@ -3020,7 +3020,7 @@ checksum = "43576ca501357b9b071ac53cdc7da8ef0cbd9493d8df094cd821777ea6e894d3" dependencies = [ "proc-macro2", "quote", - "syn 2.0.39", + "syn 2.0.40", ] [[package]] @@ -3196,7 +3196,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.39", + "syn 2.0.40", ] [[package]] @@ -3218,9 +3218,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.39" +version = "2.0.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23e78b90f2fcf45d3e842032ce32e3f2d1545ba6636271dcbf24fa306d87be7a" +checksum = "13fa70a4ee923979ffb522cacce59d34421ebdea5625e1073c4326ef9d2dd42e" dependencies = [ "proc-macro2", "quote", @@ -3299,7 +3299,7 @@ checksum = "266b2e40bc00e5a6c09c3584011e08b06f123c00362c92b975ba9843aaaa14b8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.39", + "syn 2.0.40", ] [[package]] @@ -3367,9 +3367,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" 
[[package]] name = "tokio" -version = "1.34.0" +version = "1.35.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0c014766411e834f7af5b8f4cf46257aab4036ca95e9d2c144a10f59ad6f5b9" +checksum = "841d45b238a16291a4e1584e61820b8ae57d696cc5015c459c229ccc6990cc1c" dependencies = [ "backtrace", "bytes", @@ -3391,7 +3391,7 @@ checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.39", + "syn 2.0.40", ] [[package]] @@ -3411,7 +3411,7 @@ version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c28327cf380ac148141087fbfb9de9d7bd4e84ab5d2c28fbc911d753de8a7081" dependencies = [ - "rustls 0.21.9", + "rustls 0.21.10", "tokio", ] @@ -3488,7 +3488,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.39", + "syn 2.0.40", ] [[package]] @@ -3502,9 +3502,9 @@ dependencies = [ [[package]] name = "try-lock" -version = "0.2.4" +version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3528ecfd12c466c6f163363caf2d02a71161dd5e1cc6ae7b34207ea2d42d81ed" +checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" [[package]] name = "twox-hash" @@ -3533,7 +3533,7 @@ checksum = "f03ca4cb38206e2bef0700092660bb74d696f808514dae47fa1467cbfe26e96e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.39", + "syn 2.0.40", ] [[package]] @@ -3544,9 +3544,9 @@ checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" [[package]] name = "unicode-bidi" -version = "0.3.13" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460" +checksum = "6f2528f27a9eb2b21e69c95319b30bd0efd85d09c379741b0f78ea1d86be2416" [[package]] name = "unicode-ident" @@ -3687,7 +3687,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.39", + "syn 2.0.40", "wasm-bindgen-shared", ] @@ -3721,7 +3721,7 @@ checksum = "f0eb82fcb7930ae6219a7ecfd55b217f5f0893484b7a13022ebb2b2bf20b5283" dependencies = [ "proc-macro2", "quote", - "syn 2.0.39", + "syn 2.0.40", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -3970,22 +3970,22 @@ dependencies = [ [[package]] name = "zerocopy" -version = "0.7.29" +version = "0.7.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d075cf85bbb114e933343e087b92f2146bac0d55b534cbb8188becf0039948e" +checksum = "306dca4455518f1f31635ec308b6b3e4eb1b11758cefafc782827d0aa7acb5c7" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.7.29" +version = "0.7.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86cd5ca076997b97ef09d3ad65efe811fa68c9e874cb636ccb211223a813b0c2" +checksum = "be912bf68235a88fbefd1b73415cb218405958d1655b2ece9035a19920bdf6ba" dependencies = [ "proc-macro2", "quote", - "syn 2.0.39", + "syn 2.0.40", ] [[package]] diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index 5ce318aea3ac..1bf24808fb90 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "datafusion-cli" description = "Command Line Client for DataFusion query engine." 
-version = "33.0.0" +version = "34.0.0" authors = ["Apache Arrow "] edition = "2021" keywords = ["arrow", "datafusion", "query", "sql"] @@ -34,7 +34,7 @@ async-trait = "0.1.41" aws-config = "0.55" aws-credential-types = "0.55" clap = { version = "3", features = ["derive", "cargo"] } -datafusion = { path = "../datafusion/core", version = "33.0.0", features = ["avro", "crypto_expressions", "encoding_expressions", "parquet", "regex_expressions", "unicode_expressions", "compression"] } +datafusion = { path = "../datafusion/core", version = "34.0.0", features = ["avro", "crypto_expressions", "encoding_expressions", "parquet", "regex_expressions", "unicode_expressions", "compression"] } dirs = "4.0.0" env_logger = "0.9" mimalloc = { version = "0.1", default-features = false } diff --git a/datafusion/CHANGELOG.md b/datafusion/CHANGELOG.md index e224b9387655..d64bbeda877d 100644 --- a/datafusion/CHANGELOG.md +++ b/datafusion/CHANGELOG.md @@ -19,6 +19,7 @@ # Changelog +- [34.0.0](../dev/changelog/34.0.0.md) - [33.0.0](../dev/changelog/33.0.0.md) - [32.0.0](../dev/changelog/32.0.0.md) - [31.0.0](../dev/changelog/31.0.0.md) diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index 7caf91e24f2f..0ee83e756745 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -62,11 +62,11 @@ bytes = { workspace = true } bzip2 = { version = "0.4.3", optional = true } chrono = { workspace = true } dashmap = { workspace = true } -datafusion-common = { path = "../common", version = "33.0.0", features = ["object_store"], default-features = false } +datafusion-common = { path = "../common", version = "34.0.0", features = ["object_store"], default-features = false } datafusion-execution = { workspace = true } datafusion-expr = { workspace = true } -datafusion-optimizer = { path = "../optimizer", version = "33.0.0", default-features = false } -datafusion-physical-expr = { path = "../physical-expr", version = "33.0.0", default-features = false } +datafusion-optimizer = { path = "../optimizer", version = "34.0.0", default-features = false } +datafusion-physical-expr = { path = "../physical-expr", version = "34.0.0", default-features = false } datafusion-physical-plan = { workspace = true } datafusion-sql = { workspace = true } flate2 = { version = "1.0.24", optional = true } diff --git a/datafusion/optimizer/Cargo.toml b/datafusion/optimizer/Cargo.toml index fac880867fef..b350d41d3fe3 100644 --- a/datafusion/optimizer/Cargo.toml +++ b/datafusion/optimizer/Cargo.toml @@ -44,7 +44,7 @@ async-trait = { workspace = true } chrono = { workspace = true } datafusion-common = { workspace = true } datafusion-expr = { workspace = true } -datafusion-physical-expr = { path = "../physical-expr", version = "33.0.0", default-features = false } +datafusion-physical-expr = { path = "../physical-expr", version = "34.0.0", default-features = false } hashbrown = { version = "0.14", features = ["raw"] } itertools = { workspace = true } log = { workspace = true } @@ -52,5 +52,5 @@ regex-syntax = "0.8.0" [dev-dependencies] ctor = { workspace = true } -datafusion-sql = { path = "../sql", version = "33.0.0" } +datafusion-sql = { path = "../sql", version = "34.0.0" } env_logger = "0.10.0" diff --git a/datafusion/proto/Cargo.toml b/datafusion/proto/Cargo.toml index fbd412aedaa5..f9f24b28db81 100644 --- a/datafusion/proto/Cargo.toml +++ b/datafusion/proto/Cargo.toml @@ -43,7 +43,7 @@ parquet = ["datafusion/parquet", "datafusion-common/parquet"] [dependencies] arrow = { workspace = true } chrono = { workspace = true 
} -datafusion = { path = "../core", version = "33.0.0" } +datafusion = { path = "../core", version = "34.0.0" } datafusion-common = { workspace = true } datafusion-expr = { workspace = true } object_store = { workspace = true } diff --git a/datafusion/sqllogictest/Cargo.toml b/datafusion/sqllogictest/Cargo.toml index 436c6159e7a3..e333dc816f66 100644 --- a/datafusion/sqllogictest/Cargo.toml +++ b/datafusion/sqllogictest/Cargo.toml @@ -36,7 +36,7 @@ async-trait = { workspace = true } bigdecimal = { workspace = true } bytes = { version = "1.4.0", optional = true } chrono = { workspace = true, optional = true } -datafusion = { path = "../core", version = "33.0.0" } +datafusion = { path = "../core", version = "34.0.0" } datafusion-common = { workspace = true } futures = { version = "0.3.28" } half = { workspace = true } diff --git a/dev/changelog/34.0.0.md b/dev/changelog/34.0.0.md new file mode 100644 index 000000000000..8b8933017cfb --- /dev/null +++ b/dev/changelog/34.0.0.md @@ -0,0 +1,247 @@ + + +## [34.0.0](https://github.com/apache/arrow-datafusion/tree/34.0.0) (2023-12-11) + +[Full Changelog](https://github.com/apache/arrow-datafusion/compare/33.0.0...34.0.0) + +**Breaking changes:** + +- Implement `DISTINCT ON` from Postgres [#7981](https://github.com/apache/arrow-datafusion/pull/7981) (gruuya) +- Encapsulate `EquivalenceClass` into a struct [#8034](https://github.com/apache/arrow-datafusion/pull/8034) (alamb) +- Make fields of `ScalarUDF` , `AggregateUDF` and `WindowUDF` non `pub` [#8079](https://github.com/apache/arrow-datafusion/pull/8079) (alamb) +- Implement StreamTable and StreamTableProvider (#7994) [#8021](https://github.com/apache/arrow-datafusion/pull/8021) (tustvold) +- feat: make FixedSizeList scalar also an ArrayRef [#8221](https://github.com/apache/arrow-datafusion/pull/8221) (wjones127) +- Remove FileWriterMode and ListingTableInsertMode (#7994) [#8017](https://github.com/apache/arrow-datafusion/pull/8017) (tustvold) +- Refactor: Unify `Expr::ScalarFunction` and `Expr::ScalarUDF`, introduce unresolved functions by name [#8258](https://github.com/apache/arrow-datafusion/pull/8258) (2010YOUY01) +- Refactor aggregate function handling [#8358](https://github.com/apache/arrow-datafusion/pull/8358) (Weijun-H) +- Move `PartitionSearchMode` into datafusion_physical_plan, rename to `InputOrderMode` [#8364](https://github.com/apache/arrow-datafusion/pull/8364) (alamb) +- Split `EmptyExec` into `PlaceholderRowExec` [#8446](https://github.com/apache/arrow-datafusion/pull/8446) (razeghi71) + +**Implemented enhancements:** + +- feat: show statistics in explain verbose [#8113](https://github.com/apache/arrow-datafusion/pull/8113) (NGA-TRAN) +- feat:implement postgres style 'overlay' string function [#8117](https://github.com/apache/arrow-datafusion/pull/8117) (Syleechan) +- feat: fill missing values with NULLs while inserting [#8146](https://github.com/apache/arrow-datafusion/pull/8146) (jonahgao) +- feat: to_array_of_size for ScalarValue::FixedSizeList [#8225](https://github.com/apache/arrow-datafusion/pull/8225) (wjones127) +- feat:implement calcite style 'levenshtein' string function [#8168](https://github.com/apache/arrow-datafusion/pull/8168) (Syleechan) +- feat: roundtrip FixedSizeList Scalar to protobuf [#8239](https://github.com/apache/arrow-datafusion/pull/8239) (wjones127) +- feat: impl the basic `string_agg` function [#8148](https://github.com/apache/arrow-datafusion/pull/8148) (haohuaijin) +- feat: support simplifying BinaryExpr with arbitrary guarantees in 
GuaranteeRewriter [#8256](https://github.com/apache/arrow-datafusion/pull/8256) (wjones127) +- feat: support customizing column default values for inserting [#8283](https://github.com/apache/arrow-datafusion/pull/8283) (jonahgao) +- feat:implement sql style 'substr_index' string function [#8272](https://github.com/apache/arrow-datafusion/pull/8272) (Syleechan) +- feat:implement sql style 'find_in_set' string function [#8328](https://github.com/apache/arrow-datafusion/pull/8328) (Syleechan) +- feat: support `LargeList` in `array_empty` [#8321](https://github.com/apache/arrow-datafusion/pull/8321) (Weijun-H) +- feat: support `LargeList` in `make_array` and `array_length` [#8121](https://github.com/apache/arrow-datafusion/pull/8121) (Weijun-H) +- feat: ScalarValue from String [#8411](https://github.com/apache/arrow-datafusion/pull/8411) (QuenKar) +- feat: support `LargeList` for `array_has`, `array_has_all` and `array_has_any` [#8322](https://github.com/apache/arrow-datafusion/pull/8322) (Weijun-H) +- feat: customize column default values for external tables [#8415](https://github.com/apache/arrow-datafusion/pull/8415) (jonahgao) +- feat: Support `array_sort`(`list_sort`) [#8279](https://github.com/apache/arrow-datafusion/pull/8279) (Asura7969) +- feat: support `InterleaveExecNode` in the proto [#8460](https://github.com/apache/arrow-datafusion/pull/8460) (liukun4515) + +**Fixed bugs:** + +- fix: Timestamp with timezone not considered `join on` [#8150](https://github.com/apache/arrow-datafusion/pull/8150) (ACking-you) +- fix: wrong result of range function [#8313](https://github.com/apache/arrow-datafusion/pull/8313) (smallzhongfeng) +- fix: make `ntile` work in some corner cases [#8371](https://github.com/apache/arrow-datafusion/pull/8371) (haohuaijin) +- fix: Changed labeler.yml to latest format [#8431](https://github.com/apache/arrow-datafusion/pull/8431) (viirya) +- fix: Literal in `ORDER BY` window definition should not be an ordinal referring to relation column [#8419](https://github.com/apache/arrow-datafusion/pull/8419) (viirya) +- fix: ORDER BY window definition should work on null literal [#8444](https://github.com/apache/arrow-datafusion/pull/8444) (viirya) +- fix: RANGE frame for corner cases with empty ORDER BY clause should be treated as constant sort [#8445](https://github.com/apache/arrow-datafusion/pull/8445) (viirya) +- fix: don't unifies projection if expr is non-trival [#8454](https://github.com/apache/arrow-datafusion/pull/8454) (haohuaijin) +- fix: support uppercase when parsing `Interval` [#8478](https://github.com/apache/arrow-datafusion/pull/8478) (QuenKar) + +**Documentation updates:** + +- Library Guide: Add Using the DataFrame API [#8319](https://github.com/apache/arrow-datafusion/pull/8319) (Veeupup) +- Minor: Add installation link to README.md [#8389](https://github.com/apache/arrow-datafusion/pull/8389) (Weijun-H) + +**Merged pull requests:** + +- Fix typo in partitioning.rs [#8134](https://github.com/apache/arrow-datafusion/pull/8134) (lewiszlw) +- Implement `DISTINCT ON` from Postgres [#7981](https://github.com/apache/arrow-datafusion/pull/7981) (gruuya) +- Prepare 33.0.0-rc2 [#8144](https://github.com/apache/arrow-datafusion/pull/8144) (andygrove) +- Avoid concat in `array_append` [#8137](https://github.com/apache/arrow-datafusion/pull/8137) (jayzhan211) +- Replace macro with function for array_remove [#8106](https://github.com/apache/arrow-datafusion/pull/8106) (jayzhan211) +- Implement `array_union` 
[#7897](https://github.com/apache/arrow-datafusion/pull/7897) (edmondop) +- Minor: Document `ExecutionPlan::equivalence_properties` more thoroughly [#8128](https://github.com/apache/arrow-datafusion/pull/8128) (alamb) +- feat: show statistics in explain verbose [#8113](https://github.com/apache/arrow-datafusion/pull/8113) (NGA-TRAN) +- feat:implement postgres style 'overlay' string function [#8117](https://github.com/apache/arrow-datafusion/pull/8117) (Syleechan) +- Minor: Encapsulate `LeftJoinData` into a struct (rather than anonymous enum) and add comments [#8153](https://github.com/apache/arrow-datafusion/pull/8153) (alamb) +- Update sqllogictest requirement from 0.18.0 to 0.19.0 [#8163](https://github.com/apache/arrow-datafusion/pull/8163) (dependabot[bot]) +- feat: fill missing values with NULLs while inserting [#8146](https://github.com/apache/arrow-datafusion/pull/8146) (jonahgao) +- Introduce return type for aggregate sum [#8141](https://github.com/apache/arrow-datafusion/pull/8141) (jayzhan211) +- implement range/generate_series func [#8140](https://github.com/apache/arrow-datafusion/pull/8140) (Veeupup) +- Encapsulate `EquivalenceClass` into a struct [#8034](https://github.com/apache/arrow-datafusion/pull/8034) (alamb) +- Revert "Minor: remove unnecessary projection in `single_distinct_to_g… [#8176](https://github.com/apache/arrow-datafusion/pull/8176) (NGA-TRAN) +- Preserve all of the valid orderings during merging. [#8169](https://github.com/apache/arrow-datafusion/pull/8169) (mustafasrepo) +- Make fields of `ScalarUDF` , `AggregateUDF` and `WindowUDF` non `pub` [#8079](https://github.com/apache/arrow-datafusion/pull/8079) (alamb) +- Fix logical conflicts [#8187](https://github.com/apache/arrow-datafusion/pull/8187) (tustvold) +- Minor: Update JoinHashMap comment example to make it clearer [#8154](https://github.com/apache/arrow-datafusion/pull/8154) (alamb) +- Implement StreamTable and StreamTableProvider (#7994) [#8021](https://github.com/apache/arrow-datafusion/pull/8021) (tustvold) +- [MINOR]: Remove unused Results [#8189](https://github.com/apache/arrow-datafusion/pull/8189) (mustafasrepo) +- Minor: clean up the code based on clippy [#8179](https://github.com/apache/arrow-datafusion/pull/8179) (Weijun-H) +- Minor: simplify filter statistics code [#8174](https://github.com/apache/arrow-datafusion/pull/8174) (alamb) +- Replace macro with function for `array_position` and `array_positions` [#8170](https://github.com/apache/arrow-datafusion/pull/8170) (jayzhan211) +- Add Library Guide for User Defined Functions: Window/Aggregate [#8171](https://github.com/apache/arrow-datafusion/pull/8171) (Veeupup) +- Add more stream docs [#8192](https://github.com/apache/arrow-datafusion/pull/8192) (tustvold) +- Implement func `array_pop_front` [#8142](https://github.com/apache/arrow-datafusion/pull/8142) (Veeupup) +- Moving arrow_files SQL tests to sqllogictest [#8217](https://github.com/apache/arrow-datafusion/pull/8217) (edmondop) +- fix regression in the use of name in ProjectionPushdown [#8219](https://github.com/apache/arrow-datafusion/pull/8219) (alamb) +- [MINOR]: Fix column indices in the planning tests [#8191](https://github.com/apache/arrow-datafusion/pull/8191) (mustafasrepo) +- Remove unnecessary reassignment [#8232](https://github.com/apache/arrow-datafusion/pull/8232) (qrilka) +- Update itertools requirement from 0.11 to 0.12 [#8233](https://github.com/apache/arrow-datafusion/pull/8233) (crepererum) +- Port tests in subqueries.rs to sqllogictest 
[#8231](https://github.com/apache/arrow-datafusion/pull/8231) (PsiACE) +- feat: make FixedSizeList scalar also an ArrayRef [#8221](https://github.com/apache/arrow-datafusion/pull/8221) (wjones127) +- Add versions to datafusion dependencies [#8238](https://github.com/apache/arrow-datafusion/pull/8238) (andygrove) +- feat: to_array_of_size for ScalarValue::FixedSizeList [#8225](https://github.com/apache/arrow-datafusion/pull/8225) (wjones127) +- feat:implement calcite style 'levenshtein' string function [#8168](https://github.com/apache/arrow-datafusion/pull/8168) (Syleechan) +- feat: roundtrip FixedSizeList Scalar to protobuf [#8239](https://github.com/apache/arrow-datafusion/pull/8239) (wjones127) +- Update prost-build requirement from =0.12.1 to =0.12.2 [#8244](https://github.com/apache/arrow-datafusion/pull/8244) (dependabot[bot]) +- Minor: Port tests in `displayable.rs` to sqllogictest [#8246](https://github.com/apache/arrow-datafusion/pull/8246) (Weijun-H) +- Minor: add `with_estimated_selectivity ` to Precision [#8177](https://github.com/apache/arrow-datafusion/pull/8177) (alamb) +- fix: Timestamp with timezone not considered `join on` [#8150](https://github.com/apache/arrow-datafusion/pull/8150) (ACking-you) +- Replace macro in array_array to remove duplicate codes [#8252](https://github.com/apache/arrow-datafusion/pull/8252) (Veeupup) +- Port tests in projection.rs to sqllogictest [#8240](https://github.com/apache/arrow-datafusion/pull/8240) (PsiACE) +- Introduce `array_except` function [#8135](https://github.com/apache/arrow-datafusion/pull/8135) (jayzhan211) +- Port tests in `describe.rs` to sqllogictest [#8242](https://github.com/apache/arrow-datafusion/pull/8242) (Asura7969) +- Remove FileWriterMode and ListingTableInsertMode (#7994) [#8017](https://github.com/apache/arrow-datafusion/pull/8017) (tustvold) +- Minor: clean up the code based on Clippy [#8257](https://github.com/apache/arrow-datafusion/pull/8257) (Weijun-H) +- Update arrow 49.0.0 and object_store 0.8.0 [#8029](https://github.com/apache/arrow-datafusion/pull/8029) (tustvold) +- feat: impl the basic `string_agg` function [#8148](https://github.com/apache/arrow-datafusion/pull/8148) (haohuaijin) +- Minor: Make schema of grouping set columns nullable [#8248](https://github.com/apache/arrow-datafusion/pull/8248) (markusa380) +- feat: support simplifying BinaryExpr with arbitrary guarantees in GuaranteeRewriter [#8256](https://github.com/apache/arrow-datafusion/pull/8256) (wjones127) +- Making stream joins extensible: A new Trait implementation for SHJ [#8234](https://github.com/apache/arrow-datafusion/pull/8234) (metesynnada) +- Don't Canonicalize Filesystem Paths in ListingTableUrl / support new external tables for files that do not (yet) exist [#8014](https://github.com/apache/arrow-datafusion/pull/8014) (tustvold) +- Minor: Add sql level test for inserting into non-existent directory [#8278](https://github.com/apache/arrow-datafusion/pull/8278) (alamb) +- Replace `array_has/array_has_all/array_has_any` macro to remove duplicate code [#8263](https://github.com/apache/arrow-datafusion/pull/8263) (Veeupup) +- Fix bug in field level metadata matching code [#8286](https://github.com/apache/arrow-datafusion/pull/8286) (alamb) +- Refactor Interval Arithmetic Updates [#8276](https://github.com/apache/arrow-datafusion/pull/8276) (berkaysynnada) +- [MINOR]: Remove unecessary orderings from the final plan [#8289](https://github.com/apache/arrow-datafusion/pull/8289) (mustafasrepo) +- consistent logical & physical `NTILE` return 
types [#8270](https://github.com/apache/arrow-datafusion/pull/8270) (korowa) +- make `array_union`/`array_except`/`array_intersect` handle empty/null arrays rightly [#8269](https://github.com/apache/arrow-datafusion/pull/8269) (Veeupup) +- improve file path validation when reading parquet [#8267](https://github.com/apache/arrow-datafusion/pull/8267) (Weijun-H) +- [Benchmarks] Make `partitions` default to number of cores instead of 2 [#8292](https://github.com/apache/arrow-datafusion/pull/8292) (andygrove) +- Update prost-build requirement from =0.12.2 to =0.12.3 [#8298](https://github.com/apache/arrow-datafusion/pull/8298) (dependabot[bot]) +- Fix Display for List [#8261](https://github.com/apache/arrow-datafusion/pull/8261) (jayzhan211) +- feat: support customizing column default values for inserting [#8283](https://github.com/apache/arrow-datafusion/pull/8283) (jonahgao) +- support `LargeList` for `arrow_cast`, support `ScalarValue::LargeList` [#8290](https://github.com/apache/arrow-datafusion/pull/8290) (Weijun-H) +- Minor: remove useless clone based on Clippy [#8300](https://github.com/apache/arrow-datafusion/pull/8300) (Weijun-H) +- Calculate ordering equivalence for expressions (rather than just columns) [#8281](https://github.com/apache/arrow-datafusion/pull/8281) (mustafasrepo) +- Fix sqllogictests link in contributor-guide/index.md [#8314](https://github.com/apache/arrow-datafusion/pull/8314) (qrilka) +- Refactor: Unify `Expr::ScalarFunction` and `Expr::ScalarUDF`, introduce unresolved functions by name [#8258](https://github.com/apache/arrow-datafusion/pull/8258) (2010YOUY01) +- Support no distinct aggregate sum/min/max in `single_distinct_to_group_by` rule [#8266](https://github.com/apache/arrow-datafusion/pull/8266) (haohuaijin) +- feat:implement sql style 'substr_index' string function [#8272](https://github.com/apache/arrow-datafusion/pull/8272) (Syleechan) +- Fixing issues with for timestamp literals [#8193](https://github.com/apache/arrow-datafusion/pull/8193) (comphead) +- Projection Pushdown over StreamingTableExec [#8299](https://github.com/apache/arrow-datafusion/pull/8299) (berkaysynnada) +- minor: fix documentation [#8323](https://github.com/apache/arrow-datafusion/pull/8323) (comphead) +- fix: wrong result of range function [#8313](https://github.com/apache/arrow-datafusion/pull/8313) (smallzhongfeng) +- Minor: rename parquet.rs to parquet/mod.rs [#8301](https://github.com/apache/arrow-datafusion/pull/8301) (alamb) +- refactor: output ordering [#8304](https://github.com/apache/arrow-datafusion/pull/8304) (QuenKar) +- Update substrait requirement from 0.19.0 to 0.20.0 [#8339](https://github.com/apache/arrow-datafusion/pull/8339) (dependabot[bot]) +- Port tests in `aggregates.rs` to sqllogictest [#8316](https://github.com/apache/arrow-datafusion/pull/8316) (edmondop) +- Library Guide: Add Using the DataFrame API [#8319](https://github.com/apache/arrow-datafusion/pull/8319) (Veeupup) +- Port tests in limit.rs to sqllogictest [#8315](https://github.com/apache/arrow-datafusion/pull/8315) (zhangxffff) +- move array function unit_tests to sqllogictest [#8332](https://github.com/apache/arrow-datafusion/pull/8332) (Veeupup) +- NTH_VALUE reverse support [#8327](https://github.com/apache/arrow-datafusion/pull/8327) (mustafasrepo) +- Optimize Projections during Logical Plan [#8340](https://github.com/apache/arrow-datafusion/pull/8340) (mustafasrepo) +- [MINOR]: Move merge projections tests to under optimize projections 
[#8352](https://github.com/apache/arrow-datafusion/pull/8352) (mustafasrepo) +- Add `quote` and `escape` attributes to create csv external table [#8351](https://github.com/apache/arrow-datafusion/pull/8351) (Asura7969) +- Minor: Add DataFrame test [#8341](https://github.com/apache/arrow-datafusion/pull/8341) (alamb) +- Minor: clean up the code based on Clippy [#8359](https://github.com/apache/arrow-datafusion/pull/8359) (Weijun-H) +- Minor: Make it easier to work with Expr::ScalarFunction [#8350](https://github.com/apache/arrow-datafusion/pull/8350) (alamb) +- Minor: Move some datafusion-optimizer::utils down to datafusion-expr::utils [#8354](https://github.com/apache/arrow-datafusion/pull/8354) (Jesse-Bakker) +- Minor: Make `BuiltInScalarFunction::alias` a method [#8349](https://github.com/apache/arrow-datafusion/pull/8349) (alamb) +- Extract parquet statistics to its own module, add tests [#8294](https://github.com/apache/arrow-datafusion/pull/8294) (alamb) +- feat:implement sql style 'find_in_set' string function [#8328](https://github.com/apache/arrow-datafusion/pull/8328) (Syleechan) +- Support LargeUtf8 to Temporal Coercion [#8357](https://github.com/apache/arrow-datafusion/pull/8357) (jayzhan211) +- Refactor aggregate function handling [#8358](https://github.com/apache/arrow-datafusion/pull/8358) (Weijun-H) +- Implement Aliases for ScalarUDF [#8360](https://github.com/apache/arrow-datafusion/pull/8360) (Veeupup) +- Minor: Remove unnecessary name field in `ScalarFunctionDefintion` [#8365](https://github.com/apache/arrow-datafusion/pull/8365) (alamb) +- feat: support `LargeList` in `array_empty` [#8321](https://github.com/apache/arrow-datafusion/pull/8321) (Weijun-H) +- Double type argument for to_timestamp function [#8159](https://github.com/apache/arrow-datafusion/pull/8159) (spaydar) +- Support User Defined Table Function [#8306](https://github.com/apache/arrow-datafusion/pull/8306) (Veeupup) +- Document timestamp input limits [#8369](https://github.com/apache/arrow-datafusion/pull/8369) (comphead) +- fix: make `ntile` work in some corner cases [#8371](https://github.com/apache/arrow-datafusion/pull/8371) (haohuaijin) +- Minor: Refactor array_union function to use a generic union_arrays function [#8381](https://github.com/apache/arrow-datafusion/pull/8381) (Weijun-H) +- Minor: Refactor function argument handling in `ScalarFunctionDefinition` [#8387](https://github.com/apache/arrow-datafusion/pull/8387) (Weijun-H) +- Materialize dictionaries in group keys [#8291](https://github.com/apache/arrow-datafusion/pull/8291) (qrilka) +- Rewrite `array_ndims` to fix List(Null) handling [#8320](https://github.com/apache/arrow-datafusion/pull/8320) (jayzhan211) +- Docs: Improve the documentation on `ScalarValue` [#8378](https://github.com/apache/arrow-datafusion/pull/8378) (alamb) +- Avoid concat for `array_replace` [#8337](https://github.com/apache/arrow-datafusion/pull/8337) (jayzhan211) +- add a summary table to benchmark compare output [#8399](https://github.com/apache/arrow-datafusion/pull/8399) (razeghi71) +- Refactors on TreeNode Implementations [#8395](https://github.com/apache/arrow-datafusion/pull/8395) (berkaysynnada) +- feat: support `LargeList` in `make_array` and `array_length` [#8121](https://github.com/apache/arrow-datafusion/pull/8121) (Weijun-H) +- remove `unalias` TableScan filters when create Physical Filter [#8404](https://github.com/apache/arrow-datafusion/pull/8404) (jackwener) +- Update custom-table-providers.md 
[#8409](https://github.com/apache/arrow-datafusion/pull/8409) (nickpoorman) +- fix transforming `LogicalPlan::Explain` use `TreeNode::transform` fails [#8400](https://github.com/apache/arrow-datafusion/pull/8400) (haohuaijin) +- Docs: Fix `array_except` documentation example error [#8407](https://github.com/apache/arrow-datafusion/pull/8407) (Asura7969) +- Support named query parameters [#8384](https://github.com/apache/arrow-datafusion/pull/8384) (Asura7969) +- Minor: Add installation link to README.md [#8389](https://github.com/apache/arrow-datafusion/pull/8389) (Weijun-H) +- Update code comment for the cases of regularized RANGE frame and add tests for ORDER BY cases with RANGE frame [#8410](https://github.com/apache/arrow-datafusion/pull/8410) (viirya) +- Minor: Add example with parameters to LogicalPlan [#8418](https://github.com/apache/arrow-datafusion/pull/8418) (alamb) +- Minor: Improve `PruningPredicate` documentation [#8394](https://github.com/apache/arrow-datafusion/pull/8394) (alamb) +- feat: ScalarValue from String [#8411](https://github.com/apache/arrow-datafusion/pull/8411) (QuenKar) +- Bump actions/labeler from 4.3.0 to 5.0.0 [#8422](https://github.com/apache/arrow-datafusion/pull/8422) (dependabot[bot]) +- Update sqlparser requirement from 0.39.0 to 0.40.0 [#8338](https://github.com/apache/arrow-datafusion/pull/8338) (dependabot[bot]) +- feat: support `LargeList` for `array_has`, `array_has_all` and `array_has_any` [#8322](https://github.com/apache/arrow-datafusion/pull/8322) (Weijun-H) +- Union `schema` can't be a subset of the child schema [#8408](https://github.com/apache/arrow-datafusion/pull/8408) (jackwener) +- Move `PartitionSearchMode` into datafusion_physical_plan, rename to `InputOrderMode` [#8364](https://github.com/apache/arrow-datafusion/pull/8364) (alamb) +- Make filter selectivity for statistics configurable [#8243](https://github.com/apache/arrow-datafusion/pull/8243) (edmondop) +- fix: Changed labeler.yml to latest format [#8431](https://github.com/apache/arrow-datafusion/pull/8431) (viirya) +- Minor: Use `ScalarValue::from` impl for strings [#8429](https://github.com/apache/arrow-datafusion/pull/8429) (alamb) +- Support crossjoin in substrait. 
[#8427](https://github.com/apache/arrow-datafusion/pull/8427) (my-vegetable-has-exploded) +- Fix ambiguous reference when aliasing in combination with `ORDER BY` [#8425](https://github.com/apache/arrow-datafusion/pull/8425) (Asura7969) +- Minor: convert marcro `list-slice` and `slice` to function [#8424](https://github.com/apache/arrow-datafusion/pull/8424) (Weijun-H) +- Remove macro in iter_to_array for List [#8414](https://github.com/apache/arrow-datafusion/pull/8414) (jayzhan211) +- fix: Literal in `ORDER BY` window definition should not be an ordinal referring to relation column [#8419](https://github.com/apache/arrow-datafusion/pull/8419) (viirya) +- feat: customize column default values for external tables [#8415](https://github.com/apache/arrow-datafusion/pull/8415) (jonahgao) +- feat: Support `array_sort`(`list_sort`) [#8279](https://github.com/apache/arrow-datafusion/pull/8279) (Asura7969) +- Bugfix: Remove df-cli specific SQL statment options before executing with DataFusion [#8426](https://github.com/apache/arrow-datafusion/pull/8426) (devinjdangelo) +- Detect when filters on unique constraints make subqueries scalar [#8312](https://github.com/apache/arrow-datafusion/pull/8312) (Jesse-Bakker) +- Add alias check to optimize projections merge [#8438](https://github.com/apache/arrow-datafusion/pull/8438) (mustafasrepo) +- Fix PartialOrd for ScalarValue::List/FixSizeList/LargeList [#8253](https://github.com/apache/arrow-datafusion/pull/8253) (jayzhan211) +- Support parquet_metadata for datafusion-cli [#8413](https://github.com/apache/arrow-datafusion/pull/8413) (Veeupup) +- Fix bug in optimizing a nested count [#8459](https://github.com/apache/arrow-datafusion/pull/8459) (Dandandan) +- Bump actions/setup-python from 4 to 5 [#8449](https://github.com/apache/arrow-datafusion/pull/8449) (dependabot[bot]) +- fix: ORDER BY window definition should work on null literal [#8444](https://github.com/apache/arrow-datafusion/pull/8444) (viirya) +- flx clippy warnings [#8455](https://github.com/apache/arrow-datafusion/pull/8455) (waynexia) +- fix: RANGE frame for corner cases with empty ORDER BY clause should be treated as constant sort [#8445](https://github.com/apache/arrow-datafusion/pull/8445) (viirya) +- Preserve `dict_id` on `Field` during serde roundtrip [#8457](https://github.com/apache/arrow-datafusion/pull/8457) (avantgardnerio) +- feat: support `InterleaveExecNode` in the proto [#8460](https://github.com/apache/arrow-datafusion/pull/8460) (liukun4515) +- [BUG FIX]: Proper Empty Batch handling in window execution [#8466](https://github.com/apache/arrow-datafusion/pull/8466) (mustafasrepo) +- Minor: update `cast` [#8458](https://github.com/apache/arrow-datafusion/pull/8458) (Weijun-H) +- fix: don't unifies projection if expr is non-trival [#8454](https://github.com/apache/arrow-datafusion/pull/8454) (haohuaijin) +- Minor: Add new bloom filter predicate tests [#8433](https://github.com/apache/arrow-datafusion/pull/8433) (alamb) +- Add PRIMARY KEY Aggregate support to dataframe API [#8356](https://github.com/apache/arrow-datafusion/pull/8356) (mustafasrepo) +- Minor: refactor `data_trunc` to reduce duplicated code [#8430](https://github.com/apache/arrow-datafusion/pull/8430) (Weijun-H) +- Support array_distinct function. 
[#8268](https://github.com/apache/arrow-datafusion/pull/8268) (my-vegetable-has-exploded) +- Add primary key support to stream table [#8467](https://github.com/apache/arrow-datafusion/pull/8467) (mustafasrepo) +- Add `evaluate_demo` and `range_analysis_demo` to Expr examples [#8377](https://github.com/apache/arrow-datafusion/pull/8377) (alamb) +- Minor: fix function name typo [#8473](https://github.com/apache/arrow-datafusion/pull/8473) (Weijun-H) +- Minor: Fix comment typo in table.rs: s/indentical/identical/ [#8469](https://github.com/apache/arrow-datafusion/pull/8469) (KeunwooLee-at) +- Remove `define_array_slice` and reuse `array_slice` for `array_pop_front/back` [#8401](https://github.com/apache/arrow-datafusion/pull/8401) (jayzhan211) +- Minor: refactor `trim` to clean up duplicated code [#8434](https://github.com/apache/arrow-datafusion/pull/8434) (Weijun-H) +- Split `EmptyExec` into `PlaceholderRowExec` [#8446](https://github.com/apache/arrow-datafusion/pull/8446) (razeghi71) +- Enable non-uniform field type for structs created in DataFusion [#8463](https://github.com/apache/arrow-datafusion/pull/8463) (dlovell) +- Minor: Add multi ordering test for array agg order [#8439](https://github.com/apache/arrow-datafusion/pull/8439) (jayzhan211) +- Sort filenames when reading parquet to ensure consistent schema [#6629](https://github.com/apache/arrow-datafusion/pull/6629) (thomas-k-cameron) +- Minor: Improve comments in EnforceDistribution tests [#8474](https://github.com/apache/arrow-datafusion/pull/8474) (alamb) +- fix: support uppercase when parsing `Interval` [#8478](https://github.com/apache/arrow-datafusion/pull/8478) (QuenKar) +- Better Equivalence (ordering and exact equivalence) Propagation through ProjectionExec [#8484](https://github.com/apache/arrow-datafusion/pull/8484) (mustafasrepo) diff --git a/docs/Cargo.toml b/docs/Cargo.toml index 4d01466924f9..813335e30f77 100644 --- a/docs/Cargo.toml +++ b/docs/Cargo.toml @@ -29,4 +29,4 @@ authors = { workspace = true } rust-version = "1.70" [dependencies] -datafusion = { path = "../datafusion/core", version = "33.0.0", default-features = false } +datafusion = { path = "../datafusion/core", version = "34.0.0", default-features = false } diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index d5a43e429e09..6fb5cc4ca870 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -64,7 +64,7 @@ Environment variables are read during `SessionConfig` initialisation so they mus | datafusion.execution.parquet.statistics_enabled | NULL | Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting | | datafusion.execution.parquet.max_statistics_size | NULL | Sets max statistics size for any column. 
If NULL, uses default parquet writer setting | | datafusion.execution.parquet.max_row_group_size | 1048576 | Sets maximum number of rows in a row group | -| datafusion.execution.parquet.created_by | datafusion version 33.0.0 | Sets "created by" property | +| datafusion.execution.parquet.created_by | datafusion version 34.0.0 | Sets "created by" property | | datafusion.execution.parquet.column_index_truncate_length | NULL | Sets column index truncate length | | datafusion.execution.parquet.data_page_row_count_limit | 18446744073709551615 | Sets best effort maximum number of rows in data page | | datafusion.execution.parquet.encoding | NULL | Sets default encoding for any column Valid values are: plain, plain_dictionary, rle, bit_packed, delta_binary_packed, delta_length_byte_array, delta_byte_array, rle_dictionary, and byte_stream_split. These values are not case sensitive. If NULL, uses default parquet writer setting | From cf2de9b22c8e7c4c4b80bdb82ae1353e36a5af51 Mon Sep 17 00:00:00 2001 From: Ruihang Xia Date: Thu, 14 Dec 2023 10:42:16 +0800 Subject: [PATCH 14/22] refactor: use ExprBuilder to consume substrait expr and use macro to generate error (#8515) * refactor: use ExprBuilder to consume substrait expr Signed-off-by: Ruihang Xia * use macro to generate error Signed-off-by: Ruihang Xia --------- Signed-off-by: Ruihang Xia --- datafusion/common/src/error.rs | 3 + .../substrait/src/logical_plan/consumer.rs | 324 +++++++++--------- 2 files changed, 158 insertions(+), 169 deletions(-) diff --git a/datafusion/common/src/error.rs b/datafusion/common/src/error.rs index 4ae30ae86cdd..56b52bd73f9b 100644 --- a/datafusion/common/src/error.rs +++ b/datafusion/common/src/error.rs @@ -517,6 +517,9 @@ make_error!(not_impl_err, not_impl_datafusion_err, NotImplemented); // Exposes a macro to create `DataFusionError::Execution` make_error!(exec_err, exec_datafusion_err, Execution); +// Exposes a macro to create `DataFusionError::Substrait` +make_error!(substrait_err, substrait_datafusion_err, Substrait); + // Exposes a macro to create `DataFusionError::SQL` #[macro_export] macro_rules! sql_err { diff --git a/datafusion/substrait/src/logical_plan/consumer.rs b/datafusion/substrait/src/logical_plan/consumer.rs index ffc9d094ab91..f6b556fc6448 100644 --- a/datafusion/substrait/src/logical_plan/consumer.rs +++ b/datafusion/substrait/src/logical_plan/consumer.rs @@ -17,7 +17,9 @@ use async_recursion::async_recursion; use datafusion::arrow::datatypes::{DataType, Field, TimeUnit}; -use datafusion::common::{not_impl_err, DFField, DFSchema, DFSchemaRef}; +use datafusion::common::{ + not_impl_err, substrait_datafusion_err, substrait_err, DFField, DFSchema, DFSchemaRef, +}; use datafusion::execution::FunctionRegistry; use datafusion::logical_expr::{ @@ -73,16 +75,7 @@ use crate::variation_const::{ enum ScalarFunctionType { Builtin(BuiltinScalarFunction), Op(Operator), - /// [Expr::Not] - Not, - /// [Expr::Like] Used for filtering rows based on the given wildcard pattern. 
Case sensitive - Like, - /// [Expr::Like] Case insensitive operator counterpart of `Like` - ILike, - /// [Expr::IsNull] - IsNull, - /// [Expr::IsNotNull] - IsNotNull, + Expr(BuiltinExprBuilder), } pub fn name_to_op(name: &str) -> Result { @@ -127,14 +120,11 @@ fn scalar_function_type_from_str(name: &str) -> Result { return Ok(ScalarFunctionType::Builtin(fun)); } - match name { - "not" => Ok(ScalarFunctionType::Not), - "like" => Ok(ScalarFunctionType::Like), - "ilike" => Ok(ScalarFunctionType::ILike), - "is_null" => Ok(ScalarFunctionType::IsNull), - "is_not_null" => Ok(ScalarFunctionType::IsNotNull), - others => not_impl_err!("Unsupported function name: {others:?}"), + if let Some(builder) = BuiltinExprBuilder::try_from_name(name) { + return Ok(ScalarFunctionType::Expr(builder)); } + + not_impl_err!("Unsupported function name: {name:?}") } fn split_eq_and_noneq_join_predicate_with_nulls_equality( @@ -519,9 +509,7 @@ pub async fn from_substrait_rel( }, Some(RelType::ExtensionLeaf(extension)) => { let Some(ext_detail) = &extension.detail else { - return Err(DataFusionError::Substrait( - "Unexpected empty detail in ExtensionLeafRel".to_string(), - )); + return substrait_err!("Unexpected empty detail in ExtensionLeafRel"); }; let plan = ctx .state() @@ -531,18 +519,16 @@ pub async fn from_substrait_rel( } Some(RelType::ExtensionSingle(extension)) => { let Some(ext_detail) = &extension.detail else { - return Err(DataFusionError::Substrait( - "Unexpected empty detail in ExtensionSingleRel".to_string(), - )); + return substrait_err!("Unexpected empty detail in ExtensionSingleRel"); }; let plan = ctx .state() .serializer_registry() .deserialize_logical_plan(&ext_detail.type_url, &ext_detail.value)?; let Some(input_rel) = &extension.input else { - return Err(DataFusionError::Substrait( - "ExtensionSingleRel doesn't contains input rel. Try use ExtensionLeafRel instead".to_string() - )); + return substrait_err!( + "ExtensionSingleRel doesn't contains input rel. Try use ExtensionLeafRel instead" + ); }; let input_plan = from_substrait_rel(ctx, input_rel, extensions).await?; let plan = plan.from_template(&plan.expressions(), &[input_plan]); @@ -550,9 +536,7 @@ pub async fn from_substrait_rel( } Some(RelType::ExtensionMulti(extension)) => { let Some(ext_detail) = &extension.detail else { - return Err(DataFusionError::Substrait( - "Unexpected empty detail in ExtensionSingleRel".to_string(), - )); + return substrait_err!("Unexpected empty detail in ExtensionSingleRel"); }; let plan = ctx .state() @@ -881,64 +865,8 @@ pub async fn from_substrait_rex( ), } } - ScalarFunctionType::Not => { - let arg = f.arguments.first().ok_or_else(|| { - DataFusionError::Substrait( - "expect one argument for `NOT` expr".to_string(), - ) - })?; - match &arg.arg_type { - Some(ArgType::Value(e)) => { - let expr = from_substrait_rex(e, input_schema, extensions) - .await? - .as_ref() - .clone(); - Ok(Arc::new(Expr::Not(Box::new(expr)))) - } - _ => not_impl_err!("Invalid arguments for Not expression"), - } - } - ScalarFunctionType::Like => { - make_datafusion_like(false, f, input_schema, extensions).await - } - ScalarFunctionType::ILike => { - make_datafusion_like(true, f, input_schema, extensions).await - } - ScalarFunctionType::IsNull => { - let arg = f.arguments.first().ok_or_else(|| { - DataFusionError::Substrait( - "expect one argument for `IS NULL` expr".to_string(), - ) - })?; - match &arg.arg_type { - Some(ArgType::Value(e)) => { - let expr = from_substrait_rex(e, input_schema, extensions) - .await? 
- .as_ref() - .clone(); - Ok(Arc::new(Expr::IsNull(Box::new(expr)))) - } - _ => not_impl_err!("Invalid arguments for IS NULL expression"), - } - } - ScalarFunctionType::IsNotNull => { - let arg = f.arguments.first().ok_or_else(|| { - DataFusionError::Substrait( - "expect one argument for `IS NOT NULL` expr".to_string(), - ) - })?; - match &arg.arg_type { - Some(ArgType::Value(e)) => { - let expr = from_substrait_rex(e, input_schema, extensions) - .await? - .as_ref() - .clone(); - Ok(Arc::new(Expr::IsNotNull(Box::new(expr)))) - } - _ => { - not_impl_err!("Invalid arguments for IS NOT NULL expression") - } - } + ScalarFunctionType::Expr(builder) => { + builder.build(f, input_schema, extensions).await } } } @@ -960,9 +888,7 @@ pub async fn from_substrait_rex( ), from_substrait_type(output_type)?, )))), - None => Err(DataFusionError::Substrait( - "Cast experssion without output type is not allowed".to_string(), - )), + None => substrait_err!("Cast experssion without output type is not allowed"), }, Some(RexType::WindowFunction(window)) => { let fun = match extensions.get(&window.function_reference) { @@ -1087,9 +1013,7 @@ fn from_substrait_type(dt: &substrait::proto::Type) -> Result { r#type::Kind::List(list) => { let inner_type = from_substrait_type(list.r#type.as_ref().ok_or_else(|| { - DataFusionError::Substrait( - "List type must have inner type".to_string(), - ) + substrait_datafusion_err!("List type must have inner type") })?)?; let field = Arc::new(Field::new("list_item", inner_type, true)); match list.type_variation_reference { @@ -1141,9 +1065,7 @@ fn from_substrait_bound( } } }, - None => Err(DataFusionError::Substrait( - "WindowFunction missing Substrait Bound kind".to_string(), - )), + None => substrait_err!("WindowFunction missing Substrait Bound kind"), }, None => { if is_lower { @@ -1162,36 +1084,28 @@ pub(crate) fn from_substrait_literal(lit: &Literal) -> Result { DEFAULT_TYPE_REF => ScalarValue::Int8(Some(*n as i8)), UNSIGNED_INTEGER_TYPE_REF => ScalarValue::UInt8(Some(*n as u8)), others => { - return Err(DataFusionError::Substrait(format!( - "Unknown type variation reference {others}", - ))); + return substrait_err!("Unknown type variation reference {others}"); } }, Some(LiteralType::I16(n)) => match lit.type_variation_reference { DEFAULT_TYPE_REF => ScalarValue::Int16(Some(*n as i16)), UNSIGNED_INTEGER_TYPE_REF => ScalarValue::UInt16(Some(*n as u16)), others => { - return Err(DataFusionError::Substrait(format!( - "Unknown type variation reference {others}", - ))); + return substrait_err!("Unknown type variation reference {others}"); } }, Some(LiteralType::I32(n)) => match lit.type_variation_reference { DEFAULT_TYPE_REF => ScalarValue::Int32(Some(*n)), UNSIGNED_INTEGER_TYPE_REF => ScalarValue::UInt32(Some(*n as u32)), others => { - return Err(DataFusionError::Substrait(format!( - "Unknown type variation reference {others}", - ))); + return substrait_err!("Unknown type variation reference {others}"); } }, Some(LiteralType::I64(n)) => match lit.type_variation_reference { DEFAULT_TYPE_REF => ScalarValue::Int64(Some(*n)), UNSIGNED_INTEGER_TYPE_REF => ScalarValue::UInt64(Some(*n as u64)), others => { - return Err(DataFusionError::Substrait(format!( - "Unknown type variation reference {others}", - ))); + return substrait_err!("Unknown type variation reference {others}"); } }, Some(LiteralType::Fp32(f)) => ScalarValue::Float32(Some(*f)), @@ -1202,9 +1116,7 @@ pub(crate) fn from_substrait_literal(lit: &Literal) -> Result { TIMESTAMP_MICRO_TYPE_REF => 
ScalarValue::TimestampMicrosecond(Some(*t), None), TIMESTAMP_NANO_TYPE_REF => ScalarValue::TimestampNanosecond(Some(*t), None), others => { - return Err(DataFusionError::Substrait(format!( - "Unknown type variation reference {others}", - ))); + return substrait_err!("Unknown type variation reference {others}"); } }, Some(LiteralType::Date(d)) => ScalarValue::Date32(Some(*d)), @@ -1212,38 +1124,30 @@ pub(crate) fn from_substrait_literal(lit: &Literal) -> Result { DEFAULT_CONTAINER_TYPE_REF => ScalarValue::Utf8(Some(s.clone())), LARGE_CONTAINER_TYPE_REF => ScalarValue::LargeUtf8(Some(s.clone())), others => { - return Err(DataFusionError::Substrait(format!( - "Unknown type variation reference {others}", - ))); + return substrait_err!("Unknown type variation reference {others}"); } }, Some(LiteralType::Binary(b)) => match lit.type_variation_reference { DEFAULT_CONTAINER_TYPE_REF => ScalarValue::Binary(Some(b.clone())), LARGE_CONTAINER_TYPE_REF => ScalarValue::LargeBinary(Some(b.clone())), others => { - return Err(DataFusionError::Substrait(format!( - "Unknown type variation reference {others}", - ))); + return substrait_err!("Unknown type variation reference {others}"); } }, Some(LiteralType::FixedBinary(b)) => { ScalarValue::FixedSizeBinary(b.len() as _, Some(b.clone())) } Some(LiteralType::Decimal(d)) => { - let value: [u8; 16] = - d.value - .clone() - .try_into() - .or(Err(DataFusionError::Substrait( - "Failed to parse decimal value".to_string(), - )))?; + let value: [u8; 16] = d + .value + .clone() + .try_into() + .or(substrait_err!("Failed to parse decimal value"))?; let p = d.precision.try_into().map_err(|e| { - DataFusionError::Substrait(format!( - "Failed to parse decimal precision: {e}" - )) + substrait_datafusion_err!("Failed to parse decimal precision: {e}") })?; let s = d.scale.try_into().map_err(|e| { - DataFusionError::Substrait(format!("Failed to parse decimal scale: {e}")) + substrait_datafusion_err!("Failed to parse decimal scale: {e}") })?; ScalarValue::Decimal128( Some(std::primitive::i128::from_le_bytes(value)), @@ -1341,50 +1245,132 @@ fn from_substrait_null(null_type: &Type) -> Result { } } -async fn make_datafusion_like( - case_insensitive: bool, - f: &ScalarFunction, - input_schema: &DFSchema, - extensions: &HashMap, -) -> Result> { - let fn_name = if case_insensitive { "ILIKE" } else { "LIKE" }; - if f.arguments.len() != 3 { - return not_impl_err!("Expect three arguments for `{fn_name}` expr"); +/// Build [`Expr`] from its name and required inputs. +struct BuiltinExprBuilder { + expr_name: String, +} + +impl BuiltinExprBuilder { + pub fn try_from_name(name: &str) -> Option { + match name { + "not" | "like" | "ilike" | "is_null" | "is_not_null" => Some(Self { + expr_name: name.to_string(), + }), + _ => None, + } } - let Some(ArgType::Value(expr_substrait)) = &f.arguments[0].arg_type else { - return not_impl_err!("Invalid arguments type for `{fn_name}` expr"); - }; - let expr = from_substrait_rex(expr_substrait, input_schema, extensions) - .await? - .as_ref() - .clone(); - let Some(ArgType::Value(pattern_substrait)) = &f.arguments[1].arg_type else { - return not_impl_err!("Invalid arguments type for `{fn_name}` expr"); - }; - let pattern = from_substrait_rex(pattern_substrait, input_schema, extensions) - .await? 
- .as_ref() - .clone(); - let Some(ArgType::Value(escape_char_substrait)) = &f.arguments[2].arg_type else { - return not_impl_err!("Invalid arguments type for `{fn_name}` expr"); - }; - let escape_char_expr = - from_substrait_rex(escape_char_substrait, input_schema, extensions) + pub async fn build( + self, + f: &ScalarFunction, + input_schema: &DFSchema, + extensions: &HashMap, + ) -> Result> { + match self.expr_name.as_str() { + "not" => Self::build_not_expr(f, input_schema, extensions).await, + "like" => Self::build_like_expr(false, f, input_schema, extensions).await, + "ilike" => Self::build_like_expr(true, f, input_schema, extensions).await, + "is_null" => { + Self::build_is_null_expr(false, f, input_schema, extensions).await + } + "is_not_null" => { + Self::build_is_null_expr(true, f, input_schema, extensions).await + } + _ => { + not_impl_err!("Unsupported builtin expression: {}", self.expr_name) + } + } + } + + async fn build_not_expr( + f: &ScalarFunction, + input_schema: &DFSchema, + extensions: &HashMap, + ) -> Result> { + if f.arguments.len() != 1 { + return not_impl_err!("Expect one argument for `NOT` expr"); + } + let Some(ArgType::Value(expr_substrait)) = &f.arguments[0].arg_type else { + return not_impl_err!("Invalid arguments type for `NOT` expr"); + }; + let expr = from_substrait_rex(expr_substrait, input_schema, extensions) .await? .as_ref() .clone(); - let Expr::Literal(ScalarValue::Utf8(escape_char)) = escape_char_expr else { - return Err(DataFusionError::Substrait(format!( - "Expect Utf8 literal for escape char, but found {escape_char_expr:?}", - ))); - }; + Ok(Arc::new(Expr::Not(Box::new(expr)))) + } + + async fn build_like_expr( + case_insensitive: bool, + f: &ScalarFunction, + input_schema: &DFSchema, + extensions: &HashMap, + ) -> Result> { + let fn_name = if case_insensitive { "ILIKE" } else { "LIKE" }; + if f.arguments.len() != 3 { + return not_impl_err!("Expect three arguments for `{fn_name}` expr"); + } + + let Some(ArgType::Value(expr_substrait)) = &f.arguments[0].arg_type else { + return not_impl_err!("Invalid arguments type for `{fn_name}` expr"); + }; + let expr = from_substrait_rex(expr_substrait, input_schema, extensions) + .await? + .as_ref() + .clone(); + let Some(ArgType::Value(pattern_substrait)) = &f.arguments[1].arg_type else { + return not_impl_err!("Invalid arguments type for `{fn_name}` expr"); + }; + let pattern = from_substrait_rex(pattern_substrait, input_schema, extensions) + .await? + .as_ref() + .clone(); + let Some(ArgType::Value(escape_char_substrait)) = &f.arguments[2].arg_type else { + return not_impl_err!("Invalid arguments type for `{fn_name}` expr"); + }; + let escape_char_expr = + from_substrait_rex(escape_char_substrait, input_schema, extensions) + .await? 
+ .as_ref() + .clone(); + let Expr::Literal(ScalarValue::Utf8(escape_char)) = escape_char_expr else { + return substrait_err!( + "Expect Utf8 literal for escape char, but found {escape_char_expr:?}" + ); + }; - Ok(Arc::new(Expr::Like(Like { - negated: false, - expr: Box::new(expr), - pattern: Box::new(pattern), - escape_char: escape_char.map(|c| c.chars().next().unwrap()), - case_insensitive, - }))) + Ok(Arc::new(Expr::Like(Like { + negated: false, + expr: Box::new(expr), + pattern: Box::new(pattern), + escape_char: escape_char.map(|c| c.chars().next().unwrap()), + case_insensitive, + }))) + } + + async fn build_is_null_expr( + is_not: bool, + f: &ScalarFunction, + input_schema: &DFSchema, + extensions: &HashMap, + ) -> Result> { + let fn_name = if is_not { "IS NOT NULL" } else { "IS NULL" }; + let arg = f.arguments.first().ok_or_else(|| { + substrait_datafusion_err!("expect one argument for `{fn_name}` expr") + })?; + match &arg.arg_type { + Some(ArgType::Value(e)) => { + let expr = from_substrait_rex(e, input_schema, extensions) + .await? + .as_ref() + .clone(); + if is_not { + Ok(Arc::new(Expr::IsNotNull(Box::new(expr)))) + } else { + Ok(Arc::new(Expr::IsNull(Box::new(expr)))) + } + } + _ => substrait_err!("Invalid arguments for `{fn_name}` expression"), + } + } } From 79c17e3f1a9ddc95ad787963524b8c702548ac29 Mon Sep 17 00:00:00 2001 From: Mustafa Akur <106137913+mustafasrepo@users.noreply.github.com> Date: Thu, 14 Dec 2023 09:09:51 +0300 Subject: [PATCH 15/22] Make tests deterministic (#8525) --- .../sqllogictest/test_files/distinct_on.slt | 8 ++--- .../sqllogictest/test_files/groupby.slt | 32 +++++++++---------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/datafusion/sqllogictest/test_files/distinct_on.slt b/datafusion/sqllogictest/test_files/distinct_on.slt index 8a36b49b98c6..9a7117b69b99 100644 --- a/datafusion/sqllogictest/test_files/distinct_on.slt +++ b/datafusion/sqllogictest/test_files/distinct_on.slt @@ -38,9 +38,9 @@ LOCATION '../../testing/data/csv/aggregate_test_100.csv' # Basic example: distinct on the first column project the second one, and # order by the third query TI -SELECT DISTINCT ON (c1) c1, c2 FROM aggregate_test_100 ORDER BY c1, c3; +SELECT DISTINCT ON (c1) c1, c2 FROM aggregate_test_100 ORDER BY c1, c3, c9; ---- -a 5 +a 4 b 4 c 2 d 1 @@ -48,7 +48,7 @@ e 3 # Basic example + reverse order of the selected column query TI -SELECT DISTINCT ON (c1) c1, c2 FROM aggregate_test_100 ORDER BY c1, c3 DESC; +SELECT DISTINCT ON (c1) c1, c2 FROM aggregate_test_100 ORDER BY c1, c3 DESC, c9; ---- a 1 b 5 @@ -58,7 +58,7 @@ e 1 # Basic example + reverse order of the ON column query TI -SELECT DISTINCT ON (c1) c1, c2 FROM aggregate_test_100 ORDER BY c1 DESC, c3; +SELECT DISTINCT ON (c1) c1, c2 FROM aggregate_test_100 ORDER BY c1 DESC, c3, c9; ---- e 3 d 1 diff --git a/datafusion/sqllogictest/test_files/groupby.slt b/datafusion/sqllogictest/test_files/groupby.slt index 8ed5245ef09b..b915c439059b 100644 --- a/datafusion/sqllogictest/test_files/groupby.slt +++ b/datafusion/sqllogictest/test_files/groupby.slt @@ -2329,15 +2329,15 @@ ProjectionExec: expr=[country@0 as country, ARRAY_AGG(s.amount) ORDER BY [s.amou ----SortExec: expr=[amount@1 DESC] ------MemoryExec: partitions=1, partition_sizes=[1] -query T?R +query T?R rowsort SELECT s.country, ARRAY_AGG(s.amount ORDER BY s.amount DESC) AS amounts, SUM(s.amount) AS sum1 FROM sales_global AS s GROUP BY s.country ---- FRA [200.0, 50.0] 250 -TUR [100.0, 75.0] 175 GRC [80.0, 30.0] 110 +TUR [100.0, 75.0] 175 # 
test_ordering_sensitive_aggregation3 # When different aggregators have conflicting requirements, we cannot satisfy all of them in current implementation. @@ -2373,7 +2373,7 @@ ProjectionExec: expr=[country@0 as country, ARRAY_AGG(s.amount) ORDER BY [s.amou ----SortExec: expr=[country@0 ASC NULLS LAST,amount@1 DESC] ------MemoryExec: partitions=1, partition_sizes=[1] -query T?R +query T?R rowsort SELECT s.country, ARRAY_AGG(s.amount ORDER BY s.amount DESC) AS amounts, SUM(s.amount) AS sum1 FROM (SELECT * @@ -2409,7 +2409,7 @@ ProjectionExec: expr=[country@0 as country, zip_code@1 as zip_code, ARRAY_AGG(s. ----SortExec: expr=[country@1 ASC NULLS LAST,amount@2 DESC] ------MemoryExec: partitions=1, partition_sizes=[1] -query TI?R +query TI?R rowsort SELECT s.country, s.zip_code, ARRAY_AGG(s.amount ORDER BY s.amount DESC) AS amounts, SUM(s.amount) AS sum1 FROM (SELECT * @@ -2445,7 +2445,7 @@ ProjectionExec: expr=[country@0 as country, ARRAY_AGG(s.amount) ORDER BY [s.coun ----SortExec: expr=[country@0 ASC NULLS LAST] ------MemoryExec: partitions=1, partition_sizes=[1] -query T?R +query T?R rowsort SELECT s.country, ARRAY_AGG(s.amount ORDER BY s.amount DESC) AS amounts, SUM(s.amount) AS sum1 FROM (SELECT * @@ -2480,7 +2480,7 @@ ProjectionExec: expr=[country@0 as country, ARRAY_AGG(s.amount) ORDER BY [s.coun ----SortExec: expr=[country@0 ASC NULLS LAST,amount@1 DESC] ------MemoryExec: partitions=1, partition_sizes=[1] -query T?R +query T?R rowsort SELECT s.country, ARRAY_AGG(s.amount ORDER BY s.country DESC, s.amount DESC) AS amounts, SUM(s.amount) AS sum1 FROM (SELECT * @@ -2512,7 +2512,7 @@ ProjectionExec: expr=[country@0 as country, ARRAY_AGG(sales_global.amount) ORDER ----SortExec: expr=[amount@1 DESC] ------MemoryExec: partitions=1, partition_sizes=[1] -query T?RR +query T?RR rowsort SELECT country, ARRAY_AGG(amount ORDER BY amount DESC) AS amounts, FIRST_VALUE(amount ORDER BY amount ASC) AS fv1, LAST_VALUE(amount ORDER BY amount DESC) AS fv2 @@ -2520,8 +2520,8 @@ SELECT country, ARRAY_AGG(amount ORDER BY amount DESC) AS amounts, GROUP BY country ---- FRA [200.0, 50.0] 50 50 -TUR [100.0, 75.0] 75 75 GRC [80.0, 30.0] 30 30 +TUR [100.0, 75.0] 75 75 # test_reverse_aggregate_expr2 # Some of the Aggregators can be reversed, by this way we can still run aggregators without re-ordering @@ -2640,7 +2640,7 @@ ProjectionExec: expr=[country@0 as country, FIRST_VALUE(sales_global.amount) ORD ----SortExec: expr=[ts@1 ASC NULLS LAST] ------MemoryExec: partitions=1, partition_sizes=[1] -query TRRR +query TRRR rowsort SELECT country, FIRST_VALUE(amount ORDER BY ts DESC) as fv1, LAST_VALUE(amount ORDER BY ts DESC) as lv1, SUM(amount ORDER BY ts DESC) as sum1 @@ -2649,8 +2649,8 @@ SELECT country, FIRST_VALUE(amount ORDER BY ts DESC) as fv1, ORDER BY ts ASC) GROUP BY country ---- -GRC 80 30 110 FRA 200 50 250 +GRC 80 30 110 TUR 100 75 175 # If existing ordering doesn't satisfy requirement, we should do calculations @@ -2674,16 +2674,16 @@ ProjectionExec: expr=[country@0 as country, FIRST_VALUE(sales_global.amount) ORD ----SortExec: expr=[ts@1 DESC] ------MemoryExec: partitions=1, partition_sizes=[1] -query TRRR +query TRRR rowsort SELECT country, FIRST_VALUE(amount ORDER BY ts DESC) as fv1, LAST_VALUE(amount ORDER BY ts DESC) as lv1, SUM(amount ORDER BY ts DESC) as sum1 FROM sales_global GROUP BY country ---- -TUR 100 75 175 -GRC 80 30 110 FRA 200 50 250 +GRC 80 30 110 +TUR 100 75 175 query TT EXPLAIN SELECT s.zip_code, s.country, s.sn, s.ts, s.currency, LAST_VALUE(e.amount ORDER BY e.sn) AS last_rate @@ 
-2715,7 +2715,7 @@ SortExec: expr=[sn@2 ASC NULLS LAST] --------------MemoryExec: partitions=1, partition_sizes=[1] --------------MemoryExec: partitions=1, partition_sizes=[1] -query ITIPTR +query ITIPTR rowsort SELECT s.zip_code, s.country, s.sn, s.ts, s.currency, LAST_VALUE(e.amount ORDER BY e.sn) AS last_rate FROM sales_global AS s JOIN sales_global AS e @@ -2725,10 +2725,10 @@ GROUP BY s.sn, s.zip_code, s.country, s.ts, s.currency ORDER BY s.sn ---- 0 GRC 0 2022-01-01T06:00:00 EUR 30 +0 GRC 4 2022-01-03T10:00:00 EUR 80 1 FRA 1 2022-01-01T08:00:00 EUR 50 -1 TUR 2 2022-01-01T11:30:00 TRY 75 1 FRA 3 2022-01-02T12:00:00 EUR 200 -0 GRC 4 2022-01-03T10:00:00 EUR 80 +1 TUR 2 2022-01-01T11:30:00 TRY 75 1 TUR 4 2022-01-03T10:00:00 TRY 100 # Run order-sensitive aggregators in multiple partitions From 5909866bba3e23e5f807972b84de526a4eb16c4c Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 14 Dec 2023 00:53:56 -0800 Subject: [PATCH 16/22] fix: volatile expressions should not be target of common subexpt elimination (#8520) * fix: volatile expressions should not be target of common subexpt elimination * Fix clippy * For review * Return error for unresolved scalar function * Improve error message --- datafusion/expr/src/expr.rs | 75 ++++++++++++++++++- .../optimizer/src/common_subexpr_eliminate.rs | 18 +++-- .../sqllogictest/test_files/functions.slt | 6 ++ 3 files changed, 91 insertions(+), 8 deletions(-) diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index 958f4f4a3456..f0aab95b8f0d 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -373,6 +373,24 @@ impl ScalarFunctionDefinition { ScalarFunctionDefinition::Name(func_name) => func_name.as_ref(), } } + + /// Whether this function is volatile, i.e. whether it can return different results + /// when evaluated multiple times with the same input. + pub fn is_volatile(&self) -> Result { + match self { + ScalarFunctionDefinition::BuiltIn(fun) => { + Ok(fun.volatility() == crate::Volatility::Volatile) + } + ScalarFunctionDefinition::UDF(udf) => { + Ok(udf.signature().volatility == crate::Volatility::Volatile) + } + ScalarFunctionDefinition::Name(func) => { + internal_err!( + "Cannot determine volatility of unresolved function: {func}" + ) + } + } + } } impl ScalarFunction { @@ -1692,14 +1710,28 @@ fn create_names(exprs: &[Expr]) -> Result { .join(", ")) } +/// Whether the given expression is volatile, i.e. whether it can return different results +/// when evaluated multiple times with the same input. 
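+///
+/// For example (hypothetical expressions, shown only for illustration): a call
+/// built as `Expr::ScalarFunction(ScalarFunction::new(BuiltinScalarFunction::Random, vec![]))`
+/// returns `Ok(true)`, while a plain column reference such as `col("a")` falls
+/// through to the `_` arm below and returns `Ok(false)`.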
+pub fn is_volatile(expr: &Expr) -> Result { + match expr { + Expr::ScalarFunction(func) => func.func_def.is_volatile(), + _ => Ok(false), + } +} + #[cfg(test)] mod test { use crate::expr::Cast; use crate::expr_fn::col; - use crate::{case, lit, Expr}; + use crate::{ + case, lit, BuiltinScalarFunction, ColumnarValue, Expr, ReturnTypeFunction, + ScalarFunctionDefinition, ScalarFunctionImplementation, ScalarUDF, Signature, + Volatility, + }; use arrow::datatypes::DataType; use datafusion_common::Column; use datafusion_common::{Result, ScalarValue}; + use std::sync::Arc; #[test] fn format_case_when() -> Result<()> { @@ -1800,4 +1832,45 @@ mod test { "UInt32(1) OR UInt32(2)" ); } + + #[test] + fn test_is_volatile_scalar_func_definition() { + // BuiltIn + assert!( + ScalarFunctionDefinition::BuiltIn(BuiltinScalarFunction::Random) + .is_volatile() + .unwrap() + ); + assert!( + !ScalarFunctionDefinition::BuiltIn(BuiltinScalarFunction::Abs) + .is_volatile() + .unwrap() + ); + + // UDF + let return_type: ReturnTypeFunction = + Arc::new(move |_| Ok(Arc::new(DataType::Utf8))); + let fun: ScalarFunctionImplementation = + Arc::new(move |_| Ok(ColumnarValue::Scalar(ScalarValue::new_utf8("a")))); + let udf = Arc::new(ScalarUDF::new( + "TestScalarUDF", + &Signature::uniform(1, vec![DataType::Float32], Volatility::Stable), + &return_type, + &fun, + )); + assert!(!ScalarFunctionDefinition::UDF(udf).is_volatile().unwrap()); + + let udf = Arc::new(ScalarUDF::new( + "TestScalarUDF", + &Signature::uniform(1, vec![DataType::Float32], Volatility::Volatile), + &return_type, + &fun, + )); + assert!(ScalarFunctionDefinition::UDF(udf).is_volatile().unwrap()); + + // Unresolved function + ScalarFunctionDefinition::Name(Arc::from("UnresolvedFunc")) + .is_volatile() + .expect_err("Shouldn't determine volatility of unresolved function"); + } } diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index 1d21407a6985..1e089257c61a 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -29,7 +29,7 @@ use datafusion_common::tree_node::{ use datafusion_common::{ internal_err, Column, DFField, DFSchema, DFSchemaRef, DataFusionError, Result, }; -use datafusion_expr::expr::Alias; +use datafusion_expr::expr::{is_volatile, Alias}; use datafusion_expr::logical_plan::{ Aggregate, Filter, LogicalPlan, Projection, Sort, Window, }; @@ -113,6 +113,8 @@ impl CommonSubexprEliminate { let Projection { expr, input, .. } = projection; let input_schema = Arc::clone(input.schema()); let mut expr_set = ExprSet::new(); + + // Visit expr list and build expr identifier to occuring count map (`expr_set`). let arrays = to_arrays(expr, input_schema, &mut expr_set, ExprMask::Normal)?; let (mut new_expr, new_input) = @@ -516,7 +518,7 @@ enum ExprMask { } impl ExprMask { - fn ignores(&self, expr: &Expr) -> bool { + fn ignores(&self, expr: &Expr) -> Result { let is_normal_minus_aggregates = matches!( expr, Expr::Literal(..) @@ -527,12 +529,14 @@ impl ExprMask { | Expr::Wildcard { .. 
} ); + let is_volatile = is_volatile(expr)?; + let is_aggr = matches!(expr, Expr::AggregateFunction(..)); - match self { - Self::Normal => is_normal_minus_aggregates || is_aggr, - Self::NormalAndAggregates => is_normal_minus_aggregates, - } + Ok(match self { + Self::Normal => is_volatile || is_normal_minus_aggregates || is_aggr, + Self::NormalAndAggregates => is_volatile || is_normal_minus_aggregates, + }) } } @@ -624,7 +628,7 @@ impl TreeNodeVisitor for ExprIdentifierVisitor<'_> { let (idx, sub_expr_desc) = self.pop_enter_mark(); // skip exprs should not be recognize. - if self.expr_mask.ignores(expr) { + if self.expr_mask.ignores(expr)? { self.id_array[idx].0 = self.series_number; let desc = Self::desc_expr(expr); self.visit_stack.push(VisitRecord::ExprItem(desc)); diff --git a/datafusion/sqllogictest/test_files/functions.slt b/datafusion/sqllogictest/test_files/functions.slt index 4f55ea316bb9..1903088b0748 100644 --- a/datafusion/sqllogictest/test_files/functions.slt +++ b/datafusion/sqllogictest/test_files/functions.slt @@ -995,3 +995,9 @@ query ? SELECT find_in_set(NULL, NULL) ---- NULL + +# Verify that multiple calls to volatile functions like `random()` are not combined / optimized away +query B +SELECT r FROM (SELECT r1 == r2 r, r1, r2 FROM (SELECT random() r1, random() r2) WHERE r1 > 0 AND r2 > 0) +---- +false From 831b2ba6e03a601d773a5ea8a3d2cb5fcb7a7da6 Mon Sep 17 00:00:00 2001 From: Xu Chen Date: Thu, 14 Dec 2023 22:40:51 +0800 Subject: [PATCH 17/22] Add LakeSoul to the list of Known Users (#8536) Signed-off-by: chenxu Co-authored-by: chenxu --- docs/source/user-guide/introduction.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/user-guide/introduction.md b/docs/source/user-guide/introduction.md index da250fbb1f9c..6c1e54c2b701 100644 --- a/docs/source/user-guide/introduction.md +++ b/docs/source/user-guide/introduction.md @@ -106,6 +106,7 @@ Here are some active projects using DataFusion: - [GlareDB](https://github.com/GlareDB/glaredb) Fast SQL database for querying and analyzing distributed data. - [InfluxDB IOx](https://github.com/influxdata/influxdb_iox) Time Series Database - [Kamu](https://github.com/kamu-data/kamu-cli/) Planet-scale streaming data pipeline +- [LakeSoul](https://github.com/lakesoul-io/LakeSoul) Open source LakeHouse framework with native IO in Rust. 
- [Lance](https://github.com/lancedb/lance) Modern columnar data format for ML - [Parseable](https://github.com/parseablehq/parseable) Log storage and observability platform - [qv](https://github.com/timvw/qv) Quickly view your data From 974d49c907c5ddb250895efaef0952ed063f1831 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 14 Dec 2023 09:58:20 -0500 Subject: [PATCH 18/22] Fix regression with Incorrect results when reading parquet files with different schemas and statistics (#8533) * Add test for schema evolution * Fix reading parquet statistics * Update tests for fix * Add comments to help explain the test * Add another test --- .../datasource/physical_plan/parquet/mod.rs | 9 +- .../physical_plan/parquet/row_groups.rs | 140 ++++++++++++++---- .../test_files/schema_evolution.slt | 140 ++++++++++++++++++ 3 files changed, 256 insertions(+), 33 deletions(-) create mode 100644 datafusion/sqllogictest/test_files/schema_evolution.slt diff --git a/datafusion/core/src/datasource/physical_plan/parquet/mod.rs b/datafusion/core/src/datasource/physical_plan/parquet/mod.rs index 718f9f820af1..641b7bbb1596 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/mod.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/mod.rs @@ -468,8 +468,10 @@ impl FileOpener for ParquetOpener { ParquetRecordBatchStreamBuilder::new_with_options(reader, options) .await?; + let file_schema = builder.schema().clone(); + let (schema_mapping, adapted_projections) = - schema_adapter.map_schema(builder.schema())?; + schema_adapter.map_schema(&file_schema)?; // let predicate = predicate.map(|p| reassign_predicate_columns(p, builder.schema(), true)).transpose()?; let mask = ProjectionMask::roots( @@ -481,8 +483,8 @@ impl FileOpener for ParquetOpener { if let Some(predicate) = pushdown_filters.then_some(predicate).flatten() { let row_filter = row_filter::build_row_filter( &predicate, - builder.schema().as_ref(), - table_schema.as_ref(), + &file_schema, + &table_schema, builder.metadata(), reorder_predicates, &file_metrics, @@ -507,6 +509,7 @@ impl FileOpener for ParquetOpener { let file_metadata = builder.metadata().clone(); let predicate = pruning_predicate.as_ref().map(|p| p.as_ref()); let mut row_groups = row_groups::prune_row_groups_by_statistics( + &file_schema, builder.parquet_schema(), file_metadata.row_groups(), file_range, diff --git a/datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs b/datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs index 65414f5619a5..7c3f7d9384ab 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs @@ -55,6 +55,7 @@ use super::ParquetFileMetrics; /// Note: This method currently ignores ColumnOrder /// pub(crate) fn prune_row_groups_by_statistics( + arrow_schema: &Schema, parquet_schema: &SchemaDescriptor, groups: &[RowGroupMetaData], range: Option, @@ -80,7 +81,7 @@ pub(crate) fn prune_row_groups_by_statistics( let pruning_stats = RowGroupPruningStatistics { parquet_schema, row_group_metadata: metadata, - arrow_schema: predicate.schema().as_ref(), + arrow_schema, }; match predicate.prune(&pruning_stats) { Ok(values) => { @@ -416,11 +417,11 @@ mod tests { fn row_group_pruning_predicate_simple_expr() { use datafusion_expr::{col, lit}; // int > 1 => c1_max > 1 - let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); + let schema = + Arc::new(Schema::new(vec![Field::new("c1", DataType::Int32, false)])); let expr = 
col("c1").gt(lit(15)); let expr = logical2physical(&expr, &schema); - let pruning_predicate = - PruningPredicate::try_new(expr, Arc::new(schema)).unwrap(); + let pruning_predicate = PruningPredicate::try_new(expr, schema.clone()).unwrap(); let field = PrimitiveTypeField::new("c1", PhysicalType::INT32); let schema_descr = get_test_schema_descr(vec![field]); @@ -436,6 +437,7 @@ mod tests { let metrics = parquet_file_metrics(); assert_eq!( prune_row_groups_by_statistics( + &schema, &schema_descr, &[rgm1, rgm2], None, @@ -450,11 +452,11 @@ mod tests { fn row_group_pruning_predicate_missing_stats() { use datafusion_expr::{col, lit}; // int > 1 => c1_max > 1 - let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); + let schema = + Arc::new(Schema::new(vec![Field::new("c1", DataType::Int32, false)])); let expr = col("c1").gt(lit(15)); let expr = logical2physical(&expr, &schema); - let pruning_predicate = - PruningPredicate::try_new(expr, Arc::new(schema)).unwrap(); + let pruning_predicate = PruningPredicate::try_new(expr, schema.clone()).unwrap(); let field = PrimitiveTypeField::new("c1", PhysicalType::INT32); let schema_descr = get_test_schema_descr(vec![field]); @@ -471,6 +473,7 @@ mod tests { // is null / undefined so the first row group can't be filtered out assert_eq!( prune_row_groups_by_statistics( + &schema, &schema_descr, &[rgm1, rgm2], None, @@ -519,6 +522,7 @@ mod tests { // when conditions are joined using AND assert_eq!( prune_row_groups_by_statistics( + &schema, &schema_descr, groups, None, @@ -532,12 +536,13 @@ mod tests { // this bypasses the entire predicate expression and no row groups are filtered out let expr = col("c1").gt(lit(15)).or(col("c2").rem(lit(2)).eq(lit(0))); let expr = logical2physical(&expr, &schema); - let pruning_predicate = PruningPredicate::try_new(expr, schema).unwrap(); + let pruning_predicate = PruningPredicate::try_new(expr, schema.clone()).unwrap(); // if conditions in predicate are joined with OR and an unsupported expression is used // this bypasses the entire predicate expression and no row groups are filtered out assert_eq!( prune_row_groups_by_statistics( + &schema, &schema_descr, groups, None, @@ -548,6 +553,64 @@ mod tests { ); } + #[test] + fn row_group_pruning_predicate_file_schema() { + use datafusion_expr::{col, lit}; + // test row group predicate when file schema is different than table schema + // c1 > 0 + let table_schema = Arc::new(Schema::new(vec![ + Field::new("c1", DataType::Int32, false), + Field::new("c2", DataType::Int32, false), + ])); + let expr = col("c1").gt(lit(0)); + let expr = logical2physical(&expr, &table_schema); + let pruning_predicate = + PruningPredicate::try_new(expr, table_schema.clone()).unwrap(); + + // Model a file schema's column order c2 then c1, which is the opposite + // of the table schema + let file_schema = Arc::new(Schema::new(vec![ + Field::new("c2", DataType::Int32, false), + Field::new("c1", DataType::Int32, false), + ])); + let schema_descr = get_test_schema_descr(vec![ + PrimitiveTypeField::new("c2", PhysicalType::INT32), + PrimitiveTypeField::new("c1", PhysicalType::INT32), + ]); + // rg1 has c2 less than zero, c1 greater than zero + let rgm1 = get_row_group_meta_data( + &schema_descr, + vec![ + ParquetStatistics::int32(Some(-10), Some(-1), None, 0, false), // c2 + ParquetStatistics::int32(Some(1), Some(10), None, 0, false), + ], + ); + // rg1 has c2 greater than zero, c1 less than zero + let rgm2 = get_row_group_meta_data( + &schema_descr, + vec![ + ParquetStatistics::int32(Some(1), 
Some(10), None, 0, false), + ParquetStatistics::int32(Some(-10), Some(-1), None, 0, false), + ], + ); + + let metrics = parquet_file_metrics(); + let groups = &[rgm1, rgm2]; + // the first row group should be left because c1 is greater than zero + // the second should be filtered out because c1 is less than zero + assert_eq!( + prune_row_groups_by_statistics( + &file_schema, // NB must be file schema, not table_schema + &schema_descr, + groups, + None, + Some(&pruning_predicate), + &metrics + ), + vec![0] + ); + } + fn gen_row_group_meta_data_for_pruning_predicate() -> Vec { let schema_descr = get_test_schema_descr(vec![ PrimitiveTypeField::new("c1", PhysicalType::INT32), @@ -581,13 +644,14 @@ mod tests { let schema_descr = arrow_to_parquet_schema(&schema).unwrap(); let expr = col("c1").gt(lit(15)).and(col("c2").is_null()); let expr = logical2physical(&expr, &schema); - let pruning_predicate = PruningPredicate::try_new(expr, schema).unwrap(); + let pruning_predicate = PruningPredicate::try_new(expr, schema.clone()).unwrap(); let groups = gen_row_group_meta_data_for_pruning_predicate(); let metrics = parquet_file_metrics(); // First row group was filtered out because it contains no null value on "c2". assert_eq!( prune_row_groups_by_statistics( + &schema, &schema_descr, &groups, None, @@ -613,7 +677,7 @@ mod tests { .gt(lit(15)) .and(col("c2").eq(lit(ScalarValue::Boolean(None)))); let expr = logical2physical(&expr, &schema); - let pruning_predicate = PruningPredicate::try_new(expr, schema).unwrap(); + let pruning_predicate = PruningPredicate::try_new(expr, schema.clone()).unwrap(); let groups = gen_row_group_meta_data_for_pruning_predicate(); let metrics = parquet_file_metrics(); @@ -621,6 +685,7 @@ mod tests { // pass predicates. Ideally these should both be false assert_eq!( prune_row_groups_by_statistics( + &schema, &schema_descr, &groups, None, @@ -639,8 +704,11 @@ mod tests { // INT32: c1 > 5, the c1 is decimal(9,2) // The type of scalar value if decimal(9,2), don't need to do cast - let schema = - Schema::new(vec![Field::new("c1", DataType::Decimal128(9, 2), false)]); + let schema = Arc::new(Schema::new(vec![Field::new( + "c1", + DataType::Decimal128(9, 2), + false, + )])); let field = PrimitiveTypeField::new("c1", PhysicalType::INT32) .with_logical_type(LogicalType::Decimal { scale: 2, @@ -651,8 +719,7 @@ mod tests { let schema_descr = get_test_schema_descr(vec![field]); let expr = col("c1").gt(lit(ScalarValue::Decimal128(Some(500), 9, 2))); let expr = logical2physical(&expr, &schema); - let pruning_predicate = - PruningPredicate::try_new(expr, Arc::new(schema)).unwrap(); + let pruning_predicate = PruningPredicate::try_new(expr, schema.clone()).unwrap(); let rgm1 = get_row_group_meta_data( &schema_descr, // [1.00, 6.00] @@ -680,6 +747,7 @@ mod tests { let metrics = parquet_file_metrics(); assert_eq!( prune_row_groups_by_statistics( + &schema, &schema_descr, &[rgm1, rgm2, rgm3], None, @@ -693,8 +761,11 @@ mod tests { // The c1 type is decimal(9,0) in the parquet file, and the type of scalar is decimal(5,2). 
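// (for example, the raw i128 value 500 denotes 500 under decimal(9,0) but
// 5.00 under decimal(5,2), so comparing the raw representations directly
// would give the wrong answer)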
// We should convert all type to the coercion type, which is decimal(11,2) // The decimal of arrow is decimal(5,2), the decimal of parquet is decimal(9,0) - let schema = - Schema::new(vec![Field::new("c1", DataType::Decimal128(9, 0), false)]); + let schema = Arc::new(Schema::new(vec![Field::new( + "c1", + DataType::Decimal128(9, 0), + false, + )])); let field = PrimitiveTypeField::new("c1", PhysicalType::INT32) .with_logical_type(LogicalType::Decimal { @@ -709,8 +780,7 @@ mod tests { Decimal128(11, 2), )); let expr = logical2physical(&expr, &schema); - let pruning_predicate = - PruningPredicate::try_new(expr, Arc::new(schema)).unwrap(); + let pruning_predicate = PruningPredicate::try_new(expr, schema.clone()).unwrap(); let rgm1 = get_row_group_meta_data( &schema_descr, // [100, 600] @@ -744,6 +814,7 @@ mod tests { let metrics = parquet_file_metrics(); assert_eq!( prune_row_groups_by_statistics( + &schema, &schema_descr, &[rgm1, rgm2, rgm3, rgm4], None, @@ -754,8 +825,11 @@ mod tests { ); // INT64: c1 < 5, the c1 is decimal(18,2) - let schema = - Schema::new(vec![Field::new("c1", DataType::Decimal128(18, 2), false)]); + let schema = Arc::new(Schema::new(vec![Field::new( + "c1", + DataType::Decimal128(18, 2), + false, + )])); let field = PrimitiveTypeField::new("c1", PhysicalType::INT64) .with_logical_type(LogicalType::Decimal { scale: 2, @@ -766,8 +840,7 @@ mod tests { let schema_descr = get_test_schema_descr(vec![field]); let expr = col("c1").lt(lit(ScalarValue::Decimal128(Some(500), 18, 2))); let expr = logical2physical(&expr, &schema); - let pruning_predicate = - PruningPredicate::try_new(expr, Arc::new(schema)).unwrap(); + let pruning_predicate = PruningPredicate::try_new(expr, schema.clone()).unwrap(); let rgm1 = get_row_group_meta_data( &schema_descr, // [6.00, 8.00] @@ -792,6 +865,7 @@ mod tests { let metrics = parquet_file_metrics(); assert_eq!( prune_row_groups_by_statistics( + &schema, &schema_descr, &[rgm1, rgm2, rgm3], None, @@ -803,8 +877,11 @@ mod tests { // FIXED_LENGTH_BYTE_ARRAY: c1 = decimal128(100000, 28, 3), the c1 is decimal(18,2) // the type of parquet is decimal(18,2) - let schema = - Schema::new(vec![Field::new("c1", DataType::Decimal128(18, 2), false)]); + let schema = Arc::new(Schema::new(vec![Field::new( + "c1", + DataType::Decimal128(18, 2), + false, + )])); let field = PrimitiveTypeField::new("c1", PhysicalType::FIXED_LEN_BYTE_ARRAY) .with_logical_type(LogicalType::Decimal { scale: 2, @@ -818,8 +895,7 @@ mod tests { let left = cast(col("c1"), DataType::Decimal128(28, 3)); let expr = left.eq(lit(ScalarValue::Decimal128(Some(100000), 28, 3))); let expr = logical2physical(&expr, &schema); - let pruning_predicate = - PruningPredicate::try_new(expr, Arc::new(schema)).unwrap(); + let pruning_predicate = PruningPredicate::try_new(expr, schema.clone()).unwrap(); // we must use the big-endian when encode the i128 to bytes or vec[u8]. 
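// For example, 600_i128.to_be_bytes() produces fourteen 0x00 bytes followed
// by [0x02, 0x58] (600 == 0x258), the big-endian two's-complement layout
// parquet uses for decimal statistics.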
let rgm1 = get_row_group_meta_data( &schema_descr, @@ -863,6 +939,7 @@ mod tests { let metrics = parquet_file_metrics(); assert_eq!( prune_row_groups_by_statistics( + &schema, &schema_descr, &[rgm1, rgm2, rgm3], None, @@ -874,8 +951,11 @@ mod tests { // BYTE_ARRAY: c1 = decimal128(100000, 28, 3), the c1 is decimal(18,2) // the type of parquet is decimal(18,2) - let schema = - Schema::new(vec![Field::new("c1", DataType::Decimal128(18, 2), false)]); + let schema = Arc::new(Schema::new(vec![Field::new( + "c1", + DataType::Decimal128(18, 2), + false, + )])); let field = PrimitiveTypeField::new("c1", PhysicalType::BYTE_ARRAY) .with_logical_type(LogicalType::Decimal { scale: 2, @@ -889,8 +969,7 @@ mod tests { let left = cast(col("c1"), DataType::Decimal128(28, 3)); let expr = left.eq(lit(ScalarValue::Decimal128(Some(100000), 28, 3))); let expr = logical2physical(&expr, &schema); - let pruning_predicate = - PruningPredicate::try_new(expr, Arc::new(schema)).unwrap(); + let pruning_predicate = PruningPredicate::try_new(expr, schema.clone()).unwrap(); // we must use the big-endian when encode the i128 to bytes or vec[u8]. let rgm1 = get_row_group_meta_data( &schema_descr, @@ -923,6 +1002,7 @@ mod tests { let metrics = parquet_file_metrics(); assert_eq!( prune_row_groups_by_statistics( + &schema, &schema_descr, &[rgm1, rgm2, rgm3], None, diff --git a/datafusion/sqllogictest/test_files/schema_evolution.slt b/datafusion/sqllogictest/test_files/schema_evolution.slt new file mode 100644 index 000000000000..36d54159e24d --- /dev/null +++ b/datafusion/sqllogictest/test_files/schema_evolution.slt @@ -0,0 +1,140 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
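+
+# Background for the tests below: each parquet file has a different physical
+# schema, and the scan adapts every file to the declared table schema.
+# Columns missing from a file are read as NULL; extra columns (such as `z`
+# in File3) are ignored.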
+ +########## +# Tests for schema evolution -- reading +# data from different files with different schemas +########## + + +statement ok +CREATE EXTERNAL TABLE parquet_table(a varchar, b int, c float) STORED AS PARQUET +LOCATION 'test_files/scratch/schema_evolution/parquet_table/'; + +# File1 has only columns a and b +statement ok +COPY ( + SELECT column1 as a, column2 as b + FROM ( VALUES ('foo', 1), ('foo', 2), ('foo', 3) ) + ) TO 'test_files/scratch/schema_evolution/parquet_table/1.parquet' +(FORMAT PARQUET, SINGLE_FILE_OUTPUT true); + + +# File2 has only b +statement ok +COPY ( + SELECT column1 as b + FROM ( VALUES (10) ) + ) TO 'test_files/scratch/schema_evolution/parquet_table/2.parquet' +(FORMAT PARQUET, SINGLE_FILE_OUTPUT true); + +# File3 has a column from 'z' which does not appear in the table +# but also values from a which do appear in the table +statement ok +COPY ( + SELECT column1 as z, column2 as a + FROM ( VALUES ('bar', 'foo'), ('blarg', 'foo') ) + ) TO 'test_files/scratch/schema_evolution/parquet_table/3.parquet' +(FORMAT PARQUET, SINGLE_FILE_OUTPUT true); + +# File4 has data for b and a (reversed) and d +statement ok +COPY ( + SELECT column1 as b, column2 as a, column3 as c + FROM ( VALUES (100, 'foo', 10.5), (200, 'foo', 12.6), (300, 'bzz', 13.7) ) + ) TO 'test_files/scratch/schema_evolution/parquet_table/4.parquet' +(FORMAT PARQUET, SINGLE_FILE_OUTPUT true); + +# The logical distribution of `a`, `b` and `c` in the files is like this: +# +## File1: +# foo 1 NULL +# foo 2 NULL +# foo 3 NULL +# +## File2: +# NULL 10 NULL +# +## File3: +# foo NULL NULL +# foo NULL NULL +# +## File4: +# foo 100 10.5 +# foo 200 12.6 +# bzz 300 13.7 + +# Show all the data +query TIR rowsort +select * from parquet_table; +---- +NULL 10 NULL +bzz 300 13.7 +foo 1 NULL +foo 100 10.5 +foo 2 NULL +foo 200 12.6 +foo 3 NULL +foo NULL NULL +foo NULL NULL + +# Should see all 7 rows that have 'a=foo' +query TIR rowsort +select * from parquet_table where a = 'foo'; +---- +foo 1 NULL +foo 100 10.5 +foo 2 NULL +foo 200 12.6 +foo 3 NULL +foo NULL NULL +foo NULL NULL + +query TIR rowsort +select * from parquet_table where a != 'foo'; +---- +bzz 300 13.7 + +# this should produce at least one row +query TIR rowsort +select * from parquet_table where a is NULL; +---- +NULL 10 NULL + +query TIR rowsort +select * from parquet_table where b > 5; +---- +NULL 10 NULL +bzz 300 13.7 +foo 100 10.5 +foo 200 12.6 + + +query TIR rowsort +select * from parquet_table where b < 150; +---- +NULL 10 NULL +foo 1 NULL +foo 100 10.5 +foo 2 NULL +foo 3 NULL + +query TIR rowsort +select * from parquet_table where c > 11.0; +---- +bzz 300 13.7 +foo 200 12.6 From 1042095211caec2cbd0af93b8f4c8a78dff47259 Mon Sep 17 00:00:00 2001 From: Ashim Sedhain <38435962+asimsedhain@users.noreply.github.com> Date: Thu, 14 Dec 2023 10:40:53 -0600 Subject: [PATCH 19/22] feat: improve string statistics display (#8535) GH-8464 --- datafusion-cli/src/functions.rs | 77 ++++++++++++++++++++++----------- datafusion-cli/src/main.rs | 24 ++++++++++ 2 files changed, 75 insertions(+), 26 deletions(-) diff --git a/datafusion-cli/src/functions.rs b/datafusion-cli/src/functions.rs index 24f3399ee2be..f8d9ed238be4 100644 --- a/datafusion-cli/src/functions.rs +++ b/datafusion-cli/src/functions.rs @@ -31,6 +31,7 @@ use datafusion::logical_expr::Expr; use datafusion::physical_plan::memory::MemoryExec; use datafusion::physical_plan::ExecutionPlan; use datafusion::scalar::ScalarValue; +use parquet::basic::ConvertedType; use parquet::file::reader::FileReader; use 
parquet::file::serialized_reader::SerializedFileReader; use parquet::file::statistics::Statistics; @@ -246,6 +247,52 @@ impl TableProvider for ParquetMetadataTable { } } +fn convert_parquet_statistics( + value: &Statistics, + converted_type: ConvertedType, +) -> (String, String) { + match (value, converted_type) { + (Statistics::Boolean(val), _) => (val.min().to_string(), val.max().to_string()), + (Statistics::Int32(val), _) => (val.min().to_string(), val.max().to_string()), + (Statistics::Int64(val), _) => (val.min().to_string(), val.max().to_string()), + (Statistics::Int96(val), _) => (val.min().to_string(), val.max().to_string()), + (Statistics::Float(val), _) => (val.min().to_string(), val.max().to_string()), + (Statistics::Double(val), _) => (val.min().to_string(), val.max().to_string()), + (Statistics::ByteArray(val), ConvertedType::UTF8) => { + let min_bytes = val.min(); + let max_bytes = val.max(); + let min = min_bytes + .as_utf8() + .map(|v| v.to_string()) + .unwrap_or_else(|_| min_bytes.to_string()); + + let max = max_bytes + .as_utf8() + .map(|v| v.to_string()) + .unwrap_or_else(|_| max_bytes.to_string()); + (min, max) + } + (Statistics::ByteArray(val), _) => (val.min().to_string(), val.max().to_string()), + (Statistics::FixedLenByteArray(val), ConvertedType::UTF8) => { + let min_bytes = val.min(); + let max_bytes = val.max(); + let min = min_bytes + .as_utf8() + .map(|v| v.to_string()) + .unwrap_or_else(|_| min_bytes.to_string()); + + let max = max_bytes + .as_utf8() + .map(|v| v.to_string()) + .unwrap_or_else(|_| max_bytes.to_string()); + (min, max) + } + (Statistics::FixedLenByteArray(val), _) => { + (val.min().to_string(), val.max().to_string()) + } + } +} + pub struct ParquetMetadataFunc {} impl TableFunctionImpl for ParquetMetadataFunc { @@ -326,34 +373,12 @@ impl TableFunctionImpl for ParquetMetadataFunc { num_values_arr.push(column.num_values()); path_in_schema_arr.push(column.column_path().to_string()); type_arr.push(column.column_type().to_string()); + let converted_type = column.column_descr().converted_type(); + if let Some(s) = column.statistics() { let (min_val, max_val) = if s.has_min_max_set() { - let (min_val, max_val) = match s { - Statistics::Boolean(val) => { - (val.min().to_string(), val.max().to_string()) - } - Statistics::Int32(val) => { - (val.min().to_string(), val.max().to_string()) - } - Statistics::Int64(val) => { - (val.min().to_string(), val.max().to_string()) - } - Statistics::Int96(val) => { - (val.min().to_string(), val.max().to_string()) - } - Statistics::Float(val) => { - (val.min().to_string(), val.max().to_string()) - } - Statistics::Double(val) => { - (val.min().to_string(), val.max().to_string()) - } - Statistics::ByteArray(val) => { - (val.min().to_string(), val.max().to_string()) - } - Statistics::FixedLenByteArray(val) => { - (val.min().to_string(), val.max().to_string()) - } - }; + let (min_val, max_val) = + convert_parquet_statistics(s, converted_type); (Some(min_val), Some(max_val)) } else { (None, None) diff --git a/datafusion-cli/src/main.rs b/datafusion-cli/src/main.rs index 8b1a9816afc0..8b74a797b57b 100644 --- a/datafusion-cli/src/main.rs +++ b/datafusion-cli/src/main.rs @@ -420,4 +420,28 @@ mod tests { Ok(()) } + + #[tokio::test] + async fn test_parquet_metadata_works_with_strings() -> Result<(), DataFusionError> { + let ctx = SessionContext::new(); + ctx.register_udtf("parquet_metadata", Arc::new(ParquetMetadataFunc {})); + + // input with string columns + let sql = + "SELECT * FROM 
parquet_metadata('../parquet-testing/data/data_index_bloom_encoding_stats.parquet')"; + let df = ctx.sql(sql).await?; + let rbs = df.collect().await?; + + let excepted = [ + +"+-----------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+------------+-----------+-----------+------------------+----------------------+-----------------+-----------------+--------------------+--------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+", +"| filename | row_group_id | row_group_num_rows | row_group_num_columns | row_group_bytes | column_id | file_offset | num_values | path_in_schema | type | stats_min | stats_max | stats_null_count | stats_distinct_count | stats_min_value | stats_max_value | compression | encodings | index_page_offset | dictionary_page_offset | data_page_offset | total_compressed_size | total_uncompressed_size |", +"+-----------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+------------+-----------+-----------+------------------+----------------------+-----------------+-----------------+--------------------+--------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+", +"| ../parquet-testing/data/data_index_bloom_encoding_stats.parquet | 0 | 14 | 1 | 163 | 0 | 4 | 14 | \"String\" | BYTE_ARRAY | Hello | today | 0 | | Hello | today | GZIP(GzipLevel(6)) | [BIT_PACKED, RLE, PLAIN] | | | 4 | 152 | 163 |", +"+-----------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+------------+-----------+-----------+------------------+----------------------+-----------------+-----------------+--------------------+--------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+" + ]; + assert_batches_eq!(excepted, &rbs); + + Ok(()) + } } From a971f1e7e379f5efe9fa3a5839c36ca2d797e201 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 14 Dec 2023 17:49:56 +0000 Subject: [PATCH 20/22] Defer file creation to write (#8539) * Defer file creation to write * Format * Remove INSERT_MODE * Format * Add ticket link --- datafusion/core/src/datasource/listing/url.rs | 1 + .../src/datasource/listing_table_factory.rs | 17 +++++------------ datafusion/core/src/datasource/stream.rs | 10 ++++++++-- datafusion/core/src/physical_planner.rs | 6 +----- .../test_files/insert_to_external.slt | 18 +++++++++--------- docs/source/user-guide/sql/write_options.md | 8 +++----- 6 files changed, 27 insertions(+), 33 deletions(-) diff --git a/datafusion/core/src/datasource/listing/url.rs b/datafusion/core/src/datasource/listing/url.rs index 979ed9e975c4..3ca7864f7f9e 100644 --- a/datafusion/core/src/datasource/listing/url.rs +++ b/datafusion/core/src/datasource/listing/url.rs @@ -116,6 +116,7 @@ impl ListingTableUrl { /// Get object store for specified input_url /// if input_url is actually not a url, we assume it is a local file path /// if we have a local path, create it if not exists so ListingTableUrl::parse works + 
#[deprecated(note = "Use parse")] pub fn parse_create_local_if_not_exists( s: impl AsRef, is_directory: bool, diff --git a/datafusion/core/src/datasource/listing_table_factory.rs b/datafusion/core/src/datasource/listing_table_factory.rs index 96436306c641..a9d0c3a0099e 100644 --- a/datafusion/core/src/datasource/listing_table_factory.rs +++ b/datafusion/core/src/datasource/listing_table_factory.rs @@ -148,19 +148,18 @@ impl TableProviderFactory for ListingTableFactory { .unwrap_or(false) }; - let create_local_path = statement_options - .take_bool_option("create_local_path")? - .unwrap_or(false); let single_file = statement_options .take_bool_option("single_file")? .unwrap_or(false); - // Backwards compatibility + // Backwards compatibility (#8547) if let Some(s) = statement_options.take_str_option("insert_mode") { if !s.eq_ignore_ascii_case("append_new_files") { - return plan_err!("Unknown or unsupported insert mode {s}. Only append_to_file supported"); + return plan_err!("Unknown or unsupported insert mode {s}. Only append_new_files supported"); } } + statement_options.take_bool_option("create_local_path")?; + let file_type = file_format.file_type(); // Use remaining options and session state to build FileTypeWriterOptions @@ -199,13 +198,7 @@ impl TableProviderFactory for ListingTableFactory { FileType::AVRO => file_type_writer_options, }; - let table_path = match create_local_path { - true => ListingTableUrl::parse_create_local_if_not_exists( - &cmd.location, - !single_file, - ), - false => ListingTableUrl::parse(&cmd.location), - }?; + let table_path = ListingTableUrl::parse(&cmd.location)?; let options = ListingOptions::new(file_format) .with_collect_stat(state.config().collect_statistics()) diff --git a/datafusion/core/src/datasource/stream.rs b/datafusion/core/src/datasource/stream.rs index e7512499eb9d..b9b45a6c7470 100644 --- a/datafusion/core/src/datasource/stream.rs +++ b/datafusion/core/src/datasource/stream.rs @@ -179,7 +179,10 @@ impl StreamConfig { match &self.encoding { StreamEncoding::Csv => { let header = self.header && !self.location.exists(); - let file = OpenOptions::new().append(true).open(&self.location)?; + let file = OpenOptions::new() + .create(true) + .append(true) + .open(&self.location)?; let writer = arrow::csv::WriterBuilder::new() .with_header(header) .build(file); @@ -187,7 +190,10 @@ impl StreamConfig { Ok(Box::new(writer)) } StreamEncoding::Json => { - let file = OpenOptions::new().append(true).open(&self.location)?; + let file = OpenOptions::new() + .create(true) + .append(true) + .open(&self.location)?; Ok(Box::new(arrow::json::LineDelimitedWriter::new(file))) } } diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index ab38b3ec6d2f..93f0b31e5234 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -571,11 +571,7 @@ impl DefaultPhysicalPlanner { copy_options, }) => { let input_exec = self.create_initial_plan(input, session_state).await?; - - // TODO: make this behavior configurable via options (should copy to create path/file as needed?) 
diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs
index ab38b3ec6d2f..93f0b31e5234 100644
--- a/datafusion/core/src/physical_planner.rs
+++ b/datafusion/core/src/physical_planner.rs
@@ -571,11 +571,7 @@ impl DefaultPhysicalPlanner {
                 copy_options,
             }) => {
                 let input_exec = self.create_initial_plan(input, session_state).await?;
-
-                // TODO: make this behavior configurable via options (should copy to create path/file as needed?)
-                // TODO: add additional configurable options for if existing files should be overwritten or
-                // appended to
-                let parsed_url = ListingTableUrl::parse_create_local_if_not_exists(output_url, !*single_file_output)?;
+                let parsed_url = ListingTableUrl::parse(output_url)?;
                 let object_store_url = parsed_url.object_store();

                 let schema: Schema = (**input.schema()).clone().into();
diff --git a/datafusion/sqllogictest/test_files/insert_to_external.slt b/datafusion/sqllogictest/test_files/insert_to_external.slt
index 85c2db7faaf6..cdaf0bb64339 100644
--- a/datafusion/sqllogictest/test_files/insert_to_external.slt
+++ b/datafusion/sqllogictest/test_files/insert_to_external.slt
@@ -57,7 +57,7 @@ CREATE EXTERNAL TABLE dictionary_encoded_parquet_partitioned(
 b varchar,
 )
 STORED AS parquet
-LOCATION 'test_files/scratch/insert_to_external/parquet_types_partitioned'
+LOCATION 'test_files/scratch/insert_to_external/parquet_types_partitioned/'
 PARTITIONED BY (b)
 OPTIONS(
 create_local_path 'true',
@@ -292,7 +292,7 @@
 statement ok
 CREATE EXTERNAL TABLE directory_test(a bigint, b bigint)
 STORED AS parquet
-LOCATION 'test_files/scratch/insert_to_external/external_parquet_table_q0'
+LOCATION 'test_files/scratch/insert_to_external/external_parquet_table_q0/'
 OPTIONS(
 create_local_path 'true',
 );
@@ -312,7 +312,7 @@
 statement ok
 CREATE EXTERNAL TABLE table_without_values(field1 BIGINT NULL, field2 BIGINT NULL)
 STORED AS parquet
-LOCATION 'test_files/scratch/insert_to_external/external_parquet_table_q1'
+LOCATION 'test_files/scratch/insert_to_external/external_parquet_table_q1/'
 OPTIONS (create_local_path 'true');

 query TT
@@ -378,7 +378,7 @@
 statement ok
 CREATE EXTERNAL TABLE table_without_values(field1 BIGINT NULL, field2 BIGINT NULL)
 STORED AS parquet
-LOCATION 'test_files/scratch/insert_to_external/external_parquet_table_q2'
+LOCATION 'test_files/scratch/insert_to_external/external_parquet_table_q2/'
 OPTIONS (create_local_path 'true');

 query TT
@@ -423,7 +423,7 @@
 statement ok
 CREATE EXTERNAL TABLE table_without_values(c1 varchar NULL)
 STORED AS parquet
-LOCATION 'test_files/scratch/insert_to_external/external_parquet_table_q3'
+LOCATION 'test_files/scratch/insert_to_external/external_parquet_table_q3/'
 OPTIONS (create_local_path 'true');

 # verify that the sort order of the insert query is maintained into the
@@ -462,7 +462,7 @@
 statement ok
 CREATE EXTERNAL TABLE table_without_values(id BIGINT, name varchar)
 STORED AS parquet
-LOCATION 'test_files/scratch/insert_to_external/external_parquet_table_q4'
+LOCATION 'test_files/scratch/insert_to_external/external_parquet_table_q4/'
 OPTIONS (create_local_path 'true');

 query IT
@@ -505,7 +505,7 @@
 statement ok
 CREATE EXTERNAL TABLE table_without_values(field1 BIGINT NOT NULL, field2 BIGINT NULL)
 STORED AS parquet
-LOCATION 'test_files/scratch/insert_to_external/external_parquet_table_q5'
+LOCATION 'test_files/scratch/insert_to_external/external_parquet_table_q5/'
 OPTIONS (create_local_path 'true');

 query II
@@ -555,7 +555,7 @@ CREATE EXTERNAL TABLE test_column_defaults(
 d text default lower('DEFAULT_TEXT'),
 e timestamp default now()
 ) STORED AS parquet
-LOCATION 'test_files/scratch/insert_to_external/external_parquet_table_q6'
+LOCATION 'test_files/scratch/insert_to_external/external_parquet_table_q6/'
 OPTIONS (create_local_path 'true');

 # fill in all column values
@@ -608,5 +608,5 @@ CREATE EXTERNAL TABLE test_column_defaults(
 a int,
 b int default a+1
 ) STORED AS parquet
-LOCATION 'test_files/scratch/insert_to_external/external_parquet_table_q7'
+LOCATION 'test_files/scratch/insert_to_external/external_parquet_table_q7/'
 OPTIONS (create_local_path 'true');
diff --git a/docs/source/user-guide/sql/write_options.md b/docs/source/user-guide/sql/write_options.md
index 941484e84efd..94adee960996 100644
--- a/docs/source/user-guide/sql/write_options.md
+++ b/docs/source/user-guide/sql/write_options.md
@@ -78,11 +78,9 @@ The following special options are specific to the `COPY` command.

 The following special options are specific to creating an external table.

-| Option | Description | Default Value |
-| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------- |
-| SINGLE_FILE | If true, indicates that this external table is backed by a single file. INSERT INTO queries will append to this file. | false |
-| CREATE_LOCAL_PATH | If true, the folder or file backing this table will be created on the local file system if it does not already exist when running INSERT INTO queries. | false |
-| INSERT_MODE | Determines if INSERT INTO queries should append to existing files or append new files to an existing directory. Valid values are append_to_file, append_new_files, and error. Note that "error" will block inserting data into this table. | CSV and JSON default to append_to_file. Parquet defaults to append_new_files |
+| Option | Description | Default Value |
+| ----------- | --------------------------------------------------------------------------------------------------------------------- | ------------- |
+| SINGLE_FILE | If true, indicates that this external table is backed by a single file. INSERT INTO queries will append to this file. | false |

 ### JSON Format Specific Options
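
With `parse_create_local_if_not_exists` deprecated, output locations go through plain `ListingTableUrl::parse`, which only interprets the string — a trailing slash marks a directory — and creates nothing on disk until the first write happens. A minimal sketch of that parse-only behavior; the paths are hypothetical:

```rust
// Sketch only: paths are hypothetical and nothing is created on disk here.
use datafusion::datasource::listing::ListingTableUrl;
use datafusion::error::Result;

fn main() -> Result<()> {
    // Trailing slash: treated as a directory of files.
    let dir = ListingTableUrl::parse("file:///tmp/my_table/")?;
    // No trailing slash: treated as a single file.
    let file = ListingTableUrl::parse("file:///tmp/my_table/part-0.parquet")?;
    println!("dir = {dir}, file = {file}");
    Ok(())
}
```
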
From efa7b3421a4b05d939e92b94554f6f7fb2164d71 Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Thu, 14 Dec 2023 13:52:25 -0500
Subject: [PATCH 21/22] Minor: Improve error handling in sqllogictest runner (#8544)

---
 datafusion/sqllogictest/bin/sqllogictests.rs | 53 +++++++++++++-------
 1 file changed, 34 insertions(+), 19 deletions(-)

diff --git a/datafusion/sqllogictest/bin/sqllogictests.rs b/datafusion/sqllogictest/bin/sqllogictests.rs
index 484677d58e79..aeb1cc4ec919 100644
--- a/datafusion/sqllogictest/bin/sqllogictests.rs
+++ b/datafusion/sqllogictest/bin/sqllogictests.rs
@@ -26,7 +26,7 @@ use futures::stream::StreamExt;
 use log::info;
 use sqllogictest::strict_column_validator;

-use datafusion_common::{exec_err, DataFusionError, Result};
+use datafusion_common::{exec_datafusion_err, exec_err, DataFusionError, Result};

 const TEST_DIRECTORY: &str = "test_files/";
 const PG_COMPAT_FILE_PREFIX: &str = "pg_compat_";
@@ -84,7 +84,7 @@
     // Doing so is safe because each slt file runs with its own
    // `SessionContext` and should not have side effects (like
    // modifying shared state like `/tmp/`)
-    let errors: Vec<_> = futures::stream::iter(read_test_files(&options))
+    let errors: Vec<_> = futures::stream::iter(read_test_files(&options)?)
         .map(|test_file| {
             tokio::task::spawn(async move {
                 println!("Running {:?}", test_file.relative_path);
@@ -247,30 +247,45 @@ impl TestFile {
     }
 }

-fn read_test_files<'a>(options: &'a Options) -> Box<dyn Iterator<Item = TestFile> + 'a> {
-    Box::new(
-        read_dir_recursive(TEST_DIRECTORY)
+fn read_test_files<'a>(
+    options: &'a Options,
+) -> Result<Box<dyn Iterator<Item = TestFile> + 'a>> {
+    Ok(Box::new(
+        read_dir_recursive(TEST_DIRECTORY)?
+            .into_iter()
             .map(TestFile::new)
             .filter(|f| options.check_test_file(&f.relative_path))
             .filter(|f| f.is_slt_file())
             .filter(|f| f.check_tpch(options))
             .filter(|f| options.check_pg_compat_file(f.path.as_path())),
-    )
+    ))
 }

-fn read_dir_recursive<P: AsRef<Path>>(path: P) -> Box<dyn Iterator<Item = PathBuf>> {
-    Box::new(
-        std::fs::read_dir(path)
-            .expect("Readable directory")
-            .map(|path| path.expect("Readable entry").path())
-            .flat_map(|path| {
-                if path.is_dir() {
-                    read_dir_recursive(path)
-                } else {
-                    Box::new(std::iter::once(path))
-                }
-            }),
-    )
+fn read_dir_recursive<P: AsRef<Path>>(path: P) -> Result<Vec<PathBuf>> {
+    let mut dst = vec![];
+    read_dir_recursive_impl(&mut dst, path.as_ref())?;
+    Ok(dst)
+}
+
+/// Append all paths recursively to dst
+fn read_dir_recursive_impl(dst: &mut Vec<PathBuf>, path: &Path) -> Result<()> {
+    let entries = std::fs::read_dir(path)
+        .map_err(|e| exec_datafusion_err!("Error reading directory {path:?}: {e}"))?;
+    for entry in entries {
+        let path = entry
+            .map_err(|e| {
+                exec_datafusion_err!("Error reading entry in directory {path:?}: {e}")
+            })?
+            .path();
+
+        if path.is_dir() {
+            read_dir_recursive_impl(dst, &path)?;
+        } else {
+            dst.push(path);
+        }
+    }
+
+    Ok(())
 }

 /// Parsed command line options
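
The rewritten runner above replaces a lazy, panicking directory iterator with an eager walk that attaches directory context to every I/O error. A standalone sketch of the same pattern using only the standard library (the error type is simplified to `String` here in place of `DataFusionError`):

```rust
// Standalone sketch of the eager, error-propagating directory walk; uses only
// std, with String standing in for the DataFusion error type.
use std::path::{Path, PathBuf};

fn read_dir_recursive(path: &Path) -> Result<Vec<PathBuf>, String> {
    let mut dst = vec![];
    visit(&mut dst, path)?;
    Ok(dst)
}

/// Append all file paths under `path` to `dst`, recursing into directories.
fn visit(dst: &mut Vec<PathBuf>, path: &Path) -> Result<(), String> {
    let entries = std::fs::read_dir(path)
        .map_err(|e| format!("Error reading directory {path:?}: {e}"))?;
    for entry in entries {
        let path = entry
            .map_err(|e| format!("Error reading entry in directory {path:?}: {e}"))?
            .path();
        if path.is_dir() {
            visit(dst, &path)?;
        } else {
            dst.push(path);
        }
    }
    Ok(())
}

fn main() {
    match read_dir_recursive(Path::new(".")) {
        Ok(paths) => println!("found {} files", paths.len()),
        Err(e) => eprintln!("{e}"),
    }
}
```
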
From e6e84e15df09dc5caa713ccd10c67586afa6b50a Mon Sep 17 00:00:00 2001
From: Andy Grove
Date: Thu, 14 Dec 2023 12:15:03 -0700
Subject: [PATCH 22/22] regenerate changelog

---
 dev/changelog/34.0.0.md | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/dev/changelog/34.0.0.md b/dev/changelog/34.0.0.md
index 8b8933017cfb..c5526f60531c 100644
--- a/dev/changelog/34.0.0.md
+++ b/dev/changelog/34.0.0.md
@@ -54,6 +54,7 @@
 - feat: customize column default values for external tables [#8415](https://github.com/apache/arrow-datafusion/pull/8415) (jonahgao)
 - feat: Support `array_sort`(`list_sort`) [#8279](https://github.com/apache/arrow-datafusion/pull/8279) (Asura7969)
 - feat: support `InterleaveExecNode` in the proto [#8460](https://github.com/apache/arrow-datafusion/pull/8460) (liukun4515)
+- feat: improve string statistics display in datafusion-cli `parquet_metadata` function [#8535](https://github.com/apache/arrow-datafusion/pull/8535) (asimsedhain)

 **Fixed bugs:**

@@ -66,11 +67,15 @@
 - fix: RANGE frame for corner cases with empty ORDER BY clause should be treated as constant sort [#8445](https://github.com/apache/arrow-datafusion/pull/8445) (viirya)
 - fix: don't unifies projection if expr is non-trival [#8454](https://github.com/apache/arrow-datafusion/pull/8454) (haohuaijin)
 - fix: support uppercase when parsing `Interval` [#8478](https://github.com/apache/arrow-datafusion/pull/8478) (QuenKar)
+- fix: incorrect set preserve_partitioning in SortExec [#8485](https://github.com/apache/arrow-datafusion/pull/8485) (haohuaijin)
+- fix: Pull stats in `IdentVisitor`/`GraphvizVisitor` only when requested [#8514](https://github.com/apache/arrow-datafusion/pull/8514) (vrongmeal)
+- fix: volatile expressions should not be target of common subexpt elimination [#8520](https://github.com/apache/arrow-datafusion/pull/8520) (viirya)

 **Documentation updates:**

 - Library Guide: Add Using the DataFrame API [#8319](https://github.com/apache/arrow-datafusion/pull/8319) (Veeupup)
 - Minor: Add installation link to README.md [#8389](https://github.com/apache/arrow-datafusion/pull/8389) (Weijun-H)
+- Prepare version 34.0.0 [#8508](https://github.com/apache/arrow-datafusion/pull/8508) (andygrove)

 **Merged pull requests:**

@@ -245,3 +250,24 @@
 - Minor: Improve comments in EnforceDistribution tests [#8474](https://github.com/apache/arrow-datafusion/pull/8474) (alamb)
 - fix: support uppercase when parsing `Interval` [#8478](https://github.com/apache/arrow-datafusion/pull/8478) (QuenKar)
 - Better Equivalence (ordering and exact equivalence) Propagation through ProjectionExec [#8484](https://github.com/apache/arrow-datafusion/pull/8484) (mustafasrepo)
+- Add `today` alias for `current_date` [#8423](https://github.com/apache/arrow-datafusion/pull/8423) (smallzhongfeng)
+- Minor: remove useless clone in `array_expression` [#8495](https://github.com/apache/arrow-datafusion/pull/8495) (Weijun-H)
+- fix: incorrect set preserve_partitioning in SortExec [#8485](https://github.com/apache/arrow-datafusion/pull/8485) (haohuaijin)
+- Explicitly mark parquet for tests in datafusion-common [#8497](https://github.com/apache/arrow-datafusion/pull/8497) (Dennis40816)
+- Minor/Doc: Clarify DataFrame::write_table Documentation [#8519](https://github.com/apache/arrow-datafusion/pull/8519) (devinjdangelo)
+- fix: Pull stats in `IdentVisitor`/`GraphvizVisitor` only when requested [#8514](https://github.com/apache/arrow-datafusion/pull/8514) (vrongmeal)
+- Change display of RepartitionExec from SortPreservingRepartitionExec to RepartitionExec preserve_order=true [#8521](https://github.com/apache/arrow-datafusion/pull/8521) (JacobOgle)
+- Fix `DataFrame::cache` errors with `Plan("Mismatch between schema and batches")` [#8510](https://github.com/apache/arrow-datafusion/pull/8510) (Asura7969)
+- Minor: update pbjson_dependency [#8470](https://github.com/apache/arrow-datafusion/pull/8470) (alamb)
+- Minor: Update prost-derive dependency [#8471](https://github.com/apache/arrow-datafusion/pull/8471) (alamb)
+- Minor/Doc: Add DataFrame::write_table to DataFrame user guide [#8527](https://github.com/apache/arrow-datafusion/pull/8527) (devinjdangelo)
+- Minor: Add repartition_file.slt end to end test for repartitioning files, and supporting tweaks [#8505](https://github.com/apache/arrow-datafusion/pull/8505) (alamb)
+- Prepare version 34.0.0 [#8508](https://github.com/apache/arrow-datafusion/pull/8508) (andygrove)
+- refactor: use ExprBuilder to consume substrait expr and use macro to generate error [#8515](https://github.com/apache/arrow-datafusion/pull/8515) (waynexia)
+- [MINOR]: Make some slt tests deterministic [#8525](https://github.com/apache/arrow-datafusion/pull/8525) (mustafasrepo)
+- fix: volatile expressions should not be target of common subexpt elimination [#8520](https://github.com/apache/arrow-datafusion/pull/8520) (viirya)
+- Minor: Add LakeSoul to the list of Known Users [#8536](https://github.com/apache/arrow-datafusion/pull/8536) (xuchen-plus)
+- Fix regression with Incorrect results when reading parquet files with different schemas and statistics [#8533](https://github.com/apache/arrow-datafusion/pull/8533) (alamb)
+- feat: improve string statistics display in datafusion-cli `parquet_metadata` function [#8535](https://github.com/apache/arrow-datafusion/pull/8535) (asimsedhain)
+- Defer file creation to write [#8539](https://github.com/apache/arrow-datafusion/pull/8539) (tustvold)
+- Minor: Improve error handling in sqllogictest runner [#8544](https://github.com/apache/arrow-datafusion/pull/8544) (alamb)