diff --git a/datafusion/common/src/file_options/parquet_writer.rs b/datafusion/common/src/file_options/parquet_writer.rs index 374d85b0440b..e4f7e6c4afae 100644 --- a/datafusion/common/src/file_options/parquet_writer.rs +++ b/datafusion/common/src/file_options/parquet_writer.rs @@ -481,4 +481,144 @@ mod tests { "the writer_props should have the same configuration as the session's TableParquetOptions", ); } + + #[test] + fn test_defaults_match() { + // ensure the global settings are the same + let default_table_writer_opts = TableParquetOptions::default(); + let default_parquet_opts = ParquetOptions::default(); + assert_eq!( + default_table_writer_opts.global, + default_parquet_opts, + "should have matching defaults for TableParquetOptions.global and ParquetOptions", + ); + + // confirm the TableParquetOptions::default matches the WriterProperties::default, once both converted to WriterProperties + let default_writer_props = WriterProperties::new(); + let from_datafusion_defaults = + WriterPropertiesBuilder::try_from(&default_table_writer_opts) + .unwrap() + .build(); + + // Expected: how the defaults should not match + assert_ne!( + default_writer_props.created_by(), + from_datafusion_defaults.created_by(), + "should have different created_by sources", + ); + assert!( + default_writer_props.created_by().starts_with("parquet-rs version"), + "should indicate that writer_props defaults came from the extern parquet crate", + ); + assert!( + default_table_writer_opts + .global + .created_by + .starts_with("datafusion version"), + "should indicate that table_parquet_opts defaults came from datafusion", + ); + + // TODO: in a followup PR, start the discussion of which of these defaults should be changed, + // and if the change should be in datafusion's default or the extern parquet's default. + // refer to https://github.com/apache/datafusion/issues/11367 + + // TODO: does not match + assert_eq!( + default_writer_props.compression(&"default".into()), + Compression::UNCOMPRESSED, + "extern parquet's default is None" + ); + assert!( + matches!( + from_datafusion_defaults.compression(&"foo".into()), + Compression::ZSTD(_) + ), + "datafusion's default is zstd" + ); + + // TODO: does not match + assert_eq!( + default_writer_props.data_page_row_count_limit(), + 20_000, + "extern parquet's default data_page_row_count_limit is 20_000" + ); + assert_eq!( + from_datafusion_defaults.data_page_row_count_limit(), + usize::MAX, + "datafusion's default is usize::MAX" + ); + + // TODO: does not match + assert_eq!( + default_writer_props.column_index_truncate_length(), + Some(64), + "extern parquet's default is 64" + ); + assert_eq!( + from_datafusion_defaults.column_index_truncate_length(), + None, + "datafusion's default is None" + ); + + // TODO: does not match + assert!( + default_writer_props.dictionary_enabled(&"default".into()), + "extern parquet's default is true" + ); + assert!( + !from_datafusion_defaults.dictionary_enabled(&"foo".into()), + "datafusion's default is false" + ); + + // TODO: matches once create WriterProps, but only due to parquet's override + assert_eq!( + default_writer_props.statistics_enabled(&"default".into()), + EnabledStatistics::Page, + "extern parquet's default is page" + ); + assert_eq!( + default_table_writer_opts.global.statistics_enabled, None, + "datafusion's has no default" + ); + assert_eq!( + from_datafusion_defaults.statistics_enabled(&"foo".into()), + EnabledStatistics::Page, + "should see the extern parquet's default over-riding datafusion's None", + ); + + // TODO: matches once create WriterProps, but only due to parquet's override + assert_eq!( + default_writer_props.max_statistics_size(&"default".into()), + 4096, + "extern parquet's default is 4096" + ); + assert_eq!( + default_table_writer_opts.global.max_statistics_size, None, + "datafusion's has no default" + ); + assert_eq!( + default_writer_props.max_statistics_size(&"default".into()), + 4096, + "should see the extern parquet's default over-riding datafusion's None", + ); + + // TODO: temporarily set the extern parquet's defaults back to the datafusion defaults. + let mut from_extern_parquet = + session_config_from_writer_props(&default_writer_props); + from_extern_parquet.global.compression = Some("zstd(3)".into()); + from_extern_parquet.global.data_page_row_count_limit = usize::MAX; + from_extern_parquet.global.column_index_truncate_length = None; + from_extern_parquet.global.dictionary_enabled = None; + from_extern_parquet.global.statistics_enabled = None; + from_extern_parquet.global.max_statistics_size = None; + + // Expected: the remaining should match + let same_created_by = default_table_writer_opts.global.created_by.clone(); // we expect these to be different + from_extern_parquet.global.created_by = same_created_by; // we expect these to be different + assert_eq!( + default_table_writer_opts, + from_extern_parquet, + "the writer_props should have the same configuration as the session's TableParquetOptions", + ); + } }