diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs
index 33a90017bd7e..32b7213d952f 100644
--- a/datafusion/common/src/config.rs
+++ b/datafusion/common/src/config.rs
@@ -108,38 +108,51 @@ use crate::{DataFusionError, Result};
 /// ```
 ///
 /// NB: Misplaced commas may result in nonsensical errors
-#[macro_export]
 macro_rules! config_namespace {
     (
-        $(#[doc = $struct_d:tt])*
-        $vis:vis struct $struct_name:ident {
-            $(
-                $(#[doc = $d:tt])*
-                $field_vis:vis $field_name:ident : $field_type:ty, $(warn = $warn: expr,)? $(transform = $transform:expr,)? default = $default:expr
-            )*$(,)*
-        }
+        $(#[doc = $struct_d:tt])* // Struct-level documentation attributes
+        $(#[deprecated($($struct_depr:tt)*)])? // Optional struct-level deprecated attribute
+        $(#[allow($($struct_de:tt)*)])?
+        $vis:vis struct $struct_name:ident {
+            $(
+                $(#[doc = $d:tt])* // Field-level documentation attributes
+                $(#[deprecated($($field_depr:tt)*)])? // Optional field-level deprecated attribute
+                $(#[allow($($field_de:tt)*)])?
+                $field_vis:vis $field_name:ident : $field_type:ty,
+                $(warn = $warn:expr,)?
+                $(transform = $transform:expr,)?
+                default = $default:expr
+            )*$(,)*
+        }
     ) => {
-
-        $(#[doc = $struct_d])*
+        $(#[doc = $struct_d])* // Apply struct documentation
+        $(#[deprecated($($struct_depr)*)])? // Apply struct deprecation
+        $(#[allow($($struct_de)*)])?
         #[derive(Debug, Clone, PartialEq)]
-        $vis struct $struct_name{
+        $vis struct $struct_name {
             $(
-            $(#[doc = $d])*
-            $field_vis $field_name : $field_type,
+                $(#[doc = $d])* // Apply field documentation
+                $(#[deprecated($($field_depr)*)])? // Apply field deprecation
+                $(#[allow($($field_de)*)])?
+                $field_vis $field_name: $field_type,
             )*
         }

         impl ConfigField for $struct_name {
             fn set(&mut self, key: &str, value: &str) -> Result<()> {
                 let (key, rem) = key.split_once('.').unwrap_or((key, ""));
-
                 match key {
                     $(
-                        stringify!($field_name) => {
-                            $(let value = $transform(value);)?
-                            $(log::warn!($warn);)?
-                            self.$field_name.set(rem, value.as_ref())
-                        },
+                        stringify!($field_name) => {
+                            // Safely apply deprecated attribute if present
+                            // $(#[allow(deprecated)])?
+                            {
+                                $(let value = $transform(value);)? // Apply transformation if specified
+                                $(log::warn!($warn);)? // Log warning if specified
+                                #[allow(deprecated)]
+                                self.$field_name.set(rem, value.as_ref())
+                            }
+                        },
                     )*
                     _ => return _config_err!(
                         "Config value \"{}\" not found on {}", key, stringify!($struct_name)
@@ -149,15 +162,16 @@ macro_rules! config_namespace {

             fn visit<V: Visit>(&self, v: &mut V, key_prefix: &str, _description: &'static str) {
                 $(
-                let key = format!(concat!("{}.", stringify!($field_name)), key_prefix);
-                let desc = concat!($($d),*).trim();
-                self.$field_name.visit(v, key.as_str(), desc);
+                    let key = format!(concat!("{}.", stringify!($field_name)), key_prefix);
+                    let desc = concat!($($d),*).trim();
+                    #[allow(deprecated)]
+                    self.$field_name.visit(v, key.as_str(), desc);
                 )*
             }
         }
-
         impl Default for $struct_name {
             fn default() -> Self {
+                #[allow(deprecated)]
                 Self {
                     $($field_name: $default),*
                 }
@@ -467,6 +481,9 @@ config_namespace! {

         /// (writing) Sets max statistics size for any column. If NULL, uses
         /// default parquet writer setting
+        /// max_statistics_size is deprecated, currently it is not being used
+        // TODO: remove once deprecated
+        #[deprecated(since = "45.0.0", note = "Setting does not do anything")]
         pub max_statistics_size: Option<usize>, default = Some(4096)

         /// (writing) Target maximum number of rows in each row group (defaults to 1M
@@ -1598,19 +1615,23 @@ impl ConfigField for TableParquetOptions {

 macro_rules! config_namespace_with_hashmap {
     (
     $(#[doc = $struct_d:tt])*
+    $(#[deprecated($($struct_depr:tt)*)])? // Optional struct-level deprecated attribute
     $vis:vis struct $struct_name:ident {
         $(
             $(#[doc = $d:tt])*
+            $(#[deprecated($($field_depr:tt)*)])? // Optional field-level deprecated attribute
             $field_vis:vis $field_name:ident : $field_type:ty, $(transform = $transform:expr,)? default = $default:expr
         )*$(,)*
     }
     ) => {
         $(#[doc = $struct_d])*
+        $(#[deprecated($($struct_depr)*)])? // Apply struct deprecation
         #[derive(Debug, Clone, PartialEq)]
         $vis struct $struct_name{
             $(
                 $(#[doc = $d])*
+                $(#[deprecated($($field_depr)*)])? // Apply field deprecation
                 $field_vis $field_name : $field_type,
             )*
         }
@@ -1621,6 +1642,8 @@ macro_rules! config_namespace_with_hashmap {
             match key {
                 $(
                     stringify!($field_name) => {
+                        // Handle deprecated fields
+                        #[allow(deprecated)] // Allow deprecated fields
                         $(let value = $transform(value);)?
                         self.$field_name.set(rem, value.as_ref())
                     },
@@ -1635,6 +1658,8 @@ macro_rules! config_namespace_with_hashmap {
             $(
                 let key = format!(concat!("{}.", stringify!($field_name)), key_prefix);
                 let desc = concat!($($d),*).trim();
+                // Handle deprecated fields
+                #[allow(deprecated)]
                 self.$field_name.visit(v, key.as_str(), desc);
             )*
         }
@@ -1642,6 +1667,7 @@ macro_rules! config_namespace_with_hashmap {

         impl Default for $struct_name {
             fn default() -> Self {
+                #[allow(deprecated)]
                 Self {
                     $($field_name: $default),*
                 }
@@ -1653,7 +1679,7 @@ macro_rules! config_namespace_with_hashmap {
             let parts: Vec<&str> = key.splitn(2, "::").collect();
             match parts.as_slice() {
                 [inner_key, hashmap_key] => {
-                    // Get or create the ColumnOptions for the specified column
+                    // Get or create the struct for the specified key
                     let inner_value = self
                         .entry((*hashmap_key).to_owned())
                         .or_insert_with($struct_name::default);
@@ -1669,6 +1695,7 @@ macro_rules! config_namespace_with_hashmap {
             $(
                 let key = format!("{}.{field}::{}", key_prefix, column_name, field = stringify!($field_name));
                 let desc = concat!($($d),*).trim();
+                #[allow(deprecated)]
                 col_options.$field_name.visit(v, key.as_str(), desc);
             )*
         }
@@ -1720,6 +1747,9 @@ config_namespace_with_hashmap! {

     /// Sets max statistics size for the column path. If NULL, uses
     /// default parquet options
+    /// max_statistics_size is deprecated, currently it is not being used
+    // TODO: remove once deprecated
+    #[deprecated(since = "45.0.0", note = "Setting does not do anything")]
     pub max_statistics_size: Option<usize>, default = None
     }
 }
diff --git a/datafusion/common/src/file_options/parquet_writer.rs b/datafusion/common/src/file_options/parquet_writer.rs
index 3f06e11bb376..6a717d3c0c60 100644
--- a/datafusion/common/src/file_options/parquet_writer.rs
+++ b/datafusion/common/src/file_options/parquet_writer.rs
@@ -26,6 +26,7 @@ use crate::{
 };

 use arrow_schema::Schema;
+// TODO: handle once deprecated
 #[allow(deprecated)]
 use parquet::{
     arrow::ARROW_SCHEMA_META_KEY,
@@ -157,6 +158,9 @@ impl TryFrom<&TableParquetOptions> for WriterPropertiesBuilder {
                 builder.set_column_bloom_filter_ndv(path.clone(), bloom_filter_ndv);
             }

+            // max_statistics_size is deprecated, currently it is not being used
+            // TODO: remove once deprecated
+            #[allow(deprecated)]
             if let Some(max_statistics_size) = options.max_statistics_size {
                 builder = {
                     #[allow(deprecated)]
@@ -202,6 +206,7 @@ impl ParquetOptions {
     ///
     /// Note that this method does not include the key_value_metadata from [`TableParquetOptions`].
     pub fn into_writer_properties_builder(&self) -> Result<WriterPropertiesBuilder> {
+        #[allow(deprecated)]
         let ParquetOptions {
             data_pagesize_limit,
             write_batch_size,
@@ -452,6 +457,7 @@ mod tests {
     fn column_options_with_non_defaults(
         src_col_defaults: &ParquetOptions,
     ) -> ParquetColumnOptions {
+        #[allow(deprecated)] // max_statistics_size
         ParquetColumnOptions {
             compression: Some("zstd(22)".into()),
             dictionary_enabled: src_col_defaults.dictionary_enabled.map(|v| !v),
@@ -472,6 +478,7 @@ mod tests {
             "1.0"
         };

+        #[allow(deprecated)] // max_statistics_size
         ParquetOptions {
             data_pagesize_limit: 42,
             write_batch_size: 42,
@@ -515,6 +522,7 @@ mod tests {
     ) -> ParquetColumnOptions {
         let bloom_filter_default_props = props.bloom_filter_properties(&col);

+        #[allow(deprecated)] // max_statistics_size
         ParquetColumnOptions {
             bloom_filter_enabled: Some(bloom_filter_default_props.is_some()),
             encoding: props.encoding(&col).map(|s| s.to_string()),
@@ -535,7 +543,6 @@ mod tests {
             ),
             bloom_filter_fpp: bloom_filter_default_props.map(|p| p.fpp),
             bloom_filter_ndv: bloom_filter_default_props.map(|p| p.ndv),
-            #[allow(deprecated)]
             max_statistics_size: Some(props.max_statistics_size(&col)),
         }
     }
@@ -569,6 +576,7 @@ mod tests {
             HashMap::from([(COL_NAME.into(), configured_col_props)])
         };

+        #[allow(deprecated)] // max_statistics_size
         TableParquetOptions {
             global: ParquetOptions {
                 // global options
diff --git a/datafusion/proto-common/src/from_proto/mod.rs b/datafusion/proto-common/src/from_proto/mod.rs
index 73ccbf935b2f..d88186fbf366 100644
--- a/datafusion/proto-common/src/from_proto/mod.rs
+++ b/datafusion/proto-common/src/from_proto/mod.rs
@@ -936,6 +936,7 @@ impl TryFrom<&protobuf::ParquetOptions> for ParquetOptions {
     fn try_from(
         value: &protobuf::ParquetOptions,
     ) -> datafusion_common::Result<Self, Self::Error> {
+        #[allow(deprecated)] // max_statistics_size
         Ok(ParquetOptions {
             enable_page_index: value.enable_page_index,
             pruning: value.pruning,
@@ -1013,6 +1014,7 @@ impl TryFrom<&protobuf::ParquetColumnOptions> for ParquetColumnOptions {
     fn try_from(
         value: &protobuf::ParquetColumnOptions,
     ) -> datafusion_common::Result<Self, Self::Error> {
+        #[allow(deprecated)] // max_statistics_size
         Ok(ParquetColumnOptions {
             compression: value.compression_opt.clone().map(|opt| match opt {
                 protobuf::parquet_column_options::CompressionOpt::Compression(v) => Some(v),
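// Aside (not part of the patch): a standalone sketch of why the macros above now
// emit #[allow(deprecated)] around the code they generate. Once a field carries
// #[deprecated], every same-crate use of it (construction in Default, reads in
// visit, writes in set) trips rustc's deprecation lint, so each generated site has
// to opt out explicitly. All names below are illustrative, not DataFusion code.
#[derive(Debug, Clone, PartialEq)]
pub struct ExampleOptions {
    pub compression: Option<String>,
    #[deprecated(since = "45.0.0", note = "Setting does not do anything")]
    pub max_statistics_size: Option<usize>,
}

impl Default for ExampleOptions {
    fn default() -> Self {
        // Without this allow, constructing the struct warns because the
        // deprecated field is mentioned in the literal.
        #[allow(deprecated)]
        Self {
            compression: None,
            max_statistics_size: Some(4096),
        }
    }
}

fn main() {
    let opts = ExampleOptions::default();
    println!("compression = {:?}", opts.compression);
    // Reading the field is also a deprecated use and needs its own opt-out.
    #[allow(deprecated)]
    println!("max_statistics_size = {:?}", opts.max_statistics_size);
}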
diff --git a/datafusion/proto-common/src/to_proto/mod.rs b/datafusion/proto-common/src/to_proto/mod.rs
index 83296ca76f0f..88bbbfd60426 100644
--- a/datafusion/proto-common/src/to_proto/mod.rs
+++ b/datafusion/proto-common/src/to_proto/mod.rs
@@ -820,6 +820,7 @@ impl TryFrom<&ParquetOptions> for protobuf::ParquetOptions {
             dictionary_enabled_opt: value.dictionary_enabled.map(protobuf::parquet_options::DictionaryEnabledOpt::DictionaryEnabled),
             dictionary_page_size_limit: value.dictionary_page_size_limit as u64,
             statistics_enabled_opt: value.statistics_enabled.clone().map(protobuf::parquet_options::StatisticsEnabledOpt::StatisticsEnabled),
+            #[allow(deprecated)]
             max_statistics_size_opt: value.max_statistics_size.map(|v| protobuf::parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v as u64)),
             max_row_group_size: value.max_row_group_size as u64,
             created_by: value.created_by.clone(),
@@ -858,6 +859,7 @@ impl TryFrom<&ParquetColumnOptions> for protobuf::ParquetColumnOptions {
                 .statistics_enabled
                 .clone()
                 .map(protobuf::parquet_column_options::StatisticsEnabledOpt::StatisticsEnabled),
+            #[allow(deprecated)]
             max_statistics_size_opt: value.max_statistics_size.map(|v| {
                 protobuf::parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(
                     v as u32,
diff --git a/datafusion/proto/src/logical_plan/file_formats.rs b/datafusion/proto/src/logical_plan/file_formats.rs
index 772e6d23426a..237e6d2a7137 100644
--- a/datafusion/proto/src/logical_plan/file_formats.rs
+++ b/datafusion/proto/src/logical_plan/file_formats.rs
@@ -362,6 +362,7 @@ impl TableParquetOptionsProto {
         };

         let column_specific_options = global_options.column_specific_options;
+        #[allow(deprecated)] // max_statistics_size
         TableParquetOptionsProto {
             global: Some(ParquetOptionsProto {
                 enable_page_index: global_options.global.enable_page_index,
@@ -455,6 +456,7 @@ impl TableParquetOptionsProto {

 impl From<&ParquetOptionsProto> for ParquetOptions {
     fn from(proto: &ParquetOptionsProto) -> Self {
+        #[allow(deprecated)] // max_statistics_size
         ParquetOptions {
             enable_page_index: proto.enable_page_index,
             pruning: proto.pruning,
@@ -509,6 +511,7 @@ impl From<&ParquetOptionsProto> for ParquetOptions {

 impl From<ParquetColumnOptionsProto> for ParquetColumnOptions {
     fn from(proto: ParquetColumnOptionsProto) -> Self {
+        #[allow(deprecated)] // max_statistics_size
         ParquetColumnOptions {
             bloom_filter_enabled: proto.bloom_filter_enabled_opt.map(
                 |parquet_column_options::BloomFilterEnabledOpt::BloomFilterEnabled(v)| v,
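// Aside (not part of the patch): the root cause is upstream. In recent releases of
// the `parquet` crate (assumed here to be a release where this setter is already
// marked deprecated, as the #[allow(deprecated)] blocks in parquet_writer.rs imply),
// the writer-properties knob is deprecated and ignored, which is why the conversion
// code above forwards it only under an explicit opt-out.
use parquet::file::properties::WriterProperties;

fn main() {
    // Forwarding the configured value still compiles under the allow, but the
    // resulting properties no longer use it to truncate column statistics.
    #[allow(deprecated)]
    let props = WriterProperties::builder()
        .set_max_statistics_size(4096)
        .build();
    println!("created_by = {}", props.created_by());
}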
diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt
index 46618b32d77a..4653df400080 100644
--- a/datafusion/sqllogictest/test_files/information_schema.slt
+++ b/datafusion/sqllogictest/test_files/information_schema.slt
@@ -301,7 +301,7 @@ datafusion.execution.parquet.dictionary_page_size_limit 1048576 (writing) Sets b
 datafusion.execution.parquet.enable_page_index true (reading) If true, reads the Parquet data page level metadata (the Page Index), if present, to reduce the I/O and number of rows decoded.
 datafusion.execution.parquet.encoding NULL (writing) Sets default encoding for any column. Valid values are: plain, plain_dictionary, rle, bit_packed, delta_binary_packed, delta_length_byte_array, delta_byte_array, rle_dictionary, and byte_stream_split. These values are not case sensitive. If NULL, uses default parquet writer setting
 datafusion.execution.parquet.max_row_group_size 1048576 (writing) Target maximum number of rows in each row group (defaults to 1M rows). Writing larger row groups requires more memory to write, but can get better compression and be faster to read.
-datafusion.execution.parquet.max_statistics_size 4096 (writing) Sets max statistics size for any column. If NULL, uses default parquet writer setting
+datafusion.execution.parquet.max_statistics_size 4096 (writing) Sets max statistics size for any column. If NULL, uses default parquet writer setting max_statistics_size is deprecated, currently it is not being used
 datafusion.execution.parquet.maximum_buffered_record_batches_per_stream 2 (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame.
 datafusion.execution.parquet.maximum_parallel_row_group_writers 1 (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame.
 datafusion.execution.parquet.metadata_size_hint NULL (reading) If specified, the parquet reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. If not specified, two reads are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer
diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md
index 1c39064c15d7..dd9ce759b28a 100644
--- a/docs/source/user-guide/configs.md
+++ b/docs/source/user-guide/configs.md
@@ -66,7 +66,7 @@ Environment variables are read during `SessionConfig` initialisation so they mus
 | datafusion.execution.parquet.dictionary_enabled | true | (writing) Sets if dictionary encoding is enabled. If NULL, uses default parquet writer setting |
 | datafusion.execution.parquet.dictionary_page_size_limit | 1048576 | (writing) Sets best effort maximum dictionary page size, in bytes |
 | datafusion.execution.parquet.statistics_enabled | page | (writing) Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting |
-| datafusion.execution.parquet.max_statistics_size | 4096 | (writing) Sets max statistics size for any column. If NULL, uses default parquet writer setting |
+| datafusion.execution.parquet.max_statistics_size | 4096 | (writing) Sets max statistics size for any column. If NULL, uses default parquet writer setting max_statistics_size is deprecated, currently it is not being used |
 | datafusion.execution.parquet.max_row_group_size | 1048576 | (writing) Target maximum number of rows in each row group (defaults to 1M rows). Writing larger row groups requires more memory to write, but can get better compression and be faster to read. |
 | datafusion.execution.parquet.created_by | datafusion version 44.0.0 | (writing) Sets "created by" property |
 | datafusion.execution.parquet.column_index_truncate_length | 64 | (writing) Sets column index truncate length |
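// Aside (not part of the patch): a sketch of the user-facing effect, assuming a
// dependency on `datafusion-common` at a release that includes this change. The
// string-keyed setting keeps working because the macro-generated `set` arm is
// wrapped in #[allow(deprecated)]; only direct field access trips the lint.
use datafusion_common::config::ConfigOptions;

fn main() -> datafusion_common::Result<()> {
    let mut options = ConfigOptions::new();

    // Still accepted, and still listed in information_schema and configs.md,
    // but the parquet writer ignores the value.
    options.set("datafusion.execution.parquet.max_statistics_size", "4096")?;

    // Direct field access is what the new #[deprecated] attribute flags; without
    // this allow, rustc emits a deprecation warning at compile time.
    #[allow(deprecated)]
    {
        options.execution.parquet.max_statistics_size = Some(4096);
    }

    Ok(())
}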