-
Notifications
You must be signed in to change notification settings - Fork 1.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add configurable normalization for configuration options and preserve case for S3 paths #13576
base: main
Are you sure you want to change the base?
Changes from all commits
2887bb9
813d634
c3de620
0e11d36
cb7da8c
7c2b3fe
0574ab8
2045495
06c013d
967f79f
aea717d
64ebbd5
947d1c2
66c3cd5
9a35746
6ccbb02
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,6 +19,7 @@ | |
|
||
use std::any::Any; | ||
use std::collections::{BTreeMap, HashMap}; | ||
use std::error::Error; | ||
use std::fmt::{self, Display}; | ||
use std::str::FromStr; | ||
|
||
|
@@ -29,7 +30,9 @@ use crate::{DataFusionError, Result}; | |
|
||
/// A macro that wraps a configuration struct and automatically derives | ||
/// [`Default`] and [`ConfigField`] for it, allowing it to be used | ||
/// in the [`ConfigOptions`] configuration tree | ||
/// in the [`ConfigOptions`] configuration tree. | ||
/// | ||
/// `transform` is used to normalize values before parsing. | ||
/// | ||
/// For example, | ||
/// | ||
|
@@ -38,7 +41,7 @@ use crate::{DataFusionError, Result}; | |
/// /// Amazing config | ||
/// pub struct MyConfig { | ||
/// /// Field 1 doc | ||
/// field1: String, default = "".to_string() | ||
/// field1: String, transform = str::to_lowercase, default = "".to_string() | ||
/// | ||
/// /// Field 2 doc | ||
/// field2: usize, default = 232 | ||
|
@@ -67,9 +70,12 @@ use crate::{DataFusionError, Result}; | |
/// fn set(&mut self, key: &str, value: &str) -> Result<()> { | ||
/// let (key, rem) = key.split_once('.').unwrap_or((key, "")); | ||
/// match key { | ||
/// "field1" => self.field1.set(rem, value), | ||
/// "field2" => self.field2.set(rem, value), | ||
/// "field3" => self.field3.set(rem, value), | ||
/// "field1" => { | ||
/// let value = str::to_lowercase(value); | ||
/// self.field1.set(rem, value.as_ref()) | ||
/// }, | ||
/// "field2" => self.field2.set(rem, value.as_ref()), | ||
/// "field3" => self.field3.set(rem, value.as_ref()), | ||
/// _ => _internal_err!( | ||
/// "Config value \"{}\" not found on MyConfig", | ||
/// key | ||
|
@@ -102,15 +108,14 @@ use crate::{DataFusionError, Result}; | |
/// ``` | ||
/// | ||
/// NB: Misplaced commas may result in nonsensical errors | ||
/// | ||
#[macro_export] | ||
macro_rules! config_namespace { | ||
( | ||
$(#[doc = $struct_d:tt])* | ||
$vis:vis struct $struct_name:ident { | ||
$( | ||
$(#[doc = $d:tt])* | ||
$field_vis:vis $field_name:ident : $field_type:ty, default = $default:expr | ||
$field_vis:vis $field_name:ident : $field_type:ty, $(warn = $warn: expr,)? $(transform = $transform:expr,)? default = $default:expr | ||
)*$(,)* | ||
} | ||
) => { | ||
|
@@ -127,9 +132,14 @@ macro_rules! config_namespace { | |
impl ConfigField for $struct_name { | ||
fn set(&mut self, key: &str, value: &str) -> Result<()> { | ||
let (key, rem) = key.split_once('.').unwrap_or((key, "")); | ||
|
||
match key { | ||
$( | ||
stringify!($field_name) => self.$field_name.set(rem, value), | ||
stringify!($field_name) => { | ||
$(let value = $transform(value);)? | ||
$(log::warn!($warn);)? | ||
self.$field_name.set(rem, value.as_ref()) | ||
}, | ||
)* | ||
_ => return _config_err!( | ||
"Config value \"{}\" not found on {}", key, stringify!($struct_name) | ||
|
@@ -211,12 +221,15 @@ config_namespace! { | |
/// When set to true, SQL parser will normalize ident (convert ident to lowercase when not quoted) | ||
pub enable_ident_normalization: bool, default = true | ||
|
||
/// When set to true, SQL parser will normalize options value (convert value to lowercase) | ||
pub enable_options_value_normalization: bool, default = true | ||
/// When set to true, SQL parser will normalize options value (convert value to lowercase). | ||
/// Note that this option is ignored and will be removed in the future. All case-insensitive values | ||
/// are normalized automatically. | ||
pub enable_options_value_normalization: bool, warn = "`enable_options_value_normalization` is deprecated and ignored", default = false | ||
|
||
/// Configure the SQL dialect used by DataFusion's parser; supported values include: Generic, | ||
/// MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, and Ansi. | ||
pub dialect: String, default = "generic".to_string() | ||
// no need to lowercase because `sqlparser::dialect_from_str`] is case-insensitive | ||
|
||
/// If true, permit lengths for `VARCHAR` such as `VARCHAR(20)`, but | ||
/// ignore the length. If false, error if a `VARCHAR` with a length is | ||
|
@@ -431,7 +444,7 @@ config_namespace! { | |
/// | ||
/// Note that this default setting is not the same as | ||
/// the default parquet writer setting. | ||
pub compression: Option<String>, default = Some("zstd(3)".into()) | ||
pub compression: Option<String>, transform = str::to_lowercase, default = Some("zstd(3)".into()) | ||
|
||
/// (writing) Sets if dictionary encoding is enabled. If NULL, uses | ||
/// default parquet writer setting | ||
|
@@ -444,7 +457,7 @@ config_namespace! { | |
/// Valid values are: "none", "chunk", and "page" | ||
/// These values are not case sensitive. If NULL, uses | ||
/// default parquet writer setting | ||
pub statistics_enabled: Option<String>, default = Some("page".into()) | ||
pub statistics_enabled: Option<String>, transform = str::to_lowercase, default = Some("page".into()) | ||
|
||
/// (writing) Sets max statistics size for any column. If NULL, uses | ||
/// default parquet writer setting | ||
|
@@ -470,7 +483,7 @@ config_namespace! { | |
/// delta_byte_array, rle_dictionary, and byte_stream_split. | ||
/// These values are not case sensitive. If NULL, uses | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
this suggests the code reading this value does normalization and that it's no longer needed, can be simplified There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I actually think this is a good info for users that you assign any values to it - "plain", "PLAIN", "Plain", etc. Also, I think that's what Oleks asked in #13576 (comment) - so maybe let's keep it as is? |
||
/// default parquet writer setting | ||
pub encoding: Option<String>, default = None | ||
pub encoding: Option<String>, transform = str::to_lowercase, default = None | ||
blaginin marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
/// (writing) Use any available bloom filters when reading parquet files | ||
pub bloom_filter_on_read: bool, default = true | ||
|
@@ -971,29 +984,45 @@ impl<F: ConfigField + Default> ConfigField for Option<F> { | |
} | ||
} | ||
|
||
fn default_transform<T>(input: &str) -> Result<T> | ||
where | ||
T: FromStr, | ||
<T as FromStr>::Err: Sync + Send + Error + 'static, | ||
{ | ||
input.parse().map_err(|e| { | ||
DataFusionError::Context( | ||
format!( | ||
"Error parsing '{}' as {}", | ||
input, | ||
std::any::type_name::<T>() | ||
), | ||
Box::new(DataFusionError::External(Box::new(e))), | ||
) | ||
}) | ||
} | ||
|
||
#[macro_export] | ||
macro_rules! config_field { | ||
($t:ty) => { | ||
config_field!($t, value => default_transform(value)?); | ||
}; | ||
|
||
($t:ty, $arg:ident => $transform:expr) => { | ||
impl ConfigField for $t { | ||
fn visit<V: Visit>(&self, v: &mut V, key: &str, description: &'static str) { | ||
v.some(key, self, description) | ||
} | ||
|
||
fn set(&mut self, _: &str, value: &str) -> Result<()> { | ||
*self = value.parse().map_err(|e| { | ||
DataFusionError::Context( | ||
format!(concat!("Error parsing {} as ", stringify!($t),), value), | ||
Box::new(DataFusionError::External(Box::new(e))), | ||
) | ||
})?; | ||
fn set(&mut self, _: &str, $arg: &str) -> Result<()> { | ||
*self = $transform; | ||
Ok(()) | ||
} | ||
} | ||
}; | ||
} | ||
|
||
config_field!(String); | ||
config_field!(bool); | ||
config_field!(bool, value => default_transform(value.to_lowercase().as_str())?); | ||
config_field!(usize); | ||
config_field!(f64); | ||
config_field!(u64); | ||
|
@@ -1508,7 +1537,7 @@ macro_rules! config_namespace_with_hashmap { | |
$vis:vis struct $struct_name:ident { | ||
$( | ||
$(#[doc = $d:tt])* | ||
$field_vis:vis $field_name:ident : $field_type:ty, default = $default:expr | ||
$field_vis:vis $field_name:ident : $field_type:ty, $(transform = $transform:expr,)? default = $default:expr | ||
)*$(,)* | ||
} | ||
) => { | ||
|
@@ -1527,7 +1556,10 @@ macro_rules! config_namespace_with_hashmap { | |
let (key, rem) = key.split_once('.').unwrap_or((key, "")); | ||
match key { | ||
$( | ||
stringify!($field_name) => self.$field_name.set(rem, value), | ||
stringify!($field_name) => { | ||
$(let value = $transform(value);)? | ||
self.$field_name.set(rem, value.as_ref()) | ||
}, | ||
)* | ||
_ => _config_err!( | ||
"Config value \"{}\" not found on {}", key, stringify!($struct_name) | ||
|
@@ -1606,7 +1638,7 @@ config_namespace_with_hashmap! { | |
/// lzo, brotli(level), lz4, zstd(level), and lz4_raw. | ||
/// These values are not case-sensitive. If NULL, uses | ||
/// default parquet options | ||
pub compression: Option<String>, default = None | ||
pub compression: Option<String>, transform = str::to_lowercase, default = None | ||
|
||
/// Sets if statistics are enabled for the column | ||
/// Valid values are: "none", "chunk", and "page" | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Now the
These values are not case sensitive
can be removed