-
Notifications
You must be signed in to change notification settings - Fork 1.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add configurable normalization for configuration options and preserve case for S3 paths #13576
base: main
Are you sure you want to change the base?
Changes from 9 commits
2887bb9
813d634
c3de620
0e11d36
cb7da8c
7c2b3fe
0574ab8
2045495
06c013d
967f79f
aea717d
64ebbd5
947d1c2
66c3cd5
9a35746
6ccbb02
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,6 +19,7 @@ | |
|
||
use std::any::Any; | ||
use std::collections::{BTreeMap, HashMap}; | ||
use std::error::Error; | ||
use std::fmt::{self, Display}; | ||
use std::str::FromStr; | ||
|
||
|
@@ -38,7 +39,7 @@ use crate::{DataFusionError, Result}; | |
/// /// Amazing config | ||
/// pub struct MyConfig { | ||
/// /// Field 1 doc | ||
/// field1: String, default = "".to_string() | ||
/// field1: String, transform = str::to_lowercase, default = "".to_string() | ||
/// | ||
/// /// Field 2 doc | ||
/// field2: usize, default = 232 | ||
|
@@ -67,9 +68,12 @@ use crate::{DataFusionError, Result}; | |
/// fn set(&mut self, key: &str, value: &str) -> Result<()> { | ||
/// let (key, rem) = key.split_once('.').unwrap_or((key, "")); | ||
/// match key { | ||
/// "field1" => self.field1.set(rem, value), | ||
/// "field2" => self.field2.set(rem, value), | ||
/// "field3" => self.field3.set(rem, value), | ||
/// "field1" => { | ||
/// let value = str::to_lowercase(value); | ||
/// self.field1.set(rem, value.as_ref()) | ||
/// }, | ||
/// "field2" => self.field2.set(rem, value.as_ref()), | ||
/// "field3" => self.field3.set(rem, value.as_ref()), | ||
/// _ => _internal_err!( | ||
/// "Config value \"{}\" not found on MyConfig", | ||
/// key | ||
|
@@ -102,15 +106,14 @@ use crate::{DataFusionError, Result}; | |
/// ``` | ||
/// | ||
/// NB: Misplaced commas may result in nonsensical errors | ||
/// | ||
#[macro_export] | ||
macro_rules! config_namespace { | ||
( | ||
$(#[doc = $struct_d:tt])* | ||
$vis:vis struct $struct_name:ident { | ||
$( | ||
$(#[doc = $d:tt])* | ||
$field_vis:vis $field_name:ident : $field_type:ty, default = $default:expr | ||
$field_vis:vis $field_name:ident : $field_type:ty, $(transform = $transform:expr,)? default = $default:expr | ||
)*$(,)* | ||
} | ||
) => { | ||
|
@@ -127,9 +130,13 @@ macro_rules! config_namespace { | |
impl ConfigField for $struct_name { | ||
fn set(&mut self, key: &str, value: &str) -> Result<()> { | ||
let (key, rem) = key.split_once('.').unwrap_or((key, "")); | ||
|
||
match key { | ||
$( | ||
stringify!($field_name) => self.$field_name.set(rem, value), | ||
stringify!($field_name) => { | ||
$(let value = $transform(value);)? | ||
self.$field_name.set(rem, value.as_ref()) | ||
}, | ||
)* | ||
_ => return _config_err!( | ||
"Config value \"{}\" not found on {}", key, stringify!($struct_name) | ||
|
@@ -212,11 +219,12 @@ config_namespace! { | |
pub enable_ident_normalization: bool, default = true | ||
|
||
/// When set to true, SQL parser will normalize options value (convert value to lowercase) | ||
pub enable_options_value_normalization: bool, default = true | ||
pub enable_options_value_normalization: bool, default = false | ||
|
||
/// Configure the SQL dialect used by DataFusion's parser; supported values include: Generic, | ||
/// MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, and Ansi. | ||
pub dialect: String, default = "generic".to_string() | ||
pub dialect: String, default = "generic".to_string() // no need to lowercase because | ||
// [`sqlparser::dialect_from_str`] is case-insensitive | ||
blaginin marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
/// If true, permit lengths for `VARCHAR` such as `VARCHAR(20)`, but | ||
/// ignore the length. If false, error if a `VARCHAR` with a length is | ||
|
@@ -431,7 +439,7 @@ config_namespace! { | |
/// | ||
/// Note that this default setting is not the same as | ||
/// the default parquet writer setting. | ||
pub compression: Option<String>, default = Some("zstd(3)".into()) | ||
pub compression: Option<String>, transform = str::to_lowercase, default = Some("zstd(3)".into()) | ||
|
||
/// (writing) Sets if dictionary encoding is enabled. If NULL, uses | ||
/// default parquet writer setting | ||
|
@@ -444,7 +452,7 @@ config_namespace! { | |
/// Valid values are: "none", "chunk", and "page" | ||
/// These values are not case sensitive. If NULL, uses | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Now the |
||
/// default parquet writer setting | ||
pub statistics_enabled: Option<String>, default = Some("page".into()) | ||
pub statistics_enabled: Option<String>, transform = str::to_lowercase, default = Some("page".into()) | ||
|
||
/// (writing) Sets max statistics size for any column. If NULL, uses | ||
/// default parquet writer setting | ||
|
@@ -470,7 +478,7 @@ config_namespace! { | |
/// delta_byte_array, rle_dictionary, and byte_stream_split. | ||
/// These values are not case sensitive. If NULL, uses | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
this suggests the code reading this value does normalization and that it's no longer needed, can be simplified There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I actually think this is a good info for users that you assign any values to it - "plain", "PLAIN", "Plain", etc. Also, I think that's what Oleks asked in #13576 (comment) - so maybe let's keep it as is? |
||
/// default parquet writer setting | ||
pub encoding: Option<String>, default = None | ||
pub encoding: Option<String>, transform = str::to_lowercase, default = None | ||
blaginin marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
/// (writing) Use any available bloom filters when reading parquet files | ||
pub bloom_filter_on_read: bool, default = true | ||
|
@@ -971,29 +979,46 @@ impl<F: ConfigField + Default> ConfigField for Option<F> { | |
} | ||
} | ||
|
||
fn parse<T>(input: &str) -> Result<T> | ||
blaginin marked this conversation as resolved.
Show resolved
Hide resolved
|
||
where | ||
T: FromStr, | ||
T::Err: Display, | ||
blaginin marked this conversation as resolved.
Show resolved
Hide resolved
|
||
<T as FromStr>::Err: Sync + Send + Error + 'static, | ||
{ | ||
input.parse().map_err(|e| { | ||
DataFusionError::Context( | ||
format!( | ||
"Error parsing '{}' as {}", | ||
input, | ||
std::any::type_name::<T>() | ||
), | ||
Box::new(DataFusionError::External(Box::new(e))), | ||
) | ||
}) | ||
} | ||
|
||
#[macro_export] | ||
macro_rules! config_field { | ||
($t:ty) => { | ||
config_field!($t, value => parse(value)?); | ||
}; | ||
|
||
($t:ty, $arg:ident => $transform:expr) => { | ||
impl ConfigField for $t { | ||
fn visit<V: Visit>(&self, v: &mut V, key: &str, description: &'static str) { | ||
v.some(key, self, description) | ||
} | ||
|
||
fn set(&mut self, _: &str, value: &str) -> Result<()> { | ||
*self = value.parse().map_err(|e| { | ||
DataFusionError::Context( | ||
format!(concat!("Error parsing {} as ", stringify!($t),), value), | ||
Box::new(DataFusionError::External(Box::new(e))), | ||
) | ||
})?; | ||
fn set(&mut self, _: &str, $arg: &str) -> Result<()> { | ||
*self = $transform; | ||
Ok(()) | ||
} | ||
} | ||
}; | ||
} | ||
|
||
config_field!(String); | ||
config_field!(bool); | ||
config_field!(bool, value => parse(value.to_lowercase().as_str())?); | ||
config_field!(usize); | ||
config_field!(f64); | ||
config_field!(u64); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is the important change in this PR, along with the definition of
transform
for option values.cc @alamb @comphead @jayzhan211
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
FWI @xinlifoobar and @berkaysynnada who worked on #11330 where this feature was added. Interestingly, this option was added in response to seemingly the same problem (AWS access keys being normalized) as faced by @tinfoil-knight here:
Rather than changing the default config values, we could potentially change the default in datafusion cli and/or improve documentation as well
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think that solution is good enough for the long-term.
I understand a global config was introduced to avoid changing behavior for everyone, but that clearly doesn't work when different parts of the system need different behavior. For example S3 component needs case-sensitivity whereas some other component, breaks when lowercasing is turned off.
The global config toggle solved the problem only for some use-cases and only for people knowing about the toggle. This PR aims to solve the problem the better way (IMO).
I would prefer we continue with the approach proposed here and later deprecate the global toggle.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looking at the linked issue and its PR, it seems what I am looking for is exactly what @ozankabak wanted in #11330 (comment)
this PR addresses this for built-in options. A nice generic normalization for extension options is yet to be addressed.