diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index a0a89fb3d14f..adb04a9a5a6e 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -1329,6 +1329,7 @@ dependencies = [ "hashbrown 0.14.5", "indexmap", "libc", + "log", "object_store", "parquet", "paste", diff --git a/datafusion-cli/src/object_storage.rs b/datafusion-cli/src/object_storage.rs index de66b60fe449..045c924e5037 100644 --- a/datafusion-cli/src/object_storage.rs +++ b/datafusion-cli/src/object_storage.rs @@ -472,12 +472,13 @@ mod tests { #[tokio::test] async fn s3_object_store_builder() -> Result<()> { - let access_key_id = "fake_access_key_id"; - let secret_access_key = "fake_secret_access_key"; + // "fake" is uppercase to ensure the values are not lowercased when parsed + let access_key_id = "FAKE_access_key_id"; + let secret_access_key = "FAKE_secret_access_key"; let region = "fake_us-east-2"; let endpoint = "endpoint33"; - let session_token = "fake_session_token"; - let location = "s3://bucket/path/file.parquet"; + let session_token = "FAKE_session_token"; + let location = "s3://bucket/path/FAKE/file.parquet"; let table_url = ListingTableUrl::parse(location)?; let scheme = table_url.scheme(); diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index 82909404e455..a81ec724dd66 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -57,6 +57,7 @@ half = { workspace = true } hashbrown = { workspace = true } indexmap = { workspace = true } libc = "0.2.140" +log = { workspace = true } object_store = { workspace = true, optional = true } parquet = { workspace = true, optional = true, default-features = true } paste = "1.0.15" diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 3a07a238a4c9..97fb7dcd84c2 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -19,6 +19,7 @@ use std::any::Any; use std::collections::{BTreeMap, HashMap}; +use std::error::Error; use std::fmt::{self, Display}; use std::str::FromStr; @@ -29,7 +30,9 @@ use crate::{DataFusionError, Result}; /// A macro that wraps a configuration struct and automatically derives /// [`Default`] and [`ConfigField`] for it, allowing it to be used -/// in the [`ConfigOptions`] configuration tree +/// in the [`ConfigOptions`] configuration tree. +/// +/// `transform` is used to normalize values before parsing. /// /// For example, /// @@ -38,7 +41,7 @@ use crate::{DataFusionError, Result}; /// /// Amazing config /// pub struct MyConfig { /// /// Field 1 doc -/// field1: String, default = "".to_string() +/// field1: String, transform = str::to_lowercase, default = "".to_string() /// /// /// Field 2 doc /// field2: usize, default = 232 @@ -67,9 +70,12 @@ use crate::{DataFusionError, Result}; /// fn set(&mut self, key: &str, value: &str) -> Result<()> { /// let (key, rem) = key.split_once('.').unwrap_or((key, "")); /// match key { -/// "field1" => self.field1.set(rem, value), -/// "field2" => self.field2.set(rem, value), -/// "field3" => self.field3.set(rem, value), +/// "field1" => { +/// let value = str::to_lowercase(value); +/// self.field1.set(rem, value.as_ref()) +/// }, +/// "field2" => self.field2.set(rem, value.as_ref()), +/// "field3" => self.field3.set(rem, value.as_ref()), /// _ => _internal_err!( /// "Config value \"{}\" not found on MyConfig", /// key @@ -102,7 +108,6 @@ use crate::{DataFusionError, Result}; /// ``` /// /// NB: Misplaced commas may result in nonsensical errors -/// #[macro_export] macro_rules! config_namespace { ( @@ -110,7 +115,7 @@ macro_rules! config_namespace { $vis:vis struct $struct_name:ident { $( $(#[doc = $d:tt])* - $field_vis:vis $field_name:ident : $field_type:ty, default = $default:expr + $field_vis:vis $field_name:ident : $field_type:ty, $(warn = $warn: expr,)? $(transform = $transform:expr,)? default = $default:expr )*$(,)* } ) => { @@ -127,9 +132,14 @@ macro_rules! config_namespace { impl ConfigField for $struct_name { fn set(&mut self, key: &str, value: &str) -> Result<()> { let (key, rem) = key.split_once('.').unwrap_or((key, "")); + match key { $( - stringify!($field_name) => self.$field_name.set(rem, value), + stringify!($field_name) => { + $(let value = $transform(value);)? + $(log::warn!($warn);)? + self.$field_name.set(rem, value.as_ref()) + }, )* _ => return _config_err!( "Config value \"{}\" not found on {}", key, stringify!($struct_name) @@ -211,12 +221,15 @@ config_namespace! { /// When set to true, SQL parser will normalize ident (convert ident to lowercase when not quoted) pub enable_ident_normalization: bool, default = true - /// When set to true, SQL parser will normalize options value (convert value to lowercase) - pub enable_options_value_normalization: bool, default = true + /// When set to true, SQL parser will normalize options value (convert value to lowercase). + /// Note that this option is ignored and will be removed in the future. All case-insensitive values + /// are normalized automatically. + pub enable_options_value_normalization: bool, warn = "`enable_options_value_normalization` is deprecated and ignored", default = false /// Configure the SQL dialect used by DataFusion's parser; supported values include: Generic, /// MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, and Ansi. pub dialect: String, default = "generic".to_string() + // no need to lowercase because `sqlparser::dialect_from_str`] is case-insensitive /// If true, permit lengths for `VARCHAR` such as `VARCHAR(20)`, but /// ignore the length. If false, error if a `VARCHAR` with a length is @@ -431,7 +444,7 @@ config_namespace! { /// /// Note that this default setting is not the same as /// the default parquet writer setting. - pub compression: Option, default = Some("zstd(3)".into()) + pub compression: Option, transform = str::to_lowercase, default = Some("zstd(3)".into()) /// (writing) Sets if dictionary encoding is enabled. If NULL, uses /// default parquet writer setting @@ -444,7 +457,7 @@ config_namespace! { /// Valid values are: "none", "chunk", and "page" /// These values are not case sensitive. If NULL, uses /// default parquet writer setting - pub statistics_enabled: Option, default = Some("page".into()) + pub statistics_enabled: Option, transform = str::to_lowercase, default = Some("page".into()) /// (writing) Sets max statistics size for any column. If NULL, uses /// default parquet writer setting @@ -470,7 +483,7 @@ config_namespace! { /// delta_byte_array, rle_dictionary, and byte_stream_split. /// These values are not case sensitive. If NULL, uses /// default parquet writer setting - pub encoding: Option, default = None + pub encoding: Option, transform = str::to_lowercase, default = None /// (writing) Use any available bloom filters when reading parquet files pub bloom_filter_on_read: bool, default = true @@ -971,21 +984,37 @@ impl ConfigField for Option { } } +fn default_transform(input: &str) -> Result +where + T: FromStr, + ::Err: Sync + Send + Error + 'static, +{ + input.parse().map_err(|e| { + DataFusionError::Context( + format!( + "Error parsing '{}' as {}", + input, + std::any::type_name::() + ), + Box::new(DataFusionError::External(Box::new(e))), + ) + }) +} + #[macro_export] macro_rules! config_field { ($t:ty) => { + config_field!($t, value => default_transform(value)?); + }; + + ($t:ty, $arg:ident => $transform:expr) => { impl ConfigField for $t { fn visit(&self, v: &mut V, key: &str, description: &'static str) { v.some(key, self, description) } - fn set(&mut self, _: &str, value: &str) -> Result<()> { - *self = value.parse().map_err(|e| { - DataFusionError::Context( - format!(concat!("Error parsing {} as ", stringify!($t),), value), - Box::new(DataFusionError::External(Box::new(e))), - ) - })?; + fn set(&mut self, _: &str, $arg: &str) -> Result<()> { + *self = $transform; Ok(()) } } @@ -993,7 +1022,7 @@ macro_rules! config_field { } config_field!(String); -config_field!(bool); +config_field!(bool, value => default_transform(value.to_lowercase().as_str())?); config_field!(usize); config_field!(f64); config_field!(u64); @@ -1508,7 +1537,7 @@ macro_rules! config_namespace_with_hashmap { $vis:vis struct $struct_name:ident { $( $(#[doc = $d:tt])* - $field_vis:vis $field_name:ident : $field_type:ty, default = $default:expr + $field_vis:vis $field_name:ident : $field_type:ty, $(transform = $transform:expr,)? default = $default:expr )*$(,)* } ) => { @@ -1527,7 +1556,10 @@ macro_rules! config_namespace_with_hashmap { let (key, rem) = key.split_once('.').unwrap_or((key, "")); match key { $( - stringify!($field_name) => self.$field_name.set(rem, value), + stringify!($field_name) => { + $(let value = $transform(value);)? + self.$field_name.set(rem, value.as_ref()) + }, )* _ => _config_err!( "Config value \"{}\" not found on {}", key, stringify!($struct_name) @@ -1606,7 +1638,7 @@ config_namespace_with_hashmap! { /// lzo, brotli(level), lz4, zstd(level), and lz4_raw. /// These values are not case-sensitive. If NULL, uses /// default parquet options - pub compression: Option, default = None + pub compression: Option, transform = str::to_lowercase, default = None /// Sets if statistics are enabled for the column /// Valid values are: "none", "chunk", and "page" diff --git a/datafusion/core/src/datasource/stream.rs b/datafusion/core/src/datasource/stream.rs index d8fad5b6cd37..2cea37fe17e2 100644 --- a/datafusion/core/src/datasource/stream.rs +++ b/datafusion/core/src/datasource/stream.rs @@ -62,7 +62,7 @@ impl TableProviderFactory for StreamTableFactory { let header = if let Ok(opt) = cmd .options .get("format.has_header") - .map(|has_header| bool::from_str(has_header)) + .map(|has_header| bool::from_str(has_header.to_lowercase().as_str())) .transpose() { opt.unwrap_or(false) diff --git a/datafusion/core/tests/config_from_env.rs b/datafusion/core/tests/config_from_env.rs index a5a5a4524e60..976597c8a9ac 100644 --- a/datafusion/core/tests/config_from_env.rs +++ b/datafusion/core/tests/config_from_env.rs @@ -22,10 +22,19 @@ use std::env; fn from_env() { // Note: these must be a single test to avoid interference from concurrent execution let env_key = "DATAFUSION_OPTIMIZER_FILTER_NULL_JOIN_KEYS"; - env::set_var(env_key, "true"); - let config = ConfigOptions::from_env().unwrap(); + // valid testing in different cases + for bool_option in ["true", "TRUE", "True", "tRUe"] { + env::set_var(env_key, bool_option); + let config = ConfigOptions::from_env().unwrap(); + env::remove_var(env_key); + assert!(config.optimizer.filter_null_join_keys); + } + + // invalid testing + env::set_var(env_key, "ttruee"); + let err = ConfigOptions::from_env().unwrap_err().strip_backtrace(); + assert_eq!(err, "Error parsing 'ttruee' as bool\ncaused by\nExternal error: provided string was not `true` or `false`"); env::remove_var(env_key); - assert!(config.optimizer.filter_null_join_keys); let env_key = "DATAFUSION_EXECUTION_BATCH_SIZE"; @@ -37,7 +46,7 @@ fn from_env() { // for invalid testing env::set_var(env_key, "abc"); let err = ConfigOptions::from_env().unwrap_err().strip_backtrace(); - assert_eq!(err, "Error parsing abc as usize\ncaused by\nExternal error: invalid digit found in string"); + assert_eq!(err, "Error parsing 'abc' as usize\ncaused by\nExternal error: invalid digit found in string"); env::remove_var(env_key); let config = ConfigOptions::from_env().unwrap(); diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs index 59fa4ca5f1f6..2d0ba8f8d994 100644 --- a/datafusion/sql/src/planner.rs +++ b/datafusion/sql/src/planner.rs @@ -24,10 +24,10 @@ use arrow_schema::*; use datafusion_common::{ field_not_found, internal_err, plan_datafusion_err, DFSchemaRef, SchemaError, }; +use sqlparser::ast::TimezoneInfo; use sqlparser::ast::{ArrayElemTypeDef, ExactNumberInfo}; use sqlparser::ast::{ColumnDef as SQLColumnDef, ColumnOption}; use sqlparser::ast::{DataType as SQLDataType, Ident, ObjectName, TableAlias}; -use sqlparser::ast::{TimezoneInfo, Value}; use datafusion_common::TableReference; use datafusion_common::{ @@ -38,7 +38,7 @@ use datafusion_expr::logical_plan::{LogicalPlan, LogicalPlanBuilder}; use datafusion_expr::utils::find_column_exprs; use datafusion_expr::{col, Expr}; -use crate::utils::{make_decimal_type, value_to_string}; +use crate::utils::make_decimal_type; pub use datafusion_expr::planner::ContextProvider; /// SQL parser options @@ -56,7 +56,7 @@ impl Default for ParserOptions { parse_float_as_decimal: false, enable_ident_normalization: true, support_varchar_with_length: true, - enable_options_value_normalization: true, + enable_options_value_normalization: false, } } } @@ -87,32 +87,6 @@ impl IdentNormalizer { } } -/// Value Normalizer -#[derive(Debug)] -pub struct ValueNormalizer { - normalize: bool, -} - -impl Default for ValueNormalizer { - fn default() -> Self { - Self { normalize: true } - } -} - -impl ValueNormalizer { - pub fn new(normalize: bool) -> Self { - Self { normalize } - } - - pub fn normalize(&self, value: Value) -> Option { - match (value_to_string(&value), self.normalize) { - (Some(s), true) => Some(s.to_ascii_lowercase()), - (Some(s), false) => Some(s), - (None, _) => None, - } - } -} - /// Struct to store the states used by the Planner. The Planner will leverage the states to resolve /// CTEs, Views, subqueries and PREPARE statements. The states include /// Common Table Expression (CTE) provided with WITH clause and @@ -254,7 +228,6 @@ pub struct SqlToRel<'a, S: ContextProvider> { pub(crate) context_provider: &'a S, pub(crate) options: ParserOptions, pub(crate) ident_normalizer: IdentNormalizer, - pub(crate) value_normalizer: ValueNormalizer, } impl<'a, S: ContextProvider> SqlToRel<'a, S> { @@ -266,13 +239,11 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { /// Create a new query planner pub fn new_with_options(context_provider: &'a S, options: ParserOptions) -> Self { let ident_normalize = options.enable_ident_normalization; - let options_value_normalize = options.enable_options_value_normalization; SqlToRel { context_provider, options, ident_normalizer: IdentNormalizer::new(ident_normalize), - value_normalizer: ValueNormalizer::new(options_value_normalize), } } diff --git a/datafusion/sql/src/statement.rs b/datafusion/sql/src/statement.rs index 38695f98b5fe..f750afbc4a53 100644 --- a/datafusion/sql/src/statement.rs +++ b/datafusion/sql/src/statement.rs @@ -1386,8 +1386,7 @@ impl SqlToRel<'_, S> { return plan_err!("Option {key} is specified multiple times"); } - let Some(value_string) = self.value_normalizer.normalize(value.clone()) - else { + let Some(value_string) = crate::utils::value_to_string(&value) else { return plan_err!("Unsupported Value {}", value); }; diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs index 8c2d8ebad43f..9d52a2cc7b2a 100644 --- a/datafusion/sql/tests/sql_integration.rs +++ b/datafusion/sql/tests/sql_integration.rs @@ -29,11 +29,10 @@ use datafusion_common::{ }; use datafusion_expr::{ col, - dml::CopyTo, logical_plan::{LogicalPlan, Prepare}, test::function_stub::sum_udaf, - ColumnarValue, CreateExternalTable, CreateIndex, DdlStatement, ScalarUDF, - ScalarUDFImpl, Signature, Statement, Volatility, + ColumnarValue, CreateIndex, DdlStatement, ScalarUDF, ScalarUDFImpl, Signature, + Statement, Volatility, }; use datafusion_functions::{string, unicode}; use datafusion_sql::{ @@ -161,70 +160,6 @@ fn parse_ident_normalization() { } } -#[test] -fn test_parse_options_value_normalization() { - let test_data = [ - ( - "CREATE EXTERNAL TABLE test OPTIONS ('location' 'LoCaTiOn') STORED AS PARQUET LOCATION 'fake_location'", - "CreateExternalTable: Bare { table: \"test\" }", - HashMap::from([("format.location", "LoCaTiOn")]), - false, - ), - ( - "CREATE EXTERNAL TABLE test OPTIONS ('location' 'LoCaTiOn') STORED AS PARQUET LOCATION 'fake_location'", - "CreateExternalTable: Bare { table: \"test\" }", - HashMap::from([("format.location", "location")]), - true, - ), - ( - "COPY test TO 'fake_location' STORED AS PARQUET OPTIONS ('location' 'LoCaTiOn')", - "CopyTo: format=csv output_url=fake_location options: (format.location LoCaTiOn)\n TableScan: test", - HashMap::from([("format.location", "LoCaTiOn")]), - false, - ), - ( - "COPY test TO 'fake_location' STORED AS PARQUET OPTIONS ('location' 'LoCaTiOn')", - "CopyTo: format=csv output_url=fake_location options: (format.location location)\n TableScan: test", - HashMap::from([("format.location", "location")]), - true, - ), - ]; - - for (sql, expected_plan, expected_options, enable_options_value_normalization) in - test_data - { - let plan = logical_plan_with_options( - sql, - ParserOptions { - parse_float_as_decimal: false, - enable_ident_normalization: false, - support_varchar_with_length: false, - enable_options_value_normalization, - }, - ); - if let Ok(plan) = plan { - assert_eq!(expected_plan, format!("{plan}")); - - match plan { - LogicalPlan::Ddl(DdlStatement::CreateExternalTable( - CreateExternalTable { options, .. }, - )) - | LogicalPlan::Copy(CopyTo { options, .. }) => { - expected_options.iter().for_each(|(k, v)| { - assert_eq!(Some(&v.to_string()), options.get(*k)); - }); - } - _ => panic!( - "Expected Ddl(CreateExternalTable) or Copy(CopyTo) but got {:?}", - plan - ), - } - } else { - assert_eq!(expected_plan, plan.unwrap_err().strip_backtrace()); - } - } -} - #[test] fn select_no_relation() { quick_test( diff --git a/datafusion/sqllogictest/test_files/create_external_table.slt b/datafusion/sqllogictest/test_files/create_external_table.slt index ed001cf9f84c..6a63ea1cd3e4 100644 --- a/datafusion/sqllogictest/test_files/create_external_table.slt +++ b/datafusion/sqllogictest/test_files/create_external_table.slt @@ -226,6 +226,20 @@ OPTIONS ( has_header false, compression gzip); +# Verify that some options are case insensitive +statement ok +CREATE EXTERNAL TABLE IF NOT EXISTS region ( + r_regionkey BIGINT, + r_name VARCHAR, + r_comment VARCHAR, + r_rev VARCHAR, +) STORED AS CSV LOCATION 'test_files/tpch/data/region.tbl' +OPTIONS ( + format.delimiter '|', + has_header FALSE, + compression GZIP); + + # Create an external parquet table and infer schema to order by # query should succeed diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index 4d51a61c8a52..1f6b5f9852ec 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -258,7 +258,7 @@ datafusion.optimizer.skip_failed_rules false datafusion.optimizer.top_down_join_key_reordering true datafusion.sql_parser.dialect generic datafusion.sql_parser.enable_ident_normalization true -datafusion.sql_parser.enable_options_value_normalization true +datafusion.sql_parser.enable_options_value_normalization false datafusion.sql_parser.parse_float_as_decimal false datafusion.sql_parser.support_varchar_with_length true @@ -351,7 +351,7 @@ datafusion.optimizer.skip_failed_rules false When set to true, the logical plan datafusion.optimizer.top_down_join_key_reordering true When set to true, the physical plan optimizer will run a top down process to reorder the join keys datafusion.sql_parser.dialect generic Configure the SQL dialect used by DataFusion's parser; supported values include: Generic, MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, and Ansi. datafusion.sql_parser.enable_ident_normalization true When set to true, SQL parser will normalize ident (convert ident to lowercase when not quoted) -datafusion.sql_parser.enable_options_value_normalization true When set to true, SQL parser will normalize options value (convert value to lowercase) +datafusion.sql_parser.enable_options_value_normalization false When set to true, SQL parser will normalize options value (convert value to lowercase). Note that this option is ignored and will be removed in the future. All case-insensitive values are normalized automatically. datafusion.sql_parser.parse_float_as_decimal false When set to true, SQL parser will parse float as decimal type datafusion.sql_parser.support_varchar_with_length true If true, permit lengths for `VARCHAR` such as `VARCHAR(20)`, but ignore the length. If false, error if a `VARCHAR` with a length is specified. The Arrow type system does not have a notion of maximum string length and thus DataFusion can not enforce such limits. diff --git a/datafusion/sqllogictest/test_files/set_variable.slt b/datafusion/sqllogictest/test_files/set_variable.slt index 6f19c9f4d42f..bb4ac920d032 100644 --- a/datafusion/sqllogictest/test_files/set_variable.slt +++ b/datafusion/sqllogictest/test_files/set_variable.slt @@ -93,10 +93,10 @@ datafusion.execution.coalesce_batches false statement ok set datafusion.catalog.information_schema = true -statement error DataFusion error: Error parsing 1 as bool +statement error DataFusion error: Error parsing '1' as bool SET datafusion.execution.coalesce_batches to 1 -statement error DataFusion error: Error parsing abc as bool +statement error DataFusion error: Error parsing 'abc' as bool SET datafusion.execution.coalesce_batches to abc # set u64 variable @@ -132,10 +132,10 @@ datafusion.execution.batch_size 2 statement ok set datafusion.catalog.information_schema = true -statement error DataFusion error: Error parsing -1 as usize +statement error DataFusion error: Error parsing '-1' as usize SET datafusion.execution.batch_size to -1 -statement error DataFusion error: Error parsing abc as usize +statement error DataFusion error: Error parsing 'abc' as usize SET datafusion.execution.batch_size to abc statement error External error: invalid digit found in string diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index 6a49fda668a9..77433c85cb66 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -122,6 +122,6 @@ Environment variables are read during `SessionConfig` initialisation so they mus | datafusion.explain.show_schema | false | When set to true, the explain statement will print schema information | | datafusion.sql_parser.parse_float_as_decimal | false | When set to true, SQL parser will parse float as decimal type | | datafusion.sql_parser.enable_ident_normalization | true | When set to true, SQL parser will normalize ident (convert ident to lowercase when not quoted) | -| datafusion.sql_parser.enable_options_value_normalization | true | When set to true, SQL parser will normalize options value (convert value to lowercase) | +| datafusion.sql_parser.enable_options_value_normalization | false | When set to true, SQL parser will normalize options value (convert value to lowercase). Note that this option is ignored and will be removed in the future. All case-insensitive values are normalized automatically. | | datafusion.sql_parser.dialect | generic | Configure the SQL dialect used by DataFusion's parser; supported values include: Generic, MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, and Ansi. | | datafusion.sql_parser.support_varchar_with_length | true | If true, permit lengths for `VARCHAR` such as `VARCHAR(20)`, but ignore the length. If false, error if a `VARCHAR` with a length is specified. The Arrow type system does not have a notion of maximum string length and thus DataFusion can not enforce such limits. |