diff --git a/datafusion/expr/src/built_in_function.rs b/datafusion/expr/src/built_in_function.rs index a42963495617..58d84545dbb6 100644 --- a/datafusion/expr/src/built_in_function.rs +++ b/datafusion/expr/src/built_in_function.rs @@ -18,7 +18,7 @@ //! Built-in functions module contains all the built-in functions definitions. use crate::nullif::SUPPORTED_NULLIF_TYPES; -use crate::type_coercion::functions::data_types; +use crate::type_coercion::functions::{data_types, TIMEZONE_PLACEHOLDER}; use crate::{ conditional_expressions, struct_expressions, utils, Signature, TypeSignature, Volatility, @@ -1020,13 +1020,25 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::DateTrunc => Signature::one_of( vec![ Exact(vec![Utf8, Timestamp(Nanosecond, None)]), - Exact(vec![Utf8, Timestamp(Nanosecond, Some("+TZ".into()))]), + Exact(vec![ + Utf8, + Timestamp(Nanosecond, Some(TIMEZONE_PLACEHOLDER.into())), + ]), Exact(vec![Utf8, Timestamp(Microsecond, None)]), - Exact(vec![Utf8, Timestamp(Microsecond, Some("+TZ".into()))]), + Exact(vec![ + Utf8, + Timestamp(Microsecond, Some(TIMEZONE_PLACEHOLDER.into())), + ]), Exact(vec![Utf8, Timestamp(Millisecond, None)]), - Exact(vec![Utf8, Timestamp(Millisecond, Some("+TZ".into()))]), + Exact(vec![ + Utf8, + Timestamp(Millisecond, Some(TIMEZONE_PLACEHOLDER.into())), + ]), Exact(vec![Utf8, Timestamp(Second, None)]), - Exact(vec![Utf8, Timestamp(Second, Some("+TZ".into()))]), + Exact(vec![ + Utf8, + Timestamp(Second, Some(TIMEZONE_PLACEHOLDER.into())), + ]), ], self.volatility(), ), @@ -1040,8 +1052,11 @@ impl BuiltinScalarFunction { ]), Exact(vec![ Interval(MonthDayNano), - Timestamp(array_type.clone(), Some("+TZ".into())), - Timestamp(Nanosecond, Some("+TZ".into())), + Timestamp( + array_type.clone(), + Some(TIMEZONE_PLACEHOLDER.into()), + ), + Timestamp(Nanosecond, Some(TIMEZONE_PLACEHOLDER.into())), ]), Exact(vec![ Interval(DayTime), @@ -1050,8 +1065,11 @@ impl BuiltinScalarFunction { ]), Exact(vec![ Interval(DayTime), - 
Timestamp(array_type.clone(), Some("+TZ".into())), - Timestamp(Nanosecond, Some("+TZ".into())), + Timestamp( + array_type.clone(), + Some(TIMEZONE_PLACEHOLDER.into()), + ), + Timestamp(Nanosecond, Some(TIMEZONE_PLACEHOLDER.into())), ]), Exact(vec![ Interval(MonthDayNano), @@ -1059,7 +1077,10 @@ impl BuiltinScalarFunction { ]), Exact(vec![ Interval(MonthDayNano), - Timestamp(array_type.clone(), Some("+TZ".into())), + Timestamp( + array_type.clone(), + Some(TIMEZONE_PLACEHOLDER.into()), + ), ]), Exact(vec![ Interval(DayTime), @@ -1067,7 +1088,7 @@ impl BuiltinScalarFunction { ]), Exact(vec![ Interval(DayTime), - Timestamp(array_type, Some("+TZ".into())), + Timestamp(array_type, Some(TIMEZONE_PLACEHOLDER.into())), ]), ] }; @@ -1085,13 +1106,25 @@ impl BuiltinScalarFunction { Exact(vec![Utf8, Date32]), Exact(vec![Utf8, Date64]), Exact(vec![Utf8, Timestamp(Second, None)]), - Exact(vec![Utf8, Timestamp(Second, Some("+TZ".into()))]), + Exact(vec![ + Utf8, + Timestamp(Second, Some(TIMEZONE_PLACEHOLDER.into())), + ]), Exact(vec![Utf8, Timestamp(Microsecond, None)]), - Exact(vec![Utf8, Timestamp(Microsecond, Some("+TZ".into()))]), + Exact(vec![ + Utf8, + Timestamp(Microsecond, Some(TIMEZONE_PLACEHOLDER.into())), + ]), Exact(vec![Utf8, Timestamp(Millisecond, None)]), - Exact(vec![Utf8, Timestamp(Millisecond, Some("+TZ".into()))]), + Exact(vec![ + Utf8, + Timestamp(Millisecond, Some(TIMEZONE_PLACEHOLDER.into())), + ]), Exact(vec![Utf8, Timestamp(Nanosecond, None)]), - Exact(vec![Utf8, Timestamp(Nanosecond, Some("+TZ".into()))]), + Exact(vec![ + Utf8, + Timestamp(Nanosecond, Some(TIMEZONE_PLACEHOLDER.into())), + ]), ], self.volatility(), ), diff --git a/datafusion/expr/src/type_coercion/functions.rs b/datafusion/expr/src/type_coercion/functions.rs index 883ca2b39362..5452c8a5c8f5 100644 --- a/datafusion/expr/src/type_coercion/functions.rs +++ b/datafusion/expr/src/type_coercion/functions.rs @@ -22,6 +22,16 @@ use arrow::{ }; use datafusion_common::{plan_err, DataFusionError, 
Result}; +/// Constant that is used as a placeholder for any valid timezone. +/// This is used where a function can accept a timestamp type with any +/// valid timezone; it exists to avoid the need to enumerate all possible +/// timezones. +/// +/// Type coercion always ensures that functions will be executed using +/// timestamp arrays that have a valid time zone. Functions must never +/// return results with this timezone. +pub(crate) const TIMEZONE_PLACEHOLDER: &str = "+TZ"; + /// Performs type coercion for function arguments. /// /// Returns the data types to which each argument must be coerced to @@ -121,7 +131,7 @@ fn maybe_data_types( } else { // attempt to coerce if let Some(valid_type) = coerced_from(valid_type, current_type) { - new_type.push(valid_type.clone()) + new_type.push(valid_type) } else { // not possible return None; @@ -140,7 +150,7 @@ pub fn can_coerce_from(type_into: &DataType, type_from: &DataType) -> bool { return true; } if let Some(coerced) = coerced_from(type_into, type_from) { - return coerced == type_into; + return coerced == *type_into; } false } @@ -148,15 +158,17 @@ pub fn can_coerce_from(type_into: &DataType, type_from: &DataType) -> bool { fn coerced_from<'a>( type_into: &'a DataType, type_from: &'a DataType, -) -> Option<&'a DataType> { +) -> Option<DataType> { use self::DataType::*; match type_into { // coerced into type_into - Int8 if matches!(type_from, Null | Int8) => Some(type_into), - Int16 if matches!(type_from, Null | Int8 | Int16 | UInt8) => Some(type_into), + Int8 if matches!(type_from, Null | Int8) => Some(type_into.clone()), + Int16 if matches!(type_from, Null | Int8 | Int16 | UInt8) => { + Some(type_into.clone()) + } Int32 if matches!(type_from, Null | Int8 | Int16 | Int32 | UInt8 | UInt16) => { - Some(type_into) + Some(type_into.clone()) } Int64 if matches!( @@ -164,13 +176,15 @@ fn coerced_from<'a>( Null | Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 ) => { - Some(type_into) + Some(type_into.clone()) + } + UInt8 if 
matches!(type_from, Null | UInt8) => Some(type_into.clone()), + UInt16 if matches!(type_from, Null | UInt8 | UInt16) => Some(type_into.clone()), + UInt32 if matches!(type_from, Null | UInt8 | UInt16 | UInt32) => { + Some(type_into.clone()) } - UInt8 if matches!(type_from, Null | UInt8) => Some(type_into), - UInt16 if matches!(type_from, Null | UInt8 | UInt16) => Some(type_into), - UInt32 if matches!(type_from, Null | UInt8 | UInt16 | UInt32) => Some(type_into), UInt64 if matches!(type_from, Null | UInt8 | UInt16 | UInt32 | UInt64) => { - Some(type_into) + Some(type_into.clone()) } Float32 if matches!( @@ -186,7 +200,7 @@ fn coerced_from<'a>( | Float32 ) => { - Some(type_into) + Some(type_into.clone()) } Float64 if matches!( @@ -204,7 +218,7 @@ fn coerced_from<'a>( | Decimal128(_, _) ) => { - Some(type_into) + Some(type_into.clone()) } Timestamp(TimeUnit::Nanosecond, None) if matches!( @@ -212,17 +226,31 @@ fn coerced_from<'a>( Null | Timestamp(_, None) | Date32 | Utf8 | LargeUtf8 ) => { - Some(type_into) + Some(type_into.clone()) } - Interval(_) if matches!(type_from, Utf8 | LargeUtf8) => Some(type_into), - Utf8 | LargeUtf8 => Some(type_into), - Null if can_cast_types(type_from, type_into) => Some(type_into), + Interval(_) if matches!(type_from, Utf8 | LargeUtf8) => Some(type_into.clone()), + Utf8 | LargeUtf8 => Some(type_into.clone()), + Null if can_cast_types(type_from, type_into) => Some(type_into.clone()), - // Coerce to consistent timezones, if the `type_from` timezone exists. - Timestamp(TimeUnit::Nanosecond, Some(_)) - if matches!(type_from, Timestamp(TimeUnit::Nanosecond, Some(_))) => + Timestamp(unit, Some(tz)) if tz.as_ref() == TIMEZONE_PLACEHOLDER => { + match type_from { + Timestamp(_, Some(from_tz)) => { + Some(Timestamp(unit.clone(), Some(from_tz.clone()))) + } + Null | Date32 | Utf8 | LargeUtf8 => { + // In the absence of any other information assume the time zone is "+00" (UTC). 
+ Some(Timestamp(unit.clone(), Some("+00".into()))) + } + _ => None, + } + } + Timestamp(_, Some(_)) + if matches!( + type_from, + Null | Timestamp(_, Some(_)) | Date32 | Utf8 | LargeUtf8 + ) => { - Some(type_from) + Some(type_into.clone()) } // cannot coerce @@ -233,7 +261,7 @@ fn coerced_from<'a>( #[cfg(test)] mod tests { use super::*; - use arrow::datatypes::DataType; + use arrow::datatypes::{DataType, TimeUnit}; #[test] fn test_maybe_data_types() { @@ -265,6 +293,20 @@ mod tests { vec![DataType::Boolean, DataType::UInt16], Some(vec![DataType::Boolean, DataType::UInt32]), ), + // UTF8 -> Timestamp + ( + vec![ + DataType::Timestamp(TimeUnit::Nanosecond, None), + DataType::Timestamp(TimeUnit::Nanosecond, Some("+TZ".into())), + DataType::Timestamp(TimeUnit::Nanosecond, Some("+01".into())), + ], + vec![DataType::Utf8, DataType::Utf8, DataType::Utf8], + Some(vec![ + DataType::Timestamp(TimeUnit::Nanosecond, None), + DataType::Timestamp(TimeUnit::Nanosecond, Some("+00".into())), + DataType::Timestamp(TimeUnit::Nanosecond, Some("+01".into())), + ]), + ), ]; for case in cases { diff --git a/datafusion/sqllogictest/test_files/timestamps.slt b/datafusion/sqllogictest/test_files/timestamps.slt index 88a024e0f9da..bb06c569f081 100644 --- a/datafusion/sqllogictest/test_files/timestamps.slt +++ b/datafusion/sqllogictest/test_files/timestamps.slt @@ -1448,6 +1448,30 @@ SELECT date_bin('1 day', TIMESTAMPTZ '2022-01-01 01:10:00+07', TIMESTAMPTZ '2020 ---- 2021-12-31T00:00:00Z +# postgresql: 2021-12-31 00:00:00+00 +query P +SELECT date_bin('1 day', TIMESTAMPTZ '2022-01-01 01:10:00+07', '2020-01-01') +---- +2021-12-31T00:00:00Z + +# postgresql: 2021-12-31 00:00:00+00 +query P +SELECT date_bin('1 day', TIMESTAMPTZ '2022-01-01 01:10:00+07', '2020-01-01T00:00:00Z') +---- +2021-12-31T00:00:00Z + +# postgresql: 2021-12-31 18:00:00+00 +query P +SELECT date_bin('2 hour', TIMESTAMPTZ '2022-01-01 01:10:00+07', '2020-01-01') +---- +2021-12-31T18:00:00Z + +# postgresql: 2021-12-31 
18:00:00+00 +query P +SELECT date_bin('2 hour', TIMESTAMPTZ '2022-01-01 01:10:00+07', '2020-01-01T00:00:00Z') +---- +2021-12-31T18:00:00Z + # postgresql: 1 query R SELECT date_part('hour', TIMESTAMPTZ '2000-01-01T01:01:01') as part