From ffeda62fc9d6a182c1bf3b3212e676f74fc196df Mon Sep 17 00:00:00 2001 From: Jeffrey <22608443+Jefffrey@users.noreply.github.com> Date: Wed, 8 Nov 2023 02:20:50 +1100 Subject: [PATCH] Parquet f32/f64 handle signed zeros in statistics (#5048) --- parquet/src/column/writer/encoder.rs | 29 ++++++- parquet/src/column/writer/mod.rs | 120 ++++++++++++++++++++++++++- parquet/src/data_type.rs | 4 +- 3 files changed, 147 insertions(+), 6 deletions(-) diff --git a/parquet/src/column/writer/encoder.rs b/parquet/src/column/writer/encoder.rs index 7bd4db30c3a8..5fd0f9e194d2 100644 --- a/parquet/src/column/writer/encoder.rs +++ b/parquet/src/column/writer/encoder.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::basic::Encoding; +use crate::basic::{Encoding, Type}; use crate::bloom_filter::Sbbf; use crate::column::writer::{ compare_greater, fallback_encoding, has_dictionary_support, is_nan, update_max, update_min, @@ -308,5 +308,30 @@ where max = val; } } - Some((min.clone(), max.clone())) + + // Float/Double statistics have special case for zero. + // + // If computed min is zero, whether negative or positive, + // the spec states that the min should be written as -0.0 + // (negative zero) + // + // For max, it has similar logic but will be written as 0.0 + // (positive zero) + let min = replace_zero(min, -0.0); + let max = replace_zero(max, 0.0); + + Some((min, max)) +} + +#[inline] +fn replace_zero(val: &T, replace: f32) -> T { + match T::PHYSICAL_TYPE { + Type::FLOAT if f32::from_le_bytes(val.as_bytes().try_into().unwrap()) == 0.0 => { + T::try_from_le_slice(&f32::to_le_bytes(replace)).unwrap() + } + Type::DOUBLE if f64::from_le_bytes(val.as_bytes().try_into().unwrap()) == 0.0 => { + T::try_from_le_slice(&f64::to_le_bytes(replace as f64)).unwrap() + } + _ => val.clone(), + } } diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 84bf1911d89c..307804e7dc5c 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -2111,6 +2111,64 @@ mod tests { assert!(matches!(stats, Statistics::Float(_))); } + #[test] + fn test_float_statistics_zero_only() { + let stats = statistics_roundtrip::(&[0.0]); + assert!(stats.has_min_max_set()); + assert!(stats.is_min_max_backwards_compatible()); + if let Statistics::Float(stats) = stats { + assert_eq!(stats.min(), &-0.0); + assert!(stats.min().is_sign_negative()); + assert_eq!(stats.max(), &0.0); + assert!(stats.max().is_sign_positive()); + } else { + panic!("expecting Statistics::Float"); + } + } + + #[test] + fn test_float_statistics_neg_zero_only() { + let stats = statistics_roundtrip::(&[-0.0]); + assert!(stats.has_min_max_set()); + assert!(stats.is_min_max_backwards_compatible()); + if let Statistics::Float(stats) = stats { + assert_eq!(stats.min(), &-0.0); + assert!(stats.min().is_sign_negative()); + assert_eq!(stats.max(), &0.0); + assert!(stats.max().is_sign_positive()); + } else { + panic!("expecting Statistics::Float"); + } + } + + #[test] + fn test_float_statistics_zero_min() { + let stats = statistics_roundtrip::(&[0.0, 1.0, f32::NAN, 2.0]); + assert!(stats.has_min_max_set()); + assert!(stats.is_min_max_backwards_compatible()); + if let Statistics::Float(stats) = stats { + assert_eq!(stats.min(), &-0.0); + assert!(stats.min().is_sign_negative()); + assert_eq!(stats.max(), &2.0); + } else { + panic!("expecting Statistics::Float"); + } + } + + #[test] + fn test_float_statistics_neg_zero_max() { + let stats = statistics_roundtrip::(&[-0.0, -1.0, f32::NAN, -2.0]); + assert!(stats.has_min_max_set()); + assert!(stats.is_min_max_backwards_compatible()); + if let Statistics::Float(stats) = stats { + assert_eq!(stats.min(), &-2.0); + assert_eq!(stats.max(), &0.0); + assert!(stats.max().is_sign_positive()); + } else { + panic!("expecting Statistics::Float"); + } + } + #[test] fn test_double_statistics_nan_middle() { let stats = statistics_roundtrip::(&[1.0, f64::NAN, 2.0]); @@ -2120,7 +2178,7 @@ mod tests { assert_eq!(stats.min(), &1.0); assert_eq!(stats.max(), &2.0); } else { - panic!("expecting Statistics::Float"); + panic!("expecting Statistics::Double"); } } @@ -2133,7 +2191,7 @@ mod tests { assert_eq!(stats.min(), &1.0); assert_eq!(stats.max(), &2.0); } else { - panic!("expecting Statistics::Float"); + panic!("expecting Statistics::Double"); } } @@ -2145,6 +2203,64 @@ mod tests { assert!(stats.is_min_max_backwards_compatible()); } + #[test] + fn test_double_statistics_zero_only() { + let stats = statistics_roundtrip::(&[0.0]); + assert!(stats.has_min_max_set()); + assert!(stats.is_min_max_backwards_compatible()); + if let Statistics::Double(stats) = stats { + assert_eq!(stats.min(), &-0.0); + assert!(stats.min().is_sign_negative()); + assert_eq!(stats.max(), &0.0); + assert!(stats.max().is_sign_positive()); + } else { + panic!("expecting Statistics::Double"); + } + } + + #[test] + fn test_double_statistics_neg_zero_only() { + let stats = statistics_roundtrip::(&[-0.0]); + assert!(stats.has_min_max_set()); + assert!(stats.is_min_max_backwards_compatible()); + if let Statistics::Double(stats) = stats { + assert_eq!(stats.min(), &-0.0); + assert!(stats.min().is_sign_negative()); + assert_eq!(stats.max(), &0.0); + assert!(stats.max().is_sign_positive()); + } else { + panic!("expecting Statistics::Double"); + } + } + + #[test] + fn test_double_statistics_zero_min() { + let stats = statistics_roundtrip::(&[0.0, 1.0, f64::NAN, 2.0]); + assert!(stats.has_min_max_set()); + assert!(stats.is_min_max_backwards_compatible()); + if let Statistics::Double(stats) = stats { + assert_eq!(stats.min(), &-0.0); + assert!(stats.min().is_sign_negative()); + assert_eq!(stats.max(), &2.0); + } else { + panic!("expecting Statistics::Double"); + } + } + + #[test] + fn test_double_statistics_neg_zero_max() { + let stats = statistics_roundtrip::(&[-0.0, -1.0, f64::NAN, -2.0]); + assert!(stats.has_min_max_set()); + assert!(stats.is_min_max_backwards_compatible()); + if let Statistics::Double(stats) = stats { + assert_eq!(stats.min(), &-2.0); + assert_eq!(stats.max(), &0.0); + assert!(stats.max().is_sign_positive()); + } else { + panic!("expecting Statistics::Double"); + } + } + #[test] fn test_compare_greater_byte_array_decimals() { assert!(!compare_greater_byte_array_decimals(&[], &[],),); diff --git a/parquet/src/data_type.rs b/parquet/src/data_type.rs index 7e64478ed940..eaf4389d4350 100644 --- a/parquet/src/data_type.rs +++ b/parquet/src/data_type.rs @@ -632,7 +632,7 @@ pub(crate) mod private { /// Return the value as i64 if possible /// - /// This is essentially the same as `std::convert::TryInto` but can + /// This is essentially the same as `std::convert::TryInto` but can't be /// implemented for `f32` and `f64`, types that would fail orphan rules fn as_i64(&self) -> Result { Err(general_err!("Type cannot be converted to i64")) @@ -640,7 +640,7 @@ pub(crate) mod private { /// Return the value as u64 if possible /// - /// This is essentially the same as `std::convert::TryInto` but can + /// This is essentially the same as `std::convert::TryInto` but can't be /// implemented for `f32` and `f64`, types that would fail orphan rules fn as_u64(&self) -> Result { self.as_i64()