From efcb93d54b388b18fc177692ccb455cd8c998bc6 Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Thu, 23 May 2024 13:48:40 -0400
Subject: [PATCH 1/3] Add numeric limits tests for statistics reading

---
 .../core/tests/parquet/arrow_statistics.rs    | 164 +++++++++++++++++-
 datafusion/core/tests/parquet/mod.rs          |  39 +++++
 2 files changed, 197 insertions(+), 6 deletions(-)

diff --git a/datafusion/core/tests/parquet/arrow_statistics.rs b/datafusion/core/tests/parquet/arrow_statistics.rs
index db687a3777a4..710066703022 100644
--- a/datafusion/core/tests/parquet/arrow_statistics.rs
+++ b/datafusion/core/tests/parquet/arrow_statistics.rs
@@ -25,8 +25,8 @@ use arrow::compute::kernels::cast_utils::Parser;
 use arrow::datatypes::{Date32Type, Date64Type};
 use arrow_array::{
     make_array, Array, ArrayRef, BooleanArray, Date32Array, Date64Array, Decimal128Array,
-    FixedSizeBinaryArray, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array,
-    RecordBatch, StringArray, UInt64Array,
+    FixedSizeBinaryArray, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array,
+    Int8Array, RecordBatch, StringArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
 };
 use arrow_schema::{DataType, Field, Schema};
 use datafusion::datasource::physical_plan::parquet::{
@@ -189,7 +189,10 @@ impl Test {
         .extract(reader.metadata())
         .unwrap();
 
-        assert_eq!(&min, &expected_min, "Mismatch with expected minimums");
+        assert_eq!(
+            &min, &expected_min,
+            "{column_name}: Mismatch with expected minimums"
+        );
 
         let max = StatisticsConverter::try_new(
             column_name,
@@ -199,7 +202,10 @@ impl Test {
         .unwrap()
         .extract(reader.metadata())
         .unwrap();
-        assert_eq!(&max, &expected_max, "Mismatch with expected maximum");
+        assert_eq!(
+            &max, &expected_max,
+            "{column_name}: Mismatch with expected maximum"
+        );
 
         let null_counts = StatisticsConverter::try_new(
             column_name,
@@ -212,13 +218,13 @@ impl Test {
         .unwrap();
         let expected_null_counts = Arc::new(expected_null_counts) as ArrayRef;
         assert_eq!(
             &null_counts, &expected_null_counts,
-            "Mismatch with expected null 
counts"
+            "{column_name}: Mismatch with expected null counts"
         );
 
         let row_counts = StatisticsConverter::row_counts(reader.metadata()).unwrap();
         assert_eq!(
             row_counts, expected_row_counts,
-            "Mismatch with expected row counts"
+            "{column_name}: Mismatch with expected row counts"
         );
     }
 
@@ -802,6 +808,152 @@ async fn test_uint32_range() {
         .run();
 }
 
+#[tokio::test]
+async fn test_numeric_limits_unsigned() {
+    // file has 7 rows, 2 row groups: one with 5 rows, one with 2 rows.
+    let reader = TestReader {
+        scenario: Scenario::NumericLimits,
+        row_per_group: 5,
+    };
+
+    Test {
+        reader: reader.build().await,
+        expected_min: Arc::new(UInt8Array::from(vec![u8::MIN, 100])),
+        expected_max: Arc::new(UInt8Array::from(vec![100, u8::MAX])),
+        expected_null_counts: UInt64Array::from(vec![0, 0]),
+        expected_row_counts: UInt64Array::from(vec![5, 2]),
+        column_name: "u8",
+    }
+    .run();
+
+    Test {
+        reader: reader.build().await,
+        expected_min: Arc::new(UInt16Array::from(vec![u16::MIN, 100])),
+        expected_max: Arc::new(UInt16Array::from(vec![100, u16::MAX])),
+        expected_null_counts: UInt64Array::from(vec![0, 0]),
+        expected_row_counts: UInt64Array::from(vec![5, 2]),
+        column_name: "u16",
+    }
+    .run();
+
+    Test {
+        reader: reader.build().await,
+        expected_min: Arc::new(UInt32Array::from(vec![u32::MIN, 100])),
+        expected_max: Arc::new(UInt32Array::from(vec![100, u32::MAX])),
+        expected_null_counts: UInt64Array::from(vec![0, 0]),
+        expected_row_counts: UInt64Array::from(vec![5, 2]),
+        column_name: "u32",
+    }
+    .run();
+
+    Test {
+        reader: reader.build().await,
+        expected_min: Arc::new(UInt64Array::from(vec![u64::MIN, 100])),
+        expected_max: Arc::new(UInt64Array::from(vec![100, u64::MAX])),
+        expected_null_counts: UInt64Array::from(vec![0, 0]),
+        expected_row_counts: UInt64Array::from(vec![5, 2]),
+        column_name: "u64",
+    }
+    .run();
+}
+#[tokio::test]
+async fn test_numeric_limits_signed() {
+    // file has 7 rows, 2 row groups: one with 5 rows, one with 2 rows.
+ let reader = TestReader { + scenario: Scenario::NumericLimits, + row_per_group: 5, + }; + + Test { + reader: reader.build().await, + expected_min: Arc::new(Int8Array::from(vec![i8::MIN, -100])), + expected_max: Arc::new(Int8Array::from(vec![100, i8::MAX])), + expected_null_counts: UInt64Array::from(vec![0, 0]), + expected_row_counts: UInt64Array::from(vec![5, 2]), + column_name: "i8", + } + .run(); + + Test { + reader: reader.build().await, + expected_min: Arc::new(Int16Array::from(vec![i16::MIN, -100])), + expected_max: Arc::new(Int16Array::from(vec![100, i16::MAX])), + expected_null_counts: UInt64Array::from(vec![0, 0]), + expected_row_counts: UInt64Array::from(vec![5, 2]), + column_name: "i16", + } + .run(); + + Test { + reader: reader.build().await, + expected_min: Arc::new(Int32Array::from(vec![i32::MIN, -100])), + expected_max: Arc::new(Int32Array::from(vec![100, i32::MAX])), + expected_null_counts: UInt64Array::from(vec![0, 0]), + expected_row_counts: UInt64Array::from(vec![5, 2]), + column_name: "i32", + } + .run(); + + Test { + reader: reader.build().await, + expected_min: Arc::new(Int64Array::from(vec![i64::MIN, -100])), + expected_max: Arc::new(Int64Array::from(vec![100, i64::MAX])), + expected_null_counts: UInt64Array::from(vec![0, 0]), + expected_row_counts: UInt64Array::from(vec![5, 2]), + column_name: "i64", + } + .run(); +} + +#[tokio::test] +async fn test_numeric_limits_float() { + // file has 7 rows, 2 row groups: one with 5 rows, one with 2 rows. 
+ let reader = TestReader { + scenario: Scenario::NumericLimits, + row_per_group: 5, + }; + + Test { + reader: reader.build().await, + expected_min: Arc::new(Float32Array::from(vec![f32::MIN, -100.0])), + expected_max: Arc::new(Float32Array::from(vec![100.0, f32::MAX])), + expected_null_counts: UInt64Array::from(vec![0, 0]), + expected_row_counts: UInt64Array::from(vec![5, 2]), + column_name: "f32", + } + .run(); + + Test { + reader: reader.build().await, + expected_min: Arc::new(Float64Array::from(vec![f64::MIN, -100.0])), + expected_max: Arc::new(Float64Array::from(vec![100.0, f64::MAX])), + expected_null_counts: UInt64Array::from(vec![0, 0]), + expected_row_counts: UInt64Array::from(vec![5, 2]), + column_name: "f64", + } + .run(); + + Test { + reader: reader.build().await, + expected_min: Arc::new(Float32Array::from(vec![-1.0, -100.0])), + expected_max: Arc::new(Float32Array::from(vec![100.0, -100.0])), + expected_null_counts: UInt64Array::from(vec![0, 0]), + expected_row_counts: UInt64Array::from(vec![5, 2]), + column_name: "f32_nan", + } + .run(); + + Test { + reader: reader.build().await, + expected_min: Arc::new(Float64Array::from(vec![-1.0, -100.0])), + expected_max: Arc::new(Float64Array::from(vec![100.0, -100.0])), + expected_null_counts: UInt64Array::from(vec![0, 0]), + expected_row_counts: UInt64Array::from(vec![5, 2]), + column_name: "f64_nan", + } + .run(); +} + #[tokio::test] async fn test_float64() { // This creates a parquet file of 1 column "f" diff --git a/datafusion/core/tests/parquet/mod.rs b/datafusion/core/tests/parquet/mod.rs index c5d0ad60bc10..a0b62d700104 100644 --- a/datafusion/core/tests/parquet/mod.rs +++ b/datafusion/core/tests/parquet/mod.rs @@ -73,6 +73,9 @@ enum Scenario { Int32Range, UInt, UInt32Range, + /// 7 Rows, for each i8, i16, i32, i64, u8, u16, u32, u64, f32, f64 + /// -MIN, -100, -1, 0, 1, 100, MAX + NumericLimits, Float64, Decimal, DecimalBloomFilterInt32, @@ -710,6 +713,39 @@ fn make_int_batches_with_null( .unwrap() } 
+fn make_numeric_limit_batch() -> RecordBatch { + let i8 = Int8Array::from(vec![i8::MIN, 100, -1, 0, 1, -100, i8::MAX]); + let i16 = Int16Array::from(vec![i16::MIN, 100, -1, 0, 1, -100, i16::MAX]); + let i32 = Int32Array::from(vec![i32::MIN, 100, -1, 0, 1, -100, i32::MAX]); + let i64 = Int64Array::from(vec![i64::MIN, 100, -1, 0, 1, -100, i64::MAX]); + let u8 = UInt8Array::from(vec![u8::MIN, 100, 1, 0, 1, 100, u8::MAX]); + let u16 = UInt16Array::from(vec![u16::MIN, 100, 1, 0, 1, 100, u16::MAX]); + let u32 = UInt32Array::from(vec![u32::MIN, 100, 1, 0, 1, 100, u32::MAX]); + let u64 = UInt64Array::from(vec![u64::MIN, 100, 1, 0, 1, 100, u64::MAX]); + let f32 = Float32Array::from(vec![f32::MIN, 100.0, -1.0, 0.0, 1.0, -100.0, f32::MAX]); + let f64 = Float64Array::from(vec![f64::MIN, 100.0, -1.0, 0.0, 1.0, -100.0, f64::MAX]); + let f32_nan = + Float32Array::from(vec![f32::NAN, 100.0, -1.0, 0.0, 1.0, -100.0, f32::NAN]); + let f64_nan = + Float64Array::from(vec![f64::NAN, 100.0, -1.0, 0.0, 1.0, -100.0, f64::NAN]); + + RecordBatch::try_from_iter(vec![ + ("i8", Arc::new(i8) as _), + ("i16", Arc::new(i16) as _), + ("i32", Arc::new(i32) as _), + ("i64", Arc::new(i64) as _), + ("u8", Arc::new(u8) as _), + ("u16", Arc::new(u16) as _), + ("u32", Arc::new(u32) as _), + ("u64", Arc::new(u64) as _), + ("f32", Arc::new(f32) as _), + ("f64", Arc::new(f64) as _), + ("f32_nan", Arc::new(f32_nan) as _), + ("f64_nan", Arc::new(f64_nan) as _), + ]) + .unwrap() +} + fn create_data_batch(scenario: Scenario) -> Vec { match scenario { Scenario::Boolean => { @@ -768,6 +804,9 @@ fn create_data_batch(scenario: Scenario) -> Vec { Scenario::UInt32Range => { vec![make_uint32_range(0, 10), make_uint32_range(200000, 300000)] } + Scenario::NumericLimits => { + vec![make_numeric_limit_batch()] + } Scenario::Float64 => { vec![ make_f64_batch(vec![-5.0, -4.0, -3.0, -2.0, -1.0]), From eb8b6795119b58cc5a8335a015c9834389d80ce1 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 25 May 2024 05:47:27 -0400 
Subject: [PATCH 2/3] Apply suggestions from code review Co-authored-by: Oleks V --- datafusion/core/tests/parquet/arrow_statistics.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/core/tests/parquet/arrow_statistics.rs b/datafusion/core/tests/parquet/arrow_statistics.rs index 710066703022..8e7ab584fd9e 100644 --- a/datafusion/core/tests/parquet/arrow_statistics.rs +++ b/datafusion/core/tests/parquet/arrow_statistics.rs @@ -218,13 +218,13 @@ impl Test { let expected_null_counts = Arc::new(expected_null_counts) as ArrayRef; assert_eq!( &null_counts, &expected_null_counts, - "{column_name}: Mismatch with expected null counts" + "{column_name}: Mismatch with expected null counts. Actual: {null_counts}. Expected: {expected_null_counts}" ); let row_counts = StatisticsConverter::row_counts(reader.metadata()).unwrap(); assert_eq!( row_counts, expected_row_counts, - "{column_name}: Mismatch with expected row counts" + "{column_name}: Mismatch with expected row counts. Actual: {row_counts}. Expected: {expected_row_counts}" ); } From 652a52758bfad2a5d2b90efb0c9e80f8adaba118 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 25 May 2024 05:49:46 -0400 Subject: [PATCH 3/3] fix --- datafusion/core/tests/parquet/arrow_statistics.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/datafusion/core/tests/parquet/arrow_statistics.rs b/datafusion/core/tests/parquet/arrow_statistics.rs index 8e7ab584fd9e..fdc1bab710dd 100644 --- a/datafusion/core/tests/parquet/arrow_statistics.rs +++ b/datafusion/core/tests/parquet/arrow_statistics.rs @@ -218,13 +218,15 @@ impl Test { let expected_null_counts = Arc::new(expected_null_counts) as ArrayRef; assert_eq!( &null_counts, &expected_null_counts, - "{column_name}: Mismatch with expected null counts. Actual: {null_counts}. Expected: {expected_null_counts}" + "{column_name}: Mismatch with expected null counts. \ + Actual: {null_counts:?}. 
Expected: {expected_null_counts:?}" ); let row_counts = StatisticsConverter::row_counts(reader.metadata()).unwrap(); assert_eq!( row_counts, expected_row_counts, - "{column_name}: Mismatch with expected row counts. Actual: {row_counts}. Expected: {expected_row_counts}" + "{column_name}: Mismatch with expected row counts. \ + Actual: {row_counts:?}. Expected: {expected_row_counts:?}" ); }