From cfa7154a695cb1a6b3feb348d7e7958c5c94ae8a Mon Sep 17 00:00:00 2001 From: Samuel Colvin Date: Thu, 16 May 2024 20:33:50 -0400 Subject: [PATCH] fix `null_count` on `compute_record_batch_statistics` to report null counts across partitions (#10468) * fix null_count on compute_record_batch_statistics * fmt --------- Co-authored-by: Andrew Lamb --- datafusion/physical-plan/src/common.rs | 44 ++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 3 deletions(-) diff --git a/datafusion/physical-plan/src/common.rs b/datafusion/physical-plan/src/common.rs index 9e2216ae0a63..c61e9a05bfa6 100644 --- a/datafusion/physical-plan/src/common.rs +++ b/datafusion/physical-plan/src/common.rs @@ -153,16 +153,23 @@ pub fn compute_record_batch_statistics( }) .sum(); - let mut column_statistics = vec![ColumnStatistics::new_unknown(); projection.len()]; + let mut null_counts = vec![0; projection.len()]; for partition in batches.iter() { for batch in partition { for (stat_index, col_index) in projection.iter().enumerate() { - column_statistics[stat_index].null_count = - Precision::Exact(batch.column(*col_index).null_count()); + null_counts[stat_index] += batch.column(*col_index).null_count(); } } } + let column_statistics = null_counts + .into_iter() + .map(|null_count| { + let mut s = ColumnStatistics::new_unknown(); + s.null_count = Precision::Exact(null_count); + s + }) + .collect(); Statistics { num_rows: Precision::Exact(nb_rows), @@ -687,4 +694,35 @@ mod tests { assert_eq!(actual, expected); Ok(()) } + + #[test] + fn test_compute_record_batch_statistics_null() -> Result<()> { + let schema = + Arc::new(Schema::new(vec![Field::new("u64", DataType::UInt64, true)])); + let batch1 = RecordBatch::try_new( + Arc::clone(&schema), + vec![Arc::new(UInt64Array::from(vec![Some(1), None, None]))], + )?; + let batch2 = RecordBatch::try_new( + Arc::clone(&schema), + vec![Arc::new(UInt64Array::from(vec![Some(1), Some(2), None]))], + )?; + let byte_size = batch1.get_array_memory_size() + batch2.get_array_memory_size(); + let actual = + compute_record_batch_statistics(&[vec![batch1], vec![batch2]], &schema, None); + + let expected = Statistics { + num_rows: Precision::Exact(6), + total_byte_size: Precision::Exact(byte_size), + column_statistics: vec![ColumnStatistics { + distinct_count: Precision::Absent, + max_value: Precision::Absent, + min_value: Precision::Absent, + null_count: Precision::Exact(3), + }], + }; + + assert_eq!(actual, expected); + Ok(()) + } }