Skip to content

Commit

Permalink
fix null_count on compute_record_batch_statistics to report null …
Browse files Browse the repository at this point in the history
…counts across partitions (apache#10468)

* fix null_count on compute_record_batch_statistics

* fmt

---------

Co-authored-by: Andrew Lamb <[email protected]>
  • Loading branch information
samuelcolvin and alamb authored May 17, 2024
1 parent eb3817a commit cfa7154
Showing 1 changed file with 41 additions and 3 deletions.
44 changes: 41 additions & 3 deletions datafusion/physical-plan/src/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -153,16 +153,23 @@ pub fn compute_record_batch_statistics(
})
.sum();

let mut column_statistics = vec![ColumnStatistics::new_unknown(); projection.len()];
let mut null_counts = vec![0; projection.len()];

for partition in batches.iter() {
for batch in partition {
for (stat_index, col_index) in projection.iter().enumerate() {
column_statistics[stat_index].null_count =
Precision::Exact(batch.column(*col_index).null_count());
null_counts[stat_index] += batch.column(*col_index).null_count();
}
}
}
let column_statistics = null_counts
.into_iter()
.map(|null_count| {
let mut s = ColumnStatistics::new_unknown();
s.null_count = Precision::Exact(null_count);
s
})
.collect();

Statistics {
num_rows: Precision::Exact(nb_rows),
Expand Down Expand Up @@ -687,4 +694,35 @@ mod tests {
assert_eq!(actual, expected);
Ok(())
}

/// Checks that `compute_record_batch_statistics` reports a per-column null
/// count summed over every partition: one partition holds two nulls and the
/// other holds one, so three are expected in total.
#[test]
fn test_compute_record_batch_statistics_null() -> Result<()> {
    let field = Field::new("u64", DataType::UInt64, true);
    let schema = Arc::new(Schema::new(vec![field]));

    // First partition: two nulls out of three rows.
    let first = RecordBatch::try_new(
        Arc::clone(&schema),
        vec![Arc::new(UInt64Array::from(vec![Some(1), None, None]))],
    )?;
    // Second partition: one null out of three rows.
    let second = RecordBatch::try_new(
        Arc::clone(&schema),
        vec![Arc::new(UInt64Array::from(vec![Some(1), Some(2), None]))],
    )?;

    // Read the memory sizes before the batches are moved into the partitions.
    let byte_size = first.get_array_memory_size() + second.get_array_memory_size();
    let partitions = [vec![first], vec![second]];

    let expected = Statistics {
        num_rows: Precision::Exact(6),
        total_byte_size: Precision::Exact(byte_size),
        column_statistics: vec![ColumnStatistics {
            distinct_count: Precision::Absent,
            max_value: Precision::Absent,
            min_value: Precision::Absent,
            null_count: Precision::Exact(3),
        }],
    };

    let actual = compute_record_batch_statistics(&partitions, &schema, None);
    assert_eq!(actual, expected);
    Ok(())
}
}

0 comments on commit cfa7154

Please sign in to comment.