Skip to content

Commit

Permalink
Fix incorrect statistics read for binary columns in parquet (#10645)
Browse files Browse the repository at this point in the history
  • Loading branch information
xinlifoobar authored May 25, 2024
1 parent 1211335 commit d10b1a4
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 17 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,9 @@ macro_rules! get_statistic {
*scale,
))
}
Some(DataType::Binary) => {
Some(ScalarValue::Binary(Some(s.$bytes_func().to_vec())))
}
_ => {
let s = std::str::from_utf8(s.$bytes_func())
.map(|s| s.to_string())
Expand Down Expand Up @@ -644,10 +647,6 @@ mod test {
}

#[test]
#[should_panic(
expected = "Inconsistent types in ScalarValue::iter_to_array. Expected Utf8, got Binary(NULL)"
)]
// Due to https://github.com/apache/datafusion/issues/8295
fn roundtrip_binary() {
Test {
input: Arc::new(BinaryArray::from_opt_vec(vec![
Expand Down
25 changes: 12 additions & 13 deletions datafusion/core/tests/parquet/arrow_statistics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,9 @@ use std::sync::Arc;
use arrow::compute::kernels::cast_utils::Parser;
use arrow::datatypes::{Date32Type, Date64Type};
use arrow_array::{
make_array, Array, ArrayRef, BooleanArray, Date32Array, Date64Array, Decimal128Array,
FixedSizeBinaryArray, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array,
RecordBatch, StringArray, UInt64Array,
make_array, Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Date64Array,
Decimal128Array, FixedSizeBinaryArray, Float64Array, Int16Array, Int32Array,
Int64Array, Int8Array, RecordBatch, StringArray, UInt64Array,
};
use arrow_schema::{DataType, Field, Schema};
use datafusion::datasource::physical_plan::parquet::{
Expand Down Expand Up @@ -905,18 +905,17 @@ async fn test_byte() {
.run();

// column "service_binary"

let expected_service_binary_min_values: Vec<&[u8]> =
vec![b"frontend five", b"backend one", b"backend eight"];

let expected_service_binary_max_values: Vec<&[u8]> =
vec![b"frontend two", b"frontend six", b"backend six"];

Test {
reader: reader.build().await,
expected_min: Arc::new(StringArray::from(vec![
"frontend five",
"backend one",
"backend eight",
])), // Should be BinaryArray
expected_max: Arc::new(StringArray::from(vec![
"frontend two",
"frontend six",
"backend six",
])), // Should be BinaryArray
expected_min: Arc::new(BinaryArray::from(expected_service_binary_min_values)), // Should be BinaryArray
expected_max: Arc::new(BinaryArray::from(expected_service_binary_max_values)), // Should be BinaryArray
expected_null_counts: UInt64Array::from(vec![0, 0, 0]),
expected_row_counts: UInt64Array::from(vec![5, 5, 5]),
column_name: "service_binary",
Expand Down

0 comments on commit d10b1a4

Please sign in to comment.