-
Notifications
You must be signed in to change notification settings - Fork 1.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Update ListingTable to use StatisticsConverter #11068
Conversation
@@ -1449,8 +1420,14 @@ mod tests { | |||
// column c1 | |||
let c1_stats = &stats.column_statistics[0]; | |||
assert_eq!(c1_stats.null_count, Precision::Exact(1)); | |||
assert_eq!(c1_stats.max_value, Precision::Absent); | |||
assert_eq!(c1_stats.min_value, Precision::Absent); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The test behaviour changes due to Utf8 is not supported in
datafusion/datafusion/core/src/datasource/file_format/parquet.rs
Lines 298 to 376 in ea21b08
fn summarize_min_max( | |
max_values: &mut [Option<MaxAccumulator>], | |
min_values: &mut [Option<MinAccumulator>], | |
fields: &Fields, | |
i: usize, | |
stat: &ParquetStatistics, | |
) { | |
if !stat.has_min_max_set() { | |
max_values[i] = None; | |
min_values[i] = None; | |
return; | |
} | |
match stat { | |
ParquetStatistics::Boolean(s) if DataType::Boolean == *fields[i].data_type() => { | |
if let Some(max_value) = &mut max_values[i] { | |
max_value | |
.update_batch(&[Arc::new(BooleanArray::from(vec![*s.max()]))]) | |
.unwrap_or_else(|_| max_values[i] = None); | |
} | |
if let Some(min_value) = &mut min_values[i] { | |
min_value | |
.update_batch(&[Arc::new(BooleanArray::from(vec![*s.min()]))]) | |
.unwrap_or_else(|_| min_values[i] = None); | |
} | |
} | |
ParquetStatistics::Int32(s) if DataType::Int32 == *fields[i].data_type() => { | |
if let Some(max_value) = &mut max_values[i] { | |
max_value | |
.update_batch(&[Arc::new(Int32Array::from_value(*s.max(), 1))]) | |
.unwrap_or_else(|_| max_values[i] = None); | |
} | |
if let Some(min_value) = &mut min_values[i] { | |
min_value | |
.update_batch(&[Arc::new(Int32Array::from_value(*s.min(), 1))]) | |
.unwrap_or_else(|_| min_values[i] = None); | |
} | |
} | |
ParquetStatistics::Int64(s) if DataType::Int64 == *fields[i].data_type() => { | |
if let Some(max_value) = &mut max_values[i] { | |
max_value | |
.update_batch(&[Arc::new(Int64Array::from_value(*s.max(), 1))]) | |
.unwrap_or_else(|_| max_values[i] = None); | |
} | |
if let Some(min_value) = &mut min_values[i] { | |
min_value | |
.update_batch(&[Arc::new(Int64Array::from_value(*s.min(), 1))]) | |
.unwrap_or_else(|_| min_values[i] = None); | |
} | |
} | |
ParquetStatistics::Float(s) if DataType::Float32 == *fields[i].data_type() => { | |
if let Some(max_value) = &mut max_values[i] { | |
max_value | |
.update_batch(&[Arc::new(Float32Array::from(vec![*s.max()]))]) | |
.unwrap_or_else(|_| max_values[i] = None); | |
} | |
if let Some(min_value) = &mut min_values[i] { | |
min_value | |
.update_batch(&[Arc::new(Float32Array::from(vec![*s.min()]))]) | |
.unwrap_or_else(|_| min_values[i] = None); | |
} | |
} | |
ParquetStatistics::Double(s) if DataType::Float64 == *fields[i].data_type() => { | |
if let Some(max_value) = &mut max_values[i] { | |
max_value | |
.update_batch(&[Arc::new(Float64Array::from(vec![*s.max()]))]) | |
.unwrap_or_else(|_| max_values[i] = None); | |
} | |
if let Some(min_value) = &mut min_values[i] { | |
min_value | |
.update_batch(&[Arc::new(Float64Array::from(vec![*s.min()]))]) | |
.unwrap_or_else(|_| min_values[i] = None); | |
} | |
} | |
_ => { | |
max_values[i] = None; | |
min_values[i] = None; | |
} | |
} | |
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is an improvement!
datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
Outdated
Show resolved
Hide resolved
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thank you @xinlifoobar -- this looks like a great improvement. I really like how you structured the code
&file_schema, | ||
file_metadata.schema_descr(), | ||
) { | ||
Ok(stats_converter) => { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this is a very nice formulation
* Update ListingTable to use StatisticsConverter * complete support for all types parquet * Fix misc * Fix misc * fix test * fix misc * fix none
* Update ListingTable to use StatisticsConverter * complete support for all types parquet * Fix misc * Fix misc * fix test * fix misc * fix none
* Update ListingTable to use StatisticsConverter * complete support for all types parquet * Fix misc * Fix misc * fix test * fix misc * fix none
Which issue does this PR close?
Closes #10923
Closes #10924
Rationale for this change
What changes are included in this PR?
Are these changes tested?
Are there any user-facing changes?