Commit 2d67c4c

etseidl and alamb authored
Add configuration option to StatisticsConverter to control interpretation of missing null counts in Parquet statistics (#6485)
* Write null counts in parquet files when they are present
* revert writing of Some(0)
* fix docstring

Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
1 parent e538289 commit 2d67c4c

File tree: 2 files changed (+96, -5 lines)

  parquet/src/arrow/arrow_reader/statistics.rs (+32, -2)
  parquet/tests/arrow_reader/statistics.rs (+64, -3)
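A minimal end-to-end sketch of the option this commit adds, shown before the diffs. The file path, column name, and data are illustrative assumptions, not part of the commit; only APIs that appear in the diffs below are used:

use std::sync::Arc;
use arrow_array::{Int32Array, RecordBatch};
use arrow_schema::{DataType, Field, Schema};
use parquet::arrow::arrow_reader::statistics::StatisticsConverter;
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
use parquet::arrow::ArrowWriter;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Write a small single-column file ("/tmp/demo.parquet" is an assumed path)
    let schema = Arc::new(Schema::new(vec![Field::new("i", DataType::Int32, true)]));
    let batch = RecordBatch::try_new(
        schema.clone(),
        vec![Arc::new(Int32Array::from(vec![Some(1), None, Some(3)]))],
    )?;
    let mut writer = ArrowWriter::try_new(
        std::fs::File::create("/tmp/demo.parquet")?,
        schema.clone(),
        None,
    )?;
    writer.write(&batch)?;
    writer.close()?;

    // Read back the metadata and extract per-row-group null counts
    let builder =
        ParquetRecordBatchReaderBuilder::try_new(std::fs::File::open("/tmp/demo.parquet")?)?;
    let converter =
        StatisticsConverter::try_new("i", builder.schema(), builder.parquet_schema())?
            // `true` is the default; pass `false` to report missing counts as NULL
            .with_missing_null_counts_as_zero(true);
    let null_counts = converter.row_group_null_counts(builder.metadata().row_groups())?;
    println!("{null_counts:?}"); // one entry per row group
    Ok(())
}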

parquet/src/arrow/arrow_reader/statistics.rs (+32, -2)

@@ -1159,7 +1159,7 @@ where
 ///
 /// # Schemas
 ///
-/// The converter ues the schema of the Parquet file and the Arrow schema to
+/// The converter uses the schema of the Parquet file and the Arrow schema to
 /// convert the underlying statistics value (stored as a parquet value) into the
 /// corresponding Arrow value. For example, Decimals are stored as binary in
 /// parquet files and this structure handles mapping them to the `i128`
@@ -1175,6 +1175,8 @@ pub struct StatisticsConverter<'a> {
     parquet_column_index: Option<usize>,
     /// The field (with data type) of the column in the Arrow schema
     arrow_field: &'a Field,
+    /// treat missing null_counts as 0 nulls
+    missing_null_counts_as_zero: bool,
 }
 
 impl<'a> StatisticsConverter<'a> {
@@ -1191,6 +1193,23 @@ impl<'a> StatisticsConverter<'a> {
         self.arrow_field
     }
 
+    /// Set the statistics converter to treat missing null counts as missing
+    ///
+    /// By default, the converter will treat missing null counts as though
+    /// the null count is known to be `0`.
+    ///
+    /// Note that parquet files written by parquet-rs currently do not store
+    /// null counts even when it is known there are zero nulls, and the reader
+    /// will return 0 for the null counts in that instance. This behavior may
+    /// change in a future release.
+    ///
+    /// Both parquet-java and parquet-cpp store null counts as 0 when there are
+    /// no nulls, and don't write unknown values to the null count field.
+    pub fn with_missing_null_counts_as_zero(mut self, missing_null_counts_as_zero: bool) -> Self {
+        self.missing_null_counts_as_zero = missing_null_counts_as_zero;
+        self
+    }
+
     /// Returns a [`UInt64Array`] with row counts for each row group
     ///
     /// # Return Value
@@ -1284,6 +1303,7 @@ impl<'a> StatisticsConverter<'a> {
         Ok(Self {
             parquet_column_index: parquet_index,
             arrow_field,
+            missing_null_counts_as_zero: true,
         })
     }
 
@@ -1382,7 +1402,15 @@ impl<'a> StatisticsConverter<'a> {
         let null_counts = metadatas
             .into_iter()
             .map(|x| x.column(parquet_index).statistics())
-            .map(|s| s.and_then(|s| s.null_count_opt()));
+            .map(|s| {
+                s.and_then(|s| {
+                    if self.missing_null_counts_as_zero {
+                        Some(s.null_count_opt().unwrap_or(0))
+                    } else {
+                        s.null_count_opt()
+                    }
+                })
+            });
         Ok(UInt64Array::from_iter(null_counts))
     }
 
@@ -1593,3 +1621,5 @@ impl<'a> StatisticsConverter<'a> {
         new_null_array(data_type, num_row_groups)
     }
 }
+
+// See tests in parquet/tests/arrow_reader/statistics.rs
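The core of the change is the closure in `row_group_null_counts` above. Isolated as a standalone sketch (the `interpret` helper is hypothetical, named here only to show the mapping):

// Mirrors the closure in `row_group_null_counts`; `interpret` is not part of the API.
fn interpret(null_count: Option<u64>, missing_as_zero: bool) -> Option<u64> {
    if missing_as_zero {
        Some(null_count.unwrap_or(0)) // missing counts become a known 0
    } else {
        null_count // missing counts stay unknown (NULL in the output array)
    }
}

fn main() {
    assert_eq!(interpret(Some(3), true), Some(3)); // known counts pass through
    assert_eq!(interpret(None, true), Some(0));    // the default behavior
    assert_eq!(interpret(None, false), None);      // after with_missing_null_counts_as_zero(false)
}

Defaulting `missing_null_counts_as_zero` to `true` preserves the reader's previous behavior for files, such as those written by parquet-rs, that omit the null count field even when it is known to be zero.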

parquet/tests/arrow_reader/statistics.rs (+64, -3)

@@ -22,6 +22,7 @@ use std::default::Default;
 use std::fs::File;
 use std::sync::Arc;
 
+use super::make_test_file_rg;
 use super::{struct_array, Scenario};
 use arrow::compute::kernels::cast_utils::Parser;
 use arrow::datatypes::{
@@ -37,16 +38,17 @@ use arrow_array::{
     TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, UInt16Array,
     UInt32Array, UInt64Array, UInt8Array,
 };
-use arrow_schema::{DataType, Field, Schema, TimeUnit};
+use arrow_schema::{DataType, Field, Schema, SchemaRef, TimeUnit};
 use half::f16;
 use parquet::arrow::arrow_reader::statistics::StatisticsConverter;
 use parquet::arrow::arrow_reader::{
     ArrowReaderBuilder, ArrowReaderOptions, ParquetRecordBatchReaderBuilder,
 };
 use parquet::arrow::ArrowWriter;
+use parquet::file::metadata::{ColumnChunkMetaData, RowGroupMetaData};
 use parquet::file::properties::{EnabledStatistics, WriterProperties};
-
-use super::make_test_file_rg;
+use parquet::file::statistics::{Statistics, ValueStatistics};
+use parquet::schema::types::{SchemaDescPtr, SchemaDescriptor};
 
 #[derive(Debug, Default, Clone)]
 struct Int64Case {
@@ -2139,6 +2141,65 @@ async fn test_missing_statistics() {
         .run();
 }
 
+#[test]
+fn missing_null_counts_as_zero() {
+    let min = None;
+    let max = None;
+    let distinct_count = None;
+    let null_count = None; // NB: no null count
+    let is_min_max_deprecated = false;
+    let stats = Statistics::Boolean(ValueStatistics::new(
+        min,
+        max,
+        distinct_count,
+        null_count,
+        is_min_max_deprecated,
+    ));
+    let (arrow_schema, parquet_schema) = bool_arrow_and_parquet_schema();
+
+    let column_chunk = ColumnChunkMetaData::builder(parquet_schema.column(0))
+        .set_statistics(stats)
+        .build()
+        .unwrap();
+    let metadata = RowGroupMetaData::builder(parquet_schema.clone())
+        .set_column_metadata(vec![column_chunk])
+        .build()
+        .unwrap();
+
+    let converter = StatisticsConverter::try_new("b", &arrow_schema, &parquet_schema).unwrap();
+
+    // by default, a missing null count is reported as 0
+    assert_eq!(
+        converter.row_group_null_counts([&metadata]).unwrap(),
+        UInt64Array::from_iter(vec![Some(0)])
+    );
+
+    // with missing_null_counts_as_zero disabled, a missing null count is reported as None
+    let converter = converter.with_missing_null_counts_as_zero(false);
+    assert_eq!(
+        converter.row_group_null_counts([&metadata]).unwrap(),
+        UInt64Array::from_iter(vec![None])
+    );
+}
+
+/// return an Arrow schema and corresponding Parquet SchemaDescriptor for
+/// a schema with a single boolean column "b"
+fn bool_arrow_and_parquet_schema() -> (SchemaRef, SchemaDescPtr) {
+    let arrow_schema = Arc::new(Schema::new(vec![Field::new("b", DataType::Boolean, true)]));
+    use parquet::schema::types::Type as ParquetType;
+    let parquet_schema = ParquetType::group_type_builder("schema")
+        .with_fields(vec![Arc::new(
+            ParquetType::primitive_type_builder("b", parquet::basic::Type::BOOLEAN)
+                .build()
+                .unwrap(),
+        )])
+        .build()
+        .unwrap();
+
+    let parquet_schema = Arc::new(SchemaDescriptor::new(Arc::new(parquet_schema)));
+    (arrow_schema, parquet_schema)
+}
+
 /////// NEGATIVE TESTS ///////
 // column not found
 #[tokio::test]
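For contrast with the converter's default, a self-contained sketch of what the raw statistics accessor reports when no null count was recorded; it mirrors the setup in the new test above:

use parquet::file::statistics::{Statistics, ValueStatistics};

fn main() {
    // Boolean statistics built with no recorded null count, as in the test above
    let stats = Statistics::Boolean(ValueStatistics::new(None, None, None, None, false));
    // The raw accessor reports "unknown"; it is the converter that turns this
    // into 0 when missing_null_counts_as_zero is enabled
    assert_eq!(stats.null_count_opt(), None);
}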
