@@ -22,6 +22,7 @@ use std::default::Default;
22
22
use std:: fs:: File ;
23
23
use std:: sync:: Arc ;
24
24
25
+ use super :: make_test_file_rg;
25
26
use super :: { struct_array, Scenario } ;
26
27
use arrow:: compute:: kernels:: cast_utils:: Parser ;
27
28
use arrow:: datatypes:: {
@@ -37,16 +38,17 @@ use arrow_array::{
37
38
TimestampMillisecondArray , TimestampNanosecondArray , TimestampSecondArray , UInt16Array ,
38
39
UInt32Array , UInt64Array , UInt8Array ,
39
40
} ;
40
- use arrow_schema:: { DataType , Field , Schema , TimeUnit } ;
41
+ use arrow_schema:: { DataType , Field , Schema , SchemaRef , TimeUnit } ;
41
42
use half:: f16;
42
43
use parquet:: arrow:: arrow_reader:: statistics:: StatisticsConverter ;
43
44
use parquet:: arrow:: arrow_reader:: {
44
45
ArrowReaderBuilder , ArrowReaderOptions , ParquetRecordBatchReaderBuilder ,
45
46
} ;
46
47
use parquet:: arrow:: ArrowWriter ;
48
+ use parquet:: file:: metadata:: { ColumnChunkMetaData , RowGroupMetaData } ;
47
49
use parquet:: file:: properties:: { EnabledStatistics , WriterProperties } ;
48
-
49
- use super :: make_test_file_rg ;
50
+ use parquet :: file :: statistics :: { Statistics , ValueStatistics } ;
51
+ use parquet :: schema :: types :: { SchemaDescPtr , SchemaDescriptor } ;
50
52
51
53
#[ derive( Debug , Default , Clone ) ]
52
54
struct Int64Case {
@@ -2139,6 +2141,65 @@ async fn test_missing_statistics() {
2139
2141
. run ( ) ;
2140
2142
}
2141
2143
2144
#[test]
fn missing_null_counts_as_zero() {
    // Boolean column statistics with *no* null count recorded; every other
    // statistics field is also absent, but only the missing null count
    // matters for this test.
    let stats = Statistics::Boolean(ValueStatistics::new(
        None,  // min
        None,  // max
        None,  // distinct_count
        None,  // null_count -- deliberately missing
        false, // is_min_max_deprecated
    ));

    let (arrow_schema, parquet_schema) = bool_arrow_and_parquet_schema();

    // Build a single-column row group whose only chunk carries the
    // statistics above.
    let column_chunk = ColumnChunkMetaData::builder(parquet_schema.column(0))
        .set_statistics(stats)
        .build()
        .unwrap();
    let metadata = RowGroupMetaData::builder(parquet_schema.clone())
        .set_column_metadata(vec![column_chunk])
        .build()
        .unwrap();

    let converter =
        StatisticsConverter::try_new("b", &arrow_schema, &parquet_schema).unwrap();

    // By default a missing null count is reported as zero...
    let null_counts = converter.row_group_null_counts([&metadata]).unwrap();
    assert_eq!(null_counts, UInt64Array::from_iter(vec![Some(0)]));

    // ...but once the missing-null-counts-as-zero flag is disabled, it is
    // reported as None instead.
    let converter = converter.with_missing_null_counts_as_zero(false);
    let null_counts = converter.row_group_null_counts([&metadata]).unwrap();
    assert_eq!(null_counts, UInt64Array::from_iter(vec![None]));
}
2184
+
2185
+ /// return an Arrow schema and corresponding Parquet SchemaDescriptor for
2186
+ /// a schema with a single boolean column "b"
2187
+ fn bool_arrow_and_parquet_schema ( ) -> ( SchemaRef , SchemaDescPtr ) {
2188
+ let arrow_schema = Arc :: new ( Schema :: new ( vec ! [ Field :: new( "b" , DataType :: Boolean , true ) ] ) ) ;
2189
+ use parquet:: schema:: types:: Type as ParquetType ;
2190
+ let parquet_schema = ParquetType :: group_type_builder ( "schema" )
2191
+ . with_fields ( vec ! [ Arc :: new(
2192
+ ParquetType :: primitive_type_builder( "a" , parquet:: basic:: Type :: INT32 )
2193
+ . build( )
2194
+ . unwrap( ) ,
2195
+ ) ] )
2196
+ . build ( )
2197
+ . unwrap ( ) ;
2198
+
2199
+ let parquet_schema = Arc :: new ( SchemaDescriptor :: new ( Arc :: new ( parquet_schema) ) ) ;
2200
+ ( arrow_schema, parquet_schema)
2201
+ }
2202
+
2142
2203
/////// NEGATIVE TESTS ///////
2143
2204
// column not found
2144
2205
#[ tokio:: test]
0 commit comments