@@ -125,22 +125,35 @@ pub fn from_thrift(
125
125
) -> Result < Option < Statistics > > {
126
126
Ok ( match thrift_stats {
127
127
Some ( stats) => {
128
- // Number of nulls recorded, when it is not available, we just mark it as 0.
129
- // TODO this should be `None` if there is no information about NULLS.
130
- // see https://github.com/apache/arrow-rs/pull/6216/files
131
- let null_count = stats. null_count . unwrap_or ( 0 ) ;
132
-
133
- if null_count < 0 {
134
- return Err ( ParquetError :: General ( format ! (
135
- "Statistics null count is negative {}" ,
136
- null_count
137
- ) ) ) ;
138
- }
128
+ // transform null count to u64
129
+ let null_count = stats
130
+ . null_count
131
+ . map ( |null_count| {
132
+ if null_count < 0 {
133
+ return Err ( ParquetError :: General ( format ! (
134
+ "Statistics null count is negative {}" ,
135
+ null_count
136
+ ) ) ) ;
137
+ }
138
+ Ok ( null_count as u64 )
139
+ } )
140
+ . transpose ( ) ?;
139
141
140
- // Generic null count.
141
- let null_count = Some ( null_count as u64 ) ;
142
142
// Generic distinct count (count of distinct values occurring)
143
- let distinct_count = stats. distinct_count . map ( |value| value as u64 ) ;
143
+ let distinct_count = stats
144
+ . distinct_count
145
+ . map ( |distinct_count| {
146
+ if distinct_count < 0 {
147
+ return Err ( ParquetError :: General ( format ! (
148
+ "Statistics distinct count is negative {}" ,
149
+ distinct_count
150
+ ) ) ) ;
151
+ }
152
+
153
+ Ok ( distinct_count as u64 )
154
+ } )
155
+ . transpose ( ) ?;
156
+
144
157
// Whether or not statistics use deprecated min/max fields.
145
158
let old_format = stats. min_value . is_none ( ) && stats. max_value . is_none ( ) ;
146
159
// Generic min value as bytes.
@@ -244,20 +257,21 @@ pub fn from_thrift(
244
257
pub fn to_thrift ( stats : Option < & Statistics > ) -> Option < TStatistics > {
245
258
let stats = stats?;
246
259
247
- // record null counts if greater than zero.
248
- //
249
- // TODO: This should be Some(0) if there are no nulls.
250
- // see https://github.com/apache/arrow-rs/pull/6216/files
260
+ // record null count if it can fit in i64
251
261
let null_count = stats
252
262
. null_count_opt ( )
253
- . map ( |value| value as i64 )
254
- . filter ( |& x| x > 0 ) ;
263
+ . and_then ( |value| i64:: try_from ( value) . ok ( ) ) ;
264
+
265
+ // record distinct count if it can fit in i64
266
+ let distinct_count = stats
267
+ . distinct_count ( )
268
+ . and_then ( |value| i64:: try_from ( value) . ok ( ) ) ;
255
269
256
270
let mut thrift_stats = TStatistics {
257
271
max : None ,
258
272
min : None ,
259
273
null_count,
260
- distinct_count : stats . distinct_count ( ) . map ( |value| value as i64 ) ,
274
+ distinct_count,
261
275
max_value : None ,
262
276
min_value : None ,
263
277
is_max_value_exact : None ,
@@ -404,9 +418,20 @@ impl Statistics {
404
418
/// Returns number of null values for the column, if known.
405
419
/// Note that this includes all nulls when column is part of the complex type.
406
420
///
407
- /// Note this API returns Some(0) even if the null count was not present
408
- /// in the statistics.
409
- /// See <https://github.com/apache/arrow-rs/pull/6216/files>
421
+ /// Note: Versions of this library prior to `53.0.0` returned 0 if the null count was
422
+ /// not available. This method returns `None` in that case.
423
+ ///
424
+ /// Also, versions of this library prior to `53.0.0` did not store the null count in the
425
+ /// statistics if the null count was `0`.
426
+ ///
427
+ /// To preserve the prior behavior and read null counts properly from older files
428
+ /// you should default to zero:
429
+ ///
430
+ /// ```no_run
431
+ /// # use parquet::file::statistics::Statistics;
432
+ /// # let statistics: Statistics = todo!();
433
+ /// let null_count = statistics.null_count_opt().unwrap_or(0);
434
+ /// ```
410
435
pub fn null_count_opt ( & self ) -> Option < u64 > {
411
436
statistics_enum_func ! [ self , null_count_opt]
412
437
}
@@ -1041,4 +1066,91 @@ mod tests {
1041
1066
true ,
1042
1067
) ) ;
1043
1068
}
1069
+
1070
+ #[ test]
1071
+ fn test_count_encoding ( ) {
1072
+ statistics_count_test ( None , None ) ;
1073
+ statistics_count_test ( Some ( 0 ) , Some ( 0 ) ) ;
1074
+ statistics_count_test ( Some ( 100 ) , Some ( 2000 ) ) ;
1075
+ statistics_count_test ( Some ( 1 ) , None ) ;
1076
+ statistics_count_test ( None , Some ( 1 ) ) ;
1077
+ }
1078
+
1079
+ #[ test]
1080
+ fn test_count_encoding_distinct_too_large ( ) {
1081
+ // statistics are stored using i64, so test trying to store larger values
1082
+ let statistics = make_bool_stats ( Some ( u64:: MAX ) , Some ( 100 ) ) ;
1083
+ let thrift_stats = to_thrift ( Some ( & statistics) ) . unwrap ( ) ;
1084
+ assert_eq ! ( thrift_stats. distinct_count, None ) ; // can't store u64 max --> null
1085
+ assert_eq ! ( thrift_stats. null_count, Some ( 100 ) ) ;
1086
+ }
1087
+
1088
+ #[ test]
1089
+ fn test_count_encoding_null_too_large ( ) {
1090
+ // statistics are stored using i64, so test trying to store larger values
1091
+ let statistics = make_bool_stats ( Some ( 100 ) , Some ( u64:: MAX ) ) ;
1092
+ let thrift_stats = to_thrift ( Some ( & statistics) ) . unwrap ( ) ;
1093
+ assert_eq ! ( thrift_stats. distinct_count, Some ( 100 ) ) ;
1094
+ assert_eq ! ( thrift_stats. null_count, None ) ; // can' store u64 max --> null
1095
+ }
1096
+
1097
+ #[ test]
1098
+ fn test_count_decoding_distinct_invalid ( ) {
1099
+ let tstatistics = TStatistics {
1100
+ distinct_count : Some ( -42 ) ,
1101
+ ..Default :: default ( )
1102
+ } ;
1103
+ let err = from_thrift ( Type :: BOOLEAN , Some ( tstatistics) ) . unwrap_err ( ) ;
1104
+ assert_eq ! (
1105
+ err. to_string( ) ,
1106
+ "Parquet error: Statistics distinct count is negative -42"
1107
+ ) ;
1108
+ }
1109
+
1110
+ #[ test]
1111
+ fn test_count_decoding_null_invalid ( ) {
1112
+ let tstatistics = TStatistics {
1113
+ null_count : Some ( -42 ) ,
1114
+ ..Default :: default ( )
1115
+ } ;
1116
+ let err = from_thrift ( Type :: BOOLEAN , Some ( tstatistics) ) . unwrap_err ( ) ;
1117
+ assert_eq ! (
1118
+ err. to_string( ) ,
1119
+ "Parquet error: Statistics null count is negative -42"
1120
+ ) ;
1121
+ }
1122
+
1123
+ /// Writes statistics to thrift and reads them back and ensures:
1124
+ /// - The statistics are the same
1125
+ /// - The statistics written to thrift are the same as the original statistics
1126
+ fn statistics_count_test ( distinct_count : Option < u64 > , null_count : Option < u64 > ) {
1127
+ let statistics = make_bool_stats ( distinct_count, null_count) ;
1128
+
1129
+ let thrift_stats = to_thrift ( Some ( & statistics) ) . unwrap ( ) ;
1130
+ assert_eq ! ( thrift_stats. null_count. map( |c| c as u64 ) , null_count) ;
1131
+ assert_eq ! (
1132
+ thrift_stats. distinct_count. map( |c| c as u64 ) ,
1133
+ distinct_count
1134
+ ) ;
1135
+
1136
+ let round_tripped = from_thrift ( Type :: BOOLEAN , Some ( thrift_stats) )
1137
+ . unwrap ( )
1138
+ . unwrap ( ) ;
1139
+ assert_eq ! ( round_tripped, statistics) ;
1140
+ }
1141
+
1142
+ fn make_bool_stats ( distinct_count : Option < u64 > , null_count : Option < u64 > ) -> Statistics {
1143
+ let min = Some ( true ) ;
1144
+ let max = Some ( false ) ;
1145
+ let is_min_max_deprecated = false ;
1146
+
1147
+ // test is about the counts, so we aren't really testing the min/max values
1148
+ Statistics :: Boolean ( ValueStatistics :: new (
1149
+ min,
1150
+ max,
1151
+ distinct_count,
1152
+ null_count,
1153
+ is_min_max_deprecated,
1154
+ ) )
1155
+ }
1044
1156
}
0 commit comments