From aa8b0f8e4e71a8e2b076656e0a8bf00bfc15ecb8 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 10 Oct 2023 16:14:51 -0700 Subject: [PATCH] Handle empty string correctly in Parquet statistics (#14257) An empty string should be a valid minimum value for a string column, but the current parquet writer considers an empty string to have no value when writing the column chunk statistics. This PR changes all fields in the Statistics struct to be `thrust::optional` to help distinguish between a valid empty string and no value. Authors: - Ed Seidl (https://github.com/etseidl) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/14257 --- .../io/parquet/compact_protocol_reader.cpp | 15 ++-- .../io/parquet/compact_protocol_writer.cpp | 12 +-- cpp/src/io/parquet/parquet.hpp | 18 ++-- cpp/src/io/parquet/predicate_pushdown.cpp | 14 +-- cpp/tests/io/parquet_test.cpp | 85 +++++++++++++++---- 5 files changed, 104 insertions(+), 40 deletions(-) diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index 81d1be64a45..1a345ee0750 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -767,12 +767,15 @@ bool CompactProtocolReader::read(ColumnIndex* c) bool CompactProtocolReader::read(Statistics* s) { - auto op = std::make_tuple(parquet_field_binary(1, s->max), - parquet_field_binary(2, s->min), - parquet_field_int64(3, s->null_count), - parquet_field_int64(4, s->distinct_count), - parquet_field_binary(5, s->max_value), - parquet_field_binary(6, s->min_value)); + using optional_binary = parquet_field_optional, parquet_field_binary>; + using optional_int64 = parquet_field_optional; + + auto op = std::make_tuple(optional_binary(1, s->max), + optional_binary(2, s->min), + optional_int64(3, s->null_count), + optional_int64(4, s->distinct_count), + optional_binary(5, s->max_value), + optional_binary(6, s->min_value)); return function_builder(this, op); } diff --git a/cpp/src/io/parquet/compact_protocol_writer.cpp b/cpp/src/io/parquet/compact_protocol_writer.cpp index 9adc8767880..00810269d3c 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.cpp +++ b/cpp/src/io/parquet/compact_protocol_writer.cpp @@ -195,12 +195,12 @@ size_t CompactProtocolWriter::write(ColumnChunkMetaData const& s) size_t CompactProtocolWriter::write(Statistics const& s) { CompactProtocolFieldWriter c(*this); - if (not s.max.empty()) { c.field_binary(1, s.max); } - if (not s.min.empty()) { c.field_binary(2, s.min); } - if (s.null_count != -1) { c.field_int(3, s.null_count); } - if (s.distinct_count != -1) { c.field_int(4, s.distinct_count); } - if (not s.max_value.empty()) { c.field_binary(5, s.max_value); } - if (not s.min_value.empty()) { c.field_binary(6, s.min_value); } + if (s.max.has_value()) { c.field_binary(1, s.max.value()); } + if (s.min.has_value()) { c.field_binary(2, s.min.value()); } + if (s.null_count.has_value()) { c.field_int(3, s.null_count.value()); } + if (s.distinct_count.has_value()) { c.field_int(4, s.distinct_count.value()); } + if (s.max_value.has_value()) { c.field_binary(5, s.max_value.value()); } + if (s.min_value.has_value()) { c.field_binary(6, s.min_value.value()); } return c.value(); } diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index dbec59670c7..1cd16ac6102 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -215,12 +215,18 @@ struct SchemaElement { * @brief Thrift-derived struct describing column chunk statistics */ struct Statistics { - std::vector max; // deprecated max value in signed comparison order - std::vector min; // deprecated min value in signed comparison order - int64_t null_count = -1; // count of null values in the column - int64_t distinct_count = -1; // count of distinct values occurring - std::vector max_value; // max value for column determined by ColumnOrder - std::vector min_value; // min value for column determined by ColumnOrder + // deprecated max value in signed comparison order + thrust::optional> max; + // deprecated min value in signed comparison order + thrust::optional> min; + // count of null values in the column + thrust::optional null_count; + // count of distinct values occurring + thrust::optional distinct_count; + // max value for column determined by ColumnOrder + thrust::optional> max_value; + // min value for column determined by ColumnOrder + thrust::optional> min_value; }; /** diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index 9083be1c2dd..a5851de3c20 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -150,12 +150,14 @@ struct stats_caster { { } - void set_index(size_type index, std::vector const& binary_value, Type const type) + void set_index(size_type index, + thrust::optional> const& binary_value, + Type const type) { - if (!binary_value.empty()) { - val[index] = convert(binary_value.data(), binary_value.size(), type); + if (binary_value.has_value()) { + val[index] = convert(binary_value.value().data(), binary_value.value().size(), type); } - if (binary_value.empty()) { + if (not binary_value.has_value()) { clear_bit_unsafe(null_mask.data(), index); null_count++; } @@ -210,10 +212,10 @@ struct stats_caster { auto const& row_group = per_file_metadata[src_idx].row_groups[rg_idx]; auto const& colchunk = row_group.columns[col_idx]; // To support deprecated min, max fields. - auto const& min_value = colchunk.meta_data.statistics.min_value.size() > 0 + auto const& min_value = colchunk.meta_data.statistics.min_value.has_value() ? colchunk.meta_data.statistics.min_value : colchunk.meta_data.statistics.min; - auto const& max_value = colchunk.meta_data.statistics.max_value.size() > 0 + auto const& max_value = colchunk.meta_data.statistics.max_value.has_value() ? colchunk.meta_data.statistics.max_value : colchunk.meta_data.statistics.max; // translate binary data to Type then to diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 3e5d7033e60..fa85e3a4a1d 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -4161,8 +4161,10 @@ TEST_P(ParquetV2Test, LargeColumnIndex) // check trunc(page.min) <= stats.min && trun(page.max) >= stats.max auto const ptype = fmd.schema[c + 1].type; auto const ctype = fmd.schema[c + 1].converted_type; - EXPECT_TRUE(compare_binary(ci.min_values[0], stats.min_value, ptype, ctype) <= 0); - EXPECT_TRUE(compare_binary(ci.max_values[0], stats.max_value, ptype, ctype) >= 0); + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + EXPECT_TRUE(compare_binary(ci.min_values[0], stats.min_value.value(), ptype, ctype) <= 0); + EXPECT_TRUE(compare_binary(ci.max_values[0], stats.max_value.value(), ptype, ctype) >= 0); } } } @@ -4242,6 +4244,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndex) auto const ci = read_column_index(source, chunk); auto const stats = get_statistics(chunk); + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + // schema indexing starts at 1 auto const ptype = fmd.schema[c + 1].type; auto const ctype = fmd.schema[c + 1].converted_type; @@ -4250,10 +4255,10 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndex) EXPECT_FALSE(ci.null_pages[p]); // null_counts should always be 0 EXPECT_EQ(ci.null_counts[p], 0); - EXPECT_TRUE(compare_binary(stats.min_value, ci.min_values[p], ptype, ctype) <= 0); + EXPECT_TRUE(compare_binary(stats.min_value.value(), ci.min_values[p], ptype, ctype) <= 0); } for (size_t p = 0; p < ci.max_values.size(); p++) - EXPECT_TRUE(compare_binary(stats.max_value, ci.max_values[p], ptype, ctype) >= 0); + EXPECT_TRUE(compare_binary(stats.max_value.value(), ci.max_values[p], ptype, ctype) >= 0); } } } @@ -4344,7 +4349,10 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) auto const stats = get_statistics(chunk); // should be half nulls, except no nulls in column 0 - EXPECT_EQ(stats.null_count, c == 0 ? 0 : num_rows / 2); + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + ASSERT_TRUE(stats.null_count.has_value()); + EXPECT_EQ(stats.null_count.value(), c == 0 ? 0 : num_rows / 2); // schema indexing starts at 1 auto const ptype = fmd.schema[c + 1].type; @@ -4356,10 +4364,10 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) } else { EXPECT_EQ(ci.null_counts[p], 0); } - EXPECT_TRUE(compare_binary(stats.min_value, ci.min_values[p], ptype, ctype) <= 0); + EXPECT_TRUE(compare_binary(stats.min_value.value(), ci.min_values[p], ptype, ctype) <= 0); } for (size_t p = 0; p < ci.max_values.size(); p++) { - EXPECT_TRUE(compare_binary(stats.max_value, ci.max_values[p], ptype, ctype) >= 0); + EXPECT_TRUE(compare_binary(stats.max_value.value(), ci.max_values[p], ptype, ctype) >= 0); } } } @@ -4436,7 +4444,12 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) auto const stats = get_statistics(chunk); // there should be no nulls except column 1 which is all nulls - EXPECT_EQ(stats.null_count, c == 1 ? num_rows : 0); + if (c != 1) { + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + } + ASSERT_TRUE(stats.null_count.has_value()); + EXPECT_EQ(stats.null_count.value(), c == 1 ? num_rows : 0); // schema indexing starts at 1 auto const ptype = fmd.schema[c + 1].type; @@ -4449,12 +4462,12 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) } if (not ci.null_pages[p]) { EXPECT_EQ(ci.null_counts[p], 0); - EXPECT_TRUE(compare_binary(stats.min_value, ci.min_values[p], ptype, ctype) <= 0); + EXPECT_TRUE(compare_binary(stats.min_value.value(), ci.min_values[p], ptype, ctype) <= 0); } } for (size_t p = 0; p < ci.max_values.size(); p++) { if (not ci.null_pages[p]) { - EXPECT_TRUE(compare_binary(stats.max_value, ci.max_values[p], ptype, ctype) >= 0); + EXPECT_TRUE(compare_binary(stats.max_value.value(), ci.max_values[p], ptype, ctype) >= 0); } } } @@ -4533,13 +4546,16 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStruct) auto const ci = read_column_index(source, chunk); auto const stats = get_statistics(chunk); + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + auto const ptype = fmd.schema[colidx].type; auto const ctype = fmd.schema[colidx].converted_type; for (size_t p = 0; p < ci.min_values.size(); p++) { - EXPECT_TRUE(compare_binary(stats.min_value, ci.min_values[p], ptype, ctype) <= 0); + EXPECT_TRUE(compare_binary(stats.min_value.value(), ci.min_values[p], ptype, ctype) <= 0); } for (size_t p = 0; p < ci.max_values.size(); p++) { - EXPECT_TRUE(compare_binary(stats.max_value, ci.max_values[p], ptype, ctype) >= 0); + EXPECT_TRUE(compare_binary(stats.max_value.value(), ci.max_values[p], ptype, ctype) >= 0); } } } @@ -4829,11 +4845,14 @@ TEST_F(ParquetWriterTest, CheckColumnIndexTruncation) auto const ci = read_column_index(source, chunk); auto const stats = get_statistics(chunk); + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + // check trunc(page.min) <= stats.min && trun(page.max) >= stats.max auto const ptype = fmd.schema[c + 1].type; auto const ctype = fmd.schema[c + 1].converted_type; - EXPECT_TRUE(compare_binary(ci.min_values[0], stats.min_value, ptype, ctype) <= 0); - EXPECT_TRUE(compare_binary(ci.max_values[0], stats.max_value, ptype, ctype) >= 0); + EXPECT_TRUE(compare_binary(ci.min_values[0], stats.min_value.value(), ptype, ctype) <= 0); + EXPECT_TRUE(compare_binary(ci.max_values[0], stats.max_value.value(), ptype, ctype) >= 0); // check that truncated values == expected EXPECT_EQ(memcmp(ci.min_values[0].data(), truncated_min[c], ci.min_values[0].size()), 0); @@ -4890,8 +4909,10 @@ TEST_F(ParquetWriterTest, BinaryColumnIndexTruncation) // check trunc(page.min) <= stats.min && trun(page.max) >= stats.max auto const ptype = fmd.schema[c + 1].type; auto const ctype = fmd.schema[c + 1].converted_type; - EXPECT_TRUE(compare_binary(ci.min_values[0], stats.min_value, ptype, ctype) <= 0); - EXPECT_TRUE(compare_binary(ci.max_values[0], stats.max_value, ptype, ctype) >= 0); + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + EXPECT_TRUE(compare_binary(ci.min_values[0], stats.min_value.value(), ptype, ctype) <= 0); + EXPECT_TRUE(compare_binary(ci.max_values[0], stats.max_value.value(), ptype, ctype) >= 0); // check that truncated values == expected EXPECT_EQ(ci.min_values[0], truncated_min[c]); @@ -6737,6 +6758,38 @@ TEST_P(ParquetV2Test, CheckEncodings) } } +TEST_F(ParquetWriterTest, EmptyMinStringStatistics) +{ + char const* const min_val = ""; + char const* const max_val = "zzz"; + std::vector strings{min_val, max_val, "pining", "for", "the", "fjords"}; + + column_wrapper string_col{strings.begin(), strings.end()}; + auto const output = table_view{{string_col}}; + auto const filepath = temp_env->get_temp_filepath("EmptyMinStringStatistics.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, output); + cudf::io::write_parquet(out_opts); + + auto const source = cudf::io::datasource::create(filepath); + cudf::io::parquet::detail::FileMetaData fmd; + read_footer(source, &fmd); + + ASSERT_TRUE(fmd.row_groups.size() > 0); + ASSERT_TRUE(fmd.row_groups[0].columns.size() > 0); + auto const& chunk = fmd.row_groups[0].columns[0]; + auto const stats = get_statistics(chunk); + + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + auto const min_value = std::string{reinterpret_cast(stats.min_value.value().data()), + stats.min_value.value().size()}; + auto const max_value = std::string{reinterpret_cast(stats.max_value.value().data()), + stats.max_value.value().size()}; + EXPECT_EQ(min_value, std::string(min_val)); + EXPECT_EQ(max_value, std::string(max_val)); +} + TEST_F(ParquetReaderTest, RepeatedNoAnnotations) { constexpr unsigned char repeated_bytes[] = {