From b3b5ce94a576bd19967e41ef6c82ff94342e7b80 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Thu, 7 Nov 2024 18:22:10 -0600 Subject: [PATCH] Add optional column_order in JSON reader (#17029) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR adds optional column order to enforce column order in the output. This feature is required by spark from_json. Optional `column_order` is added to `schema_element`, and it is validated during reader_option creation. The column order can be specified at root level and for any struct in any level. • For root level, the dtypes should be schema_element with type STRUCT. (schema_element is added to variant dtypes) • For nested level, column_order can be specified for any STRUCT type. (could be a map of schema_element , or schema_element) If the column order is not specified, the order of columns is same as the order of columns that appear in json file. Closes #17240 (metadata updated) Closes #17091 (will return all nulls column if not present in input json) Closes #17090 (fixed with new schema_element as dtype) Closes #16799 (output columns are created from column_order if present) Authors: - Karthikeyan (https://github.com/karthikeyann) - Nghia Truong (https://github.com/ttnghia) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Nghia Truong (https://github.com/ttnghia) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/17029 --- cpp/include/cudf/io/json.hpp | 53 ++++- cpp/src/io/json/host_tree_algorithms.cu | 28 ++- cpp/src/io/json/json_column.cu | 114 +++++++--- cpp/src/io/json/nested_json.hpp | 23 ++ cpp/src/io/json/nested_json_gpu.cu | 4 +- cpp/src/io/json/parser_features.cpp | 192 ++++++++++++++++ cpp/tests/io/json/json_test.cpp | 283 +++++++++++++++++++++++- 7 files changed, 637 insertions(+), 60 deletions(-) diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index b662b660557..7cd4697f592 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -18,6 +18,7 @@ #include "types.hpp" +#include #include #include #include @@ -53,6 +54,11 @@ struct schema_element { * @brief Allows specifying this column's child columns target type */ std::map child_types; + + /** + * @brief Allows specifying the order of the columns + */ + std::optional> column_order; }; /** @@ -87,13 +93,18 @@ enum class json_recovery_mode_t { * | `chunksize` | use `byte_range_xxx` for chunking instead | */ class json_reader_options { + public: + using dtype_variant = + std::variant, + std::map, + std::map, + schema_element>; ///< Variant type holding dtypes information for the columns + + private: source_info _source; // Data types of the column; empty to infer dtypes - std::variant, - std::map, - std::map> - _dtypes; + dtype_variant _dtypes; // Specify the compression format of the source or infer from file extension compression_type _compression = compression_type::AUTO; @@ -178,13 +189,7 @@ class json_reader_options { * * @returns Data types of the columns */ - [[nodiscard]] std::variant, - std::map, - std::map> const& - get_dtypes() const - { - return _dtypes; - } + [[nodiscard]] dtype_variant const& get_dtypes() const { return _dtypes; } /** * @brief Returns compression format of the source. @@ -228,7 +233,11 @@ class json_reader_options { */ [[nodiscard]] size_t get_byte_range_padding() const { - auto const num_columns = std::visit([](auto const& dtypes) { return dtypes.size(); }, _dtypes); + auto const num_columns = + std::visit(cudf::detail::visitor_overload{ + [](auto const& dtypes) { return dtypes.size(); }, + [](schema_element const& dtypes) { return dtypes.child_types.size(); }}, + _dtypes); auto const max_row_bytes = 16 * 1024; // 16KB auto const column_bytes = 64; @@ -390,6 +399,14 @@ class json_reader_options { */ void set_dtypes(std::map types) { _dtypes = std::move(types); } + /** + * @brief Set data types for a potentially nested column hierarchy. + * + * @param types schema element with column names and column order to support arbitrary nesting of + * data types + */ + void set_dtypes(schema_element types); + /** * @brief Set the compression type. * @@ -624,6 +641,18 @@ class json_reader_options_builder { return *this; } + /** + * @brief Set data types for columns to be read. + * + * @param types Struct schema_element with Column name -> schema_element with map and order + * @return this for chaining + */ + json_reader_options_builder& dtypes(schema_element types) + { + options.set_dtypes(std::move(types)); + return *this; + } + /** * @brief Set the compression type. * diff --git a/cpp/src/io/json/host_tree_algorithms.cu b/cpp/src/io/json/host_tree_algorithms.cu index 570a00cbfc2..7fafa885c66 100644 --- a/cpp/src/io/json/host_tree_algorithms.cu +++ b/cpp/src/io/json/host_tree_algorithms.cu @@ -269,7 +269,8 @@ std::map unified_schema(cudf::io::json_reader_optio }); return dnew; }, - [](std::map const& user_dtypes) { return user_dtypes; }}, + [](std::map const& user_dtypes) { return user_dtypes; }, + [](schema_element const& user_dtypes) { return user_dtypes.child_types; }}, options.get_dtypes()); } @@ -492,7 +493,7 @@ std::pair, hashmap_of_device_columns> build_tree auto expected_types = cudf::detail::make_host_vector(num_columns, stream); std::fill_n(expected_types.begin(), num_columns, NUM_NODE_CLASSES); - auto lookup_names = [&column_names](auto child_ids, auto name) { + auto lookup_names = [&column_names](auto const& child_ids, auto const& name) { for (auto const& child_id : child_ids) { if (column_names[child_id] == name) return child_id; } @@ -569,7 +570,7 @@ std::pair, hashmap_of_device_columns> build_tree for (size_t i = 0; i < adj[root_list_col_id].size() && i < user_dtypes.size(); i++) { NodeIndexT const first_child_id = adj[root_list_col_id][i]; - auto name = column_names[first_child_id]; + auto const& name = column_names[first_child_id]; auto value_id = std::stol(name); if (value_id >= 0 and value_id < static_cast(user_dtypes.size())) mark_is_pruned(first_child_id, schema_element{user_dtypes[value_id]}); @@ -580,7 +581,7 @@ std::pair, hashmap_of_device_columns> build_tree std::map const& user_dtypes) -> void { for (size_t i = 0; i < adj[root_list_col_id].size(); i++) { auto const first_child_id = adj[root_list_col_id][i]; - auto name = column_names[first_child_id]; + auto const& name = column_names[first_child_id]; if (user_dtypes.count(name)) mark_is_pruned(first_child_id, schema_element{user_dtypes.at(name)}); } @@ -589,10 +590,19 @@ std::pair, hashmap_of_device_columns> build_tree std::map const& user_dtypes) -> void { for (size_t i = 0; i < adj[root_list_col_id].size(); i++) { auto const first_child_id = adj[root_list_col_id][i]; - auto name = column_names[first_child_id]; + auto const& name = column_names[first_child_id]; if (user_dtypes.count(name)) mark_is_pruned(first_child_id, user_dtypes.at(name)); } + }, + [&root_list_col_id, &adj, &mark_is_pruned, &column_names]( + schema_element const& user_dtypes) -> void { + for (size_t i = 0; i < adj[root_list_col_id].size(); i++) { + auto const first_child_id = adj[root_list_col_id][i]; + auto const& name = column_names[first_child_id]; + if (user_dtypes.child_types.count(name) != 0) + mark_is_pruned(first_child_id, user_dtypes.child_types.at(name)); + } }}, options.get_dtypes()); } else { @@ -626,7 +636,9 @@ std::pair, hashmap_of_device_columns> build_tree [&root_struct_col_id, &adj, &mark_is_pruned, &u_schema]( std::map const& user_dtypes) -> void { mark_is_pruned(root_struct_col_id, u_schema); - }}, + }, + [&root_struct_col_id, &adj, &mark_is_pruned, &u_schema](schema_element const& user_dtypes) + -> void { mark_is_pruned(root_struct_col_id, u_schema); }}, options.get_dtypes()); } // Useful for array of arrays @@ -714,7 +726,7 @@ std::pair, hashmap_of_device_columns> build_tree if (expected_category == NC_STRUCT) { // find field column ids, and its children and create columns. for (auto const& field_id : child_ids) { - auto name = column_names[field_id]; + auto const& name = column_names[field_id]; if (is_pruned[field_id]) continue; auto inserted = ref.get().child_columns.try_emplace(name, device_json_column(stream, mr)).second; @@ -745,7 +757,7 @@ std::pair, hashmap_of_device_columns> build_tree std::map> array_values; for (auto const& child_id : child_ids) { if (is_pruned[child_id]) continue; - auto name = column_names[child_id]; + auto const& name = column_names[child_id]; array_values[std::stoi(name)].push_back(child_id); } // diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 7e4d975e431..30a154fdda2 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -399,9 +399,9 @@ std::pair, std::vector> device_json_co // - String columns will be returned as nullable, iff there's at least one null entry if (col->null_count() == 0) { col->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); } - // For string columns return ["offsets", "char"] schema + // For string columns return ["offsets"] schema if (target_type.id() == type_id::STRING) { - return {std::move(col), std::vector{{"offsets"}, {"chars"}}}; + return {std::move(col), std::vector{{"offsets"}}}; } // Non-string leaf-columns (e.g., numeric) do not have child columns in the schema return {std::move(col), std::vector{}}; @@ -410,12 +410,37 @@ std::pair, std::vector> device_json_co std::vector> child_columns; std::vector column_names{}; size_type num_rows{json_col.num_rows}; + + bool const has_column_order = + prune_columns and not schema.value_or(schema_element{}) + .column_order.value_or(std::vector{}) + .empty(); + + auto const& col_order = + has_column_order ? schema.value().column_order.value() : json_col.column_order; + // Create children columns - for (auto const& col_name : json_col.column_order) { - auto const& col = json_col.child_columns.find(col_name); - column_names.emplace_back(col->first); - auto& child_col = col->second; + for (auto const& col_name : col_order) { auto child_schema_element = get_child_schema(col_name); + auto const found_it = json_col.child_columns.find(col_name); + + if (prune_columns and found_it == std::end(json_col.child_columns)) { + CUDF_EXPECTS(child_schema_element.has_value(), + "Column name not found in input schema map, but present in column order and " + "prune_columns is enabled"); + column_names.emplace_back(make_column_name_info( + child_schema_element.value_or(schema_element{data_type{type_id::EMPTY}}), col_name)); + auto all_null_column = make_all_nulls_column( + child_schema_element.value_or(schema_element{data_type{type_id::EMPTY}}), + num_rows, + stream, + mr); + child_columns.emplace_back(std::move(all_null_column)); + continue; + } + column_names.emplace_back(found_it->first); + + auto& child_col = found_it->second; if (!prune_columns or child_schema_element.has_value()) { auto [child_column, names] = device_json_column_to_cudf_column( child_col, d_input, options, prune_columns, child_schema_element, stream, mr); @@ -576,11 +601,21 @@ table_with_metadata device_parse_nested_json(device_span d_input, std::vector out_column_names; auto parse_opt = parsing_options(options, stream); - // Iterate over the struct's child columns and convert to cudf column - size_type column_index = 0; - for (auto const& col_name : root_struct_col.column_order) { - auto& json_col = root_struct_col.child_columns.find(col_name)->second; + schema_element const* prune_schema = std::get_if(&options.get_dtypes()); + bool const has_column_order = options.is_enabled_prune_columns() and prune_schema != nullptr and + prune_schema->column_order.has_value() and + not prune_schema->column_order->empty(); + auto const& col_order = + has_column_order ? prune_schema->column_order.value() : root_struct_col.column_order; + if (has_column_order) { + CUDF_EXPECTS(prune_schema->child_types.size() == col_order.size(), + "Input schema column order size mismatch with input schema child types"); + } + auto root_col_size = root_struct_col.num_rows; + // Iterate over the struct's child columns/column_order and convert to cudf column + size_type column_index = 0; + for (auto const& col_name : col_order) { std::optional child_schema_element = std::visit( cudf::detail::visitor_overload{ [column_index](std::vector const& user_dtypes) -> std::optional { @@ -590,17 +625,23 @@ table_with_metadata device_parse_nested_json(device_span d_input, }, [col_name]( std::map const& user_dtypes) -> std::optional { - return (user_dtypes.find(col_name) != std::end(user_dtypes)) - ? std::optional{{user_dtypes.find(col_name)->second}} - : std::optional{}; + if (auto it = user_dtypes.find(col_name); it != std::end(user_dtypes)) + return std::optional{{it->second}}; + return std::nullopt; }, [col_name](std::map const& user_dtypes) -> std::optional { - return (user_dtypes.find(col_name) != std::end(user_dtypes)) - ? user_dtypes.find(col_name)->second - : std::optional{}; + if (auto it = user_dtypes.find(col_name); it != std::end(user_dtypes)) return it->second; + return std::nullopt; + }, + [col_name](schema_element const& user_dtypes) -> std::optional { + if (auto it = user_dtypes.child_types.find(col_name); + it != std::end(user_dtypes.child_types)) + return it->second; + return std::nullopt; }}, options.get_dtypes()); + #ifdef NJP_DEBUG_PRINT auto debug_schema_print = [](auto ret) { std::cout << ", type id: " @@ -608,20 +649,39 @@ table_with_metadata device_parse_nested_json(device_span d_input, << ", with " << (ret.has_value() ? ret->child_types.size() : 0) << " children" << "\n"; }; - std::visit( - cudf::detail::visitor_overload{[column_index](std::vector const&) { - std::cout << "Column by index: #" << column_index; - }, - [col_name](std::map const&) { - std::cout << "Column by flat name: '" << col_name; - }, - [col_name](std::map const&) { - std::cout << "Column by nested name: #" << col_name; - }}, - options.get_dtypes()); + std::visit(cudf::detail::visitor_overload{ + [column_index](std::vector const&) { + std::cout << "Column by index: #" << column_index; + }, + [col_name](std::map const&) { + std::cout << "Column by flat name: '" << col_name; + }, + [col_name](std::map const&) { + std::cout << "Column by nested name: #" << col_name; + }, + [col_name](schema_element const&) { + std::cout << "Column by nested schema with column order: #" << col_name; + }}, + options.get_dtypes()); debug_schema_print(child_schema_element); #endif + auto const found_it = root_struct_col.child_columns.find(col_name); + if (options.is_enabled_prune_columns() and + found_it == std::end(root_struct_col.child_columns)) { + CUDF_EXPECTS(child_schema_element.has_value(), + "Column name not found in input schema map, but present in column order and " + "prune_columns is enabled"); + // inserts all null column + out_column_names.emplace_back(make_column_name_info(child_schema_element.value(), col_name)); + auto all_null_column = + make_all_nulls_column(child_schema_element.value(), root_col_size, stream, mr); + out_columns.emplace_back(std::move(all_null_column)); + column_index++; + continue; + } + auto& json_col = found_it->second; + if (!options.is_enabled_prune_columns() or child_schema_element.has_value()) { // Get this JSON column's cudf column and schema info, (modifies json_col) auto [cudf_col, col_name_info] = diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 7b3b04dea16..4989fff4b30 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -429,6 +429,29 @@ table_with_metadata device_parse_nested_json(device_span input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); +/** + * @brief Create all null column of a given nested schema + * + * @param schema The schema of the column to create + * @param num_rows The number of rows in the column + * @param stream The CUDA stream to which kernels are dispatched + * @param mr resource with which to allocate + * @return The all null column + */ +std::unique_ptr make_all_nulls_column(schema_element const& schema, + size_type num_rows, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + +/** + * @brief Create metadata for a column of a given schema + * + * @param schema The schema of the column + * @param col_name The name of the column + * @return column metadata for a given schema + */ +column_name_info make_column_name_info(schema_element const& schema, std::string const& col_name); + /** * @brief Get the path data type of a column by path if present in input schema * diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 60e78f4763d..f1c2826c62a 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -2198,9 +2198,9 @@ std::pair, std::vector> json_column_to // - String columns will be returned as nullable, iff there's at least one null entry if (col->null_count() == 0) { col->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); } - // For string columns return ["offsets", "char"] schema + // For string columns return ["offsets"] schema if (target_type.id() == type_id::STRING) { - return {std::move(col), std::vector{{"offsets"}, {"chars"}}}; + return {std::move(col), std::vector{{"offsets"}}}; } // Non-string leaf-columns (e.g., numeric) do not have child columns in the schema else { diff --git a/cpp/src/io/json/parser_features.cpp b/cpp/src/io/json/parser_features.cpp index 4caa5cd9e24..401a6e992de 100644 --- a/cpp/src/io/json/parser_features.cpp +++ b/cpp/src/io/json/parser_features.cpp @@ -16,14 +16,201 @@ #include "nested_json.hpp" +#include +#include +#include #include +#include +#include +#include +#include +#include #include #include #include +namespace cudf::io { +namespace { +bool validate_column_order(schema_element const& types) +{ + // For struct types, check if column_order size matches child_types size and all elements in + // column_order are in child_types, in child_types, call this function recursively. + // For list types, check if child_types size is 1 and call this function recursively. + if (types.type.id() == type_id::STRUCT) { + if (types.column_order.has_value()) { + if (types.column_order.value().size() != types.child_types.size()) { return false; } + for (auto const& column_name : types.column_order.value()) { + auto it = types.child_types.find(column_name); + if (it == types.child_types.end()) { return false; } + if (it->second.type.id() == type_id::STRUCT or it->second.type.id() == type_id::LIST) { + if (!validate_column_order(it->second)) { return false; } + } + } + } + } else if (types.type.id() == type_id::LIST) { + if (types.child_types.size() != 1) { return false; } + auto it = types.child_types.begin(); + if (it->second.type.id() == type_id::STRUCT or it->second.type.id() == type_id::LIST) { + if (!validate_column_order(it->second)) { return false; } + } + } + return true; +} +} // namespace + +void json_reader_options::set_dtypes(schema_element types) +{ + CUDF_EXPECTS( + validate_column_order(types), "Column order does not match child types", std::invalid_argument); + _dtypes = std::move(types); +} +} // namespace cudf::io + namespace cudf::io::json::detail { +/// Created an empty column of the specified schema +struct empty_column_functor { + rmm::cuda_stream_view stream; + rmm::device_async_resource_ref mr; + + template ())> + std::unique_ptr operator()(schema_element const& schema) const + { + return make_empty_column(schema.type); + } + + template )> + std::unique_ptr operator()(schema_element const& schema) const + { + CUDF_EXPECTS(schema.child_types.size() == 1, "List column should have only one child"); + auto const& child_name = schema.child_types.begin()->first; + std::unique_ptr child = cudf::type_dispatcher( + schema.child_types.at(child_name).type, *this, schema.child_types.at(child_name)); + auto offsets = make_empty_column(data_type(type_to_id())); + return make_lists_column(0, std::move(offsets), std::move(child), 0, {}, stream, mr); + } + + template )> + std::unique_ptr operator()(schema_element const& schema) const + { + std::vector> child_columns; + for (auto const& child_name : schema.column_order.value_or(std::vector{})) { + child_columns.push_back(cudf::type_dispatcher( + schema.child_types.at(child_name).type, *this, schema.child_types.at(child_name))); + } + return make_structs_column(0, std::move(child_columns), 0, {}, stream, mr); + } +}; + +/// Created all null column of the specified schema +struct allnull_column_functor { + rmm::cuda_stream_view stream; + rmm::device_async_resource_ref mr; + + private: + auto make_zeroed_offsets(size_type size) const + { + auto offsets_buff = + cudf::detail::make_zeroed_device_uvector_async(size + 1, stream, mr); + return std::make_unique(std::move(offsets_buff), rmm::device_buffer{}, 0); + } + + public: + template ())> + std::unique_ptr operator()(schema_element const& schema, size_type size) const + { + return make_fixed_width_column(schema.type, size, mask_state::ALL_NULL, stream, mr); + } + + template ())> + std::unique_ptr operator()(schema_element const& schema, size_type size) const + { + CUDF_EXPECTS(schema.child_types.size() == 1, "Dictionary column should have only one child"); + auto const& child_name = schema.child_types.begin()->first; + std::unique_ptr child = cudf::type_dispatcher(schema.child_types.at(child_name).type, + empty_column_functor{stream, mr}, + schema.child_types.at(child_name)); + return make_fixed_width_column(schema.type, size, mask_state::ALL_NULL, stream, mr); + auto indices = make_zeroed_offsets(size - 1); + auto null_mask = cudf::detail::create_null_mask(size, mask_state::ALL_NULL, stream, mr); + return make_dictionary_column( + std::move(child), std::move(indices), std::move(null_mask), size, stream, mr); + } + + template )> + std::unique_ptr operator()(schema_element const& schema, size_type size) const + { + auto offsets = make_zeroed_offsets(size); + auto null_mask = cudf::detail::create_null_mask(size, mask_state::ALL_NULL, stream, mr); + return make_strings_column( + size, std::move(offsets), rmm::device_buffer{}, size, std::move(null_mask)); + } + template )> + std::unique_ptr operator()(schema_element const& schema, size_type size) const + { + CUDF_EXPECTS(schema.child_types.size() == 1, "List column should have only one child"); + auto const& child_name = schema.child_types.begin()->first; + std::unique_ptr child = cudf::type_dispatcher(schema.child_types.at(child_name).type, + empty_column_functor{stream, mr}, + schema.child_types.at(child_name)); + auto offsets = make_zeroed_offsets(size); + auto null_mask = cudf::detail::create_null_mask(size, mask_state::ALL_NULL, stream, mr); + return make_lists_column( + size, std::move(offsets), std::move(child), size, std::move(null_mask), stream, mr); + } + + template )> + std::unique_ptr operator()(schema_element const& schema, size_type size) const + { + std::vector> child_columns; + for (auto const& child_name : schema.column_order.value_or(std::vector{})) { + child_columns.push_back(cudf::type_dispatcher( + schema.child_types.at(child_name).type, *this, schema.child_types.at(child_name), size)); + } + auto null_mask = cudf::detail::create_null_mask(size, mask_state::ALL_NULL, stream, mr); + return make_structs_column( + size, std::move(child_columns), size, std::move(null_mask), stream, mr); + } +}; + +std::unique_ptr make_all_nulls_column(schema_element const& schema, + size_type num_rows, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + return cudf::type_dispatcher(schema.type, allnull_column_functor{stream, mr}, schema, num_rows); +} + +column_name_info make_column_name_info(schema_element const& schema, std::string const& col_name) +{ + column_name_info info; + info.name = col_name; + switch (schema.type.id()) { + case type_id::STRUCT: + for (auto const& child_name : schema.column_order.value_or(std::vector{})) { + info.children.push_back( + make_column_name_info(schema.child_types.at(child_name), child_name)); + } + break; + case type_id::LIST: + info.children.emplace_back("offsets"); + for (auto const& [child_name, child_schema] : schema.child_types) { + info.children.push_back(make_column_name_info(child_schema, child_name)); + } + break; + case type_id::DICTIONARY32: + info.children.emplace_back("indices"); + for (auto const& [child_name, child_schema] : schema.child_types) { + info.children.push_back(make_column_name_info(child_schema, child_name)); + } + break; + case type_id::STRING: info.children.emplace_back("offsets"); break; + default: break; + } + return info; +} + std::optional child_schema_element(std::string const& col_name, cudf::io::json_reader_options const& options) { @@ -46,6 +233,11 @@ std::optional child_schema_element(std::string const& col_name, return (user_dtypes.find(col_name) != std::end(user_dtypes)) ? user_dtypes.find(col_name)->second : std::optional{}; + }, + [col_name](schema_element const& user_dtypes) -> std::optional { + return (user_dtypes.child_types.find(col_name) != std::end(user_dtypes.child_types)) + ? user_dtypes.child_types.find(col_name)->second + : std::optional{}; }}, options.get_dtypes()); } diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp index b58ca56e066..199b0092473 100644 --- a/cpp/tests/io/json/json_test.cpp +++ b/cpp/tests/io/json/json_test.cpp @@ -239,7 +239,7 @@ struct JsonValidFixedPointReaderTest : public JsonFixedPointReaderTest(), scale}}) + .dtypes(std::vector{data_type{type_to_id(), scale}}) .lines(true); auto const result = cudf::io::read_json(in_opts); @@ -324,7 +324,7 @@ TEST_P(JsonReaderParamTest, FloatingPoint) cudf::io::json_reader_options in_options = cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}) - .dtypes({dtype()}) + .dtypes(std::vector{dtype()}) .lines(true); cudf::io::table_with_metadata result = cudf::io::read_json(in_options); @@ -348,7 +348,8 @@ TEST_P(JsonReaderParamTest, JsonLinesStrings) cudf::io::json_reader_options in_options = cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()}) - .dtypes({{"2", dtype()}, {"0", dtype()}, {"1", dtype()}}) + .dtypes(std::map{ + {"2", dtype()}, {"0", dtype()}, {"1", dtype()}}) .lines(true); cudf::io::table_with_metadata result = cudf::io::read_json(in_options); @@ -466,7 +467,7 @@ TEST_P(JsonReaderParamTest, Booleans) cudf::io::json_reader_options in_options = cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}) - .dtypes({dtype()}) + .dtypes(std::vector{dtype()}) .lines(true); cudf::io::table_with_metadata result = cudf::io::read_json(in_options); @@ -508,7 +509,7 @@ TEST_P(JsonReaderParamTest, Dates) cudf::io::json_reader_options in_options = cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}) - .dtypes({data_type{type_id::TIMESTAMP_MILLISECONDS}}) + .dtypes(std::vector{data_type{type_id::TIMESTAMP_MILLISECONDS}}) .lines(true) .dayfirst(true); cudf::io::table_with_metadata result = cudf::io::read_json(in_options); @@ -564,7 +565,7 @@ TEST_P(JsonReaderParamTest, Durations) cudf::io::json_reader_options in_options = cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}) - .dtypes({data_type{type_id::DURATION_NANOSECONDS}}) + .dtypes(std::vector{data_type{type_id::DURATION_NANOSECONDS}}) .lines(true); cudf::io::table_with_metadata result = cudf::io::read_json(in_options); @@ -1022,7 +1023,7 @@ TEST_P(JsonReaderParamTest, InvalidFloatingPoint) cudf::io::json_reader_options in_options = cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}) - .dtypes({dtype()}) + .dtypes(std::vector{dtype()}) .lines(true); cudf::io::table_with_metadata result = cudf::io::read_json(in_options); @@ -1461,7 +1462,7 @@ TEST_F(JsonReaderTest, ErrorStrings) cudf::io::json_reader_options const in_opts = cudf::io::json_reader_options::builder(cudf::io::source_info{buffer.c_str(), buffer.size()}) - .dtypes({data_type{cudf::type_id::STRING}}) + .dtypes(std::vector{data_type{cudf::type_id::STRING}}) .lines(true); auto const result = cudf::io::read_json(in_opts); @@ -1849,7 +1850,7 @@ TYPED_TEST(JsonFixedPointReaderTest, EmptyValues) cudf::io::json_reader_options const in_opts = cudf::io::json_reader_options::builder(cudf::io::source_info{buffer.c_str(), buffer.size()}) - .dtypes({data_type{type_to_id(), 0}}) + .dtypes(std::vector{data_type{type_to_id(), 0}}) .lines(true); auto const result = cudf::io::read_json(in_opts); @@ -2827,7 +2828,7 @@ TEST_F(JsonReaderTest, JSONMixedTypeChildren) EXPECT_EQ(result.metadata.schema_info[0].name, "Root"); ASSERT_EQ(result.metadata.schema_info[0].children.size(), 1); EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "Key"); - ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 2); + ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 1); EXPECT_EQ(result.metadata.schema_info[0].children[0].children[0].name, "offsets"); // types EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRUCT); @@ -2865,7 +2866,7 @@ TEST_F(JsonReaderTest, JSONMixedTypeChildren) EXPECT_EQ(result.metadata.schema_info[0].name, "Root"); ASSERT_EQ(result.metadata.schema_info[0].children.size(), 1); EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "Key"); - ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 2); + ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 1); EXPECT_EQ(result.metadata.schema_info[0].children[0].children[0].name, "offsets"); // types EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRUCT); @@ -2991,4 +2992,264 @@ TEST_F(JsonReaderTest, LastRecordInvalid) CUDF_TEST_EXPECT_TABLES_EQUAL(result.tbl->view(), cudf::table_view{{expected}}); } +// Test case for dtype pruning with column order +TEST_F(JsonReaderTest, JsonNestedDtypeFilterWithOrder) +{ + std::string json_stringl = R"( + {"a": 1, "b": {"0": "abc", "1": [-1.]}, "c": true} + {"a": 1, "b": {"0": "abc" }, "c": false} + {"a": 1, "b": {}} + {"a": 1, "c": null} + )"; + std::string json_string = R"([ + {"a": 1, "b": {"0": "abc", "1": [-1.]}, "c": true}, + {"a": 1, "b": {"0": "abc" }, "c": false}, + {"a": 1, "b": {}}, + {"a": 1, "c": null} + ])"; + for (auto& [json_string, lines] : {std::pair{json_stringl, true}, {json_string, false}}) { + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{json_string.data(), json_string.size()}) + .prune_columns(true) + .lines(lines); + + // include all columns + //// schema with partial ordering + { + cudf::io::schema_element dtype_schema{ + data_type{cudf::type_id::STRUCT}, + { + {"b", + {data_type{cudf::type_id::STRUCT}, + {{"0", {data_type{cudf::type_id::STRING}}}, + {"1", {data_type{cudf::type_id::LIST}, {{"element", {dtype()}}}}}}, + {{"0", "1"}}}}, + {"a", {dtype()}}, + {"c", {dtype()}}, + }, + {{"b", "a", "c"}}}; + in_options.set_dtypes(dtype_schema); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + // Make sure we have columns "a", "b" and "c" + ASSERT_EQ(result.tbl->num_columns(), 3); + ASSERT_EQ(result.metadata.schema_info.size(), 3); + EXPECT_EQ(result.metadata.schema_info[0].name, "b"); + EXPECT_EQ(result.metadata.schema_info[1].name, "a"); + EXPECT_EQ(result.metadata.schema_info[2].name, "c"); + // "b" children checks + ASSERT_EQ(result.metadata.schema_info[0].children.size(), 2); + EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "0"); + EXPECT_EQ(result.metadata.schema_info[0].children[1].name, "1"); + ASSERT_EQ(result.metadata.schema_info[0].children[1].children.size(), 2); + EXPECT_EQ(result.metadata.schema_info[0].children[1].children[0].name, "offsets"); + EXPECT_EQ(result.metadata.schema_info[0].children[1].children[1].name, "element"); + // types + EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::INT32); + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRUCT); + EXPECT_EQ(result.tbl->get_column(2).type().id(), cudf::type_id::BOOL8); + EXPECT_EQ(result.tbl->get_column(0).child(0).type().id(), cudf::type_id::STRING); + EXPECT_EQ(result.tbl->get_column(0).child(1).type().id(), cudf::type_id::LIST); + EXPECT_EQ(result.tbl->get_column(0).child(1).child(0).type().id(), cudf::type_id::INT32); + EXPECT_EQ(result.tbl->get_column(0).child(1).child(1).type().id(), cudf::type_id::FLOAT32); + } + //// schema with pruned columns and different order. + { + cudf::io::schema_element dtype_schema{data_type{cudf::type_id::STRUCT}, + { + {"c", {dtype()}}, + {"b", + { + data_type{cudf::type_id::STRUCT}, + }}, + {"a", {dtype()}}, + }, + {{"c", "b", "a"}}}; + in_options.set_dtypes(dtype_schema); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + // "c", "b" and "a" order + ASSERT_EQ(result.tbl->num_columns(), 3); + ASSERT_EQ(result.metadata.schema_info.size(), 3); + EXPECT_EQ(result.metadata.schema_info[0].name, "c"); + EXPECT_EQ(result.metadata.schema_info[1].name, "b"); + EXPECT_EQ(result.metadata.schema_info[2].name, "a"); + // pruned + EXPECT_EQ(result.metadata.schema_info[1].children.size(), 0); + } + //// schema with pruned columns and different sub-order. + { + cudf::io::schema_element dtype_schema{ + data_type{cudf::type_id::STRUCT}, + { + {"c", {dtype()}}, + {"b", + {data_type{cudf::type_id::STRUCT}, + // {}, + {{"0", {data_type{cudf::type_id::STRING}}}, + {"1", {data_type{cudf::type_id::LIST}, {{"element", {dtype()}}}}}}, + {{"1", "0"}}}}, + {"a", {dtype()}}, + }}; + in_options.set_dtypes(dtype_schema); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + // Order of occurance in json + ASSERT_EQ(result.tbl->num_columns(), 3); + ASSERT_EQ(result.metadata.schema_info.size(), 3); + EXPECT_EQ(result.metadata.schema_info[0].name, "a"); + EXPECT_EQ(result.metadata.schema_info[1].name, "b"); + EXPECT_EQ(result.metadata.schema_info[2].name, "c"); + // Sub-order of "b" + EXPECT_EQ(result.metadata.schema_info[1].children.size(), 2); + EXPECT_EQ(result.metadata.schema_info[1].children[0].name, "1"); + EXPECT_EQ(result.metadata.schema_info[1].children[1].name, "0"); + } + //// schema with 1 dtype, but 2 column order + { + cudf::io::schema_element dtype_schema{data_type{cudf::type_id::STRUCT}, + { + {"a", {dtype()}}, + }, + {{"a", "b"}}}; + EXPECT_THROW(in_options.set_dtypes(dtype_schema), std::invalid_argument); + // Input schema column order size mismatch with input schema child types + } + //// repetition, Error + { + cudf::io::schema_element dtype_schema{data_type{cudf::type_id::STRUCT}, + { + {"a", {dtype()}}, + }, + {{"a", "a"}}}; + EXPECT_THROW(in_options.set_dtypes(dtype_schema), std::invalid_argument); + // Input schema column order size mismatch with input schema child types + } + //// different column name in order, Error + { + cudf::io::schema_element dtype_schema{data_type{cudf::type_id::STRUCT}, + { + {"a", {dtype()}}, + }, + {{"b"}}}; + EXPECT_THROW(in_options.set_dtypes(dtype_schema), std::invalid_argument); + // Column name not found in input schema map, but present in column order and + // prune_columns is enabled + } + // include only one column (nested) + { + cudf::io::schema_element dtype_schema{ + data_type{cudf::type_id::STRUCT}, + { + {"b", + {data_type{cudf::type_id::STRUCT}, + {{"1", {data_type{cudf::type_id::LIST}, {{"element", {dtype()}}}}}}, + {{"1"}}}}, + }}; + in_options.set_dtypes(dtype_schema); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + // Make sure we have column "b":"1":[float] + ASSERT_EQ(result.tbl->num_columns(), 1); + ASSERT_EQ(result.metadata.schema_info.size(), 1); + EXPECT_EQ(result.metadata.schema_info[0].name, "b"); + ASSERT_EQ(result.metadata.schema_info[0].children.size(), 1); + EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "1"); + ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 2); + EXPECT_EQ(result.metadata.schema_info[0].children[0].children[0].name, "offsets"); + EXPECT_EQ(result.metadata.schema_info[0].children[0].children[1].name, "element"); + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRUCT); + EXPECT_EQ(result.tbl->get_column(0).child(0).type().id(), cudf::type_id::LIST); + EXPECT_EQ(result.tbl->get_column(0).child(0).child(0).type().id(), cudf::type_id::INT32); + EXPECT_EQ(result.tbl->get_column(0).child(0).child(1).type().id(), cudf::type_id::FLOAT32); + } + // multiple - all present + { + cudf::io::schema_element dtype_schema{data_type{cudf::type_id::STRUCT}, + { + {"a", {dtype()}}, + {"c", {dtype()}}, + }, + {{"a", "c"}}}; + in_options.set_dtypes(dtype_schema); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + // Make sure we have columns "a", and "c" + ASSERT_EQ(result.tbl->num_columns(), 2); + ASSERT_EQ(result.metadata.schema_info.size(), 2); + EXPECT_EQ(result.metadata.schema_info[0].name, "a"); + EXPECT_EQ(result.metadata.schema_info[1].name, "c"); + } + // multiple - not all present + { + cudf::io::schema_element dtype_schema{data_type{cudf::type_id::STRUCT}, + { + {"a", {dtype()}}, + {"d", {dtype()}}, + }, + {{"a", "d"}}}; + in_options.set_dtypes(dtype_schema); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + // Make sure we have column "a" + ASSERT_EQ(result.tbl->num_columns(), 2); + ASSERT_EQ(result.metadata.schema_info.size(), 2); + EXPECT_EQ(result.metadata.schema_info[0].name, "a"); + EXPECT_EQ(result.metadata.schema_info[1].name, "d"); + auto all_null_bools = + cudf::test::fixed_width_column_wrapper{{true, true, true, true}, {0, 0, 0, 0}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), all_null_bools); + } + // test struct, list of string, list of struct. + // multiple - not all present nested + { + cudf::io::schema_element dtype_schema{ + data_type{cudf::type_id::STRUCT}, + { + {"b", + {data_type{cudf::type_id::STRUCT}, + { + {"2", {data_type{cudf::type_id::STRING}}}, + }, + {{"2"}}}}, + {"d", {data_type{cudf::type_id::LIST}, {{"element", {dtype()}}}}}, + {"e", + {data_type{cudf::type_id::LIST}, + {{"element", + { + data_type{cudf::type_id::STRUCT}, + { + {"3", {data_type{cudf::type_id::STRING}}}, + }, //{{"3"}} missing column_order, but output should not have it. + }}}}}, + }, + {{"b", "d", "e"}}}; + in_options.set_dtypes(dtype_schema); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + // Make sure we have columns "b" (empty struct) and "c" + ASSERT_EQ(result.tbl->num_columns(), 3); + ASSERT_EQ(result.metadata.schema_info.size(), 3); + EXPECT_EQ(result.metadata.schema_info[0].name, "b"); + ASSERT_EQ(result.metadata.schema_info[0].children.size(), 1); + ASSERT_EQ(result.metadata.schema_info[0].children[0].name, "2"); + EXPECT_EQ(result.metadata.schema_info[1].name, "d"); + auto all_null_strings = cudf::test::strings_column_wrapper{{"", "", "", ""}, {0, 0, 0, 0}}; + EXPECT_EQ(result.tbl->get_column(0).num_children(), 1); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0).child(0), all_null_strings); + auto const all_null_list = cudf::test::lists_column_wrapper{ + {{0, 0}, {1, 1}, {2, 2}, {3, 3}}, cudf::test::iterators::all_nulls()}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), all_null_list); + EXPECT_EQ(result.metadata.schema_info[2].name, "e"); + ASSERT_EQ(result.metadata.schema_info[2].children.size(), 2); + ASSERT_EQ(result.metadata.schema_info[2].children[1].children.size(), 0); + // ASSERT_EQ(result.metadata.schema_info[2].children[1].children[0].name, "3"); + auto empty_string_col = cudf::test::strings_column_wrapper{}; + cudf::test::structs_column_wrapper expected_structs{{}, cudf::test::iterators::all_nulls()}; + // make all null column of list of struct of string + auto wrapped = make_lists_column( + 4, + cudf::test::fixed_width_column_wrapper{0, 0, 0, 0, 0}.release(), + expected_structs.release(), + 4, + cudf::create_null_mask(4, cudf::mask_state::ALL_NULL)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(2), *wrapped); + } + } +} + CUDF_TEST_PROGRAM_MAIN()