Skip to content

Commit

Permalink
[REVIEW] JSON host tree algorithms (#16545)
Browse files Browse the repository at this point in the history
Depends on #16836
This change adds new host tree building algorithms for the JSON reader, along with UTF-8 field name support.

This constructs the device_column_tree using an adjacency list created from parent information.
This adjacency list is pruned based on input schema, and also types are enforced as per schema. `mark_is_pruned`
Tree is constructed from pruned adjacency list, (with mixed types handling). `construct_tree`

utf8 field name support added: (spark requested)
Field names are UTF-8 decoded while hashing field nodes, so that UTF-8 encoded field names also match to the same column.

All unit tests pass; 1 unit test was added for a case where the old algorithm fails.
This code is kept under an experimental flag.

Authors:
  - Shruti Shivakumar (https://github.com/shrshi)
  - Karthikeyan (https://github.com/karthikeyann)

Approvers:
  - Robert (Bobby) Evans (https://github.com/revans2)
  - Vukasin Milovanovic (https://github.com/vuule)
  - Karthikeyan (https://github.com/karthikeyann)

URL: #16545
  • Loading branch information
shrshi authored Sep 26, 2024
1 parent c7f6a22 commit 12ee360
Show file tree
Hide file tree
Showing 11 changed files with 1,011 additions and 120 deletions.
36 changes: 36 additions & 0 deletions cpp/include/cudf/io/json.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,8 @@ class json_reader_options {
char _delimiter = '\n';
// Prune columns on read, selected based on the _dtypes option
bool _prune_columns = false;
// Experimental features: new column tree construction
bool _experimental = false;

// Bytes to skip from the start
size_t _byte_range_offset = 0;
Expand Down Expand Up @@ -277,6 +279,15 @@ class json_reader_options {
*/
[[nodiscard]] bool is_enabled_prune_columns() const { return _prune_columns; }

/**
* @brief Whether experimental features are enabled.
*
* When this returns true, experimental features, such as the new column tree construction
* and UTF-8 matching of field names, are enabled.
*
* @return true if experimental features are enabled
*/
[[nodiscard]] bool is_enabled_experimental() const { return _experimental; }

/**
* @brief Whether to parse dates as DD/MM versus MM/DD.
*
Expand Down Expand Up @@ -453,6 +464,16 @@ class json_reader_options {
*/
void enable_prune_columns(bool val) { _prune_columns = val; }

/**
* @brief Set whether to enable experimental features.
*
* When set to true, experimental features, such as the new column tree construction
* and UTF-8 matching of field names, will be enabled.
*
* @param val Boolean value to enable/disable experimental features
*/
void enable_experimental(bool val) { _experimental = val; }

/**
* @brief Set whether to parse dates as DD/MM versus MM/DD.
*
Expand Down Expand Up @@ -695,6 +716,21 @@ class json_reader_options_builder {
return *this;
}

/**
* @brief Set whether to enable experimental features.
*
* When set to true, experimental features, such as the new column tree construction
* and UTF-8 matching of field names, will be enabled.
*
* @param val Boolean value to enable/disable experimental features
* @return this for chaining
*/
json_reader_options_builder& experimental(bool val)
{
options._experimental = val;
return *this;
}

/**
* @brief Set whether to parse dates as DD/MM versus MM/DD.
*
Expand Down
776 changes: 686 additions & 90 deletions cpp/src/io/json/host_tree_algorithms.cu

Large diffs are not rendered by default.

46 changes: 32 additions & 14 deletions cpp/src/io/json/json_column.cu
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ void print_tree(host_span<SymbolT const> input,
* max row offsets of columns
*/
std::tuple<tree_meta_t, rmm::device_uvector<NodeIndexT>, rmm::device_uvector<size_type>>
reduce_to_column_tree(tree_meta_t& tree,
reduce_to_column_tree(tree_meta_t const& tree,
device_span<NodeIndexT const> original_col_ids,
device_span<NodeIndexT const> sorted_col_ids,
device_span<NodeIndexT const> ordered_node_ids,
Expand Down Expand Up @@ -317,14 +317,21 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
// Note: json_col modified here, moves this memory
};

auto get_child_schema = [schema](auto child_name) -> std::optional<schema_element> {
auto get_child_schema = [&schema](auto child_name) -> std::optional<schema_element> {
// Looks up `child_name` in the child_types of the captured `schema`.
// `schema` is captured by reference, so it is not copied into the closure.
if (schema.has_value()) {
auto const result = schema.value().child_types.find(child_name);
if (result != std::end(schema.value().child_types)) { return result->second; }
}
// No schema was supplied, or it has no entry for `child_name`.
return {};
};

auto get_list_child_schema = [&schema]() -> std::optional<schema_element> {
  // Returns the first entry of the captured schema's child_types, without any lookup by name.
  // Yields an empty optional when no schema was supplied or when it has no child entries.
  if (not schema.has_value() or schema.value().child_types.size() == 0) { return std::nullopt; }
  return schema.value().child_types.begin()->second;
};

switch (json_col.type) {
case json_col_t::StringColumn: {
// move string_offsets to GPU and transform to string column
Expand Down Expand Up @@ -439,9 +446,8 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
rmm::device_buffer{},
0);
// Create children column
auto child_schema_element = json_col.child_columns.empty()
? std::optional<schema_element>{}
: get_child_schema(json_col.child_columns.begin()->first);
auto child_schema_element =
json_col.child_columns.empty() ? std::optional<schema_element>{} : get_list_child_schema();
auto [child_column, names] =
json_col.child_columns.empty() or (prune_columns and !child_schema_element.has_value())
? std::pair<std::unique_ptr<column>,
Expand Down Expand Up @@ -479,6 +485,16 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
}
}

/**
 * @brief Forwards to either the experimental or the default `make_device_json_column`.
 *
 * @param experimental If true, `experimental::make_device_json_column` is called;
 * otherwise the unqualified `make_device_json_column` is called
 * @param args Arguments perfectly forwarded, unchanged, to the selected function
 * @return Whatever the selected function returns; because the return type is deduced,
 * both overloads must return the same type
 */
template <typename... Args>
auto make_device_json_column_dispatch(bool experimental, Args&&... args)
{
  if (not experimental) { return make_device_json_column(std::forward<Args>(args)...); }
  return experimental::make_device_json_column(std::forward<Args>(args)...);
}

table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
cudf::io::json_reader_options const& options,
rmm::cuda_stream_view stream,
Expand Down Expand Up @@ -524,6 +540,7 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
gpu_tree,
is_array_of_arrays,
options.is_enabled_lines(),
options.is_enabled_experimental(),
stream,
cudf::get_current_device_resource_ref());

Expand All @@ -536,15 +553,16 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
0);

// Get internal JSON column
make_device_json_column(d_input,
gpu_tree,
gpu_col_id,
gpu_row_offsets,
root_column,
is_array_of_arrays,
options,
stream,
mr);
make_device_json_column_dispatch(options.is_enabled_experimental(),
d_input,
gpu_tree,
gpu_col_id,
gpu_row_offsets,
root_column,
is_array_of_arrays,
options,
stream,
mr);

// data_root refers to the root column of the data represented by the given JSON string
auto& data_root =
Expand Down
Loading

0 comments on commit 12ee360

Please sign in to comment.