Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] JSON host tree algorithms #16545

Merged
merged 56 commits into from
Sep 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
56 commits
Select commit Hold shift + click to select a range
9eaacb3
impl
shrshi Aug 13, 2024
27f1cb6
formatting
shrshi Aug 13, 2024
4987f74
added mixed type support
shrshi Aug 13, 2024
65e147f
formatting
shrshi Aug 13, 2024
32e8619
Merge branch 'branch-24.10' of github.com:rapidsai/cudf into host-tre…
karthikeyann Sep 10, 2024
b30c43f
comments - unfinished
karthikeyann Sep 10, 2024
38819f2
very partial work; some comments
shrshi Sep 11, 2024
08cf338
struct column first try, basic tests pass
karthikeyann Sep 16, 2024
85983be
add support for array_of_arrays
karthikeyann Sep 16, 2024
e3fd1d5
fix vector of dtypes in struct json
karthikeyann Sep 17, 2024
dc25011
mixed type as string support added
karthikeyann Sep 18, 2024
d1ec9c7
forced nested type in mixed type data
karthikeyann Sep 18, 2024
ccfc6f6
style fixes
karthikeyann Sep 18, 2024
8fbb1d0
Merge branch 'branch-24.10' into host-tree-algorithms
karthikeyann Sep 18, 2024
ed0b354
cleanup
karthikeyann Sep 18, 2024
c3fcf8a
fix name for list child element as not element
karthikeyann Sep 18, 2024
a700865
reuse code
karthikeyann Sep 18, 2024
217c4d8
reorg code build_tree
karthikeyann Sep 19, 2024
7437653
pulled relevant changes from #16759
karthikeyann Sep 19, 2024
4eff9fc
code reorg: split to 3 functions
karthikeyann Sep 19, 2024
400df4b
split host functions to separate file
karthikeyann Sep 19, 2024
7f5fdf4
split new host algorithm to functions
karthikeyann Sep 19, 2024
10bddb8
Merge branch 'branch-24.10' of github.com:rapidsai/cudf into enh-json…
karthikeyann Sep 19, 2024
3762477
move code
karthikeyann Sep 19, 2024
6c3b681
revert to old call
karthikeyann Sep 19, 2024
ac9fa76
prepare for merge with reorg
karthikeyann Sep 19, 2024
638cb24
Merge branch 'enh-json_code_reorg1' of github.com:karthikeyann/cudf i…
karthikeyann Sep 19, 2024
583c576
fix merge issue
karthikeyann Sep 19, 2024
62085a8
use experimental build_tree
karthikeyann Sep 19, 2024
eab13b3
same code for both make_device_json_column
karthikeyann Sep 19, 2024
1f855b5
add profiling
karthikeyann Sep 19, 2024
c68c259
fix for mismatched forced type left uninitialized
karthikeyann Sep 19, 2024
4efa820
unprune base list in array of arrays when prune is enabled
karthikeyann Sep 20, 2024
69459bd
Merge branch 'branch-24.10' into host-tree-algorithms
karthikeyann Sep 20, 2024
4917115
Merge branch 'branch-24.10' of github.com:rapidsai/cudf into host-tre…
karthikeyann Sep 20, 2024
16f9acd
Merge branch 'branch-24.10' of github.com:rapidsai/cudf into host-tre…
karthikeyann Sep 23, 2024
3694860
add experimental option for new host tree algorithm
karthikeyann Sep 23, 2024
5b1bdf4
remove debug prints
karthikeyann Sep 23, 2024
79364a9
cleanup comments
karthikeyann Sep 23, 2024
be30c60
address review comments
karthikeyann Sep 23, 2024
19f39c2
address review comments
karthikeyann Sep 24, 2024
28ce878
Merge branch 'branch-24.10' into host-tree-algorithms
karthikeyann Sep 24, 2024
5da21d5
Merge branch 'branch-24.10' of github.com:rapidsai/cudf into host-tre…
karthikeyann Sep 24, 2024
833960f
Merge branch 'host-tree-algorithms' of github.com:shrshi/cudf into ho…
karthikeyann Sep 24, 2024
6b501f3
Java JSON APIs experimental option
karthikeyann Sep 24, 2024
7ec6ba1
address review comments
karthikeyann Sep 24, 2024
6f8a4e2
utf8 field name support (experimental)
karthikeyann Sep 24, 2024
8e27ab3
style fixes
karthikeyann Sep 24, 2024
f3ccdfa
stream safety fixes
karthikeyann Sep 24, 2024
2c06379
add more nosync policy
karthikeyann Sep 24, 2024
e5f6d2a
address review comments
karthikeyann Sep 25, 2024
c02193d
Merge branch 'branch-24.10' into host-tree-algorithms
karthikeyann Sep 25, 2024
4dbbaa5
fix order of experimental option
karthikeyann Sep 25, 2024
ea373b6
Merge branch 'branch-24.10' into host-tree-algorithms
karthikeyann Sep 25, 2024
d1cf095
Merge branch 'branch-24.10' into host-tree-algorithms
karthikeyann Sep 25, 2024
0c65921
add missing experimental argument
karthikeyann Sep 25, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions cpp/include/cudf/io/json.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,8 @@ class json_reader_options {
char _delimiter = '\n';
// Prune columns on read, selected based on the _dtypes option
bool _prune_columns = false;
// Experimental features: new column tree construction
bool _experimental = false;

// Bytes to skip from the start
size_t _byte_range_offset = 0;
Expand Down Expand Up @@ -277,6 +279,15 @@ class json_reader_options {
*/
[[nodiscard]] bool is_enabled_prune_columns() const { return _prune_columns; }

/**
* @brief Whether experimental features are enabled.
*
* When this returns true, experimental features such as the new column tree construction
* and utf-8 matching of field names are enabled.
*
* @return true if experimental features are enabled
*/
[[nodiscard]] bool is_enabled_experimental() const { return _experimental; }

/**
* @brief Whether to parse dates as DD/MM versus MM/DD.
*
Expand Down Expand Up @@ -453,6 +464,16 @@ class json_reader_options {
*/
void enable_prune_columns(bool val) { _prune_columns = val; }

/**
* @brief Set whether to enable experimental features.
*
* When set to true, experimental features such as the new column tree construction
* and utf-8 matching of field names will be enabled.
*
* @param val Boolean value to enable/disable experimental features
*/
void enable_experimental(bool val) { _experimental = val; }

/**
* @brief Set whether to parse dates as DD/MM versus MM/DD.
*
Expand Down Expand Up @@ -695,6 +716,21 @@ class json_reader_options_builder {
return *this;
}

/**
 * @brief Set whether to enable experimental features.
 *
 * When set to true, experimental features such as the new column tree construction
 * and utf-8 matching of field names will be enabled.
 *
 * @param val Boolean value to enable/disable experimental features
 * @return this for chaining
 */
json_reader_options_builder& experimental(bool val)
{
  // Writes straight into the options object being built; returning *this lets
  // calls be chained with the other builder setters.
  options._experimental = val;
  return *this;
}

/**
* @brief Set whether to parse dates as DD/MM versus MM/DD.
*
Expand Down
776 changes: 686 additions & 90 deletions cpp/src/io/json/host_tree_algorithms.cu

Large diffs are not rendered by default.

46 changes: 32 additions & 14 deletions cpp/src/io/json/json_column.cu
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ void print_tree(host_span<SymbolT const> input,
* max row offsets of columns
*/
std::tuple<tree_meta_t, rmm::device_uvector<NodeIndexT>, rmm::device_uvector<size_type>>
reduce_to_column_tree(tree_meta_t& tree,
reduce_to_column_tree(tree_meta_t const& tree,
device_span<NodeIndexT const> original_col_ids,
device_span<NodeIndexT const> sorted_col_ids,
device_span<NodeIndexT const> ordered_node_ids,
Expand Down Expand Up @@ -317,14 +317,21 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
// Note: json_col modified here, moves this memory
};

// Looks up the schema entry for the child named `child_name` in the enclosing
// column's schema. Returns an empty optional when no schema was provided or when
// the schema has no entry under that name.
// `schema` is captured by reference so that creating this helper does not copy
// the schema element and its child_types container.
auto get_child_schema = [&schema](auto child_name) -> std::optional<schema_element> {
  if (schema.has_value()) {
    auto const result = schema.value().child_types.find(child_name);
    if (result != std::end(schema.value().child_types)) { return result->second; }
  }
  return {};
};

// Schema lookup for the child of a list column. No name is consulted: when a
// schema is present and has at least one child entry, the first entry of
// child_types is returned; otherwise the result is an empty optional.
auto get_list_child_schema = [&schema]() -> std::optional<schema_element> {
  if (!schema.has_value()) { return {}; }
  auto const& children = schema.value().child_types;
  if (children.size() > 0) { return children.begin()->second; }
  return {};
};

switch (json_col.type) {
case json_col_t::StringColumn: {
// move string_offsets to GPU and transform to string column
Expand Down Expand Up @@ -439,9 +446,8 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
rmm::device_buffer{},
0);
// Create children column
auto child_schema_element = json_col.child_columns.empty()
? std::optional<schema_element>{}
: get_child_schema(json_col.child_columns.begin()->first);
auto child_schema_element =
json_col.child_columns.empty() ? std::optional<schema_element>{} : get_list_child_schema();
auto [child_column, names] =
json_col.child_columns.empty() or (prune_columns and !child_schema_element.has_value())
? std::pair<std::unique_ptr<column>,
Expand Down Expand Up @@ -479,6 +485,16 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
}
}

/**
 * @brief Selects between the default and the experimental `make_device_json_column`.
 *
 * All trailing arguments are perfectly forwarded, unchanged, to whichever
 * implementation is chosen.
 *
 * @param experimental If true, call `experimental::make_device_json_column`;
 * otherwise call `make_device_json_column`
 * @param args Arguments forwarded to the selected implementation
 * @return Whatever the selected implementation returns
 */
template <typename... Args>
auto make_device_json_column_dispatch(bool experimental, Args&&... args)
{
  // Default path first. In the qualified name below, `experimental::` resolves to
  // the namespace, not the bool parameter: lookup before `::` ignores variables.
  if (!experimental) { return make_device_json_column(std::forward<Args>(args)...); }
  return experimental::make_device_json_column(std::forward<Args>(args)...);
}

table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
cudf::io::json_reader_options const& options,
rmm::cuda_stream_view stream,
Expand Down Expand Up @@ -524,6 +540,7 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
gpu_tree,
is_array_of_arrays,
options.is_enabled_lines(),
options.is_enabled_experimental(),
stream,
cudf::get_current_device_resource_ref());

Expand All @@ -536,15 +553,16 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
0);

// Get internal JSON column
make_device_json_column(d_input,
gpu_tree,
gpu_col_id,
gpu_row_offsets,
root_column,
is_array_of_arrays,
options,
stream,
mr);
make_device_json_column_dispatch(options.is_enabled_experimental(),
d_input,
gpu_tree,
gpu_col_id,
gpu_row_offsets,
root_column,
is_array_of_arrays,
options,
stream,
mr);

// data_root refers to the root column of the data represented by the given JSON string
auto& data_root =
Expand Down
Loading
Loading