From 1ec96176c95ec6cce2cc024ae4a02a99330d9236 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Tue, 11 Jun 2024 19:40:30 +0000 Subject: [PATCH 01/28] added csr data struct --- cpp/src/io/json/json_column.cu | 290 ++++++++++++++++++++++++++++++++ cpp/src/io/json/nested_json.hpp | 30 ++++ 2 files changed, 320 insertions(+) diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 631f8adbd6d..3f3c6286045 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -296,6 +297,295 @@ reduce_to_column_tree(tree_meta_t& tree, std::move(max_row_offsets)}; } +/** + * @brief Returns stable sorted keys and its sorted order + * + * Uses cub stable radix sort. The order is internally generated, hence it saves a copy and memory. + * Since the key and order is returned, using double buffer helps to avoid extra copy to user + * provided output iterator. + * + * @tparam IndexType sorted order type + * @tparam KeyType key type + * @param keys keys to sort + * @param stream CUDA stream used for device memory operations and kernel launches. + * @return Sorted keys and indices producing that sorted order + */ +template +std::pair, rmm::device_uvector> stable_sorted_key_order( + cudf::device_span keys, rmm::cuda_stream_view stream) +{ + CUDF_FUNC_RANGE(); + + // Determine temporary device storage requirements + rmm::device_uvector keys_buffer1(keys.size(), stream); + rmm::device_uvector keys_buffer2(keys.size(), stream); + rmm::device_uvector order_buffer1(keys.size(), stream); + rmm::device_uvector order_buffer2(keys.size(), stream); + cub::DoubleBuffer order_buffer(order_buffer1.data(), order_buffer2.data()); + cub::DoubleBuffer keys_buffer(keys_buffer1.data(), keys_buffer2.data()); + size_t temp_storage_bytes = 0; + cub::DeviceRadixSort::SortPairs( + nullptr, temp_storage_bytes, keys_buffer, order_buffer, keys.size()); + rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); + + thrust::copy(rmm::exec_policy(stream), keys.begin(), keys.end(), keys_buffer1.begin()); + thrust::sequence(rmm::exec_policy(stream), order_buffer1.begin(), order_buffer1.end()); + + cub::DeviceRadixSort::SortPairs(d_temp_storage.data(), + temp_storage_bytes, + keys_buffer, + order_buffer, + keys.size(), + 0, + sizeof(KeyType) * 8, + stream.value()); + + return std::pair{keys_buffer.Current() == keys_buffer1.data() ? std::move(keys_buffer1) + : std::move(keys_buffer2), + order_buffer.Current() == order_buffer1.data() ? std::move(order_buffer1) + : std::move(order_buffer2)}; +} + +/** + * @brief Reduces node tree representation to column tree CSR representation. 
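+ *
+ * The adjacency of the column tree is returned in CSR form: `rowidx` (num_columns + 1 entries)
+ * marks where each column's entries start in `colidx`; a non-root column's entries hold its
+ * parent column id followed by its child column ids, while the root stores only its children.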
+ * + * @param tree Node tree representation of JSON string + * @param original_col_ids Column ids of nodes + * @param sorted_col_ids Sorted column ids of nodes + * @param ordered_node_ids Node ids of nodes sorted by column ids + * @param row_offsets Row offsets of nodes + * @param is_array_of_arrays Whether the tree is an array of arrays + * @param row_array_parent_col_id Column id of row array, if is_array_of_arrays is true + * @param stream CUDA stream used for device memory operations and kernel launches + * @return A tuple of column tree representation of JSON string, column ids of columns, and + * max row offsets of columns + */ +std::tuple> +reduce_to_column_tree_csr(tree_meta_t& tree, + device_span original_col_ids, + device_span sorted_col_ids, + device_span ordered_node_ids, + device_span row_offsets, + bool is_array_of_arrays, + NodeIndexT const row_array_parent_col_id, + rmm::cuda_stream_view stream) +{ + CUDF_FUNC_RANGE(); + // 1. column count for allocation + auto const num_columns = + thrust::unique_count(rmm::exec_policy(stream), sorted_col_ids.begin(), sorted_col_ids.end()); + + rmm::device_uvector unique_node_ids(num_columns, stream); + rmm::device_uvector csr_unique_node_ids(num_columns, stream); + rmm::device_uvector column_levels(num_columns, stream); + thrust::unique_by_key_copy(rmm::exec_policy(stream), + sorted_col_ids.begin(), + sorted_col_ids.end(), + ordered_node_ids.begin(), + thrust::make_discard_iterator(), + unique_node_ids.begin()); + thrust::copy_n(rmm::exec_policy(stream), thrust::make_permutation_iterator(tree.node_levels.begin(), unique_node_ids.begin()), unique_node_ids.size(), column_levels.begin()); + auto [sorted_column_levels, sorted_column_levels_order] = stable_sorted_key_order(column_levels, stream); + + // 2. reduce_by_key {col_id}, {row_offset}, max. + rmm::device_uvector unique_col_ids(num_columns, stream); + rmm::device_uvector max_row_offsets(num_columns, stream); + rmm::device_uvector csr_unique_col_ids(num_columns, stream); + rmm::device_uvector csr_max_row_offsets(num_columns, stream); + auto ordered_row_offsets = + thrust::make_permutation_iterator(row_offsets.begin(), ordered_node_ids.begin()); + thrust::reduce_by_key(rmm::exec_policy(stream), + sorted_col_ids.begin(), + sorted_col_ids.end(), + ordered_row_offsets, + unique_col_ids.begin(), + max_row_offsets.begin(), + thrust::equal_to(), + thrust::maximum()); + + // 3. reduce_by_key {col_id}, {node_categories} - custom opp (*+v=*, v+v=v, *+#=E) + rmm::device_uvector column_categories(num_columns, stream); + rmm::device_uvector csr_column_categories(num_columns, stream); + thrust::reduce_by_key( + rmm::exec_policy(stream), + sorted_col_ids.begin(), + sorted_col_ids.end(), + thrust::make_permutation_iterator(tree.node_categories.begin(), ordered_node_ids.begin()), + unique_col_ids.begin(), + column_categories.begin(), + thrust::equal_to(), + [] __device__(NodeT type_a, NodeT type_b) -> NodeT { + auto is_a_leaf = (type_a == NC_VAL || type_a == NC_STR); + auto is_b_leaf = (type_b == NC_VAL || type_b == NC_STR); + // (v+v=v, *+*=*, *+v=*, *+#=E, NESTED+VAL=NESTED) + // *+*=*, v+v=v + if (type_a == type_b) { + return type_a; + } else if (is_a_leaf) { + // *+v=*, N+V=N + // STRUCT/LIST + STR/VAL = STRUCT/LIST, STR/VAL + FN = ERR, STR/VAL + STR = STR + return type_b == NC_FN ? NC_ERR : (is_b_leaf ? NC_STR : type_b); + } else if (is_b_leaf) { + return type_a == NC_FN ? NC_ERR : (is_a_leaf ? 
NC_STR : type_a); + } + // *+#=E + return NC_ERR; + }); + + auto csr_permutation_it = thrust::make_zip_iterator(thrust::make_permutation_iterator(unique_node_ids.begin(), sorted_column_levels_order.begin()), thrust::make_permutation_iterator(unique_col_ids.begin(), sorted_column_levels_order.begin()), thrust::make_permutation_iterator(max_row_offsets.begin(), sorted_column_levels_order.begin()), thrust::make_permutation_iterator(column_categories.begin(), sorted_column_levels_order.begin())); + thrust::copy(rmm::exec_policy(stream), csr_permutation_it, csr_permutation_it + num_columns, thrust::make_zip_iterator(csr_unique_node_ids.begin(), csr_unique_col_ids.begin(), csr_max_row_offsets.begin(), csr_column_categories.begin())); + + // 4. unique_copy parent_node_ids, ranges + rmm::device_uvector csr_parent_col_ids(num_columns, stream); + rmm::device_uvector csr_col_range_begin(num_columns, stream); // Field names + rmm::device_uvector csr_col_range_end(num_columns, stream); + thrust::copy_n( + rmm::exec_policy(stream), + thrust::make_zip_iterator( + thrust::make_permutation_iterator(tree.parent_node_ids.begin(), csr_unique_node_ids.begin()), + thrust::make_permutation_iterator(tree.node_range_begin.begin(), csr_unique_node_ids.begin()), + thrust::make_permutation_iterator(tree.node_range_end.begin(), csr_unique_node_ids.begin())), + csr_unique_node_ids.size(), + thrust::make_zip_iterator( + csr_parent_col_ids.begin(), csr_col_range_begin.begin(), csr_col_range_end.begin())); + + // convert parent_node_ids to parent_col_ids + thrust::transform( + rmm::exec_policy(stream), + csr_parent_col_ids.begin(), + csr_parent_col_ids.end(), + csr_parent_col_ids.begin(), + [col_ids = original_col_ids.begin()] __device__(auto parent_node_id) -> size_type { + return parent_node_id == parent_node_sentinel ? parent_node_sentinel + : col_ids[parent_node_id]; + }); + + /* + CSR construction: + 1. Sort column levels and get their ordering + 2. For each column node coln iterated according to sorted_column_levels; do + a. Find nodes that have coln as the parent node -> set adj_coln + b. row idx[coln] = size of adj_coln + 1 + c. col idx[coln] = adj_coln U {parent_col_id[coln]} + */ + + rmm::device_uvector rowidx(num_columns + 1, stream); + thrust::fill(rmm::exec_policy(stream), rowidx.begin(), rowidx.end(), 0); + auto [sorted_csr_parent_col_ids, sorted_csr_parent_col_ids_order] = stable_sorted_key_order(csr_parent_col_ids, stream); + rmm::device_uvector non_leaf_nodes(num_columns, stream); + rmm::device_uvector non_leaf_adjacency(num_columns, stream); + thrust::reduce_by_key(rmm::exec_policy(stream), sorted_csr_parent_col_ids.begin(), sorted_csr_parent_col_ids.end(), thrust::make_constant_iterator(1), non_leaf_nodes.begin(), non_leaf_adjacency.begin(), thrust::equal_to()); + // Add the non_leaf_adjacency to rowidx at positions non_leaf_nodes + thrust::transform(rmm::exec_policy(stream), non_leaf_nodes.begin(), non_leaf_nodes.end(), thrust::make_constant_iterator(1), non_leaf_nodes.begin(), thrust::plus()); + thrust::scatter(rmm::exec_policy(stream), non_leaf_adjacency.begin(), non_leaf_adjacency.end(), non_leaf_nodes.begin(), rowidx.begin() + 1); + // We are discarding the parent of the root node. 
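+ // Per the construction sketch above, every column except the root needs one extra colidx slot
+ // for its parent edge (row idx[col] = #children + 1); the transform below adds that slot and
+ // the inclusive_scan turns the per-column counts into CSR offsets, giving 2 * (num_columns - 1)
+ // colidx entries overall.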
+ thrust::transform(rmm::exec_policy(stream), rowidx.begin() + 2, rowidx.end(), thrust::make_constant_iterator(1), rowidx.begin() + 1, thrust::plus()); + thrust::inclusive_scan(rmm::exec_policy(stream), rowidx.begin() + 1, rowidx.end(), rowidx.begin() + 1); + + rmm::device_uvector colidx((num_columns - 1) * 2, stream); + thrust::scatter(rmm::exec_policy(stream), csr_parent_col_ids.begin(), csr_parent_col_ids.end(), rowidx.begin() + 1, colidx.begin()); + rmm::device_uvector map((num_columns - 1) * 2, stream); + thrust::sequence(rmm::exec_policy(stream), map.begin(), map.end()); + rmm::device_uvector stencil((num_columns - 1) * 2, stream); + thrust::fill(rmm::exec_policy(stream), stencil.begin(), stencil.end(), 1); + thrust::scatter(rmm::exec_policy(stream), thrust::make_constant_iterator(0), thrust::make_constant_iterator(0) + num_columns, rowidx.begin() + 1, stencil.begin()); + thrust::scatter_if(rmm::exec_policy(stream), sorted_csr_parent_col_ids.begin() + 1, sorted_csr_parent_col_ids.end(), map.begin(), stencil.begin(), colidx.begin()); + + // condition is true if parent is not a list, or sentinel/root + // Special case to return true if parent is a list and is_array_of_arrays is true + auto is_non_list_parent = [column_categories = column_categories.begin(), + is_array_of_arrays, + row_array_parent_col_id] __device__(auto parent_col_id) -> bool { + return !(parent_col_id == parent_node_sentinel || + column_categories[parent_col_id] == NC_LIST && + (!is_array_of_arrays || parent_col_id != row_array_parent_col_id)); + }; + // Mixed types in List children go to different columns, + // so all immediate children of list column should have same max_row_offsets. + // create list's children max_row_offsets array. (initialize to zero) + // atomicMax on children max_row_offsets array. + // gather the max_row_offsets from children row offset array. + { + rmm::device_uvector list_parents_children_max_row_offsets(num_columns, stream); + thrust::fill(rmm::exec_policy(stream), + list_parents_children_max_row_offsets.begin(), + list_parents_children_max_row_offsets.end(), + 0); + thrust::for_each(rmm::exec_policy(stream), + csr_unique_col_ids.begin(), + csr_unique_col_ids.end(), + [csr_column_categories = csr_column_categories.begin(), + csr_parent_col_ids = csr_parent_col_ids.begin(), + csr_max_row_offsets = csr_max_row_offsets.begin(), + list_parents_children_max_row_offsets = + list_parents_children_max_row_offsets.begin()] __device__(auto col_id) { + auto csr_parent_col_id = csr_parent_col_ids[col_id]; + if (csr_parent_col_id != parent_node_sentinel and + csr_column_categories[csr_parent_col_id] == node_t::NC_LIST) { + cuda::atomic_ref ref{ + *(list_parents_children_max_row_offsets + csr_parent_col_id)}; + ref.fetch_max(csr_max_row_offsets[col_id], cuda::std::memory_order_relaxed); + } + }); + thrust::gather_if( + rmm::exec_policy(stream), + csr_parent_col_ids.begin(), + csr_parent_col_ids.end(), + csr_parent_col_ids.begin(), + list_parents_children_max_row_offsets.begin(), + csr_max_row_offsets.begin(), + [csr_column_categories = csr_column_categories.begin()] __device__(size_type parent_col_id) { + return parent_col_id != parent_node_sentinel and + csr_column_categories[parent_col_id] == node_t::NC_LIST; + }); + } + + // copy lists' max_row_offsets to children. + // all structs should have same size. 
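+ // For a column whose parent is a struct, climb the parent chain until reaching a column whose
+ // parent is a list (or the sentinel/root) and reuse that column's max_row_offset, so all
+ // columns nested in the same struct report the same number of rows.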
+ thrust::transform_if( + rmm::exec_policy(stream), + csr_unique_col_ids.begin(), + csr_unique_col_ids.end(), + csr_max_row_offsets.begin(), + [csr_column_categories = csr_column_categories.begin(), + is_non_list_parent, + csr_parent_col_ids = csr_parent_col_ids.begin(), + csr_max_row_offsets = csr_max_row_offsets.begin()] __device__(size_type col_id) { + auto parent_col_id = csr_parent_col_ids[col_id]; + // condition is true if parent is not a list, or sentinel/root + while (is_non_list_parent(parent_col_id)) { + col_id = parent_col_id; + parent_col_id = csr_parent_col_ids[parent_col_id]; + } + return csr_max_row_offsets[col_id]; + }, + [csr_column_categories = csr_column_categories.begin(), + is_non_list_parent, + parent_col_ids = csr_parent_col_ids.begin()] __device__(size_type col_id) { + auto parent_col_id = parent_col_ids[col_id]; + // condition is true if parent is not a list, or sentinel/root + return is_non_list_parent(parent_col_id); + }); + + // For Struct and List (to avoid copying entire strings when mixed type as string is enabled) + thrust::transform_if( + rmm::exec_policy(stream), + csr_col_range_begin.begin(), + csr_col_range_begin.end(), + csr_column_categories.begin(), + csr_col_range_end.begin(), + [] __device__(auto i) { return i + 1; }, + [] __device__(NodeT type) { return type == NC_STRUCT || type == NC_LIST; }); + + return std::tuple{column_tree_csr{std::move(rowidx), + std::move(colidx), + std::move(csr_unique_col_ids), + std::move(csr_column_categories), + std::move(csr_col_range_begin), + std::move(csr_col_range_end)}, + std::move(csr_max_row_offsets)}; +} + /** * @brief Get the column indices for the values column for array of arrays rows * diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index e12892a2d50..c557cbd1063 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -44,6 +44,20 @@ struct tree_meta_t { rmm::device_uvector node_range_end; }; +struct column_tree_csr { + //position of nnzs + rmm::device_uvector rowidx; + rmm::device_uvector colidx; + //node properties + rmm::device_uvector column_ids; + rmm::device_uvector categories; + rmm::device_uvector range_begin; + rmm::device_uvector range_end; + std::vector ignore_vals; + std::vector is_mixed_type_column; + std::vector is_pruned; +}; + /** * @brief A column type */ @@ -294,6 +308,22 @@ reduce_to_column_tree(tree_meta_t& tree, device_span row_offsets, rmm::cuda_stream_view stream); +/** + * @brief Reduce node tree into column tree by aggregating each property of column. 
+ * + * @param tree json node tree to reduce (modified in-place, but restored to original state) + * @param col_ids column ids of each node (modified in-place, but restored to original state) + * @param row_offsets row offsets of each node (modified in-place, but restored to original state) + * @param stream The CUDA stream to which kernels are dispatched + * @return A tuple containing the column tree, identifier for each column and the maximum row index + * in each column + */ +std::tuple> +reduce_to_column_tree_csr(tree_meta_t& tree, + device_span col_ids, + device_span row_offsets, + rmm::cuda_stream_view stream); + /** * @brief Retrieves the parse_options to be used for type inference and type casting * From 022d7ce15302cf016e8e598aa843e4131895e07f Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Tue, 11 Jun 2024 19:47:05 +0000 Subject: [PATCH 02/28] formatting --- cpp/src/io/json/json_column.cu | 108 +++++++++++++++++++++++--------- cpp/src/io/json/nested_json.hpp | 14 ++--- 2 files changed, 87 insertions(+), 35 deletions(-) diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 3f3c6286045..6e9c590e501 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -360,15 +360,15 @@ std::pair, rmm::device_uvector> stable_s * @return A tuple of column tree representation of JSON string, column ids of columns, and * max row offsets of columns */ -std::tuple> -reduce_to_column_tree_csr(tree_meta_t& tree, - device_span original_col_ids, - device_span sorted_col_ids, - device_span ordered_node_ids, - device_span row_offsets, - bool is_array_of_arrays, - NodeIndexT const row_array_parent_col_id, - rmm::cuda_stream_view stream) +std::tuple> reduce_to_column_tree_csr( + tree_meta_t& tree, + device_span original_col_ids, + device_span sorted_col_ids, + device_span ordered_node_ids, + device_span row_offsets, + bool is_array_of_arrays, + NodeIndexT const row_array_parent_col_id, + rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); // 1. column count for allocation @@ -384,8 +384,13 @@ reduce_to_column_tree_csr(tree_meta_t& tree, ordered_node_ids.begin(), thrust::make_discard_iterator(), unique_node_ids.begin()); - thrust::copy_n(rmm::exec_policy(stream), thrust::make_permutation_iterator(tree.node_levels.begin(), unique_node_ids.begin()), unique_node_ids.size(), column_levels.begin()); - auto [sorted_column_levels, sorted_column_levels_order] = stable_sorted_key_order(column_levels, stream); + thrust::copy_n( + rmm::exec_policy(stream), + thrust::make_permutation_iterator(tree.node_levels.begin(), unique_node_ids.begin()), + unique_node_ids.size(), + column_levels.begin()); + auto [sorted_column_levels, sorted_column_levels_order] = + stable_sorted_key_order(column_levels, stream); // 2. reduce_by_key {col_id}, {row_offset}, max. 
rmm::device_uvector unique_col_ids(num_columns, stream); @@ -432,8 +437,19 @@ reduce_to_column_tree_csr(tree_meta_t& tree, return NC_ERR; }); - auto csr_permutation_it = thrust::make_zip_iterator(thrust::make_permutation_iterator(unique_node_ids.begin(), sorted_column_levels_order.begin()), thrust::make_permutation_iterator(unique_col_ids.begin(), sorted_column_levels_order.begin()), thrust::make_permutation_iterator(max_row_offsets.begin(), sorted_column_levels_order.begin()), thrust::make_permutation_iterator(column_categories.begin(), sorted_column_levels_order.begin())); - thrust::copy(rmm::exec_policy(stream), csr_permutation_it, csr_permutation_it + num_columns, thrust::make_zip_iterator(csr_unique_node_ids.begin(), csr_unique_col_ids.begin(), csr_max_row_offsets.begin(), csr_column_categories.begin())); + auto csr_permutation_it = thrust::make_zip_iterator( + thrust::make_permutation_iterator(unique_node_ids.begin(), sorted_column_levels_order.begin()), + thrust::make_permutation_iterator(unique_col_ids.begin(), sorted_column_levels_order.begin()), + thrust::make_permutation_iterator(max_row_offsets.begin(), sorted_column_levels_order.begin()), + thrust::make_permutation_iterator(column_categories.begin(), + sorted_column_levels_order.begin())); + thrust::copy(rmm::exec_policy(stream), + csr_permutation_it, + csr_permutation_it + num_columns, + thrust::make_zip_iterator(csr_unique_node_ids.begin(), + csr_unique_col_ids.begin(), + csr_max_row_offsets.begin(), + csr_column_categories.begin())); // 4. unique_copy parent_node_ids, ranges rmm::device_uvector csr_parent_col_ids(num_columns, stream); @@ -471,25 +487,60 @@ reduce_to_column_tree_csr(tree_meta_t& tree, rmm::device_uvector rowidx(num_columns + 1, stream); thrust::fill(rmm::exec_policy(stream), rowidx.begin(), rowidx.end(), 0); - auto [sorted_csr_parent_col_ids, sorted_csr_parent_col_ids_order] = stable_sorted_key_order(csr_parent_col_ids, stream); + auto [sorted_csr_parent_col_ids, sorted_csr_parent_col_ids_order] = + stable_sorted_key_order(csr_parent_col_ids, stream); rmm::device_uvector non_leaf_nodes(num_columns, stream); rmm::device_uvector non_leaf_adjacency(num_columns, stream); - thrust::reduce_by_key(rmm::exec_policy(stream), sorted_csr_parent_col_ids.begin(), sorted_csr_parent_col_ids.end(), thrust::make_constant_iterator(1), non_leaf_nodes.begin(), non_leaf_adjacency.begin(), thrust::equal_to()); + thrust::reduce_by_key(rmm::exec_policy(stream), + sorted_csr_parent_col_ids.begin(), + sorted_csr_parent_col_ids.end(), + thrust::make_constant_iterator(1), + non_leaf_nodes.begin(), + non_leaf_adjacency.begin(), + thrust::equal_to()); // Add the non_leaf_adjacency to rowidx at positions non_leaf_nodes - thrust::transform(rmm::exec_policy(stream), non_leaf_nodes.begin(), non_leaf_nodes.end(), thrust::make_constant_iterator(1), non_leaf_nodes.begin(), thrust::plus()); - thrust::scatter(rmm::exec_policy(stream), non_leaf_adjacency.begin(), non_leaf_adjacency.end(), non_leaf_nodes.begin(), rowidx.begin() + 1); + thrust::transform(rmm::exec_policy(stream), + non_leaf_nodes.begin(), + non_leaf_nodes.end(), + thrust::make_constant_iterator(1), + non_leaf_nodes.begin(), + thrust::plus()); + thrust::scatter(rmm::exec_policy(stream), + non_leaf_adjacency.begin(), + non_leaf_adjacency.end(), + non_leaf_nodes.begin(), + rowidx.begin() + 1); // We are discarding the parent of the root node. 
- thrust::transform(rmm::exec_policy(stream), rowidx.begin() + 2, rowidx.end(), thrust::make_constant_iterator(1), rowidx.begin() + 1, thrust::plus()); - thrust::inclusive_scan(rmm::exec_policy(stream), rowidx.begin() + 1, rowidx.end(), rowidx.begin() + 1); + thrust::transform(rmm::exec_policy(stream), + rowidx.begin() + 2, + rowidx.end(), + thrust::make_constant_iterator(1), + rowidx.begin() + 1, + thrust::plus()); + thrust::inclusive_scan( + rmm::exec_policy(stream), rowidx.begin() + 1, rowidx.end(), rowidx.begin() + 1); rmm::device_uvector colidx((num_columns - 1) * 2, stream); - thrust::scatter(rmm::exec_policy(stream), csr_parent_col_ids.begin(), csr_parent_col_ids.end(), rowidx.begin() + 1, colidx.begin()); + thrust::scatter(rmm::exec_policy(stream), + csr_parent_col_ids.begin(), + csr_parent_col_ids.end(), + rowidx.begin() + 1, + colidx.begin()); rmm::device_uvector map((num_columns - 1) * 2, stream); thrust::sequence(rmm::exec_policy(stream), map.begin(), map.end()); rmm::device_uvector stencil((num_columns - 1) * 2, stream); thrust::fill(rmm::exec_policy(stream), stencil.begin(), stencil.end(), 1); - thrust::scatter(rmm::exec_policy(stream), thrust::make_constant_iterator(0), thrust::make_constant_iterator(0) + num_columns, rowidx.begin() + 1, stencil.begin()); - thrust::scatter_if(rmm::exec_policy(stream), sorted_csr_parent_col_ids.begin() + 1, sorted_csr_parent_col_ids.end(), map.begin(), stencil.begin(), colidx.begin()); + thrust::scatter(rmm::exec_policy(stream), + thrust::make_constant_iterator(0), + thrust::make_constant_iterator(0) + num_columns, + rowidx.begin() + 1, + stencil.begin()); + thrust::scatter_if(rmm::exec_policy(stream), + sorted_csr_parent_col_ids.begin() + 1, + sorted_csr_parent_col_ids.end(), + map.begin(), + stencil.begin(), + colidx.begin()); // condition is true if parent is not a list, or sentinel/root // Special case to return true if parent is a list and is_array_of_arrays is true @@ -524,7 +575,8 @@ reduce_to_column_tree_csr(tree_meta_t& tree, csr_column_categories[csr_parent_col_id] == node_t::NC_LIST) { cuda::atomic_ref ref{ *(list_parents_children_max_row_offsets + csr_parent_col_id)}; - ref.fetch_max(csr_max_row_offsets[col_id], cuda::std::memory_order_relaxed); + ref.fetch_max(csr_max_row_offsets[col_id], + cuda::std::memory_order_relaxed); } }); thrust::gather_if( @@ -578,11 +630,11 @@ reduce_to_column_tree_csr(tree_meta_t& tree, [] __device__(NodeT type) { return type == NC_STRUCT || type == NC_LIST; }); return std::tuple{column_tree_csr{std::move(rowidx), - std::move(colidx), - std::move(csr_unique_col_ids), - std::move(csr_column_categories), - std::move(csr_col_range_begin), - std::move(csr_col_range_end)}, + std::move(colidx), + std::move(csr_unique_col_ids), + std::move(csr_column_categories), + std::move(csr_col_range_begin), + std::move(csr_col_range_end)}, std::move(csr_max_row_offsets)}; } diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index c557cbd1063..2997858839a 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -45,10 +45,10 @@ struct tree_meta_t { }; struct column_tree_csr { - //position of nnzs + // position of nnzs rmm::device_uvector rowidx; rmm::device_uvector colidx; - //node properties + // node properties rmm::device_uvector column_ids; rmm::device_uvector categories; rmm::device_uvector range_begin; @@ -318,11 +318,11 @@ reduce_to_column_tree(tree_meta_t& tree, * @return A tuple containing the column tree, identifier for each column and the maximum row 
index * in each column */ -std::tuple> -reduce_to_column_tree_csr(tree_meta_t& tree, - device_span col_ids, - device_span row_offsets, - rmm::cuda_stream_view stream); +std::tuple> reduce_to_column_tree_csr( + tree_meta_t& tree, + device_span col_ids, + device_span row_offsets, + rmm::cuda_stream_view stream); /** * @brief Retrieves the parse_options to be used for type inference and type casting From 382633f816ba1aa34d7bb0be6a1e3881a617fe3a Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Tue, 25 Jun 2024 00:55:25 +0000 Subject: [PATCH 03/28] added test --- cpp/src/io/json/json_column.cu | 50 +-------- cpp/src/io/json/json_tree.cu | 51 +--------- cpp/src/io/json/json_utils.hpp | 81 +++++++++++++++ cpp/src/io/json/nested_json.hpp | 19 +++- cpp/tests/CMakeLists.txt | 1 + cpp/tests/io/json_tree_csr.cu | 174 ++++++++++++++++++++++++++++++++ 6 files changed, 276 insertions(+), 100 deletions(-) create mode 100644 cpp/src/io/json/json_utils.hpp create mode 100644 cpp/tests/io/json_tree_csr.cu diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 6e9c590e501..854c3796245 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -17,6 +17,7 @@ #include "io/utilities/parsing_utils.cuh" #include "io/utilities/string_parsing.hpp" #include "nested_json.hpp" +#include "json_utils.hpp" #include #include @@ -297,55 +298,6 @@ reduce_to_column_tree(tree_meta_t& tree, std::move(max_row_offsets)}; } -/** - * @brief Returns stable sorted keys and its sorted order - * - * Uses cub stable radix sort. The order is internally generated, hence it saves a copy and memory. - * Since the key and order is returned, using double buffer helps to avoid extra copy to user - * provided output iterator. - * - * @tparam IndexType sorted order type - * @tparam KeyType key type - * @param keys keys to sort - * @param stream CUDA stream used for device memory operations and kernel launches. - * @return Sorted keys and indices producing that sorted order - */ -template -std::pair, rmm::device_uvector> stable_sorted_key_order( - cudf::device_span keys, rmm::cuda_stream_view stream) -{ - CUDF_FUNC_RANGE(); - - // Determine temporary device storage requirements - rmm::device_uvector keys_buffer1(keys.size(), stream); - rmm::device_uvector keys_buffer2(keys.size(), stream); - rmm::device_uvector order_buffer1(keys.size(), stream); - rmm::device_uvector order_buffer2(keys.size(), stream); - cub::DoubleBuffer order_buffer(order_buffer1.data(), order_buffer2.data()); - cub::DoubleBuffer keys_buffer(keys_buffer1.data(), keys_buffer2.data()); - size_t temp_storage_bytes = 0; - cub::DeviceRadixSort::SortPairs( - nullptr, temp_storage_bytes, keys_buffer, order_buffer, keys.size()); - rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); - - thrust::copy(rmm::exec_policy(stream), keys.begin(), keys.end(), keys_buffer1.begin()); - thrust::sequence(rmm::exec_policy(stream), order_buffer1.begin(), order_buffer1.end()); - - cub::DeviceRadixSort::SortPairs(d_temp_storage.data(), - temp_storage_bytes, - keys_buffer, - order_buffer, - keys.size(), - 0, - sizeof(KeyType) * 8, - stream.value()); - - return std::pair{keys_buffer.Current() == keys_buffer1.data() ? std::move(keys_buffer1) - : std::move(keys_buffer2), - order_buffer.Current() == order_buffer1.data() ? std::move(order_buffer1) - : std::move(order_buffer2)}; -} - /** * @brief Reduces node tree representation to column tree CSR representation. 
* diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index ad807b57766..7d9d926bdd2 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -16,6 +16,7 @@ #include "io/utilities/hostdevice_vector.hpp" #include "nested_json.hpp" +#include "json_utils.hpp" #include #include @@ -33,7 +34,6 @@ #include #include -#include #include #include #include @@ -139,55 +139,6 @@ struct is_nested_end { } }; -/** - * @brief Returns stable sorted keys and its sorted order - * - * Uses cub stable radix sort. The order is internally generated, hence it saves a copy and memory. - * Since the key and order is returned, using double buffer helps to avoid extra copy to user - * provided output iterator. - * - * @tparam IndexType sorted order type - * @tparam KeyType key type - * @param keys keys to sort - * @param stream CUDA stream used for device memory operations and kernel launches. - * @return Sorted keys and indices producing that sorted order - */ -template -std::pair, rmm::device_uvector> stable_sorted_key_order( - cudf::device_span keys, rmm::cuda_stream_view stream) -{ - CUDF_FUNC_RANGE(); - - // Determine temporary device storage requirements - rmm::device_uvector keys_buffer1(keys.size(), stream); - rmm::device_uvector keys_buffer2(keys.size(), stream); - rmm::device_uvector order_buffer1(keys.size(), stream); - rmm::device_uvector order_buffer2(keys.size(), stream); - cub::DoubleBuffer order_buffer(order_buffer1.data(), order_buffer2.data()); - cub::DoubleBuffer keys_buffer(keys_buffer1.data(), keys_buffer2.data()); - size_t temp_storage_bytes = 0; - cub::DeviceRadixSort::SortPairs( - nullptr, temp_storage_bytes, keys_buffer, order_buffer, keys.size()); - rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); - - thrust::copy(rmm::exec_policy(stream), keys.begin(), keys.end(), keys_buffer1.begin()); - thrust::sequence(rmm::exec_policy(stream), order_buffer1.begin(), order_buffer1.end()); - - cub::DeviceRadixSort::SortPairs(d_temp_storage.data(), - temp_storage_bytes, - keys_buffer, - order_buffer, - keys.size(), - 0, - sizeof(KeyType) * 8, - stream.value()); - - return std::pair{keys_buffer.Current() == keys_buffer1.data() ? std::move(keys_buffer1) - : std::move(keys_buffer2), - order_buffer.Current() == order_buffer1.data() ? std::move(order_buffer1) - : std::move(order_buffer2)}; -} - /** * @brief Propagate parent node from first sibling to other siblings. * diff --git a/cpp/src/io/json/json_utils.hpp b/cpp/src/io/json/json_utils.hpp new file mode 100644 index 00000000000..80d8f2f9b0f --- /dev/null +++ b/cpp/src/io/json/json_utils.hpp @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace cudf::io::json::detail { +/** + * @brief Returns stable sorted keys and its sorted order + * + * Uses cub stable radix sort. 
The order is internally generated, hence it saves a copy and memory. + * Since the key and order is returned, using double buffer helps to avoid extra copy to user + * provided output iterator. + * + * @tparam IndexType sorted order type + * @tparam KeyType key type + * @param keys keys to sort + * @param stream CUDA stream used for device memory operations and kernel launches. + * @return Sorted keys and indices producing that sorted order + */ +template +std::pair, rmm::device_uvector> stable_sorted_key_order( + cudf::device_span keys, rmm::cuda_stream_view stream) +{ + CUDF_FUNC_RANGE(); + + // Determine temporary device storage requirements + rmm::device_uvector keys_buffer1(keys.size(), stream); + rmm::device_uvector keys_buffer2(keys.size(), stream); + rmm::device_uvector order_buffer1(keys.size(), stream); + rmm::device_uvector order_buffer2(keys.size(), stream); + cub::DoubleBuffer order_buffer(order_buffer1.data(), order_buffer2.data()); + cub::DoubleBuffer keys_buffer(keys_buffer1.data(), keys_buffer2.data()); + size_t temp_storage_bytes = 0; + cub::DeviceRadixSort::SortPairs( + nullptr, temp_storage_bytes, keys_buffer, order_buffer, keys.size()); + rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); + + thrust::copy(rmm::exec_policy(stream), keys.begin(), keys.end(), keys_buffer1.begin()); + thrust::sequence(rmm::exec_policy(stream), order_buffer1.begin(), order_buffer1.end()); + + cub::DeviceRadixSort::SortPairs(d_temp_storage.data(), + temp_storage_bytes, + keys_buffer, + order_buffer, + keys.size(), + 0, + sizeof(KeyType) * 8, + stream.value()); + + return std::pair{keys_buffer.Current() == keys_buffer1.data() ? std::move(keys_buffer1) + : std::move(keys_buffer2), + order_buffer.Current() == order_buffer1.data() ? std::move(order_buffer1) + : std::move(order_buffer2)}; +} + +} diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 2997858839a..8374c34db23 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -23,6 +23,8 @@ #include #include +#include +#include #include #include @@ -308,6 +310,16 @@ reduce_to_column_tree(tree_meta_t& tree, device_span row_offsets, rmm::cuda_stream_view stream); +std::tuple, rmm::device_uvector> +reduce_to_column_tree(tree_meta_t& tree, + device_span original_col_ids, + device_span sorted_col_ids, + device_span ordered_node_ids, + device_span row_offsets, + bool is_array_of_arrays, + NodeIndexT const row_array_parent_col_id, + rmm::cuda_stream_view stream); + /** * @brief Reduce node tree into column tree by aggregating each property of column. 
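 *
 * The CSR variant declared below returns the column adjacency as a column_tree_csr
 * (rowidx/colidx) plus each column's category, id, and range in the input, rather than a
 * tree_meta_t.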
* @@ -318,10 +330,15 @@ reduce_to_column_tree(tree_meta_t& tree, * @return A tuple containing the column tree, identifier for each column and the maximum row index * in each column */ + std::tuple> reduce_to_column_tree_csr( tree_meta_t& tree, - device_span col_ids, + device_span original_col_ids, + device_span sorted_col_ids, + device_span ordered_node_ids, device_span row_offsets, + bool is_array_of_arrays, + NodeIndexT const row_array_parent_col_id, rmm::cuda_stream_view stream); /** diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index c6ab8aa021a..4585b9deacc 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -316,6 +316,7 @@ ConfigureTest( ConfigureTest(JSON_WRITER_TEST io/json_writer.cpp) ConfigureTest(JSON_TYPE_CAST_TEST io/json_type_cast_test.cu) ConfigureTest(NESTED_JSON_TEST io/nested_json_test.cpp io/json_tree.cpp) +ConfigureTest(JSON_TREE_CSR io/json_tree_csr.cu) ConfigureTest(ARROW_IO_SOURCE_TEST io/arrow_io_source_test.cpp) ConfigureTest(MULTIBYTE_SPLIT_TEST io/text/multibyte_split_test.cpp) ConfigureTest(JSON_QUOTE_NORMALIZATION io/json_quote_normalization_test.cpp) diff --git a/cpp/tests/io/json_tree_csr.cu b/cpp/tests/io/json_tree_csr.cu new file mode 100644 index 00000000000..6a52370080c --- /dev/null +++ b/cpp/tests/io/json_tree_csr.cu @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "io/json/nested_json.hpp" + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +namespace cuio_json = cudf::io::json; + +struct h_tree_meta_t { + std::vector node_categories; + std::vector parent_node_ids; + std::vector node_range_begin; + std::vector node_range_end; +}; + +struct h_column_tree_csr { + // position of nnzs + std::vector rowidx; + std::vector colidx; + // node properties + std::vector column_ids; + std::vector categories; + std::vector range_begin; + std::vector range_end; +}; + +bool check_equality(cuio_json::tree_meta_t &d_a, cuio_json::column_tree_csr &d_b, rmm::cuda_stream_view stream) { + // convert from tree_meta_t to column_tree_csr + h_tree_meta_t a{ + cudf::detail::make_std_vector_async(d_a.node_categories, stream), + cudf::detail::make_std_vector_async(d_a.parent_node_ids, stream), + cudf::detail::make_std_vector_async(d_a.node_range_begin, stream), + cudf::detail::make_std_vector_async(d_a.node_range_end, stream) + }; + + h_column_tree_csr b{ + cudf::detail::make_std_vector_async(d_b.rowidx, stream), + cudf::detail::make_std_vector_async(d_b.colidx, stream), + cudf::detail::make_std_vector_async(d_b.column_ids, stream), + cudf::detail::make_std_vector_async(d_b.categories, stream), + cudf::detail::make_std_vector_async(d_b.range_begin, stream), + cudf::detail::make_std_vector_async(d_b.range_end, stream) + }; + + stream.synchronize(); + + auto num_nodes = a.parent_node_ids.size(); + if(b.rowidx.size() != num_nodes + 1) return false; + + for(auto pos = b.rowidx[0]; pos < b.rowidx[1]; pos++) { + auto v = b.colidx[pos]; + if(a.parent_node_ids[b.column_ids[v]] != b.column_ids[0]) return false; + } + for(size_t u = 1; u < num_nodes; u++) { + auto v = b.colidx[b.rowidx[u]]; + if(a.parent_node_ids[b.column_ids[u]] != b.column_ids[v]) return false; + for(auto pos = b.rowidx[u] + 1; pos < b.rowidx[u+1]; pos++) { + v = b.colidx[pos]; + if(a.parent_node_ids[b.column_ids[v]] != b.column_ids[u]) return false; + } + } + return true; +} + +struct JsonColumnTreeTests : public cudf::test::BaseFixture {}; + +TEST_F(JsonColumnTreeTests, SimpleLines) +{ + auto const stream = cudf::get_default_stream(); + std::string const input = + R"( {} + { "a": { "y" : 6, "z": [] }} + { "a" : { "x" : 8, "y": 9 }, "b" : {"x": 10 , "z": 11 }} )"; // Prepare input & output buffers + cudf::string_scalar d_scalar(input, true, stream); + auto d_input = cudf::device_span{d_scalar.data(), + static_cast(d_scalar.size())}; + + cudf::io::json_reader_options options{}; + options.enable_lines(true); + + // Parse the JSON and get the token stream + auto const [tokens_gpu, token_indices_gpu] = cudf::io::json::detail::get_token_stream( + d_input, options, stream, rmm::mr::get_current_device_resource()); + + // Get the JSON's tree representation + auto gpu_tree = cuio_json::detail::get_tree_representation( + tokens_gpu, token_indices_gpu, false, stream, rmm::mr::get_current_device_resource()); + + auto tup = + cuio_json::detail::records_orient_tree_traversal(d_input, + gpu_tree, + false, + options.is_enabled_lines(), + stream, + rmm::mr::get_current_device_resource()); + auto &gpu_col_id = std::get<0>(tup); + auto &gpu_row_offsets = std::get<1>(tup); + + auto const num_nodes = gpu_col_id.size(); + rmm::device_uvector sorted_col_ids(gpu_col_id.size(), stream); // make a copy + thrust::copy(rmm::exec_policy(stream), gpu_col_id.begin(), gpu_col_id.end(), sorted_col_ids.begin()); + + // sort by 
{col_id} on {node_ids} stable + rmm::device_uvector node_ids(gpu_col_id.size(), stream); + thrust::sequence(rmm::exec_policy(stream), node_ids.begin(), node_ids.end()); + thrust::stable_sort_by_key( + rmm::exec_policy(stream), sorted_col_ids.begin(), sorted_col_ids.end(), node_ids.begin()); + + cudf::size_type const row_array_parent_col_id = [&]() { + cudf::size_type value = cudf::io::json::parent_node_sentinel; + auto const list_node_index = options.is_enabled_lines() ? 0 : 1; + CUDF_CUDA_TRY(cudaMemcpyAsync(&value, + gpu_col_id.data() + list_node_index, + sizeof(cudf::size_type), + cudaMemcpyDefault, + stream.value())); + stream.synchronize(); + return value; + }(); + + auto [d_column_tree, d_unique_col_ids, d_max_row_offsets] = + cudf::io::json::detail::reduce_to_column_tree(gpu_tree, + gpu_col_id, + sorted_col_ids, + node_ids, + gpu_row_offsets, + false, + row_array_parent_col_id, + stream); + + auto [d_column_tree_csr, d_max_row_offsets_csr] = + cudf::io::json::detail::reduce_to_column_tree_csr(gpu_tree, + gpu_col_id, + sorted_col_ids, + node_ids, + gpu_row_offsets, + false, + row_array_parent_col_id, + stream); + + // assert equality between csr and meta formats + assert(check_equality(d_column_tree, d_column_tree_csr, stream)); +} From 1823854cea1fe1f01953f78e0329931c61596293 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Tue, 25 Jun 2024 00:56:27 +0000 Subject: [PATCH 04/28] formatting --- cpp/src/io/json/json_column.cu | 2 +- cpp/src/io/json/json_tree.cu | 2 +- cpp/src/io/json/json_utils.hpp | 9 +-- cpp/src/io/json/nested_json.hpp | 3 +- cpp/tests/io/json_tree_csr.cu | 107 ++++++++++++++++---------------- 5 files changed, 63 insertions(+), 60 deletions(-) diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 854c3796245..9fb9e83d08b 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -16,8 +16,8 @@ #include "io/utilities/parsing_utils.cuh" #include "io/utilities/string_parsing.hpp" -#include "nested_json.hpp" #include "json_utils.hpp" +#include "nested_json.hpp" #include #include diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index 7d9d926bdd2..5e0d2b389ba 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -15,8 +15,8 @@ */ #include "io/utilities/hostdevice_vector.hpp" -#include "nested_json.hpp" #include "json_utils.hpp" +#include "nested_json.hpp" #include #include diff --git a/cpp/src/io/json/json_utils.hpp b/cpp/src/io/json/json_utils.hpp index 80d8f2f9b0f..8864bde84d8 100644 --- a/cpp/src/io/json/json_utils.hpp +++ b/cpp/src/io/json/json_utils.hpp @@ -16,16 +16,17 @@ #pragma once -#include #include +#include #include #include #include #include -#include -#include #include +#include + +#include #include namespace cudf::io::json::detail { @@ -78,4 +79,4 @@ std::pair, rmm::device_uvector> stable_s : std::move(order_buffer2)}; } -} +} // namespace cudf::io::json::detail diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 8374c34db23..bdcc8a223f1 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -22,8 +22,9 @@ #include #include -#include #include +#include + #include #include diff --git a/cpp/tests/io/json_tree_csr.cu b/cpp/tests/io/json_tree_csr.cu index 6a52370080c..3bd64a8355d 100644 --- a/cpp/tests/io/json_tree_csr.cu +++ b/cpp/tests/io/json_tree_csr.cu @@ -22,18 +22,19 @@ #include #include -#include #include +#include #include #include #include #include #include + #include -#include 
#include +#include namespace cuio_json = cudf::io::json; @@ -55,39 +56,38 @@ struct h_column_tree_csr { std::vector range_end; }; -bool check_equality(cuio_json::tree_meta_t &d_a, cuio_json::column_tree_csr &d_b, rmm::cuda_stream_view stream) { +bool check_equality(cuio_json::tree_meta_t& d_a, + cuio_json::column_tree_csr& d_b, + rmm::cuda_stream_view stream) +{ // convert from tree_meta_t to column_tree_csr - h_tree_meta_t a{ - cudf::detail::make_std_vector_async(d_a.node_categories, stream), - cudf::detail::make_std_vector_async(d_a.parent_node_ids, stream), - cudf::detail::make_std_vector_async(d_a.node_range_begin, stream), - cudf::detail::make_std_vector_async(d_a.node_range_end, stream) - }; - - h_column_tree_csr b{ - cudf::detail::make_std_vector_async(d_b.rowidx, stream), - cudf::detail::make_std_vector_async(d_b.colidx, stream), - cudf::detail::make_std_vector_async(d_b.column_ids, stream), - cudf::detail::make_std_vector_async(d_b.categories, stream), - cudf::detail::make_std_vector_async(d_b.range_begin, stream), - cudf::detail::make_std_vector_async(d_b.range_end, stream) - }; + h_tree_meta_t a{cudf::detail::make_std_vector_async(d_a.node_categories, stream), + cudf::detail::make_std_vector_async(d_a.parent_node_ids, stream), + cudf::detail::make_std_vector_async(d_a.node_range_begin, stream), + cudf::detail::make_std_vector_async(d_a.node_range_end, stream)}; + + h_column_tree_csr b{cudf::detail::make_std_vector_async(d_b.rowidx, stream), + cudf::detail::make_std_vector_async(d_b.colidx, stream), + cudf::detail::make_std_vector_async(d_b.column_ids, stream), + cudf::detail::make_std_vector_async(d_b.categories, stream), + cudf::detail::make_std_vector_async(d_b.range_begin, stream), + cudf::detail::make_std_vector_async(d_b.range_end, stream)}; stream.synchronize(); auto num_nodes = a.parent_node_ids.size(); - if(b.rowidx.size() != num_nodes + 1) return false; + if (b.rowidx.size() != num_nodes + 1) return false; - for(auto pos = b.rowidx[0]; pos < b.rowidx[1]; pos++) { + for (auto pos = b.rowidx[0]; pos < b.rowidx[1]; pos++) { auto v = b.colidx[pos]; - if(a.parent_node_ids[b.column_ids[v]] != b.column_ids[0]) return false; + if (a.parent_node_ids[b.column_ids[v]] != b.column_ids[0]) return false; } - for(size_t u = 1; u < num_nodes; u++) { + for (size_t u = 1; u < num_nodes; u++) { auto v = b.colidx[b.rowidx[u]]; - if(a.parent_node_ids[b.column_ids[u]] != b.column_ids[v]) return false; - for(auto pos = b.rowidx[u] + 1; pos < b.rowidx[u+1]; pos++) { + if (a.parent_node_ids[b.column_ids[u]] != b.column_ids[v]) return false; + for (auto pos = b.rowidx[u] + 1; pos < b.rowidx[u + 1]; pos++) { v = b.colidx[pos]; - if(a.parent_node_ids[b.column_ids[v]] != b.column_ids[u]) return false; + if (a.parent_node_ids[b.column_ids[v]] != b.column_ids[u]) return false; } } return true; @@ -117,19 +117,20 @@ TEST_F(JsonColumnTreeTests, SimpleLines) auto gpu_tree = cuio_json::detail::get_tree_representation( tokens_gpu, token_indices_gpu, false, stream, rmm::mr::get_current_device_resource()); - auto tup = + auto tup = cuio_json::detail::records_orient_tree_traversal(d_input, - gpu_tree, - false, - options.is_enabled_lines(), - stream, - rmm::mr::get_current_device_resource()); - auto &gpu_col_id = std::get<0>(tup); - auto &gpu_row_offsets = std::get<1>(tup); - - auto const num_nodes = gpu_col_id.size(); + gpu_tree, + false, + options.is_enabled_lines(), + stream, + rmm::mr::get_current_device_resource()); + auto& gpu_col_id = std::get<0>(tup); + auto& gpu_row_offsets = std::get<1>(tup); + + 
auto const num_nodes = gpu_col_id.size(); rmm::device_uvector sorted_col_ids(gpu_col_id.size(), stream); // make a copy - thrust::copy(rmm::exec_policy(stream), gpu_col_id.begin(), gpu_col_id.end(), sorted_col_ids.begin()); + thrust::copy( + rmm::exec_policy(stream), gpu_col_id.begin(), gpu_col_id.end(), sorted_col_ids.begin()); // sort by {col_id} on {node_ids} stable rmm::device_uvector node_ids(gpu_col_id.size(), stream); @@ -138,7 +139,7 @@ TEST_F(JsonColumnTreeTests, SimpleLines) rmm::exec_policy(stream), sorted_col_ids.begin(), sorted_col_ids.end(), node_ids.begin()); cudf::size_type const row_array_parent_col_id = [&]() { - cudf::size_type value = cudf::io::json::parent_node_sentinel; + cudf::size_type value = cudf::io::json::parent_node_sentinel; auto const list_node_index = options.is_enabled_lines() ? 0 : 1; CUDF_CUDA_TRY(cudaMemcpyAsync(&value, gpu_col_id.data() + list_node_index, @@ -151,24 +152,24 @@ TEST_F(JsonColumnTreeTests, SimpleLines) auto [d_column_tree, d_unique_col_ids, d_max_row_offsets] = cudf::io::json::detail::reduce_to_column_tree(gpu_tree, - gpu_col_id, - sorted_col_ids, - node_ids, - gpu_row_offsets, - false, - row_array_parent_col_id, - stream); + gpu_col_id, + sorted_col_ids, + node_ids, + gpu_row_offsets, + false, + row_array_parent_col_id, + stream); auto [d_column_tree_csr, d_max_row_offsets_csr] = cudf::io::json::detail::reduce_to_column_tree_csr(gpu_tree, - gpu_col_id, - sorted_col_ids, - node_ids, - gpu_row_offsets, - false, - row_array_parent_col_id, - stream); - - // assert equality between csr and meta formats + gpu_col_id, + sorted_col_ids, + node_ids, + gpu_row_offsets, + false, + row_array_parent_col_id, + stream); + + // assert equality between csr and meta formats assert(check_equality(d_column_tree, d_column_tree_csr, stream)); } From 84a7749066539cd661586bafee4ed5a7c72ed95f Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Fri, 28 Jun 2024 17:10:41 +0000 Subject: [PATCH 05/28] fixing csr construction --- cpp/CMakeLists.txt | 1 + cpp/src/io/json/json_column.cu | 292 --------------------------------- cpp/tests/io/json_tree_csr.cu | 16 +- 3 files changed, 16 insertions(+), 293 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 5fd68bfb26c..2b94e273404 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -387,6 +387,7 @@ add_library( src/io/functions.cpp src/io/json/byte_range_info.cu src/io/json/json_column.cu + src/io/json/json_column_csr.cu src/io/json/json_normalization.cu src/io/json/json_tree.cu src/io/json/nested_json_gpu.cu diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 5d0c307a5bc..43c5b10c9a8 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -298,298 +298,6 @@ reduce_to_column_tree(tree_meta_t& tree, std::move(max_row_offsets)}; } -/** - * @brief Reduces node tree representation to column tree CSR representation. 
- * - * @param tree Node tree representation of JSON string - * @param original_col_ids Column ids of nodes - * @param sorted_col_ids Sorted column ids of nodes - * @param ordered_node_ids Node ids of nodes sorted by column ids - * @param row_offsets Row offsets of nodes - * @param is_array_of_arrays Whether the tree is an array of arrays - * @param row_array_parent_col_id Column id of row array, if is_array_of_arrays is true - * @param stream CUDA stream used for device memory operations and kernel launches - * @return A tuple of column tree representation of JSON string, column ids of columns, and - * max row offsets of columns - */ -std::tuple> reduce_to_column_tree_csr( - tree_meta_t& tree, - device_span original_col_ids, - device_span sorted_col_ids, - device_span ordered_node_ids, - device_span row_offsets, - bool is_array_of_arrays, - NodeIndexT const row_array_parent_col_id, - rmm::cuda_stream_view stream) -{ - CUDF_FUNC_RANGE(); - // 1. column count for allocation - auto const num_columns = - thrust::unique_count(rmm::exec_policy(stream), sorted_col_ids.begin(), sorted_col_ids.end()); - - rmm::device_uvector unique_node_ids(num_columns, stream); - rmm::device_uvector csr_unique_node_ids(num_columns, stream); - rmm::device_uvector column_levels(num_columns, stream); - thrust::unique_by_key_copy(rmm::exec_policy(stream), - sorted_col_ids.begin(), - sorted_col_ids.end(), - ordered_node_ids.begin(), - thrust::make_discard_iterator(), - unique_node_ids.begin()); - thrust::copy_n( - rmm::exec_policy(stream), - thrust::make_permutation_iterator(tree.node_levels.begin(), unique_node_ids.begin()), - unique_node_ids.size(), - column_levels.begin()); - auto [sorted_column_levels, sorted_column_levels_order] = - stable_sorted_key_order(column_levels, stream); - - // 2. reduce_by_key {col_id}, {row_offset}, max. - rmm::device_uvector unique_col_ids(num_columns, stream); - rmm::device_uvector max_row_offsets(num_columns, stream); - rmm::device_uvector csr_unique_col_ids(num_columns, stream); - rmm::device_uvector csr_max_row_offsets(num_columns, stream); - auto ordered_row_offsets = - thrust::make_permutation_iterator(row_offsets.begin(), ordered_node_ids.begin()); - thrust::reduce_by_key(rmm::exec_policy(stream), - sorted_col_ids.begin(), - sorted_col_ids.end(), - ordered_row_offsets, - unique_col_ids.begin(), - max_row_offsets.begin(), - thrust::equal_to(), - thrust::maximum()); - - // 3. reduce_by_key {col_id}, {node_categories} - custom opp (*+v=*, v+v=v, *+#=E) - rmm::device_uvector column_categories(num_columns, stream); - rmm::device_uvector csr_column_categories(num_columns, stream); - thrust::reduce_by_key( - rmm::exec_policy(stream), - sorted_col_ids.begin(), - sorted_col_ids.end(), - thrust::make_permutation_iterator(tree.node_categories.begin(), ordered_node_ids.begin()), - unique_col_ids.begin(), - column_categories.begin(), - thrust::equal_to(), - [] __device__(NodeT type_a, NodeT type_b) -> NodeT { - auto is_a_leaf = (type_a == NC_VAL || type_a == NC_STR); - auto is_b_leaf = (type_b == NC_VAL || type_b == NC_STR); - // (v+v=v, *+*=*, *+v=*, *+#=E, NESTED+VAL=NESTED) - // *+*=*, v+v=v - if (type_a == type_b) { - return type_a; - } else if (is_a_leaf) { - // *+v=*, N+V=N - // STRUCT/LIST + STR/VAL = STRUCT/LIST, STR/VAL + FN = ERR, STR/VAL + STR = STR - return type_b == NC_FN ? NC_ERR : (is_b_leaf ? NC_STR : type_b); - } else if (is_b_leaf) { - return type_a == NC_FN ? NC_ERR : (is_a_leaf ? 
NC_STR : type_a); - } - // *+#=E - return NC_ERR; - }); - - auto csr_permutation_it = thrust::make_zip_iterator( - thrust::make_permutation_iterator(unique_node_ids.begin(), sorted_column_levels_order.begin()), - thrust::make_permutation_iterator(unique_col_ids.begin(), sorted_column_levels_order.begin()), - thrust::make_permutation_iterator(max_row_offsets.begin(), sorted_column_levels_order.begin()), - thrust::make_permutation_iterator(column_categories.begin(), - sorted_column_levels_order.begin())); - thrust::copy(rmm::exec_policy(stream), - csr_permutation_it, - csr_permutation_it + num_columns, - thrust::make_zip_iterator(csr_unique_node_ids.begin(), - csr_unique_col_ids.begin(), - csr_max_row_offsets.begin(), - csr_column_categories.begin())); - - // 4. unique_copy parent_node_ids, ranges - rmm::device_uvector csr_parent_col_ids(num_columns, stream); - rmm::device_uvector csr_col_range_begin(num_columns, stream); // Field names - rmm::device_uvector csr_col_range_end(num_columns, stream); - thrust::copy_n( - rmm::exec_policy(stream), - thrust::make_zip_iterator( - thrust::make_permutation_iterator(tree.parent_node_ids.begin(), csr_unique_node_ids.begin()), - thrust::make_permutation_iterator(tree.node_range_begin.begin(), csr_unique_node_ids.begin()), - thrust::make_permutation_iterator(tree.node_range_end.begin(), csr_unique_node_ids.begin())), - csr_unique_node_ids.size(), - thrust::make_zip_iterator( - csr_parent_col_ids.begin(), csr_col_range_begin.begin(), csr_col_range_end.begin())); - - // convert parent_node_ids to parent_col_ids - thrust::transform( - rmm::exec_policy(stream), - csr_parent_col_ids.begin(), - csr_parent_col_ids.end(), - csr_parent_col_ids.begin(), - [col_ids = original_col_ids.begin()] __device__(auto parent_node_id) -> size_type { - return parent_node_id == parent_node_sentinel ? parent_node_sentinel - : col_ids[parent_node_id]; - }); - - /* - CSR construction: - 1. Sort column levels and get their ordering - 2. For each column node coln iterated according to sorted_column_levels; do - a. Find nodes that have coln as the parent node -> set adj_coln - b. row idx[coln] = size of adj_coln + 1 - c. col idx[coln] = adj_coln U {parent_col_id[coln]} - */ - - rmm::device_uvector rowidx(num_columns + 1, stream); - thrust::fill(rmm::exec_policy(stream), rowidx.begin(), rowidx.end(), 0); - auto [sorted_csr_parent_col_ids, sorted_csr_parent_col_ids_order] = - stable_sorted_key_order(csr_parent_col_ids, stream); - rmm::device_uvector non_leaf_nodes(num_columns, stream); - rmm::device_uvector non_leaf_adjacency(num_columns, stream); - thrust::reduce_by_key(rmm::exec_policy(stream), - sorted_csr_parent_col_ids.begin(), - sorted_csr_parent_col_ids.end(), - thrust::make_constant_iterator(1), - non_leaf_nodes.begin(), - non_leaf_adjacency.begin(), - thrust::equal_to()); - // Add the non_leaf_adjacency to rowidx at positions non_leaf_nodes - thrust::transform(rmm::exec_policy(stream), - non_leaf_nodes.begin(), - non_leaf_nodes.end(), - thrust::make_constant_iterator(1), - non_leaf_nodes.begin(), - thrust::plus()); - thrust::scatter(rmm::exec_policy(stream), - non_leaf_adjacency.begin(), - non_leaf_adjacency.end(), - non_leaf_nodes.begin(), - rowidx.begin() + 1); - // We are discarding the parent of the root node. 
- thrust::transform(rmm::exec_policy(stream), - rowidx.begin() + 2, - rowidx.end(), - thrust::make_constant_iterator(1), - rowidx.begin() + 1, - thrust::plus()); - thrust::inclusive_scan( - rmm::exec_policy(stream), rowidx.begin() + 1, rowidx.end(), rowidx.begin() + 1); - - rmm::device_uvector colidx((num_columns - 1) * 2, stream); - thrust::scatter(rmm::exec_policy(stream), - csr_parent_col_ids.begin(), - csr_parent_col_ids.end(), - rowidx.begin() + 1, - colidx.begin()); - rmm::device_uvector map((num_columns - 1) * 2, stream); - thrust::sequence(rmm::exec_policy(stream), map.begin(), map.end()); - rmm::device_uvector stencil((num_columns - 1) * 2, stream); - thrust::fill(rmm::exec_policy(stream), stencil.begin(), stencil.end(), 1); - thrust::scatter(rmm::exec_policy(stream), - thrust::make_constant_iterator(0), - thrust::make_constant_iterator(0) + num_columns, - rowidx.begin() + 1, - stencil.begin()); - thrust::scatter_if(rmm::exec_policy(stream), - sorted_csr_parent_col_ids.begin() + 1, - sorted_csr_parent_col_ids.end(), - map.begin(), - stencil.begin(), - colidx.begin()); - - // condition is true if parent is not a list, or sentinel/root - // Special case to return true if parent is a list and is_array_of_arrays is true - auto is_non_list_parent = [column_categories = column_categories.begin(), - is_array_of_arrays, - row_array_parent_col_id] __device__(auto parent_col_id) -> bool { - return !(parent_col_id == parent_node_sentinel || - column_categories[parent_col_id] == NC_LIST && - (!is_array_of_arrays || parent_col_id != row_array_parent_col_id)); - }; - // Mixed types in List children go to different columns, - // so all immediate children of list column should have same max_row_offsets. - // create list's children max_row_offsets array. (initialize to zero) - // atomicMax on children max_row_offsets array. - // gather the max_row_offsets from children row offset array. - { - rmm::device_uvector list_parents_children_max_row_offsets(num_columns, stream); - thrust::fill(rmm::exec_policy(stream), - list_parents_children_max_row_offsets.begin(), - list_parents_children_max_row_offsets.end(), - 0); - thrust::for_each(rmm::exec_policy(stream), - csr_unique_col_ids.begin(), - csr_unique_col_ids.end(), - [csr_column_categories = csr_column_categories.begin(), - csr_parent_col_ids = csr_parent_col_ids.begin(), - csr_max_row_offsets = csr_max_row_offsets.begin(), - list_parents_children_max_row_offsets = - list_parents_children_max_row_offsets.begin()] __device__(auto col_id) { - auto csr_parent_col_id = csr_parent_col_ids[col_id]; - if (csr_parent_col_id != parent_node_sentinel and - csr_column_categories[csr_parent_col_id] == node_t::NC_LIST) { - cuda::atomic_ref ref{ - *(list_parents_children_max_row_offsets + csr_parent_col_id)}; - ref.fetch_max(csr_max_row_offsets[col_id], - cuda::std::memory_order_relaxed); - } - }); - thrust::gather_if( - rmm::exec_policy(stream), - csr_parent_col_ids.begin(), - csr_parent_col_ids.end(), - csr_parent_col_ids.begin(), - list_parents_children_max_row_offsets.begin(), - csr_max_row_offsets.begin(), - [csr_column_categories = csr_column_categories.begin()] __device__(size_type parent_col_id) { - return parent_col_id != parent_node_sentinel and - csr_column_categories[parent_col_id] == node_t::NC_LIST; - }); - } - - // copy lists' max_row_offsets to children. - // all structs should have same size. 
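The atomic-max step above (giving every child of a list column the same row count) can be exercised on toy data in isolation. The sketch below uses hypothetical parent slots and row counts and only consults the immediate parent; in the patch, gather_if copies the accumulated maximum back to the list children and the following transform_if also walks further up the parent chain. Compile as CUDA with extended lambdas (nvcc --extended-lambda).

#include <cuda/atomic>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>

#include <vector>

int main()
{
  // four child columns; children 0-2 share list parent slot 0, child 3 uses list parent slot 1
  std::vector<int> h_parent_slot{0, 0, 0, 1};
  std::vector<int> h_max_rows{4, 9, 2, 5};
  thrust::device_vector<int> parent_slot(h_parent_slot.begin(), h_parent_slot.end());
  thrust::device_vector<int> max_row_offsets(h_max_rows.begin(), h_max_rows.end());
  thrust::device_vector<int> parent_max(2, 0);

  auto* parent_ptr = thrust::raw_pointer_cast(parent_slot.data());
  auto* rows_ptr   = thrust::raw_pointer_cast(max_row_offsets.data());
  auto* pmax_ptr   = thrust::raw_pointer_cast(parent_max.data());

  // each child pushes its row count into its list parent's accumulator
  thrust::for_each(thrust::device,
                   thrust::make_counting_iterator(0),
                   thrust::make_counting_iterator(4),
                   [=] __device__(int child) {
                     cuda::atomic_ref<int, cuda::thread_scope_device> ref{
                       pmax_ptr[parent_ptr[child]]};
                     ref.fetch_max(rows_ptr[child], cuda::std::memory_order_relaxed);
                   });

  // broadcast the per-parent maximum back to every child
  thrust::for_each(thrust::device,
                   thrust::make_counting_iterator(0),
                   thrust::make_counting_iterator(4),
                   [=] __device__(int child) { rows_ptr[child] = pmax_ptr[parent_ptr[child]]; });
  // max_row_offsets is now {9, 9, 9, 5}
  return 0;
}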
- thrust::transform_if( - rmm::exec_policy(stream), - csr_unique_col_ids.begin(), - csr_unique_col_ids.end(), - csr_max_row_offsets.begin(), - [csr_column_categories = csr_column_categories.begin(), - is_non_list_parent, - csr_parent_col_ids = csr_parent_col_ids.begin(), - csr_max_row_offsets = csr_max_row_offsets.begin()] __device__(size_type col_id) { - auto parent_col_id = csr_parent_col_ids[col_id]; - // condition is true if parent is not a list, or sentinel/root - while (is_non_list_parent(parent_col_id)) { - col_id = parent_col_id; - parent_col_id = csr_parent_col_ids[parent_col_id]; - } - return csr_max_row_offsets[col_id]; - }, - [csr_column_categories = csr_column_categories.begin(), - is_non_list_parent, - parent_col_ids = csr_parent_col_ids.begin()] __device__(size_type col_id) { - auto parent_col_id = parent_col_ids[col_id]; - // condition is true if parent is not a list, or sentinel/root - return is_non_list_parent(parent_col_id); - }); - - // For Struct and List (to avoid copying entire strings when mixed type as string is enabled) - thrust::transform_if( - rmm::exec_policy(stream), - csr_col_range_begin.begin(), - csr_col_range_begin.end(), - csr_column_categories.begin(), - csr_col_range_end.begin(), - [] __device__(auto i) { return i + 1; }, - [] __device__(NodeT type) { return type == NC_STRUCT || type == NC_LIST; }); - - return std::tuple{column_tree_csr{std::move(rowidx), - std::move(colidx), - std::move(csr_unique_col_ids), - std::move(csr_column_categories), - std::move(csr_col_range_begin), - std::move(csr_col_range_end)}, - std::move(csr_max_row_offsets)}; -} - /** * @brief Get the column indices for the values column for array of arrays rows * diff --git a/cpp/tests/io/json_tree_csr.cu b/cpp/tests/io/json_tree_csr.cu index 3bd64a8355d..c20ea938298 100644 --- a/cpp/tests/io/json_tree_csr.cu +++ b/cpp/tests/io/json_tree_csr.cu @@ -56,6 +56,14 @@ struct h_column_tree_csr { std::vector range_end; }; +template +void print(std::string str, std::vector &vec) { + std::cout << str << " = "; + for(size_t i = 0; i < vec.size(); i++) + std::cout << vec[i] << " "; + std::cout << std::endl; +} + bool check_equality(cuio_json::tree_meta_t& d_a, cuio_json::column_tree_csr& d_b, rmm::cuda_stream_view stream) @@ -90,6 +98,11 @@ bool check_equality(cuio_json::tree_meta_t& d_a, if (a.parent_node_ids[b.column_ids[v]] != b.column_ids[u]) return false; } } + for (size_t u = 0; u < num_nodes; u++) { + if (a.node_categories[b.column_ids[u]] != b.categories[u]) return false; + if (a.node_range_begin[b.column_ids[u]] != b.range_begin[u]) return false; + if (a.node_range_end[b.column_ids[u]] != b.range_end[u]) return false; + } return true; } @@ -170,6 +183,7 @@ TEST_F(JsonColumnTreeTests, SimpleLines) row_array_parent_col_id, stream); + auto iseq = check_equality(d_column_tree, d_column_tree_csr, stream); // assert equality between csr and meta formats - assert(check_equality(d_column_tree, d_column_tree_csr, stream)); + assert(iseq == true); } From 810c389ff575c9300ffefdf4edacfd352d5352a1 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Fri, 28 Jun 2024 17:11:25 +0000 Subject: [PATCH 06/28] moving the csr algorithms --- cpp/src/io/json/json_column_csr.cu | 330 +++++++++++++++++++++++++++++ 1 file changed, 330 insertions(+) create mode 100644 cpp/src/io/json/json_column_csr.cu diff --git a/cpp/src/io/json/json_column_csr.cu b/cpp/src/io/json/json_column_csr.cu new file mode 100644 index 00000000000..c1f58609381 --- /dev/null +++ b/cpp/src/io/json/json_column_csr.cu @@ -0,0 +1,330 @@ 
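A hedged aside on the test hunk above: assert(iseq == true) compiles away when NDEBUG is defined, which is the usual Release configuration, so the CSR versus column-tree comparison would silently pass there. If the intent is to fail the test on a mismatch, a GTest assertion stays active in every build type; a minimal sketch with a stand-in boolean in place of the real check_equality call:

#include <gtest/gtest.h>

TEST(JsonColumnTreeCsrSketch, EqualityCheckSurvivesReleaseBuilds)
{
  bool const iseq = true;  // stand-in for check_equality(d_column_tree, d_column_tree_csr, stream)
  EXPECT_TRUE(iseq);       // reports a test failure instead of relying on assert()
}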
+/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "json_utils.hpp" +#include "nested_json.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cudf::io::json::detail { + +/** + * @brief Reduces node tree representation to column tree CSR representation. + * + * @param tree Node tree representation of JSON string + * @param original_col_ids Column ids of nodes + * @param sorted_col_ids Sorted column ids of nodes + * @param ordered_node_ids Node ids of nodes sorted by column ids + * @param row_offsets Row offsets of nodes + * @param is_array_of_arrays Whether the tree is an array of arrays + * @param row_array_parent_col_id Column id of row array, if is_array_of_arrays is true + * @param stream CUDA stream used for device memory operations and kernel launches + * @return A tuple of column tree representation of JSON string, column ids of columns, and + * max row offsets of columns + */ +std::tuple> reduce_to_column_tree_csr( + tree_meta_t& tree, + device_span original_col_ids, + device_span sorted_col_ids, + device_span ordered_node_ids, + device_span row_offsets, + bool is_array_of_arrays, + NodeIndexT const row_array_parent_col_id, + rmm::cuda_stream_view stream) +{ + CUDF_FUNC_RANGE(); + // 1. column count for allocation + auto const num_columns = + thrust::unique_count(rmm::exec_policy(stream), sorted_col_ids.begin(), sorted_col_ids.end()); + + rmm::device_uvector unique_node_ids(num_columns, stream); + rmm::device_uvector csr_unique_node_ids(num_columns, stream); + rmm::device_uvector column_levels(num_columns, stream); + thrust::unique_by_key_copy(rmm::exec_policy(stream), + sorted_col_ids.begin(), + sorted_col_ids.end(), + ordered_node_ids.begin(), + thrust::make_discard_iterator(), + unique_node_ids.begin()); + thrust::copy_n( + rmm::exec_policy(stream), + thrust::make_permutation_iterator(tree.node_levels.begin(), unique_node_ids.begin()), + unique_node_ids.size(), + column_levels.begin()); + auto [sorted_column_levels, sorted_column_levels_order] = + stable_sorted_key_order(column_levels, stream); + + // 2. reduce_by_key {col_id}, {row_offset}, max. + rmm::device_uvector unique_col_ids(num_columns, stream); + rmm::device_uvector max_row_offsets(num_columns, stream); + rmm::device_uvector csr_unique_col_ids(num_columns, stream); + rmm::device_uvector csr_max_row_offsets(num_columns, stream); + auto ordered_row_offsets = + thrust::make_permutation_iterator(row_offsets.begin(), ordered_node_ids.begin()); + thrust::reduce_by_key(rmm::exec_policy(stream), + sorted_col_ids.begin(), + sorted_col_ids.end(), + ordered_row_offsets, + unique_col_ids.begin(), + max_row_offsets.begin(), + thrust::equal_to(), + thrust::maximum()); + + // 3. 
reduce_by_key {col_id}, {node_categories} - custom opp (*+v=*, v+v=v, *+#=E) + rmm::device_uvector column_categories(num_columns, stream); + rmm::device_uvector csr_column_categories(num_columns, stream); + thrust::reduce_by_key( + rmm::exec_policy(stream), + sorted_col_ids.begin(), + sorted_col_ids.end(), + thrust::make_permutation_iterator(tree.node_categories.begin(), ordered_node_ids.begin()), + unique_col_ids.begin(), + column_categories.begin(), + thrust::equal_to(), + [] __device__(NodeT type_a, NodeT type_b) -> NodeT { + auto is_a_leaf = (type_a == NC_VAL || type_a == NC_STR); + auto is_b_leaf = (type_b == NC_VAL || type_b == NC_STR); + // (v+v=v, *+*=*, *+v=*, *+#=E, NESTED+VAL=NESTED) + // *+*=*, v+v=v + if (type_a == type_b) { + return type_a; + } else if (is_a_leaf) { + // *+v=*, N+V=N + // STRUCT/LIST + STR/VAL = STRUCT/LIST, STR/VAL + FN = ERR, STR/VAL + STR = STR + return type_b == NC_FN ? NC_ERR : (is_b_leaf ? NC_STR : type_b); + } else if (is_b_leaf) { + return type_a == NC_FN ? NC_ERR : (is_a_leaf ? NC_STR : type_a); + } + // *+#=E + return NC_ERR; + }); + + auto csr_permutation_it = thrust::make_zip_iterator( + thrust::make_permutation_iterator(unique_node_ids.begin(), sorted_column_levels_order.begin()), + thrust::make_permutation_iterator(unique_col_ids.begin(), sorted_column_levels_order.begin()), + thrust::make_permutation_iterator(max_row_offsets.begin(), sorted_column_levels_order.begin()), + thrust::make_permutation_iterator(column_categories.begin(), + sorted_column_levels_order.begin())); + thrust::copy(rmm::exec_policy(stream), + csr_permutation_it, + csr_permutation_it + num_columns, + thrust::make_zip_iterator(csr_unique_node_ids.begin(), + csr_unique_col_ids.begin(), + csr_max_row_offsets.begin(), + csr_column_categories.begin())); + + // 4. unique_copy parent_node_ids, ranges + rmm::device_uvector csr_parent_col_ids(num_columns, stream); + rmm::device_uvector csr_col_range_begin(num_columns, stream); // Field names + rmm::device_uvector csr_col_range_end(num_columns, stream); + thrust::copy_n( + rmm::exec_policy(stream), + thrust::make_zip_iterator( + thrust::make_permutation_iterator(tree.parent_node_ids.begin(), csr_unique_node_ids.begin()), + thrust::make_permutation_iterator(tree.node_range_begin.begin(), csr_unique_node_ids.begin()), + thrust::make_permutation_iterator(tree.node_range_end.begin(), csr_unique_node_ids.begin())), + csr_unique_node_ids.size(), + thrust::make_zip_iterator( + csr_parent_col_ids.begin(), csr_col_range_begin.begin(), csr_col_range_end.begin())); + + // convert parent_node_ids to parent_col_ids + thrust::transform( + rmm::exec_policy(stream), + csr_parent_col_ids.begin(), + csr_parent_col_ids.end(), + csr_parent_col_ids.begin(), + [col_ids = original_col_ids.begin()] __device__(auto parent_node_id) -> size_type { + return parent_node_id == parent_node_sentinel ? parent_node_sentinel + : col_ids[parent_node_id]; + }); + + /* + CSR construction: + 1. Sort column levels and get their ordering + 2. For each column node coln iterated according to sorted_column_levels; do + a. Find nodes that have coln as the parent node -> set adj_coln + b. row idx[coln] = size of adj_coln + 1 + c. 
col idx[coln] = adj_coln U {parent_col_id[coln]} + */ + + rmm::device_uvector rowidx(num_columns + 1, stream); + thrust::fill(rmm::exec_policy(stream), rowidx.begin(), rowidx.end(), 0); + + // Note that the first element of csr_parent_col_ids is -1 (parent_node_sentinel) + // children adjacency + auto num_non_leaf_columns = thrust::unique_count(rmm::exec_policy(stream), csr_parent_col_ids.begin() + 1, csr_parent_col_ids.end()); + thrust::reduce_by_key(rmm::exec_policy(stream), csr_parent_col_ids.begin() + 1, csr_parent_col_ids.end(), thrust::make_constant_iterator(1), thrust::make_discard_iterator(), rowidx.begin() + 1, thrust::equal_to()); + thrust::inclusive_scan( + rmm::exec_policy(stream), rowidx.begin() + 1, rowidx.end(), rowidx.begin() + 1); + // overwrite the csr_parent_col_ids with the col ids in the csr tree + thrust::fill(rmm::exec_policy(stream), csr_parent_col_ids.begin(), csr_parent_col_ids.end(), -1); + thrust::scatter(rmm::exec_policy(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(0) + num_non_leaf_columns, rowidx.begin(), csr_parent_col_ids.begin() + 1); + thrust::inclusive_scan(rmm::exec_policy(stream), csr_parent_col_ids.begin(), csr_parent_col_ids.end(), csr_parent_col_ids.begin(), thrust::maximum{}); + // We are discarding the parent of the root node. Add the parent adjacency. Since we have already performed the scan, we use a counting iterator to add + thrust::transform(rmm::exec_policy(stream), + rowidx.begin() + 2, + rowidx.end(), + thrust::make_counting_iterator(1), + rowidx.begin() + 2, + thrust::plus()); + + rmm::device_uvector colidx((num_columns - 1) * 2, stream); + thrust::fill(rmm::exec_policy(stream), colidx.begin(), colidx.end(), 0); + // Skip the parent of root node + thrust::scatter(rmm::exec_policy(stream), + csr_parent_col_ids.begin() + 1, + csr_parent_col_ids.end(), + rowidx.begin() + 1, + colidx.begin()); + // excluding root node + rmm::device_uvector map(num_columns - 1, stream); + thrust::fill(rmm::exec_policy(stream), map.begin(), map.end(), 1); + thrust::inclusive_scan_by_key(rmm::exec_policy(stream), csr_parent_col_ids.begin() + 1, csr_parent_col_ids.end(), map.begin(), map.begin()); + thrust::for_each(rmm::exec_policy(stream), thrust::make_counting_iterator(1), thrust::make_counting_iterator(1) + num_columns - 1, + [rowidx = rowidx.begin(), map = map.begin(), csr_parent_col_ids = csr_parent_col_ids.begin()] __device__(auto i) { + auto csr_parent_col_id = csr_parent_col_ids[i]; + if(csr_parent_col_id == 0) map[i - 1]--; + else map[i - 1] += rowidx[csr_parent_col_id]; + }); + thrust::scatter(rmm::exec_policy(stream), thrust::make_counting_iterator(1), thrust::make_counting_iterator(1) + num_columns - 1, map.begin(), colidx.begin()); + + // condition is true if parent is not a list, or sentinel/root + // Special case to return true if parent is a list and is_array_of_arrays is true + auto is_non_list_parent = [column_categories = column_categories.begin(), + is_array_of_arrays, + row_array_parent_col_id] __device__(auto parent_col_id) -> bool { + return !(parent_col_id == parent_node_sentinel || + column_categories[parent_col_id] == NC_LIST && + (!is_array_of_arrays || parent_col_id != row_array_parent_col_id)); + }; + // Mixed types in List children go to different columns, + // so all immediate children of list column should have same max_row_offsets. + // create list's children max_row_offsets array. (initialize to zero) + // atomicMax on children max_row_offsets array. 
+ // gather the max_row_offsets from children row offset array. + { + rmm::device_uvector list_parents_children_max_row_offsets(num_columns, stream); + thrust::fill(rmm::exec_policy(stream), + list_parents_children_max_row_offsets.begin(), + list_parents_children_max_row_offsets.end(), + 0); + thrust::for_each(rmm::exec_policy(stream), + csr_unique_col_ids.begin(), + csr_unique_col_ids.end(), + [csr_column_categories = csr_column_categories.begin(), + csr_parent_col_ids = csr_parent_col_ids.begin(), + csr_max_row_offsets = csr_max_row_offsets.begin(), + list_parents_children_max_row_offsets = + list_parents_children_max_row_offsets.begin()] __device__(auto col_id) { + auto csr_parent_col_id = csr_parent_col_ids[col_id]; + if (csr_parent_col_id != parent_node_sentinel and + csr_column_categories[csr_parent_col_id] == node_t::NC_LIST) { + cuda::atomic_ref ref{ + *(list_parents_children_max_row_offsets + csr_parent_col_id)}; + ref.fetch_max(csr_max_row_offsets[col_id], + cuda::std::memory_order_relaxed); + } + }); + thrust::gather_if( + rmm::exec_policy(stream), + csr_parent_col_ids.begin(), + csr_parent_col_ids.end(), + csr_parent_col_ids.begin(), + list_parents_children_max_row_offsets.begin(), + csr_max_row_offsets.begin(), + [csr_column_categories = csr_column_categories.begin()] __device__(size_type parent_col_id) { + return parent_col_id != parent_node_sentinel and + csr_column_categories[parent_col_id] == node_t::NC_LIST; + }); + } + + // copy lists' max_row_offsets to children. + // all structs should have same size. + thrust::transform_if( + rmm::exec_policy(stream), + csr_unique_col_ids.begin(), + csr_unique_col_ids.end(), + csr_max_row_offsets.begin(), + [csr_column_categories = csr_column_categories.begin(), + is_non_list_parent, + csr_parent_col_ids = csr_parent_col_ids.begin(), + csr_max_row_offsets = csr_max_row_offsets.begin()] __device__(size_type col_id) { + auto parent_col_id = csr_parent_col_ids[col_id]; + // condition is true if parent is not a list, or sentinel/root + while (is_non_list_parent(parent_col_id)) { + col_id = parent_col_id; + parent_col_id = csr_parent_col_ids[parent_col_id]; + } + return csr_max_row_offsets[col_id]; + }, + [csr_column_categories = csr_column_categories.begin(), + is_non_list_parent, + parent_col_ids = csr_parent_col_ids.begin()] __device__(size_type col_id) { + auto parent_col_id = parent_col_ids[col_id]; + // condition is true if parent is not a list, or sentinel/root + return is_non_list_parent(parent_col_id); + }); + + // For Struct and List (to avoid copying entire strings when mixed type as string is enabled) + thrust::transform_if( + rmm::exec_policy(stream), + csr_col_range_begin.begin(), + csr_col_range_begin.end(), + csr_column_categories.begin(), + csr_col_range_end.begin(), + [] __device__(auto i) { return i + 1; }, + [] __device__(NodeT type) { return type == NC_STRUCT || type == NC_LIST; }); + + return std::tuple{column_tree_csr{std::move(rowidx), + std::move(colidx), + std::move(csr_unique_col_ids), + std::move(csr_column_categories), + std::move(csr_col_range_begin), + std::move(csr_col_range_end)}, + std::move(csr_max_row_offsets)}; +} + +} // namespace cudf::io::json::detail From 6a1a415ec1f4a758cb650da0cd3cf3ef332ea6ff Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Fri, 28 Jun 2024 17:13:35 +0000 Subject: [PATCH 07/28] formatting --- cpp/src/io/json/json_column_csr.cu | 56 +++++++++++++++++++++++------- cpp/tests/io/json_tree_csr.cu | 5 +-- 2 files changed, 46 insertions(+), 15 deletions(-) diff --git 
a/cpp/src/io/json/json_column_csr.cu b/cpp/src/io/json/json_column_csr.cu index c1f58609381..dbb32f278f0 100644 --- a/cpp/src/io/json/json_column_csr.cu +++ b/cpp/src/io/json/json_column_csr.cu @@ -195,15 +195,31 @@ std::tuple> reduce_to_column_tre // Note that the first element of csr_parent_col_ids is -1 (parent_node_sentinel) // children adjacency - auto num_non_leaf_columns = thrust::unique_count(rmm::exec_policy(stream), csr_parent_col_ids.begin() + 1, csr_parent_col_ids.end()); - thrust::reduce_by_key(rmm::exec_policy(stream), csr_parent_col_ids.begin() + 1, csr_parent_col_ids.end(), thrust::make_constant_iterator(1), thrust::make_discard_iterator(), rowidx.begin() + 1, thrust::equal_to()); + auto num_non_leaf_columns = thrust::unique_count( + rmm::exec_policy(stream), csr_parent_col_ids.begin() + 1, csr_parent_col_ids.end()); + thrust::reduce_by_key(rmm::exec_policy(stream), + csr_parent_col_ids.begin() + 1, + csr_parent_col_ids.end(), + thrust::make_constant_iterator(1), + thrust::make_discard_iterator(), + rowidx.begin() + 1, + thrust::equal_to()); thrust::inclusive_scan( rmm::exec_policy(stream), rowidx.begin() + 1, rowidx.end(), rowidx.begin() + 1); // overwrite the csr_parent_col_ids with the col ids in the csr tree thrust::fill(rmm::exec_policy(stream), csr_parent_col_ids.begin(), csr_parent_col_ids.end(), -1); - thrust::scatter(rmm::exec_policy(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(0) + num_non_leaf_columns, rowidx.begin(), csr_parent_col_ids.begin() + 1); - thrust::inclusive_scan(rmm::exec_policy(stream), csr_parent_col_ids.begin(), csr_parent_col_ids.end(), csr_parent_col_ids.begin(), thrust::maximum{}); - // We are discarding the parent of the root node. Add the parent adjacency. Since we have already performed the scan, we use a counting iterator to add + thrust::scatter(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + num_non_leaf_columns, + rowidx.begin(), + csr_parent_col_ids.begin() + 1); + thrust::inclusive_scan(rmm::exec_policy(stream), + csr_parent_col_ids.begin(), + csr_parent_col_ids.end(), + csr_parent_col_ids.begin(), + thrust::maximum{}); + // We are discarding the parent of the root node. Add the parent adjacency. 
Since we have already + // performed the scan, we use a counting iterator to add thrust::transform(rmm::exec_policy(stream), rowidx.begin() + 2, rowidx.end(), @@ -222,14 +238,28 @@ std::tuple> reduce_to_column_tre // excluding root node rmm::device_uvector map(num_columns - 1, stream); thrust::fill(rmm::exec_policy(stream), map.begin(), map.end(), 1); - thrust::inclusive_scan_by_key(rmm::exec_policy(stream), csr_parent_col_ids.begin() + 1, csr_parent_col_ids.end(), map.begin(), map.begin()); - thrust::for_each(rmm::exec_policy(stream), thrust::make_counting_iterator(1), thrust::make_counting_iterator(1) + num_columns - 1, - [rowidx = rowidx.begin(), map = map.begin(), csr_parent_col_ids = csr_parent_col_ids.begin()] __device__(auto i) { - auto csr_parent_col_id = csr_parent_col_ids[i]; - if(csr_parent_col_id == 0) map[i - 1]--; - else map[i - 1] += rowidx[csr_parent_col_id]; - }); - thrust::scatter(rmm::exec_policy(stream), thrust::make_counting_iterator(1), thrust::make_counting_iterator(1) + num_columns - 1, map.begin(), colidx.begin()); + thrust::inclusive_scan_by_key(rmm::exec_policy(stream), + csr_parent_col_ids.begin() + 1, + csr_parent_col_ids.end(), + map.begin(), + map.begin()); + thrust::for_each(rmm::exec_policy(stream), + thrust::make_counting_iterator(1), + thrust::make_counting_iterator(1) + num_columns - 1, + [rowidx = rowidx.begin(), + map = map.begin(), + csr_parent_col_ids = csr_parent_col_ids.begin()] __device__(auto i) { + auto csr_parent_col_id = csr_parent_col_ids[i]; + if (csr_parent_col_id == 0) + map[i - 1]--; + else + map[i - 1] += rowidx[csr_parent_col_id]; + }); + thrust::scatter(rmm::exec_policy(stream), + thrust::make_counting_iterator(1), + thrust::make_counting_iterator(1) + num_columns - 1, + map.begin(), + colidx.begin()); // condition is true if parent is not a list, or sentinel/root // Special case to return true if parent is a list and is_array_of_arrays is true diff --git a/cpp/tests/io/json_tree_csr.cu b/cpp/tests/io/json_tree_csr.cu index c20ea938298..10735921331 100644 --- a/cpp/tests/io/json_tree_csr.cu +++ b/cpp/tests/io/json_tree_csr.cu @@ -57,9 +57,10 @@ struct h_column_tree_csr { }; template -void print(std::string str, std::vector &vec) { +void print(std::string str, std::vector& vec) +{ std::cout << str << " = "; - for(size_t i = 0; i < vec.size(); i++) + for (size_t i = 0; i < vec.size(); i++) std::cout << vec[i] << " "; std::cout << std::endl; } From 36751400c99001df65cce64a42a2c5a501a69138 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Fri, 5 Jul 2024 23:58:12 +0000 Subject: [PATCH 08/28] ignoring leaf nodes with non-leaf siblings --- cpp/src/io/json/json_column_csr.cu | 123 +++++++++++++++++++++++++++++ cpp/src/io/json/nested_json.hpp | 100 +++++++++++++++-------- 2 files changed, 191 insertions(+), 32 deletions(-) diff --git a/cpp/src/io/json/json_column_csr.cu b/cpp/src/io/json/json_column_csr.cu index dbb32f278f0..4c66e175bc0 100644 --- a/cpp/src/io/json/json_column_csr.cu +++ b/cpp/src/io/json/json_column_csr.cu @@ -34,6 +34,8 @@ #include #include +#include +#include #include #include #include @@ -357,4 +359,125 @@ std::tuple> reduce_to_column_tre std::move(csr_max_row_offsets)}; } +/** + * @brief Constructs `d_json_column` from node tree representation + * Newly constructed columns are insert into `root`'s children. + * `root` must be a list type. 
+ * + * @param input Input JSON string device data + * @param tree Node tree representation of the JSON string + * @param col_ids Column ids of the nodes in the tree + * @param row_offsets Row offsets of the nodes in the tree + * @param root Root node of the `d_json_column` tree + * @param is_array_of_arrays Whether the tree is an array of arrays + * @param options Parsing options specifying the parsing behaviour + * options affecting behaviour are + * is_enabled_lines: Whether the input is a line-delimited JSON + * is_enabled_mixed_types_as_string: Whether to enable reading mixed types as string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the device memory + * of child_offets and validity members of `d_json_column` + */ +void make_device_json_column_csr(device_span input, + tree_meta_t& tree, + device_span col_ids, + device_span row_offsets, + device_json_column& root, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + bool const is_enabled_lines = options.is_enabled_lines(); + auto const num_nodes = col_ids.size(); + rmm::device_uvector sorted_col_ids(col_ids.size(), stream); // make a copy + thrust::copy(rmm::exec_policy(stream), col_ids.begin(), col_ids.end(), sorted_col_ids.begin()); + + // sort by {col_id} on {node_ids} stable + rmm::device_uvector node_ids(col_ids.size(), stream); + thrust::sequence(rmm::exec_policy(stream), node_ids.begin(), node_ids.end()); + thrust::stable_sort_by_key( + rmm::exec_policy(stream), sorted_col_ids.begin(), sorted_col_ids.end(), node_ids.begin()); + + NodeIndexT const row_array_parent_col_id = [&]() { + NodeIndexT value = parent_node_sentinel; + if (!col_ids.empty()) { + auto const list_node_index = is_enabled_lines ? 0 : 1; + CUDF_CUDA_TRY(cudaMemcpyAsync(&value, + col_ids.data() + list_node_index, + sizeof(NodeIndexT), + cudaMemcpyDefault, + stream.value())); + stream.synchronize(); + } + return value; + }(); + + // 1. gather column information. + auto [d_column_tree, d_max_row_offsets] = + reduce_to_column_tree_csr(tree, + col_ids, + sorted_col_ids, + node_ids, + row_offsets, + is_array_of_arrays, + row_array_parent_col_id, + stream); + + CUDF_EXPECTS(is_array_of_arrays == false, "array of arrays has not yet been implemented"); + CUDF_EXPECTS(options.is_enabled_mixed_types_as_string() == false, "mixed type as string has not yet been implemented"); + CUDF_EXPECTS(options.is_enabled_prune_columns() == false, "column pruning has not yet been implemented"); + + // traverse the column tree + auto num_columns = d_column_tree.rowidx.size() - 1; + d_column_tree.is_mixed_type_column.resize(num_columns, 0); + d_column_tree.is_pruned.resize(num_columns, 0); + + // for ignore_vals, we need to identify leaf nodes that have non-leaf sibling nodes + // i.e. we need to ignore leaf nodes at level above the last level + // idea: leaf nodes have adjacency 1. So if there is an adjacency 1 inbetween non-one + // adjacencies, then found the leaf node. Corner case: consider the last set of consecutive + // ones. If the leftmost of those ones (say node u) has a non-leaf sibling + // (can be found by looking at the adjacencies of the siblings + // (which are in turn found from the colidx of the parent u), then this leaf node should be + // ignored, otherwise all good. 
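The degree test described in the comment above can be checked on the toy tree from the earlier CSR sketch. The snippet below covers only the adjacent_difference step: a column whose adjacency list has length 1 holds nothing but its parent link, so it has no children and is a leaf. The corner case of a leaf whose siblings include non-leaf columns is what the code that follows handles; the data here is hypothetical.

#include <thrust/adjacent_difference.h>
#include <thrust/count.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>

#include <vector>

int main()
{
  std::vector<int> h_rowidx{0, 2, 4, 5, 6};  // toy offsets: columns 2 and 3 are leaves
  thrust::device_vector<int> rowidx(h_rowidx.begin(), h_rowidx.end());
  thrust::device_vector<int> adjacency(rowidx.size());  // same length as the input

  // adjacency[i] = rowidx[i] - rowidx[i - 1]; element 0 is a plain copy of rowidx[0]
  thrust::adjacent_difference(thrust::device, rowidx.begin(), rowidx.end(), adjacency.begin());
  // adjacency = {0, 2, 2, 1, 1}
  auto const num_leaf_nodes =
    thrust::count(thrust::device, adjacency.begin() + 1, adjacency.end(), 1);
  return num_leaf_nodes == 2 ? 0 : 1;  // exits 0: the two degree-1 columns were found
}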
+ rmm::device_uvector adjacency(num_columns + 1, stream); // since adjacent_difference requires that the output have the same length as input + thrust::adjacent_difference(rmm::exec_policy(stream), d_column_tree.rowidx.begin(), d_column_tree.rowidx.end(), adjacency.begin()); + auto num_leaf_nodes = thrust::count_if(rmm::exec_policy(stream), adjacency.begin() + 1, adjacency.end(), + [] __device__ (auto const adj) { + return adj == 1; + }); + rmm::device_uvector leaf_nodes(num_leaf_nodes, stream); + thrust::copy_if(rmm::exec_policy(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(0) + num_columns, leaf_nodes, + [adjacency = adjacency.begin()] __device__ (size_t node) { + return adjacency[node] == 1; + }); + + auto rev_node_it = thrust::make_reverse_iterator(thrust::make_counting_iterator(0) + num_columns); + auto rev_leaf_nodes_it = thrust::make_reverse_iterator(leaf_nodes.begin()); + auto is_leftmost_leaf = thrust::mismatch(rmm::exec_policy(stream), rev_node_it, rev_node_it + num_columns, rev_leaf_nodes_it); + // the node number that could be the leftmost leaf node is given by u = *(is_leftmost_leaf.second - 1) + NodeIndexT leftmost_leaf_node = leaf_nodes.element(num_leaf_nodes - thrust::distance(rev_leaf_nodes_it, is_leftmost_leaf.second - 1) - 1, stream); + + // upper_bound search for u in rowidx for parent node v. Now check if any of the other child nodes of v are non-leaf i.e + // check if u is the first child of v. If yes, then leafmost_leaf_node is the leftmost leaf node. Otherwise, discard all + // children of v after and including u + auto parent_it = thrust::upper_bound(rmm::exec_policy(stream), d_column_tree.rowidx.begin(), d_column_tree.rowidx.end(), leftmost_leaf_node); + NodeIndexT parent = thrust::distance(d_column_tree.rowidx.begin(), parent_it - 1); + NodeIndexT parent_adj_start = d_column_tree.rowidx.element(parent, stream); + NodeIndexT parent_adj_end = d_column_tree.rowidx.element(parent + 1, stream); + auto childnum_it = thrust::lower_bound(rmm::exec_policy(stream), d_column_tree.colidx.begin() + parent_adj_start, d_column_tree.colidx.begin() + parent_adj_end, leftmost_leaf_node); + + auto retained_leaf_nodes_it = leaf_nodes.begin() + num_leaf_nodes - thrust::distance(rev_leaf_nodes_it, is_leftmost_leaf.second - 1) - 1; + if(childnum_it != d_column_tree.colidx.begin() + parent_adj_start + 1) { + // discarding from u to last child of parent + retained_leaf_nodes_it += thrust::distance(childnum_it, d_column_tree.colidx.begin() + parent_adj_end); + } + + // now, all nodes from leaf_nodes.begin() to retained_leaf_nodes_it need to be discarded i.e. they are part of ignore_vals + // but we cannot resize a device_uvector, what else can we do? 
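One possible answer to the question above, assuming the retained leaf ids only need to be read downstream, is to leave the allocation untouched and expose the kept suffix through a non-owning view; another is to scatter a flag for each discarded leaf into ignore_vals style per-column bookkeeping. The helper below is a sketch, not part of the patch:

#include <cudf/utilities/span.hpp>
#include <rmm/device_uvector.hpp>

#include <cstddef>

// Non-owning view over the retained tail of leaf_nodes; no copy, no reallocation.
// num_ignored is assumed to be the count of leading entries that must be discarded.
template <typename T>
cudf::device_span<T const> retained_suffix(rmm::device_uvector<T> const& leaf_nodes,
                                           std::size_t num_ignored)
{
  return cudf::device_span<T const>{leaf_nodes.data() + num_ignored,
                                    leaf_nodes.size() - num_ignored};
}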
+ +} + } // namespace cudf::io::json::detail diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index bdcc8a223f1..f0296fc9ab4 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -47,43 +47,11 @@ struct tree_meta_t { rmm::device_uvector node_range_end; }; -struct column_tree_csr { - // position of nnzs - rmm::device_uvector rowidx; - rmm::device_uvector colidx; - // node properties - rmm::device_uvector column_ids; - rmm::device_uvector categories; - rmm::device_uvector range_begin; - rmm::device_uvector range_end; - std::vector ignore_vals; - std::vector is_mixed_type_column; - std::vector is_pruned; -}; - /** * @brief A column type */ enum class json_col_t : char { ListColumn, StructColumn, StringColumn, Unknown }; -/** - * @brief Enum class to specify whether we just push onto and pop from the stack or whether we also - * reset to an empty stack on a newline character. - */ -enum class stack_behavior_t : char { - /// Opening brackets and braces, [, {, push onto the stack, closing brackets and braces, ], }, pop - /// from the stack - PushPopWithoutReset, - - /// Opening brackets and braces, [, {, push onto the stack, closing brackets and braces, ], }, pop - /// from the stack. Delimiter characters are passed when the stack context is constructed to - /// reset to an empty stack. - ResetOnDelimiter -}; - -// Default name for a list's child column -constexpr auto list_child_name{"element"}; - /** * @brief Intermediate representation of data from a nested JSON input */ @@ -200,6 +168,63 @@ struct device_json_column { } }; +/** + * @brief Holds member data pointers of `d_json_column` + * + */ +struct json_column_data { + using row_offset_t = json_column::row_offset_t; + row_offset_t* string_offsets; + row_offset_t* string_lengths; + row_offset_t* child_offsets; + bitmask_type* validity; +}; + + +struct column_tree_csr { + // position of nnzs + rmm::device_uvector rowidx; + rmm::device_uvector colidx; + // node properties + rmm::device_uvector column_ids; + rmm::device_uvector categories; + rmm::device_uvector range_begin; + rmm::device_uvector range_end; + std::vector ignore_vals; + std::vector is_mixed_type_column; + std::vector is_pruned; + // device_json_column properties + // Type used to count number of rows + using row_offset_t = size_type; + // The inferred type of this column (list, struct, or value/string column) + std::vector types; + rmm::device_uvector string_offsets; + rmm::device_uvector string_lengths; + // Row offsets + rmm::device_uvector child_offsets; + // Validity bitmap + rmm::device_buffer validity; + std::vector num_rows; +}; + +/** + * @brief Enum class to specify whether we just push onto and pop from the stack or whether we also + * reset to an empty stack on a newline character. + */ +enum class stack_behavior_t : char { + /// Opening brackets and braces, [, {, push onto the stack, closing brackets and braces, ], }, pop + /// from the stack + PushPopWithoutReset, + + /// Opening brackets and braces, [, {, push onto the stack, closing brackets and braces, ], }, pop + /// from the stack. Delimiter characters are passed when the stack context is constructed to + /// reset to an empty stack. 
+ ResetOnDelimiter +}; + +// Default name for a list's child column +constexpr auto list_child_name{"element"}; + namespace detail { // TODO: return device_uvector instead of passing pre-allocated memory @@ -342,6 +367,17 @@ std::tuple> reduce_to_column_tre NodeIndexT const row_array_parent_col_id, rmm::cuda_stream_view stream); +void make_device_json_column_csr(device_span input, + tree_meta_t& tree, + device_span col_ids, + device_span row_offsets, + device_json_column& root, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + + /** * @brief Retrieves the parse_options to be used for type inference and type casting * From 389df505de96a1779e02422937b76a77c996293c Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Sat, 6 Jul 2024 00:14:33 +0000 Subject: [PATCH 09/28] formatting --- cpp/src/io/json/json_column_csr.cu | 123 +++++++++++++++++------------ cpp/src/io/json/nested_json.hpp | 20 ++--- 2 files changed, 81 insertions(+), 62 deletions(-) diff --git a/cpp/src/io/json/json_column_csr.cu b/cpp/src/io/json/json_column_csr.cu index 4c66e175bc0..c4b9f7c5b3c 100644 --- a/cpp/src/io/json/json_column_csr.cu +++ b/cpp/src/io/json/json_column_csr.cu @@ -379,18 +379,18 @@ std::tuple> reduce_to_column_tre * of child_offets and validity members of `d_json_column` */ void make_device_json_column_csr(device_span input, - tree_meta_t& tree, - device_span col_ids, - device_span row_offsets, - device_json_column& root, - bool is_array_of_arrays, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) + tree_meta_t& tree, + device_span col_ids, + device_span row_offsets, + device_json_column& root, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - bool const is_enabled_lines = options.is_enabled_lines(); - auto const num_nodes = col_ids.size(); + bool const is_enabled_lines = options.is_enabled_lines(); + auto const num_nodes = col_ids.size(); rmm::device_uvector sorted_col_ids(col_ids.size(), stream); // make a copy thrust::copy(rmm::exec_policy(stream), col_ids.begin(), col_ids.end(), sorted_col_ids.begin()); @@ -415,19 +415,20 @@ void make_device_json_column_csr(device_span input, }(); // 1. gather column information. - auto [d_column_tree, d_max_row_offsets] = - reduce_to_column_tree_csr(tree, - col_ids, - sorted_col_ids, - node_ids, - row_offsets, - is_array_of_arrays, - row_array_parent_col_id, - stream); + auto [d_column_tree, d_max_row_offsets] = reduce_to_column_tree_csr(tree, + col_ids, + sorted_col_ids, + node_ids, + row_offsets, + is_array_of_arrays, + row_array_parent_col_id, + stream); CUDF_EXPECTS(is_array_of_arrays == false, "array of arrays has not yet been implemented"); - CUDF_EXPECTS(options.is_enabled_mixed_types_as_string() == false, "mixed type as string has not yet been implemented"); - CUDF_EXPECTS(options.is_enabled_prune_columns() == false, "column pruning has not yet been implemented"); + CUDF_EXPECTS(options.is_enabled_mixed_types_as_string() == false, + "mixed type as string has not yet been implemented"); + CUDF_EXPECTS(options.is_enabled_prune_columns() == false, + "column pruning has not yet been implemented"); // traverse the column tree auto num_columns = d_column_tree.rowidx.size() - 1; @@ -438,46 +439,64 @@ void make_device_json_column_csr(device_span input, // i.e. 
we need to ignore leaf nodes at level above the last level // idea: leaf nodes have adjacency 1. So if there is an adjacency 1 inbetween non-one // adjacencies, then found the leaf node. Corner case: consider the last set of consecutive - // ones. If the leftmost of those ones (say node u) has a non-leaf sibling - // (can be found by looking at the adjacencies of the siblings - // (which are in turn found from the colidx of the parent u), then this leaf node should be + // ones. If the leftmost of those ones (say node u) has a non-leaf sibling + // (can be found by looking at the adjacencies of the siblings + // (which are in turn found from the colidx of the parent u), then this leaf node should be // ignored, otherwise all good. - rmm::device_uvector adjacency(num_columns + 1, stream); // since adjacent_difference requires that the output have the same length as input - thrust::adjacent_difference(rmm::exec_policy(stream), d_column_tree.rowidx.begin(), d_column_tree.rowidx.end(), adjacency.begin()); - auto num_leaf_nodes = thrust::count_if(rmm::exec_policy(stream), adjacency.begin() + 1, adjacency.end(), - [] __device__ (auto const adj) { - return adj == 1; - }); + rmm::device_uvector adjacency( + num_columns + 1, + stream); // since adjacent_difference requires that the output have the same length as input + thrust::adjacent_difference(rmm::exec_policy(stream), + d_column_tree.rowidx.begin(), + d_column_tree.rowidx.end(), + adjacency.begin()); + auto num_leaf_nodes = thrust::count_if(rmm::exec_policy(stream), + adjacency.begin() + 1, + adjacency.end(), + [] __device__(auto const adj) { return adj == 1; }); rmm::device_uvector leaf_nodes(num_leaf_nodes, stream); - thrust::copy_if(rmm::exec_policy(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(0) + num_columns, leaf_nodes, - [adjacency = adjacency.begin()] __device__ (size_t node) { - return adjacency[node] == 1; - }); + thrust::copy_if( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + num_columns, + leaf_nodes.begin(), + [adjacency = adjacency.begin()] __device__(size_t node) { return adjacency[node] == 1; }); auto rev_node_it = thrust::make_reverse_iterator(thrust::make_counting_iterator(0) + num_columns); auto rev_leaf_nodes_it = thrust::make_reverse_iterator(leaf_nodes.begin()); - auto is_leftmost_leaf = thrust::mismatch(rmm::exec_policy(stream), rev_node_it, rev_node_it + num_columns, rev_leaf_nodes_it); - // the node number that could be the leftmost leaf node is given by u = *(is_leftmost_leaf.second - 1) - NodeIndexT leftmost_leaf_node = leaf_nodes.element(num_leaf_nodes - thrust::distance(rev_leaf_nodes_it, is_leftmost_leaf.second - 1) - 1, stream); - - // upper_bound search for u in rowidx for parent node v. Now check if any of the other child nodes of v are non-leaf i.e - // check if u is the first child of v. If yes, then leafmost_leaf_node is the leftmost leaf node. 
Otherwise, discard all - // children of v after and including u - auto parent_it = thrust::upper_bound(rmm::exec_policy(stream), d_column_tree.rowidx.begin(), d_column_tree.rowidx.end(), leftmost_leaf_node); - NodeIndexT parent = thrust::distance(d_column_tree.rowidx.begin(), parent_it - 1); + auto is_leftmost_leaf = thrust::mismatch( + rmm::exec_policy(stream), rev_node_it, rev_node_it + num_columns, rev_leaf_nodes_it); + // the node number that could be the leftmost leaf node is given by u = *(is_leftmost_leaf.second + // - 1) + NodeIndexT leftmost_leaf_node = leaf_nodes.element( + num_leaf_nodes - thrust::distance(rev_leaf_nodes_it, is_leftmost_leaf.second - 1) - 1, stream); + + // upper_bound search for u in rowidx for parent node v. Now check if any of the other child nodes + // of v are non-leaf i.e check if u is the first child of v. If yes, then leafmost_leaf_node is + // the leftmost leaf node. Otherwise, discard all children of v after and including u + auto parent_it = thrust::upper_bound(rmm::exec_policy(stream), + d_column_tree.rowidx.begin(), + d_column_tree.rowidx.end(), + leftmost_leaf_node); + NodeIndexT parent = thrust::distance(d_column_tree.rowidx.begin(), parent_it - 1); NodeIndexT parent_adj_start = d_column_tree.rowidx.element(parent, stream); - NodeIndexT parent_adj_end = d_column_tree.rowidx.element(parent + 1, stream); - auto childnum_it = thrust::lower_bound(rmm::exec_policy(stream), d_column_tree.colidx.begin() + parent_adj_start, d_column_tree.colidx.begin() + parent_adj_end, leftmost_leaf_node); - - auto retained_leaf_nodes_it = leaf_nodes.begin() + num_leaf_nodes - thrust::distance(rev_leaf_nodes_it, is_leftmost_leaf.second - 1) - 1; - if(childnum_it != d_column_tree.colidx.begin() + parent_adj_start + 1) { + NodeIndexT parent_adj_end = d_column_tree.rowidx.element(parent + 1, stream); + auto childnum_it = thrust::lower_bound(rmm::exec_policy(stream), + d_column_tree.colidx.begin() + parent_adj_start, + d_column_tree.colidx.begin() + parent_adj_end, + leftmost_leaf_node); + + auto retained_leaf_nodes_it = leaf_nodes.begin() + num_leaf_nodes - + thrust::distance(rev_leaf_nodes_it, is_leftmost_leaf.second - 1) - + 1; + if (childnum_it != d_column_tree.colidx.begin() + parent_adj_start + 1) { // discarding from u to last child of parent - retained_leaf_nodes_it += thrust::distance(childnum_it, d_column_tree.colidx.begin() + parent_adj_end); + retained_leaf_nodes_it += + thrust::distance(childnum_it, d_column_tree.colidx.begin() + parent_adj_end); } - // now, all nodes from leaf_nodes.begin() to retained_leaf_nodes_it need to be discarded i.e. they are part of ignore_vals - // but we cannot resize a device_uvector, what else can we do? - + // now, all nodes from leaf_nodes.begin() to retained_leaf_nodes_it need to be discarded i.e. 
they + // are part of ignore_vals } } // namespace cudf::io::json::detail diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index f0296fc9ab4..aeb06574d42 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -180,7 +180,6 @@ struct json_column_data { bitmask_type* validity; }; - struct column_tree_csr { // position of nnzs rmm::device_uvector rowidx; @@ -195,6 +194,7 @@ struct column_tree_csr { std::vector is_pruned; // device_json_column properties // Type used to count number of rows + /* using row_offset_t = size_type; // The inferred type of this column (list, struct, or value/string column) std::vector types; @@ -205,6 +205,7 @@ struct column_tree_csr { // Validity bitmap rmm::device_buffer validity; std::vector num_rows; + */ }; /** @@ -368,15 +369,14 @@ std::tuple> reduce_to_column_tre rmm::cuda_stream_view stream); void make_device_json_column_csr(device_span input, - tree_meta_t& tree, - device_span col_ids, - device_span row_offsets, - device_json_column& root, - bool is_array_of_arrays, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); - + tree_meta_t& tree, + device_span col_ids, + device_span row_offsets, + device_json_column& root, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); /** * @brief Retrieves the parse_options to be used for type inference and type casting From 4bba629cc5d93f9c578b1503281a49b0f0985142 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Mon, 15 Jul 2024 17:26:31 +0000 Subject: [PATCH 10/28] moving to experimental namespace --- cpp/src/io/json/json_column.cu | 1 - cpp/src/io/json/json_column_csr.cu | 4 +- cpp/src/io/json/nested_json.hpp | 74 ++++++++++++++++-------------- cpp/tests/io/json_tree_csr.cu | 4 +- 4 files changed, 44 insertions(+), 39 deletions(-) diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 43c5b10c9a8..ca8466a22b4 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -40,7 +40,6 @@ #include #include #include -#include #include #include #include diff --git a/cpp/src/io/json/json_column_csr.cu b/cpp/src/io/json/json_column_csr.cu index dbb32f278f0..8e7233fa148 100644 --- a/cpp/src/io/json/json_column_csr.cu +++ b/cpp/src/io/json/json_column_csr.cu @@ -49,7 +49,7 @@ #include #include -namespace cudf::io::json::detail { +namespace cudf::io::json::experimental::detail { /** * @brief Reduces node tree representation to column tree CSR representation. @@ -95,7 +95,7 @@ std::tuple> reduce_to_column_tre unique_node_ids.size(), column_levels.begin()); auto [sorted_column_levels, sorted_column_levels_order] = - stable_sorted_key_order(column_levels, stream); + cudf::io::json::detail::stable_sorted_key_order(column_levels, stream); // 2. reduce_by_key {col_id}, {row_offset}, max. 
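Step 2 above relies on the column ids being pre-sorted, so reduce_by_key collapses each run of equal ids into one (column id, maximum row offset) pair. A toy illustration with hypothetical data:

#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#include <thrust/reduce.h>

#include <vector>

int main()
{
  std::vector<int> h_col_ids{0, 0, 1, 1, 1, 2};      // sorted column id of each node
  std::vector<int> h_row_offsets{0, 3, 0, 1, 2, 0};  // row offset of each node
  thrust::device_vector<int> col_ids(h_col_ids.begin(), h_col_ids.end());
  thrust::device_vector<int> row_offsets(h_row_offsets.begin(), h_row_offsets.end());
  thrust::device_vector<int> unique_col_ids(3);
  thrust::device_vector<int> max_row_offsets(3);

  thrust::reduce_by_key(thrust::device,
                        col_ids.begin(),
                        col_ids.end(),
                        row_offsets.begin(),
                        unique_col_ids.begin(),
                        max_row_offsets.begin(),
                        thrust::equal_to<int>(),
                        thrust::maximum<int>());
  // unique_col_ids = {0, 1, 2}, max_row_offsets = {3, 2, 0}
  return 0;
}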
rmm::device_uvector unique_col_ids(num_columns, stream); diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index bdcc8a223f1..16dc9d63b2f 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -47,19 +47,6 @@ struct tree_meta_t { rmm::device_uvector node_range_end; }; -struct column_tree_csr { - // position of nnzs - rmm::device_uvector rowidx; - rmm::device_uvector colidx; - // node properties - rmm::device_uvector column_ids; - rmm::device_uvector categories; - rmm::device_uvector range_begin; - rmm::device_uvector range_end; - std::vector ignore_vals; - std::vector is_mixed_type_column; - std::vector is_pruned; -}; /** * @brief A column type @@ -200,6 +187,46 @@ struct device_json_column { } }; +namespace experimental { +struct column_tree_csr { + // position of nnzs + rmm::device_uvector rowidx; + rmm::device_uvector colidx; + // node properties + rmm::device_uvector column_ids; + rmm::device_uvector categories; + rmm::device_uvector range_begin; + rmm::device_uvector range_end; + std::vector ignore_vals; + std::vector is_mixed_type_column; + std::vector is_pruned; +}; + +namespace detail { +/** + * @brief Reduce node tree into column tree by aggregating each property of column. + * + * @param tree json node tree to reduce (modified in-place, but restored to original state) + * @param col_ids column ids of each node (modified in-place, but restored to original state) + * @param row_offsets row offsets of each node (modified in-place, but restored to original state) + * @param stream The CUDA stream to which kernels are dispatched + * @return A tuple containing the column tree, identifier for each column and the maximum row index + * in each column + */ + +std::tuple> reduce_to_column_tree_csr( + tree_meta_t& tree, + device_span original_col_ids, + device_span sorted_col_ids, + device_span ordered_node_ids, + device_span row_offsets, + bool is_array_of_arrays, + NodeIndexT const row_array_parent_col_id, + rmm::cuda_stream_view stream); + +} +} + namespace detail { // TODO: return device_uvector instead of passing pre-allocated memory @@ -321,27 +348,6 @@ reduce_to_column_tree(tree_meta_t& tree, NodeIndexT const row_array_parent_col_id, rmm::cuda_stream_view stream); -/** - * @brief Reduce node tree into column tree by aggregating each property of column. 
- * - * @param tree json node tree to reduce (modified in-place, but restored to original state) - * @param col_ids column ids of each node (modified in-place, but restored to original state) - * @param row_offsets row offsets of each node (modified in-place, but restored to original state) - * @param stream The CUDA stream to which kernels are dispatched - * @return A tuple containing the column tree, identifier for each column and the maximum row index - * in each column - */ - -std::tuple> reduce_to_column_tree_csr( - tree_meta_t& tree, - device_span original_col_ids, - device_span sorted_col_ids, - device_span ordered_node_ids, - device_span row_offsets, - bool is_array_of_arrays, - NodeIndexT const row_array_parent_col_id, - rmm::cuda_stream_view stream); - /** * @brief Retrieves the parse_options to be used for type inference and type casting * diff --git a/cpp/tests/io/json_tree_csr.cu b/cpp/tests/io/json_tree_csr.cu index 10735921331..e73e4f2a629 100644 --- a/cpp/tests/io/json_tree_csr.cu +++ b/cpp/tests/io/json_tree_csr.cu @@ -66,7 +66,7 @@ void print(std::string str, std::vector& vec) } bool check_equality(cuio_json::tree_meta_t& d_a, - cuio_json::column_tree_csr& d_b, + cuio_json::experimental::column_tree_csr& d_b, rmm::cuda_stream_view stream) { // convert from tree_meta_t to column_tree_csr @@ -175,7 +175,7 @@ TEST_F(JsonColumnTreeTests, SimpleLines) stream); auto [d_column_tree_csr, d_max_row_offsets_csr] = - cudf::io::json::detail::reduce_to_column_tree_csr(gpu_tree, + cudf::io::json::experimental::detail::reduce_to_column_tree_csr(gpu_tree, gpu_col_id, sorted_col_ids, node_ids, From df9e65b2fa03a282a740f936b7810345180650f1 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Mon, 15 Jul 2024 17:39:35 +0000 Subject: [PATCH 11/28] formatting --- cpp/src/io/json/json_column_csr.cu | 2 +- cpp/src/io/json/nested_json.hpp | 5 ++--- cpp/tests/io/json_tree_csr.cu | 14 +++++++------- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/cpp/src/io/json/json_column_csr.cu b/cpp/src/io/json/json_column_csr.cu index 8e7233fa148..ee65dbc3bc8 100644 --- a/cpp/src/io/json/json_column_csr.cu +++ b/cpp/src/io/json/json_column_csr.cu @@ -357,4 +357,4 @@ std::tuple> reduce_to_column_tre std::move(csr_max_row_offsets)}; } -} // namespace cudf::io::json::detail +} // namespace cudf::io::json::experimental::detail diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 16dc9d63b2f..386b55ed2a0 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -47,7 +47,6 @@ struct tree_meta_t { rmm::device_uvector node_range_end; }; - /** * @brief A column type */ @@ -224,8 +223,8 @@ std::tuple> reduce_to_column_tre NodeIndexT const row_array_parent_col_id, rmm::cuda_stream_view stream); -} -} +} // namespace detail +} // namespace experimental namespace detail { diff --git a/cpp/tests/io/json_tree_csr.cu b/cpp/tests/io/json_tree_csr.cu index e73e4f2a629..12e92551521 100644 --- a/cpp/tests/io/json_tree_csr.cu +++ b/cpp/tests/io/json_tree_csr.cu @@ -176,13 +176,13 @@ TEST_F(JsonColumnTreeTests, SimpleLines) auto [d_column_tree_csr, d_max_row_offsets_csr] = cudf::io::json::experimental::detail::reduce_to_column_tree_csr(gpu_tree, - gpu_col_id, - sorted_col_ids, - node_ids, - gpu_row_offsets, - false, - row_array_parent_col_id, - stream); + gpu_col_id, + sorted_col_ids, + node_ids, + gpu_row_offsets, + false, + row_array_parent_col_id, + stream); auto iseq = check_equality(d_column_tree, d_column_tree_csr, stream); // assert equality 
between csr and meta formats From d1588c886e7f46c6b8712c8ebd75e96fadcae8e4 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Mon, 15 Jul 2024 17:53:41 +0000 Subject: [PATCH 12/28] removed node properties from csr struct - will be introduced in stages in later PRs --- cpp/src/io/json/nested_json.hpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 386b55ed2a0..20019a703c9 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -196,9 +196,6 @@ struct column_tree_csr { rmm::device_uvector categories; rmm::device_uvector range_begin; rmm::device_uvector range_end; - std::vector ignore_vals; - std::vector is_mixed_type_column; - std::vector is_pruned; }; namespace detail { From b8097034b3cb9032ec8cb4147c10e6f53e03263e Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Tue, 16 Jul 2024 19:06:11 +0000 Subject: [PATCH 13/28] partial commit before merge --- cpp/src/io/json/json_column_csr.cu | 175 ++++++++++++++++++----------- cpp/src/io/json/nested_json.hpp | 1 + 2 files changed, 109 insertions(+), 67 deletions(-) diff --git a/cpp/src/io/json/json_column_csr.cu b/cpp/src/io/json/json_column_csr.cu index c4b9f7c5b3c..1f40b14671c 100644 --- a/cpp/src/io/json/json_column_csr.cu +++ b/cpp/src/io/json/json_column_csr.cu @@ -53,6 +53,110 @@ namespace cudf::io::json::detail { +struct device_json_column_properties_size { + rmm::device_uvector outcol_nodes; + size_t string_offsets_size = 0; + size_t string_lengths_size = 0; + size_t child_offsets_size = 0; + size_t num_rows_size = 0; +}; + +device_json_column_properties_size estimate_device_json_column_size(rmm::device_uvector const &rowidx, rmm::device_uvector const &colidx, rmm::device_uvector const &categories, cudf::io::json_reader_options reader_options, rmm::cuda_stream_view stream) { + // What are the cases in which estimation works? + CUDF_EXPECTS(reader_options.is_enabled_mixed_types_as_string() == false, + "mixed type as string has not yet been implemented"); + CUDF_EXPECTS(reader_options.is_enabled_prune_columns() == false, + "column pruning has not yet been implemented"); + // traverse the column tree + auto num_columns = rowidx.size() - 1; + + // 1. removing NC_ERR nodes and their descendants i.e. + // removing the entire subtree rooted at the nodes with category NC_ERR + auto num_err_nodes = thrust::count_if(rmm::exec_policy(stream), + categories.begin(), + categories.end(), + [] __device__(auto const ctg) { return ctg == NC_ERR; }); + + // (Optional) 2. Let's do some validation of the column tree based on some of its properties. + // We will be using these properties to filter nodes later on + // =========================================================================== + // Every node v is of type string, val, field name, list or struct. + // String and val cannot have any children. + // If v is a field name, it can have struct, list, string and val as children. + // If v is a struct, it can have a field name as child + // If v is a list, it can have string, val, list or struct as child + // There can only be at most one string and one val child for a given node, but many struct, list and field name children. + // Moreover, only string and val children can be leaf nodes. + // When mixed type support is disabled - + // 1. A mix of lists and structs in the same column is not supported i.e a field name and list node cannot have both list and struct as children + // 2. 
If there is a mix of str/val and list/struct in the same column, then str/val is discarded + + // for ignore_vals, we need to identify leaf nodes that have non-leaf sibling nodes + // i.e. we need to ignore leaf nodes at level above the last level + // idea: leaf nodes have adjacency 1. So if there is an adjacency 1 inbetween non-one + // adjacencies, then found the leaf node. Corner case: consider the last set of consecutive + // ones. If the leftmost of those ones (say node u) has a non-leaf sibling + // (can be found by looking at the adjacencies of the siblings + // (which are in turn found from the colidx of the parent u), then this leaf node should be + // ignored, otherwise all good. + rmm::device_uvector adjacency( + num_columns + 1, + stream); // since adjacent_difference requires that the output have the same length as input + thrust::adjacent_difference(rmm::exec_policy(stream), + rowidx.begin(), + rowidx.end(), + adjacency.begin()); + auto num_leaf_nodes = thrust::count_if(rmm::exec_policy(stream), + adjacency.begin() + 1, + adjacency.end(), + [] __device__(auto const adj) { return adj == 1; }); + rmm::device_uvector leaf_nodes(num_leaf_nodes, stream); + thrust::copy_if( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + num_columns, + leaf_nodes.begin(), + [adjacency = adjacency.begin()] __device__(size_t node) { return adjacency[node] == 1; }); + + auto rev_node_it = thrust::make_reverse_iterator(thrust::make_counting_iterator(0) + num_columns); + auto rev_leaf_nodes_it = thrust::make_reverse_iterator(leaf_nodes.begin()); + auto is_leftmost_leaf = thrust::mismatch( + rmm::exec_policy(stream), rev_node_it, rev_node_it + num_columns, rev_leaf_nodes_it); + // the node number that could be the leftmost leaf node is given by u = *(is_leftmost_leaf.second + // - 1) + NodeIndexT leftmost_leaf_node = leaf_nodes.element( + num_leaf_nodes - thrust::distance(rev_leaf_nodes_it, is_leftmost_leaf.second - 1) - 1, stream); + + // upper_bound search for u in rowidx for parent node v. Now check if any of the other child nodes + // of v is non-leaf i.e check if u is the first child of v. If yes, then leafmost_leaf_node is + // the leftmost leaf node. Otherwise, discard all children of v after and including u + + auto parent_it = thrust::upper_bound(rmm::exec_policy(stream), + rowidx.begin(), + rowidx.end(), + leftmost_leaf_node); + NodeIndexT parent = thrust::distance(rowidx.begin(), parent_it - 1); + NodeIndexT parent_adj_start = rowidx.element(parent, stream); + NodeIndexT parent_adj_end = rowidx.element(parent + 1, stream); + auto childnum_it = thrust::lower_bound(rmm::exec_policy(stream), + colidx.begin() + parent_adj_start, + colidx.begin() + parent_adj_end, + leftmost_leaf_node); + + auto retained_leaf_nodes_it = leaf_nodes.begin() + num_leaf_nodes - + thrust::distance(rev_leaf_nodes_it, is_leftmost_leaf.second - 1) - + 1; + if (childnum_it != colidx.begin() + parent_adj_start + 1) { + // discarding from u to last child of parent + retained_leaf_nodes_it += + thrust::distance(childnum_it, colidx.begin() + parent_adj_end); + } + // now, all nodes from leaf_nodes.begin() to retained_leaf_nodes_it need to be discarded i.e. they + // are part of ignore_vals + + +} + /** * @brief Reduces node tree representation to column tree CSR representation. 
* @@ -75,6 +179,7 @@ std::tuple> reduce_to_column_tre device_span row_offsets, bool is_array_of_arrays, NodeIndexT const row_array_parent_col_id, + cudf::io::json_reader_options const& reader_options, rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); @@ -350,6 +455,8 @@ std::tuple> reduce_to_column_tre [] __device__(auto i) { return i + 1; }, [] __device__(NodeT type) { return type == NC_STRUCT || type == NC_LIST; }); + auto size_estimates = estimate_device_json_column_size(rowidx, colidx, csr_column_categories, reader_options, stream); + return std::tuple{column_tree_csr{std::move(rowidx), std::move(colidx), std::move(csr_unique_col_ids), @@ -422,6 +529,7 @@ void make_device_json_column_csr(device_span input, row_offsets, is_array_of_arrays, row_array_parent_col_id, + options, stream); CUDF_EXPECTS(is_array_of_arrays == false, "array of arrays has not yet been implemented"); @@ -430,73 +538,6 @@ void make_device_json_column_csr(device_span input, CUDF_EXPECTS(options.is_enabled_prune_columns() == false, "column pruning has not yet been implemented"); - // traverse the column tree - auto num_columns = d_column_tree.rowidx.size() - 1; - d_column_tree.is_mixed_type_column.resize(num_columns, 0); - d_column_tree.is_pruned.resize(num_columns, 0); - - // for ignore_vals, we need to identify leaf nodes that have non-leaf sibling nodes - // i.e. we need to ignore leaf nodes at level above the last level - // idea: leaf nodes have adjacency 1. So if there is an adjacency 1 inbetween non-one - // adjacencies, then found the leaf node. Corner case: consider the last set of consecutive - // ones. If the leftmost of those ones (say node u) has a non-leaf sibling - // (can be found by looking at the adjacencies of the siblings - // (which are in turn found from the colidx of the parent u), then this leaf node should be - // ignored, otherwise all good. - rmm::device_uvector adjacency( - num_columns + 1, - stream); // since adjacent_difference requires that the output have the same length as input - thrust::adjacent_difference(rmm::exec_policy(stream), - d_column_tree.rowidx.begin(), - d_column_tree.rowidx.end(), - adjacency.begin()); - auto num_leaf_nodes = thrust::count_if(rmm::exec_policy(stream), - adjacency.begin() + 1, - adjacency.end(), - [] __device__(auto const adj) { return adj == 1; }); - rmm::device_uvector leaf_nodes(num_leaf_nodes, stream); - thrust::copy_if( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(0) + num_columns, - leaf_nodes.begin(), - [adjacency = adjacency.begin()] __device__(size_t node) { return adjacency[node] == 1; }); - - auto rev_node_it = thrust::make_reverse_iterator(thrust::make_counting_iterator(0) + num_columns); - auto rev_leaf_nodes_it = thrust::make_reverse_iterator(leaf_nodes.begin()); - auto is_leftmost_leaf = thrust::mismatch( - rmm::exec_policy(stream), rev_node_it, rev_node_it + num_columns, rev_leaf_nodes_it); - // the node number that could be the leftmost leaf node is given by u = *(is_leftmost_leaf.second - // - 1) - NodeIndexT leftmost_leaf_node = leaf_nodes.element( - num_leaf_nodes - thrust::distance(rev_leaf_nodes_it, is_leftmost_leaf.second - 1) - 1, stream); - - // upper_bound search for u in rowidx for parent node v. Now check if any of the other child nodes - // of v are non-leaf i.e check if u is the first child of v. If yes, then leafmost_leaf_node is - // the leftmost leaf node. 
Otherwise, discard all children of v after and including u - auto parent_it = thrust::upper_bound(rmm::exec_policy(stream), - d_column_tree.rowidx.begin(), - d_column_tree.rowidx.end(), - leftmost_leaf_node); - NodeIndexT parent = thrust::distance(d_column_tree.rowidx.begin(), parent_it - 1); - NodeIndexT parent_adj_start = d_column_tree.rowidx.element(parent, stream); - NodeIndexT parent_adj_end = d_column_tree.rowidx.element(parent + 1, stream); - auto childnum_it = thrust::lower_bound(rmm::exec_policy(stream), - d_column_tree.colidx.begin() + parent_adj_start, - d_column_tree.colidx.begin() + parent_adj_end, - leftmost_leaf_node); - - auto retained_leaf_nodes_it = leaf_nodes.begin() + num_leaf_nodes - - thrust::distance(rev_leaf_nodes_it, is_leftmost_leaf.second - 1) - - 1; - if (childnum_it != d_column_tree.colidx.begin() + parent_adj_start + 1) { - // discarding from u to last child of parent - retained_leaf_nodes_it += - thrust::distance(childnum_it, d_column_tree.colidx.begin() + parent_adj_end); - } - - // now, all nodes from leaf_nodes.begin() to retained_leaf_nodes_it need to be discarded i.e. they - // are part of ignore_vals } } // namespace cudf::io::json::detail diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index aeb06574d42..de4b1486752 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -366,6 +366,7 @@ std::tuple> reduce_to_column_tre device_span row_offsets, bool is_array_of_arrays, NodeIndexT const row_array_parent_col_id, + cudf::io::json_reader_options const& options, rmm::cuda_stream_view stream); void make_device_json_column_csr(device_span input, From b04cebcde4d1e385d8307ca5f641301f2553fbeb Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Wed, 17 Jul 2024 00:17:11 +0000 Subject: [PATCH 14/28] formatting --- cpp/src/io/json/json_column_csr.cu | 102 ++++++++++++++++------------- cpp/src/io/json/nested_json.hpp | 1 - cpp/tests/io/json_tree_csr.cu | 1 - 3 files changed, 55 insertions(+), 49 deletions(-) diff --git a/cpp/src/io/json/json_column_csr.cu b/cpp/src/io/json/json_column_csr.cu index 6d649a79b23..c2d7fb7cec2 100644 --- a/cpp/src/io/json/json_column_csr.cu +++ b/cpp/src/io/json/json_column_csr.cu @@ -59,11 +59,17 @@ struct device_json_column_properties_size { rmm::device_uvector outcol_nodes; size_t string_offsets_size = 0; size_t string_lengths_size = 0; - size_t child_offsets_size = 0; - size_t num_rows_size = 0; + size_t child_offsets_size = 0; + size_t num_rows_size = 0; }; -device_json_column_properties_size estimate_device_json_column_size(rmm::device_uvector const &rowidx, rmm::device_uvector const &colidx, rmm::device_uvector const &categories, cudf::io::json_reader_options reader_options, rmm::cuda_stream_view stream) { +device_json_column_properties_size estimate_device_json_column_size( + rmm::device_uvector const& rowidx, + rmm::device_uvector const& colidx, + rmm::device_uvector const& categories, + cudf::io::json_reader_options reader_options, + rmm::cuda_stream_view stream) +{ // What are the cases in which estimation works? CUDF_EXPECTS(reader_options.is_enabled_mixed_types_as_string() == false, "mixed type as string has not yet been implemented"); @@ -72,56 +78,63 @@ device_json_column_properties_size estimate_device_json_column_size(rmm::device_ // traverse the column tree auto num_columns = rowidx.size() - 1; - // 1. TODO: removing NC_ERR nodes and their descendants i.e. + // 1. TODO: removing NC_ERR nodes and their descendants i.e. 
// removing the entire subtree rooted at the nodes with category NC_ERR // for now, we just assert that there are indeed no error nodes - auto num_err_nodes = thrust::count_if(rmm::exec_policy(stream), - categories.begin(), - categories.end(), - [] __device__(auto const ctg) { return ctg == NC_ERR; }); + auto num_err_nodes = thrust::count_if( + rmm::exec_policy(stream), categories.begin(), categories.end(), [] __device__(auto const ctg) { + return ctg == NC_ERR; + }); CUDF_EXPECTS(num_err_nodes == 0, "oops, there are some error nodes in the column tree!"); // 2. Let's do some validation of the column tree based on its properties. - // We will be using these properties to filter nodes later on. + // We will be using these properties to filter nodes later on. // =========================================================================== // (i) Every node v is of type string, val, field name, list or struct. // (ii) String and val cannot have any children i.e. they can only be leaf nodes // (iii) If v is a field name, it can have struct, list, string and val as children. // (iv) If v is a struct, it can have a field name as child // (v) If v is a list, it can have string, val, list or struct as child - // (vi) There can only be at most one string and one val child for a given node, but many struct, list and field name children. - // (vii) When mixed type support is disabled - - // (a) A mix of lists and structs in the same column is not supported i.e a field name and list node cannot have both list and struct as children - // (b) If there is a mix of str/val and list/struct in the same column, then str/val is discarded + // (vi) There can only be at most one string and one val child for a given node, but many struct, + // list and field name children. 
(vii) When mixed type support is disabled - + // (a) A mix of lists and structs in the same column is not supported i.e a field name and + // list node cannot have both list and struct as children (b) If there is a mix of str/val + // and list/struct in the same column, then str/val is discarded // Validation of (vii)(a) - auto num_field_and_list_nodes = thrust::count_if(rmm::exec_policy(stream), categories.begin(), categories.end(), - [] __device__(auto const ctg) { - return ctg == NC_FN || ctg == NC_LIST; - }); + auto num_field_and_list_nodes = thrust::count_if( + rmm::exec_policy(stream), categories.begin(), categories.end(), [] __device__(auto const ctg) { + return ctg == NC_FN || ctg == NC_LIST; + }); rmm::device_uvector field_and_list_nodes(num_field_and_list_nodes, stream); - thrust::partition_copy(rmm::exec_policy(stream), thrust::make_counting_iterator(0), - thrust::make_counting_iterator(0) + num_columns, field_and_list_nodes.begin(), - thrust::make_discard_iterator(), - [categories = categories.begin()] __device__(NodeIndexT node) { - return categories[node] == NC_LIST || categories[node] == NC_FN; - }); - bool is_valid_tree = thrust::all_of(rmm::exec_policy(stream), field_and_list_nodes.begin(), field_and_list_nodes.end(), - [rowidx = rowidx.begin(), - colidx = colidx.begin(), - categories = categories.begin()] __device__(NodeIndexT node) { + thrust::partition_copy(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + num_columns, + field_and_list_nodes.begin(), + thrust::make_discard_iterator(), + [categories = categories.begin()] __device__(NodeIndexT node) { + return categories[node] == NC_LIST || categories[node] == NC_FN; + }); + bool is_valid_tree = thrust::all_of( + rmm::exec_policy(stream), + field_and_list_nodes.begin(), + field_and_list_nodes.end(), + [rowidx = rowidx.begin(), colidx = colidx.begin(), categories = categories.begin()] __device__( + NodeIndexT node) { NodeIndexT first_child_pos = rowidx[node] + 1; - NodeIndexT last_child_pos = rowidx[node + 1] - 1; - bool has_struct_child = false; - bool has_list_child = false; - for(NodeIndexT child_pos = first_child_pos; child_pos <= last_child_pos; child_pos++) { - if(categories[colidx[child_pos]] == NC_STRUCT) has_struct_child = true; - if(categories[colidx[child_pos]] == NC_LIST) has_list_child = true; + NodeIndexT last_child_pos = rowidx[node + 1] - 1; + bool has_struct_child = false; + bool has_list_child = false; + for (NodeIndexT child_pos = first_child_pos; child_pos <= last_child_pos; child_pos++) { + if (categories[colidx[child_pos]] == NC_STRUCT) has_struct_child = true; + if (categories[colidx[child_pos]] == NC_LIST) has_list_child = true; } return !has_struct_child && !has_list_child; }); - CUDF_EXPECTS(is_valid_tree, "Invalidating property 7a i.e. mix of LIST and STRUCT in same column is not supported when mixed type support is disabled"); + CUDF_EXPECTS(is_valid_tree, + "Invalidating property 7a i.e. mix of LIST and STRUCT in same column is not " + "supported when mixed type support is disabled"); // Validation of (vii)(b) i.e. 
ignore_vals in previous implementation // We need to identify leaf nodes that have non-leaf sibling nodes @@ -135,10 +148,8 @@ device_json_column_properties_size estimate_device_json_column_size(rmm::device_ rmm::device_uvector adjacency( num_columns + 1, stream); // since adjacent_difference requires that the output have the same length as input - thrust::adjacent_difference(rmm::exec_policy(stream), - rowidx.begin(), - rowidx.end(), - adjacency.begin()); + thrust::adjacent_difference( + rmm::exec_policy(stream), rowidx.begin(), rowidx.end(), adjacency.begin()); auto num_leaf_nodes = thrust::count_if(rmm::exec_policy(stream), adjacency.begin() + 1, adjacency.end(), @@ -164,10 +175,8 @@ device_json_column_properties_size estimate_device_json_column_size(rmm::device_ // of v is non-leaf i.e check if u is the first child of v. If yes, then leafmost_leaf_node is // the leftmost leaf node. Otherwise, discard all children of v after and including u - auto parent_it = thrust::upper_bound(rmm::exec_policy(stream), - rowidx.begin(), - rowidx.end(), - leftmost_leaf_node); + auto parent_it = + thrust::upper_bound(rmm::exec_policy(stream), rowidx.begin(), rowidx.end(), leftmost_leaf_node); NodeIndexT parent = thrust::distance(rowidx.begin(), parent_it - 1); NodeIndexT parent_adj_start = rowidx.element(parent, stream); NodeIndexT parent_adj_end = rowidx.element(parent + 1, stream); @@ -181,14 +190,13 @@ device_json_column_properties_size estimate_device_json_column_size(rmm::device_ 1; if (childnum_it != colidx.begin() + parent_adj_start + 1) { // discarding from u to last child of parent - retained_leaf_nodes_it += - thrust::distance(childnum_it, colidx.begin() + parent_adj_end); + retained_leaf_nodes_it += thrust::distance(childnum_it, colidx.begin() + parent_adj_end); } // now, all nodes from leaf_nodes.begin() to retained_leaf_nodes_it need to be discarded i.e. they // are part of ignore_vals // (Optional?) 
TODO: Validation of the remaining column tree properties - + rmm::device_uvector outcol_nodes(num_columns, stream); return device_json_column_properties_size{std::move(outcol_nodes)}; } @@ -491,7 +499,8 @@ std::tuple> reduce_to_column_tre [] __device__(auto i) { return i + 1; }, [] __device__(NodeT type) { return type == NC_STRUCT || type == NC_LIST; }); - auto size_estimates = estimate_device_json_column_size(rowidx, colidx, csr_column_categories, reader_options, stream); + auto size_estimates = + estimate_device_json_column_size(rowidx, colidx, csr_column_categories, reader_options, stream); return std::tuple{column_tree_csr{std::move(rowidx), std::move(colidx), @@ -573,7 +582,6 @@ void make_device_json_column_csr(device_span input, "mixed type as string has not yet been implemented"); CUDF_EXPECTS(options.is_enabled_prune_columns() == false, "column pruning has not yet been implemented"); - } } // namespace cudf::io::json::experimental::detail diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 16c27aed47b..83e854827d3 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -379,7 +379,6 @@ reduce_to_column_tree(tree_meta_t& tree, NodeIndexT const row_array_parent_col_id, rmm::cuda_stream_view stream); - /** * @brief Retrieves the parse_options to be used for type inference and type casting * diff --git a/cpp/tests/io/json_tree_csr.cu b/cpp/tests/io/json_tree_csr.cu index 6d52e732bd1..37118b3f086 100644 --- a/cpp/tests/io/json_tree_csr.cu +++ b/cpp/tests/io/json_tree_csr.cu @@ -185,7 +185,6 @@ TEST_F(JsonColumnTreeTests, SimpleLines) options, stream); - auto iseq = check_equality(d_column_tree, d_column_tree_csr, stream); // assert equality between csr and meta formats assert(iseq == true); From b804209760c8f076903f6ca40fb165720b4e32b0 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Sun, 21 Jul 2024 06:06:11 +0000 Subject: [PATCH 15/28] partial work commit --- cpp/src/io/json/json_column_csr.cu | 107 ++++++++++++++++++++++++++--- cpp/src/io/json/nested_json.hpp | 10 ++- 2 files changed, 100 insertions(+), 17 deletions(-) diff --git a/cpp/src/io/json/json_column_csr.cu b/cpp/src/io/json/json_column_csr.cu index c2d7fb7cec2..b4dbb455371 100644 --- a/cpp/src/io/json/json_column_csr.cu +++ b/cpp/src/io/json/json_column_csr.cu @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -48,6 +49,7 @@ #include #include #include +#include #include #include #include @@ -55,18 +57,22 @@ namespace cudf::io::json::experimental::detail { -struct device_json_column_properties_size { - rmm::device_uvector outcol_nodes; - size_t string_offsets_size = 0; - size_t string_lengths_size = 0; - size_t child_offsets_size = 0; - size_t num_rows_size = 0; +struct device_column_subtree { + using row_offset_t = size_type; + rmm::device_uvector subtree_nrows; + rmm::device_uvector string_offsets; + rmm::device_uvector string_lengths; + // Row offsets + rmm::device_uvector child_offsets; + // Validity bitmap + rmm::device_buffer validity; }; -device_json_column_properties_size estimate_device_json_column_size( +device_column_subtree allocation_for_device_column_subtree_annotation( rmm::device_uvector const& rowidx, rmm::device_uvector const& colidx, rmm::device_uvector const& categories, + rmm::device_uvector const& max_row_offsets, cudf::io::json_reader_options reader_options, rmm::cuda_stream_view stream) { @@ -197,8 +203,40 @@ device_json_column_properties_size estimate_device_json_column_size( // (Optional?) 
TODO: Validation of the remaining column tree properties - rmm::device_uvector outcol_nodes(num_columns, stream); - return device_json_column_properties_size{std::move(outcol_nodes)}; + // Now we annotate the extracted subtree + using row_offset_t = size_type; + auto num_subtree_nodes = thrust::distance(retained_leaf_nodes_it, leaf_nodes.end()); + rmm::device_uvector subtree_nrows(max_row_offsets, stream); + thrust::scatter(rmm::exec_policy(stream), thrust::make_constant_iterator(-1), thrust::make_constant_iterator(-1) + num_columns - num_subtree_nodes, leaf_nodes.begin(), subtree_nrows.begin()); + thrust::transform(rmm::exec_policy(stream), thrust::make_constant_iterator(1), thrust::make_constant_iterator(1) + num_columns, subtree_nrows.begin(), subtree_nrows.begin(), thrust::plus()); + + // For the subtree, we allocate memory for device column subtree properties + rmm::device_uvector subtree_properties_map(num_columns, stream); + thrust::sequence(rmm::exec_policy(stream), subtree_properties_map.begin(), subtree_properties_map.end(), 0); + auto partitioning_idx_it = thrust::partition(rmm::exec_policy(stream), subtree_properties_map.begin(), subtree_properties_map.end(), subtree_nrows.begin(), thrust::identity()); + auto str_partitioning_idx_it = thrust::partition(rmm::exec_policy(stream), subtree_properties_map.begin(), partitioning_idx_it, + [categories = categories.begin()] __device__(NodeIndexT node) { + return categories[node] == NC_STR || categories[node] == NC_VAL; + }); + auto str_val_end = thrust::distance(subtree_properties_map.begin(), str_partitioning_idx_it); + auto max_row_offsets_perm_it = thrust::make_permutation_iterator(max_row_offsets.begin(), subtree_properties_map.begin()); + size_type string_offsets_size = thrust::reduce(rmm::exec_policy(stream), max_row_offsets_perm_it, max_row_offsets_perm_it + str_val_end) + str_val_end; + rmm::device_uvector string_offsets(string_offsets_size, stream); + rmm::device_uvector string_lengths(string_offsets_size, stream); + + auto list_partitioning_idx_it = thrust::partition(rmm::exec_policy(stream), str_partitioning_idx_it, partitioning_idx_it, + [categories = categories.begin()] __device__(NodeIndexT node) { + return categories[node] == NC_LIST; + }); + auto list_end = thrust::distance(subtree_properties_map.begin(), list_partitioning_idx_it); + max_row_offsets_perm_it = thrust::make_permutation_iterator(max_row_offsets.begin(), subtree_properties_map.begin()) + str_val_end; + size_type child_offsets_size = thrust::reduce(rmm::exec_policy(stream), max_row_offsets_perm_it, max_row_offsets_perm_it + (list_end - str_val_end)) + 2 * (list_end - str_val_end); + rmm::device_uvector child_offsets(child_offsets_size, stream); + + auto validity_buffer_size = thrust::reduce(rmm::exec_policy(stream), subtree_nrows.begin(), subtree_nrows.end()); + auto validity = cudf::detail::create_null_mask(validity_buffer_size, cudf::mask_state::ALL_NULL, stream, rmm::mr::get_current_device_resource()); + + return device_column_subtree{std::move(subtree_nrows), std::move(string_offsets), std::move(string_lengths), std::move(child_offsets), std::move(validity)}; } /** @@ -499,8 +537,55 @@ std::tuple> reduce_to_column_tre [] __device__(auto i) { return i + 1; }, [] __device__(NodeT type) { return type == NC_STRUCT || type == NC_LIST; }); - auto size_estimates = - estimate_device_json_column_size(rowidx, colidx, csr_column_categories, reader_options, stream); + // this function allocates memory for the annotation + auto device_column_subtree_obj = + 
allocation_for_device_column_subtree_annotation(rowidx, colidx, csr_column_categories, csr_max_row_offsets, reader_options, stream); + // now we actually do the annotation + // relabel original_col_ids with the positions of the csr_unique_col_ids with same element. How do we accomplish this? + // one idea is to sort the row offsets by node level. Just the way we did this for the csr_column_ids + // sort original_col_ids, extract subtree based on the annotation above, + using row_offset_t = size_type; + auto [sorted_node_levels, sorted_node_levels_order] = cudf::io::json::detail::stable_sorted_key_order(tree.node_levels, stream); + auto num_nodes = original_col_ids.size(); + auto row_offsets_it = thrust::make_permutation_iterator(row_offsets.begin(), sorted_node_levels_order.begin()); + auto node_range_begin_it = thrust::make_permutation_iterator(tree.node_range_begin.begin(), sorted_node_levels_order.begin()); + auto node_range_end_it = thrust::make_permutation_iterator(tree.node_range_end.begin(), sorted_node_levels_order.begin()); + auto node_col_ids_it = thrust::make_permutation_iterator(original_col_ids.begin(), sorted_node_levels_order.begin()); + auto node_categories_it = thrust::make_permutation_iterator(tree.node_categories.begin(), sorted_node_levels_order.begin()); + + rmm::device_uvector sorted_subtree_nrows(device_column_subtree_obj.subtree_nrows, stream); + rmm::device_uvector sorted_csr_unique_col_ids(csr_unique_col_ids, stream); + thrust::sort_by_key(rmm::exec_policy(stream), sorted_csr_unique_col_ids.begin(), sorted_csr_unique_col_ids.end(), sorted_subtree_nrows.begin()); + thrust::copy_if(rmm::exec_policy(stream), node_range_begin_it, node_range_begin_it + num_nodes, thrust::make_counting_iterator(0), device_column_subtree_obj.string_offsets.begin(), + [sorted_subtree_nrows = sorted_subtree_nrows.begin(), + node_col_ids_it, node_categories_it] __device__(NodeIndexT node) { + return sorted_subtree_nrows[node_col_ids_it[node]] && (node_categories_it[node] == NC_STR || node_categories_it[node] == NC_VAL); + }); + + auto node_range_lengths_it = thrust::make_transform_iterator(thrust::make_zip_iterator(node_range_begin_it, node_range_end_it), + cuda::proclaim_return_type([] __device__(auto range_it) { + return thrust::get<1>(range_it) - thrust::get<0>(range_it); + })); + thrust::copy_if(rmm::exec_policy(stream), node_range_lengths_it, node_range_lengths_it + num_nodes, thrust::make_counting_iterator(0), device_column_subtree_obj.string_lengths.begin(), + [sorted_subtree_nrows = sorted_subtree_nrows.begin(), + node_col_ids_it, node_categories_it] __device__(NodeIndexT node) { + return sorted_subtree_nrows[node_col_ids_it[node]] && (node_categories_it[node] == NC_STR || node_categories_it[node] == NC_VAL); + }); + + // row_offsets need to be prefix summed across columns! 
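  // A minimal illustrative sketch (not this patch's code), assuming thrust::exclusive_scan is
  // available here: string_offsets / string_lengths are flat buffers shared by all retained
  // string/val columns and the validity bitmap is shared by all retained columns, so a node's
  // destination slot is its in-column row offset plus the total number of rows of the columns
  // that precede it. One way to build those per-column bases, assuming sorted_subtree_nrows
  // holds the row count per column indexed by column id (column_write_base is a hypothetical
  // name introduced only for this sketch):
  rmm::device_uvector<row_offset_t> column_write_base(sorted_subtree_nrows.size(), stream);
  thrust::exclusive_scan(rmm::exec_policy(stream),
                         sorted_subtree_nrows.begin(),
                         sorted_subtree_nrows.end(),
                         column_write_base.begin());
  // A node n of column c would then land at column_write_base[c] + row_offset(n); the
  // replace_if / inclusive_scan just below is this patch's in-progress take on that accumulation.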
+ thrust::replace_if(rmm::exec_policy(stream), row_offsets_it, row_offsets_it + num_nodes, thrust::make_counting_iterator(0), + [sorted_subtree_nrows = sorted_subtree_nrows.begin(), + node_col_ids_it] __device__(NodeIndexT node) { + return sorted_subtree_nrows[node_col_ids_it[node]] > 0; + }, 0); + thrust::inclusive_scan(rmm::exec_policy(stream), row_offsets_it, row_offsets_it + num_nodes, row_offsets_it); + thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), num_nodes, + [sorted_subtree_nrows = sorted_subtree_nrows.begin(), + node_col_ids_it, node_categories_it, row_offsets_it, + validity = static_cast(device_column_subtree_obj.validity.data())] __device__(NodeIndexT node) { + if(sorted_subtree_nrows[node_col_ids_it[node]] && node_categories_it[node] != NC_LIST) + cudf::set_bit(validity, row_offsets_it[node]); + }); return std::tuple{column_tree_csr{std::move(rowidx), std::move(colidx), diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 83e854827d3..cbb195bc95b 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -208,19 +208,17 @@ struct column_tree_csr { rmm::device_uvector range_begin; rmm::device_uvector range_end; // device_json_column properties - // Type used to count number of rows - /* using row_offset_t = size_type; - // The inferred type of this column (list, struct, or value/string column) - std::vector types; + // Indicator array for the device column subtree + // Stores the number of rows in the column if the node is part of device column subtree + // Stores zero otherwise + rmm::device_uvector subtree_nrows; rmm::device_uvector string_offsets; rmm::device_uvector string_lengths; // Row offsets rmm::device_uvector child_offsets; // Validity bitmap rmm::device_buffer validity; - std::vector num_rows; - */ }; namespace detail { From b8e8c07f0891e714d7c9d6de8f018f4beb78e660 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Mon, 22 Jul 2024 16:50:10 +0000 Subject: [PATCH 16/28] formatting --- cpp/src/io/json/json_column_csr.cu | 189 ++++++++++++++++++++--------- 1 file changed, 132 insertions(+), 57 deletions(-) diff --git a/cpp/src/io/json/json_column_csr.cu b/cpp/src/io/json/json_column_csr.cu index b4dbb455371..509dd84846e 100644 --- a/cpp/src/io/json/json_column_csr.cu +++ b/cpp/src/io/json/json_column_csr.cu @@ -49,8 +49,8 @@ #include #include #include -#include #include +#include #include #include #include @@ -204,39 +204,77 @@ device_column_subtree allocation_for_device_column_subtree_annotation( // (Optional?) 
TODO: Validation of the remaining column tree properties // Now we annotate the extracted subtree - using row_offset_t = size_type; + using row_offset_t = size_type; auto num_subtree_nodes = thrust::distance(retained_leaf_nodes_it, leaf_nodes.end()); rmm::device_uvector subtree_nrows(max_row_offsets, stream); - thrust::scatter(rmm::exec_policy(stream), thrust::make_constant_iterator(-1), thrust::make_constant_iterator(-1) + num_columns - num_subtree_nodes, leaf_nodes.begin(), subtree_nrows.begin()); - thrust::transform(rmm::exec_policy(stream), thrust::make_constant_iterator(1), thrust::make_constant_iterator(1) + num_columns, subtree_nrows.begin(), subtree_nrows.begin(), thrust::plus()); + thrust::scatter(rmm::exec_policy(stream), + thrust::make_constant_iterator(-1), + thrust::make_constant_iterator(-1) + num_columns - num_subtree_nodes, + leaf_nodes.begin(), + subtree_nrows.begin()); + thrust::transform(rmm::exec_policy(stream), + thrust::make_constant_iterator(1), + thrust::make_constant_iterator(1) + num_columns, + subtree_nrows.begin(), + subtree_nrows.begin(), + thrust::plus()); // For the subtree, we allocate memory for device column subtree properties rmm::device_uvector subtree_properties_map(num_columns, stream); - thrust::sequence(rmm::exec_policy(stream), subtree_properties_map.begin(), subtree_properties_map.end(), 0); - auto partitioning_idx_it = thrust::partition(rmm::exec_policy(stream), subtree_properties_map.begin(), subtree_properties_map.end(), subtree_nrows.begin(), thrust::identity()); - auto str_partitioning_idx_it = thrust::partition(rmm::exec_policy(stream), subtree_properties_map.begin(), partitioning_idx_it, - [categories = categories.begin()] __device__(NodeIndexT node) { - return categories[node] == NC_STR || categories[node] == NC_VAL; - }); + thrust::sequence( + rmm::exec_policy(stream), subtree_properties_map.begin(), subtree_properties_map.end(), 0); + auto partitioning_idx_it = thrust::partition(rmm::exec_policy(stream), + subtree_properties_map.begin(), + subtree_properties_map.end(), + subtree_nrows.begin(), + thrust::identity()); + auto str_partitioning_idx_it = + thrust::partition(rmm::exec_policy(stream), + subtree_properties_map.begin(), + partitioning_idx_it, + [categories = categories.begin()] __device__(NodeIndexT node) { + return categories[node] == NC_STR || categories[node] == NC_VAL; + }); auto str_val_end = thrust::distance(subtree_properties_map.begin(), str_partitioning_idx_it); - auto max_row_offsets_perm_it = thrust::make_permutation_iterator(max_row_offsets.begin(), subtree_properties_map.begin()); - size_type string_offsets_size = thrust::reduce(rmm::exec_policy(stream), max_row_offsets_perm_it, max_row_offsets_perm_it + str_val_end) + str_val_end; + auto max_row_offsets_perm_it = + thrust::make_permutation_iterator(max_row_offsets.begin(), subtree_properties_map.begin()); + size_type string_offsets_size = + thrust::reduce( + rmm::exec_policy(stream), max_row_offsets_perm_it, max_row_offsets_perm_it + str_val_end) + + str_val_end; rmm::device_uvector string_offsets(string_offsets_size, stream); rmm::device_uvector string_lengths(string_offsets_size, stream); - auto list_partitioning_idx_it = thrust::partition(rmm::exec_policy(stream), str_partitioning_idx_it, partitioning_idx_it, - [categories = categories.begin()] __device__(NodeIndexT node) { - return categories[node] == NC_LIST; - }); + auto list_partitioning_idx_it = + thrust::partition(rmm::exec_policy(stream), + str_partitioning_idx_it, + partitioning_idx_it, + [categories = 
categories.begin()] __device__(NodeIndexT node) { + return categories[node] == NC_LIST; + }); auto list_end = thrust::distance(subtree_properties_map.begin(), list_partitioning_idx_it); - max_row_offsets_perm_it = thrust::make_permutation_iterator(max_row_offsets.begin(), subtree_properties_map.begin()) + str_val_end; - size_type child_offsets_size = thrust::reduce(rmm::exec_policy(stream), max_row_offsets_perm_it, max_row_offsets_perm_it + (list_end - str_val_end)) + 2 * (list_end - str_val_end); + max_row_offsets_perm_it = + thrust::make_permutation_iterator(max_row_offsets.begin(), subtree_properties_map.begin()) + + str_val_end; + size_type child_offsets_size = + thrust::reduce(rmm::exec_policy(stream), + max_row_offsets_perm_it, + max_row_offsets_perm_it + (list_end - str_val_end)) + + 2 * (list_end - str_val_end); rmm::device_uvector child_offsets(child_offsets_size, stream); - auto validity_buffer_size = thrust::reduce(rmm::exec_policy(stream), subtree_nrows.begin(), subtree_nrows.end()); - auto validity = cudf::detail::create_null_mask(validity_buffer_size, cudf::mask_state::ALL_NULL, stream, rmm::mr::get_current_device_resource()); - - return device_column_subtree{std::move(subtree_nrows), std::move(string_offsets), std::move(string_lengths), std::move(child_offsets), std::move(validity)}; + auto validity_buffer_size = + thrust::reduce(rmm::exec_policy(stream), subtree_nrows.begin(), subtree_nrows.end()); + auto validity = cudf::detail::create_null_mask(validity_buffer_size, + cudf::mask_state::ALL_NULL, + stream, + rmm::mr::get_current_device_resource()); + + return device_column_subtree{std::move(subtree_nrows), + std::move(string_offsets), + std::move(string_lengths), + std::move(child_offsets), + std::move(validity)}; } /** @@ -538,54 +576,91 @@ std::tuple> reduce_to_column_tre [] __device__(NodeT type) { return type == NC_STRUCT || type == NC_LIST; }); // this function allocates memory for the annotation - auto device_column_subtree_obj = - allocation_for_device_column_subtree_annotation(rowidx, colidx, csr_column_categories, csr_max_row_offsets, reader_options, stream); + auto device_column_subtree_obj = allocation_for_device_column_subtree_annotation( + rowidx, colidx, csr_column_categories, csr_max_row_offsets, reader_options, stream); // now we actually do the annotation - // relabel original_col_ids with the positions of the csr_unique_col_ids with same element. How do we accomplish this? - // one idea is to sort the row offsets by node level. Just the way we did this for the csr_column_ids - // sort original_col_ids, extract subtree based on the annotation above, + // relabel original_col_ids with the positions of the csr_unique_col_ids with same element. How do + // we accomplish this? one idea is to sort the row offsets by node level. 
Just the way we did this + // for the csr_column_ids sort original_col_ids, extract subtree based on the annotation above, using row_offset_t = size_type; - auto [sorted_node_levels, sorted_node_levels_order] = cudf::io::json::detail::stable_sorted_key_order(tree.node_levels, stream); + auto [sorted_node_levels, sorted_node_levels_order] = + cudf::io::json::detail::stable_sorted_key_order(tree.node_levels, stream); auto num_nodes = original_col_ids.size(); - auto row_offsets_it = thrust::make_permutation_iterator(row_offsets.begin(), sorted_node_levels_order.begin()); - auto node_range_begin_it = thrust::make_permutation_iterator(tree.node_range_begin.begin(), sorted_node_levels_order.begin()); - auto node_range_end_it = thrust::make_permutation_iterator(tree.node_range_end.begin(), sorted_node_levels_order.begin()); - auto node_col_ids_it = thrust::make_permutation_iterator(original_col_ids.begin(), sorted_node_levels_order.begin()); - auto node_categories_it = thrust::make_permutation_iterator(tree.node_categories.begin(), sorted_node_levels_order.begin()); - - rmm::device_uvector sorted_subtree_nrows(device_column_subtree_obj.subtree_nrows, stream); + auto row_offsets_it = + thrust::make_permutation_iterator(row_offsets.begin(), sorted_node_levels_order.begin()); + auto node_range_begin_it = thrust::make_permutation_iterator(tree.node_range_begin.begin(), + sorted_node_levels_order.begin()); + auto node_range_end_it = thrust::make_permutation_iterator(tree.node_range_end.begin(), + sorted_node_levels_order.begin()); + auto node_col_ids_it = + thrust::make_permutation_iterator(original_col_ids.begin(), sorted_node_levels_order.begin()); + auto node_categories_it = thrust::make_permutation_iterator(tree.node_categories.begin(), + sorted_node_levels_order.begin()); + + rmm::device_uvector sorted_subtree_nrows(device_column_subtree_obj.subtree_nrows, + stream); rmm::device_uvector sorted_csr_unique_col_ids(csr_unique_col_ids, stream); - thrust::sort_by_key(rmm::exec_policy(stream), sorted_csr_unique_col_ids.begin(), sorted_csr_unique_col_ids.end(), sorted_subtree_nrows.begin()); - thrust::copy_if(rmm::exec_policy(stream), node_range_begin_it, node_range_begin_it + num_nodes, thrust::make_counting_iterator(0), device_column_subtree_obj.string_offsets.begin(), + thrust::sort_by_key(rmm::exec_policy(stream), + sorted_csr_unique_col_ids.begin(), + sorted_csr_unique_col_ids.end(), + sorted_subtree_nrows.begin()); + thrust::copy_if( + rmm::exec_policy(stream), + node_range_begin_it, + node_range_begin_it + num_nodes, + thrust::make_counting_iterator(0), + device_column_subtree_obj.string_offsets.begin(), [sorted_subtree_nrows = sorted_subtree_nrows.begin(), - node_col_ids_it, node_categories_it] __device__(NodeIndexT node) { - return sorted_subtree_nrows[node_col_ids_it[node]] && (node_categories_it[node] == NC_STR || node_categories_it[node] == NC_VAL); - }); + node_col_ids_it, + node_categories_it] __device__(NodeIndexT node) { + return sorted_subtree_nrows[node_col_ids_it[node]] && + (node_categories_it[node] == NC_STR || node_categories_it[node] == NC_VAL); + }); - auto node_range_lengths_it = thrust::make_transform_iterator(thrust::make_zip_iterator(node_range_begin_it, node_range_end_it), + auto node_range_lengths_it = thrust::make_transform_iterator( + thrust::make_zip_iterator(node_range_begin_it, node_range_end_it), cuda::proclaim_return_type([] __device__(auto range_it) { return thrust::get<1>(range_it) - thrust::get<0>(range_it); - })); - thrust::copy_if(rmm::exec_policy(stream), 
node_range_lengths_it, node_range_lengths_it + num_nodes, thrust::make_counting_iterator(0), device_column_subtree_obj.string_lengths.begin(), + })); + thrust::copy_if( + rmm::exec_policy(stream), + node_range_lengths_it, + node_range_lengths_it + num_nodes, + thrust::make_counting_iterator(0), + device_column_subtree_obj.string_lengths.begin(), [sorted_subtree_nrows = sorted_subtree_nrows.begin(), - node_col_ids_it, node_categories_it] __device__(NodeIndexT node) { - return sorted_subtree_nrows[node_col_ids_it[node]] && (node_categories_it[node] == NC_STR || node_categories_it[node] == NC_VAL); - }); + node_col_ids_it, + node_categories_it] __device__(NodeIndexT node) { + return sorted_subtree_nrows[node_col_ids_it[node]] && + (node_categories_it[node] == NC_STR || node_categories_it[node] == NC_VAL); + }); // row_offsets need to be prefix summed across columns! - thrust::replace_if(rmm::exec_policy(stream), row_offsets_it, row_offsets_it + num_nodes, thrust::make_counting_iterator(0), + thrust::replace_if( + rmm::exec_policy(stream), + row_offsets_it, + row_offsets_it + num_nodes, + thrust::make_counting_iterator(0), + [sorted_subtree_nrows = sorted_subtree_nrows.begin(), node_col_ids_it] __device__( + NodeIndexT node) { return sorted_subtree_nrows[node_col_ids_it[node]] > 0; }, + 0); + thrust::inclusive_scan( + rmm::exec_policy(stream), row_offsets_it, row_offsets_it + num_nodes, row_offsets_it); + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + num_nodes, [sorted_subtree_nrows = sorted_subtree_nrows.begin(), - node_col_ids_it] __device__(NodeIndexT node) { - return sorted_subtree_nrows[node_col_ids_it[node]] > 0; - }, 0); - thrust::inclusive_scan(rmm::exec_policy(stream), row_offsets_it, row_offsets_it + num_nodes, row_offsets_it); - thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), num_nodes, - [sorted_subtree_nrows = sorted_subtree_nrows.begin(), - node_col_ids_it, node_categories_it, row_offsets_it, - validity = static_cast(device_column_subtree_obj.validity.data())] __device__(NodeIndexT node) { - if(sorted_subtree_nrows[node_col_ids_it[node]] && node_categories_it[node] != NC_LIST) + node_col_ids_it, + node_categories_it, + row_offsets_it, + validity = static_cast( + device_column_subtree_obj.validity.data())] __device__(NodeIndexT node) { + if (sorted_subtree_nrows[node_col_ids_it[node]] && node_categories_it[node] != NC_LIST) cudf::set_bit(validity, row_offsets_it[node]); - }); + }); + + // scatter list offsets return std::tuple{column_tree_csr{std::move(rowidx), std::move(colidx), From 5541b93db7ac3dae2b7d0e207213397911b9a12a Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Wed, 24 Jul 2024 22:59:45 +0000 Subject: [PATCH 17/28] partial commit --- cpp/src/io/json/json_column_csr.cu | 241 +++++++++++++---------------- cpp/src/io/json/nested_json.hpp | 25 ++- cpp/tests/io/json_tree_csr.cu | 1 + 3 files changed, 127 insertions(+), 140 deletions(-) diff --git a/cpp/src/io/json/json_column_csr.cu b/cpp/src/io/json/json_column_csr.cu index ee65dbc3bc8..866820fa67c 100644 --- a/cpp/src/io/json/json_column_csr.cu +++ b/cpp/src/io/json/json_column_csr.cu @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -51,6 +52,25 @@ namespace cudf::io::json::experimental::detail { +using row_offset_t = size_type; + +struct unvalidated_column_tree { + rmm::device_uvector rowidx; + rmm::device_uvector colidx; + rmm::device_uvector max_row_offsets; + rmm::device_uvector column_categories; +}; + 
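// A minimal illustrative sketch (not code from this patch) of the CSR layout the members above
// describe, for one tiny input row. The column numbering is hypothetical and only assumes the
// level ordering used below, with the root as column 0:
//
//   input row: {"a": {"b": 1, "c": "x"}}
//
//   columns (level order): 0 STRUCT (root) | 1 FN "a" | 2 STRUCT | 3 FN "b", 4 FN "c"
//                          | 5 VAL under "b", 6 STR under "c"
//
// Each non-root column stores its parent first, then its children; the root stores only its
// children. The adjacency arrays would then be
//
//   rowidx = {0, 1, 3, 6, 8, 10, 11, 12}
//   colidx = {1,  0, 2,  1, 3, 4,  2, 5,  2, 6,  3,  4}
//
// e.g. column 2 (the nested STRUCT) owns colidx[rowidx[2] .. rowidx[3]) = {1, 3, 4}: its parent
// (FN "a") followed by its two field-name children "b" and "c".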
+struct level_ordering { + device_span node_levels; + device_span col_ids; + __device__ bool operator()(NodeIndexT lhs_node_id, NodeIndexT rhs_node_id) const + { + return (node_levels[lhs_node_id] < node_levels[rhs_node_id]) || + (node_levels[lhs_node_id] == node_levels[rhs_node_id] && col_ids[lhs_node_id] < col_ids[rhs_node_id]); + } +}; + /** * @brief Reduces node tree representation to column tree CSR representation. * @@ -65,129 +85,93 @@ namespace cudf::io::json::experimental::detail { * @return A tuple of column tree representation of JSON string, column ids of columns, and * max row offsets of columns */ -std::tuple> reduce_to_column_tree_csr( +unvalidated_column_tree reduce_to_column_tree_csr( tree_meta_t& tree, - device_span original_col_ids, - device_span sorted_col_ids, - device_span ordered_node_ids, + device_span col_ids, device_span row_offsets, bool is_array_of_arrays, NodeIndexT const row_array_parent_col_id, rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); - // 1. column count for allocation - auto const num_columns = - thrust::unique_count(rmm::exec_policy(stream), sorted_col_ids.begin(), sorted_col_ids.end()); - rmm::device_uvector unique_node_ids(num_columns, stream); - rmm::device_uvector csr_unique_node_ids(num_columns, stream); - rmm::device_uvector column_levels(num_columns, stream); - thrust::unique_by_key_copy(rmm::exec_policy(stream), - sorted_col_ids.begin(), - sorted_col_ids.end(), - ordered_node_ids.begin(), - thrust::make_discard_iterator(), - unique_node_ids.begin()); - thrust::copy_n( - rmm::exec_policy(stream), - thrust::make_permutation_iterator(tree.node_levels.begin(), unique_node_ids.begin()), - unique_node_ids.size(), - column_levels.begin()); - auto [sorted_column_levels, sorted_column_levels_order] = - cudf::io::json::detail::stable_sorted_key_order(column_levels, stream); + rmm::device_uvector level_ordered_col_ids(col_ids.size(), stream); + rmm::device_uvector level_ordered_node_ids(col_ids.size(), stream); + thrust::copy(rmm::exec_policy_nosync(stream), col_ids.begin(), col_ids.end(), level_ordered_col_ids.begin()); + thrust::sequence(rmm::exec_policy_nosync(stream), level_ordered_node_ids.begin(), level_ordered_node_ids.end()); + + // Reorder nodes and column ids in level-wise fashion + thrust::stable_sort_by_key(rmm::exec_policy_nosync(stream), level_ordered_node_ids.begin(), level_ordered_node_ids.end(), + level_ordered_col_ids.begin(), level_ordering{tree.node_levels, col_ids}); - // 2. reduce_by_key {col_id}, {row_offset}, max. - rmm::device_uvector unique_col_ids(num_columns, stream); + // 1. get the number of columns in tree, mapping between node tree col ids and csr col ids, and the node id of first row in each column + auto const num_columns = + thrust::unique_count(rmm::exec_policy_nosync(stream), level_ordered_col_ids.begin(), level_ordered_col_ids.end()); + rmm::device_uvector level_ordered_unique_node_ids(num_columns, stream); + rmm::device_uvector mapped_col_ids(num_columns, stream); + thrust::unique_by_key_copy(rmm::exec_policy_nosync(stream), level_ordered_col_ids.begin(), level_ordered_node_ids.end(), level_ordered_node_ids.begin(), mapped_col_ids.begin(), level_ordered_unique_node_ids.begin()); + auto rev_mapped_col_ids_it = thrust::make_permutation_iterator(thrust::make_counting_iterator(0), mapped_col_ids.begin()); + + // 2. maximum number of rows per column: computed with reduce_by_key {col_id}, {row_offset}, max. + // 3. 
category for each column node by aggregating all nodes in node tree corresponding to same column: + // reduce_by_key {col_id}, {node_categories} - custom opp (*+v=*, v+v=v, *+#=E) rmm::device_uvector max_row_offsets(num_columns, stream); - rmm::device_uvector csr_unique_col_ids(num_columns, stream); - rmm::device_uvector csr_max_row_offsets(num_columns, stream); + rmm::device_uvector column_categories(num_columns, stream); auto ordered_row_offsets = - thrust::make_permutation_iterator(row_offsets.begin(), ordered_node_ids.begin()); + thrust::make_permutation_iterator(row_offsets.begin(), level_ordered_node_ids.begin()); + auto ordered_node_categories = thrust::make_permutation_iterator(tree.node_categories.begin(), level_ordered_node_ids.begin()); thrust::reduce_by_key(rmm::exec_policy(stream), - sorted_col_ids.begin(), - sorted_col_ids.end(), - ordered_row_offsets, - unique_col_ids.begin(), - max_row_offsets.begin(), - thrust::equal_to(), - thrust::maximum()); - - // 3. reduce_by_key {col_id}, {node_categories} - custom opp (*+v=*, v+v=v, *+#=E) - rmm::device_uvector column_categories(num_columns, stream); - rmm::device_uvector csr_column_categories(num_columns, stream); - thrust::reduce_by_key( - rmm::exec_policy(stream), - sorted_col_ids.begin(), - sorted_col_ids.end(), - thrust::make_permutation_iterator(tree.node_categories.begin(), ordered_node_ids.begin()), - unique_col_ids.begin(), - column_categories.begin(), - thrust::equal_to(), - [] __device__(NodeT type_a, NodeT type_b) -> NodeT { - auto is_a_leaf = (type_a == NC_VAL || type_a == NC_STR); - auto is_b_leaf = (type_b == NC_VAL || type_b == NC_STR); - // (v+v=v, *+*=*, *+v=*, *+#=E, NESTED+VAL=NESTED) - // *+*=*, v+v=v - if (type_a == type_b) { - return type_a; - } else if (is_a_leaf) { - // *+v=*, N+V=N - // STRUCT/LIST + STR/VAL = STRUCT/LIST, STR/VAL + FN = ERR, STR/VAL + STR = STR - return type_b == NC_FN ? NC_ERR : (is_b_leaf ? NC_STR : type_b); - } else if (is_b_leaf) { - return type_a == NC_FN ? NC_ERR : (is_a_leaf ? NC_STR : type_a); - } - // *+#=E - return NC_ERR; - }); + level_ordered_col_ids.begin(), + level_ordered_col_ids.end(), + thrust::make_zip_iterator(thrust::make_tuple(ordered_row_offsets, ordered_node_categories)), + thrust::make_discard_iterator(), + thrust::make_zip_iterator(thrust::make_tuple(max_row_offsets.begin(), column_categories.begin())), + thrust::equal_to(), + [] __device__(auto a, auto b) { + auto row_offset_a = thrust::get<0>(a); + auto row_offset_b = thrust::get<0>(b); + auto type_a = thrust::get<1>(a); + auto type_b = thrust::get<1>(b); + + NodeT max_offset; + auto is_a_leaf = (type_a == NC_VAL || type_a == NC_STR); + auto is_b_leaf = (type_b == NC_VAL || type_b == NC_STR); + // (v+v=v, *+*=*, *+v=*, *+#=E, NESTED+VAL=NESTED) + // *+*=*, v+v=v + if (type_a == type_b) { + max_offset = type_a; + } else if (is_a_leaf) { + // *+v=*, N+V=N + // STRUCT/LIST + STR/VAL = STRUCT/LIST, STR/VAL + FN = ERR, STR/VAL + STR = STR + max_offset = type_b == NC_FN ? NC_ERR : (is_b_leaf ? NC_STR : type_b); + } else if (is_b_leaf) { + max_offset = type_a == NC_FN ? NC_ERR : (is_a_leaf ? 
NC_STR : type_a); + } + // *+#=E + max_offset = NC_ERR; - auto csr_permutation_it = thrust::make_zip_iterator( - thrust::make_permutation_iterator(unique_node_ids.begin(), sorted_column_levels_order.begin()), - thrust::make_permutation_iterator(unique_col_ids.begin(), sorted_column_levels_order.begin()), - thrust::make_permutation_iterator(max_row_offsets.begin(), sorted_column_levels_order.begin()), - thrust::make_permutation_iterator(column_categories.begin(), - sorted_column_levels_order.begin())); - thrust::copy(rmm::exec_policy(stream), - csr_permutation_it, - csr_permutation_it + num_columns, - thrust::make_zip_iterator(csr_unique_node_ids.begin(), - csr_unique_col_ids.begin(), - csr_max_row_offsets.begin(), - csr_column_categories.begin())); + thrust::maximum row_offset_op; + return thrust::make_tuple(row_offset_op(row_offset_a, row_offset_b), max_offset); + }); - // 4. unique_copy parent_node_ids, ranges - rmm::device_uvector csr_parent_col_ids(num_columns, stream); - rmm::device_uvector csr_col_range_begin(num_columns, stream); // Field names - rmm::device_uvector csr_col_range_end(num_columns, stream); + // 4. construct parent_col_ids using permutation iterator + rmm::device_uvector parent_col_ids(num_columns, stream); thrust::copy_n( rmm::exec_policy(stream), - thrust::make_zip_iterator( - thrust::make_permutation_iterator(tree.parent_node_ids.begin(), csr_unique_node_ids.begin()), - thrust::make_permutation_iterator(tree.node_range_begin.begin(), csr_unique_node_ids.begin()), - thrust::make_permutation_iterator(tree.node_range_end.begin(), csr_unique_node_ids.begin())), - csr_unique_node_ids.size(), - thrust::make_zip_iterator( - csr_parent_col_ids.begin(), csr_col_range_begin.begin(), csr_col_range_end.begin())); - - // convert parent_node_ids to parent_col_ids - thrust::transform( - rmm::exec_policy(stream), - csr_parent_col_ids.begin(), - csr_parent_col_ids.end(), - csr_parent_col_ids.begin(), - [col_ids = original_col_ids.begin()] __device__(auto parent_node_id) -> size_type { - return parent_node_id == parent_node_sentinel ? parent_node_sentinel - : col_ids[parent_node_id]; - }); + thrust::make_permutation_iterator(tree.parent_node_ids.begin(), level_ordered_unique_node_ids.begin()), + num_columns, + thrust::make_transform_output_iterator(parent_col_ids.begin(), + [col_ids = col_ids.begin(), rev_mapped_col_ids_it] __device__(auto parent_node_id) -> NodeIndexT { + return parent_node_id == parent_node_sentinel ? parent_node_sentinel : rev_mapped_col_ids_it[col_ids[parent_node_id]]; + })); /* - CSR construction: - 1. Sort column levels and get their ordering - 2. For each column node coln iterated according to sorted_column_levels; do - a. Find nodes that have coln as the parent node -> set adj_coln - b. row idx[coln] = size of adj_coln + 1 - c. col idx[coln] = adj_coln U {parent_col_id[coln]} + 5. CSR construction: + a. Sort column levels and get their ordering + b. For each column node coln iterated according to sorted_column_levels; do + i. Find nodes that have coln as the parent node -> set adj_coln + ii. row idx[coln] = size of adj_coln + 1 + iii. 
col idx[coln] = adj_coln U {parent_col_id[coln]} */ rmm::device_uvector rowidx(num_columns + 1, stream); @@ -196,28 +180,16 @@ std::tuple> reduce_to_column_tre // Note that the first element of csr_parent_col_ids is -1 (parent_node_sentinel) // children adjacency auto num_non_leaf_columns = thrust::unique_count( - rmm::exec_policy(stream), csr_parent_col_ids.begin() + 1, csr_parent_col_ids.end()); + rmm::exec_policy(stream), parent_col_ids.begin() + 1, parent_col_ids.end()); thrust::reduce_by_key(rmm::exec_policy(stream), - csr_parent_col_ids.begin() + 1, - csr_parent_col_ids.end(), + parent_col_ids.begin() + 1, + parent_col_ids.end(), thrust::make_constant_iterator(1), thrust::make_discard_iterator(), rowidx.begin() + 1, thrust::equal_to()); thrust::inclusive_scan( rmm::exec_policy(stream), rowidx.begin() + 1, rowidx.end(), rowidx.begin() + 1); - // overwrite the csr_parent_col_ids with the col ids in the csr tree - thrust::fill(rmm::exec_policy(stream), csr_parent_col_ids.begin(), csr_parent_col_ids.end(), -1); - thrust::scatter(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(0) + num_non_leaf_columns, - rowidx.begin(), - csr_parent_col_ids.begin() + 1); - thrust::inclusive_scan(rmm::exec_policy(stream), - csr_parent_col_ids.begin(), - csr_parent_col_ids.end(), - csr_parent_col_ids.begin(), - thrust::maximum{}); // We are discarding the parent of the root node. Add the parent adjacency. Since we have already // performed the scan, we use a counting iterator to add thrust::transform(rmm::exec_policy(stream), @@ -228,32 +200,31 @@ std::tuple> reduce_to_column_tre thrust::plus()); rmm::device_uvector colidx((num_columns - 1) * 2, stream); - thrust::fill(rmm::exec_policy(stream), colidx.begin(), colidx.end(), 0); + // Skip the parent of root node thrust::scatter(rmm::exec_policy(stream), - csr_parent_col_ids.begin() + 1, - csr_parent_col_ids.end(), + parent_col_ids.begin() + 1, + parent_col_ids.end(), rowidx.begin() + 1, colidx.begin()); - // excluding root node + // excluding root node, construct scatter map rmm::device_uvector map(num_columns - 1, stream); - thrust::fill(rmm::exec_policy(stream), map.begin(), map.end(), 1); thrust::inclusive_scan_by_key(rmm::exec_policy(stream), - csr_parent_col_ids.begin() + 1, - csr_parent_col_ids.end(), - map.begin(), + parent_col_ids.begin() + 1, + parent_col_ids.end(), + thrust::make_constant_iterator(1), map.begin()); - thrust::for_each(rmm::exec_policy(stream), + thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(1), - thrust::make_counting_iterator(1) + num_columns - 1, + num_columns - 1, [rowidx = rowidx.begin(), map = map.begin(), - csr_parent_col_ids = csr_parent_col_ids.begin()] __device__(auto i) { - auto csr_parent_col_id = csr_parent_col_ids[i]; - if (csr_parent_col_id == 0) + parent_col_ids = parent_col_ids.begin()] __device__(auto i) { + auto parent_col_id = parent_col_ids[i]; + if (parent_col_id == 0) map[i - 1]--; else - map[i - 1] += rowidx[csr_parent_col_id]; + map[i - 1] += rowidx[parent_col_id]; }); thrust::scatter(rmm::exec_policy(stream), thrust::make_counting_iterator(1), @@ -281,6 +252,8 @@ std::tuple> reduce_to_column_tre list_parents_children_max_row_offsets.begin(), list_parents_children_max_row_offsets.end(), 0); + auto list_nodes = thrust::make_permutation_iterator + thrust::for_each(rmm::exec_policy(stream), csr_unique_col_ids.begin(), csr_unique_col_ids.end(), diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 
20019a703c9..1e617240159 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -187,15 +187,28 @@ struct device_json_column { }; namespace experimental { -struct column_tree_csr { +/* + * @brief Unvalidated column tree stored in Compressed Sparse Row (CSR) format. The device json column + * subtree - the subgraph that conforms to column tree properties - is extracted and further processed + * according to the JSON reader options passed. Only the final processed subgraph is annotated with information + * required to construct cuDF columns. + */ +struct column_tree { // position of nnzs rmm::device_uvector rowidx; rmm::device_uvector colidx; - // node properties - rmm::device_uvector column_ids; - rmm::device_uvector categories; - rmm::device_uvector range_begin; - rmm::device_uvector range_end; + // device_json_column properties + using row_offset_t = size_type; + // Indicator array for the device column subtree + // Stores the number of rows in the column if the node is part of device column subtree + // Stores zero otherwise + rmm::device_uvector subtree_nrows; + rmm::device_uvector string_offsets; + rmm::device_uvector string_lengths; + // Row offsets + rmm::device_uvector child_offsets; + // Validity bitmap + rmm::device_buffer validity; }; namespace detail { diff --git a/cpp/tests/io/json_tree_csr.cu b/cpp/tests/io/json_tree_csr.cu index 12e92551521..594d7dde640 100644 --- a/cpp/tests/io/json_tree_csr.cu +++ b/cpp/tests/io/json_tree_csr.cu @@ -185,6 +185,7 @@ TEST_F(JsonColumnTreeTests, SimpleLines) stream); auto iseq = check_equality(d_column_tree, d_column_tree_csr, stream); + std::cout << "iseq = " << iseq << std::endl; // assert equality between csr and meta formats assert(iseq == true); } From d05e670916cf4ece469c2c76c8eedcafd93b9a65 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Tue, 30 Jul 2024 22:07:34 +0000 Subject: [PATCH 18/28] better csr construction --- cpp/src/io/json/json_column_csr.cu | 199 +++++++++-------------- cpp/src/io/json/nested_json.hpp | 19 ++- cpp/tests/CMakeLists.txt | 1 + cpp/tests/io/{ => json}/json_tree_csr.cu | 50 +++--- 4 files changed, 121 insertions(+), 148 deletions(-) rename cpp/tests/io/{ => json}/json_tree_csr.cu (81%) diff --git a/cpp/src/io/json/json_column_csr.cu b/cpp/src/io/json/json_column_csr.cu index 866820fa67c..3bec413435c 100644 --- a/cpp/src/io/json/json_column_csr.cu +++ b/cpp/src/io/json/json_column_csr.cu @@ -48,19 +48,15 @@ #include #include #include +#include #include +#include + namespace cudf::io::json::experimental::detail { using row_offset_t = size_type; -struct unvalidated_column_tree { - rmm::device_uvector rowidx; - rmm::device_uvector colidx; - rmm::device_uvector max_row_offsets; - rmm::device_uvector column_categories; -}; - struct level_ordering { device_span node_levels; device_span col_ids; @@ -71,6 +67,24 @@ struct level_ordering { } }; +struct parent_nodeids_to_colids { + device_span col_ids; + device_span rev_mapped_col_ids; + __device__ auto operator()(NodeIndexT parent_node_id) -> NodeIndexT { + return parent_node_id == parent_node_sentinel ? parent_node_sentinel : rev_mapped_col_ids[col_ids[parent_node_id]]; + } +}; + +template +void print(device_span d_vec, std::string name, rmm::cuda_stream_view stream) { + auto h_vec = cudf::detail::make_std_vector_async(d_vec, stream); + std::cout << name << " = "; + for(auto e : h_vec) { + std::cout << e << " "; + } + std::cout << std::endl; +} + /** * @brief Reduces node tree representation to column tree CSR representation. 
* @@ -85,10 +99,10 @@ struct level_ordering { * @return A tuple of column tree representation of JSON string, column ids of columns, and * max row offsets of columns */ -unvalidated_column_tree reduce_to_column_tree_csr( +std::tuple reduce_to_column_tree( tree_meta_t& tree, device_span col_ids, - device_span row_offsets, + device_span row_offsets, bool is_array_of_arrays, NodeIndexT const row_array_parent_col_id, rmm::cuda_stream_view stream) @@ -109,13 +123,19 @@ unvalidated_column_tree reduce_to_column_tree_csr( thrust::unique_count(rmm::exec_policy_nosync(stream), level_ordered_col_ids.begin(), level_ordered_col_ids.end()); rmm::device_uvector level_ordered_unique_node_ids(num_columns, stream); rmm::device_uvector mapped_col_ids(num_columns, stream); + rmm::device_uvector rev_mapped_col_ids(num_columns, stream); thrust::unique_by_key_copy(rmm::exec_policy_nosync(stream), level_ordered_col_ids.begin(), level_ordered_node_ids.end(), level_ordered_node_ids.begin(), mapped_col_ids.begin(), level_ordered_unique_node_ids.begin()); - auto rev_mapped_col_ids_it = thrust::make_permutation_iterator(thrust::make_counting_iterator(0), mapped_col_ids.begin()); + auto *dev_num_levels_ptr = thrust::max_element(rmm::exec_policy(stream), tree.node_levels.begin(), tree.node_levels.end()); + + rmm::device_uvector mapped_col_ids_copy(num_columns, stream); + thrust::copy(rmm::exec_policy(stream), mapped_col_ids.begin(), mapped_col_ids.end(), mapped_col_ids_copy.begin()); + thrust::sequence(rmm::exec_policy(stream), rev_mapped_col_ids.begin(), rev_mapped_col_ids.end()); + thrust::sort_by_key(rmm::exec_policy(stream), mapped_col_ids_copy.begin(), mapped_col_ids_copy.end(), rev_mapped_col_ids.begin()); // 2. maximum number of rows per column: computed with reduce_by_key {col_id}, {row_offset}, max. // 3. category for each column node by aggregating all nodes in node tree corresponding to same column: // reduce_by_key {col_id}, {node_categories} - custom opp (*+v=*, v+v=v, *+#=E) - rmm::device_uvector max_row_offsets(num_columns, stream); + rmm::device_uvector max_row_offsets(num_columns, stream); rmm::device_uvector column_categories(num_columns, stream); auto ordered_row_offsets = thrust::make_permutation_iterator(row_offsets.begin(), level_ordered_node_ids.begin()); @@ -133,37 +153,34 @@ unvalidated_column_tree reduce_to_column_tree_csr( auto type_a = thrust::get<1>(a); auto type_b = thrust::get<1>(b); - NodeT max_offset; + NodeT ctg; auto is_a_leaf = (type_a == NC_VAL || type_a == NC_STR); auto is_b_leaf = (type_b == NC_VAL || type_b == NC_STR); // (v+v=v, *+*=*, *+v=*, *+#=E, NESTED+VAL=NESTED) // *+*=*, v+v=v if (type_a == type_b) { - max_offset = type_a; + ctg = type_a; } else if (is_a_leaf) { // *+v=*, N+V=N // STRUCT/LIST + STR/VAL = STRUCT/LIST, STR/VAL + FN = ERR, STR/VAL + STR = STR - max_offset = type_b == NC_FN ? NC_ERR : (is_b_leaf ? NC_STR : type_b); + ctg = (type_b == NC_FN ? NC_ERR : (is_b_leaf ? NC_STR : type_b)); } else if (is_b_leaf) { - max_offset = type_a == NC_FN ? NC_ERR : (is_a_leaf ? NC_STR : type_a); + ctg = (type_a == NC_FN ? NC_ERR : (is_a_leaf ? NC_STR : type_a)); } - // *+#=E - max_offset = NC_ERR; + else ctg = NC_ERR; thrust::maximum row_offset_op; - return thrust::make_tuple(row_offset_op(row_offset_a, row_offset_b), max_offset); + return thrust::make_tuple(row_offset_op(row_offset_a, row_offset_b), ctg); }); // 4. 
construct parent_col_ids using permutation iterator rmm::device_uvector parent_col_ids(num_columns, stream); + thrust::transform_output_iterator parent_col_ids_it(parent_col_ids.begin(), parent_nodeids_to_colids{col_ids, rev_mapped_col_ids}); thrust::copy_n( rmm::exec_policy(stream), thrust::make_permutation_iterator(tree.parent_node_ids.begin(), level_ordered_unique_node_ids.begin()), num_columns, - thrust::make_transform_output_iterator(parent_col_ids.begin(), - [col_ids = col_ids.begin(), rev_mapped_col_ids_it] __device__(auto parent_node_id) -> NodeIndexT { - return parent_node_id == parent_node_sentinel ? parent_node_sentinel : rev_mapped_col_ids_it[col_ids[parent_node_id]]; - })); + parent_col_ids_it); /* 5. CSR construction: @@ -176,7 +193,6 @@ unvalidated_column_tree reduce_to_column_tree_csr( rmm::device_uvector rowidx(num_columns + 1, stream); thrust::fill(rmm::exec_policy(stream), rowidx.begin(), rowidx.end(), 0); - // Note that the first element of csr_parent_col_ids is -1 (parent_node_sentinel) // children adjacency auto num_non_leaf_columns = thrust::unique_count( @@ -188,19 +204,18 @@ unvalidated_column_tree reduce_to_column_tree_csr( thrust::make_discard_iterator(), rowidx.begin() + 1, thrust::equal_to()); - thrust::inclusive_scan( - rmm::exec_policy(stream), rowidx.begin() + 1, rowidx.end(), rowidx.begin() + 1); - // We are discarding the parent of the root node. Add the parent adjacency. Since we have already - // performed the scan, we use a counting iterator to add - thrust::transform(rmm::exec_policy(stream), - rowidx.begin() + 2, - rowidx.end(), - thrust::make_counting_iterator(1), - rowidx.begin() + 2, - thrust::plus()); + thrust::transform_inclusive_scan(rmm::exec_policy(stream), + thrust::make_zip_iterator(thrust::make_counting_iterator(1), rowidx.begin() + 1), + thrust::make_zip_iterator(thrust::make_counting_iterator(1) + num_columns, rowidx.end()), + rowidx.begin() + 1, + cuda::proclaim_return_type([] __device__(auto a) { + auto n = thrust::get<0>(a); + auto idx = thrust::get<1>(a); + return n == 1 ? idx : idx + 1; + }), + thrust::plus{}); rmm::device_uvector colidx((num_columns - 1) * 2, stream); - // Skip the parent of root node thrust::scatter(rmm::exec_policy(stream), parent_col_ids.begin() + 1, @@ -232,102 +247,44 @@ unvalidated_column_tree reduce_to_column_tree_csr( map.begin(), colidx.begin()); - // condition is true if parent is not a list, or sentinel/root - // Special case to return true if parent is a list and is_array_of_arrays is true - auto is_non_list_parent = [column_categories = column_categories.begin(), - is_array_of_arrays, - row_array_parent_col_id] __device__(auto parent_col_id) -> bool { - return !(parent_col_id == parent_node_sentinel || - column_categories[parent_col_id] == NC_LIST && - (!is_array_of_arrays || parent_col_id != row_array_parent_col_id)); - }; // Mixed types in List children go to different columns, // so all immediate children of list column should have same max_row_offsets. // create list's children max_row_offsets array. (initialize to zero) // atomicMax on children max_row_offsets array. // gather the max_row_offsets from children row offset array. 
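For orientation, here is a small host-side sketch (not part of the patch) of the adjacency layout the rowidx/colidx construction above produces: the reduce_by_key counts children per parent column, the transform_inclusive_scan reserves one extra slot per non-root row for the link back to the parent (the n == 1 case keeps the root's row children-only), and the scatters fill those slots. The four-column tree used below is hypothetical.

#include <cstdio>
#include <vector>

int main()
{
  // Hypothetical column tree: 0 is the root; 1 and 2 are children of 0; 3 is a child of 2.
  std::vector<int> parent_col_ids{-1, 0, 0, 2};
  int const num_columns = static_cast<int>(parent_col_ids.size());

  // rowidx: one slot per child, plus one slot for the parent link in every non-root row.
  std::vector<int> degree(num_columns, 0);
  for (int c = 1; c < num_columns; ++c) {
    degree[parent_col_ids[c]]++;  // child slot in the parent's row
    degree[c]++;                  // parent slot in the child's row
  }
  std::vector<int> rowidx(num_columns + 1, 0);
  for (int c = 0; c < num_columns; ++c)
    rowidx[c + 1] = rowidx[c] + degree[c];

  // colidx: slot 0 of each non-root row holds the parent; the remaining slots hold children.
  std::vector<int> colidx(rowidx[num_columns], 0);
  std::vector<int> next_child_slot(num_columns);
  for (int c = 0; c < num_columns; ++c)
    next_child_slot[c] = rowidx[c] + (c == 0 ? 0 : 1);
  for (int c = 1; c < num_columns; ++c) {
    int const parent  = parent_col_ids[c];
    colidx[rowidx[c]] = parent;             // parent link of column c
    colidx[next_child_slot[parent]++] = c;  // child entry in the parent's row
  }

  // Expected: rowidx = 0 2 3 5 6 and colidx = 1 2 0 0 3 2, i.e. 2 * (num_columns - 1) nonzeros.
  for (int v : rowidx) std::printf("%d ", v);
  std::printf("\n");
  for (int v : colidx) std::printf("%d ", v);
  std::printf("\n");
}

The device code computes the same child slots with inclusive_scan_by_key (the rank of each column among its siblings) offset by rowidx[parent], instead of the per-parent counters used in this sketch.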
{ - rmm::device_uvector list_parents_children_max_row_offsets(num_columns, stream); - thrust::fill(rmm::exec_policy(stream), - list_parents_children_max_row_offsets.begin(), - list_parents_children_max_row_offsets.end(), - 0); - auto list_nodes = thrust::make_permutation_iterator + auto max_row_offsets_it = thrust::make_permutation_iterator(max_row_offsets.begin(), colidx.begin()); + rmm::device_uvector max_children_max_row_offsets(num_columns, stream); + size_t temp_storage_bytes = 0; + cub::DeviceSegmentedReduce::Max(nullptr, temp_storage_bytes, max_row_offsets_it, max_children_max_row_offsets.begin(), num_columns, rowidx.begin(), rowidx.begin() + 1, stream.value()); + rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); + cub::DeviceSegmentedReduce::Max(d_temp_storage.data(), temp_storage_bytes, max_row_offsets_it, max_children_max_row_offsets.begin(), num_columns, rowidx.begin(), rowidx.begin() + 1, stream.value()); - thrust::for_each(rmm::exec_policy(stream), - csr_unique_col_ids.begin(), - csr_unique_col_ids.end(), - [csr_column_categories = csr_column_categories.begin(), - csr_parent_col_ids = csr_parent_col_ids.begin(), - csr_max_row_offsets = csr_max_row_offsets.begin(), - list_parents_children_max_row_offsets = - list_parents_children_max_row_offsets.begin()] __device__(auto col_id) { - auto csr_parent_col_id = csr_parent_col_ids[col_id]; - if (csr_parent_col_id != parent_node_sentinel and - csr_column_categories[csr_parent_col_id] == node_t::NC_LIST) { - cuda::atomic_ref ref{ - *(list_parents_children_max_row_offsets + csr_parent_col_id)}; - ref.fetch_max(csr_max_row_offsets[col_id], - cuda::std::memory_order_relaxed); - } - }); - thrust::gather_if( - rmm::exec_policy(stream), - csr_parent_col_ids.begin(), - csr_parent_col_ids.end(), - csr_parent_col_ids.begin(), - list_parents_children_max_row_offsets.begin(), - csr_max_row_offsets.begin(), - [csr_column_categories = csr_column_categories.begin()] __device__(size_type parent_col_id) { - return parent_col_id != parent_node_sentinel and - csr_column_categories[parent_col_id] == node_t::NC_LIST; - }); + rmm::device_uvector list_ancestors(num_columns, stream); + thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), num_columns, + [rowidx = rowidx.begin(), + colidx = colidx.begin(), + column_categories = column_categories.begin(), + dev_num_levels_ptr, + list_ancestors = list_ancestors.begin()] __device__(NodeIndexT node) { + auto num_levels = *dev_num_levels_ptr; + list_ancestors[node] = node; + for(int level = 0; level < num_levels; level++) { + if(list_ancestors[node] > 0) + list_ancestors[node] = colidx[rowidx[list_ancestors[node]]]; + else list_ancestors[node] = -1; + if(list_ancestors[node] == -1 || column_categories[list_ancestors[node]] == NC_LIST) break; + } + }); + thrust::gather_if(rmm::exec_policy(stream), list_ancestors.begin(), list_ancestors.end(), list_ancestors.begin(), + max_children_max_row_offsets.begin(), max_row_offsets.begin(), + [] __device__(auto ancestor) { + return ancestor != -1; + }); } - // copy lists' max_row_offsets to children. - // all structs should have same size. 
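The cub::DeviceSegmentedReduce::Max calls above follow CUB's usual two-phase pattern: the first call, with a null temporary buffer, only reports the scratch size, and the second call reduces each rowidx-delimited segment, so every column receives the maximum max_row_offset among its adjacent columns. A standalone sketch of that pattern, using plain Thrust containers and made-up data rather than the patch's rmm types:

#include <cub/cub.cuh>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <cstdio>
#include <vector>

int main()
{
  // Two segments of hypothetical "children max row offsets": [0,3) and [3,5).
  std::vector<int> h_values{5, 1, 4, 2, 9};
  std::vector<int> h_offsets{0, 3, 5};
  thrust::device_vector<int> values(h_values.begin(), h_values.end());
  thrust::device_vector<int> offsets(h_offsets.begin(), h_offsets.end());
  thrust::device_vector<int> maxima(2);

  int const* d_in      = thrust::raw_pointer_cast(values.data());
  int const* d_offsets = thrust::raw_pointer_cast(offsets.data());
  int* d_out           = thrust::raw_pointer_cast(maxima.data());

  // First call only sizes the scratch buffer; the second call performs the per-segment maxima.
  size_t temp_bytes = 0;
  cub::DeviceSegmentedReduce::Max(nullptr, temp_bytes, d_in, d_out, 2, d_offsets, d_offsets + 1);
  thrust::device_vector<unsigned char> temp(temp_bytes);
  cub::DeviceSegmentedReduce::Max(
    thrust::raw_pointer_cast(temp.data()), temp_bytes, d_in, d_out, 2, d_offsets, d_offsets + 1);

  thrust::host_vector<int> h_max = maxima;
  std::printf("%d %d\n", h_max[0], h_max[1]);  // expected: 5 9
}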
- thrust::transform_if( - rmm::exec_policy(stream), - csr_unique_col_ids.begin(), - csr_unique_col_ids.end(), - csr_max_row_offsets.begin(), - [csr_column_categories = csr_column_categories.begin(), - is_non_list_parent, - csr_parent_col_ids = csr_parent_col_ids.begin(), - csr_max_row_offsets = csr_max_row_offsets.begin()] __device__(size_type col_id) { - auto parent_col_id = csr_parent_col_ids[col_id]; - // condition is true if parent is not a list, or sentinel/root - while (is_non_list_parent(parent_col_id)) { - col_id = parent_col_id; - parent_col_id = csr_parent_col_ids[parent_col_id]; - } - return csr_max_row_offsets[col_id]; - }, - [csr_column_categories = csr_column_categories.begin(), - is_non_list_parent, - parent_col_ids = csr_parent_col_ids.begin()] __device__(size_type col_id) { - auto parent_col_id = parent_col_ids[col_id]; - // condition is true if parent is not a list, or sentinel/root - return is_non_list_parent(parent_col_id); - }); - - // For Struct and List (to avoid copying entire strings when mixed type as string is enabled) - thrust::transform_if( - rmm::exec_policy(stream), - csr_col_range_begin.begin(), - csr_col_range_begin.end(), - csr_column_categories.begin(), - csr_col_range_end.begin(), - [] __device__(auto i) { return i + 1; }, - [] __device__(NodeT type) { return type == NC_STRUCT || type == NC_LIST; }); - - return std::tuple{column_tree_csr{std::move(rowidx), - std::move(colidx), - std::move(csr_unique_col_ids), - std::move(csr_column_categories), - std::move(csr_col_range_begin), - std::move(csr_col_range_end)}, - std::move(csr_max_row_offsets)}; + return std::tuple{csr{std::move(rowidx), std::move(colidx)}, + column_tree_properties{std::move(column_categories), std::move(max_row_offsets), std::move(mapped_col_ids)}}; } } // namespace cudf::io::json::experimental::detail diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 1e617240159..1d8f24af2fe 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -187,6 +187,20 @@ struct device_json_column { }; namespace experimental { +/* + * @brief Sparse graph adjacency matrix stored in Compressed Sparse Row (CSR) format. + */ +struct csr { + rmm::device_uvector rowidx; + rmm::device_uvector colidx; +}; + +struct column_tree_properties { + rmm::device_uvector categories; + rmm::device_uvector max_row_offsets; + rmm::device_uvector mapped_ids; +}; + /* * @brief Unvalidated column tree stored in Compressed Sparse Row (CSR) format. 
The device json column * subtree - the subgraph that conforms to column tree properties - is extracted and further processed @@ -195,6 +209,7 @@ namespace experimental { */ struct column_tree { // position of nnzs + csr adjacency; rmm::device_uvector rowidx; rmm::device_uvector colidx; // device_json_column properties @@ -223,11 +238,9 @@ namespace detail { * in each column */ -std::tuple> reduce_to_column_tree_csr( +std::tuple reduce_to_column_tree( tree_meta_t& tree, device_span original_col_ids, - device_span sorted_col_ids, - device_span ordered_node_ids, device_span row_offsets, bool is_array_of_arrays, NodeIndexT const row_array_parent_col_id, diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 88187623930..a36facc7de7 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -325,6 +325,7 @@ ConfigureTest(ARROW_IO_SOURCE_TEST io/arrow_io_source_test.cpp) ConfigureTest(MULTIBYTE_SPLIT_TEST io/text/multibyte_split_test.cpp) ConfigureTest(JSON_QUOTE_NORMALIZATION io/json/json_quote_normalization_test.cpp) ConfigureTest(JSON_WHITESPACE_NORMALIZATION io/json/json_whitespace_normalization_test.cu) +ConfigureTest(JSON_TREE_CSR io/json/json_tree_csr.cu) ConfigureTest( DATA_CHUNK_SOURCE_TEST io/text/data_chunk_source_test.cpp GPUS 1 diff --git a/cpp/tests/io/json_tree_csr.cu b/cpp/tests/io/json/json_tree_csr.cu similarity index 81% rename from cpp/tests/io/json_tree_csr.cu rename to cpp/tests/io/json/json_tree_csr.cu index 594d7dde640..18e4cca136e 100644 --- a/cpp/tests/io/json_tree_csr.cu +++ b/cpp/tests/io/json/json_tree_csr.cu @@ -45,15 +45,13 @@ struct h_tree_meta_t { std::vector node_range_end; }; -struct h_column_tree_csr { +struct h_column_tree { // position of nnzs std::vector rowidx; std::vector colidx; // node properties - std::vector column_ids; std::vector categories; - std::vector range_begin; - std::vector range_end; + std::vector column_ids; }; template @@ -66,7 +64,8 @@ void print(std::string str, std::vector& vec) } bool check_equality(cuio_json::tree_meta_t& d_a, - cuio_json::experimental::column_tree_csr& d_b, + cuio_json::experimental::csr& d_b_csr, + cuio_json::experimental::column_tree_properties& d_b_ctp, rmm::cuda_stream_view stream) { // convert from tree_meta_t to column_tree_csr @@ -75,34 +74,40 @@ bool check_equality(cuio_json::tree_meta_t& d_a, cudf::detail::make_std_vector_async(d_a.node_range_begin, stream), cudf::detail::make_std_vector_async(d_a.node_range_end, stream)}; - h_column_tree_csr b{cudf::detail::make_std_vector_async(d_b.rowidx, stream), - cudf::detail::make_std_vector_async(d_b.colidx, stream), - cudf::detail::make_std_vector_async(d_b.column_ids, stream), - cudf::detail::make_std_vector_async(d_b.categories, stream), - cudf::detail::make_std_vector_async(d_b.range_begin, stream), - cudf::detail::make_std_vector_async(d_b.range_end, stream)}; + h_column_tree b{cudf::detail::make_std_vector_async(d_b_csr.rowidx, stream), + cudf::detail::make_std_vector_async(d_b_csr.colidx, stream), + cudf::detail::make_std_vector_async(d_b_ctp.categories, stream), + cudf::detail::make_std_vector_async(d_b_ctp.mapped_ids, stream)}; stream.synchronize(); auto num_nodes = a.parent_node_ids.size(); - if (b.rowidx.size() != num_nodes + 1) return false; + if (b.rowidx.size() != num_nodes + 1) { + return false; + } for (auto pos = b.rowidx[0]; pos < b.rowidx[1]; pos++) { auto v = b.colidx[pos]; - if (a.parent_node_ids[b.column_ids[v]] != b.column_ids[0]) return false; + if (a.parent_node_ids[b.column_ids[v]] != b.column_ids[0]) { + return 
false; + } } for (size_t u = 1; u < num_nodes; u++) { auto v = b.colidx[b.rowidx[u]]; - if (a.parent_node_ids[b.column_ids[u]] != b.column_ids[v]) return false; + if (a.parent_node_ids[b.column_ids[u]] != b.column_ids[v]) { + return false; + } for (auto pos = b.rowidx[u] + 1; pos < b.rowidx[u + 1]; pos++) { v = b.colidx[pos]; - if (a.parent_node_ids[b.column_ids[v]] != b.column_ids[u]) return false; + if (a.parent_node_ids[b.column_ids[v]] != b.column_ids[u]) { + return false; + } } } for (size_t u = 0; u < num_nodes; u++) { - if (a.node_categories[b.column_ids[u]] != b.categories[u]) return false; - if (a.node_range_begin[b.column_ids[u]] != b.range_begin[u]) return false; - if (a.node_range_end[b.column_ids[u]] != b.range_end[u]) return false; + if (a.node_categories[b.column_ids[u]] != b.categories[u]) { + return false; + } } return true; } @@ -174,18 +179,15 @@ TEST_F(JsonColumnTreeTests, SimpleLines) row_array_parent_col_id, stream); - auto [d_column_tree_csr, d_max_row_offsets_csr] = - cudf::io::json::experimental::detail::reduce_to_column_tree_csr(gpu_tree, + auto [d_column_tree_csr, d_column_tree_properties] = + cudf::io::json::experimental::detail::reduce_to_column_tree(gpu_tree, gpu_col_id, - sorted_col_ids, - node_ids, gpu_row_offsets, false, row_array_parent_col_id, stream); - auto iseq = check_equality(d_column_tree, d_column_tree_csr, stream); - std::cout << "iseq = " << iseq << std::endl; + auto iseq = check_equality(d_column_tree, d_column_tree_csr, d_column_tree_properties, stream); // assert equality between csr and meta formats assert(iseq == true); } From 1ce88be731323aef901bcea5cb0049bd1ff9f5a0 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Tue, 30 Jul 2024 22:08:45 +0000 Subject: [PATCH 19/28] formatting --- cpp/src/io/json/json_column_csr.cu | 259 +++++++++++++++++------------ cpp/src/io/json/nested_json.hpp | 10 +- cpp/tests/io/json/json_tree_csr.cu | 34 ++-- 3 files changed, 170 insertions(+), 133 deletions(-) diff --git a/cpp/src/io/json/json_column_csr.cu b/cpp/src/io/json/json_column_csr.cu index 3bec413435c..98edf6faf33 100644 --- a/cpp/src/io/json/json_column_csr.cu +++ b/cpp/src/io/json/json_column_csr.cu @@ -32,6 +32,7 @@ #include #include +#include #include #include #include @@ -51,8 +52,6 @@ #include #include -#include - namespace cudf::io::json::experimental::detail { using row_offset_t = size_type; @@ -63,23 +62,27 @@ struct level_ordering { __device__ bool operator()(NodeIndexT lhs_node_id, NodeIndexT rhs_node_id) const { return (node_levels[lhs_node_id] < node_levels[rhs_node_id]) || - (node_levels[lhs_node_id] == node_levels[rhs_node_id] && col_ids[lhs_node_id] < col_ids[rhs_node_id]); + (node_levels[lhs_node_id] == node_levels[rhs_node_id] && + col_ids[lhs_node_id] < col_ids[rhs_node_id]); } }; struct parent_nodeids_to_colids { device_span col_ids; device_span rev_mapped_col_ids; - __device__ auto operator()(NodeIndexT parent_node_id) -> NodeIndexT { - return parent_node_id == parent_node_sentinel ? parent_node_sentinel : rev_mapped_col_ids[col_ids[parent_node_id]]; + __device__ auto operator()(NodeIndexT parent_node_id) -> NodeIndexT + { + return parent_node_id == parent_node_sentinel ? 
parent_node_sentinel + : rev_mapped_col_ids[col_ids[parent_node_id]]; } }; template -void print(device_span d_vec, std::string name, rmm::cuda_stream_view stream) { +void print(device_span d_vec, std::string name, rmm::cuda_stream_view stream) +{ auto h_vec = cudf::detail::make_std_vector_async(d_vec, stream); std::cout << name << " = "; - for(auto e : h_vec) { + for (auto e : h_vec) { std::cout << e << " "; } std::cout << std::endl; @@ -111,76 +114,99 @@ std::tuple reduce_to_column_tree( rmm::device_uvector level_ordered_col_ids(col_ids.size(), stream); rmm::device_uvector level_ordered_node_ids(col_ids.size(), stream); - thrust::copy(rmm::exec_policy_nosync(stream), col_ids.begin(), col_ids.end(), level_ordered_col_ids.begin()); - thrust::sequence(rmm::exec_policy_nosync(stream), level_ordered_node_ids.begin(), level_ordered_node_ids.end()); + thrust::copy( + rmm::exec_policy_nosync(stream), col_ids.begin(), col_ids.end(), level_ordered_col_ids.begin()); + thrust::sequence( + rmm::exec_policy_nosync(stream), level_ordered_node_ids.begin(), level_ordered_node_ids.end()); // Reorder nodes and column ids in level-wise fashion - thrust::stable_sort_by_key(rmm::exec_policy_nosync(stream), level_ordered_node_ids.begin(), level_ordered_node_ids.end(), - level_ordered_col_ids.begin(), level_ordering{tree.node_levels, col_ids}); + thrust::stable_sort_by_key(rmm::exec_policy_nosync(stream), + level_ordered_node_ids.begin(), + level_ordered_node_ids.end(), + level_ordered_col_ids.begin(), + level_ordering{tree.node_levels, col_ids}); - // 1. get the number of columns in tree, mapping between node tree col ids and csr col ids, and the node id of first row in each column - auto const num_columns = - thrust::unique_count(rmm::exec_policy_nosync(stream), level_ordered_col_ids.begin(), level_ordered_col_ids.end()); + // 1. 
get the number of columns in tree, mapping between node tree col ids and csr col ids, and + // the node id of first row in each column + auto const num_columns = thrust::unique_count( + rmm::exec_policy_nosync(stream), level_ordered_col_ids.begin(), level_ordered_col_ids.end()); rmm::device_uvector level_ordered_unique_node_ids(num_columns, stream); rmm::device_uvector mapped_col_ids(num_columns, stream); rmm::device_uvector rev_mapped_col_ids(num_columns, stream); - thrust::unique_by_key_copy(rmm::exec_policy_nosync(stream), level_ordered_col_ids.begin(), level_ordered_node_ids.end(), level_ordered_node_ids.begin(), mapped_col_ids.begin(), level_ordered_unique_node_ids.begin()); - auto *dev_num_levels_ptr = thrust::max_element(rmm::exec_policy(stream), tree.node_levels.begin(), tree.node_levels.end()); + thrust::unique_by_key_copy(rmm::exec_policy_nosync(stream), + level_ordered_col_ids.begin(), + level_ordered_node_ids.end(), + level_ordered_node_ids.begin(), + mapped_col_ids.begin(), + level_ordered_unique_node_ids.begin()); + auto* dev_num_levels_ptr = + thrust::max_element(rmm::exec_policy(stream), tree.node_levels.begin(), tree.node_levels.end()); rmm::device_uvector mapped_col_ids_copy(num_columns, stream); - thrust::copy(rmm::exec_policy(stream), mapped_col_ids.begin(), mapped_col_ids.end(), mapped_col_ids_copy.begin()); + thrust::copy(rmm::exec_policy(stream), + mapped_col_ids.begin(), + mapped_col_ids.end(), + mapped_col_ids_copy.begin()); thrust::sequence(rmm::exec_policy(stream), rev_mapped_col_ids.begin(), rev_mapped_col_ids.end()); - thrust::sort_by_key(rmm::exec_policy(stream), mapped_col_ids_copy.begin(), mapped_col_ids_copy.end(), rev_mapped_col_ids.begin()); + thrust::sort_by_key(rmm::exec_policy(stream), + mapped_col_ids_copy.begin(), + mapped_col_ids_copy.end(), + rev_mapped_col_ids.begin()); // 2. maximum number of rows per column: computed with reduce_by_key {col_id}, {row_offset}, max. - // 3. category for each column node by aggregating all nodes in node tree corresponding to same column: + // 3. category for each column node by aggregating all nodes in node tree corresponding to same + // column: // reduce_by_key {col_id}, {node_categories} - custom opp (*+v=*, v+v=v, *+#=E) rmm::device_uvector max_row_offsets(num_columns, stream); rmm::device_uvector column_categories(num_columns, stream); auto ordered_row_offsets = thrust::make_permutation_iterator(row_offsets.begin(), level_ordered_node_ids.begin()); - auto ordered_node_categories = thrust::make_permutation_iterator(tree.node_categories.begin(), level_ordered_node_ids.begin()); - thrust::reduce_by_key(rmm::exec_policy(stream), - level_ordered_col_ids.begin(), - level_ordered_col_ids.end(), - thrust::make_zip_iterator(thrust::make_tuple(ordered_row_offsets, ordered_node_categories)), - thrust::make_discard_iterator(), - thrust::make_zip_iterator(thrust::make_tuple(max_row_offsets.begin(), column_categories.begin())), - thrust::equal_to(), - [] __device__(auto a, auto b) { - auto row_offset_a = thrust::get<0>(a); - auto row_offset_b = thrust::get<0>(b); - auto type_a = thrust::get<1>(a); - auto type_b = thrust::get<1>(b); - - NodeT ctg; - auto is_a_leaf = (type_a == NC_VAL || type_a == NC_STR); - auto is_b_leaf = (type_b == NC_VAL || type_b == NC_STR); - // (v+v=v, *+*=*, *+v=*, *+#=E, NESTED+VAL=NESTED) - // *+*=*, v+v=v - if (type_a == type_b) { - ctg = type_a; - } else if (is_a_leaf) { - // *+v=*, N+V=N - // STRUCT/LIST + STR/VAL = STRUCT/LIST, STR/VAL + FN = ERR, STR/VAL + STR = STR - ctg = (type_b == NC_FN ? 
NC_ERR : (is_b_leaf ? NC_STR : type_b)); - } else if (is_b_leaf) { - ctg = (type_a == NC_FN ? NC_ERR : (is_a_leaf ? NC_STR : type_a)); - } - else ctg = NC_ERR; + auto ordered_node_categories = + thrust::make_permutation_iterator(tree.node_categories.begin(), level_ordered_node_ids.begin()); + thrust::reduce_by_key( + rmm::exec_policy(stream), + level_ordered_col_ids.begin(), + level_ordered_col_ids.end(), + thrust::make_zip_iterator(thrust::make_tuple(ordered_row_offsets, ordered_node_categories)), + thrust::make_discard_iterator(), + thrust::make_zip_iterator( + thrust::make_tuple(max_row_offsets.begin(), column_categories.begin())), + thrust::equal_to(), + [] __device__(auto a, auto b) { + auto row_offset_a = thrust::get<0>(a); + auto row_offset_b = thrust::get<0>(b); + auto type_a = thrust::get<1>(a); + auto type_b = thrust::get<1>(b); + + NodeT ctg; + auto is_a_leaf = (type_a == NC_VAL || type_a == NC_STR); + auto is_b_leaf = (type_b == NC_VAL || type_b == NC_STR); + // (v+v=v, *+*=*, *+v=*, *+#=E, NESTED+VAL=NESTED) + // *+*=*, v+v=v + if (type_a == type_b) { + ctg = type_a; + } else if (is_a_leaf) { + // *+v=*, N+V=N + // STRUCT/LIST + STR/VAL = STRUCT/LIST, STR/VAL + FN = ERR, STR/VAL + STR = STR + ctg = (type_b == NC_FN ? NC_ERR : (is_b_leaf ? NC_STR : type_b)); + } else if (is_b_leaf) { + ctg = (type_a == NC_FN ? NC_ERR : (is_a_leaf ? NC_STR : type_a)); + } else + ctg = NC_ERR; - thrust::maximum row_offset_op; - return thrust::make_tuple(row_offset_op(row_offset_a, row_offset_b), ctg); - }); + thrust::maximum row_offset_op; + return thrust::make_tuple(row_offset_op(row_offset_a, row_offset_b), ctg); + }); // 4. construct parent_col_ids using permutation iterator rmm::device_uvector parent_col_ids(num_columns, stream); - thrust::transform_output_iterator parent_col_ids_it(parent_col_ids.begin(), parent_nodeids_to_colids{col_ids, rev_mapped_col_ids}); - thrust::copy_n( - rmm::exec_policy(stream), - thrust::make_permutation_iterator(tree.parent_node_ids.begin(), level_ordered_unique_node_ids.begin()), - num_columns, - parent_col_ids_it); + thrust::transform_output_iterator parent_col_ids_it( + parent_col_ids.begin(), parent_nodeids_to_colids{col_ids, rev_mapped_col_ids}); + thrust::copy_n(rmm::exec_policy(stream), + thrust::make_permutation_iterator(tree.parent_node_ids.begin(), + level_ordered_unique_node_ids.begin()), + num_columns, + parent_col_ids_it); /* 5. CSR construction: @@ -204,16 +230,17 @@ std::tuple reduce_to_column_tree( thrust::make_discard_iterator(), rowidx.begin() + 1, thrust::equal_to()); - thrust::transform_inclusive_scan(rmm::exec_policy(stream), - thrust::make_zip_iterator(thrust::make_counting_iterator(1), rowidx.begin() + 1), - thrust::make_zip_iterator(thrust::make_counting_iterator(1) + num_columns, rowidx.end()), - rowidx.begin() + 1, - cuda::proclaim_return_type([] __device__(auto a) { - auto n = thrust::get<0>(a); - auto idx = thrust::get<1>(a); - return n == 1 ? idx : idx + 1; - }), - thrust::plus{}); + thrust::transform_inclusive_scan( + rmm::exec_policy(stream), + thrust::make_zip_iterator(thrust::make_counting_iterator(1), rowidx.begin() + 1), + thrust::make_zip_iterator(thrust::make_counting_iterator(1) + num_columns, rowidx.end()), + rowidx.begin() + 1, + cuda::proclaim_return_type([] __device__(auto a) { + auto n = thrust::get<0>(a); + auto idx = thrust::get<1>(a); + return n == 1 ? 
idx : idx + 1; + }), + thrust::plus{}); rmm::device_uvector colidx((num_columns - 1) * 2, stream); // Skip the parent of root node @@ -226,21 +253,21 @@ std::tuple reduce_to_column_tree( rmm::device_uvector map(num_columns - 1, stream); thrust::inclusive_scan_by_key(rmm::exec_policy(stream), parent_col_ids.begin() + 1, - parent_col_ids.end(), + parent_col_ids.end(), thrust::make_constant_iterator(1), map.begin()); thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(1), - num_columns - 1, - [rowidx = rowidx.begin(), - map = map.begin(), - parent_col_ids = parent_col_ids.begin()] __device__(auto i) { - auto parent_col_id = parent_col_ids[i]; - if (parent_col_id == 0) - map[i - 1]--; - else - map[i - 1] += rowidx[parent_col_id]; - }); + thrust::make_counting_iterator(1), + num_columns - 1, + [rowidx = rowidx.begin(), + map = map.begin(), + parent_col_ids = parent_col_ids.begin()] __device__(auto i) { + auto parent_col_id = parent_col_ids[i]; + if (parent_col_id == 0) + map[i - 1]--; + else + map[i - 1] += rowidx[parent_col_id]; + }); thrust::scatter(rmm::exec_policy(stream), thrust::make_counting_iterator(1), thrust::make_counting_iterator(1) + num_columns - 1, @@ -253,38 +280,62 @@ std::tuple reduce_to_column_tree( // atomicMax on children max_row_offsets array. // gather the max_row_offsets from children row offset array. { - auto max_row_offsets_it = thrust::make_permutation_iterator(max_row_offsets.begin(), colidx.begin()); + auto max_row_offsets_it = + thrust::make_permutation_iterator(max_row_offsets.begin(), colidx.begin()); rmm::device_uvector max_children_max_row_offsets(num_columns, stream); size_t temp_storage_bytes = 0; - cub::DeviceSegmentedReduce::Max(nullptr, temp_storage_bytes, max_row_offsets_it, max_children_max_row_offsets.begin(), num_columns, rowidx.begin(), rowidx.begin() + 1, stream.value()); + cub::DeviceSegmentedReduce::Max(nullptr, + temp_storage_bytes, + max_row_offsets_it, + max_children_max_row_offsets.begin(), + num_columns, + rowidx.begin(), + rowidx.begin() + 1, + stream.value()); rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); - cub::DeviceSegmentedReduce::Max(d_temp_storage.data(), temp_storage_bytes, max_row_offsets_it, max_children_max_row_offsets.begin(), num_columns, rowidx.begin(), rowidx.begin() + 1, stream.value()); + cub::DeviceSegmentedReduce::Max(d_temp_storage.data(), + temp_storage_bytes, + max_row_offsets_it, + max_children_max_row_offsets.begin(), + num_columns, + rowidx.begin(), + rowidx.begin() + 1, + stream.value()); rmm::device_uvector list_ancestors(num_columns, stream); - thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), num_columns, - [rowidx = rowidx.begin(), - colidx = colidx.begin(), - column_categories = column_categories.begin(), - dev_num_levels_ptr, - list_ancestors = list_ancestors.begin()] __device__(NodeIndexT node) { - auto num_levels = *dev_num_levels_ptr; - list_ancestors[node] = node; - for(int level = 0; level < num_levels; level++) { - if(list_ancestors[node] > 0) - list_ancestors[node] = colidx[rowidx[list_ancestors[node]]]; - else list_ancestors[node] = -1; - if(list_ancestors[node] == -1 || column_categories[list_ancestors[node]] == NC_LIST) break; - } - }); - thrust::gather_if(rmm::exec_policy(stream), list_ancestors.begin(), list_ancestors.end(), list_ancestors.begin(), - max_children_max_row_offsets.begin(), max_row_offsets.begin(), - [] __device__(auto ancestor) { - return ancestor != -1; - }); + thrust::for_each_n( + rmm::exec_policy(stream), + 
thrust::make_counting_iterator(0), + num_columns, + [rowidx = rowidx.begin(), + colidx = colidx.begin(), + column_categories = column_categories.begin(), + dev_num_levels_ptr, + list_ancestors = list_ancestors.begin()] __device__(NodeIndexT node) { + auto num_levels = *dev_num_levels_ptr; + list_ancestors[node] = node; + for (int level = 0; level < num_levels; level++) { + if (list_ancestors[node] > 0) + list_ancestors[node] = colidx[rowidx[list_ancestors[node]]]; + else + list_ancestors[node] = -1; + if (list_ancestors[node] == -1 || column_categories[list_ancestors[node]] == NC_LIST) + break; + } + }); + thrust::gather_if(rmm::exec_policy(stream), + list_ancestors.begin(), + list_ancestors.end(), + list_ancestors.begin(), + max_children_max_row_offsets.begin(), + max_row_offsets.begin(), + [] __device__(auto ancestor) { return ancestor != -1; }); } - return std::tuple{csr{std::move(rowidx), std::move(colidx)}, - column_tree_properties{std::move(column_categories), std::move(max_row_offsets), std::move(mapped_col_ids)}}; + return std::tuple{ + csr{std::move(rowidx), std::move(colidx)}, + column_tree_properties{ + std::move(column_categories), std::move(max_row_offsets), std::move(mapped_col_ids)}}; } } // namespace cudf::io::json::experimental::detail diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 1d8f24af2fe..f4ecb3b97b4 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -188,7 +188,7 @@ struct device_json_column { namespace experimental { /* - * @brief Sparse graph adjacency matrix stored in Compressed Sparse Row (CSR) format. + * @brief Sparse graph adjacency matrix stored in Compressed Sparse Row (CSR) format. */ struct csr { rmm::device_uvector rowidx; @@ -202,10 +202,10 @@ struct column_tree_properties { }; /* - * @brief Unvalidated column tree stored in Compressed Sparse Row (CSR) format. The device json column - * subtree - the subgraph that conforms to column tree properties - is extracted and further processed - * according to the JSON reader options passed. Only the final processed subgraph is annotated with information - * required to construct cuDF columns. + * @brief Unvalidated column tree stored in Compressed Sparse Row (CSR) format. The device json + * column subtree - the subgraph that conforms to column tree properties - is extracted and further + * processed according to the JSON reader options passed. Only the final processed subgraph is + * annotated with information required to construct cuDF columns. 
*/ struct column_tree { // position of nnzs diff --git a/cpp/tests/io/json/json_tree_csr.cu b/cpp/tests/io/json/json_tree_csr.cu index 18e4cca136e..bb3f835177f 100644 --- a/cpp/tests/io/json/json_tree_csr.cu +++ b/cpp/tests/io/json/json_tree_csr.cu @@ -75,39 +75,29 @@ bool check_equality(cuio_json::tree_meta_t& d_a, cudf::detail::make_std_vector_async(d_a.node_range_end, stream)}; h_column_tree b{cudf::detail::make_std_vector_async(d_b_csr.rowidx, stream), - cudf::detail::make_std_vector_async(d_b_csr.colidx, stream), - cudf::detail::make_std_vector_async(d_b_ctp.categories, stream), - cudf::detail::make_std_vector_async(d_b_ctp.mapped_ids, stream)}; + cudf::detail::make_std_vector_async(d_b_csr.colidx, stream), + cudf::detail::make_std_vector_async(d_b_ctp.categories, stream), + cudf::detail::make_std_vector_async(d_b_ctp.mapped_ids, stream)}; stream.synchronize(); auto num_nodes = a.parent_node_ids.size(); - if (b.rowidx.size() != num_nodes + 1) { - return false; - } + if (b.rowidx.size() != num_nodes + 1) { return false; } for (auto pos = b.rowidx[0]; pos < b.rowidx[1]; pos++) { auto v = b.colidx[pos]; - if (a.parent_node_ids[b.column_ids[v]] != b.column_ids[0]) { - return false; - } + if (a.parent_node_ids[b.column_ids[v]] != b.column_ids[0]) { return false; } } for (size_t u = 1; u < num_nodes; u++) { auto v = b.colidx[b.rowidx[u]]; - if (a.parent_node_ids[b.column_ids[u]] != b.column_ids[v]) { - return false; - } + if (a.parent_node_ids[b.column_ids[u]] != b.column_ids[v]) { return false; } for (auto pos = b.rowidx[u] + 1; pos < b.rowidx[u + 1]; pos++) { v = b.colidx[pos]; - if (a.parent_node_ids[b.column_ids[v]] != b.column_ids[u]) { - return false; - } + if (a.parent_node_ids[b.column_ids[v]] != b.column_ids[u]) { return false; } } } for (size_t u = 0; u < num_nodes; u++) { - if (a.node_categories[b.column_ids[u]] != b.categories[u]) { - return false; - } + if (a.node_categories[b.column_ids[u]] != b.categories[u]) { return false; } } return true; } @@ -180,12 +170,8 @@ TEST_F(JsonColumnTreeTests, SimpleLines) stream); auto [d_column_tree_csr, d_column_tree_properties] = - cudf::io::json::experimental::detail::reduce_to_column_tree(gpu_tree, - gpu_col_id, - gpu_row_offsets, - false, - row_array_parent_col_id, - stream); + cudf::io::json::experimental::detail::reduce_to_column_tree( + gpu_tree, gpu_col_id, gpu_row_offsets, false, row_array_parent_col_id, stream); auto iseq = check_equality(d_column_tree, d_column_tree_csr, d_column_tree_properties, stream); // assert equality between csr and meta formats From d6d724ca4f624f2d41b658c5d4577c56fe62a853 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Tue, 30 Jul 2024 22:22:09 +0000 Subject: [PATCH 20/28] exec policy is no sync --- cpp/src/io/json/json_column_csr.cu | 35 +++++++++++++++--------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/cpp/src/io/json/json_column_csr.cu b/cpp/src/io/json/json_column_csr.cu index 98edf6faf33..b7968a636c2 100644 --- a/cpp/src/io/json/json_column_csr.cu +++ b/cpp/src/io/json/json_column_csr.cu @@ -139,16 +139,17 @@ std::tuple reduce_to_column_tree( level_ordered_node_ids.begin(), mapped_col_ids.begin(), level_ordered_unique_node_ids.begin()); - auto* dev_num_levels_ptr = - thrust::max_element(rmm::exec_policy(stream), tree.node_levels.begin(), tree.node_levels.end()); + auto* dev_num_levels_ptr = thrust::max_element( + rmm::exec_policy_nosync(stream), tree.node_levels.begin(), tree.node_levels.end()); rmm::device_uvector mapped_col_ids_copy(num_columns, stream); - 
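This commit swaps rmm::exec_policy for rmm::exec_policy_nosync throughout the function. Both policies enqueue the Thrust algorithms on the given stream; the nosync variant is built on thrust::cuda::par_nosync, so Thrust may skip the blocking stream synchronization it otherwise performs before returning, which is safe here because every later algorithm consumes the results on the same stream. A minimal sketch of the pattern (assuming an rmm version that ships exec_policy_nosync; the sum_ones helper is illustrative, not part of the patch):

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>
#include <thrust/fill.h>
#include <thrust/reduce.h>

int sum_ones(rmm::cuda_stream_view stream)
{
  rmm::device_uvector<int> v(1024, stream);
  // Safe with the nosync policy: the next algorithm runs on the same stream.
  thrust::fill(rmm::exec_policy_nosync(stream), v.begin(), v.end(), 1);
  // thrust::reduce returns a host value, so it must wait for the stream regardless of policy.
  return thrust::reduce(rmm::exec_policy(stream), v.begin(), v.end(), 0);
}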
thrust::copy(rmm::exec_policy(stream), + thrust::copy(rmm::exec_policy_nosync(stream), mapped_col_ids.begin(), mapped_col_ids.end(), mapped_col_ids_copy.begin()); - thrust::sequence(rmm::exec_policy(stream), rev_mapped_col_ids.begin(), rev_mapped_col_ids.end()); - thrust::sort_by_key(rmm::exec_policy(stream), + thrust::sequence( + rmm::exec_policy_nosync(stream), rev_mapped_col_ids.begin(), rev_mapped_col_ids.end()); + thrust::sort_by_key(rmm::exec_policy_nosync(stream), mapped_col_ids_copy.begin(), mapped_col_ids_copy.end(), rev_mapped_col_ids.begin()); @@ -164,7 +165,7 @@ std::tuple reduce_to_column_tree( auto ordered_node_categories = thrust::make_permutation_iterator(tree.node_categories.begin(), level_ordered_node_ids.begin()); thrust::reduce_by_key( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), level_ordered_col_ids.begin(), level_ordered_col_ids.end(), thrust::make_zip_iterator(thrust::make_tuple(ordered_row_offsets, ordered_node_categories)), @@ -202,7 +203,7 @@ std::tuple reduce_to_column_tree( rmm::device_uvector parent_col_ids(num_columns, stream); thrust::transform_output_iterator parent_col_ids_it( parent_col_ids.begin(), parent_nodeids_to_colids{col_ids, rev_mapped_col_ids}); - thrust::copy_n(rmm::exec_policy(stream), + thrust::copy_n(rmm::exec_policy_nosync(stream), thrust::make_permutation_iterator(tree.parent_node_ids.begin(), level_ordered_unique_node_ids.begin()), num_columns, @@ -218,12 +219,12 @@ std::tuple reduce_to_column_tree( */ rmm::device_uvector rowidx(num_columns + 1, stream); - thrust::fill(rmm::exec_policy(stream), rowidx.begin(), rowidx.end(), 0); + thrust::fill(rmm::exec_policy_nosync(stream), rowidx.begin(), rowidx.end(), 0); // Note that the first element of csr_parent_col_ids is -1 (parent_node_sentinel) // children adjacency auto num_non_leaf_columns = thrust::unique_count( - rmm::exec_policy(stream), parent_col_ids.begin() + 1, parent_col_ids.end()); - thrust::reduce_by_key(rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), parent_col_ids.begin() + 1, parent_col_ids.end()); + thrust::reduce_by_key(rmm::exec_policy_nosync(stream), parent_col_ids.begin() + 1, parent_col_ids.end(), thrust::make_constant_iterator(1), @@ -231,7 +232,7 @@ std::tuple reduce_to_column_tree( rowidx.begin() + 1, thrust::equal_to()); thrust::transform_inclusive_scan( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), thrust::make_zip_iterator(thrust::make_counting_iterator(1), rowidx.begin() + 1), thrust::make_zip_iterator(thrust::make_counting_iterator(1) + num_columns, rowidx.end()), rowidx.begin() + 1, @@ -244,19 +245,19 @@ std::tuple reduce_to_column_tree( rmm::device_uvector colidx((num_columns - 1) * 2, stream); // Skip the parent of root node - thrust::scatter(rmm::exec_policy(stream), + thrust::scatter(rmm::exec_policy_nosync(stream), parent_col_ids.begin() + 1, parent_col_ids.end(), rowidx.begin() + 1, colidx.begin()); // excluding root node, construct scatter map rmm::device_uvector map(num_columns - 1, stream); - thrust::inclusive_scan_by_key(rmm::exec_policy(stream), + thrust::inclusive_scan_by_key(rmm::exec_policy_nosync(stream), parent_col_ids.begin() + 1, parent_col_ids.end(), thrust::make_constant_iterator(1), map.begin()); - thrust::for_each_n(rmm::exec_policy(stream), + thrust::for_each_n(rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(1), num_columns - 1, [rowidx = rowidx.begin(), @@ -268,7 +269,7 @@ std::tuple reduce_to_column_tree( else map[i - 1] += rowidx[parent_col_id]; }); - 
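The copy/sequence/sort_by_key trio near the top of this hunk builds rev_mapped_col_ids as the inverse of the mapped_col_ids permutation: indices 0..N-1 are attached as values and sorted by the mapped ids, so afterwards rev_mapped_col_ids[original_col_id] gives the CSR column index. A self-contained sketch of that trick with plain Thrust containers and made-up ids:

#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
#include <cstdio>
#include <vector>

int main()
{
  // mapped[i] = original column id of CSR column i (a permutation of 0..4).
  std::vector<int> h_mapped{3, 0, 4, 1, 2};
  thrust::device_vector<int> keys(h_mapped.begin(), h_mapped.end());  // scratch copy, gets sorted
  thrust::device_vector<int> rev(keys.size());

  thrust::sequence(rev.begin(), rev.end());                    // rev = 0 1 2 3 4
  thrust::sort_by_key(keys.begin(), keys.end(), rev.begin());  // reorder indices by key

  thrust::host_vector<int> h_rev = rev;  // expected: 1 3 4 0 2, i.e. rev[h_mapped[i]] == i
  for (int v : h_rev) std::printf("%d ", v);
  std::printf("\n");
}

The scratch copy is needed because sort_by_key reorders its keys in place, while mapped_col_ids itself is still returned unmodified in column_tree_properties.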
thrust::scatter(rmm::exec_policy(stream), + thrust::scatter(rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(1), thrust::make_counting_iterator(1) + num_columns - 1, map.begin(), @@ -304,7 +305,7 @@ std::tuple reduce_to_column_tree( rmm::device_uvector list_ancestors(num_columns, stream); thrust::for_each_n( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), num_columns, [rowidx = rowidx.begin(), @@ -323,7 +324,7 @@ std::tuple reduce_to_column_tree( break; } }); - thrust::gather_if(rmm::exec_policy(stream), + thrust::gather_if(rmm::exec_policy_nosync(stream), list_ancestors.begin(), list_ancestors.end(), list_ancestors.begin(), From 2622d6bfaa18341495af1a68dc616fcf8f493d08 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Tue, 30 Jul 2024 23:38:37 +0000 Subject: [PATCH 21/28] fix copyright year --- cpp/src/io/json/json_column_csr.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/json/json_column_csr.cu b/cpp/src/io/json/json_column_csr.cu index b7968a636c2..74f065bddfa 100644 --- a/cpp/src/io/json/json_column_csr.cu +++ b/cpp/src/io/json/json_column_csr.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 94983728e04a0eb492e795140967152c977527b8 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Wed, 31 Jul 2024 05:36:09 +0000 Subject: [PATCH 22/28] fixing max row offsets --- cpp/CMakeLists.txt | 2 +- ...umn_csr.cu => column_tree_construction.cu} | 256 ++++++++++++++++-- cpp/src/io/json/json_column.cu | 199 -------------- cpp/src/io/json/nested_json.hpp | 23 +- cpp/tests/io/json/json_tree_csr.cu | 11 +- 5 files changed, 254 insertions(+), 237 deletions(-) rename cpp/src/io/json/{json_column_csr.cu => column_tree_construction.cu} (58%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index d06cfa39ade..cbf87fc67ed 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -393,7 +393,7 @@ add_library( src/io/functions.cpp src/io/json/byte_range_info.cu src/io/json/json_column.cu - src/io/json/json_column_csr.cu + src/io/json/column_tree_construction.cu src/io/json/json_normalization.cu src/io/json/json_tree.cu src/io/json/nested_json_gpu.cu diff --git a/cpp/src/io/json/json_column_csr.cu b/cpp/src/io/json/column_tree_construction.cu similarity index 58% rename from cpp/src/io/json/json_column_csr.cu rename to cpp/src/io/json/column_tree_construction.cu index 74f065bddfa..5ef21096ac9 100644 --- a/cpp/src/io/json/json_column_csr.cu +++ b/cpp/src/io/json/column_tree_construction.cu @@ -52,7 +52,22 @@ #include #include -namespace cudf::io::json::experimental::detail { +namespace cudf::io::json { + +template +void print(device_span d_vec, std::string name, rmm::cuda_stream_view stream) +{ + stream.synchronize(); + auto h_vec = cudf::detail::make_std_vector_async(d_vec, stream); + stream.synchronize(); + std::cout << name << " = "; + for (auto e : h_vec) { + std::cout << e << " "; + } + std::cout << std::endl; +} + +namespace experimental::detail { using row_offset_t = size_type; @@ -77,17 +92,6 @@ struct parent_nodeids_to_colids { } }; -template -void print(device_span d_vec, std::string name, rmm::cuda_stream_view stream) -{ - auto h_vec = cudf::detail::make_std_vector_async(d_vec, stream); - std::cout << name << " = "; - for (auto e : h_vec) { - std::cout << e << " "; - } - std::cout << 
std::endl; -} - /** * @brief Reduces node tree representation to column tree CSR representation. * @@ -168,10 +172,9 @@ std::tuple reduce_to_column_tree( rmm::exec_policy_nosync(stream), level_ordered_col_ids.begin(), level_ordered_col_ids.end(), - thrust::make_zip_iterator(thrust::make_tuple(ordered_row_offsets, ordered_node_categories)), + thrust::make_zip_iterator(ordered_row_offsets, ordered_node_categories), thrust::make_discard_iterator(), - thrust::make_zip_iterator( - thrust::make_tuple(max_row_offsets.begin(), column_categories.begin())), + thrust::make_zip_iterator(max_row_offsets.begin(), column_categories.begin()), thrust::equal_to(), [] __device__(auto a, auto b) { auto row_offset_a = thrust::get<0>(a); @@ -195,8 +198,8 @@ std::tuple reduce_to_column_tree( } else ctg = NC_ERR; - thrust::maximum row_offset_op; - return thrust::make_tuple(row_offset_op(row_offset_a, row_offset_b), ctg); + thrust::maximum row_offset_op; + return thrust::make_pair(row_offset_op(row_offset_a, row_offset_b), ctg); }); // 4. construct parent_col_ids using permutation iterator @@ -277,8 +280,7 @@ std::tuple reduce_to_column_tree( // Mixed types in List children go to different columns, // so all immediate children of list column should have same max_row_offsets. - // create list's children max_row_offsets array. (initialize to zero) - // atomicMax on children max_row_offsets array. + // create list's children max_row_offsets array // gather the max_row_offsets from children row offset array. { auto max_row_offsets_it = @@ -315,15 +317,14 @@ std::tuple reduce_to_column_tree( list_ancestors = list_ancestors.begin()] __device__(NodeIndexT node) { auto num_levels = *dev_num_levels_ptr; list_ancestors[node] = node; - for (int level = 0; level < num_levels; level++) { + for (int level = 0; level <= num_levels; level++) { if (list_ancestors[node] > 0) list_ancestors[node] = colidx[rowidx[list_ancestors[node]]]; - else - list_ancestors[node] = -1; - if (list_ancestors[node] == -1 || column_categories[list_ancestors[node]] == NC_LIST) + if (list_ancestors[node] == 0 || column_categories[list_ancestors[node]] == NC_LIST) break; } }); + thrust::gather_if(rmm::exec_policy_nosync(stream), list_ancestors.begin(), list_ancestors.end(), @@ -339,4 +340,211 @@ std::tuple reduce_to_column_tree( std::move(column_categories), std::move(max_row_offsets), std::move(mapped_col_ids)}}; } -} // namespace cudf::io::json::experimental::detail +} // namespace experimental::detail + +namespace detail { +/** + * @brief Reduces node tree representation to column tree representation. 
+ * + * @param tree Node tree representation of JSON string + * @param original_col_ids Column ids of nodes + * @param sorted_col_ids Sorted column ids of nodes + * @param ordered_node_ids Node ids of nodes sorted by column ids + * @param row_offsets Row offsets of nodes + * @param is_array_of_arrays Whether the tree is an array of arrays + * @param row_array_parent_col_id Column id of row array, if is_array_of_arrays is true + * @param stream CUDA stream used for device memory operations and kernel launches + * @return A tuple of column tree representation of JSON string, column ids of columns, and + * max row offsets of columns + */ +std::tuple, rmm::device_uvector> +reduce_to_column_tree(tree_meta_t& tree, + device_span original_col_ids, + device_span sorted_col_ids, + device_span ordered_node_ids, + device_span row_offsets, + bool is_array_of_arrays, + NodeIndexT const row_array_parent_col_id, + rmm::cuda_stream_view stream) +{ + CUDF_FUNC_RANGE(); + // 1. column count for allocation + auto const num_columns = + thrust::unique_count(rmm::exec_policy(stream), sorted_col_ids.begin(), sorted_col_ids.end()); + + // 2. reduce_by_key {col_id}, {row_offset}, max. + rmm::device_uvector unique_col_ids(num_columns, stream); + rmm::device_uvector max_row_offsets(num_columns, stream); + auto ordered_row_offsets = + thrust::make_permutation_iterator(row_offsets.begin(), ordered_node_ids.begin()); + thrust::reduce_by_key(rmm::exec_policy(stream), + sorted_col_ids.begin(), + sorted_col_ids.end(), + ordered_row_offsets, + unique_col_ids.begin(), + max_row_offsets.begin(), + thrust::equal_to(), + thrust::maximum()); + + // 3. reduce_by_key {col_id}, {node_categories} - custom opp (*+v=*, v+v=v, *+#=E) + rmm::device_uvector column_categories(num_columns, stream); + thrust::reduce_by_key( + rmm::exec_policy(stream), + sorted_col_ids.begin(), + sorted_col_ids.end(), + thrust::make_permutation_iterator(tree.node_categories.begin(), ordered_node_ids.begin()), + unique_col_ids.begin(), + column_categories.begin(), + thrust::equal_to(), + [] __device__(NodeT type_a, NodeT type_b) -> NodeT { + auto is_a_leaf = (type_a == NC_VAL || type_a == NC_STR); + auto is_b_leaf = (type_b == NC_VAL || type_b == NC_STR); + // (v+v=v, *+*=*, *+v=*, *+#=E, NESTED+VAL=NESTED) + // *+*=*, v+v=v + if (type_a == type_b) { + return type_a; + } else if (is_a_leaf) { + // *+v=*, N+V=N + // STRUCT/LIST + STR/VAL = STRUCT/LIST, STR/VAL + FN = ERR, STR/VAL + STR = STR + return type_b == NC_FN ? NC_ERR : (is_b_leaf ? NC_STR : type_b); + } else if (is_b_leaf) { + return type_a == NC_FN ? NC_ERR : (is_a_leaf ? NC_STR : type_a); + } + // *+#=E + return NC_ERR; + }); + + // 4. 
unique_copy parent_node_ids, ranges + rmm::device_uvector column_levels(0, stream); // not required + rmm::device_uvector parent_col_ids(num_columns, stream); + rmm::device_uvector col_range_begin(num_columns, stream); // Field names + rmm::device_uvector col_range_end(num_columns, stream); + rmm::device_uvector unique_node_ids(num_columns, stream); + thrust::unique_by_key_copy(rmm::exec_policy(stream), + sorted_col_ids.begin(), + sorted_col_ids.end(), + ordered_node_ids.begin(), + thrust::make_discard_iterator(), + unique_node_ids.begin()); + thrust::copy_n( + rmm::exec_policy(stream), + thrust::make_zip_iterator( + thrust::make_permutation_iterator(tree.parent_node_ids.begin(), unique_node_ids.begin()), + thrust::make_permutation_iterator(tree.node_range_begin.begin(), unique_node_ids.begin()), + thrust::make_permutation_iterator(tree.node_range_end.begin(), unique_node_ids.begin())), + unique_node_ids.size(), + thrust::make_zip_iterator( + parent_col_ids.begin(), col_range_begin.begin(), col_range_end.begin())); + + // convert parent_node_ids to parent_col_ids + thrust::transform( + rmm::exec_policy(stream), + parent_col_ids.begin(), + parent_col_ids.end(), + parent_col_ids.begin(), + [col_ids = original_col_ids.begin()] __device__(auto parent_node_id) -> size_type { + return parent_node_id == parent_node_sentinel ? parent_node_sentinel + : col_ids[parent_node_id]; + }); + + // condition is true if parent is not a list, or sentinel/root + // Special case to return true if parent is a list and is_array_of_arrays is true + auto is_non_list_parent = [column_categories = column_categories.begin(), + is_array_of_arrays, + row_array_parent_col_id] __device__(auto parent_col_id) -> bool { + return !(parent_col_id == parent_node_sentinel || + column_categories[parent_col_id] == NC_LIST && + (!is_array_of_arrays || parent_col_id != row_array_parent_col_id)); + return (parent_col_id != parent_node_sentinel) && + (column_categories[parent_col_id] != NC_LIST) || + (is_array_of_arrays == true && parent_col_id == row_array_parent_col_id); + }; + + // Mixed types in List children go to different columns, + // so all immediate children of list column should have same max_row_offsets. + // create list's children max_row_offsets array. (initialize to zero) + // atomicMax on children max_row_offsets array. + // gather the max_row_offsets from children row offset array. 
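A note on the is_non_list_parent lambda above: it contains two consecutive return statements, so the second one is unreachable and only the first expression takes effect. Restated on the host (illustrative only; the simplified node_t enum and sample array below are hypothetical, not cudf's definitions), the predicate accepts any non-sentinel, non-list parent, and additionally accepts a list parent only when it is the row array of an array-of-arrays input:

#include <cstdio>

enum node_t : int { NC_STRUCT, NC_LIST, NC_STR, NC_VAL, NC_FN, NC_ERR };
constexpr int parent_node_sentinel = -1;

bool is_non_list_parent(int parent_col_id,
                        node_t const* column_categories,
                        bool is_array_of_arrays,
                        int row_array_parent_col_id)
{
  if (parent_col_id == parent_node_sentinel) return false;       // root has no parent column
  if (column_categories[parent_col_id] != NC_LIST) return true;  // struct parents qualify
  // list parents qualify only as the row array of an array-of-arrays input
  return is_array_of_arrays && parent_col_id == row_array_parent_col_id;
}

int main()
{
  node_t const categories[] = {NC_LIST, NC_STRUCT, NC_LIST};
  std::printf("%d %d %d\n",
              is_non_list_parent(-1, categories, false, 0),  // 0: sentinel
              is_non_list_parent(1, categories, false, 0),   // 1: struct parent
              is_non_list_parent(2, categories, false, 0));  // 0: list parent
}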
+ { + rmm::device_uvector list_parents_children_max_row_offsets(num_columns, stream); + thrust::fill(rmm::exec_policy(stream), + list_parents_children_max_row_offsets.begin(), + list_parents_children_max_row_offsets.end(), + 0); + thrust::for_each(rmm::exec_policy(stream), + unique_col_ids.begin(), + unique_col_ids.end(), + [column_categories = column_categories.begin(), + parent_col_ids = parent_col_ids.begin(), + max_row_offsets = max_row_offsets.begin(), + list_parents_children_max_row_offsets = + list_parents_children_max_row_offsets.begin()] __device__(auto col_id) { + auto parent_col_id = parent_col_ids[col_id]; + if (parent_col_id != parent_node_sentinel and + column_categories[parent_col_id] == node_t::NC_LIST) { + cuda::atomic_ref ref{ + *(list_parents_children_max_row_offsets + parent_col_id)}; + ref.fetch_max(max_row_offsets[col_id], cuda::std::memory_order_relaxed); + } + }); + thrust::gather_if( + rmm::exec_policy(stream), + parent_col_ids.begin(), + parent_col_ids.end(), + parent_col_ids.begin(), + list_parents_children_max_row_offsets.begin(), + max_row_offsets.begin(), + [column_categories = column_categories.begin()] __device__(size_type parent_col_id) { + return parent_col_id != parent_node_sentinel and + column_categories[parent_col_id] == node_t::NC_LIST; + }); + } + + // copy lists' max_row_offsets to children. + // all structs should have same size. + thrust::transform_if( + rmm::exec_policy(stream), + unique_col_ids.begin(), + unique_col_ids.end(), + max_row_offsets.begin(), + [column_categories = column_categories.begin(), + is_non_list_parent, + parent_col_ids = parent_col_ids.begin(), + max_row_offsets = max_row_offsets.begin()] __device__(size_type col_id) { + auto parent_col_id = parent_col_ids[col_id]; + // condition is true if parent is not a list, or sentinel/root + while (is_non_list_parent(parent_col_id)) { + col_id = parent_col_id; + parent_col_id = parent_col_ids[parent_col_id]; + } + return max_row_offsets[col_id]; + }, + [column_categories = column_categories.begin(), + is_non_list_parent, + parent_col_ids = parent_col_ids.begin()] __device__(size_type col_id) { + auto parent_col_id = parent_col_ids[col_id]; + // condition is true if parent is not a list, or sentinel/root + return is_non_list_parent(parent_col_id); + }); + + // For Struct and List (to avoid copying entire strings when mixed type as string is enabled) + thrust::transform_if( + rmm::exec_policy(stream), + col_range_begin.begin(), + col_range_begin.end(), + column_categories.begin(), + col_range_end.begin(), + [] __device__(auto i) { return i + 1; }, + [] __device__(NodeT type) { return type == NC_STRUCT || type == NC_LIST; }); + + return std::tuple{tree_meta_t{std::move(column_categories), + std::move(parent_col_ids), + std::move(column_levels), + std::move(col_range_begin), + std::move(col_range_end)}, + std::move(unique_col_ids), + std::move(max_row_offsets)}; +} + +} // namespace detail +} // namespace cudf::io::json diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 70493b90575..2e3c5746520 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -98,205 +98,6 @@ void print_tree(host_span input, printf(" (JSON)\n"); } -/** - * @brief Reduces node tree representation to column tree representation. 
- * - * @param tree Node tree representation of JSON string - * @param original_col_ids Column ids of nodes - * @param sorted_col_ids Sorted column ids of nodes - * @param ordered_node_ids Node ids of nodes sorted by column ids - * @param row_offsets Row offsets of nodes - * @param is_array_of_arrays Whether the tree is an array of arrays - * @param row_array_parent_col_id Column id of row array, if is_array_of_arrays is true - * @param stream CUDA stream used for device memory operations and kernel launches - * @return A tuple of column tree representation of JSON string, column ids of columns, and - * max row offsets of columns - */ -std::tuple, rmm::device_uvector> -reduce_to_column_tree(tree_meta_t& tree, - device_span original_col_ids, - device_span sorted_col_ids, - device_span ordered_node_ids, - device_span row_offsets, - bool is_array_of_arrays, - NodeIndexT const row_array_parent_col_id, - rmm::cuda_stream_view stream) -{ - CUDF_FUNC_RANGE(); - // 1. column count for allocation - auto const num_columns = - thrust::unique_count(rmm::exec_policy(stream), sorted_col_ids.begin(), sorted_col_ids.end()); - - // 2. reduce_by_key {col_id}, {row_offset}, max. - rmm::device_uvector unique_col_ids(num_columns, stream); - rmm::device_uvector max_row_offsets(num_columns, stream); - auto ordered_row_offsets = - thrust::make_permutation_iterator(row_offsets.begin(), ordered_node_ids.begin()); - thrust::reduce_by_key(rmm::exec_policy(stream), - sorted_col_ids.begin(), - sorted_col_ids.end(), - ordered_row_offsets, - unique_col_ids.begin(), - max_row_offsets.begin(), - thrust::equal_to(), - thrust::maximum()); - - // 3. reduce_by_key {col_id}, {node_categories} - custom opp (*+v=*, v+v=v, *+#=E) - rmm::device_uvector column_categories(num_columns, stream); - thrust::reduce_by_key( - rmm::exec_policy(stream), - sorted_col_ids.begin(), - sorted_col_ids.end(), - thrust::make_permutation_iterator(tree.node_categories.begin(), ordered_node_ids.begin()), - unique_col_ids.begin(), - column_categories.begin(), - thrust::equal_to(), - [] __device__(NodeT type_a, NodeT type_b) -> NodeT { - auto is_a_leaf = (type_a == NC_VAL || type_a == NC_STR); - auto is_b_leaf = (type_b == NC_VAL || type_b == NC_STR); - // (v+v=v, *+*=*, *+v=*, *+#=E, NESTED+VAL=NESTED) - // *+*=*, v+v=v - if (type_a == type_b) { - return type_a; - } else if (is_a_leaf) { - // *+v=*, N+V=N - // STRUCT/LIST + STR/VAL = STRUCT/LIST, STR/VAL + FN = ERR, STR/VAL + STR = STR - return type_b == NC_FN ? NC_ERR : (is_b_leaf ? NC_STR : type_b); - } else if (is_b_leaf) { - return type_a == NC_FN ? NC_ERR : (is_a_leaf ? NC_STR : type_a); - } - // *+#=E - return NC_ERR; - }); - - // 4. 
unique_copy parent_node_ids, ranges - rmm::device_uvector column_levels(0, stream); // not required - rmm::device_uvector parent_col_ids(num_columns, stream); - rmm::device_uvector col_range_begin(num_columns, stream); // Field names - rmm::device_uvector col_range_end(num_columns, stream); - rmm::device_uvector unique_node_ids(num_columns, stream); - thrust::unique_by_key_copy(rmm::exec_policy(stream), - sorted_col_ids.begin(), - sorted_col_ids.end(), - ordered_node_ids.begin(), - thrust::make_discard_iterator(), - unique_node_ids.begin()); - thrust::copy_n( - rmm::exec_policy(stream), - thrust::make_zip_iterator( - thrust::make_permutation_iterator(tree.parent_node_ids.begin(), unique_node_ids.begin()), - thrust::make_permutation_iterator(tree.node_range_begin.begin(), unique_node_ids.begin()), - thrust::make_permutation_iterator(tree.node_range_end.begin(), unique_node_ids.begin())), - unique_node_ids.size(), - thrust::make_zip_iterator( - parent_col_ids.begin(), col_range_begin.begin(), col_range_end.begin())); - - // convert parent_node_ids to parent_col_ids - thrust::transform( - rmm::exec_policy(stream), - parent_col_ids.begin(), - parent_col_ids.end(), - parent_col_ids.begin(), - [col_ids = original_col_ids.begin()] __device__(auto parent_node_id) -> size_type { - return parent_node_id == parent_node_sentinel ? parent_node_sentinel - : col_ids[parent_node_id]; - }); - - // condition is true if parent is not a list, or sentinel/root - // Special case to return true if parent is a list and is_array_of_arrays is true - auto is_non_list_parent = [column_categories = column_categories.begin(), - is_array_of_arrays, - row_array_parent_col_id] __device__(auto parent_col_id) -> bool { - return !(parent_col_id == parent_node_sentinel || - column_categories[parent_col_id] == NC_LIST && - (!is_array_of_arrays || parent_col_id != row_array_parent_col_id)); - }; - // Mixed types in List children go to different columns, - // so all immediate children of list column should have same max_row_offsets. - // create list's children max_row_offsets array. (initialize to zero) - // atomicMax on children max_row_offsets array. - // gather the max_row_offsets from children row offset array. 
- { - rmm::device_uvector list_parents_children_max_row_offsets(num_columns, stream); - thrust::fill(rmm::exec_policy(stream), - list_parents_children_max_row_offsets.begin(), - list_parents_children_max_row_offsets.end(), - 0); - thrust::for_each(rmm::exec_policy(stream), - unique_col_ids.begin(), - unique_col_ids.end(), - [column_categories = column_categories.begin(), - parent_col_ids = parent_col_ids.begin(), - max_row_offsets = max_row_offsets.begin(), - list_parents_children_max_row_offsets = - list_parents_children_max_row_offsets.begin()] __device__(auto col_id) { - auto parent_col_id = parent_col_ids[col_id]; - if (parent_col_id != parent_node_sentinel and - column_categories[parent_col_id] == node_t::NC_LIST) { - cuda::atomic_ref ref{ - *(list_parents_children_max_row_offsets + parent_col_id)}; - ref.fetch_max(max_row_offsets[col_id], cuda::std::memory_order_relaxed); - } - }); - thrust::gather_if( - rmm::exec_policy(stream), - parent_col_ids.begin(), - parent_col_ids.end(), - parent_col_ids.begin(), - list_parents_children_max_row_offsets.begin(), - max_row_offsets.begin(), - [column_categories = column_categories.begin()] __device__(size_type parent_col_id) { - return parent_col_id != parent_node_sentinel and - column_categories[parent_col_id] == node_t::NC_LIST; - }); - } - - // copy lists' max_row_offsets to children. - // all structs should have same size. - thrust::transform_if( - rmm::exec_policy(stream), - unique_col_ids.begin(), - unique_col_ids.end(), - max_row_offsets.begin(), - [column_categories = column_categories.begin(), - is_non_list_parent, - parent_col_ids = parent_col_ids.begin(), - max_row_offsets = max_row_offsets.begin()] __device__(size_type col_id) { - auto parent_col_id = parent_col_ids[col_id]; - // condition is true if parent is not a list, or sentinel/root - while (is_non_list_parent(parent_col_id)) { - col_id = parent_col_id; - parent_col_id = parent_col_ids[parent_col_id]; - } - return max_row_offsets[col_id]; - }, - [column_categories = column_categories.begin(), - is_non_list_parent, - parent_col_ids = parent_col_ids.begin()] __device__(size_type col_id) { - auto parent_col_id = parent_col_ids[col_id]; - // condition is true if parent is not a list, or sentinel/root - return is_non_list_parent(parent_col_id); - }); - - // For Struct and List (to avoid copying entire strings when mixed type as string is enabled) - thrust::transform_if( - rmm::exec_policy(stream), - col_range_begin.begin(), - col_range_begin.end(), - column_categories.begin(), - col_range_end.begin(), - [] __device__(auto i) { return i + 1; }, - [] __device__(NodeT type) { return type == NC_STRUCT || type == NC_LIST; }); - - return std::tuple{tree_meta_t{std::move(column_categories), - std::move(parent_col_ids), - std::move(column_levels), - std::move(col_range_begin), - std::move(col_range_end)}, - std::move(unique_col_ids), - std::move(max_row_offsets)}; -} - /** * @brief Get the column indices for the values column for array of arrays rows * diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index f4ecb3b97b4..8e7275e6083 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -344,23 +344,22 @@ get_array_children_indices(TreeDepthT row_array_children_level, device_span node_levels, device_span parent_node_ids, rmm::cuda_stream_view stream); + /** * @brief Reduce node tree into column tree by aggregating each property of column. 
* - * @param tree json node tree to reduce (modified in-place, but restored to original state) - * @param col_ids column ids of each node (modified in-place, but restored to original state) - * @param row_offsets row offsets of each node (modified in-place, but restored to original state) - * @param stream The CUDA stream to which kernels are dispatched - * @return A tuple containing the column tree, identifier for each column and the maximum row index - * in each column + * @param tree Node tree representation of JSON string + * @param original_col_ids Column ids of nodes + * @param sorted_col_ids Sorted column ids of nodes + * @param ordered_node_ids Node ids of nodes sorted by column ids + * @param row_offsets Row offsets of nodes + * @param is_array_of_arrays Whether the tree is an array of arrays + * @param row_array_parent_col_id Column id of row array, if is_array_of_arrays is true + * @param stream CUDA stream used for device memory operations and kernel launches + * @return A tuple of column tree representation of JSON string, column ids of columns, and + * max row offsets of columns */ std::tuple, rmm::device_uvector> -reduce_to_column_tree(tree_meta_t& tree, - device_span col_ids, - device_span row_offsets, - rmm::cuda_stream_view stream); - -std::tuple, rmm::device_uvector> reduce_to_column_tree(tree_meta_t& tree, device_span original_col_ids, device_span sorted_col_ids, diff --git a/cpp/tests/io/json/json_tree_csr.cu b/cpp/tests/io/json/json_tree_csr.cu index bb3f835177f..aefeaf9a39a 100644 --- a/cpp/tests/io/json/json_tree_csr.cu +++ b/cpp/tests/io/json/json_tree_csr.cu @@ -64,11 +64,14 @@ void print(std::string str, std::vector& vec) } bool check_equality(cuio_json::tree_meta_t& d_a, + rmm::device_uvector& d_a_max_row_offsets, cuio_json::experimental::csr& d_b_csr, cuio_json::experimental::column_tree_properties& d_b_ctp, rmm::cuda_stream_view stream) { // convert from tree_meta_t to column_tree_csr + stream.synchronize(); + h_tree_meta_t a{cudf::detail::make_std_vector_async(d_a.node_categories, stream), cudf::detail::make_std_vector_async(d_a.parent_node_ids, stream), cudf::detail::make_std_vector_async(d_a.node_range_begin, stream), @@ -79,6 +82,9 @@ bool check_equality(cuio_json::tree_meta_t& d_a, cudf::detail::make_std_vector_async(d_b_ctp.categories, stream), cudf::detail::make_std_vector_async(d_b_ctp.mapped_ids, stream)}; + auto a_max_row_offsets = cudf::detail::make_std_vector_async(d_a_max_row_offsets, stream); + auto b_max_row_offsets = cudf::detail::make_std_vector_async(d_b_ctp.max_row_offsets, stream); + stream.synchronize(); auto num_nodes = a.parent_node_ids.size(); @@ -99,6 +105,9 @@ bool check_equality(cuio_json::tree_meta_t& d_a, for (size_t u = 0; u < num_nodes; u++) { if (a.node_categories[b.column_ids[u]] != b.categories[u]) { return false; } } + for (size_t u = 0; u < num_nodes; u++) { + if (a_max_row_offsets[b.column_ids[u]] != b_max_row_offsets[u]) { return false; } + } return true; } @@ -173,7 +182,7 @@ TEST_F(JsonColumnTreeTests, SimpleLines) cudf::io::json::experimental::detail::reduce_to_column_tree( gpu_tree, gpu_col_id, gpu_row_offsets, false, row_array_parent_col_id, stream); - auto iseq = check_equality(d_column_tree, d_column_tree_csr, d_column_tree_properties, stream); + auto iseq = check_equality(d_column_tree, d_max_row_offsets, d_column_tree_csr, d_column_tree_properties, stream); // assert equality between csr and meta formats assert(iseq == true); } From 4339b0a1a8a9a5ceee0dc76ceeb11066a8524668 Mon Sep 17 00:00:00 2001 From: Shruti 
Shivakumar Date: Wed, 31 Jul 2024 05:37:15 +0000 Subject: [PATCH 23/28] formatting --- cpp/src/io/json/column_tree_construction.cu | 13 ++++++------- cpp/tests/io/json/json_tree_csr.cu | 3 ++- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cpp/src/io/json/column_tree_construction.cu b/cpp/src/io/json/column_tree_construction.cu index 5ef21096ac9..89a9365a60b 100644 --- a/cpp/src/io/json/column_tree_construction.cu +++ b/cpp/src/io/json/column_tree_construction.cu @@ -318,8 +318,7 @@ std::tuple reduce_to_column_tree( auto num_levels = *dev_num_levels_ptr; list_ancestors[node] = node; for (int level = 0; level <= num_levels; level++) { - if (list_ancestors[node] > 0) - list_ancestors[node] = colidx[rowidx[list_ancestors[node]]]; + if (list_ancestors[node] > 0) list_ancestors[node] = colidx[rowidx[list_ancestors[node]]]; if (list_ancestors[node] == 0 || column_categories[list_ancestors[node]] == NC_LIST) break; } @@ -455,9 +454,9 @@ reduce_to_column_tree(tree_meta_t& tree, return !(parent_col_id == parent_node_sentinel || column_categories[parent_col_id] == NC_LIST && (!is_array_of_arrays || parent_col_id != row_array_parent_col_id)); - return (parent_col_id != parent_node_sentinel) && - (column_categories[parent_col_id] != NC_LIST) || - (is_array_of_arrays == true && parent_col_id == row_array_parent_col_id); + return (parent_col_id != parent_node_sentinel) && + (column_categories[parent_col_id] != NC_LIST) || + (is_array_of_arrays == true && parent_col_id == row_array_parent_col_id); }; // Mixed types in List children go to different columns, @@ -546,5 +545,5 @@ reduce_to_column_tree(tree_meta_t& tree, std::move(max_row_offsets)}; } -} // namespace detail -} // namespace cudf::io::json +} // namespace detail +} // namespace cudf::io::json diff --git a/cpp/tests/io/json/json_tree_csr.cu b/cpp/tests/io/json/json_tree_csr.cu index aefeaf9a39a..bc135e041d0 100644 --- a/cpp/tests/io/json/json_tree_csr.cu +++ b/cpp/tests/io/json/json_tree_csr.cu @@ -182,7 +182,8 @@ TEST_F(JsonColumnTreeTests, SimpleLines) cudf::io::json::experimental::detail::reduce_to_column_tree( gpu_tree, gpu_col_id, gpu_row_offsets, false, row_array_parent_col_id, stream); - auto iseq = check_equality(d_column_tree, d_max_row_offsets, d_column_tree_csr, d_column_tree_properties, stream); + auto iseq = check_equality( + d_column_tree, d_max_row_offsets, d_column_tree_csr, d_column_tree_properties, stream); // assert equality between csr and meta formats assert(iseq == true); } From 9b6b7ff0ac10458de9395c84043f19005e00e8e9 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Wed, 31 Jul 2024 17:43:12 +0000 Subject: [PATCH 24/28] struct docs --- cpp/src/io/json/nested_json.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 8e7275e6083..5976160016d 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -195,6 +195,10 @@ struct csr { rmm::device_uvector colidx; }; +/* + * @brief Auxiliary column tree properties that are required to construct the device json + * column subtree, but not required for the final cudf column construction. + */ struct column_tree_properties { rmm::device_uvector categories; rmm::device_uvector max_row_offsets; From 85608eb0b6a2aa3288aa2d460559d243047b68b8 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Wed, 31 Jul 2024 18:03:47 +0000 Subject: [PATCH 25/28] cudf exports! 
--- cpp/src/io/json/nested_json.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 45118002f04..47d3a85e62f 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -244,7 +244,7 @@ namespace detail { * @return A tuple containing the column tree, identifier for each column and the maximum row index * in each column */ - +CUDF_EXPORT std::tuple reduce_to_column_tree( tree_meta_t& tree, device_span original_col_ids, @@ -370,6 +370,7 @@ get_array_children_indices(TreeDepthT row_array_children_level, * @return A tuple of column tree representation of JSON string, column ids of columns, and * max row offsets of columns */ +CUDF_EXPORT std::tuple, rmm::device_uvector> reduce_to_column_tree(tree_meta_t& tree, device_span original_col_ids, From 3900ee3776c3f1b02c13fee80a470c2dfc217ffe Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Fri, 2 Aug 2024 00:47:52 +0000 Subject: [PATCH 26/28] refactoring after the csr updates --- cpp/CMakeLists.txt | 1 + cpp/src/io/json/column_tree_construction.cu | 4 +- .../io/json/device_column_tree_extraction.cu | 1037 +++++++++++++++++ cpp/src/io/json/json_column.cu | 643 ---------- cpp/src/io/json/nested_json.hpp | 63 +- 5 files changed, 1092 insertions(+), 656 deletions(-) create mode 100644 cpp/src/io/json/device_column_tree_extraction.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 297b39cc25f..61d584f26be 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -395,6 +395,7 @@ add_library( src/io/json/byte_range_info.cu src/io/json/json_column.cu src/io/json/column_tree_construction.cu + src/io/json/device_column_tree_extraction.cu src/io/json/json_normalization.cu src/io/json/json_tree.cu src/io/json/nested_json_gpu.cu diff --git a/cpp/src/io/json/column_tree_construction.cu b/cpp/src/io/json/column_tree_construction.cu index 89a9365a60b..d6ebadfd9d9 100644 --- a/cpp/src/io/json/column_tree_construction.cu +++ b/cpp/src/io/json/column_tree_construction.cu @@ -145,6 +145,8 @@ std::tuple reduce_to_column_tree( level_ordered_unique_node_ids.begin()); auto* dev_num_levels_ptr = thrust::max_element( rmm::exec_policy_nosync(stream), tree.node_levels.begin(), tree.node_levels.end()); + rmm::device_scalar num_levels(stream); + CUDF_CUDA_TRY(cudaMemcpyAsync(num_levels.data(), dev_num_levels_ptr, sizeof(NodeIndexT), cudaMemcpyDeviceToDevice, stream)); rmm::device_uvector mapped_col_ids_copy(num_columns, stream); thrust::copy(rmm::exec_policy_nosync(stream), @@ -335,7 +337,7 @@ std::tuple reduce_to_column_tree( return std::tuple{ csr{std::move(rowidx), std::move(colidx)}, - column_tree_properties{ + column_tree_properties{std::move(num_levels), std::move(column_categories), std::move(max_row_offsets), std::move(mapped_col_ids)}}; } diff --git a/cpp/src/io/json/device_column_tree_extraction.cu b/cpp/src/io/json/device_column_tree_extraction.cu new file mode 100644 index 00000000000..c1ddeb81990 --- /dev/null +++ b/cpp/src/io/json/device_column_tree_extraction.cu @@ -0,0 +1,1037 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "io/utilities/parsing_utils.cuh" +#include "io/utilities/string_parsing.hpp" +#include "nested_json.hpp" +#include "json_utils.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cudf::io::json { + +namespace experimental::detail { + +using row_offset_t = size_type; + +rmm::device_uvector extract_device_column_subtree( + const csr &adjacency, + const column_tree_properties &props, + cudf::io::json_reader_options reader_options, + rmm::cuda_stream_view stream) +{ + // What are the cases in which estimation works? + CUDF_EXPECTS(reader_options.is_enabled_mixed_types_as_string() == false, + "mixed type as string has not yet been implemented"); + CUDF_EXPECTS(reader_options.is_enabled_prune_columns() == false, + "column pruning has not yet been implemented"); + + auto &rowidx = adjacency.rowidx; + auto &colidx = adjacency.colidx; + auto &categories = props.categories; + auto &max_row_offsets = props.max_row_offsets; + auto &num_levels = props.num_levels; + + // Traversing the column tree and annotating the device column subtree + auto num_columns = rowidx.size() - 1; + rmm::device_uvector subtree_nrows(max_row_offsets, stream); + + // 1. removing NC_ERR nodes and their descendants i.e. + // removing the entire subtree rooted at the nodes with category NC_ERR + { + rmm::device_uvector err_ancestors(num_columns, stream); + thrust::for_each_n( + rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + num_columns, + [rowidx = rowidx.begin(), + colidx = colidx.begin(), + num_levels_ptr = num_levels.data(), + categories = categories.begin(), + err_ancestors = err_ancestors.begin()] __device__(NodeIndexT node) { + auto num_levels = *num_levels_ptr; + err_ancestors[node] = node; + for (int level = 0; level <= num_levels; level++) { + if (err_ancestors[node] == -1 || categories[err_ancestors[node]] == NC_ERR) + break; + if (err_ancestors[node] > 0) err_ancestors[node] = colidx[rowidx[err_ancestors[node]]]; + else err_ancestors[node] = -1; + } + }); + thrust::gather_if(rmm::exec_policy_nosync(stream), + err_ancestors.begin(), + err_ancestors.end(), + err_ancestors.begin(), + thrust::make_constant_iterator(0), + subtree_nrows.begin(), + [] __device__(auto ancestor) { return ancestor != -1; }); + } + + // 2. Let's do some validation of the column tree based on its properties. + // We will be using these properties to filter nodes later on. + // =========================================================================== + // (i) Every node v is of type string, val, field name, list or struct. + // (ii) String and val cannot have any children i.e. they can only be leaf nodes + // (iii) If v is a field name, it can have struct, list, string and val as children. 
+ // (iv) If v is a struct, it can have a field name as child + // (v) If v is a list, it can have string, val, list or struct as child + // (vi) There can only be at most one string and one val child for a given node, but many struct, + // list and field name children. + // (vii) When mixed type support is disabled - + // (a) A mix of lists and structs in the same column is not supported i.e a field name and + // list node cannot have both list and struct as children + // (b) If there is a mix of str/val + // and list/struct in the same column, then str/val is discarded + + // Validation of (vii)(a) + { + if(!reader_options.is_enabled_mixed_types_as_string()) { + auto num_field_and_list_nodes = thrust::count_if( + rmm::exec_policy_nosync(stream), categories.begin(), categories.end(), [] __device__(auto const ctg) { + return ctg == NC_FN || ctg == NC_LIST; + }); + rmm::device_uvector field_and_list_nodes(num_field_and_list_nodes, stream); + thrust::partition_copy(rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + num_columns, + field_and_list_nodes.begin(), + thrust::make_discard_iterator(), + [categories = categories.begin()] __device__(NodeIndexT node) { + return categories[node] == NC_LIST || categories[node] == NC_FN; + }); + bool is_valid_tree = thrust::all_of( + rmm::exec_policy_nosync(stream), + field_and_list_nodes.begin(), + field_and_list_nodes.end(), + [rowidx = rowidx.begin(), colidx = colidx.begin(), categories = categories.begin()] __device__( + NodeIndexT node) { + NodeIndexT first_child_pos = rowidx[node] + 1; + NodeIndexT last_child_pos = rowidx[node + 1] - 1; + bool has_struct_child = false; + bool has_list_child = false; + for (NodeIndexT child_pos = first_child_pos; child_pos <= last_child_pos; child_pos++) { + if (categories[colidx[child_pos]] == NC_STRUCT) has_struct_child = true; + if (categories[colidx[child_pos]] == NC_LIST) has_list_child = true; + } + return !has_struct_child && !has_list_child; + }); + + CUDF_EXPECTS(is_valid_tree, + "Property 7a is not satisfied i.e. mix of LIST and STRUCT in same column is not " + "supported when mixed type support is disabled"); + } + } + + // Validation of (vii)(b) i.e. ignore_vals in previous implementation + // We need to identify leaf nodes that have non-leaf sibling nodes + // i.e. we need to ignore leaf nodes at level above the last level + // idea: leaf nodes have adjacency 1. So if there is an adjacency 1 inbetween non-one + // adjacencies, then found the leaf node. Corner case: consider the last set of consecutive + // ones. If the leftmost of those ones (say node u) has a non-leaf sibling + // (can be found by looking at the adjacencies of the siblings + // (which are in turn found from the colidx of the parent u), then this leaf node should be + // ignored, otherwise all good. + { + if(!reader_options.is_enabled_mixed_types_as_string()) { + // TODO: use cub segmented reduce here! 
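The `num_adjacent_nodes` pass below reduces to taking differences of consecutive `rowidx` entries. A minimal sketch of that degree computation, for illustration only (not part of the patch; `degree` is a hypothetical buffer, and `thrust::minus` assumes `<thrust/functional.h>` is available):

  rmm::device_uvector<NodeIndexT> degree(num_columns, stream);
  thrust::transform(rmm::exec_policy_nosync(stream),
                    rowidx.begin() + 1,  // rowidx[i + 1]
                    rowidx.end(),
                    rowidx.begin(),      // rowidx[i]
                    degree.begin(),
                    thrust::minus<NodeIndexT>{});
  // degree[i] == 1 means node i stores only its parent entry, i.e. it has no children
  // in the column tree and is a leaf candidate.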
+ rmm::device_uvector num_adjacent_nodes( + num_columns + 1, + stream); // since adjacent_difference requires that the output have the same length as input + thrust::adjacent_difference( + rmm::exec_policy_nosync(stream), rowidx.begin(), rowidx.end(), num_adjacent_nodes.begin()); + auto num_leaf_nodes = thrust::count_if(rmm::exec_policy_nosync(stream), + num_adjacent_nodes.begin() + 1, + num_adjacent_nodes.end(), + [] __device__(auto const adj) { return adj == 1; }); + rmm::device_uvector leaf_nodes(num_leaf_nodes, stream); + thrust::copy_if( + rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + num_columns, + leaf_nodes.begin(), + [num_adjacent_nodes = num_adjacent_nodes.begin()] __device__(size_t node) { return num_adjacent_nodes[node] == 1; }); + + auto rev_node_it = thrust::make_reverse_iterator(thrust::make_counting_iterator(0) + num_columns); + auto rev_leaf_nodes_it = thrust::make_reverse_iterator(leaf_nodes.begin()); + // the node number that could be the leftmost leaf node is given by u = *(is_leftmost_leaf.second + // - 1) + auto is_leftmost_leaf = thrust::mismatch( + rmm::exec_policy_nosync(stream), rev_node_it, rev_node_it + num_columns, rev_leaf_nodes_it); + NodeIndexT leftmost_leaf_node = leaf_nodes.element( + num_leaf_nodes - thrust::distance(rev_leaf_nodes_it, is_leftmost_leaf.second - 1) - 1, stream); + + // upper_bound search for u in rowidx for parent node v. Now check if any of the other child nodes + // of v is non-leaf i.e check if u is the first child of v. If yes, then leafmost_leaf_node is + // the leftmost leaf node. Otherwise, discard all children of v after and including u + auto parent_it = + thrust::upper_bound(rmm::exec_policy_nosync(stream), rowidx.begin(), rowidx.end(), leftmost_leaf_node); + NodeIndexT parent = thrust::distance(rowidx.begin(), parent_it - 1); + NodeIndexT parent_adj_start = rowidx.element(parent, stream); + NodeIndexT parent_adj_end = rowidx.element(parent + 1, stream); + auto childnum_it = thrust::lower_bound(rmm::exec_policy_nosync(stream), + colidx.begin() + parent_adj_start, + colidx.begin() + parent_adj_end, + leftmost_leaf_node); + + auto retained_leaf_nodes_it = leaf_nodes.begin() + num_leaf_nodes - + thrust::distance(rev_leaf_nodes_it, is_leftmost_leaf.second - 1) - 1; + if (childnum_it != colidx.begin() + parent_adj_start + 1) { + // discarding from u to last child of parent + retained_leaf_nodes_it += thrust::distance(childnum_it, colidx.begin() + parent_adj_end); + } + // now, all nodes from leaf_nodes.begin() to retained_leaf_nodes_it need to be discarded i.e. they + // are part of ignore_vals + thrust::scatter(rmm::exec_policy_nosync(stream), + thrust::make_constant_iterator(0), + thrust::make_constant_iterator(0) + thrust::distance(leaf_nodes.begin(), retained_leaf_nodes_it), + leaf_nodes.begin(), + subtree_nrows.begin()); + } + } + + // (Optional?) 
TODO: Validation of the remaining column tree properties + + return std::move(subtree_nrows); +} + +device_column_subtree_properties allocate_device_column_subtree_properties( + device_span subtree_nrows, + const column_tree_properties &props, + rmm::cuda_stream_view stream) +{ + auto num_columns = subtree_nrows.size(); + auto &categories = props.categories; + auto &max_row_offsets = props.max_row_offsets; + + auto num_subtree_nodes = thrust::count_if(rmm::exec_policy_nosync(stream), subtree_nrows.begin(), subtree_nrows.end(), + [] __device__(auto mro) { + return mro != 0; + }); + // For the subtree, we allocate memory for device column subtree properties + rmm::device_uvector subtree_properties_map(num_subtree_nodes, stream); + thrust::copy_if(rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + num_columns, + subtree_nrows.begin(), + subtree_properties_map.begin(), + [] __device__(auto mro) { + return mro != 0; + }); + // TODO: three way partitioning in cub::If + auto str_partitioning_idx_it = + thrust::partition(rmm::exec_policy(stream), + subtree_properties_map.begin(), + subtree_properties_map.end(), + [categories = categories.begin()] __device__(NodeIndexT node) { + return categories[node] == NC_STR || categories[node] == NC_VAL; + }); + auto str_val_end = thrust::distance(subtree_properties_map.begin(), str_partitioning_idx_it); + auto max_row_offsets_it = + thrust::make_permutation_iterator(max_row_offsets.begin(), subtree_properties_map.begin()); + size_type string_offsets_size = + thrust::reduce( + rmm::exec_policy(stream), max_row_offsets_it, max_row_offsets_it + str_val_end) + + str_val_end; + rmm::device_uvector string_offsets(string_offsets_size, stream); + rmm::device_uvector string_lengths(string_offsets_size, stream); + + auto list_partitioning_idx_it = + thrust::partition(rmm::exec_policy(stream), + str_partitioning_idx_it, + subtree_properties_map.end(), + [categories = categories.begin()] __device__(NodeIndexT node) { + return categories[node] == NC_LIST; + }); + auto list_end = thrust::distance(subtree_properties_map.begin(), list_partitioning_idx_it); + max_row_offsets_it = + thrust::make_permutation_iterator(max_row_offsets.begin(), subtree_properties_map.begin()) + + str_val_end; + size_type child_offsets_size = + thrust::reduce(rmm::exec_policy(stream), + max_row_offsets_it, + max_row_offsets_it + (list_end - str_val_end)) + + 2 * (list_end - str_val_end); + rmm::device_uvector child_offsets(child_offsets_size, stream); + + auto validity_buffer_size = + thrust::reduce(rmm::exec_policy(stream), subtree_nrows.begin(), subtree_nrows.end()); + auto validity = cudf::detail::create_null_mask(validity_buffer_size, + cudf::mask_state::ALL_NULL, + stream, + rmm::mr::get_current_device_resource()); + + return device_column_subtree_properties{ + std::move(string_offsets), + std::move(string_lengths), + std::move(child_offsets), + std::move(validity)}; +} + +void initialize_device_column_subtree_properties( + device_column_subtree_properties &d_props, + tree_meta_t &tree, + device_span original_col_ids, + device_span row_offsets, + const column_tree_properties &c_props, + rmm::cuda_stream_view stream) { + + auto num_nodes = tree.node_levels.size(); + auto num_columns = c_props.categories.size(); + // now we actually do the annotation + // relabel original_col_ids with the positions of the csr_unique_col_ids with same element. How do + // we accomplish this? one idea is to sort the row offsets by node level. 
Just the way we did this + // for the csr_column_ids sort original_col_ids, extract subtree based on the annotation above, + // and then initialize. + auto [sorted_node_levels, sorted_node_levels_order] = + cudf::io::json::detail::stable_sorted_key_order(tree.node_levels, stream); + auto row_offsets_it = + thrust::make_permutation_iterator(row_offsets.begin(), sorted_node_levels_order.begin()); + auto node_range_begin_it = thrust::make_permutation_iterator(tree.node_range_begin.begin(), + sorted_node_levels_order.begin()); + auto node_range_end_it = thrust::make_permutation_iterator(tree.node_range_end.begin(), + sorted_node_levels_order.begin()); + auto node_range_lengths_it = thrust::make_transform_iterator( + thrust::make_zip_iterator(node_range_begin_it, node_range_end_it), + cuda::proclaim_return_type([] __device__(auto range_it) { + return thrust::get<1>(range_it) - thrust::get<0>(range_it); + })); + + auto node_col_ids_it = + thrust::make_permutation_iterator(original_col_ids.begin(), sorted_node_levels_order.begin()); + auto node_categories_it = thrust::make_permutation_iterator(tree.node_categories.begin(), + sorted_node_levels_order.begin()); + + rmm::device_uvector sorted_subtree_nrows(num_columns, stream); + thrust::sort_by_key(rmm::exec_policy_nosync(stream), + c_props.mapped_ids.begin(), + c_props.mapped_ids.end(), + sorted_subtree_nrows.begin()); + + thrust::copy_if( + rmm::exec_policy(stream), + thrust::make_zip_iterator(node_range_begin_it, node_range_lengths_it), + thrust::make_zip_iterator(node_range_begin_it + num_nodes, node_range_lengths_it + num_nodes), + thrust::make_counting_iterator(0), + thrust::make_zip_iterator(d_props.string_offsets.begin(), d_props.string_lengths.begin()), + [sorted_subtree_nrows = sorted_subtree_nrows.begin(), + node_col_ids_it, + node_categories_it] __device__(NodeIndexT node) { + return sorted_subtree_nrows[node_col_ids_it[node]] && + (node_categories_it[node] == NC_STR || node_categories_it[node] == NC_VAL); + }); + + // row_offsets need to be prefix summed across columns for validity initialization + thrust::replace_if( + rmm::exec_policy(stream), + row_offsets_it, + row_offsets_it + num_nodes, + thrust::make_counting_iterator(0), + [sorted_subtree_nrows = sorted_subtree_nrows.begin(), node_col_ids_it] __device__( + NodeIndexT node) { return sorted_subtree_nrows[node_col_ids_it[node]] == 0; }, + 0); + thrust::inclusive_scan( + rmm::exec_policy(stream), row_offsets_it, row_offsets_it + num_nodes, row_offsets_it); + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + num_nodes, + [sorted_subtree_nrows = sorted_subtree_nrows.begin(), + node_col_ids_it, + node_categories_it, + row_offsets_it, + validity = static_cast( + d_props.validity.data())] __device__(NodeIndexT node) { + if (sorted_subtree_nrows[node_col_ids_it[node]] && node_categories_it[node] != NC_LIST) + cudf::set_bit(validity, row_offsets_it[node]); + }); + + // scatter list offsets + +} + +} // namespace experimental::detail + +namespace detail { +/** + * @brief Checks if all strings in each string column in the tree are nulls. + * For non-string columns, it's set as true. If any of rows in a string column is false, it's set as + * false. 
+ * + * @param input Input JSON string device data + * @param d_column_tree column tree representation of JSON string + * @param tree Node tree representation of the JSON string + * @param col_ids Column ids of the nodes in the tree + * @param options Parsing options specifying the parsing behaviour + * @param stream CUDA stream used for device memory operations and kernel launches + * @return Array of bytes where each byte indicate if it is all nulls string column. + */ +rmm::device_uvector is_all_nulls_each_column(device_span input, + tree_meta_t const& d_column_tree, + tree_meta_t const& tree, + device_span col_ids, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream) +{ + auto const num_nodes = col_ids.size(); + auto const num_cols = d_column_tree.node_categories.size(); + rmm::device_uvector is_all_nulls(num_cols, stream); + thrust::fill(rmm::exec_policy(stream), is_all_nulls.begin(), is_all_nulls.end(), true); + + auto parse_opt = parsing_options(options, stream); + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::counting_iterator(0), + num_nodes, + [options = parse_opt.view(), + data = input.data(), + column_categories = d_column_tree.node_categories.begin(), + col_ids = col_ids.begin(), + range_begin = tree.node_range_begin.begin(), + range_end = tree.node_range_end.begin(), + is_all_nulls = is_all_nulls.begin()] __device__(size_type i) { + auto const node_category = column_categories[col_ids[i]]; + if (node_category == NC_STR or node_category == NC_VAL) { + auto const is_null_literal = serialized_trie_contains( + options.trie_na, + {data + range_begin[i], static_cast(range_end[i] - range_begin[i])}); + if (!is_null_literal) is_all_nulls[col_ids[i]] = false; + } + }); + return is_all_nulls; +} + +/** + * @brief Get the column indices for the values column for array of arrays rows + * + * @param row_array_children_level The level of the row array's children + * @param d_tree The tree metadata + * @param col_ids The column ids + * @param num_columns The number of columns + * @param stream The stream to use + * @return The value columns' indices + */ +rmm::device_uvector get_values_column_indices(TreeDepthT const row_array_children_level, + tree_meta_t const& d_tree, + device_span col_ids, + size_type const num_columns, + rmm::cuda_stream_view stream) +{ + CUDF_FUNC_RANGE(); + auto [level2_nodes, level2_indices] = get_array_children_indices( + row_array_children_level, d_tree.node_levels, d_tree.parent_node_ids, stream); + auto col_id_location = thrust::make_permutation_iterator(col_ids.begin(), level2_nodes.begin()); + rmm::device_uvector values_column_indices(num_columns, stream); + thrust::scatter(rmm::exec_policy(stream), + level2_indices.begin(), + level2_indices.end(), + col_id_location, + values_column_indices.begin()); + return values_column_indices; +} + +/** + * @brief Copies strings specified by pair of begin, end offsets to host vector of strings. 
+ * + * @param input String device buffer + * @param node_range_begin Begin offset of the strings + * @param node_range_end End offset of the strings + * @param stream CUDA stream + * @return Vector of strings + */ +std::vector copy_strings_to_host_sync( + device_span input, + device_span node_range_begin, + device_span node_range_end, + rmm::cuda_stream_view stream) +{ + CUDF_FUNC_RANGE(); + auto const num_strings = node_range_begin.size(); + rmm::device_uvector string_offsets(num_strings, stream); + rmm::device_uvector string_lengths(num_strings, stream); + auto d_offset_pairs = thrust::make_zip_iterator(node_range_begin.begin(), node_range_end.begin()); + thrust::transform(rmm::exec_policy(stream), + d_offset_pairs, + d_offset_pairs + num_strings, + thrust::make_zip_iterator(string_offsets.begin(), string_lengths.begin()), + [] __device__(auto const& offsets) { + // Note: first character for non-field columns + return thrust::make_tuple( + static_cast(thrust::get<0>(offsets)), + static_cast(thrust::get<1>(offsets) - thrust::get<0>(offsets))); + }); + + cudf::io::parse_options_view options_view{}; + options_view.quotechar = '\0'; // no quotes + options_view.keepquotes = true; + auto d_offset_length_it = + thrust::make_zip_iterator(string_offsets.begin(), string_lengths.begin()); + auto d_column_names = parse_data(input.data(), + d_offset_length_it, + num_strings, + data_type{type_id::STRING}, + rmm::device_buffer{}, + 0, + options_view, + stream, + rmm::mr::get_current_device_resource()); + auto to_host = [stream](auto const& col) { + if (col.is_empty()) return std::vector{}; + auto const scv = cudf::strings_column_view(col); + auto const h_chars = cudf::detail::make_std_vector_async( + cudf::device_span(scv.chars_begin(stream), scv.chars_size(stream)), stream); + auto const h_offsets = cudf::detail::make_std_vector_async( + cudf::device_span(scv.offsets().data() + scv.offset(), + scv.size() + 1), + stream); + stream.synchronize(); + + // build std::string vector from chars and offsets + std::vector host_data; + host_data.reserve(col.size()); + std::transform( + std::begin(h_offsets), + std::end(h_offsets) - 1, + std::begin(h_offsets) + 1, + std::back_inserter(host_data), + [&](auto start, auto end) { return std::string(h_chars.data() + start, end - start); }); + return host_data; + }; + return to_host(d_column_names->view()); +} + +/** + * @brief Constructs `d_json_column` from node tree representation + * Newly constructed columns are insert into `root`'s children. + * `root` must be a list type. 
+ * + * @param input Input JSON string device data + * @param tree Node tree representation of the JSON string + * @param col_ids Column ids of the nodes in the tree + * @param row_offsets Row offsets of the nodes in the tree + * @param root Root node of the `d_json_column` tree + * @param is_array_of_arrays Whether the tree is an array of arrays + * @param options Parsing options specifying the parsing behaviour + * options affecting behaviour are + * is_enabled_lines: Whether the input is a line-delimited JSON + * is_enabled_mixed_types_as_string: Whether to enable reading mixed types as string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the device memory + * of child_offets and validity members of `d_json_column` + */ +void make_device_json_column(device_span input, + tree_meta_t& tree, + device_span col_ids, + device_span row_offsets, + device_json_column& root, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + + bool const is_enabled_lines = options.is_enabled_lines(); + bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string(); + auto const num_nodes = col_ids.size(); + rmm::device_uvector sorted_col_ids(col_ids.size(), stream); // make a copy + thrust::copy(rmm::exec_policy(stream), col_ids.begin(), col_ids.end(), sorted_col_ids.begin()); + + // sort by {col_id} on {node_ids} stable + rmm::device_uvector node_ids(col_ids.size(), stream); + thrust::sequence(rmm::exec_policy(stream), node_ids.begin(), node_ids.end()); + thrust::stable_sort_by_key( + rmm::exec_policy(stream), sorted_col_ids.begin(), sorted_col_ids.end(), node_ids.begin()); + + NodeIndexT const row_array_parent_col_id = [&]() { + NodeIndexT value = parent_node_sentinel; + if (!col_ids.empty()) { + auto const list_node_index = is_enabled_lines ? 0 : 1; + CUDF_CUDA_TRY(cudaMemcpyAsync(&value, + col_ids.data() + list_node_index, + sizeof(NodeIndexT), + cudaMemcpyDefault, + stream.value())); + stream.synchronize(); + } + return value; + }(); + + // 1. gather column information. + auto [d_column_tree, d_unique_col_ids, d_max_row_offsets] = + reduce_to_column_tree(tree, + col_ids, + sorted_col_ids, + node_ids, + row_offsets, + is_array_of_arrays, + row_array_parent_col_id, + stream); + auto num_columns = d_unique_col_ids.size(); + auto unique_col_ids = cudf::detail::make_std_vector_async(d_unique_col_ids, stream); + auto column_categories = + cudf::detail::make_std_vector_async(d_column_tree.node_categories, stream); + auto column_parent_ids = + cudf::detail::make_std_vector_async(d_column_tree.parent_node_ids, stream); + auto column_range_beg = + cudf::detail::make_std_vector_async(d_column_tree.node_range_begin, stream); + auto max_row_offsets = cudf::detail::make_std_vector_async(d_max_row_offsets, stream); + std::vector column_names = copy_strings_to_host_sync( + input, d_column_tree.node_range_begin, d_column_tree.node_range_end, stream); + stream.synchronize(); + // array of arrays column names + if (is_array_of_arrays) { + TreeDepthT const row_array_children_level = is_enabled_lines ? 
1 : 2; + auto values_column_indices = + get_values_column_indices(row_array_children_level, tree, col_ids, num_columns, stream); + auto h_values_column_indices = + cudf::detail::make_std_vector_async(values_column_indices, stream); + stream.synchronize(); + std::transform(unique_col_ids.begin(), + unique_col_ids.end(), + column_names.begin(), + column_names.begin(), + [&h_values_column_indices, &column_parent_ids, row_array_parent_col_id]( + auto col_id, auto name) mutable { + return column_parent_ids[col_id] == row_array_parent_col_id + ? std::to_string(h_values_column_indices[col_id]) + : name; + }); + } + + auto to_json_col_type = [](auto category) { + switch (category) { + case NC_STRUCT: return json_col_t::StructColumn; + case NC_LIST: return json_col_t::ListColumn; + case NC_STR: [[fallthrough]]; + case NC_VAL: return json_col_t::StringColumn; + default: return json_col_t::Unknown; + } + }; + auto init_to_zero = [stream](auto& v) { + thrust::uninitialized_fill(rmm::exec_policy_nosync(stream), v.begin(), v.end(), 0); + }; + + auto initialize_json_columns = [&](auto i, auto& col) { + if (column_categories[i] == NC_ERR || column_categories[i] == NC_FN) { + return; + } else if (column_categories[i] == NC_VAL || column_categories[i] == NC_STR) { + col.string_offsets.resize(max_row_offsets[i] + 1, stream); + col.string_lengths.resize(max_row_offsets[i] + 1, stream); + init_to_zero(col.string_offsets); + init_to_zero(col.string_lengths); + } else if (column_categories[i] == NC_LIST) { + col.child_offsets.resize(max_row_offsets[i] + 2, stream); + init_to_zero(col.child_offsets); + } + col.num_rows = max_row_offsets[i] + 1; + col.validity = + cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr); + col.type = to_json_col_type(column_categories[i]); + }; + + auto reinitialize_as_string = [&](auto i, auto& col) { + col.string_offsets.resize(max_row_offsets[i] + 1, stream); + col.string_lengths.resize(max_row_offsets[i] + 1, stream); + init_to_zero(col.string_offsets); + init_to_zero(col.string_lengths); + col.num_rows = max_row_offsets[i] + 1; + col.validity = + cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr); + col.type = json_col_t::StringColumn; + // destroy references of all child columns after this step, by calling remove_child_columns + }; + + path_from_tree tree_path{column_categories, + column_parent_ids, + column_names, + is_array_of_arrays, + row_array_parent_col_id}; + + // 2. generate nested columns tree and its device_memory + // reorder unique_col_ids w.r.t. column_range_begin for order of column to be in field order. 
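// e.g. for an input row {"b": 1, "a": 2}, the field node of "b" starts at an earlier
// character offset than the field node of "a", so ordering column ids by their
// node_range_begin reproduces the left-to-right field order of the document.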
+ auto h_range_col_id_it = + thrust::make_zip_iterator(column_range_beg.begin(), unique_col_ids.begin()); + std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { + return thrust::get<0>(a) < thrust::get<0>(b); + }); + + std::vector is_str_column_all_nulls{}; + if (is_enabled_mixed_types_as_string) { + is_str_column_all_nulls = cudf::detail::make_std_vector_sync( + is_all_nulls_each_column(input, d_column_tree, tree, col_ids, options, stream), stream); + } + + // use hash map because we may skip field name's col_ids + std::unordered_map> columns; + // map{parent_col_id, child_col_name}> = child_col_id, used for null value column tracking + std::map, NodeIndexT> mapped_columns; + // find column_ids which are values, but should be ignored in validity + auto ignore_vals = cudf::detail::make_host_vector(num_columns, stream); + std::vector is_mixed_type_column(num_columns, 0); + std::vector is_pruned(num_columns, 0); + columns.try_emplace(parent_node_sentinel, std::ref(root)); + + std::function remove_child_columns = + [&](NodeIndexT this_col_id, device_json_column& col) { + for (auto col_name : col.column_order) { + auto child_id = mapped_columns[{this_col_id, col_name}]; + is_mixed_type_column[child_id] = 1; + remove_child_columns(child_id, col.child_columns.at(col_name)); + mapped_columns.erase({this_col_id, col_name}); + columns.erase(child_id); + } + col.child_columns.clear(); // their references are deleted above. + col.column_order.clear(); + }; + + auto name_and_parent_index = [&is_array_of_arrays, + &row_array_parent_col_id, + &column_parent_ids, + &column_categories, + &column_names](auto this_col_id) { + std::string name = ""; + auto parent_col_id = column_parent_ids[this_col_id]; + if (parent_col_id == parent_node_sentinel || column_categories[parent_col_id] == NC_LIST) { + if (is_array_of_arrays && parent_col_id == row_array_parent_col_id) { + name = column_names[this_col_id]; + } else { + name = list_child_name; + } + } else if (column_categories[parent_col_id] == NC_FN) { + auto field_name_col_id = parent_col_id; + parent_col_id = column_parent_ids[parent_col_id]; + name = column_names[field_name_col_id]; + } else { + CUDF_FAIL("Unexpected parent column category"); + } + return std::pair{name, parent_col_id}; + }; + + // Prune columns that are not required to be parsed. + if (options.is_enabled_prune_columns()) { + for (auto const this_col_id : unique_col_ids) { + if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) { + continue; + } + // Struct, List, String, Value + auto [name, parent_col_id] = name_and_parent_index(this_col_id); + // get path of this column, and get its dtype if present in options + auto const nt = tree_path.get_path(this_col_id); + std::optional const user_dtype = get_path_data_type(nt, options); + if (!user_dtype.has_value() and parent_col_id != parent_node_sentinel) { + is_pruned[this_col_id] = 1; + continue; + } else { + // make sure all its parents are not pruned. + while (parent_col_id != parent_node_sentinel and is_pruned[parent_col_id] == 1) { + is_pruned[parent_col_id] = 0; + parent_col_id = column_parent_ids[parent_col_id]; + } + } + } + } + + // Build the column tree, also, handles mixed types. 
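The loop below resolves mixed-type columns: when one field name maps to both a nested (list/struct) column and a string/value column, either both are forced to strings (mixed types as string enabled) or the string/value side is nulled out and the nested column wins. A usage-level sketch of the two behaviours, assuming the public `cudf::io::read_json` API and its `mixed_types_as_string` builder option (illustration only, not part of this patch):

  #include <cudf/io/json.hpp>
  #include <string>

  void read_mixed_type_example()
  {
    std::string const buffer = "{\"a\": 1}\n{\"a\": [2]}";
    auto const opts = cudf::io::json_reader_options::builder(
                        cudf::io::source_info{buffer.c_str(), buffer.size()})
                        .lines(true)
                        .mixed_types_as_string(true)  // "a" comes back as a STRING column: "1", "[2]"
                        .build();
    auto const result = cudf::io::read_json(opts);
    // With .mixed_types_as_string(false), the value row of "a" becomes null and "a" is a LIST column.
  }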
+ for (auto const this_col_id : unique_col_ids) { + if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) { + continue; + } + // Struct, List, String, Value + auto [name, parent_col_id] = name_and_parent_index(this_col_id); + + // if parent is mixed type column or this column is pruned, ignore this column. + if (parent_col_id != parent_node_sentinel && + (is_mixed_type_column[parent_col_id] || is_pruned[this_col_id])) { + ignore_vals[this_col_id] = 1; + if (is_mixed_type_column[parent_col_id]) { is_mixed_type_column[this_col_id] = 1; } + continue; + } + + // If the child is already found, + // replace if this column is a nested column and the existing was a value column + // ignore this column if this column is a value column and the existing was a nested column + auto it = columns.find(parent_col_id); + CUDF_EXPECTS(it != columns.end(), "Parent column not found"); + auto& parent_col = it->second.get(); + bool replaced = false; + if (mapped_columns.count({parent_col_id, name}) > 0) { + auto const old_col_id = mapped_columns[{parent_col_id, name}]; + // If mixed type as string is enabled, make both of them strings and merge them. + // All child columns will be ignored when parsing. + if (is_enabled_mixed_types_as_string) { + bool const is_mixed_type = [&]() { + // If new or old is STR and they are all not null, make it mixed type, else ignore. + if (column_categories[this_col_id] == NC_VAL || + column_categories[this_col_id] == NC_STR) { + if (is_str_column_all_nulls[this_col_id]) return false; + } + if (column_categories[old_col_id] == NC_VAL || column_categories[old_col_id] == NC_STR) { + if (is_str_column_all_nulls[old_col_id]) return false; + } + return true; + }(); + if (is_mixed_type) { + is_mixed_type_column[this_col_id] = 1; + is_mixed_type_column[old_col_id] = 1; + // if old col type (not cat) is list or struct, replace with string. + auto& col = columns.at(old_col_id).get(); + if (col.type == json_col_t::ListColumn or col.type == json_col_t::StructColumn) { + reinitialize_as_string(old_col_id, col); + remove_child_columns(old_col_id, col); + // all its children (which are already inserted) are ignored later. 
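// ("later" here is the is_enabled_mixed_types_as_string pass further down, which walks
// unique_col_ids again and flags the children of mixed-type columns via
// is_mixed_type_column / ignore_vals so they are skipped when values are scattered.)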
+ } + col.forced_as_string_column = true; + columns.try_emplace(this_col_id, columns.at(old_col_id)); + continue; + } + } + + if (column_categories[this_col_id] == NC_VAL || column_categories[this_col_id] == NC_STR) { + ignore_vals[this_col_id] = 1; + continue; + } + if (column_categories[old_col_id] == NC_VAL || column_categories[old_col_id] == NC_STR) { + // remap + ignore_vals[old_col_id] = 1; + mapped_columns.erase({parent_col_id, name}); + columns.erase(old_col_id); + parent_col.child_columns.erase(name); + replaced = true; // to skip duplicate name in column_order + } else { + // If this is a nested column but we're trying to insert either (a) a list node into a + // struct column or (b) a struct node into a list column, we fail + CUDF_EXPECTS(not((column_categories[old_col_id] == NC_LIST and + column_categories[this_col_id] == NC_STRUCT) or + (column_categories[old_col_id] == NC_STRUCT and + column_categories[this_col_id] == NC_LIST)), + "A mix of lists and structs within the same column is not supported"); + } + } + + if (is_enabled_mixed_types_as_string) { + // get path of this column, check if it is a struct forced as string, and enforce it + auto const nt = tree_path.get_path(this_col_id); + std::optional const user_dtype = get_path_data_type(nt, options); + if (column_categories[this_col_id] == NC_STRUCT and user_dtype.has_value() and + user_dtype.value().id() == type_id::STRING) { + is_mixed_type_column[this_col_id] = 1; + column_categories[this_col_id] = NC_STR; + } + } + + CUDF_EXPECTS(parent_col.child_columns.count(name) == 0, "duplicate column name: " + name); + // move into parent + device_json_column col(stream, mr); + initialize_json_columns(this_col_id, col); + auto inserted = parent_col.child_columns.try_emplace(name, std::move(col)).second; + CUDF_EXPECTS(inserted, "child column insertion failed, duplicate column name in the parent"); + if (not replaced) parent_col.column_order.push_back(name); + columns.try_emplace(this_col_id, std::ref(parent_col.child_columns.at(name))); + mapped_columns.try_emplace(std::make_pair(parent_col_id, name), this_col_id); + } + + if (is_enabled_mixed_types_as_string) { + // ignore all children of mixed type columns + for (auto const this_col_id : unique_col_ids) { + auto parent_col_id = column_parent_ids[this_col_id]; + if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 1) { + is_mixed_type_column[this_col_id] = 1; + ignore_vals[this_col_id] = 1; + columns.erase(this_col_id); + } + // Convert only mixed type columns as string (so to copy), but not its children + if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 0 and + is_mixed_type_column[this_col_id] == 1) + column_categories[this_col_id] = NC_STR; + } + cudaMemcpyAsync(d_column_tree.node_categories.begin(), + column_categories.data(), + column_categories.size() * sizeof(column_categories[0]), + cudaMemcpyDefault, + stream.value()); + } + + // restore unique_col_ids order + std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { + return thrust::get<1>(a) < thrust::get<1>(b); + }); + // move columns data to device. 
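// json_column_data holds only raw device pointers into each column's buffers
// (string offsets/lengths, child offsets, validity bitmask), so the host-side table is
// assembled here, after every device_json_column has been allocated, and is what the
// scatter kernels below consume.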
+ auto columns_data = cudf::detail::make_host_vector(num_columns, stream); + for (auto& [col_id, col_ref] : columns) { + if (col_id == parent_node_sentinel) continue; + auto& col = col_ref.get(); + columns_data[col_id] = json_column_data{col.string_offsets.data(), + col.string_lengths.data(), + col.child_offsets.data(), + static_cast(col.validity.data())}; + } + + auto d_ignore_vals = cudf::detail::make_device_uvector_async( + ignore_vals, stream, rmm::mr::get_current_device_resource()); + auto d_columns_data = cudf::detail::make_device_uvector_async( + columns_data, stream, rmm::mr::get_current_device_resource()); + + // 3. scatter string offsets to respective columns, set validity bits + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::counting_iterator(0), + num_nodes, + [column_categories = d_column_tree.node_categories.begin(), + col_ids = col_ids.begin(), + row_offsets = row_offsets.begin(), + range_begin = tree.node_range_begin.begin(), + range_end = tree.node_range_end.begin(), + d_ignore_vals = d_ignore_vals.begin(), + d_columns_data = d_columns_data.begin()] __device__(size_type i) { + if (d_ignore_vals[col_ids[i]]) return; + auto const node_category = column_categories[col_ids[i]]; + switch (node_category) { + case NC_STRUCT: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; + case NC_LIST: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; + case NC_STR: [[fallthrough]]; + case NC_VAL: + if (d_ignore_vals[col_ids[i]]) break; + set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); + d_columns_data[col_ids[i]].string_offsets[row_offsets[i]] = range_begin[i]; + d_columns_data[col_ids[i]].string_lengths[row_offsets[i]] = range_end[i] - range_begin[i]; + break; + default: break; + } + }); + + // 4. scatter List offset + // copy_if only node's whose parent is list, (node_id, parent_col_id) + // stable_sort by parent_col_id of {node_id}. + // For all unique parent_node_id of (i==0, i-1!=i), write start offset. + // (i==last, i+1!=i), write end offset. + // unique_copy_by_key {parent_node_id} {row_offset} to + // col[parent_col_id].child_offsets[row_offset[parent_node_id]] + + auto& parent_col_ids = sorted_col_ids; // reuse sorted_col_ids + auto parent_col_id = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type( + [col_ids = col_ids.begin(), + parent_node_ids = tree.parent_node_ids.begin()] __device__(size_type node_id) { + return parent_node_ids[node_id] == parent_node_sentinel ? 
parent_node_sentinel + : col_ids[parent_node_ids[node_id]]; + })); + auto const list_children_end = thrust::copy_if( + rmm::exec_policy(stream), + thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id), + thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id) + + num_nodes, + thrust::make_counting_iterator(0), + thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()), + [d_ignore_vals = d_ignore_vals.begin(), + parent_node_ids = tree.parent_node_ids.begin(), + column_categories = d_column_tree.node_categories.begin(), + col_ids = col_ids.begin()] __device__(size_type node_id) { + auto parent_node_id = parent_node_ids[node_id]; + return parent_node_id != parent_node_sentinel and + column_categories[col_ids[parent_node_id]] == NC_LIST and + (!d_ignore_vals[col_ids[parent_node_id]]); + }); + + auto const num_list_children = + list_children_end - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()); + thrust::stable_sort_by_key(rmm::exec_policy(stream), + parent_col_ids.begin(), + parent_col_ids.begin() + num_list_children, + node_ids.begin()); + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + num_list_children, + [node_ids = node_ids.begin(), + parent_node_ids = tree.parent_node_ids.begin(), + parent_col_ids = parent_col_ids.begin(), + row_offsets = row_offsets.begin(), + d_columns_data = d_columns_data.begin(), + num_list_children] __device__(size_type i) { + auto const node_id = node_ids[i]; + auto const parent_node_id = parent_node_ids[node_id]; + // scatter to list_offset + if (i == 0 or parent_node_ids[node_ids[i - 1]] != parent_node_id) { + d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id]] = + row_offsets[node_id]; + } + // last value of list child_offset is its size. + if (i == num_list_children - 1 or parent_node_ids[node_ids[i + 1]] != parent_node_id) { + d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id] + 1] = + row_offsets[node_id] + 1; + } + }); + + // 5. scan on offsets. 
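// The offsets above were scattered only at rows where the column actually appears; null
// rows still hold 0. Since offsets are non-decreasing along rows, an inclusive max-scan
// forward-fills the gaps, e.g. {0, 2, 0, 0, 5, 7} -> {0, 2, 2, 2, 5, 7}, so a missing row
// inherits the previous boundary and contributes zero characters / zero list elements.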
+ for (auto& [id, col_ref] : columns) { + auto& col = col_ref.get(); + if (col.type == json_col_t::StringColumn) { + thrust::inclusive_scan(rmm::exec_policy_nosync(stream), + col.string_offsets.begin(), + col.string_offsets.end(), + col.string_offsets.begin(), + thrust::maximum{}); + } else if (col.type == json_col_t::ListColumn) { + thrust::inclusive_scan(rmm::exec_policy_nosync(stream), + col.child_offsets.begin(), + col.child_offsets.end(), + col.child_offsets.begin(), + thrust::maximum{}); + } + } + stream.synchronize(); +} + +} // namespace detail +} // namespace cudf::io::json diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 2e3c5746520..f1bf43b5e85 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -98,649 +98,6 @@ void print_tree(host_span input, printf(" (JSON)\n"); } -/** - * @brief Get the column indices for the values column for array of arrays rows - * - * @param row_array_children_level The level of the row array's children - * @param d_tree The tree metadata - * @param col_ids The column ids - * @param num_columns The number of columns - * @param stream The stream to use - * @return The value columns' indices - */ -rmm::device_uvector get_values_column_indices(TreeDepthT const row_array_children_level, - tree_meta_t const& d_tree, - device_span col_ids, - size_type const num_columns, - rmm::cuda_stream_view stream) -{ - CUDF_FUNC_RANGE(); - auto [level2_nodes, level2_indices] = get_array_children_indices( - row_array_children_level, d_tree.node_levels, d_tree.parent_node_ids, stream); - auto col_id_location = thrust::make_permutation_iterator(col_ids.begin(), level2_nodes.begin()); - rmm::device_uvector values_column_indices(num_columns, stream); - thrust::scatter(rmm::exec_policy(stream), - level2_indices.begin(), - level2_indices.end(), - col_id_location, - values_column_indices.begin()); - return values_column_indices; -} - -/** - * @brief Copies strings specified by pair of begin, end offsets to host vector of strings. 
- * - * @param input String device buffer - * @param node_range_begin Begin offset of the strings - * @param node_range_end End offset of the strings - * @param stream CUDA stream - * @return Vector of strings - */ -std::vector copy_strings_to_host_sync( - device_span input, - device_span node_range_begin, - device_span node_range_end, - rmm::cuda_stream_view stream) -{ - CUDF_FUNC_RANGE(); - auto const num_strings = node_range_begin.size(); - rmm::device_uvector string_offsets(num_strings, stream); - rmm::device_uvector string_lengths(num_strings, stream); - auto d_offset_pairs = thrust::make_zip_iterator(node_range_begin.begin(), node_range_end.begin()); - thrust::transform(rmm::exec_policy(stream), - d_offset_pairs, - d_offset_pairs + num_strings, - thrust::make_zip_iterator(string_offsets.begin(), string_lengths.begin()), - [] __device__(auto const& offsets) { - // Note: first character for non-field columns - return thrust::make_tuple( - static_cast(thrust::get<0>(offsets)), - static_cast(thrust::get<1>(offsets) - thrust::get<0>(offsets))); - }); - - cudf::io::parse_options_view options_view{}; - options_view.quotechar = '\0'; // no quotes - options_view.keepquotes = true; - auto d_offset_length_it = - thrust::make_zip_iterator(string_offsets.begin(), string_lengths.begin()); - auto d_column_names = parse_data(input.data(), - d_offset_length_it, - num_strings, - data_type{type_id::STRING}, - rmm::device_buffer{}, - 0, - options_view, - stream, - rmm::mr::get_current_device_resource()); - auto to_host = [stream](auto const& col) { - if (col.is_empty()) return std::vector{}; - auto const scv = cudf::strings_column_view(col); - auto const h_chars = cudf::detail::make_std_vector_async( - cudf::device_span(scv.chars_begin(stream), scv.chars_size(stream)), stream); - auto const h_offsets = cudf::detail::make_std_vector_async( - cudf::device_span(scv.offsets().data() + scv.offset(), - scv.size() + 1), - stream); - stream.synchronize(); - - // build std::string vector from chars and offsets - std::vector host_data; - host_data.reserve(col.size()); - std::transform( - std::begin(h_offsets), - std::end(h_offsets) - 1, - std::begin(h_offsets) + 1, - std::back_inserter(host_data), - [&](auto start, auto end) { return std::string(h_chars.data() + start, end - start); }); - return host_data; - }; - return to_host(d_column_names->view()); -} - -/** - * @brief Checks if all strings in each string column in the tree are nulls. - * For non-string columns, it's set as true. If any of rows in a string column is false, it's set as - * false. - * - * @param input Input JSON string device data - * @param d_column_tree column tree representation of JSON string - * @param tree Node tree representation of the JSON string - * @param col_ids Column ids of the nodes in the tree - * @param options Parsing options specifying the parsing behaviour - * @param stream CUDA stream used for device memory operations and kernel launches - * @return Array of bytes where each byte indicate if it is all nulls string column. 
- */ -rmm::device_uvector is_all_nulls_each_column(device_span input, - tree_meta_t const& d_column_tree, - tree_meta_t const& tree, - device_span col_ids, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream) -{ - auto const num_nodes = col_ids.size(); - auto const num_cols = d_column_tree.node_categories.size(); - rmm::device_uvector is_all_nulls(num_cols, stream); - thrust::fill(rmm::exec_policy(stream), is_all_nulls.begin(), is_all_nulls.end(), true); - - auto parse_opt = parsing_options(options, stream); - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::counting_iterator(0), - num_nodes, - [options = parse_opt.view(), - data = input.data(), - column_categories = d_column_tree.node_categories.begin(), - col_ids = col_ids.begin(), - range_begin = tree.node_range_begin.begin(), - range_end = tree.node_range_end.begin(), - is_all_nulls = is_all_nulls.begin()] __device__(size_type i) { - auto const node_category = column_categories[col_ids[i]]; - if (node_category == NC_STR or node_category == NC_VAL) { - auto const is_null_literal = serialized_trie_contains( - options.trie_na, - {data + range_begin[i], static_cast(range_end[i] - range_begin[i])}); - if (!is_null_literal) is_all_nulls[col_ids[i]] = false; - } - }); - return is_all_nulls; -} - -/** - * @brief Holds member data pointers of `d_json_column` - * - */ -struct json_column_data { - using row_offset_t = json_column::row_offset_t; - row_offset_t* string_offsets; - row_offset_t* string_lengths; - row_offset_t* child_offsets; - bitmask_type* validity; -}; - -/** - * @brief Constructs `d_json_column` from node tree representation - * Newly constructed columns are insert into `root`'s children. - * `root` must be a list type. - * - * @param input Input JSON string device data - * @param tree Node tree representation of the JSON string - * @param col_ids Column ids of the nodes in the tree - * @param row_offsets Row offsets of the nodes in the tree - * @param root Root node of the `d_json_column` tree - * @param is_array_of_arrays Whether the tree is an array of arrays - * @param options Parsing options specifying the parsing behaviour - * options affecting behaviour are - * is_enabled_lines: Whether the input is a line-delimited JSON - * is_enabled_mixed_types_as_string: Whether to enable reading mixed types as string - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the device memory - * of child_offets and validity members of `d_json_column` - */ -void make_device_json_column(device_span input, - tree_meta_t& tree, - device_span col_ids, - device_span row_offsets, - device_json_column& root, - bool is_array_of_arrays, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_FUNC_RANGE(); - - bool const is_enabled_lines = options.is_enabled_lines(); - bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string(); - auto const num_nodes = col_ids.size(); - rmm::device_uvector sorted_col_ids(col_ids.size(), stream); // make a copy - thrust::copy(rmm::exec_policy(stream), col_ids.begin(), col_ids.end(), sorted_col_ids.begin()); - - // sort by {col_id} on {node_ids} stable - rmm::device_uvector node_ids(col_ids.size(), stream); - thrust::sequence(rmm::exec_policy(stream), node_ids.begin(), node_ids.end()); - thrust::stable_sort_by_key( - rmm::exec_policy(stream), sorted_col_ids.begin(), sorted_col_ids.end(), 
node_ids.begin()); - - NodeIndexT const row_array_parent_col_id = [&]() { - NodeIndexT value = parent_node_sentinel; - if (!col_ids.empty()) { - auto const list_node_index = is_enabled_lines ? 0 : 1; - CUDF_CUDA_TRY(cudaMemcpyAsync(&value, - col_ids.data() + list_node_index, - sizeof(NodeIndexT), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); - } - return value; - }(); - - // 1. gather column information. - auto [d_column_tree, d_unique_col_ids, d_max_row_offsets] = - reduce_to_column_tree(tree, - col_ids, - sorted_col_ids, - node_ids, - row_offsets, - is_array_of_arrays, - row_array_parent_col_id, - stream); - auto num_columns = d_unique_col_ids.size(); - auto unique_col_ids = cudf::detail::make_std_vector_async(d_unique_col_ids, stream); - auto column_categories = - cudf::detail::make_std_vector_async(d_column_tree.node_categories, stream); - auto column_parent_ids = - cudf::detail::make_std_vector_async(d_column_tree.parent_node_ids, stream); - auto column_range_beg = - cudf::detail::make_std_vector_async(d_column_tree.node_range_begin, stream); - auto max_row_offsets = cudf::detail::make_std_vector_async(d_max_row_offsets, stream); - std::vector column_names = copy_strings_to_host_sync( - input, d_column_tree.node_range_begin, d_column_tree.node_range_end, stream); - stream.synchronize(); - // array of arrays column names - if (is_array_of_arrays) { - TreeDepthT const row_array_children_level = is_enabled_lines ? 1 : 2; - auto values_column_indices = - get_values_column_indices(row_array_children_level, tree, col_ids, num_columns, stream); - auto h_values_column_indices = - cudf::detail::make_std_vector_async(values_column_indices, stream); - stream.synchronize(); - std::transform(unique_col_ids.begin(), - unique_col_ids.end(), - column_names.begin(), - column_names.begin(), - [&h_values_column_indices, &column_parent_ids, row_array_parent_col_id]( - auto col_id, auto name) mutable { - return column_parent_ids[col_id] == row_array_parent_col_id - ? 
std::to_string(h_values_column_indices[col_id]) - : name; - }); - } - - auto to_json_col_type = [](auto category) { - switch (category) { - case NC_STRUCT: return json_col_t::StructColumn; - case NC_LIST: return json_col_t::ListColumn; - case NC_STR: [[fallthrough]]; - case NC_VAL: return json_col_t::StringColumn; - default: return json_col_t::Unknown; - } - }; - auto init_to_zero = [stream](auto& v) { - thrust::uninitialized_fill(rmm::exec_policy_nosync(stream), v.begin(), v.end(), 0); - }; - - auto initialize_json_columns = [&](auto i, auto& col) { - if (column_categories[i] == NC_ERR || column_categories[i] == NC_FN) { - return; - } else if (column_categories[i] == NC_VAL || column_categories[i] == NC_STR) { - col.string_offsets.resize(max_row_offsets[i] + 1, stream); - col.string_lengths.resize(max_row_offsets[i] + 1, stream); - init_to_zero(col.string_offsets); - init_to_zero(col.string_lengths); - } else if (column_categories[i] == NC_LIST) { - col.child_offsets.resize(max_row_offsets[i] + 2, stream); - init_to_zero(col.child_offsets); - } - col.num_rows = max_row_offsets[i] + 1; - col.validity = - cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr); - col.type = to_json_col_type(column_categories[i]); - }; - - auto reinitialize_as_string = [&](auto i, auto& col) { - col.string_offsets.resize(max_row_offsets[i] + 1, stream); - col.string_lengths.resize(max_row_offsets[i] + 1, stream); - init_to_zero(col.string_offsets); - init_to_zero(col.string_lengths); - col.num_rows = max_row_offsets[i] + 1; - col.validity = - cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr); - col.type = json_col_t::StringColumn; - // destroy references of all child columns after this step, by calling remove_child_columns - }; - - path_from_tree tree_path{column_categories, - column_parent_ids, - column_names, - is_array_of_arrays, - row_array_parent_col_id}; - - // 2. generate nested columns tree and its device_memory - // reorder unique_col_ids w.r.t. column_range_begin for order of column to be in field order. 
- auto h_range_col_id_it = - thrust::make_zip_iterator(column_range_beg.begin(), unique_col_ids.begin()); - std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { - return thrust::get<0>(a) < thrust::get<0>(b); - }); - - std::vector is_str_column_all_nulls{}; - if (is_enabled_mixed_types_as_string) { - is_str_column_all_nulls = cudf::detail::make_std_vector_sync( - is_all_nulls_each_column(input, d_column_tree, tree, col_ids, options, stream), stream); - } - - // use hash map because we may skip field name's col_ids - std::unordered_map> columns; - // map{parent_col_id, child_col_name}> = child_col_id, used for null value column tracking - std::map, NodeIndexT> mapped_columns; - // find column_ids which are values, but should be ignored in validity - auto ignore_vals = cudf::detail::make_host_vector(num_columns, stream); - std::vector is_mixed_type_column(num_columns, 0); - std::vector is_pruned(num_columns, 0); - columns.try_emplace(parent_node_sentinel, std::ref(root)); - - std::function remove_child_columns = - [&](NodeIndexT this_col_id, device_json_column& col) { - for (auto col_name : col.column_order) { - auto child_id = mapped_columns[{this_col_id, col_name}]; - is_mixed_type_column[child_id] = 1; - remove_child_columns(child_id, col.child_columns.at(col_name)); - mapped_columns.erase({this_col_id, col_name}); - columns.erase(child_id); - } - col.child_columns.clear(); // their references are deleted above. - col.column_order.clear(); - }; - - auto name_and_parent_index = [&is_array_of_arrays, - &row_array_parent_col_id, - &column_parent_ids, - &column_categories, - &column_names](auto this_col_id) { - std::string name = ""; - auto parent_col_id = column_parent_ids[this_col_id]; - if (parent_col_id == parent_node_sentinel || column_categories[parent_col_id] == NC_LIST) { - if (is_array_of_arrays && parent_col_id == row_array_parent_col_id) { - name = column_names[this_col_id]; - } else { - name = list_child_name; - } - } else if (column_categories[parent_col_id] == NC_FN) { - auto field_name_col_id = parent_col_id; - parent_col_id = column_parent_ids[parent_col_id]; - name = column_names[field_name_col_id]; - } else { - CUDF_FAIL("Unexpected parent column category"); - } - return std::pair{name, parent_col_id}; - }; - - // Prune columns that are not required to be parsed. - if (options.is_enabled_prune_columns()) { - for (auto const this_col_id : unique_col_ids) { - if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) { - continue; - } - // Struct, List, String, Value - auto [name, parent_col_id] = name_and_parent_index(this_col_id); - // get path of this column, and get its dtype if present in options - auto const nt = tree_path.get_path(this_col_id); - std::optional const user_dtype = get_path_data_type(nt, options); - if (!user_dtype.has_value() and parent_col_id != parent_node_sentinel) { - is_pruned[this_col_id] = 1; - continue; - } else { - // make sure all its parents are not pruned. - while (parent_col_id != parent_node_sentinel and is_pruned[parent_col_id] == 1) { - is_pruned[parent_col_id] = 0; - parent_col_id = column_parent_ids[parent_col_id]; - } - } - } - } - - // Build the column tree, also, handles mixed types. 
- for (auto const this_col_id : unique_col_ids) { - if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) { - continue; - } - // Struct, List, String, Value - auto [name, parent_col_id] = name_and_parent_index(this_col_id); - - // if parent is mixed type column or this column is pruned, ignore this column. - if (parent_col_id != parent_node_sentinel && - (is_mixed_type_column[parent_col_id] || is_pruned[this_col_id])) { - ignore_vals[this_col_id] = 1; - if (is_mixed_type_column[parent_col_id]) { is_mixed_type_column[this_col_id] = 1; } - continue; - } - - // If the child is already found, - // replace if this column is a nested column and the existing was a value column - // ignore this column if this column is a value column and the existing was a nested column - auto it = columns.find(parent_col_id); - CUDF_EXPECTS(it != columns.end(), "Parent column not found"); - auto& parent_col = it->second.get(); - bool replaced = false; - if (mapped_columns.count({parent_col_id, name}) > 0) { - auto const old_col_id = mapped_columns[{parent_col_id, name}]; - // If mixed type as string is enabled, make both of them strings and merge them. - // All child columns will be ignored when parsing. - if (is_enabled_mixed_types_as_string) { - bool const is_mixed_type = [&]() { - // If new or old is STR and they are all not null, make it mixed type, else ignore. - if (column_categories[this_col_id] == NC_VAL || - column_categories[this_col_id] == NC_STR) { - if (is_str_column_all_nulls[this_col_id]) return false; - } - if (column_categories[old_col_id] == NC_VAL || column_categories[old_col_id] == NC_STR) { - if (is_str_column_all_nulls[old_col_id]) return false; - } - return true; - }(); - if (is_mixed_type) { - is_mixed_type_column[this_col_id] = 1; - is_mixed_type_column[old_col_id] = 1; - // if old col type (not cat) is list or struct, replace with string. - auto& col = columns.at(old_col_id).get(); - if (col.type == json_col_t::ListColumn or col.type == json_col_t::StructColumn) { - reinitialize_as_string(old_col_id, col); - remove_child_columns(old_col_id, col); - // all its children (which are already inserted) are ignored later. 
- } - col.forced_as_string_column = true; - columns.try_emplace(this_col_id, columns.at(old_col_id)); - continue; - } - } - - if (column_categories[this_col_id] == NC_VAL || column_categories[this_col_id] == NC_STR) { - ignore_vals[this_col_id] = 1; - continue; - } - if (column_categories[old_col_id] == NC_VAL || column_categories[old_col_id] == NC_STR) { - // remap - ignore_vals[old_col_id] = 1; - mapped_columns.erase({parent_col_id, name}); - columns.erase(old_col_id); - parent_col.child_columns.erase(name); - replaced = true; // to skip duplicate name in column_order - } else { - // If this is a nested column but we're trying to insert either (a) a list node into a - // struct column or (b) a struct node into a list column, we fail - CUDF_EXPECTS(not((column_categories[old_col_id] == NC_LIST and - column_categories[this_col_id] == NC_STRUCT) or - (column_categories[old_col_id] == NC_STRUCT and - column_categories[this_col_id] == NC_LIST)), - "A mix of lists and structs within the same column is not supported"); - } - } - - if (is_enabled_mixed_types_as_string) { - // get path of this column, check if it is a struct forced as string, and enforce it - auto const nt = tree_path.get_path(this_col_id); - std::optional const user_dtype = get_path_data_type(nt, options); - if (column_categories[this_col_id] == NC_STRUCT and user_dtype.has_value() and - user_dtype.value().id() == type_id::STRING) { - is_mixed_type_column[this_col_id] = 1; - column_categories[this_col_id] = NC_STR; - } - } - - CUDF_EXPECTS(parent_col.child_columns.count(name) == 0, "duplicate column name: " + name); - // move into parent - device_json_column col(stream, mr); - initialize_json_columns(this_col_id, col); - auto inserted = parent_col.child_columns.try_emplace(name, std::move(col)).second; - CUDF_EXPECTS(inserted, "child column insertion failed, duplicate column name in the parent"); - if (not replaced) parent_col.column_order.push_back(name); - columns.try_emplace(this_col_id, std::ref(parent_col.child_columns.at(name))); - mapped_columns.try_emplace(std::make_pair(parent_col_id, name), this_col_id); - } - - if (is_enabled_mixed_types_as_string) { - // ignore all children of mixed type columns - for (auto const this_col_id : unique_col_ids) { - auto parent_col_id = column_parent_ids[this_col_id]; - if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 1) { - is_mixed_type_column[this_col_id] = 1; - ignore_vals[this_col_id] = 1; - columns.erase(this_col_id); - } - // Convert only mixed type columns as string (so to copy), but not its children - if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 0 and - is_mixed_type_column[this_col_id] == 1) - column_categories[this_col_id] = NC_STR; - } - cudaMemcpyAsync(d_column_tree.node_categories.begin(), - column_categories.data(), - column_categories.size() * sizeof(column_categories[0]), - cudaMemcpyDefault, - stream.value()); - } - - // restore unique_col_ids order - std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { - return thrust::get<1>(a) < thrust::get<1>(b); - }); - // move columns data to device. 
- auto columns_data = cudf::detail::make_host_vector(num_columns, stream); - for (auto& [col_id, col_ref] : columns) { - if (col_id == parent_node_sentinel) continue; - auto& col = col_ref.get(); - columns_data[col_id] = json_column_data{col.string_offsets.data(), - col.string_lengths.data(), - col.child_offsets.data(), - static_cast(col.validity.data())}; - } - - auto d_ignore_vals = cudf::detail::make_device_uvector_async( - ignore_vals, stream, rmm::mr::get_current_device_resource()); - auto d_columns_data = cudf::detail::make_device_uvector_async( - columns_data, stream, rmm::mr::get_current_device_resource()); - - // 3. scatter string offsets to respective columns, set validity bits - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::counting_iterator(0), - num_nodes, - [column_categories = d_column_tree.node_categories.begin(), - col_ids = col_ids.begin(), - row_offsets = row_offsets.begin(), - range_begin = tree.node_range_begin.begin(), - range_end = tree.node_range_end.begin(), - d_ignore_vals = d_ignore_vals.begin(), - d_columns_data = d_columns_data.begin()] __device__(size_type i) { - if (d_ignore_vals[col_ids[i]]) return; - auto const node_category = column_categories[col_ids[i]]; - switch (node_category) { - case NC_STRUCT: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; - case NC_LIST: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; - case NC_STR: [[fallthrough]]; - case NC_VAL: - if (d_ignore_vals[col_ids[i]]) break; - set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); - d_columns_data[col_ids[i]].string_offsets[row_offsets[i]] = range_begin[i]; - d_columns_data[col_ids[i]].string_lengths[row_offsets[i]] = range_end[i] - range_begin[i]; - break; - default: break; - } - }); - - // 4. scatter List offset - // copy_if only node's whose parent is list, (node_id, parent_col_id) - // stable_sort by parent_col_id of {node_id}. - // For all unique parent_node_id of (i==0, i-1!=i), write start offset. - // (i==last, i+1!=i), write end offset. - // unique_copy_by_key {parent_node_id} {row_offset} to - // col[parent_col_id].child_offsets[row_offset[parent_node_id]] - - auto& parent_col_ids = sorted_col_ids; // reuse sorted_col_ids - auto parent_col_id = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - cuda::proclaim_return_type( - [col_ids = col_ids.begin(), - parent_node_ids = tree.parent_node_ids.begin()] __device__(size_type node_id) { - return parent_node_ids[node_id] == parent_node_sentinel ? 
parent_node_sentinel - : col_ids[parent_node_ids[node_id]]; - })); - auto const list_children_end = thrust::copy_if( - rmm::exec_policy(stream), - thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id), - thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id) + - num_nodes, - thrust::make_counting_iterator(0), - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()), - [d_ignore_vals = d_ignore_vals.begin(), - parent_node_ids = tree.parent_node_ids.begin(), - column_categories = d_column_tree.node_categories.begin(), - col_ids = col_ids.begin()] __device__(size_type node_id) { - auto parent_node_id = parent_node_ids[node_id]; - return parent_node_id != parent_node_sentinel and - column_categories[col_ids[parent_node_id]] == NC_LIST and - (!d_ignore_vals[col_ids[parent_node_id]]); - }); - - auto const num_list_children = - list_children_end - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()); - thrust::stable_sort_by_key(rmm::exec_policy(stream), - parent_col_ids.begin(), - parent_col_ids.begin() + num_list_children, - node_ids.begin()); - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - num_list_children, - [node_ids = node_ids.begin(), - parent_node_ids = tree.parent_node_ids.begin(), - parent_col_ids = parent_col_ids.begin(), - row_offsets = row_offsets.begin(), - d_columns_data = d_columns_data.begin(), - num_list_children] __device__(size_type i) { - auto const node_id = node_ids[i]; - auto const parent_node_id = parent_node_ids[node_id]; - // scatter to list_offset - if (i == 0 or parent_node_ids[node_ids[i - 1]] != parent_node_id) { - d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id]] = - row_offsets[node_id]; - } - // last value of list child_offset is its size. - if (i == num_list_children - 1 or parent_node_ids[node_ids[i + 1]] != parent_node_id) { - d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id] + 1] = - row_offsets[node_id] + 1; - } - }); - - // 5. scan on offsets. - for (auto& [id, col_ref] : columns) { - auto& col = col_ref.get(); - if (col.type == json_col_t::StringColumn) { - thrust::inclusive_scan(rmm::exec_policy_nosync(stream), - col.string_offsets.begin(), - col.string_offsets.end(), - col.string_offsets.begin(), - thrust::maximum{}); - } else if (col.type == json_col_t::ListColumn) { - thrust::inclusive_scan(rmm::exec_policy_nosync(stream), - col.child_offsets.begin(), - col.child_offsets.end(), - col.child_offsets.begin(), - thrust::maximum{}); - } - } - stream.synchronize(); -} - std::pair, std::vector> device_json_column_to_cudf_column( device_json_column& json_col, device_span d_input, diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index e42bdfe507e..1d27ef7260f 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -25,6 +25,8 @@ #include #include +#include +#include #include @@ -201,6 +203,7 @@ enum class stack_behavior_t : char { constexpr auto list_child_name{"element"}; namespace experimental { +using row_offset_t = size_type; /* * @brief Sparse graph adjacency matrix stored in Compressed Sparse Row (CSR) format. */ @@ -214,11 +217,27 @@ struct csr { * column subtree, but not required for the final cudf column construction. 
*/ struct column_tree_properties { + rmm::device_scalar num_levels; rmm::device_uvector categories; - rmm::device_uvector max_row_offsets; + rmm::device_uvector max_row_offsets; rmm::device_uvector mapped_ids; }; +/* + * @brief Positional and validity information for subgraph extracted from column tree. The offsets + * for each node in the member arrays can be obtained by segmented sums of the max_row_offsets array + * in column_tree_properties. + */ +struct device_column_subtree_properties { + rmm::device_uvector string_offsets; + rmm::device_uvector string_lengths; + // Row offsets + rmm::device_uvector child_offsets; + // Validity bitmap + rmm::device_buffer validity; +}; + + /* * @brief Unvalidated column tree stored in Compressed Sparse Row (CSR) format. The device json * column subtree - the subgraph that conforms to column tree properties - is extracted and further @@ -228,20 +247,12 @@ struct column_tree_properties { struct column_tree { // position of nnzs csr adjacency; - rmm::device_uvector rowidx; - rmm::device_uvector colidx; // device_json_column properties - using row_offset_t = size_type; // Indicator array for the device column subtree // Stores the number of rows in the column if the node is part of device column subtree // Stores zero otherwise rmm::device_uvector subtree_nrows; - rmm::device_uvector string_offsets; - rmm::device_uvector string_lengths; - // Row offsets - rmm::device_uvector child_offsets; - // Validity bitmap - rmm::device_buffer validity; + device_column_subtree_properties d_props; }; namespace detail { @@ -262,10 +273,9 @@ std::tuple reduce_to_column_tree( device_span row_offsets, bool is_array_of_arrays, NodeIndexT const row_array_parent_col_id, - cudf::io::json_reader_options const& reader_options, rmm::cuda_stream_view stream); -void make_device_json_column_csr(device_span input, +void make_device_json_column(device_span input, tree_meta_t& tree, device_span col_ids, device_span row_offsets, @@ -413,6 +423,35 @@ reduce_to_column_tree(tree_meta_t& tree, cudf::io::parse_options parsing_options(cudf::io::json_reader_options const& options, rmm::cuda_stream_view stream); +/** + * @brief Constructs `d_json_column` from node tree representation + * Newly constructed columns are insert into `root`'s children. + * `root` must be a list type. + * + * @param input Input JSON string device data + * @param tree Node tree representation of the JSON string + * @param col_ids Column ids of the nodes in the tree + * @param row_offsets Row offsets of the nodes in the tree + * @param root Root node of the `d_json_column` tree + * @param is_array_of_arrays Whether the tree is an array of arrays + * @param options Parsing options specifying the parsing behaviour + * options affecting behaviour are + * is_enabled_lines: Whether the input is a line-delimited JSON + * is_enabled_mixed_types_as_string: Whether to enable reading mixed types as string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the device memory + * of child_offets and validity members of `d_json_column` + */ +void make_device_json_column(device_span input, + tree_meta_t& tree, + device_span col_ids, + device_span row_offsets, + device_json_column& root, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + /** * @brief Parses the given JSON string and generates table from the given input. 
* From 3949cda73c58f97d8dfda8188039604fb23cedd9 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Fri, 2 Aug 2024 01:14:23 +0000 Subject: [PATCH 27/28] minor fixes --- .../io/json/device_column_tree_extraction.cu | 23 +++++++++++-------- cpp/src/io/json/nested_json.hpp | 6 ++--- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/cpp/src/io/json/device_column_tree_extraction.cu b/cpp/src/io/json/device_column_tree_extraction.cu index c1ddeb81990..f03d11b309f 100644 --- a/cpp/src/io/json/device_column_tree_extraction.cu +++ b/cpp/src/io/json/device_column_tree_extraction.cu @@ -241,7 +241,7 @@ rmm::device_uvector extract_device_column_subtree( // (Optional?) TODO: Validation of the remaining column tree properties - return std::move(subtree_nrows); + return subtree_nrows; } device_column_subtree_properties allocate_device_column_subtree_properties( @@ -282,8 +282,8 @@ device_column_subtree_properties allocate_device_column_subtree_properties( thrust::reduce( rmm::exec_policy(stream), max_row_offsets_it, max_row_offsets_it + str_val_end) + str_val_end; - rmm::device_uvector string_offsets(string_offsets_size, stream); - rmm::device_uvector string_lengths(string_offsets_size, stream); + rmm::device_uvector string_offsets(string_offsets_size, stream); + rmm::device_uvector string_lengths(string_offsets_size, stream); auto list_partitioning_idx_it = thrust::partition(rmm::exec_policy(stream), @@ -301,7 +301,7 @@ device_column_subtree_properties allocate_device_column_subtree_properties( max_row_offsets_it, max_row_offsets_it + (list_end - str_val_end)) + 2 * (list_end - str_val_end); - rmm::device_uvector child_offsets(child_offsets_size, stream); + rmm::device_uvector child_offsets(child_offsets_size, stream); auto validity_buffer_size = thrust::reduce(rmm::exec_policy(stream), subtree_nrows.begin(), subtree_nrows.end()); @@ -319,10 +319,11 @@ device_column_subtree_properties allocate_device_column_subtree_properties( void initialize_device_column_subtree_properties( device_column_subtree_properties &d_props, + device_span subtree_nrows, tree_meta_t &tree, device_span original_col_ids, device_span row_offsets, - const column_tree_properties &c_props, + column_tree_properties &c_props, rmm::cuda_stream_view stream) { auto num_nodes = tree.node_levels.size(); @@ -342,7 +343,7 @@ void initialize_device_column_subtree_properties( sorted_node_levels_order.begin()); auto node_range_lengths_it = thrust::make_transform_iterator( thrust::make_zip_iterator(node_range_begin_it, node_range_end_it), - cuda::proclaim_return_type([] __device__(auto range_it) { + cuda::proclaim_return_type([] __device__(auto range_it) { return thrust::get<1>(range_it) - thrust::get<0>(range_it); })); @@ -352,13 +353,14 @@ void initialize_device_column_subtree_properties( sorted_node_levels_order.begin()); rmm::device_uvector sorted_subtree_nrows(num_columns, stream); + thrust::copy(rmm::exec_policy_nosync(stream), subtree_nrows.begin(), subtree_nrows.end(), sorted_subtree_nrows.begin()); thrust::sort_by_key(rmm::exec_policy_nosync(stream), c_props.mapped_ids.begin(), c_props.mapped_ids.end(), sorted_subtree_nrows.begin()); thrust::copy_if( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), thrust::make_zip_iterator(node_range_begin_it, node_range_lengths_it), thrust::make_zip_iterator(node_range_begin_it + num_nodes, node_range_lengths_it + num_nodes), thrust::make_counting_iterator(0), @@ -371,8 +373,9 @@ void initialize_device_column_subtree_properties( }); // row_offsets need to be prefix 
summed across columns for validity initialization + // TODO: replace replace_if with a transform input iterator and pass that to inclusive scan thrust::replace_if( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), row_offsets_it, row_offsets_it + num_nodes, thrust::make_counting_iterator(0), @@ -380,9 +383,9 @@ void initialize_device_column_subtree_properties( NodeIndexT node) { return sorted_subtree_nrows[node_col_ids_it[node]] == 0; }, 0); thrust::inclusive_scan( - rmm::exec_policy(stream), row_offsets_it, row_offsets_it + num_nodes, row_offsets_it); + rmm::exec_policy_nosync(stream), row_offsets_it, row_offsets_it + num_nodes, row_offsets_it); thrust::for_each_n( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), num_nodes, [sorted_subtree_nrows = sorted_subtree_nrows.begin(), diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 1d27ef7260f..36e5d63fa50 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -229,10 +229,10 @@ struct column_tree_properties { * in column_tree_properties. */ struct device_column_subtree_properties { - rmm::device_uvector string_offsets; - rmm::device_uvector string_lengths; + rmm::device_uvector string_offsets; + rmm::device_uvector string_lengths; // Row offsets - rmm::device_uvector child_offsets; + rmm::device_uvector child_offsets; // Validity bitmap rmm::device_buffer validity; }; From 4d88fe52e650c4346bc4003eb7b3f56f09d80b41 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Fri, 2 Aug 2024 01:15:37 +0000 Subject: [PATCH 28/28] formatting --- cpp/src/io/json/column_tree_construction.cu | 12 +- .../io/json/device_column_tree_extraction.cu | 216 +++++++++--------- cpp/src/io/json/nested_json.hpp | 21 +- 3 files changed, 128 insertions(+), 121 deletions(-) diff --git a/cpp/src/io/json/column_tree_construction.cu b/cpp/src/io/json/column_tree_construction.cu index d6ebadfd9d9..e3c3e367361 100644 --- a/cpp/src/io/json/column_tree_construction.cu +++ b/cpp/src/io/json/column_tree_construction.cu @@ -146,7 +146,8 @@ std::tuple reduce_to_column_tree( auto* dev_num_levels_ptr = thrust::max_element( rmm::exec_policy_nosync(stream), tree.node_levels.begin(), tree.node_levels.end()); rmm::device_scalar num_levels(stream); - CUDF_CUDA_TRY(cudaMemcpyAsync(num_levels.data(), dev_num_levels_ptr, sizeof(NodeIndexT), cudaMemcpyDeviceToDevice, stream)); + CUDF_CUDA_TRY(cudaMemcpyAsync( + num_levels.data(), dev_num_levels_ptr, sizeof(NodeIndexT), cudaMemcpyDeviceToDevice, stream)); rmm::device_uvector mapped_col_ids_copy(num_columns, stream); thrust::copy(rmm::exec_policy_nosync(stream), @@ -335,10 +336,11 @@ std::tuple reduce_to_column_tree( [] __device__(auto ancestor) { return ancestor != -1; }); } - return std::tuple{ - csr{std::move(rowidx), std::move(colidx)}, - column_tree_properties{std::move(num_levels), - std::move(column_categories), std::move(max_row_offsets), std::move(mapped_col_ids)}}; + return std::tuple{csr{std::move(rowidx), std::move(colidx)}, + column_tree_properties{std::move(num_levels), + std::move(column_categories), + std::move(max_row_offsets), + std::move(mapped_col_ids)}}; } } // namespace experimental::detail diff --git a/cpp/src/io/json/device_column_tree_extraction.cu b/cpp/src/io/json/device_column_tree_extraction.cu index f03d11b309f..b15d327233c 100644 --- a/cpp/src/io/json/device_column_tree_extraction.cu +++ b/cpp/src/io/json/device_column_tree_extraction.cu @@ -16,8 +16,8 @@ #include 
"io/utilities/parsing_utils.cuh" #include "io/utilities/string_parsing.hpp" -#include "nested_json.hpp" #include "json_utils.hpp" +#include "nested_json.hpp" #include #include @@ -65,8 +65,8 @@ namespace experimental::detail { using row_offset_t = size_type; rmm::device_uvector extract_device_column_subtree( - const csr &adjacency, - const column_tree_properties &props, + const csr& adjacency, + const column_tree_properties& props, cudf::io::json_reader_options reader_options, rmm::cuda_stream_view stream) { @@ -76,11 +76,11 @@ rmm::device_uvector extract_device_column_subtree( CUDF_EXPECTS(reader_options.is_enabled_prune_columns() == false, "column pruning has not yet been implemented"); - auto &rowidx = adjacency.rowidx; - auto &colidx = adjacency.colidx; - auto &categories = props.categories; - auto &max_row_offsets = props.max_row_offsets; - auto &num_levels = props.num_levels; + auto& rowidx = adjacency.rowidx; + auto& colidx = adjacency.colidx; + auto& categories = props.categories; + auto& max_row_offsets = props.max_row_offsets; + auto& num_levels = props.num_levels; // Traversing the column tree and annotating the device column subtree auto num_columns = rowidx.size() - 1; @@ -94,27 +94,28 @@ rmm::device_uvector extract_device_column_subtree( rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), num_columns, - [rowidx = rowidx.begin(), - colidx = colidx.begin(), + [rowidx = rowidx.begin(), + colidx = colidx.begin(), num_levels_ptr = num_levels.data(), - categories = categories.begin(), - err_ancestors = err_ancestors.begin()] __device__(NodeIndexT node) { - auto num_levels = *num_levels_ptr; + categories = categories.begin(), + err_ancestors = err_ancestors.begin()] __device__(NodeIndexT node) { + auto num_levels = *num_levels_ptr; err_ancestors[node] = node; for (int level = 0; level <= num_levels; level++) { - if (err_ancestors[node] == -1 || categories[err_ancestors[node]] == NC_ERR) - break; - if (err_ancestors[node] > 0) err_ancestors[node] = colidx[rowidx[err_ancestors[node]]]; - else err_ancestors[node] = -1; + if (err_ancestors[node] == -1 || categories[err_ancestors[node]] == NC_ERR) break; + if (err_ancestors[node] > 0) + err_ancestors[node] = colidx[rowidx[err_ancestors[node]]]; + else + err_ancestors[node] = -1; } }); - thrust::gather_if(rmm::exec_policy_nosync(stream), - err_ancestors.begin(), - err_ancestors.end(), - err_ancestors.begin(), - thrust::make_constant_iterator(0), - subtree_nrows.begin(), - [] __device__(auto ancestor) { return ancestor != -1; }); + thrust::gather_if(rmm::exec_policy_nosync(stream), + err_ancestors.begin(), + err_ancestors.end(), + err_ancestors.begin(), + thrust::make_constant_iterator(0), + subtree_nrows.begin(), + [] __device__(auto ancestor) { return ancestor != -1; }); } // 2. Let's do some validation of the column tree based on its properties. @@ -126,20 +127,21 @@ rmm::device_uvector extract_device_column_subtree( // (iv) If v is a struct, it can have a field name as child // (v) If v is a list, it can have string, val, list or struct as child // (vi) There can only be at most one string and one val child for a given node, but many struct, - // list and field name children. + // list and field name children. 
// (vii) When mixed type support is disabled - // (a) A mix of lists and structs in the same column is not supported i.e a field name and - // list node cannot have both list and struct as children + // list node cannot have both list and struct as children // (b) If there is a mix of str/val // and list/struct in the same column, then str/val is discarded // Validation of (vii)(a) { - if(!reader_options.is_enabled_mixed_types_as_string()) { - auto num_field_and_list_nodes = thrust::count_if( - rmm::exec_policy_nosync(stream), categories.begin(), categories.end(), [] __device__(auto const ctg) { - return ctg == NC_FN || ctg == NC_LIST; - }); + if (!reader_options.is_enabled_mixed_types_as_string()) { + auto num_field_and_list_nodes = + thrust::count_if(rmm::exec_policy_nosync(stream), + categories.begin(), + categories.end(), + [] __device__(auto const ctg) { return ctg == NC_FN || ctg == NC_LIST; }); rmm::device_uvector field_and_list_nodes(num_field_and_list_nodes, stream); thrust::partition_copy(rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), @@ -153,8 +155,9 @@ rmm::device_uvector extract_device_column_subtree( rmm::exec_policy_nosync(stream), field_and_list_nodes.begin(), field_and_list_nodes.end(), - [rowidx = rowidx.begin(), colidx = colidx.begin(), categories = categories.begin()] __device__( - NodeIndexT node) { + [rowidx = rowidx.begin(), + colidx = colidx.begin(), + categories = categories.begin()] __device__(NodeIndexT node) { NodeIndexT first_child_pos = rowidx[node] + 1; NodeIndexT last_child_pos = rowidx[node + 1] - 1; bool has_struct_child = false; @@ -182,11 +185,12 @@ rmm::device_uvector extract_device_column_subtree( // (which are in turn found from the colidx of the parent u), then this leaf node should be // ignored, otherwise all good. { - if(!reader_options.is_enabled_mixed_types_as_string()) { + if (!reader_options.is_enabled_mixed_types_as_string()) { // TODO: use cub segmented reduce here! 
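      // Each column's first CSR slot holds the link to its parent and any children follow, so a
      // column whose adjacency list has length one has no children, i.e. it is a leaf.
      // adjacent_difference on the row pointers yields those per-column adjacency-list lengths.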
rmm::device_uvector num_adjacent_nodes( num_columns + 1, - stream); // since adjacent_difference requires that the output have the same length as input + stream); // since adjacent_difference requires that the output have the same length as + // input thrust::adjacent_difference( rmm::exec_policy_nosync(stream), rowidx.begin(), rowidx.end(), num_adjacent_nodes.begin()); auto num_leaf_nodes = thrust::count_if(rmm::exec_policy_nosync(stream), @@ -194,27 +198,32 @@ rmm::device_uvector extract_device_column_subtree( num_adjacent_nodes.end(), [] __device__(auto const adj) { return adj == 1; }); rmm::device_uvector leaf_nodes(num_leaf_nodes, stream); - thrust::copy_if( - rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(0) + num_columns, - leaf_nodes.begin(), - [num_adjacent_nodes = num_adjacent_nodes.begin()] __device__(size_t node) { return num_adjacent_nodes[node] == 1; }); + thrust::copy_if(rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + num_columns, + leaf_nodes.begin(), + [num_adjacent_nodes = num_adjacent_nodes.begin()] __device__(size_t node) { + return num_adjacent_nodes[node] == 1; + }); - auto rev_node_it = thrust::make_reverse_iterator(thrust::make_counting_iterator(0) + num_columns); + auto rev_node_it = + thrust::make_reverse_iterator(thrust::make_counting_iterator(0) + num_columns); auto rev_leaf_nodes_it = thrust::make_reverse_iterator(leaf_nodes.begin()); - // the node number that could be the leftmost leaf node is given by u = *(is_leftmost_leaf.second + // the node number that could be the leftmost leaf node is given by u = + // *(is_leftmost_leaf.second // - 1) - auto is_leftmost_leaf = thrust::mismatch( + auto is_leftmost_leaf = thrust::mismatch( rmm::exec_policy_nosync(stream), rev_node_it, rev_node_it + num_columns, rev_leaf_nodes_it); NodeIndexT leftmost_leaf_node = leaf_nodes.element( - num_leaf_nodes - thrust::distance(rev_leaf_nodes_it, is_leftmost_leaf.second - 1) - 1, stream); - - // upper_bound search for u in rowidx for parent node v. Now check if any of the other child nodes - // of v is non-leaf i.e check if u is the first child of v. If yes, then leafmost_leaf_node is - // the leftmost leaf node. Otherwise, discard all children of v after and including u - auto parent_it = - thrust::upper_bound(rmm::exec_policy_nosync(stream), rowidx.begin(), rowidx.end(), leftmost_leaf_node); + num_leaf_nodes - thrust::distance(rev_leaf_nodes_it, is_leftmost_leaf.second - 1) - 1, + stream); + + // upper_bound search for u in rowidx for parent node v. Now check if any of the other child + // nodes of v is non-leaf i.e check if u is the first child of v. If yes, then + // leafmost_leaf_node is the leftmost leaf node. 
Otherwise, discard all children of v after + // and including u + auto parent_it = thrust::upper_bound( + rmm::exec_policy_nosync(stream), rowidx.begin(), rowidx.end(), leftmost_leaf_node); NodeIndexT parent = thrust::distance(rowidx.begin(), parent_it - 1); NodeIndexT parent_adj_start = rowidx.element(parent, stream); NodeIndexT parent_adj_end = rowidx.element(parent + 1, stream); @@ -223,17 +232,19 @@ rmm::device_uvector extract_device_column_subtree( colidx.begin() + parent_adj_end, leftmost_leaf_node); - auto retained_leaf_nodes_it = leaf_nodes.begin() + num_leaf_nodes - - thrust::distance(rev_leaf_nodes_it, is_leftmost_leaf.second - 1) - 1; + auto retained_leaf_nodes_it = + leaf_nodes.begin() + num_leaf_nodes - + thrust::distance(rev_leaf_nodes_it, is_leftmost_leaf.second - 1) - 1; if (childnum_it != colidx.begin() + parent_adj_start + 1) { // discarding from u to last child of parent retained_leaf_nodes_it += thrust::distance(childnum_it, colidx.begin() + parent_adj_end); } - // now, all nodes from leaf_nodes.begin() to retained_leaf_nodes_it need to be discarded i.e. they - // are part of ignore_vals + // now, all nodes from leaf_nodes.begin() to retained_leaf_nodes_it need to be discarded i.e. + // they are part of ignore_vals thrust::scatter(rmm::exec_policy_nosync(stream), thrust::make_constant_iterator(0), - thrust::make_constant_iterator(0) + thrust::distance(leaf_nodes.begin(), retained_leaf_nodes_it), + thrust::make_constant_iterator(0) + + thrust::distance(leaf_nodes.begin(), retained_leaf_nodes_it), leaf_nodes.begin(), subtree_nrows.begin()); } @@ -245,28 +256,26 @@ rmm::device_uvector extract_device_column_subtree( } device_column_subtree_properties allocate_device_column_subtree_properties( - device_span subtree_nrows, - const column_tree_properties &props, - rmm::cuda_stream_view stream) + device_span subtree_nrows, + const column_tree_properties& props, + rmm::cuda_stream_view stream) { - auto num_columns = subtree_nrows.size(); - auto &categories = props.categories; - auto &max_row_offsets = props.max_row_offsets; - - auto num_subtree_nodes = thrust::count_if(rmm::exec_policy_nosync(stream), subtree_nrows.begin(), subtree_nrows.end(), - [] __device__(auto mro) { - return mro != 0; - }); + auto num_columns = subtree_nrows.size(); + auto& categories = props.categories; + auto& max_row_offsets = props.max_row_offsets; + + auto num_subtree_nodes = thrust::count_if(rmm::exec_policy_nosync(stream), + subtree_nrows.begin(), + subtree_nrows.end(), + [] __device__(auto mro) { return mro != 0; }); // For the subtree, we allocate memory for device column subtree properties rmm::device_uvector subtree_properties_map(num_subtree_nodes, stream); - thrust::copy_if(rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(0) + num_columns, - subtree_nrows.begin(), - subtree_properties_map.begin(), - [] __device__(auto mro) { - return mro != 0; - }); + thrust::copy_if(rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + num_columns, + subtree_nrows.begin(), + subtree_properties_map.begin(), + [] __device__(auto mro) { return mro != 0; }); // TODO: three way partitioning in cub::If auto str_partitioning_idx_it = thrust::partition(rmm::exec_policy(stream), @@ -279,8 +288,7 @@ device_column_subtree_properties allocate_device_column_subtree_properties( auto max_row_offsets_it = thrust::make_permutation_iterator(max_row_offsets.begin(), subtree_properties_map.begin()); size_type 
string_offsets_size = - thrust::reduce( - rmm::exec_policy(stream), max_row_offsets_it, max_row_offsets_it + str_val_end) + + thrust::reduce(rmm::exec_policy(stream), max_row_offsets_it, max_row_offsets_it + str_val_end) + str_val_end; rmm::device_uvector string_offsets(string_offsets_size, stream); rmm::device_uvector string_lengths(string_offsets_size, stream); @@ -297,9 +305,8 @@ device_column_subtree_properties allocate_device_column_subtree_properties( thrust::make_permutation_iterator(max_row_offsets.begin(), subtree_properties_map.begin()) + str_val_end; size_type child_offsets_size = - thrust::reduce(rmm::exec_policy(stream), - max_row_offsets_it, - max_row_offsets_it + (list_end - str_val_end)) + + thrust::reduce( + rmm::exec_policy(stream), max_row_offsets_it, max_row_offsets_it + (list_end - str_val_end)) + 2 * (list_end - str_val_end); rmm::device_uvector child_offsets(child_offsets_size, stream); @@ -310,36 +317,34 @@ device_column_subtree_properties allocate_device_column_subtree_properties( stream, rmm::mr::get_current_device_resource()); - return device_column_subtree_properties{ - std::move(string_offsets), - std::move(string_lengths), - std::move(child_offsets), - std::move(validity)}; + return device_column_subtree_properties{std::move(string_offsets), + std::move(string_lengths), + std::move(child_offsets), + std::move(validity)}; } -void initialize_device_column_subtree_properties( - device_column_subtree_properties &d_props, - device_span subtree_nrows, - tree_meta_t &tree, - device_span original_col_ids, - device_span row_offsets, - column_tree_properties &c_props, - rmm::cuda_stream_view stream) { - - auto num_nodes = tree.node_levels.size(); +void initialize_device_column_subtree_properties(device_column_subtree_properties& d_props, + device_span subtree_nrows, + tree_meta_t& tree, + device_span original_col_ids, + device_span row_offsets, + column_tree_properties& c_props, + rmm::cuda_stream_view stream) +{ + auto num_nodes = tree.node_levels.size(); auto num_columns = c_props.categories.size(); // now we actually do the annotation // relabel original_col_ids with the positions of the csr_unique_col_ids with same element. How do // we accomplish this? one idea is to sort the row offsets by node level. Just the way we did this - // for the csr_column_ids sort original_col_ids, extract subtree based on the annotation above, + // for the csr_column_ids sort original_col_ids, extract subtree based on the annotation above, // and then initialize. 
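  // Concretely: order the nodes by tree level, gather their row offsets and source ranges in that
  // order, keep only nodes whose column survived subtree extraction, prefix-sum the row offsets
  // across columns (they index a single shared validity buffer), and then set the validity bits.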
auto [sorted_node_levels, sorted_node_levels_order] = cudf::io::json::detail::stable_sorted_key_order(tree.node_levels, stream); auto row_offsets_it = thrust::make_permutation_iterator(row_offsets.begin(), sorted_node_levels_order.begin()); - auto node_range_begin_it = thrust::make_permutation_iterator(tree.node_range_begin.begin(), + auto node_range_begin_it = thrust::make_permutation_iterator(tree.node_range_begin.begin(), sorted_node_levels_order.begin()); - auto node_range_end_it = thrust::make_permutation_iterator(tree.node_range_end.begin(), + auto node_range_end_it = thrust::make_permutation_iterator(tree.node_range_end.begin(), sorted_node_levels_order.begin()); auto node_range_lengths_it = thrust::make_transform_iterator( thrust::make_zip_iterator(node_range_begin_it, node_range_end_it), @@ -353,7 +358,10 @@ void initialize_device_column_subtree_properties( sorted_node_levels_order.begin()); rmm::device_uvector sorted_subtree_nrows(num_columns, stream); - thrust::copy(rmm::exec_policy_nosync(stream), subtree_nrows.begin(), subtree_nrows.end(), sorted_subtree_nrows.begin()); + thrust::copy(rmm::exec_policy_nosync(stream), + subtree_nrows.begin(), + subtree_nrows.end(), + sorted_subtree_nrows.begin()); thrust::sort_by_key(rmm::exec_policy_nosync(stream), c_props.mapped_ids.begin(), c_props.mapped_ids.end(), @@ -392,17 +400,15 @@ void initialize_device_column_subtree_properties( node_col_ids_it, node_categories_it, row_offsets_it, - validity = static_cast( - d_props.validity.data())] __device__(NodeIndexT node) { + validity = static_cast(d_props.validity.data())] __device__(NodeIndexT node) { if (sorted_subtree_nrows[node_col_ids_it[node]] && node_categories_it[node] != NC_LIST) cudf::set_bit(validity, row_offsets_it[node]); }); // scatter list offsets - } -} // namespace experimental::detail +} // namespace experimental::detail namespace detail { /** @@ -1036,5 +1042,5 @@ void make_device_json_column(device_span input, stream.synchronize(); } -} // namespace detail -} // namespace cudf::io::json +} // namespace detail +} // namespace cudf::io::json diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 36e5d63fa50..457a336b165 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -23,10 +23,10 @@ #include #include +#include +#include #include #include -#include -#include #include @@ -237,7 +237,6 @@ struct device_column_subtree_properties { rmm::device_buffer validity; }; - /* * @brief Unvalidated column tree stored in Compressed Sparse Row (CSR) format. The device json * column subtree - the subgraph that conforms to column tree properties - is extracted and further @@ -276,14 +275,14 @@ std::tuple reduce_to_column_tree( rmm::cuda_stream_view stream); void make_device_json_column(device_span input, - tree_meta_t& tree, - device_span col_ids, - device_span row_offsets, - device_json_column& root, - bool is_array_of_arrays, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); + tree_meta_t& tree, + device_span col_ids, + device_span row_offsets, + device_json_column& root, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); } // namespace detail } // namespace experimental