JSON tree algorithms refactor I: CSR data structure for column tree #15979

Merged
75 commits merged on Sep 25, 2024
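The PR title refers to a CSR (compressed sparse row) layout for the column tree. As a rough host-side illustration of that idea only — the struct and function names below are hypothetical, not the PR's actual types — a tree given as a parent-id array can be packed into row offsets plus a flat child-index array:

```cpp
#include <cassert>
#include <vector>

// Hypothetical sketch: a column tree in CSR form. The children of column i
// are col_idx[row_idx[i]] .. col_idx[row_idx[i + 1] - 1].
struct column_tree_csr {
  std::vector<int> row_idx;  // size num_columns + 1
  std::vector<int> col_idx;  // one entry per non-root column (parent edges)
};

// Build CSR adjacency from a parent-id array (root has parent -1),
// mirroring on the CPU what the PR computes on the device.
column_tree_csr build_csr(std::vector<int> const& parent_ids)
{
  int const n = static_cast<int>(parent_ids.size());
  column_tree_csr csr;
  csr.row_idx.assign(n + 1, 0);
  // 1. count children per node
  for (int c = 0; c < n; ++c)
    if (parent_ids[c] >= 0) ++csr.row_idx[parent_ids[c] + 1];
  // 2. prefix-sum the counts into row offsets
  for (int i = 0; i < n; ++i) csr.row_idx[i + 1] += csr.row_idx[i];
  // 3. scatter each child into its parent's segment
  csr.col_idx.assign(csr.row_idx.back(), 0);
  std::vector<int> fill = csr.row_idx;
  for (int c = 0; c < n; ++c)
    if (parent_ids[c] >= 0) csr.col_idx[fill[parent_ids[c]]++] = c;
  return csr;
}
```

For parents `{-1, 0, 0, 1}` this yields `row_idx = {0, 2, 3, 3, 3}` and `col_idx = {1, 2, 3}`: node 0 owns children 1 and 2, node 1 owns child 3.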
1ec9617
added csr data struct
shrshi Jun 11, 2024
022d7ce
formatting
shrshi Jun 11, 2024
382633f
added test
shrshi Jun 25, 2024
1823854
formatting
shrshi Jun 25, 2024
4a7e2a5
Merge branch 'branch-24.08' into json-tree-refactor-ii
shrshi Jun 25, 2024
8d5ddfb
Merge branch 'branch-24.08' into json-tree-refactor
shrshi Jun 26, 2024
84a7749
fixing csr construction
shrshi Jun 28, 2024
810c389
moving the csr algorithms
shrshi Jun 28, 2024
6a1a415
formatting
shrshi Jun 28, 2024
85c197d
Merge branch 'branch-24.08' into json-tree-refactor
shrshi Jun 28, 2024
996c6dd
Merge branch 'json-tree-refactor' of github.com:shrshi/cudf into json…
shrshi Jun 28, 2024
4bba629
moving to experimental namespace
shrshi Jul 15, 2024
25530f6
Merge branch 'branch-24.08' into json-tree-refactor
shrshi Jul 15, 2024
df9e65b
formatting
shrshi Jul 15, 2024
d1588c8
removed node properties from csr struct - will be introduced in stage…
shrshi Jul 15, 2024
7e1a756
merging branch 24.08 into current branch
shrshi Jul 24, 2024
5541b93
partial commit
shrshi Jul 24, 2024
1490ce9
Merge branch 'branch-24.10' into json-tree-refactor
shrshi Jul 24, 2024
d05e670
better csr construction
shrshi Jul 30, 2024
1ce88be
formatting
shrshi Jul 30, 2024
d6d724c
exec policy is no sync
shrshi Jul 30, 2024
2622d6b
fix copyright year
shrshi Jul 30, 2024
9498372
fixing max row offsets
shrshi Jul 31, 2024
4339b0a
formatting
shrshi Jul 31, 2024
e61288b
Merge branch 'branch-24.10' into json-tree-refactor
shrshi Jul 31, 2024
9b6b7ff
struct docs
shrshi Jul 31, 2024
53db174
Merge branch 'json-tree-refactor' of github.com:shrshi/cudf into json…
shrshi Jul 31, 2024
85608eb
cudf exports!
shrshi Jul 31, 2024
f451c40
Merge branch 'branch-24.10' into json-tree-refactor
shrshi Sep 6, 2024
e29656d
deduplicating code
shrshi Sep 6, 2024
e6eda41
formatting
shrshi Sep 6, 2024
bf4f191
addressing reviews - 1
shrshi Sep 6, 2024
55e943a
addressing reviews - 2
shrshi Sep 6, 2024
4e00526
tsk tsk should have run compute sanitizer sooner
shrshi Sep 6, 2024
ca7a5f3
addressing reviews - 3
shrshi Sep 6, 2024
14664db
addressing reviews - 4
shrshi Sep 6, 2024
63eec8a
Merge branch 'branch-24.10' into json-tree-refactor
shrshi Sep 11, 2024
5f4aca6
adding more tests; debugging on the way
shrshi Sep 13, 2024
e6a9941
formatting
shrshi Sep 13, 2024
82c9ebe
added more tests; fixed bugs
shrshi Sep 17, 2024
8dd6877
formatting
shrshi Sep 17, 2024
0c63f22
finally tests passing
shrshi Sep 18, 2024
2d4861e
fixed all bugs hopefully
shrshi Sep 19, 2024
7759a91
formatting
shrshi Sep 19, 2024
e5d4a35
pr reviews
shrshi Sep 20, 2024
3cdc211
exec policy sync -> nosync
shrshi Sep 20, 2024
9ca7b5e
pr reviews
shrshi Sep 20, 2024
29be430
cleanup
shrshi Sep 20, 2024
023a4a8
moving steps to lambdas to handle intermediate vectors
shrshi Sep 20, 2024
ded2c5e
formatting
shrshi Sep 20, 2024
b2d11dd
more lambdas
shrshi Sep 20, 2024
7260ae6
formatting
shrshi Sep 20, 2024
827756e
moving the og reduce to column tree back to json_column.cu
shrshi Sep 21, 2024
219640f
formatting
shrshi Sep 21, 2024
c20cc15
merge
shrshi Sep 22, 2024
28b9e9f
fixing bad merge
shrshi Sep 22, 2024
529e88e
simplifying; using previous reduce to column tree results
shrshi Sep 23, 2024
545359d
formatting
shrshi Sep 23, 2024
24be71f
simplifying
shrshi Sep 23, 2024
6e1da07
formatting
shrshi Sep 23, 2024
bab5d75
remove debugging code
shrshi Sep 23, 2024
c826144
remove debug printing
shrshi Sep 23, 2024
937263f
reviews
shrshi Sep 23, 2024
7952cd7
reviews
shrshi Sep 23, 2024
adc74b6
some more cleanup
shrshi Sep 23, 2024
37e7511
reviews
shrshi Sep 23, 2024
2c37e42
pr reviews
shrshi Sep 23, 2024
278f391
Merge branch 'branch-24.10' into json-tree-refactor
shrshi Sep 24, 2024
3442ebc
Merge branch 'branch-24.10' into json-tree-refactor
shrshi Sep 24, 2024
896a9b2
Merge branch 'branch-24.10' into json-tree-refactor
vuule Sep 24, 2024
a6b61f8
Merge branch 'branch-24.10' into json-tree-refactor
galipremsagar Sep 24, 2024
9f041a6
pr reviews
shrshi Sep 25, 2024
3c7f8ba
Merge branch 'json-tree-refactor' of github.com:shrshi/cudf into json…
shrshi Sep 25, 2024
9680695
Merge branch 'branch-24.10' into json-tree-refactor
shrshi Sep 25, 2024
0ad638e
Merge branch 'branch-24.10' into json-tree-refactor
galipremsagar Sep 25, 2024
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
@@ -379,6 +379,7 @@ add_library(
src/io/csv/writer_impl.cu
src/io/functions.cpp
src/io/json/json_column.cu
src/io/json/column_tree_construction.cu
src/io/json/json_normalization.cu
src/io/json/json_tree.cu
src/io/json/nested_json_gpu.cu
523 changes: 523 additions & 0 deletions cpp/src/io/json/column_tree_construction.cu

Large diffs are not rendered by default.

200 changes: 1 addition & 199 deletions cpp/src/io/json/json_column.cu
@@ -16,6 +16,7 @@

#include "io/utilities/parsing_utils.cuh"
#include "io/utilities/string_parsing.hpp"
#include "json_utils.hpp"
#include "nested_json.hpp"

#include <cudf/column/column_factories.hpp>
@@ -97,205 +98,6 @@ void print_tree(host_span<SymbolT const> input,
printf(" (JSON)\n");
}

/**
* @brief Reduces node tree representation to column tree representation.
*
* @param tree Node tree representation of JSON string
* @param original_col_ids Column ids of nodes
* @param sorted_col_ids Sorted column ids of nodes
* @param ordered_node_ids Node ids of nodes sorted by column ids
* @param row_offsets Row offsets of nodes
* @param is_array_of_arrays Whether the tree is an array of arrays
* @param row_array_parent_col_id Column id of row array, if is_array_of_arrays is true
* @param stream CUDA stream used for device memory operations and kernel launches
* @return A tuple of column tree representation of JSON string, column ids of columns, and
* max row offsets of columns
*/
std::tuple<tree_meta_t, rmm::device_uvector<NodeIndexT>, rmm::device_uvector<size_type>>
reduce_to_column_tree(tree_meta_t& tree,
device_span<NodeIndexT> original_col_ids,
device_span<NodeIndexT> sorted_col_ids,
device_span<NodeIndexT> ordered_node_ids,
device_span<size_type> row_offsets,
bool is_array_of_arrays,
NodeIndexT const row_array_parent_col_id,
rmm::cuda_stream_view stream)
{
CUDF_FUNC_RANGE();
// 1. column count for allocation
auto const num_columns =
thrust::unique_count(rmm::exec_policy(stream), sorted_col_ids.begin(), sorted_col_ids.end());

// 2. reduce_by_key {col_id}, {row_offset}, max.
rmm::device_uvector<NodeIndexT> unique_col_ids(num_columns, stream);
rmm::device_uvector<size_type> max_row_offsets(num_columns, stream);
auto ordered_row_offsets =
thrust::make_permutation_iterator(row_offsets.begin(), ordered_node_ids.begin());
thrust::reduce_by_key(rmm::exec_policy(stream),
sorted_col_ids.begin(),
sorted_col_ids.end(),
ordered_row_offsets,
unique_col_ids.begin(),
max_row_offsets.begin(),
thrust::equal_to<size_type>(),
thrust::maximum<size_type>());

// 3. reduce_by_key {col_id}, {node_categories} - custom op (*+v=*, v+v=v, *+#=E)
rmm::device_uvector<NodeT> column_categories(num_columns, stream);
thrust::reduce_by_key(
rmm::exec_policy(stream),
sorted_col_ids.begin(),
sorted_col_ids.end(),
thrust::make_permutation_iterator(tree.node_categories.begin(), ordered_node_ids.begin()),
unique_col_ids.begin(),
column_categories.begin(),
thrust::equal_to<size_type>(),
[] __device__(NodeT type_a, NodeT type_b) -> NodeT {
auto is_a_leaf = (type_a == NC_VAL || type_a == NC_STR);
auto is_b_leaf = (type_b == NC_VAL || type_b == NC_STR);
// (v+v=v, *+*=*, *+v=*, *+#=E, NESTED+VAL=NESTED)
// *+*=*, v+v=v
if (type_a == type_b) {
return type_a;
} else if (is_a_leaf) {
// *+v=*, N+V=N
// STRUCT/LIST + STR/VAL = STRUCT/LIST, STR/VAL + FN = ERR, STR/VAL + STR = STR
return type_b == NC_FN ? NC_ERR : (is_b_leaf ? NC_STR : type_b);
} else if (is_b_leaf) {
return type_a == NC_FN ? NC_ERR : (is_a_leaf ? NC_STR : type_a);
}
// *+#=E
return NC_ERR;
});

// 4. unique_copy parent_node_ids, ranges
rmm::device_uvector<TreeDepthT> column_levels(0, stream); // not required
rmm::device_uvector<NodeIndexT> parent_col_ids(num_columns, stream);
rmm::device_uvector<SymbolOffsetT> col_range_begin(num_columns, stream); // Field names
rmm::device_uvector<SymbolOffsetT> col_range_end(num_columns, stream);
rmm::device_uvector<size_type> unique_node_ids(num_columns, stream);
thrust::unique_by_key_copy(rmm::exec_policy(stream),
sorted_col_ids.begin(),
sorted_col_ids.end(),
ordered_node_ids.begin(),
thrust::make_discard_iterator(),
unique_node_ids.begin());
thrust::copy_n(
rmm::exec_policy(stream),
thrust::make_zip_iterator(
thrust::make_permutation_iterator(tree.parent_node_ids.begin(), unique_node_ids.begin()),
thrust::make_permutation_iterator(tree.node_range_begin.begin(), unique_node_ids.begin()),
thrust::make_permutation_iterator(tree.node_range_end.begin(), unique_node_ids.begin())),
unique_node_ids.size(),
thrust::make_zip_iterator(
parent_col_ids.begin(), col_range_begin.begin(), col_range_end.begin()));

// convert parent_node_ids to parent_col_ids
thrust::transform(
rmm::exec_policy(stream),
parent_col_ids.begin(),
parent_col_ids.end(),
parent_col_ids.begin(),
[col_ids = original_col_ids.begin()] __device__(auto parent_node_id) -> size_type {
return parent_node_id == parent_node_sentinel ? parent_node_sentinel
: col_ids[parent_node_id];
});

// condition is true if parent is not a list, or sentinel/root
// Special case to return true if parent is a list and is_array_of_arrays is true
auto is_non_list_parent = [column_categories = column_categories.begin(),
is_array_of_arrays,
row_array_parent_col_id] __device__(auto parent_col_id) -> bool {
return !(parent_col_id == parent_node_sentinel ||
column_categories[parent_col_id] == NC_LIST &&
(!is_array_of_arrays || parent_col_id != row_array_parent_col_id));
};
// Mixed types in List children go to different columns,
// so all immediate children of list column should have same max_row_offsets.
// create list's children max_row_offsets array. (initialize to zero)
// atomicMax on children max_row_offsets array.
// gather the max_row_offsets from children row offset array.
{
rmm::device_uvector<NodeIndexT> list_parents_children_max_row_offsets(num_columns, stream);
thrust::fill(rmm::exec_policy(stream),
list_parents_children_max_row_offsets.begin(),
list_parents_children_max_row_offsets.end(),
0);
thrust::for_each(rmm::exec_policy(stream),
unique_col_ids.begin(),
unique_col_ids.end(),
[column_categories = column_categories.begin(),
parent_col_ids = parent_col_ids.begin(),
max_row_offsets = max_row_offsets.begin(),
list_parents_children_max_row_offsets =
list_parents_children_max_row_offsets.begin()] __device__(auto col_id) {
auto parent_col_id = parent_col_ids[col_id];
if (parent_col_id != parent_node_sentinel and
column_categories[parent_col_id] == node_t::NC_LIST) {
cuda::atomic_ref<NodeIndexT, cuda::thread_scope_device> ref{
*(list_parents_children_max_row_offsets + parent_col_id)};
ref.fetch_max(max_row_offsets[col_id], cuda::std::memory_order_relaxed);
}
});
thrust::gather_if(
rmm::exec_policy(stream),
parent_col_ids.begin(),
parent_col_ids.end(),
parent_col_ids.begin(),
list_parents_children_max_row_offsets.begin(),
max_row_offsets.begin(),
[column_categories = column_categories.begin()] __device__(size_type parent_col_id) {
return parent_col_id != parent_node_sentinel and
column_categories[parent_col_id] == node_t::NC_LIST;
});
}

// copy lists' max_row_offsets to children.
// all structs should have same size.
thrust::transform_if(
rmm::exec_policy(stream),
unique_col_ids.begin(),
unique_col_ids.end(),
max_row_offsets.begin(),
[column_categories = column_categories.begin(),
is_non_list_parent,
parent_col_ids = parent_col_ids.begin(),
max_row_offsets = max_row_offsets.begin()] __device__(size_type col_id) {
auto parent_col_id = parent_col_ids[col_id];
// condition is true if parent is not a list, or sentinel/root
while (is_non_list_parent(parent_col_id)) {
col_id = parent_col_id;
parent_col_id = parent_col_ids[parent_col_id];
}
return max_row_offsets[col_id];
},
[column_categories = column_categories.begin(),
is_non_list_parent,
parent_col_ids = parent_col_ids.begin()] __device__(size_type col_id) {
auto parent_col_id = parent_col_ids[col_id];
// condition is true if parent is not a list, or sentinel/root
return is_non_list_parent(parent_col_id);
});

// For Struct and List (to avoid copying entire strings when mixed type as string is enabled)
thrust::transform_if(
rmm::exec_policy(stream),
col_range_begin.begin(),
col_range_begin.end(),
column_categories.begin(),
col_range_end.begin(),
[] __device__(auto i) { return i + 1; },
[] __device__(NodeT type) { return type == NC_STRUCT || type == NC_LIST; });

return std::tuple{tree_meta_t{std::move(column_categories),
std::move(parent_col_ids),
std::move(column_levels),
std::move(col_range_begin),
std::move(col_range_end)},
std::move(unique_col_ids),
std::move(max_row_offsets)};
}
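The deleted `reduce_to_column_tree` above folds node categories with a custom reduction operator (`*+v=*`, `v+v=v`, `*+#=E`). As a host-side sketch of that merge rule — using illustrative stand-in enumerators, not cudf's actual `node_t` definitions — the lambda's logic is:

```cpp
#include <cassert>

// Illustrative stand-ins for cudf's node categories (not the real enum).
enum node_cat { NC_STRUCT, NC_LIST, NC_STR, NC_VAL, NC_FN, NC_ERR };

// Merge two node categories observed under the same column id.
node_cat merge_categories(node_cat a, node_cat b)
{
  auto is_leaf = [](node_cat t) { return t == NC_VAL || t == NC_STR; };
  if (a == b) return a;  // v+v=v, *+*=*
  if (is_leaf(a)) {
    // nested + value keeps the nested type; value + field name is an error;
    // STR + VAL collapses to STR
    return b == NC_FN ? NC_ERR : (is_leaf(b) ? NC_STR : b);
  }
  if (is_leaf(b)) { return a == NC_FN ? NC_ERR : a; }
  return NC_ERR;  // *+#=E: two incompatible nested types
}
```

So `STRUCT + VAL` stays `STRUCT`, `STR + VAL` collapses to `STR`, and `STRUCT + LIST` is flagged `ERR`, matching the comment in the device lambda.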

/**
* @brief Get the column indices for the values column for array of arrays rows
*
51 changes: 1 addition & 50 deletions cpp/src/io/json/json_tree.cu
@@ -15,6 +15,7 @@
*/

#include "io/utilities/hostdevice_vector.hpp"
#include "json_utils.hpp"
#include "nested_json.hpp"

#include <cudf/detail/cuco_helpers.hpp>
@@ -33,7 +34,6 @@
#include <rmm/exec_policy.hpp>
#include <rmm/resource_ref.hpp>

#include <cub/device/device_radix_sort.cuh>
#include <cuco/static_set.cuh>
#include <cuda/functional>
#include <thrust/binary_search.h>
@@ -139,55 +139,6 @@ struct is_nested_end {
}
};

/**
* @brief Returns stable sorted keys and their sorted order
*
* Uses cub stable radix sort. The order is internally generated, hence it saves a copy and memory.
* Since the key and order is returned, using double buffer helps to avoid extra copy to user
* provided output iterator.
*
* @tparam IndexType sorted order type
* @tparam KeyType key type
* @param keys keys to sort
* @param stream CUDA stream used for device memory operations and kernel launches.
* @return Sorted keys and indices producing that sorted order
*/
template <typename IndexType = size_t, typename KeyType>
std::pair<rmm::device_uvector<KeyType>, rmm::device_uvector<IndexType>> stable_sorted_key_order(
cudf::device_span<KeyType const> keys, rmm::cuda_stream_view stream)
{
CUDF_FUNC_RANGE();

// Determine temporary device storage requirements
rmm::device_uvector<KeyType> keys_buffer1(keys.size(), stream);
rmm::device_uvector<KeyType> keys_buffer2(keys.size(), stream);
rmm::device_uvector<IndexType> order_buffer1(keys.size(), stream);
rmm::device_uvector<IndexType> order_buffer2(keys.size(), stream);
cub::DoubleBuffer<IndexType> order_buffer(order_buffer1.data(), order_buffer2.data());
cub::DoubleBuffer<KeyType> keys_buffer(keys_buffer1.data(), keys_buffer2.data());
size_t temp_storage_bytes = 0;
cub::DeviceRadixSort::SortPairs(
nullptr, temp_storage_bytes, keys_buffer, order_buffer, keys.size());
rmm::device_buffer d_temp_storage(temp_storage_bytes, stream);

thrust::copy(rmm::exec_policy(stream), keys.begin(), keys.end(), keys_buffer1.begin());
thrust::sequence(rmm::exec_policy(stream), order_buffer1.begin(), order_buffer1.end());

cub::DeviceRadixSort::SortPairs(d_temp_storage.data(),
temp_storage_bytes,
keys_buffer,
order_buffer,
keys.size(),
0,
sizeof(KeyType) * 8,
stream.value());

return std::pair{keys_buffer.Current() == keys_buffer1.data() ? std::move(keys_buffer1)
: std::move(keys_buffer2),
order_buffer.Current() == order_buffer1.data() ? std::move(order_buffer1)
: std::move(order_buffer2)};
}

/**
* @brief Propagate parent node from first sibling to other siblings.
*
82 changes: 82 additions & 0 deletions cpp/src/io/json/json_utils.hpp
@@ -0,0 +1,82 @@
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/io/detail/tokenize_json.hpp>
#include <cudf/io/types.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/bit.hpp>
#include <cudf/utilities/error.hpp>

#include <rmm/exec_policy.hpp>
#include <rmm/resource_ref.hpp>

#include <cub/device/device_radix_sort.cuh>
#include <thrust/sequence.h>

namespace cudf::io::json::detail {
/**
* @brief Returns stable sorted keys and their sorted order
*
* Uses cub stable radix sort. The order is internally generated, hence it saves a copy and memory.
* Since the key and order is returned, using double buffer helps to avoid extra copy to user
* provided output iterator.
*
* @tparam IndexType sorted order type
* @tparam KeyType key type
* @param keys keys to sort
* @param stream CUDA stream used for device memory operations and kernel launches.
* @return Sorted keys and indices producing that sorted order
*/
template <typename IndexType = size_t, typename KeyType>
std::pair<rmm::device_uvector<KeyType>, rmm::device_uvector<IndexType>> stable_sorted_key_order(
cudf::device_span<KeyType const> keys, rmm::cuda_stream_view stream)
{
CUDF_FUNC_RANGE();

// Determine temporary device storage requirements
rmm::device_uvector<KeyType> keys_buffer1(keys.size(), stream);
rmm::device_uvector<KeyType> keys_buffer2(keys.size(), stream);
rmm::device_uvector<IndexType> order_buffer1(keys.size(), stream);
rmm::device_uvector<IndexType> order_buffer2(keys.size(), stream);
cub::DoubleBuffer<IndexType> order_buffer(order_buffer1.data(), order_buffer2.data());
cub::DoubleBuffer<KeyType> keys_buffer(keys_buffer1.data(), keys_buffer2.data());
size_t temp_storage_bytes = 0;
cub::DeviceRadixSort::SortPairs(
nullptr, temp_storage_bytes, keys_buffer, order_buffer, keys.size());
rmm::device_buffer d_temp_storage(temp_storage_bytes, stream);

thrust::copy(rmm::exec_policy(stream), keys.begin(), keys.end(), keys_buffer1.begin());
thrust::sequence(rmm::exec_policy(stream), order_buffer1.begin(), order_buffer1.end());

cub::DeviceRadixSort::SortPairs(d_temp_storage.data(),
temp_storage_bytes,
keys_buffer,
order_buffer,
keys.size(),
0,
sizeof(KeyType) * 8,
stream.value());

return std::pair{keys_buffer.Current() == keys_buffer1.data() ? std::move(keys_buffer1)
: std::move(keys_buffer2),
order_buffer.Current() == order_buffer1.data() ? std::move(order_buffer1)
: std::move(order_buffer2)};
}

} // namespace cudf::io::json::detail
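The contract of `stable_sorted_key_order` — return the keys in stable sorted order together with the permutation that produces that order — can be sketched on the host with `std::stable_sort` in place of `cub::DeviceRadixSort` (a CPU analogue for illustration only; the real version sorts on the GPU with double buffers):

```cpp
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <numeric>
#include <utility>
#include <vector>

// CPU sketch of stable_sorted_key_order's semantics: sorted keys plus the
// index permutation that, applied to the input, yields that sorted order.
template <typename KeyType>
std::pair<std::vector<KeyType>, std::vector<std::size_t>>
stable_sorted_key_order_host(std::vector<KeyType> const& keys)
{
  std::vector<std::size_t> order(keys.size());
  std::iota(order.begin(), order.end(), 0);  // identity permutation
  // stable sort preserves the relative order of equal keys
  std::stable_sort(order.begin(), order.end(), [&](std::size_t a, std::size_t b) {
    return keys[a] < keys[b];
  });
  std::vector<KeyType> sorted(keys.size());
  for (std::size_t i = 0; i < keys.size(); ++i)
    sorted[i] = keys[order[i]];
  return {std::move(sorted), std::move(order)};
}
```

For keys `{3, 1, 3, 2}` this returns sorted keys `{1, 2, 3, 3}` and order `{1, 3, 0, 2}`; stability keeps index 0 before index 2 among the equal keys, which is what lets the caller use the order as node ids grouped by column id.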