From e28017cc17d2feb050d2effd4ebafb84600fd607 Mon Sep 17 00:00:00 2001 From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com> Date: Mon, 9 Oct 2023 10:05:12 -0500 Subject: [PATCH 01/11] Cleanup of namespaces in parquet code. (#14259) Cleans up several issues in the parquet code: - We were using the namespace `cudf::io::detail::parquet`, when `cudf::io::parquet::detail` makes more sense. - Converts the `cudf::io::parquet::gpu` namespace to also just use `cudf::io::parquet::detail` - Several detail-style headers and source files were using `cudf::io::parquet` when they should probably have been in the detail namespace. Authors: - https://github.com/nvdbaranec Approvers: - Bradley Dice (https://github.com/bdice) - Yunsong Wang (https://github.com/PointKernel) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/14259 --- cpp/include/cudf/io/detail/parquet.hpp | 8 +- cpp/include/cudf/io/parquet.hpp | 4 +- cpp/src/io/functions.cpp | 4 +- cpp/src/io/parquet/chunk_dict.cu | 19 +- .../io/parquet/compact_protocol_reader.cpp | 8 +- .../io/parquet/compact_protocol_reader.hpp | 9 +- .../io/parquet/compact_protocol_writer.cpp | 8 +- .../io/parquet/compact_protocol_writer.hpp | 8 +- cpp/src/io/parquet/decode_preprocess.cu | 10 +- cpp/src/io/parquet/delta_binary.cuh | 4 +- cpp/src/io/parquet/page_data.cu | 12 +- cpp/src/io/parquet/page_decode.cuh | 4 +- cpp/src/io/parquet/page_delta_decode.cu | 6 +- cpp/src/io/parquet/page_enc.cu | 22 +- cpp/src/io/parquet/page_hdr.cu | 14 +- cpp/src/io/parquet/page_string_decode.cu | 14 +- cpp/src/io/parquet/page_string_utils.cuh | 4 +- cpp/src/io/parquet/parquet.hpp | 9 +- cpp/src/io/parquet/parquet_common.hpp | 9 +- cpp/src/io/parquet/parquet_gpu.cuh | 4 +- cpp/src/io/parquet/parquet_gpu.hpp | 27 +- cpp/src/io/parquet/predicate_pushdown.cpp | 14 +- cpp/src/io/parquet/reader.cpp | 4 +- cpp/src/io/parquet/reader_impl.cpp | 36 +-- cpp/src/io/parquet/reader_impl.hpp | 12 +- cpp/src/io/parquet/reader_impl_helpers.cpp | 121 ++++---- cpp/src/io/parquet/reader_impl_helpers.hpp | 21 +- cpp/src/io/parquet/reader_impl_preprocess.cu | 259 +++++++++--------- cpp/src/io/parquet/rle_stream.cuh | 4 +- cpp/src/io/parquet/writer_impl.cu | 223 ++++++++------- cpp/src/io/parquet/writer_impl.hpp | 28 +- cpp/tests/io/parquet_test.cpp | 207 +++++++------- 32 files changed, 531 insertions(+), 605 deletions(-) diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp index 074f690d2c7..0b8ee9676de 100644 --- a/cpp/include/cudf/io/detail/parquet.hpp +++ b/cpp/include/cudf/io/detail/parquet.hpp @@ -38,7 +38,7 @@ class parquet_reader_options; class parquet_writer_options; class chunked_parquet_writer_options; -namespace detail::parquet { +namespace parquet::detail { /** * @brief Class to read Parquet dataset data into columns. @@ -186,7 +186,7 @@ class writer { */ explicit writer(std::vector> sinks, parquet_writer_options const& options, - single_write_mode mode, + cudf::io::detail::single_write_mode mode, rmm::cuda_stream_view stream); /** @@ -201,7 +201,7 @@ class writer { */ explicit writer(std::vector> sinks, chunked_parquet_writer_options const& options, - single_write_mode mode, + cudf::io::detail::single_write_mode mode, rmm::cuda_stream_view stream); /** @@ -250,5 +250,5 @@ class writer { * metadata. */ parquet_metadata read_parquet_metadata(host_span const> sources); -} // namespace detail::parquet +} // namespace parquet::detail } // namespace cudf::io diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index deaf23d405a..6283099e700 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -499,7 +499,7 @@ class chunked_parquet_reader { [[nodiscard]] table_with_metadata read_chunk() const; private: - std::unique_ptr reader; + std::unique_ptr reader; }; /** @} */ // end of group @@ -1750,7 +1750,7 @@ class parquet_chunked_writer { std::vector const& column_chunks_file_paths = {}); /// Unique pointer to impl writer class - std::unique_ptr writer; + std::unique_ptr writer; }; /** @} */ // end of group diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 392a7850886..726442d752e 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -470,8 +470,8 @@ void orc_chunked_writer::close() writer->close(); } -using namespace cudf::io::detail::parquet; -namespace detail_parquet = cudf::io::detail::parquet; +using namespace cudf::io::parquet::detail; +namespace detail_parquet = cudf::io::parquet::detail; table_with_metadata read_parquet(parquet_reader_options const& options, rmm::mr::device_memory_resource* mr) diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index 9ff1869edde..53ff31ab0a7 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -24,10 +24,8 @@ #include -namespace cudf { -namespace io { -namespace parquet { -namespace gpu { +namespace cudf::io::parquet::detail { + namespace { constexpr int DEFAULT_BLOCK_SIZE = 256; } @@ -101,7 +99,7 @@ struct map_find_fn { template __global__ void __launch_bounds__(block_size) - populate_chunk_hash_maps_kernel(cudf::detail::device_2dspan frags) + populate_chunk_hash_maps_kernel(cudf::detail::device_2dspan frags) { auto col_idx = blockIdx.y; auto block_x = blockIdx.x; @@ -226,7 +224,7 @@ __global__ void __launch_bounds__(block_size) template __global__ void __launch_bounds__(block_size) - get_dictionary_indices_kernel(cudf::detail::device_2dspan frags) + get_dictionary_indices_kernel(cudf::detail::device_2dspan frags) { auto col_idx = blockIdx.y; auto block_x = blockIdx.x; @@ -276,7 +274,7 @@ void initialize_chunk_hash_maps(device_span chunks, rmm::cuda_st <<>>(chunks); } -void populate_chunk_hash_maps(cudf::detail::device_2dspan frags, +void populate_chunk_hash_maps(cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream) { dim3 const dim_grid(frags.size().second, frags.size().first); @@ -290,14 +288,11 @@ void collect_map_entries(device_span chunks, rmm::cuda_stream_vi collect_map_entries_kernel<<>>(chunks); } -void get_dictionary_indices(cudf::detail::device_2dspan frags, +void get_dictionary_indices(cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream) { dim3 const dim_grid(frags.size().second, frags.size().first); get_dictionary_indices_kernel <<>>(frags); } -} // namespace gpu -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index 5c7b8ca3f8c..81d1be64a45 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -21,9 +21,7 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { +namespace cudf::io::parquet::detail { /** * @brief Base class for parquet field functors. @@ -870,6 +868,4 @@ int CompactProtocolReader::WalkSchema( } } -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/compact_protocol_reader.hpp b/cpp/src/io/parquet/compact_protocol_reader.hpp index 619815db503..cbb4161b138 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.hpp +++ b/cpp/src/io/parquet/compact_protocol_reader.hpp @@ -25,9 +25,8 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { +namespace cudf::io::parquet::detail { + /** * @brief Class for parsing Parquet's Thrift Compact Protocol encoded metadata * @@ -147,6 +146,4 @@ class CompactProtocolReader { friend class parquet_field_struct_blob; }; -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/compact_protocol_writer.cpp b/cpp/src/io/parquet/compact_protocol_writer.cpp index 60bc8984d81..9adc8767880 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.cpp +++ b/cpp/src/io/parquet/compact_protocol_writer.cpp @@ -16,9 +16,7 @@ #include "compact_protocol_writer.hpp" -namespace cudf { -namespace io { -namespace parquet { +namespace cudf::io::parquet::detail { /** * @brief Parquet CompactProtocolWriter class @@ -391,6 +389,4 @@ inline void CompactProtocolFieldWriter::set_current_field(int const& field) current_field_value = field; } -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/compact_protocol_writer.hpp b/cpp/src/io/parquet/compact_protocol_writer.hpp index 26d66527aa5..4849a814b14 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.hpp +++ b/cpp/src/io/parquet/compact_protocol_writer.hpp @@ -25,9 +25,7 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { +namespace cudf::io::parquet::detail { /** * @brief Class for parsing Parquet's Thrift Compact Protocol encoded metadata @@ -115,6 +113,4 @@ class CompactProtocolFieldWriter { inline void set_current_field(int const& field); }; -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/decode_preprocess.cu b/cpp/src/io/parquet/decode_preprocess.cu index 8de3702bc2e..544c93ee616 100644 --- a/cpp/src/io/parquet/decode_preprocess.cu +++ b/cpp/src/io/parquet/decode_preprocess.cu @@ -23,10 +23,7 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { -namespace gpu { +namespace cudf::io::parquet::detail { namespace { @@ -411,7 +408,4 @@ void ComputePageSizes(cudf::detail::hostdevice_vector& pages, } } -} // namespace gpu -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/delta_binary.cuh b/cpp/src/io/parquet/delta_binary.cuh index 2382e4aafdf..a513e6674b4 100644 --- a/cpp/src/io/parquet/delta_binary.cuh +++ b/cpp/src/io/parquet/delta_binary.cuh @@ -18,7 +18,7 @@ #include "page_decode.cuh" -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { // DELTA_XXX encoding support // @@ -291,4 +291,4 @@ struct delta_binary_decoder { } }; -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 230834632dd..cce3659b902 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -23,10 +23,7 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { -namespace gpu { +namespace cudf::io::parquet::detail { namespace { @@ -624,7 +621,7 @@ uint32_t GetAggregatedDecodeKernelMask(cudf::detail::hostdevice_vector } /** - * @copydoc cudf::io::parquet::gpu::DecodePageData + * @copydoc cudf::io::parquet::detail::DecodePageData */ void __host__ DecodePageData(cudf::detail::hostdevice_vector& pages, cudf::detail::hostdevice_vector const& chunks, @@ -648,7 +645,4 @@ void __host__ DecodePageData(cudf::detail::hostdevice_vector& pages, } } -} // namespace gpu -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index d70cabdd35f..7c866fd8b9e 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -24,7 +24,7 @@ #include #include -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { struct page_state_s { constexpr page_state_s() noexcept {} @@ -1384,4 +1384,4 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, return true; } -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_delta_decode.cu b/cpp/src/io/parquet/page_delta_decode.cu index 2b78dead205..d25684a59f3 100644 --- a/cpp/src/io/parquet/page_delta_decode.cu +++ b/cpp/src/io/parquet/page_delta_decode.cu @@ -23,7 +23,7 @@ #include #include -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { namespace { @@ -160,7 +160,7 @@ __global__ void __launch_bounds__(96) } // anonymous namespace /** - * @copydoc cudf::io::parquet::gpu::DecodeDeltaBinary + * @copydoc cudf::io::parquet::detail::DecodeDeltaBinary */ void __host__ DecodeDeltaBinary(cudf::detail::hostdevice_vector& pages, cudf::detail::hostdevice_vector const& chunks, @@ -184,4 +184,4 @@ void __host__ DecodeDeltaBinary(cudf::detail::hostdevice_vector& pages } } -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index fe0dbb85124..78873d5e8ca 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -41,10 +41,7 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { -namespace gpu { +namespace cudf::io::parquet::detail { namespace { @@ -329,7 +326,7 @@ __global__ void __launch_bounds__(128) // blockDim {128,1,1} __global__ void __launch_bounds__(128) gpuInitPages(device_2dspan chunks, - device_span pages, + device_span pages, device_span page_sizes, device_span comp_page_sizes, device_span col_desc, @@ -998,7 +995,7 @@ __device__ auto julian_days_with_time(int64_t v) // blockDim(128, 1, 1) template __global__ void __launch_bounds__(128, 8) - gpuEncodePages(device_span pages, + gpuEncodePages(device_span pages, device_span> comp_in, device_span> comp_out, device_span comp_results, @@ -1988,7 +1985,7 @@ __global__ void __launch_bounds__(128) // blockDim(1024, 1, 1) __global__ void __launch_bounds__(1024) - gpuGatherPages(device_span chunks, device_span pages) + gpuGatherPages(device_span chunks, device_span pages) { __shared__ __align__(8) EncColumnChunk ck_g; __shared__ __align__(8) EncPage page_g; @@ -2265,7 +2262,7 @@ void InitFragmentStatistics(device_span groups, } void InitEncoderPages(device_2dspan chunks, - device_span pages, + device_span pages, device_span page_sizes, device_span comp_page_sizes, device_span col_desc, @@ -2294,7 +2291,7 @@ void InitEncoderPages(device_2dspan chunks, write_v2_headers); } -void EncodePages(device_span pages, +void EncodePages(device_span pages, bool write_v2_headers, device_span> comp_in, device_span> comp_out, @@ -2328,7 +2325,7 @@ void EncodePageHeaders(device_span pages, } void GatherPages(device_span chunks, - device_span pages, + device_span pages, rmm::cuda_stream_view stream) { gpuGatherPages<<>>(chunks, pages); @@ -2343,7 +2340,4 @@ void EncodeColumnIndexes(device_span chunks, chunks, column_stats, column_index_truncate_length); } -} // namespace gpu -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index 6f8b2f50443..eae8e05e61e 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -20,10 +20,8 @@ #include -namespace cudf { -namespace io { -namespace parquet { -namespace gpu { +namespace cudf::io::parquet::detail { + // Minimal thrift implementation for parsing page headers // https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md @@ -161,8 +159,7 @@ __device__ void skip_struct_field(byte_stream_s* bs, int field_type) * @param chunk Column chunk the page belongs to * @return `kernel_mask_bits` value for the given page */ -__device__ uint32_t kernel_mask_for_page(gpu::PageInfo const& page, - gpu::ColumnChunkDesc const& chunk) +__device__ uint32_t kernel_mask_for_page(PageInfo const& page, ColumnChunkDesc const& chunk) { if (page.flags & PAGEINFO_FLAGS_DICTIONARY) { return 0; } @@ -528,7 +525,4 @@ void __host__ BuildStringDictionaryIndex(ColumnChunkDesc* chunks, gpuBuildStringDictionaryIndex<<>>(chunks, num_chunks); } -} // namespace gpu -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index d79abe4a6d2..4d79770ec34 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -20,10 +20,7 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { -namespace gpu { +namespace cudf::io::parquet::detail { namespace { @@ -757,7 +754,7 @@ __global__ void __launch_bounds__(decode_block_size) } // anonymous namespace /** - * @copydoc cudf::io::parquet::gpu::ComputePageStringSizes + * @copydoc cudf::io::parquet::detail::ComputePageStringSizes */ void ComputePageStringSizes(cudf::detail::hostdevice_vector& pages, cudf::detail::hostdevice_vector const& chunks, @@ -778,7 +775,7 @@ void ComputePageStringSizes(cudf::detail::hostdevice_vector& pages, } /** - * @copydoc cudf::io::parquet::gpu::DecodeStringPageData + * @copydoc cudf::io::parquet::detail::DecodeStringPageData */ void __host__ DecodeStringPageData(cudf::detail::hostdevice_vector& pages, cudf::detail::hostdevice_vector const& chunks, @@ -802,7 +799,4 @@ void __host__ DecodeStringPageData(cudf::detail::hostdevice_vector& pa } } -} // namespace gpu -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_string_utils.cuh b/cpp/src/io/parquet/page_string_utils.cuh index 9395599b3ff..a81d0a64466 100644 --- a/cpp/src/io/parquet/page_string_utils.cuh +++ b/cpp/src/io/parquet/page_string_utils.cuh @@ -18,7 +18,7 @@ #include -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { // stole this from cudf/strings/detail/gather.cuh. modified to run on a single string on one warp. // copies from src to dst in 16B chunks per thread. @@ -107,4 +107,4 @@ __device__ void block_excl_sum(size_type* arr, size_type length, size_type initi } } -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index 1df49262e87..c5993d73dec 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -25,9 +25,8 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { +namespace cudf::io::parquet::detail { + constexpr uint32_t parquet_magic = (('P' << 0) | ('A' << 8) | ('R' << 16) | ('1' << 24)); /** @@ -405,6 +404,4 @@ static inline int CountLeadingZeros32(uint32_t value) #endif } -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet_common.hpp b/cpp/src/io/parquet/parquet_common.hpp index 5a1716bb547..50736197eb9 100644 --- a/cpp/src/io/parquet/parquet_common.hpp +++ b/cpp/src/io/parquet/parquet_common.hpp @@ -18,9 +18,8 @@ #include -namespace cudf { -namespace io { -namespace parquet { +namespace cudf::io::parquet::detail { + // Max decimal precisions according to the parquet spec: // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#decimal auto constexpr MAX_DECIMAL32_PRECISION = 9; @@ -156,6 +155,4 @@ enum FieldType { ST_FLD_STRUCT = 12, }; -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh index dc74bee1536..10e12ebb782 100644 --- a/cpp/src/io/parquet/parquet_gpu.cuh +++ b/cpp/src/io/parquet/parquet_gpu.cuh @@ -23,7 +23,7 @@ #include -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { auto constexpr KEY_SENTINEL = size_type{-1}; auto constexpr VALUE_SENTINEL = size_type{-1}; @@ -81,4 +81,4 @@ inline size_type __device__ row_to_value_idx(size_type idx, return idx; } -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 51c862b376b..767668cc65e 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -35,7 +35,7 @@ #include -namespace cudf::io::parquet { +namespace cudf::io::parquet::detail { using cudf::io::detail::string_index_pair; @@ -88,8 +88,6 @@ struct input_column_info { auto nesting_depth() const { return nesting.size(); } }; -namespace gpu { - /** * @brief Enums for the flags in the page header */ @@ -347,7 +345,7 @@ struct file_intermediate_data { // all chunks from the selected row groups. We may end up reading these chunks progressively // instead of all at once - std::vector chunks{}; + std::vector chunks{}; // skip_rows/num_rows values for the entire file. these need to be adjusted per-pass because we // may not be visiting every row group that contains these bounds @@ -372,16 +370,16 @@ struct pass_intermediate_data { // rowgroup, chunk and page information for the current pass. std::vector row_groups{}; - cudf::detail::hostdevice_vector chunks{}; - cudf::detail::hostdevice_vector pages_info{}; - cudf::detail::hostdevice_vector page_nesting_info{}; - cudf::detail::hostdevice_vector page_nesting_decode_info{}; + cudf::detail::hostdevice_vector chunks{}; + cudf::detail::hostdevice_vector pages_info{}; + cudf::detail::hostdevice_vector page_nesting_info{}; + cudf::detail::hostdevice_vector page_nesting_decode_info{}; rmm::device_uvector page_keys{0, rmm::cuda_stream_default}; rmm::device_uvector page_index{0, rmm::cuda_stream_default}; rmm::device_uvector str_dict_index{0, rmm::cuda_stream_default}; - std::vector output_chunk_read_info; + std::vector output_chunk_read_info; std::size_t current_output_chunk{0}; rmm::device_buffer level_decode_data{}; @@ -739,7 +737,7 @@ void initialize_chunk_hash_maps(device_span chunks, rmm::cuda_st * @param frags Column fragments * @param stream CUDA stream to use */ -void populate_chunk_hash_maps(cudf::detail::device_2dspan frags, +void populate_chunk_hash_maps(cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream); /** @@ -762,7 +760,7 @@ void collect_map_entries(device_span chunks, rmm::cuda_stream_vi * @param frags Column fragments * @param stream CUDA stream to use */ -void get_dictionary_indices(cudf::detail::device_2dspan frags, +void get_dictionary_indices(cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream); /** @@ -781,7 +779,7 @@ void get_dictionary_indices(cudf::detail::device_2dspan * @param[in] stream CUDA stream to use */ void InitEncoderPages(cudf::detail::device_2dspan chunks, - device_span pages, + device_span pages, device_span page_sizes, device_span comp_page_sizes, device_span col_desc, @@ -847,7 +845,7 @@ void EncodePageHeaders(device_span pages, * @param[in] stream CUDA stream to use */ void GatherPages(device_span chunks, - device_span pages, + device_span pages, rmm::cuda_stream_view stream); /** @@ -863,5 +861,4 @@ void EncodeColumnIndexes(device_span chunks, int32_t column_index_truncate_length, rmm::cuda_stream_view stream); -} // namespace gpu -} // namespace cudf::io::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index 805d082c71e..9083be1c2dd 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -35,7 +35,7 @@ #include #include -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { namespace { /** @@ -62,13 +62,13 @@ struct stats_caster { // uses storage type as T template () or cudf::is_nested())> - static T convert(uint8_t const* stats_val, size_t stats_size, cudf::io::parquet::Type const type) + static T convert(uint8_t const* stats_val, size_t stats_size, Type const type) { CUDF_FAIL("unsupported type for stats casting"); } template ())> - static T convert(uint8_t const* stats_val, size_t stats_size, cudf::io::parquet::Type const type) + static T convert(uint8_t const* stats_val, size_t stats_size, Type const type) { CUDF_EXPECTS(type == BOOLEAN, "Invalid type and stats combination"); return targetType(*reinterpret_cast(stats_val)); @@ -78,7 +78,7 @@ struct stats_caster { template () and !cudf::is_boolean()) or cudf::is_fixed_point() or cudf::is_chrono())> - static T convert(uint8_t const* stats_val, size_t stats_size, cudf::io::parquet::Type const type) + static T convert(uint8_t const* stats_val, size_t stats_size, Type const type) { switch (type) { case INT32: return targetType(*reinterpret_cast(stats_val)); @@ -103,7 +103,7 @@ struct stats_caster { } template ())> - static T convert(uint8_t const* stats_val, size_t stats_size, cudf::io::parquet::Type const type) + static T convert(uint8_t const* stats_val, size_t stats_size, Type const type) { switch (type) { case FLOAT: return targetType(*reinterpret_cast(stats_val)); @@ -113,7 +113,7 @@ struct stats_caster { } template )> - static T convert(uint8_t const* stats_val, size_t stats_size, cudf::io::parquet::Type const type) + static T convert(uint8_t const* stats_val, size_t stats_size, Type const type) { switch (type) { case BYTE_ARRAY: [[fallthrough]]; @@ -527,4 +527,4 @@ named_to_reference_converter::visit_operands( return transformed_operands; } -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader.cpp b/cpp/src/io/parquet/reader.cpp index 1e87447006d..17d7c07bc91 100644 --- a/cpp/src/io/parquet/reader.cpp +++ b/cpp/src/io/parquet/reader.cpp @@ -16,7 +16,7 @@ #include "reader_impl.hpp" -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { reader::reader() = default; @@ -59,4 +59,4 @@ bool chunked_reader::has_next() const { return _impl->has_next(); } table_with_metadata chunked_reader::read_chunk() const { return _impl->read_chunk(); } -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index ea40f29a070..26ec83d5946 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -25,7 +25,7 @@ #include #include -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) { @@ -38,7 +38,7 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) CUDF_EXPECTS(pages.size() > 0, "There is no page to decode"); size_t const sum_max_depths = std::accumulate( - chunks.begin(), chunks.end(), 0, [&](size_t cursum, gpu::ColumnChunkDesc const& chunk) { + chunks.begin(), chunks.end(), 0, [&](size_t cursum, ColumnChunkDesc const& chunk) { return cursum + _metadata->get_output_nesting_depth(chunk.src_col_schema); }); @@ -51,10 +51,10 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) // doing a gather operation later on. // TODO: This step is somewhat redundant if size info has already been calculated (nested schema, // chunked reader). - auto const has_strings = (kernel_mask & gpu::KERNEL_MASK_STRING) != 0; + auto const has_strings = (kernel_mask & KERNEL_MASK_STRING) != 0; std::vector col_sizes(_input_columns.size(), 0L); if (has_strings) { - gpu::ComputePageStringSizes( + ComputePageStringSizes( pages, chunks, skip_rows, num_rows, _pass_itm_data->level_type_size, _stream); col_sizes = calculate_page_string_offsets(); @@ -176,19 +176,19 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) if (has_strings) { auto& stream = streams[s_idx++]; chunk_nested_str_data.host_to_device_async(stream); - gpu::DecodeStringPageData( + DecodeStringPageData( pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), stream); } // launch delta binary decoder - if ((kernel_mask & gpu::KERNEL_MASK_DELTA_BINARY) != 0) { - gpu::DecodeDeltaBinary( + if ((kernel_mask & KERNEL_MASK_DELTA_BINARY) != 0) { + DecodeDeltaBinary( pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), streams[s_idx++]); } // launch the catch-all page decoder - if ((kernel_mask & gpu::KERNEL_MASK_GENERAL) != 0) { - gpu::DecodePageData( + if ((kernel_mask & KERNEL_MASK_GENERAL) != 0) { + DecodePageData( pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), streams[s_idx++]); } @@ -248,13 +248,13 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) // update null counts in the final column buffers for (size_t idx = 0; idx < pages.size(); idx++) { - gpu::PageInfo* pi = &pages[idx]; - if (pi->flags & gpu::PAGEINFO_FLAGS_DICTIONARY) { continue; } - gpu::ColumnChunkDesc* col = &chunks[pi->chunk_idx]; + PageInfo* pi = &pages[idx]; + if (pi->flags & PAGEINFO_FLAGS_DICTIONARY) { continue; } + ColumnChunkDesc* col = &chunks[pi->chunk_idx]; input_column_info const& input_col = _input_columns[col->src_col_index]; - int index = pi->nesting_decode - page_nesting_decode.device_ptr(); - gpu::PageNestingDecodeInfo* pndi = &page_nesting_decode[index]; + int index = pi->nesting_decode - page_nesting_decode.device_ptr(); + PageNestingDecodeInfo* pndi = &page_nesting_decode[index]; auto* cols = &_output_buffers; for (size_t l_idx = 0; l_idx < input_col.nesting_depth(); l_idx++) { @@ -320,7 +320,7 @@ reader::impl::impl(std::size_t chunk_read_limit, // Save the states of the output buffers for reuse in `chunk_read()`. for (auto const& buff : _output_buffers) { - _output_buffers_template.emplace_back(inline_column_buffer::empty_like(buff)); + _output_buffers_template.emplace_back(cudf::io::detail::inline_column_buffer::empty_like(buff)); } } @@ -368,7 +368,7 @@ void reader::impl::prepare_data(int64_t skip_rows, // always create the pass struct, even if we end up with no passes. // this will also cause the previous pass information to be deleted - _pass_itm_data = std::make_unique(); + _pass_itm_data = std::make_unique(); if (_file_itm_data.global_num_rows > 0 && not _file_itm_data.row_groups.empty() && not _input_columns.empty() && _current_input_pass < num_passes) { @@ -521,7 +521,7 @@ table_with_metadata reader::impl::read_chunk() if (_chunk_count > 0) { _output_buffers.resize(0); for (auto const& buff : _output_buffers_template) { - _output_buffers.emplace_back(inline_column_buffer::empty_like(buff)); + _output_buffers.emplace_back(cudf::io::detail::inline_column_buffer::empty_like(buff)); } } @@ -571,4 +571,4 @@ parquet_metadata read_parquet_metadata(host_span con metadata.get_key_value_metadata()[0]}; } -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index 9445e4d1648..6003b931b04 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -35,7 +35,7 @@ #include #include -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { /** * @brief Implementation for Parquet reader @@ -261,10 +261,10 @@ class reader::impl { std::vector _input_columns; // Buffers for generating output columns - std::vector _output_buffers; + std::vector _output_buffers; // Buffers copied from `_output_buffers` after construction for reuse - std::vector _output_buffers_template; + std::vector _output_buffers_template; // _output_buffers associated schema indices std::vector _output_column_schemas; @@ -285,8 +285,8 @@ class reader::impl { // Within a pass, we produce one or more chunks of output, whose maximum total // byte size is controlled by _output_chunk_read_limit. - cudf::io::parquet::gpu::file_intermediate_data _file_itm_data; - std::unique_ptr _pass_itm_data; + file_intermediate_data _file_itm_data; + std::unique_ptr _pass_itm_data; // an array of offsets into _file_itm_data::global_chunks. Each pair of offsets represents // the start/end of the chunks to be loaded for a given pass. @@ -301,4 +301,4 @@ class reader::impl { bool _file_preprocessed{false}; }; -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index 9778cfc47d2..171cf07da3e 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -21,34 +21,34 @@ #include #include -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { namespace { ConvertedType logical_type_to_converted_type(LogicalType const& logical) { if (logical.isset.STRING) { - return parquet::UTF8; + return UTF8; } else if (logical.isset.MAP) { - return parquet::MAP; + return MAP; } else if (logical.isset.LIST) { - return parquet::LIST; + return LIST; } else if (logical.isset.ENUM) { - return parquet::ENUM; + return ENUM; } else if (logical.isset.DECIMAL) { - return parquet::DECIMAL; // TODO set decimal values + return DECIMAL; // TODO set decimal values } else if (logical.isset.DATE) { - return parquet::DATE; + return DATE; } else if (logical.isset.TIME) { if (logical.TIME.unit.isset.MILLIS) - return parquet::TIME_MILLIS; + return TIME_MILLIS; else if (logical.TIME.unit.isset.MICROS) - return parquet::TIME_MICROS; + return TIME_MICROS; } else if (logical.isset.TIMESTAMP) { if (logical.TIMESTAMP.unit.isset.MILLIS) - return parquet::TIMESTAMP_MILLIS; + return TIMESTAMP_MILLIS; else if (logical.TIMESTAMP.unit.isset.MICROS) - return parquet::TIMESTAMP_MICROS; + return TIMESTAMP_MICROS; } else if (logical.isset.INTEGER) { switch (logical.INTEGER.bitWidth) { case 8: return logical.INTEGER.isSigned ? INT_8 : UINT_8; @@ -58,13 +58,13 @@ ConvertedType logical_type_to_converted_type(LogicalType const& logical) default: break; } } else if (logical.isset.UNKNOWN) { - return parquet::NA; + return NA; } else if (logical.isset.JSON) { - return parquet::JSON; + return JSON; } else if (logical.isset.BSON) { - return parquet::BSON; + return BSON; } - return parquet::UNKNOWN; + return UNKNOWN; } } // namespace @@ -76,39 +76,39 @@ type_id to_type_id(SchemaElement const& schema, bool strings_to_categorical, type_id timestamp_type_id) { - parquet::Type const physical = schema.type; - parquet::LogicalType const logical_type = schema.logical_type; - parquet::ConvertedType converted_type = schema.converted_type; - int32_t decimal_precision = schema.decimal_precision; + Type const physical = schema.type; + LogicalType const logical_type = schema.logical_type; + ConvertedType converted_type = schema.converted_type; + int32_t decimal_precision = schema.decimal_precision; // Logical type used for actual data interpretation; the legacy converted type // is superseded by 'logical' type whenever available. auto const inferred_converted_type = logical_type_to_converted_type(logical_type); - if (inferred_converted_type != parquet::UNKNOWN) { converted_type = inferred_converted_type; } - if (inferred_converted_type == parquet::DECIMAL) { + if (inferred_converted_type != UNKNOWN) { converted_type = inferred_converted_type; } + if (inferred_converted_type == DECIMAL) { decimal_precision = schema.logical_type.DECIMAL.precision; } switch (converted_type) { - case parquet::UINT_8: return type_id::UINT8; - case parquet::INT_8: return type_id::INT8; - case parquet::UINT_16: return type_id::UINT16; - case parquet::INT_16: return type_id::INT16; - case parquet::UINT_32: return type_id::UINT32; - case parquet::UINT_64: return type_id::UINT64; - case parquet::DATE: return type_id::TIMESTAMP_DAYS; - case parquet::TIME_MILLIS: return type_id::DURATION_MILLISECONDS; - case parquet::TIME_MICROS: return type_id::DURATION_MICROSECONDS; - case parquet::TIMESTAMP_MILLIS: + case UINT_8: return type_id::UINT8; + case INT_8: return type_id::INT8; + case UINT_16: return type_id::UINT16; + case INT_16: return type_id::INT16; + case UINT_32: return type_id::UINT32; + case UINT_64: return type_id::UINT64; + case DATE: return type_id::TIMESTAMP_DAYS; + case TIME_MILLIS: return type_id::DURATION_MILLISECONDS; + case TIME_MICROS: return type_id::DURATION_MICROSECONDS; + case TIMESTAMP_MILLIS: return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id : type_id::TIMESTAMP_MILLISECONDS; - case parquet::TIMESTAMP_MICROS: + case TIMESTAMP_MICROS: return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id : type_id::TIMESTAMP_MICROSECONDS; - case parquet::DECIMAL: - if (physical == parquet::INT32) { return type_id::DECIMAL32; } - if (physical == parquet::INT64) { return type_id::DECIMAL64; } - if (physical == parquet::FIXED_LEN_BYTE_ARRAY) { + case DECIMAL: + if (physical == INT32) { return type_id::DECIMAL32; } + if (physical == INT64) { return type_id::DECIMAL64; } + if (physical == FIXED_LEN_BYTE_ARRAY) { if (schema.type_length <= static_cast(sizeof(int32_t))) { return type_id::DECIMAL32; } @@ -119,7 +119,7 @@ type_id to_type_id(SchemaElement const& schema, return type_id::DECIMAL128; } } - if (physical == parquet::BYTE_ARRAY) { + if (physical == BYTE_ARRAY) { CUDF_EXPECTS(decimal_precision <= MAX_DECIMAL128_PRECISION, "Invalid decimal precision"); if (decimal_precision <= MAX_DECIMAL32_PRECISION) { return type_id::DECIMAL32; @@ -133,20 +133,20 @@ type_id to_type_id(SchemaElement const& schema, break; // maps are just List>. - case parquet::MAP: - case parquet::LIST: return type_id::LIST; - case parquet::NA: return type_id::STRING; + case MAP: + case LIST: return type_id::LIST; + case NA: return type_id::STRING; // return type_id::EMPTY; //TODO(kn): enable after Null/Empty column support default: break; } - if (inferred_converted_type == parquet::UNKNOWN and physical == parquet::INT64 and + if (inferred_converted_type == UNKNOWN and physical == INT64 and logical_type.TIMESTAMP.unit.isset.NANOS) { return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id : type_id::TIMESTAMP_NANOSECONDS; } - if (inferred_converted_type == parquet::UNKNOWN and physical == parquet::INT64 and + if (inferred_converted_type == UNKNOWN and physical == INT64 and logical_type.TIME.unit.isset.NANOS) { return type_id::DURATION_NANOSECONDS; } @@ -157,16 +157,16 @@ type_id to_type_id(SchemaElement const& schema, // Physical storage type supported by Parquet; controls the on-disk storage // format in combination with the encoding type. switch (physical) { - case parquet::BOOLEAN: return type_id::BOOL8; - case parquet::INT32: return type_id::INT32; - case parquet::INT64: return type_id::INT64; - case parquet::FLOAT: return type_id::FLOAT32; - case parquet::DOUBLE: return type_id::FLOAT64; - case parquet::BYTE_ARRAY: - case parquet::FIXED_LEN_BYTE_ARRAY: + case BOOLEAN: return type_id::BOOL8; + case INT32: return type_id::INT32; + case INT64: return type_id::INT64; + case FLOAT: return type_id::FLOAT32; + case DOUBLE: return type_id::FLOAT64; + case BYTE_ARRAY: + case FIXED_LEN_BYTE_ARRAY: // Can be mapped to INT32 (32-bit hash) or STRING return strings_to_categorical ? type_id::INT32 : type_id::STRING; - case parquet::INT96: + case INT96: return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id : type_id::TIMESTAMP_NANOSECONDS; default: break; @@ -420,7 +420,7 @@ std::vector aggregate_reader_metadata::get_pandas_index_names() con return names; } -std::tuple> +std::tuple> aggregate_reader_metadata::select_row_groups( host_span const> row_group_indices, int64_t skip_rows_opt, @@ -438,7 +438,7 @@ aggregate_reader_metadata::select_row_groups( host_span const>(filtered_row_group_indices.value()); } } - std::vector selection; + std::vector selection; auto [rows_to_skip, rows_to_read] = [&]() { if (not row_group_indices.empty()) { return std::pair{}; } auto const from_opts = cudf::io::detail::skip_rows_num_rows_from_options( @@ -478,7 +478,7 @@ aggregate_reader_metadata::select_row_groups( } std::tuple, - std::vector, + std::vector, std::vector> aggregate_reader_metadata::select_columns(std::optional> const& use_names, bool include_index, @@ -496,17 +496,18 @@ aggregate_reader_metadata::select_columns(std::optional : -1; }; - std::vector output_columns; + std::vector output_columns; std::vector input_columns; std::vector nesting; // Return true if column path is valid. e.g. if the path is {"struct1", "child1"}, then it is // valid if "struct1.child1" exists in this file's schema. If "struct1" exists but "child1" is // not a child of "struct1" then the function will return false for "struct1" - std::function&, bool)> + std::function&, bool)> build_column = [&](column_name_info const* col_name_info, int schema_idx, - std::vector& out_col_array, + std::vector& out_col_array, bool has_list_parent) { if (schema_idx < 0) { return false; } auto const& schema_elem = get_schema(schema_idx); @@ -529,7 +530,8 @@ aggregate_reader_metadata::select_columns(std::optional : to_type_id(schema_elem, strings_to_categorical, timestamp_type_id); auto const dtype = to_data_type(col_type, schema_elem); - inline_column_buffer output_col(dtype, schema_elem.repetition_type == OPTIONAL); + cudf::io::detail::inline_column_buffer output_col(dtype, + schema_elem.repetition_type == OPTIONAL); if (has_list_parent) { output_col.user_data |= PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT; } // store the index of this element if inserted in out_col_array nesting.push_back(static_cast(out_col_array.size())); @@ -569,7 +571,8 @@ aggregate_reader_metadata::select_columns(std::optional to_type_id(schema_elem, strings_to_categorical, timestamp_type_id); auto const element_dtype = to_data_type(element_type, schema_elem); - inline_column_buffer element_col(element_dtype, schema_elem.repetition_type == OPTIONAL); + cudf::io::detail::inline_column_buffer element_col( + element_dtype, schema_elem.repetition_type == OPTIONAL); if (has_list_parent || col_type == type_id::LIST) { element_col.user_data |= PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT; } @@ -732,4 +735,4 @@ aggregate_reader_metadata::select_columns(std::optional std::move(input_columns), std::move(output_columns), std::move(output_column_schemas)); } -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp index 9ee17f26a10..1a73e2f55ac 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.hpp +++ b/cpp/src/io/parquet/reader_impl_helpers.hpp @@ -32,9 +32,7 @@ #include #include -namespace cudf::io::detail::parquet { - -using namespace cudf::io::parquet; +namespace cudf::io::parquet::detail { /** * @brief Function that translates Parquet datatype to cuDF type enum @@ -182,7 +180,7 @@ class aggregate_reader_metadata { * @return A tuple of corrected row_start, row_count and list of row group indexes and its * starting row */ - [[nodiscard]] std::tuple> select_row_groups( + [[nodiscard]] std::tuple> select_row_groups( host_span const> row_group_indices, int64_t row_start, std::optional const& row_count, @@ -202,12 +200,13 @@ class aggregate_reader_metadata { * @return input column information, output column information, list of output column schema * indices */ - [[nodiscard]] std:: - tuple, std::vector, std::vector> - select_columns(std::optional> const& use_names, - bool include_index, - bool strings_to_categorical, - type_id timestamp_type_id) const; + [[nodiscard]] std::tuple, + std::vector, + std::vector> + select_columns(std::optional> const& use_names, + bool include_index, + bool strings_to_categorical, + type_id timestamp_type_id) const; }; /** @@ -276,4 +275,4 @@ class named_to_reference_converter : public ast::detail::expression_transformer std::list _operators; }; -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index c731c467f2c..4bc6bb6f43b 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -43,7 +43,8 @@ #include -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { + namespace { /** @@ -185,11 +186,11 @@ template */ [[nodiscard]] std::tuple conversion_info(type_id column_type_id, type_id timestamp_type_id, - parquet::Type physical, + Type physical, int8_t converted, int32_t length) { - int32_t type_width = (physical == parquet::FIXED_LEN_BYTE_ARRAY) ? length : 0; + int32_t type_width = (physical == FIXED_LEN_BYTE_ARRAY) ? length : 0; int32_t clock_rate = 0; if (column_type_id == type_id::INT8 or column_type_id == type_id::UINT8) { type_width = 1; // I32 -> I8 @@ -202,9 +203,9 @@ template } int8_t converted_type = converted; - if (converted_type == parquet::DECIMAL && column_type_id != type_id::FLOAT64 && + if (converted_type == DECIMAL && column_type_id != type_id::FLOAT64 && not cudf::is_fixed_point(data_type{column_type_id})) { - converted_type = parquet::UNKNOWN; // Not converting to float64 or decimal + converted_type = UNKNOWN; // Not converting to float64 or decimal } return std::make_tuple(type_width, clock_rate, converted_type); } @@ -226,7 +227,7 @@ template [[nodiscard]] std::future read_column_chunks_async( std::vector> const& sources, std::vector>& page_data, - cudf::detail::hostdevice_vector& chunks, + cudf::detail::hostdevice_vector& chunks, size_t begin_chunk, size_t end_chunk, std::vector const& column_chunk_offsets, @@ -239,11 +240,10 @@ template size_t const io_offset = column_chunk_offsets[chunk]; size_t io_size = chunks[chunk].compressed_size; size_t next_chunk = chunk + 1; - bool const is_compressed = (chunks[chunk].codec != parquet::Compression::UNCOMPRESSED); + bool const is_compressed = (chunks[chunk].codec != Compression::UNCOMPRESSED); while (next_chunk < end_chunk) { - size_t const next_offset = column_chunk_offsets[next_chunk]; - bool const is_next_compressed = - (chunks[next_chunk].codec != parquet::Compression::UNCOMPRESSED); + size_t const next_offset = column_chunk_offsets[next_chunk]; + bool const is_next_compressed = (chunks[next_chunk].codec != Compression::UNCOMPRESSED); if (next_offset != io_offset + io_size || is_next_compressed != is_compressed || chunk_source_map[chunk] != chunk_source_map[next_chunk]) { // Can't merge if not contiguous or mixing compressed and uncompressed @@ -300,13 +300,13 @@ template * * @return The total number of pages */ -[[nodiscard]] size_t count_page_headers( - cudf::detail::hostdevice_vector& chunks, rmm::cuda_stream_view stream) +[[nodiscard]] size_t count_page_headers(cudf::detail::hostdevice_vector& chunks, + rmm::cuda_stream_view stream) { size_t total_pages = 0; chunks.host_to_device_async(stream); - gpu::DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream); + DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream); chunks.device_to_host_sync(stream); for (size_t c = 0; c < chunks.size(); c++) { @@ -337,8 +337,8 @@ constexpr bool is_supported_encoding(Encoding enc) * @param stream CUDA stream used for device memory operations and kernel launches * @returns The size in bytes of level type data required */ -int decode_page_headers(cudf::detail::hostdevice_vector& chunks, - cudf::detail::hostdevice_vector& pages, +int decode_page_headers(cudf::detail::hostdevice_vector& chunks, + cudf::detail::hostdevice_vector& pages, rmm::cuda_stream_view stream) { // IMPORTANT : if you change how pages are stored within a chunk (dist pages, then data pages), @@ -350,14 +350,14 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c } chunks.host_to_device_async(stream); - gpu::DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream); + DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream); // compute max bytes needed for level data auto level_bit_size = cudf::detail::make_counting_transform_iterator(0, [chunks = chunks.begin()] __device__(int i) { auto c = chunks[i]; return static_cast( - max(c.level_bits[gpu::level_type::REPETITION], c.level_bits[gpu::level_type::DEFINITION])); + max(c.level_bits[level_type::REPETITION], c.level_bits[level_type::DEFINITION])); }); // max level data bit size. int const max_level_bits = thrust::reduce(rmm::exec_policy(stream), @@ -388,11 +388,11 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c * @return Device buffer to decompressed page data */ [[nodiscard]] rmm::device_buffer decompress_page_data( - cudf::detail::hostdevice_vector& chunks, - cudf::detail::hostdevice_vector& pages, + cudf::detail::hostdevice_vector& chunks, + cudf::detail::hostdevice_vector& pages, rmm::cuda_stream_view stream) { - auto for_each_codec_page = [&](parquet::Compression codec, std::function const& f) { + auto for_each_codec_page = [&](Compression codec, std::function const& f) { for (size_t c = 0, page_count = 0; c < chunks.size(); c++) { const auto page_stride = chunks[c].max_num_pages; if (chunks[c].codec == codec) { @@ -412,19 +412,16 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c size_t total_decomp_size = 0; struct codec_stats { - parquet::Compression compression_type = UNCOMPRESSED; - size_t num_pages = 0; - int32_t max_decompressed_size = 0; - size_t total_decomp_size = 0; + Compression compression_type = UNCOMPRESSED; + size_t num_pages = 0; + int32_t max_decompressed_size = 0; + size_t total_decomp_size = 0; }; - std::array codecs{codec_stats{parquet::GZIP}, - codec_stats{parquet::SNAPPY}, - codec_stats{parquet::BROTLI}, - codec_stats{parquet::ZSTD}}; + std::array codecs{codec_stats{GZIP}, codec_stats{SNAPPY}, codec_stats{BROTLI}, codec_stats{ZSTD}}; auto is_codec_supported = [&codecs](int8_t codec) { - if (codec == parquet::UNCOMPRESSED) return true; + if (codec == UNCOMPRESSED) return true; return std::find_if(codecs.begin(), codecs.end(), [codec](auto& cstats) { return codec == cstats.compression_type; }) != codecs.end(); @@ -445,7 +442,7 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c codec.num_pages++; num_comp_pages++; }); - if (codec.compression_type == parquet::BROTLI && codec.num_pages > 0) { + if (codec.compression_type == BROTLI && codec.num_pages > 0) { debrotli_scratch.resize(get_gpu_debrotli_scratch_size(codec.num_pages), stream); } } @@ -482,7 +479,7 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c auto& page = pages[page_idx]; // offset will only be non-zero for V2 pages auto const offset = - page.lvl_bytes[gpu::level_type::DEFINITION] + page.lvl_bytes[gpu::level_type::REPETITION]; + page.lvl_bytes[level_type::DEFINITION] + page.lvl_bytes[level_type::REPETITION]; // for V2 need to copy def and rep level info into place, and then offset the // input and output buffers. otherwise we'd have to keep both the compressed // and decompressed data. @@ -509,11 +506,11 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c device_span d_comp_res_view(comp_res.data() + start_pos, codec.num_pages); switch (codec.compression_type) { - case parquet::GZIP: + case GZIP: gpuinflate(d_comp_in, d_comp_out, d_comp_res_view, gzip_header_included::YES, stream); break; - case parquet::SNAPPY: - if (nvcomp_integration::is_stable_enabled()) { + case SNAPPY: + if (cudf::io::detail::nvcomp_integration::is_stable_enabled()) { nvcomp::batched_decompress(nvcomp::compression_type::SNAPPY, d_comp_in, d_comp_out, @@ -525,7 +522,7 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c gpu_unsnap(d_comp_in, d_comp_out, d_comp_res_view, stream); } break; - case parquet::ZSTD: + case ZSTD: nvcomp::batched_decompress(nvcomp::compression_type::ZSTD, d_comp_in, d_comp_out, @@ -534,7 +531,7 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c codec.total_decomp_size, stream); break; - case parquet::BROTLI: + case BROTLI: gpu_debrotli(d_comp_in, d_comp_out, d_comp_res_view, @@ -594,9 +591,9 @@ void reader::impl::allocate_nesting_info() }); page_nesting_info = - cudf::detail::hostdevice_vector{total_page_nesting_infos, _stream}; + cudf::detail::hostdevice_vector{total_page_nesting_infos, _stream}; page_nesting_decode_info = - cudf::detail::hostdevice_vector{total_page_nesting_infos, _stream}; + cudf::detail::hostdevice_vector{total_page_nesting_infos, _stream}; // update pointers in the PageInfos int target_page_index = 0; @@ -653,10 +650,10 @@ void reader::impl::allocate_nesting_info() if (!cur_schema.is_stub()) { // initialize each page within the chunk for (int p_idx = 0; p_idx < chunks[idx].num_data_pages; p_idx++) { - gpu::PageNestingInfo* pni = + PageNestingInfo* pni = &page_nesting_info[nesting_info_index + (p_idx * per_page_nesting_info_size)]; - gpu::PageNestingDecodeInfo* nesting_info = + PageNestingDecodeInfo* nesting_info = &page_nesting_decode_info[nesting_info_index + (p_idx * per_page_nesting_info_size)]; // if we have lists, set our start and end depth remappings @@ -717,9 +714,9 @@ void reader::impl::allocate_level_decode_space() for (size_t idx = 0; idx < pages.size(); idx++) { auto& p = pages[idx]; - p.lvl_decode_buf[gpu::level_type::DEFINITION] = buf; + p.lvl_decode_buf[level_type::DEFINITION] = buf; buf += (LEVEL_DECODE_BUF_SIZE * _pass_itm_data->level_type_size); - p.lvl_decode_buf[gpu::level_type::REPETITION] = buf; + p.lvl_decode_buf[level_type::REPETITION] = buf; buf += (LEVEL_DECODE_BUF_SIZE * _pass_itm_data->level_type_size); } } @@ -824,25 +821,25 @@ void reader::impl::load_global_chunk_info() schema.converted_type, schema.type_length); - chunks.push_back(gpu::ColumnChunkDesc(col_meta.total_compressed_size, - nullptr, - col_meta.num_values, - schema.type, - type_width, - row_group_start, - row_group_rows, - schema.max_definition_level, - schema.max_repetition_level, - _metadata->get_output_nesting_depth(col.schema_idx), - required_bits(schema.max_definition_level), - required_bits(schema.max_repetition_level), - col_meta.codec, - converted_type, - schema.logical_type, - schema.decimal_precision, - clock_rate, - i, - col.schema_idx)); + chunks.push_back(ColumnChunkDesc(col_meta.total_compressed_size, + nullptr, + col_meta.num_values, + schema.type, + type_width, + row_group_start, + row_group_rows, + schema.max_definition_level, + schema.max_repetition_level, + _metadata->get_output_nesting_depth(col.schema_idx), + required_bits(schema.max_definition_level), + required_bits(schema.max_repetition_level), + col_meta.codec, + converted_type, + schema.logical_type, + schema.decimal_precision, + clock_rate, + i, + col.schema_idx)); } remaining_rows -= row_group_rows; @@ -909,7 +906,7 @@ void reader::impl::compute_input_pass_row_group_info() void reader::impl::setup_pass() { // this will also cause the previous pass information to be deleted - _pass_itm_data = std::make_unique(); + _pass_itm_data = std::make_unique(); // setup row groups to be loaded for this pass auto const row_group_start = _input_pass_row_group_offsets[_current_input_pass]; @@ -929,8 +926,7 @@ void reader::impl::setup_pass() auto chunk_start = _file_itm_data.chunks.begin() + (row_group_start * chunks_per_rowgroup); auto chunk_end = _file_itm_data.chunks.begin() + (row_group_end * chunks_per_rowgroup); - _pass_itm_data->chunks = - cudf::detail::hostdevice_vector(num_chunks, _stream); + _pass_itm_data->chunks = cudf::detail::hostdevice_vector(num_chunks, _stream); std::copy(chunk_start, chunk_end, _pass_itm_data->chunks.begin()); // adjust skip_rows and num_rows by what's available in the row groups we are processing @@ -970,7 +966,7 @@ void reader::impl::load_and_decompress_data() // Process dataset chunk pages into output columns auto const total_pages = count_page_headers(chunks, _stream); if (total_pages <= 0) { return; } - pages = cudf::detail::hostdevice_vector(total_pages, total_pages, _stream); + pages = cudf::detail::hostdevice_vector(total_pages, total_pages, _stream); // decoding of column/page information _pass_itm_data->level_type_size = decode_page_headers(chunks, pages, _stream); @@ -978,7 +974,7 @@ void reader::impl::load_and_decompress_data() decomp_page_data = decompress_page_data(chunks, pages, _stream); // Free compressed data for (size_t c = 0; c < chunks.size(); c++) { - if (chunks[c].codec != parquet::Compression::UNCOMPRESSED) { raw_page_data[c].reset(); } + if (chunks[c].codec != Compression::UNCOMPRESSED) { raw_page_data[c].reset(); } } } @@ -1019,14 +1015,13 @@ struct cumulative_row_info { }; #if defined(PREPROCESS_DEBUG) -void print_pages(cudf::detail::hostdevice_vector& pages, - rmm::cuda_stream_view _stream) +void print_pages(cudf::detail::hostdevice_vector& pages, rmm::cuda_stream_view _stream) { pages.device_to_host_sync(_stream); for (size_t idx = 0; idx < pages.size(); idx++) { auto const& p = pages[idx]; // skip dictionary pages - if (p.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) { continue; } + if (p.flags & PAGEINFO_FLAGS_DICTIONARY) { continue; } printf( "P(%lu, s:%d): chunk_row(%d), num_rows(%d), skipped_values(%d), skipped_leaf_values(%d), " "str_bytes(%d)\n", @@ -1040,7 +1035,7 @@ void print_pages(cudf::detail::hostdevice_vector& pages, } } -void print_cumulative_page_info(cudf::detail::hostdevice_vector& pages, +void print_cumulative_page_info(cudf::detail::hostdevice_vector& pages, rmm::device_uvector const& page_index, rmm::device_uvector const& c_info, rmm::cuda_stream_view stream) @@ -1067,7 +1062,7 @@ void print_cumulative_page_info(cudf::detail::hostdevice_vector& printf("Schema %d\n", schemas[idx]); for (size_t pidx = 0; pidx < pages.size(); pidx++) { auto const& page = pages[h_page_index[pidx]]; - if (page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY || page.src_col_schema != schemas[idx]) { + if (page.flags & PAGEINFO_FLAGS_DICTIONARY || page.src_col_schema != schemas[idx]) { continue; } printf("\tP: {%lu, %lu}\n", h_cinfo[pidx].row_count, h_cinfo[pidx].size_bytes); @@ -1075,10 +1070,9 @@ void print_cumulative_page_info(cudf::detail::hostdevice_vector& } } -void print_cumulative_row_info( - host_span sizes, - std::string const& label, - std::optional> splits = std::nullopt) +void print_cumulative_row_info(host_span sizes, + std::string const& label, + std::optional> splits = std::nullopt) { if (splits.has_value()) { printf("------------\nSplits\n"); @@ -1093,7 +1087,7 @@ void print_cumulative_row_info( if (splits.has_value()) { // if we have a split at this row count and this is the last instance of this row count auto start = thrust::make_transform_iterator( - splits->begin(), [](gpu::chunk_read_info const& i) { return i.skip_rows; }); + splits->begin(), [](chunk_read_info const& i) { return i.skip_rows; }); auto end = start + splits->size(); auto split = std::find(start, end, sizes[idx].row_count); auto const split_index = [&]() -> int { @@ -1180,12 +1174,12 @@ __device__ size_t row_size_functor::operator()(size_t num_rows, boo * Sums across all nesting levels. */ struct get_cumulative_row_info { - gpu::PageInfo const* const pages; + PageInfo const* const pages; __device__ cumulative_row_info operator()(size_type index) { auto const& page = pages[index]; - if (page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) { + if (page.flags & PAGEINFO_FLAGS_DICTIONARY) { return cumulative_row_info{0, 0, page.src_col_schema}; } @@ -1250,15 +1244,15 @@ struct row_total_size { * @param num_rows Total number of rows to read * @param chunk_read_limit Limit on total number of bytes to be returned per read, for all columns */ -std::vector find_splits(std::vector const& sizes, - size_t num_rows, - size_t chunk_read_limit) +std::vector find_splits(std::vector const& sizes, + size_t num_rows, + size_t chunk_read_limit) { // now we have an array of {row_count, real output bytes}. just walk through it and generate // splits. // TODO: come up with a clever way to do this entirely in parallel. For now, as long as batch // sizes are reasonably large, this shouldn't iterate too many times - std::vector splits; + std::vector splits; { size_t cur_pos = 0; size_t cur_cumulative_size = 0; @@ -1290,7 +1284,7 @@ std::vector find_splits(std::vector c auto const start_row = cur_row_count; cur_row_count = sizes[split_pos].row_count; - splits.push_back(gpu::chunk_read_info{start_row, cur_row_count - start_row}); + splits.push_back(chunk_read_info{start_row, cur_row_count - start_row}); cur_pos = split_pos; cur_cumulative_size = sizes[split_pos].size_bytes; } @@ -1311,12 +1305,11 @@ std::vector find_splits(std::vector c * @param chunk_read_limit Limit on total number of bytes to be returned per read, for all columns * @param stream CUDA stream to use */ -std::vector compute_splits( - cudf::detail::hostdevice_vector& pages, - gpu::pass_intermediate_data const& id, - size_t num_rows, - size_t chunk_read_limit, - rmm::cuda_stream_view stream) +std::vector compute_splits(cudf::detail::hostdevice_vector& pages, + pass_intermediate_data const& id, + size_t num_rows, + size_t chunk_read_limit, + rmm::cuda_stream_view stream) { auto const& page_keys = id.page_keys; auto const& page_index = id.page_index; @@ -1395,16 +1388,16 @@ std::vector compute_splits( } struct get_page_chunk_idx { - __device__ size_type operator()(gpu::PageInfo const& page) { return page.chunk_idx; } + __device__ size_type operator()(PageInfo const& page) { return page.chunk_idx; } }; struct get_page_num_rows { - __device__ size_type operator()(gpu::PageInfo const& page) { return page.num_rows; } + __device__ size_type operator()(PageInfo const& page) { return page.num_rows; } }; struct get_page_column_index { - gpu::ColumnChunkDesc const* chunks; - __device__ size_type operator()(gpu::PageInfo const& page) + ColumnChunkDesc const* chunks; + __device__ size_type operator()(PageInfo const& page) { return chunks[page.chunk_idx].src_col_index; } @@ -1441,7 +1434,7 @@ struct get_page_nesting_size { input_col_info const* const input_cols; size_type const max_depth; size_t const num_pages; - gpu::PageInfo const* const pages; + PageInfo const* const pages; int const* page_indices; __device__ size_type operator()(size_t index) const @@ -1450,7 +1443,7 @@ struct get_page_nesting_size { auto const& page = pages[page_indices[indices.page_idx]]; if (page.src_col_schema != input_cols[indices.col_idx].schema_idx || - page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY || + page.flags & PAGEINFO_FLAGS_DICTIONARY || indices.depth_idx >= input_cols[indices.col_idx].nesting_depth) { return 0; } @@ -1468,7 +1461,7 @@ struct get_reduction_key { * @brief Writes to the chunk_row field of the PageInfo struct. */ struct chunk_row_output_iter { - gpu::PageInfo* p; + PageInfo* p; using value_type = size_type; using difference_type = size_type; using pointer = size_type*; @@ -1490,7 +1483,7 @@ struct chunk_row_output_iter { * @brief Writes to the page_start_value field of the PageNestingInfo struct, keyed by schema. */ struct start_offset_output_iterator { - gpu::PageInfo const* pages; + PageInfo const* pages; int const* page_indices; size_t cur_index; input_col_info const* input_cols; @@ -1529,9 +1522,9 @@ struct start_offset_output_iterator { { auto const indices = reduction_indices{index, max_depth, num_pages}; - gpu::PageInfo const& p = pages[page_indices[indices.page_idx]]; + PageInfo const& p = pages[page_indices[indices.page_idx]]; if (p.src_col_schema != input_cols[indices.col_idx].schema_idx || - p.flags & gpu::PAGEINFO_FLAGS_DICTIONARY || + p.flags & PAGEINFO_FLAGS_DICTIONARY || indices.depth_idx >= input_cols[indices.col_idx].nesting_depth) { return empty; } @@ -1540,15 +1533,15 @@ struct start_offset_output_iterator { }; struct flat_column_num_rows { - gpu::PageInfo const* pages; - gpu::ColumnChunkDesc const* chunks; + PageInfo const* pages; + ColumnChunkDesc const* chunks; __device__ size_type operator()(size_type pindex) const { - gpu::PageInfo const& page = pages[pindex]; + PageInfo const& page = pages[pindex]; // ignore dictionary pages and pages belonging to any column containing repetition (lists) - if ((page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) || - (chunks[page.chunk_idx].max_level[gpu::level_type::REPETITION] > 0)) { + if ((page.flags & PAGEINFO_FLAGS_DICTIONARY) || + (chunks[page.chunk_idx].max_level[level_type::REPETITION] > 0)) { return 0; } return page.num_rows; @@ -1581,8 +1574,8 @@ struct row_counts_different { * @param expected_row_count Expected row count, if applicable * @param stream CUDA stream used for device memory operations and kernel launches */ -void detect_malformed_pages(cudf::detail::hostdevice_vector& pages, - cudf::detail::hostdevice_vector const& chunks, +void detect_malformed_pages(cudf::detail::hostdevice_vector& pages, + cudf::detail::hostdevice_vector const& chunks, device_span page_keys, device_span page_index, std::optional expected_row_count, @@ -1631,23 +1624,21 @@ void detect_malformed_pages(cudf::detail::hostdevice_vector& page } struct page_to_string_size { - gpu::PageInfo* pages; - gpu::ColumnChunkDesc const* chunks; + PageInfo* pages; + ColumnChunkDesc const* chunks; __device__ size_t operator()(size_type page_idx) const { auto const page = pages[page_idx]; auto const chunk = chunks[page.chunk_idx]; - if (not is_string_col(chunk) || (page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) != 0) { - return 0; - } + if (not is_string_col(chunk) || (page.flags & PAGEINFO_FLAGS_DICTIONARY) != 0) { return 0; } return pages[page_idx].str_bytes; } }; struct page_offset_output_iter { - gpu::PageInfo* p; + PageInfo* p; size_type const* index; using value_type = size_type; @@ -1738,7 +1729,7 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re cols = &out_buf.children; // if this has a list parent, we have to get column sizes from the - // data computed during gpu::ComputePageSizes + // data computed during ComputePageSizes if (out_buf.user_data & PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT) { has_lists = true; break; @@ -1749,7 +1740,7 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re // generate string dict indices if necessary { - auto is_dict_chunk = [](gpu::ColumnChunkDesc const& chunk) { + auto is_dict_chunk = [](ColumnChunkDesc const& chunk) { return (chunk.data_type & 0x7) == BYTE_ARRAY && chunk.num_dict_pages > 0; }; @@ -1785,7 +1776,7 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re if (total_str_dict_indexes > 0) { chunks.host_to_device_async(_stream); - gpu::BuildStringDictionaryIndex(chunks.device_ptr(), chunks.size(), _stream); + BuildStringDictionaryIndex(chunks.device_ptr(), chunks.size(), _stream); } } @@ -1800,14 +1791,14 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re // if: // - user has passed custom row bounds // - we will be doing a chunked read - gpu::ComputePageSizes(pages, - chunks, - 0, // 0-max size_t. process all possible rows - std::numeric_limits::max(), - true, // compute num_rows - chunk_read_limit > 0, // compute string sizes - _pass_itm_data->level_type_size, - _stream); + ComputePageSizes(pages, + chunks, + 0, // 0-max size_t. process all possible rows + std::numeric_limits::max(), + true, // compute num_rows + chunk_read_limit > 0, // compute string sizes + _pass_itm_data->level_type_size, + _stream); // computes: // PageInfo::chunk_row (the absolute start row index) for all pages @@ -1836,7 +1827,7 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re _pass_itm_data->output_chunk_read_info = _output_chunk_read_limit > 0 ? compute_splits(pages, *_pass_itm_data, num_rows, chunk_read_limit, _stream) - : std::vector{{skip_rows, num_rows}}; + : std::vector{{skip_rows, num_rows}}; } void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses_custom_row_bounds) @@ -1853,14 +1844,14 @@ void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses // respect the user bounds. It is only necessary to do this second pass if uses_custom_row_bounds // is set (if the user has specified artificial bounds). if (uses_custom_row_bounds) { - gpu::ComputePageSizes(pages, - chunks, - skip_rows, - num_rows, - false, // num_rows is already computed - false, // no need to compute string sizes - _pass_itm_data->level_type_size, - _stream); + ComputePageSizes(pages, + chunks, + skip_rows, + num_rows, + false, // num_rows is already computed + false, // no need to compute string sizes + _pass_itm_data->level_type_size, + _stream); // print_pages(pages, _stream); } @@ -1879,7 +1870,7 @@ void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses cols = &out_buf.children; // if this has a list parent, we have to get column sizes from the - // data computed during gpu::ComputePageSizes + // data computed during ComputePageSizes if (out_buf.user_data & PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT) { has_lists = true; } @@ -2014,4 +2005,4 @@ std::vector reader::impl::calculate_page_string_offsets() return col_sizes; } -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh index 2545a074a38..799d6d9fd64 100644 --- a/cpp/src/io/parquet/rle_stream.cuh +++ b/cpp/src/io/parquet/rle_stream.cuh @@ -20,7 +20,7 @@ #include #include -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { template constexpr int rle_stream_required_run_buffer_size() @@ -362,4 +362,4 @@ struct rle_stream { } }; -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index a124f352ee4..50589f23626 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -54,12 +54,9 @@ #include #include -namespace cudf { -namespace io { -namespace detail { -namespace parquet { -using namespace cudf::io::parquet; -using namespace cudf::io; +namespace cudf::io::parquet::detail { + +using namespace cudf::io::detail; struct aggregate_writer_metadata { aggregate_writer_metadata(host_span partitions, @@ -185,13 +182,13 @@ namespace { * @param compression The compression type * @return The supported Parquet compression */ -parquet::Compression to_parquet_compression(compression_type compression) +Compression to_parquet_compression(compression_type compression) { switch (compression) { case compression_type::AUTO: - case compression_type::SNAPPY: return parquet::Compression::SNAPPY; - case compression_type::ZSTD: return parquet::Compression::ZSTD; - case compression_type::NONE: return parquet::Compression::UNCOMPRESSED; + case compression_type::SNAPPY: return Compression::SNAPPY; + case compression_type::ZSTD: return Compression::ZSTD; + case compression_type::NONE: return Compression::UNCOMPRESSED; default: CUDF_FAIL("Unsupported compression type"); } } @@ -206,7 +203,7 @@ void update_chunk_encodings(std::vector& encodings, uint32_t enc_mask) { for (uint8_t enc = 0; enc < static_cast(Encoding::NUM_ENCODINGS); enc++) { auto const enc_enum = static_cast(enc); - if ((enc_mask & gpu::encoding_to_mask(enc_enum)) != 0) { encodings.push_back(enc_enum); } + if ((enc_mask & encoding_to_mask(enc_enum)) != 0) { encodings.push_back(enc_enum); } } } @@ -761,11 +758,11 @@ struct parquet_column_view { std::vector const& schema_tree, rmm::cuda_stream_view stream); - [[nodiscard]] gpu::parquet_column_device_view get_device_view(rmm::cuda_stream_view stream) const; + [[nodiscard]] parquet_column_device_view get_device_view(rmm::cuda_stream_view stream) const; [[nodiscard]] column_view cudf_column_view() const { return cudf_col; } - [[nodiscard]] parquet::Type physical_type() const { return schema_node.type; } - [[nodiscard]] parquet::ConvertedType converted_type() const { return schema_node.converted_type; } + [[nodiscard]] Type physical_type() const { return schema_node.type; } + [[nodiscard]] ConvertedType converted_type() const { return schema_node.converted_type; } std::vector const& get_path_in_schema() { return path_in_schema; } @@ -846,11 +843,11 @@ parquet_column_view::parquet_column_view(schema_tree_node const& schema_node, uint16_t max_rep_level = 0; curr_schema_node = schema_node; while (curr_schema_node.parent_idx != -1) { - if (curr_schema_node.repetition_type == parquet::REPEATED or - curr_schema_node.repetition_type == parquet::OPTIONAL) { + if (curr_schema_node.repetition_type == REPEATED or + curr_schema_node.repetition_type == OPTIONAL) { ++max_def_level; } - if (curr_schema_node.repetition_type == parquet::REPEATED) { ++max_rep_level; } + if (curr_schema_node.repetition_type == REPEATED) { ++max_rep_level; } curr_schema_node = schema_tree[curr_schema_node.parent_idx]; } CUDF_EXPECTS(max_def_level < 256, "Definition levels above 255 are not supported"); @@ -897,9 +894,9 @@ parquet_column_view::parquet_column_view(schema_tree_node const& schema_node, } } -gpu::parquet_column_device_view parquet_column_view::get_device_view(rmm::cuda_stream_view) const +parquet_column_device_view parquet_column_view::get_device_view(rmm::cuda_stream_view) const { - auto desc = gpu::parquet_column_device_view{}; // Zero out all fields + auto desc = parquet_column_device_view{}; // Zero out all fields desc.stats_dtype = schema_node.stats_dtype; desc.ts_scale = schema_node.ts_scale; @@ -931,8 +928,8 @@ gpu::parquet_column_device_view parquet_column_view::get_device_view(rmm::cuda_s * @param fragment_size Number of rows per fragment * @param stream CUDA stream used for device memory operations and kernel launches */ -void init_row_group_fragments(cudf::detail::hostdevice_2dvector& frag, - device_span col_desc, +void init_row_group_fragments(cudf::detail::hostdevice_2dvector& frag, + device_span col_desc, host_span partitions, device_span part_frag_offset, uint32_t fragment_size, @@ -940,7 +937,7 @@ void init_row_group_fragments(cudf::detail::hostdevice_2dvector frag, +void calculate_page_fragments(device_span frag, host_span frag_sizes, rmm::cuda_stream_view stream) { auto d_frag_sz = cudf::detail::make_device_uvector_async( frag_sizes, stream, rmm::mr::get_current_device_resource()); - gpu::CalculatePageFragments(frag, d_frag_sz, stream); + CalculatePageFragments(frag, d_frag_sz, stream); } /** @@ -972,13 +969,13 @@ void calculate_page_fragments(device_span frag, * @param stream CUDA stream used for device memory operations and kernel launches */ void gather_fragment_statistics(device_span frag_stats, - device_span frags, + device_span frags, bool int96_timestamps, rmm::cuda_stream_view stream) { rmm::device_uvector frag_stats_group(frag_stats.size(), stream); - gpu::InitFragmentStatistics(frag_stats_group, frags, stream); + InitFragmentStatistics(frag_stats_group, frags, stream); detail::calculate_group_statistics( frag_stats.data(), frag_stats_group.data(), frag_stats.size(), stream, int96_timestamps); stream.synchronize(); @@ -1008,8 +1005,8 @@ size_t max_compression_output_size(Compression codec, uint32_t compression_block return compress_max_output_chunk_size(to_nvcomp_compression_type(codec), compression_blocksize); } -auto init_page_sizes(hostdevice_2dvector& chunks, - device_span col_desc, +auto init_page_sizes(hostdevice_2dvector& chunks, + device_span col_desc, uint32_t num_columns, size_t max_page_size_bytes, size_type max_page_size_rows, @@ -1021,19 +1018,19 @@ auto init_page_sizes(hostdevice_2dvector& chunks, chunks.host_to_device_async(stream); // Calculate number of pages and store in respective chunks - gpu::InitEncoderPages(chunks, - {}, - {}, - {}, - col_desc, - num_columns, - max_page_size_bytes, - max_page_size_rows, - page_alignment(compression_codec), - write_v2_headers, - nullptr, - nullptr, - stream); + InitEncoderPages(chunks, + {}, + {}, + {}, + col_desc, + num_columns, + max_page_size_bytes, + max_page_size_rows, + page_alignment(compression_codec), + write_v2_headers, + nullptr, + nullptr, + stream); chunks.device_to_host_sync(stream); int num_pages = 0; @@ -1046,19 +1043,19 @@ auto init_page_sizes(hostdevice_2dvector& chunks, // Now that we know the number of pages, allocate an array to hold per page size and get it // populated cudf::detail::hostdevice_vector page_sizes(num_pages, stream); - gpu::InitEncoderPages(chunks, - {}, - page_sizes, - {}, - col_desc, - num_columns, - max_page_size_bytes, - max_page_size_rows, - page_alignment(compression_codec), - write_v2_headers, - nullptr, - nullptr, - stream); + InitEncoderPages(chunks, + {}, + page_sizes, + {}, + col_desc, + num_columns, + max_page_size_bytes, + max_page_size_rows, + page_alignment(compression_codec), + write_v2_headers, + nullptr, + nullptr, + stream); page_sizes.device_to_host_sync(stream); // Get per-page max compressed size @@ -1072,26 +1069,26 @@ auto init_page_sizes(hostdevice_2dvector& chunks, comp_page_sizes.host_to_device_async(stream); // Use per-page max compressed size to calculate chunk.compressed_size - gpu::InitEncoderPages(chunks, - {}, - {}, - comp_page_sizes, - col_desc, - num_columns, - max_page_size_bytes, - max_page_size_rows, - page_alignment(compression_codec), - write_v2_headers, - nullptr, - nullptr, - stream); + InitEncoderPages(chunks, + {}, + {}, + comp_page_sizes, + col_desc, + num_columns, + max_page_size_bytes, + max_page_size_rows, + page_alignment(compression_codec), + write_v2_headers, + nullptr, + nullptr, + stream); chunks.device_to_host_sync(stream); return comp_page_sizes; } size_t max_page_bytes(Compression compression, size_t max_page_size_bytes) { - if (compression == parquet::Compression::UNCOMPRESSED) { return max_page_size_bytes; } + if (compression == Compression::UNCOMPRESSED) { return max_page_size_bytes; } auto const ncomp_type = to_nvcomp_compression_type(compression); auto const nvcomp_limit = nvcomp::is_compression_disabled(ncomp_type) @@ -1104,9 +1101,9 @@ size_t max_page_bytes(Compression compression, size_t max_page_size_bytes) } std::pair>, std::vector>> -build_chunk_dictionaries(hostdevice_2dvector& chunks, - host_span col_desc, - device_2dspan frags, +build_chunk_dictionaries(hostdevice_2dvector& chunks, + host_span col_desc, + device_2dspan frags, Compression compression, dictionary_policy dict_policy, size_t max_dict_size, @@ -1130,7 +1127,7 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, } // Allocate slots for each chunk - std::vector> hash_maps_storage; + std::vector> hash_maps_storage; hash_maps_storage.reserve(h_chunks.size()); for (auto& chunk : h_chunks) { if (col_desc[chunk.col_desc_id].physical_type == Type::BOOLEAN || @@ -1149,8 +1146,8 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, chunks.host_to_device_async(stream); - gpu::initialize_chunk_hash_maps(chunks.device_view().flat_view(), stream); - gpu::populate_chunk_hash_maps(frags, stream); + initialize_chunk_hash_maps(chunks.device_view().flat_view(), stream); + populate_chunk_hash_maps(frags, stream); chunks.device_to_host_sync(stream); @@ -1197,8 +1194,8 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, chunk.dict_index = inserted_dict_index.data(); } chunks.host_to_device_async(stream); - gpu::collect_map_entries(chunks.device_view().flat_view(), stream); - gpu::get_dictionary_indices(frags, stream); + collect_map_entries(chunks.device_view().flat_view(), stream); + get_dictionary_indices(frags, stream); return std::pair(std::move(dict_data), std::move(dict_index)); } @@ -1221,9 +1218,9 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, * @param write_v2_headers True if version 2 page headers are to be written * @param stream CUDA stream used for device memory operations and kernel launches */ -void init_encoder_pages(hostdevice_2dvector& chunks, - device_span col_desc, - device_span pages, +void init_encoder_pages(hostdevice_2dvector& chunks, + device_span col_desc, + device_span pages, cudf::detail::hostdevice_vector& comp_page_sizes, statistics_chunk* page_stats, statistics_chunk* frag_stats, @@ -1286,8 +1283,8 @@ void init_encoder_pages(hostdevice_2dvector& chunks, * @param write_v2_headers True if V2 page headers should be written * @param stream CUDA stream used for device memory operations and kernel launches */ -void encode_pages(hostdevice_2dvector& chunks, - device_span pages, +void encode_pages(hostdevice_2dvector& chunks, + device_span pages, uint32_t pages_in_batch, uint32_t first_page_in_batch, uint32_t rowgroups_in_batch, @@ -1308,8 +1305,7 @@ void encode_pages(hostdevice_2dvector& chunks, ? device_span(page_stats + first_page_in_batch, pages_in_batch) : device_span(); - uint32_t max_comp_pages = - (compression != parquet::Compression::UNCOMPRESSED) ? pages_in_batch : 0; + uint32_t max_comp_pages = (compression != Compression::UNCOMPRESSED) ? pages_in_batch : 0; rmm::device_uvector> comp_in(max_comp_pages, stream); rmm::device_uvector> comp_out(max_comp_pages, stream); @@ -1319,9 +1315,9 @@ void encode_pages(hostdevice_2dvector& chunks, comp_res.end(), compression_result{0, compression_status::FAILURE}); - gpu::EncodePages(batch_pages, write_v2_headers, comp_in, comp_out, comp_res, stream); + EncodePages(batch_pages, write_v2_headers, comp_in, comp_out, comp_res, stream); switch (compression) { - case parquet::Compression::SNAPPY: + case Compression::SNAPPY: if (nvcomp::is_compression_disabled(nvcomp::compression_type::SNAPPY)) { gpu_snap(comp_in, comp_out, comp_res, stream); } else { @@ -1329,7 +1325,7 @@ void encode_pages(hostdevice_2dvector& chunks, nvcomp::compression_type::SNAPPY, comp_in, comp_out, comp_res, stream); } break; - case parquet::Compression::ZSTD: { + case Compression::ZSTD: { if (auto const reason = nvcomp::is_compression_disabled(nvcomp::compression_type::ZSTD); reason) { CUDF_FAIL("Compression error: " + reason.value()); @@ -1338,7 +1334,7 @@ void encode_pages(hostdevice_2dvector& chunks, break; } - case parquet::Compression::UNCOMPRESSED: break; + case Compression::UNCOMPRESSED: break; default: CUDF_FAIL("invalid compression type"); } @@ -1378,7 +1374,7 @@ void encode_pages(hostdevice_2dvector& chunks, * @param column_index_truncate_length maximum length of min or max values in column index, in bytes * @return Computed buffer size needed to encode the column index */ -size_t column_index_buffer_size(gpu::EncColumnChunk* ck, int32_t column_index_truncate_length) +size_t column_index_buffer_size(EncColumnChunk* ck, int32_t column_index_truncate_length) { // encoding the column index for a given chunk requires: // each list (4 of them) requires 6 bytes of overhead @@ -1499,8 +1495,8 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, std::vector this_table_schema(schema_tree.begin(), schema_tree.end()); // Initialize column description - cudf::detail::hostdevice_vector col_desc(parquet_columns.size(), - stream); + cudf::detail::hostdevice_vector col_desc(parquet_columns.size(), + stream); std::transform( parquet_columns.begin(), parquet_columns.end(), col_desc.host_ptr(), [&](auto const& pcol) { return pcol.get_device_view(stream); @@ -1576,7 +1572,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, auto d_part_frag_offset = cudf::detail::make_device_uvector_async( part_frag_offset, stream, rmm::mr::get_current_device_resource()); - cudf::detail::hostdevice_2dvector row_group_fragments( + cudf::detail::hostdevice_2dvector row_group_fragments( num_columns, num_fragments, stream); // Create table_device_view so that corresponding column_device_view data @@ -1588,7 +1584,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, if (num_fragments != 0) { // Move column info to device col_desc.host_to_device_async(stream); - leaf_column_views = create_leaf_column_device_views( + leaf_column_views = create_leaf_column_device_views( col_desc, *parent_column_table_device_view, stream); init_row_group_fragments(row_group_fragments, @@ -1662,7 +1658,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, // Initialize row groups and column chunks auto const num_chunks = num_rowgroups * num_columns; - hostdevice_2dvector chunks(num_rowgroups, num_columns, stream); + hostdevice_2dvector chunks(num_rowgroups, num_columns, stream); // total fragments per column (in case they are non-uniform) std::vector frags_per_column(num_columns, 0); @@ -1678,7 +1674,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, row_group.total_byte_size = 0; row_group.columns.resize(num_columns); for (int c = 0; c < num_columns; c++) { - gpu::EncColumnChunk& ck = chunks[r + first_rg_in_part[p]][c]; + EncColumnChunk& ck = chunks[r + first_rg_in_part[p]][c]; ck = {}; ck.col_desc = col_desc.device_ptr() + c; @@ -1700,7 +1696,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, return l + r.num_values; }); ck.plain_data_size = std::accumulate( - chunk_fragments.begin(), chunk_fragments.end(), 0, [](int sum, gpu::PageFragment frag) { + chunk_fragments.begin(), chunk_fragments.end(), 0, [](int sum, PageFragment frag) { return sum + frag.fragment_data_size; }); auto& column_chunk_meta = row_group.columns[c].meta_data; @@ -1731,7 +1727,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, frags_per_column.empty() ? 0 : frag_offsets.back() + frags_per_column.back(); rmm::device_uvector frag_stats(0, stream); - cudf::detail::hostdevice_vector page_fragments(total_frags, stream); + cudf::detail::hostdevice_vector page_fragments(total_frags, stream); // update fragments and/or prepare for fragment statistics calculation if necessary if (total_frags != 0) { @@ -1749,9 +1745,9 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, auto const& row_group = agg_meta->file(p).row_groups[global_r]; uint32_t const fragments_in_chunk = util::div_rounding_up_unsafe(row_group.num_rows, frag_size); - gpu::EncColumnChunk& ck = chunks[r + first_rg_in_part[p]][c]; - ck.fragments = page_fragments.device_ptr(frag_offset); - ck.first_fragment = frag_offset; + EncColumnChunk& ck = chunks[r + first_rg_in_part[p]][c]; + ck.fragments = page_fragments.device_ptr(frag_offset); + ck.first_fragment = frag_offset; // update the chunk pointer here for each fragment in chunk.fragments for (uint32_t i = 0; i < fragments_in_chunk; i++) { @@ -1817,8 +1813,8 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, size_t comp_rowgroup_size = 0; if (r < num_rowgroups) { for (int i = 0; i < num_columns; i++) { - gpu::EncColumnChunk* ck = &chunks[r][i]; - ck->first_page = num_pages; + EncColumnChunk* ck = &chunks[r][i]; + ck->first_page = num_pages; num_pages += ck->num_pages; pages_in_batch += ck->num_pages; rowgroup_size += ck->bfr_size; @@ -1850,7 +1846,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, } // Clear compressed buffer size if compression has been turned off - if (compression == parquet::Compression::UNCOMPRESSED) { max_comp_bfr_size = 0; } + if (compression == Compression::UNCOMPRESSED) { max_comp_bfr_size = 0; } // Initialize data pointers in batch uint32_t const num_stats_bfr = @@ -1864,7 +1860,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, stream); rmm::device_buffer col_idx_bfr(column_index_bfr_size, stream); - rmm::device_uvector pages(num_pages, stream); + rmm::device_uvector pages(num_pages, stream); // This contains stats for both the pages and the rowgroups. TODO: make them separate. rmm::device_uvector page_stats(num_stats_bfr, stream); @@ -1874,10 +1870,10 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, auto bfr_c = static_cast(comp_bfr.data()); for (auto j = 0; j < batch_list[b]; j++, r++) { for (auto i = 0; i < num_columns; i++) { - gpu::EncColumnChunk& ck = chunks[r][i]; - ck.uncompressed_bfr = bfr; - ck.compressed_bfr = bfr_c; - ck.column_index_blob = bfr_i; + EncColumnChunk& ck = chunks[r][i]; + ck.uncompressed_bfr = bfr; + ck.compressed_bfr = bfr_c; + ck.column_index_blob = bfr_i; bfr += ck.bfr_size; bfr_c += ck.compressed_size; if (stats_granularity == statistics_freq::STATISTICS_COLUMN) { @@ -1960,7 +1956,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, if (ck.ck_stat_size != 0) { std::vector const stats_blob = cudf::detail::make_std_vector_sync( device_span(dev_bfr, ck.ck_stat_size), stream); - cudf::io::parquet::CompactProtocolReader cp(stats_blob.data(), stats_blob.size()); + CompactProtocolReader cp(stats_blob.data(), stats_blob.size()); cp.read(&column_chunk_meta.statistics); need_sync = true; } @@ -2142,8 +2138,8 @@ void writer::impl::write(table_view const& input, std::vector co void writer::impl::write_parquet_data_to_sink( std::unique_ptr& updated_agg_meta, - device_span pages, - host_2dspan chunks, + device_span pages, + host_2dspan chunks, host_span global_rowgroup_base, host_span first_rg_in_part, host_span batch_list, @@ -2209,7 +2205,7 @@ void writer::impl::write_parquet_data_to_sink( int const global_r = global_rowgroup_base[p] + r - first_rg_in_part[p]; auto const& row_group = _agg_meta->file(p).row_groups[global_r]; for (std::size_t i = 0; i < num_columns; i++) { - gpu::EncColumnChunk const& ck = chunks[r][i]; + EncColumnChunk const& ck = chunks[r][i]; auto const& column_chunk_meta = row_group.columns[i].meta_data; // start transfer of the column index @@ -2392,7 +2388,4 @@ std::unique_ptr> writer::merge_row_group_metadata( return std::make_unique>(std::move(output)); } -} // namespace parquet -} // namespace detail -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index 89ef85ba2bd..1d27a8400c8 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -38,15 +38,11 @@ #include #include -namespace cudf { -namespace io { -namespace detail { -namespace parquet { +namespace cudf::io::parquet::detail { + // Forward internal classes struct aggregate_writer_metadata; -using namespace cudf::io::parquet; -using namespace cudf::io; using cudf::detail::device_2dspan; using cudf::detail::host_2dspan; using cudf::detail::hostdevice_2dvector; @@ -66,7 +62,7 @@ class writer::impl { */ explicit impl(std::vector> sinks, parquet_writer_options const& options, - single_write_mode mode, + cudf::io::detail::single_write_mode mode, rmm::cuda_stream_view stream); /** @@ -79,7 +75,7 @@ class writer::impl { */ explicit impl(std::vector> sinks, chunked_parquet_writer_options const& options, - single_write_mode mode, + cudf::io::detail::single_write_mode mode, rmm::cuda_stream_view stream); /** @@ -139,8 +135,8 @@ class writer::impl { * @param[out] bounce_buffer Temporary host output buffer */ void write_parquet_data_to_sink(std::unique_ptr& updated_agg_meta, - device_span pages, - host_2dspan chunks, + device_span pages, + host_2dspan chunks, host_span global_rowgroup_base, host_span first_rg_in_part, host_span batch_list, @@ -164,9 +160,10 @@ class writer::impl { bool const _write_v2_headers; int32_t const _column_index_truncate_length; std::vector> const _kv_meta; // Optional user metadata. - single_write_mode const _single_write_mode; // Special parameter only used by `write()` to - // indicate that we are guaranteeing a single table - // write. This enables some internal optimizations. + cudf::io::detail::single_write_mode const + _single_write_mode; // Special parameter only used by `write()` to + // indicate that we are guaranteeing a single table + // write. This enables some internal optimizations. std::vector> const _out_sink; // Internal states, filled during `write()` and written to sink during `write` and `close()`. @@ -180,7 +177,4 @@ class writer::impl { bool _closed = false; // To track if the output has been written to sink. }; -} // namespace parquet -} // namespace detail -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 73c946a5feb..3e5d7033e60 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -200,29 +200,30 @@ std::unique_ptr make_parquet_list_list_col( // of the file to populate the FileMetaData pointed to by file_meta_data. // throws cudf::logic_error if the file or metadata is invalid. void read_footer(std::unique_ptr const& source, - cudf::io::parquet::FileMetaData* file_meta_data) + cudf::io::parquet::detail::FileMetaData* file_meta_data) { - constexpr auto header_len = sizeof(cudf::io::parquet::file_header_s); - constexpr auto ender_len = sizeof(cudf::io::parquet::file_ender_s); + constexpr auto header_len = sizeof(cudf::io::parquet::detail::file_header_s); + constexpr auto ender_len = sizeof(cudf::io::parquet::detail::file_ender_s); auto const len = source->size(); auto const header_buffer = source->host_read(0, header_len); auto const header = - reinterpret_cast(header_buffer->data()); + reinterpret_cast(header_buffer->data()); auto const ender_buffer = source->host_read(len - ender_len, ender_len); - auto const ender = reinterpret_cast(ender_buffer->data()); + auto const ender = + reinterpret_cast(ender_buffer->data()); // checks for valid header, footer, and file length ASSERT_GT(len, header_len + ender_len); - ASSERT_TRUE(header->magic == cudf::io::parquet::parquet_magic && - ender->magic == cudf::io::parquet::parquet_magic); + ASSERT_TRUE(header->magic == cudf::io::parquet::detail::parquet_magic && + ender->magic == cudf::io::parquet::detail::parquet_magic); ASSERT_TRUE(ender->footer_len != 0 && ender->footer_len <= (len - header_len - ender_len)); // parquet files end with 4-byte footer_length and 4-byte magic == "PAR1" // seek backwards from the end of the file (footer_length + 8 bytes of ender) auto const footer_buffer = source->host_read(len - ender->footer_len - ender_len, ender->footer_len); - cudf::io::parquet::CompactProtocolReader cp(footer_buffer->data(), ender->footer_len); + cudf::io::parquet::detail::CompactProtocolReader cp(footer_buffer->data(), ender->footer_len); // returns true on success bool res = cp.read(file_meta_data); @@ -233,14 +234,14 @@ void read_footer(std::unique_ptr const& source, // this assumes the data is uncompressed. // throws cudf::logic_error if the page_loc data is invalid. int read_dict_bits(std::unique_ptr const& source, - cudf::io::parquet::PageLocation const& page_loc) + cudf::io::parquet::detail::PageLocation const& page_loc) { CUDF_EXPECTS(page_loc.offset > 0, "Cannot find page header"); CUDF_EXPECTS(page_loc.compressed_page_size > 0, "Invalid page header length"); - cudf::io::parquet::PageHeader page_hdr; + cudf::io::parquet::detail::PageHeader page_hdr; auto const page_buf = source->host_read(page_loc.offset, page_loc.compressed_page_size); - cudf::io::parquet::CompactProtocolReader cp(page_buf->data(), page_buf->size()); + cudf::io::parquet::detail::CompactProtocolReader cp(page_buf->data(), page_buf->size()); bool res = cp.read(&page_hdr); CUDF_EXPECTS(res, "Cannot parse page header"); @@ -252,15 +253,16 @@ int read_dict_bits(std::unique_ptr const& source, // read column index from datasource at location indicated by chunk, // parse and return as a ColumnIndex struct. // throws cudf::logic_error if the chunk data is invalid. -cudf::io::parquet::ColumnIndex read_column_index( - std::unique_ptr const& source, cudf::io::parquet::ColumnChunk const& chunk) +cudf::io::parquet::detail::ColumnIndex read_column_index( + std::unique_ptr const& source, + cudf::io::parquet::detail::ColumnChunk const& chunk) { CUDF_EXPECTS(chunk.column_index_offset > 0, "Cannot find column index"); CUDF_EXPECTS(chunk.column_index_length > 0, "Invalid column index length"); - cudf::io::parquet::ColumnIndex colidx; + cudf::io::parquet::detail::ColumnIndex colidx; auto const ci_buf = source->host_read(chunk.column_index_offset, chunk.column_index_length); - cudf::io::parquet::CompactProtocolReader cp(ci_buf->data(), ci_buf->size()); + cudf::io::parquet::detail::CompactProtocolReader cp(ci_buf->data(), ci_buf->size()); bool res = cp.read(&colidx); CUDF_EXPECTS(res, "Cannot parse column index"); return colidx; @@ -269,22 +271,24 @@ cudf::io::parquet::ColumnIndex read_column_index( // read offset index from datasource at location indicated by chunk, // parse and return as an OffsetIndex struct. // throws cudf::logic_error if the chunk data is invalid. -cudf::io::parquet::OffsetIndex read_offset_index( - std::unique_ptr const& source, cudf::io::parquet::ColumnChunk const& chunk) +cudf::io::parquet::detail::OffsetIndex read_offset_index( + std::unique_ptr const& source, + cudf::io::parquet::detail::ColumnChunk const& chunk) { CUDF_EXPECTS(chunk.offset_index_offset > 0, "Cannot find offset index"); CUDF_EXPECTS(chunk.offset_index_length > 0, "Invalid offset index length"); - cudf::io::parquet::OffsetIndex offidx; + cudf::io::parquet::detail::OffsetIndex offidx; auto const oi_buf = source->host_read(chunk.offset_index_offset, chunk.offset_index_length); - cudf::io::parquet::CompactProtocolReader cp(oi_buf->data(), oi_buf->size()); + cudf::io::parquet::detail::CompactProtocolReader cp(oi_buf->data(), oi_buf->size()); bool res = cp.read(&offidx); CUDF_EXPECTS(res, "Cannot parse offset index"); return offidx; } // Return as a Statistics from the column chunk -cudf::io::parquet::Statistics const& get_statistics(cudf::io::parquet::ColumnChunk const& chunk) +cudf::io::parquet::detail::Statistics const& get_statistics( + cudf::io::parquet::detail::ColumnChunk const& chunk) { return chunk.meta_data.statistics; } @@ -292,15 +296,16 @@ cudf::io::parquet::Statistics const& get_statistics(cudf::io::parquet::ColumnChu // read page header from datasource at location indicated by page_loc, // parse and return as a PageHeader struct. // throws cudf::logic_error if the page_loc data is invalid. -cudf::io::parquet::PageHeader read_page_header(std::unique_ptr const& source, - cudf::io::parquet::PageLocation const& page_loc) +cudf::io::parquet::detail::PageHeader read_page_header( + std::unique_ptr const& source, + cudf::io::parquet::detail::PageLocation const& page_loc) { CUDF_EXPECTS(page_loc.offset > 0, "Cannot find page header"); CUDF_EXPECTS(page_loc.compressed_page_size > 0, "Invalid page header length"); - cudf::io::parquet::PageHeader page_hdr; + cudf::io::parquet::detail::PageHeader page_hdr; auto const page_buf = source->host_read(page_loc.offset, page_loc.compressed_page_size); - cudf::io::parquet::CompactProtocolReader cp(page_buf->data(), page_buf->size()); + cudf::io::parquet::detail::CompactProtocolReader cp(page_buf->data(), page_buf->size()); bool res = cp.read(&page_hdr); CUDF_EXPECTS(res, "Cannot parse page header"); return page_hdr; @@ -3686,7 +3691,7 @@ TEST_F(ParquetWriterTest, CheckPageRows) // check first page header and make sure it has only page_rows values auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); ASSERT_GT(fmd.row_groups.size(), 0); @@ -3697,7 +3702,7 @@ TEST_F(ParquetWriterTest, CheckPageRows) // read first data page header. sizeof(PageHeader) is not exact, but the thrift encoded // version should be smaller than size of the struct. auto const ph = read_page_header( - source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::PageHeader), 0}); + source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::detail::PageHeader), 0}); EXPECT_EQ(ph.data_page_header.num_values, page_rows); } @@ -3722,7 +3727,7 @@ TEST_F(ParquetWriterTest, CheckPageRowsAdjusted) // check first page header and make sure it has only page_rows values auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); ASSERT_GT(fmd.row_groups.size(), 0); @@ -3733,7 +3738,7 @@ TEST_F(ParquetWriterTest, CheckPageRowsAdjusted) // read first data page header. sizeof(PageHeader) is not exact, but the thrift encoded // version should be smaller than size of the struct. auto const ph = read_page_header( - source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::PageHeader), 0}); + source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::detail::PageHeader), 0}); EXPECT_LE(ph.data_page_header.num_values, rows_per_page); } @@ -3759,7 +3764,7 @@ TEST_F(ParquetWriterTest, CheckPageRowsTooSmall) // check that file is written correctly when rows/page < fragment size auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); ASSERT_TRUE(fmd.row_groups.size() > 0); @@ -3770,7 +3775,7 @@ TEST_F(ParquetWriterTest, CheckPageRowsTooSmall) // read first data page header. sizeof(PageHeader) is not exact, but the thrift encoded // version should be smaller than size of the struct. auto const ph = read_page_header( - source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::PageHeader), 0}); + source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::detail::PageHeader), 0}); // there should be only one page since the fragment size is larger than rows_per_page EXPECT_EQ(ph.data_page_header.num_values, num_rows); @@ -3798,7 +3803,7 @@ TEST_F(ParquetWriterTest, Decimal128Stats) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4031,7 +4036,7 @@ TYPED_TEST(ParquetWriterComparableTypeTest, ThreeColumnSorted) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); ASSERT_GT(fmd.row_groups.size(), 0); @@ -4041,10 +4046,10 @@ TYPED_TEST(ParquetWriterComparableTypeTest, ThreeColumnSorted) // now check that the boundary order for chunk 1 is ascending, // chunk 2 is descending, and chunk 3 is unordered - cudf::io::parquet::BoundaryOrder expected_orders[] = { - cudf::io::parquet::BoundaryOrder::ASCENDING, - cudf::io::parquet::BoundaryOrder::DESCENDING, - cudf::io::parquet::BoundaryOrder::UNORDERED}; + cudf::io::parquet::detail::BoundaryOrder expected_orders[] = { + cudf::io::parquet::detail::BoundaryOrder::ASCENDING, + cudf::io::parquet::detail::BoundaryOrder::DESCENDING, + cudf::io::parquet::detail::BoundaryOrder::UNORDERED}; for (std::size_t i = 0; i < columns.size(); i++) { auto const ci = read_column_index(source, columns[i]); @@ -4067,15 +4072,15 @@ int32_t compare(T& v1, T& v2) // 1 if v1 > v2. int32_t compare_binary(std::vector const& v1, std::vector const& v2, - cudf::io::parquet::Type ptype, - cudf::io::parquet::ConvertedType ctype) + cudf::io::parquet::detail::Type ptype, + cudf::io::parquet::detail::ConvertedType ctype) { switch (ptype) { - case cudf::io::parquet::INT32: + case cudf::io::parquet::detail::INT32: switch (ctype) { - case cudf::io::parquet::UINT_8: - case cudf::io::parquet::UINT_16: - case cudf::io::parquet::UINT_32: + case cudf::io::parquet::detail::UINT_8: + case cudf::io::parquet::detail::UINT_16: + case cudf::io::parquet::detail::UINT_32: return compare(*(reinterpret_cast(v1.data())), *(reinterpret_cast(v2.data()))); default: @@ -4083,23 +4088,23 @@ int32_t compare_binary(std::vector const& v1, *(reinterpret_cast(v2.data()))); } - case cudf::io::parquet::INT64: - if (ctype == cudf::io::parquet::UINT_64) { + case cudf::io::parquet::detail::INT64: + if (ctype == cudf::io::parquet::detail::UINT_64) { return compare(*(reinterpret_cast(v1.data())), *(reinterpret_cast(v2.data()))); } return compare(*(reinterpret_cast(v1.data())), *(reinterpret_cast(v2.data()))); - case cudf::io::parquet::FLOAT: + case cudf::io::parquet::detail::FLOAT: return compare(*(reinterpret_cast(v1.data())), *(reinterpret_cast(v2.data()))); - case cudf::io::parquet::DOUBLE: + case cudf::io::parquet::detail::DOUBLE: return compare(*(reinterpret_cast(v1.data())), *(reinterpret_cast(v2.data()))); - case cudf::io::parquet::BYTE_ARRAY: { + case cudf::io::parquet::detail::BYTE_ARRAY: { int32_t v1sz = v1.size(); int32_t v2sz = v2.size(); int32_t ret = memcmp(v1.data(), v2.data(), std::min(v1sz, v2sz)); @@ -4142,7 +4147,7 @@ TEST_P(ParquetV2Test, LargeColumnIndex) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4164,10 +4169,10 @@ TEST_P(ParquetV2Test, LargeColumnIndex) TEST_P(ParquetV2Test, CheckColumnOffsetIndex) { - constexpr auto num_rows = 100000; - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + constexpr auto num_rows = 100000; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; // fixed length strings auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { @@ -4210,7 +4215,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndex) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4255,10 +4260,10 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndex) TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) { - constexpr auto num_rows = 100000; - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + constexpr auto num_rows = 100000; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; // fixed length strings auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { @@ -4311,7 +4316,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4362,10 +4367,10 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) { - constexpr auto num_rows = 100000; - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + constexpr auto num_rows = 100000; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; // fixed length strings auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { @@ -4403,7 +4408,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4458,9 +4463,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) TEST_P(ParquetV2Test, CheckColumnOffsetIndexStruct) { - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; auto c0 = testdata::ascending(); @@ -4495,7 +4500,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStruct) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4542,9 +4547,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStruct) TEST_P(ParquetV2Test, CheckColumnOffsetIndexStructNulls) { - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; auto validity2 = cudf::detail::make_counting_transform_iterator(0, [](cudf::size_type i) { return i % 2; }); @@ -4586,7 +4591,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStructNulls) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4616,9 +4621,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStructNulls) TEST_P(ParquetV2Test, CheckColumnIndexListWithNulls) { - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; using cudf::test::iterators::null_at; using cudf::test::iterators::nulls_at; @@ -4711,7 +4716,7 @@ TEST_P(ParquetV2Test, CheckColumnIndexListWithNulls) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4812,7 +4817,7 @@ TEST_F(ParquetWriterTest, CheckColumnIndexTruncation) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4870,7 +4875,7 @@ TEST_F(ParquetWriterTest, BinaryColumnIndexTruncation) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -5030,10 +5035,10 @@ TEST_F(ParquetReaderTest, NestedByteArray) cudf::io::write_parquet(out_opts); auto source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); - EXPECT_EQ(fmd.schema[5].type, cudf::io::parquet::Type::BYTE_ARRAY); + EXPECT_EQ(fmd.schema[5].type, cudf::io::parquet::detail::Type::BYTE_ARRAY); std::vector md{ {}, @@ -5081,12 +5086,12 @@ TEST_F(ParquetWriterTest, ByteArrayStats) auto result = cudf::io::read_parquet(in_opts); auto source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); - EXPECT_EQ(fmd.schema[1].type, cudf::io::parquet::Type::BYTE_ARRAY); - EXPECT_EQ(fmd.schema[2].type, cudf::io::parquet::Type::BYTE_ARRAY); + EXPECT_EQ(fmd.schema[1].type, cudf::io::parquet::detail::Type::BYTE_ARRAY); + EXPECT_EQ(fmd.schema[2].type, cudf::io::parquet::detail::Type::BYTE_ARRAY); auto const stats0 = get_statistics(fmd.row_groups[0].columns[0]); auto const stats1 = get_statistics(fmd.row_groups[0].columns[1]); @@ -5137,9 +5142,9 @@ TEST_F(ParquetReaderTest, StructByteArray) TEST_F(ParquetReaderTest, NestingOptimizationTest) { - // test nesting levels > cudf::io::parquet::gpu::max_cacheable_nesting_decode_info deep. + // test nesting levels > cudf::io::parquet::detail::max_cacheable_nesting_decode_info deep. constexpr cudf::size_type num_nesting_levels = 16; - static_assert(num_nesting_levels > cudf::io::parquet::gpu::max_cacheable_nesting_decode_info); + static_assert(num_nesting_levels > cudf::io::parquet::detail::max_cacheable_nesting_decode_info); constexpr cudf::size_type rows_per_level = 2; constexpr cudf::size_type num_values = (1 << num_nesting_levels) * rows_per_level; @@ -5206,13 +5211,13 @@ TEST_F(ParquetWriterTest, SingleValueDictionaryTest) // make sure dictionary was used auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto used_dict = [&fmd]() { for (auto enc : fmd.row_groups[0].columns[0].meta_data.encodings) { - if (enc == cudf::io::parquet::Encoding::PLAIN_DICTIONARY or - enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) { + if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { return true; } } @@ -5252,13 +5257,13 @@ TEST_F(ParquetWriterTest, DictionaryNeverTest) // make sure dictionary was not used auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto used_dict = [&fmd]() { for (auto enc : fmd.row_groups[0].columns[0].meta_data.encodings) { - if (enc == cudf::io::parquet::Encoding::PLAIN_DICTIONARY or - enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) { + if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { return true; } } @@ -5303,13 +5308,13 @@ TEST_F(ParquetWriterTest, DictionaryAdaptiveTest) // make sure dictionary was used as expected. col0 should use one, // col1 should not. auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto used_dict = [&fmd](int col) { for (auto enc : fmd.row_groups[0].columns[col].meta_data.encodings) { - if (enc == cudf::io::parquet::Encoding::PLAIN_DICTIONARY or - enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) { + if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { return true; } } @@ -5354,13 +5359,13 @@ TEST_F(ParquetWriterTest, DictionaryAlwaysTest) // make sure dictionary was used for both columns auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto used_dict = [&fmd](int col) { for (auto enc : fmd.row_groups[0].columns[col].meta_data.encodings) { - if (enc == cudf::io::parquet::Encoding::PLAIN_DICTIONARY or - enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) { + if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { return true; } } @@ -5438,13 +5443,13 @@ TEST_P(ParquetSizedTest, DictionaryTest) // make sure dictionary was used auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto used_dict = [&fmd]() { for (auto enc : fmd.row_groups[0].columns[0].meta_data.encodings) { - if (enc == cudf::io::parquet::Encoding::PLAIN_DICTIONARY or - enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) { + if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { return true; } } @@ -6664,7 +6669,7 @@ TEST_F(ParquetWriterTest, PreserveNullability) TEST_P(ParquetV2Test, CheckEncodings) { - using cudf::io::parquet::Encoding; + using cudf::io::parquet::detail::Encoding; constexpr auto num_rows = 100'000; auto const is_v2 = GetParam(); @@ -6697,7 +6702,7 @@ TEST_P(ParquetV2Test, CheckEncodings) }; auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto const& chunk0_enc = fmd.row_groups[0].columns[0].meta_data.encodings; From e345620ddaf5d8ac87e2428a84508ecfec2ba4f8 Mon Sep 17 00:00:00 2001 From: Suraj Aralihalli Date: Mon, 9 Oct 2023 10:13:25 -0700 Subject: [PATCH 02/11] Add stream parameter to List Manipulation and Operations APIs (#14248) I have organized the public List APIs into **three** distinct categories based on their functionality, simplifying the PRs for easier and shorter reviews. This particular PR introduces the `stream` parameter only to the `List Manipulation and Operations APIs`, which fall under `Section 1`. See next comment for other sections. 1. List Manipulation and Operations (`combine.hpp`, `contains.hpp`, `count_elements.hpp`) ``` concatenate_rows concatenate_list_elements contains_nulls contains - search_keys contains - search_key index_of - search_keys index_of - search_key count_elements ``` This PR addresses issues in the following files: 1. **column_wrapper.hpp**: - Corrects the improper passing of the stream value in the `make_lists_column` function. - Enables the missing cast to `lists_column_view`. - Substitutes `copy_bitmask` with `cudf::detail::copy_bitmask` to include the stream parameter. 2. **concatenate.cu:** - Substitutes `create_null_mask` with `cudf::detail::create_null_mask` to include the stream parameter. Authors: - Suraj Aralihalli (https://github.com/SurajAralihalli) - Vyas Ramasubramani (https://github.com/vyasr) - Yunsong Wang (https://github.com/PointKernel) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cudf/pull/14248 --- cpp/include/cudf/lists/combine.hpp | 4 + cpp/include/cudf/lists/contains.hpp | 14 ++- cpp/include/cudf/lists/count_elements.hpp | 2 + cpp/include/cudf_test/column_wrapper.hpp | 45 +++++++--- .../combine/concatenate_list_elements.cu | 3 +- cpp/src/lists/combine/concatenate_rows.cu | 3 +- cpp/src/lists/contains.cu | 37 ++++---- cpp/src/lists/copying/concatenate.cu | 5 +- cpp/src/lists/count_elements.cu | 3 +- cpp/tests/CMakeLists.txt | 1 + cpp/tests/streams/lists_test.cpp | 87 +++++++++++++++++++ 11 files changed, 169 insertions(+), 35 deletions(-) create mode 100644 cpp/tests/streams/lists_test.cpp diff --git a/cpp/include/cudf/lists/combine.hpp b/cpp/include/cudf/lists/combine.hpp index 0bc76828fc3..0d9c1c157eb 100644 --- a/cpp/include/cudf/lists/combine.hpp +++ b/cpp/include/cudf/lists/combine.hpp @@ -57,6 +57,7 @@ enum class concatenate_null_policy { IGNORE, NULLIFY_OUTPUT_ROW }; * @param input Table of lists to be concatenated. * @param null_policy The parameter to specify whether a null list element will be ignored from * concatenation, or any concatenation involving a null element will result in a null list. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * @return A new column in which each row is a list resulted from concatenating all list elements in * the corresponding row of the input table. @@ -64,6 +65,7 @@ enum class concatenate_null_policy { IGNORE, NULLIFY_OUTPUT_ROW }; std::unique_ptr concatenate_rows( table_view const& input, concatenate_null_policy null_policy = concatenate_null_policy::IGNORE, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -86,6 +88,7 @@ std::unique_ptr concatenate_rows( * @param input The lists column containing lists of list elements to concatenate. * @param null_policy The parameter to specify whether a null list element will be ignored from * concatenation, or any concatenation involving a null element will result in a null list. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * @return A new column in which each row is a list resulted from concatenating all list elements in * the corresponding row of the input lists column. @@ -93,6 +96,7 @@ std::unique_ptr concatenate_rows( std::unique_ptr concatenate_list_elements( column_view const& input, concatenate_null_policy null_policy = concatenate_null_policy::IGNORE, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/lists/contains.hpp b/cpp/include/cudf/lists/contains.hpp index 21c2ca1d64e..7cf67ec9205 100644 --- a/cpp/include/cudf/lists/contains.hpp +++ b/cpp/include/cudf/lists/contains.hpp @@ -42,12 +42,14 @@ namespace lists { * * @param lists Lists column whose `n` rows are to be searched * @param search_key The scalar key to be looked up in each list row + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory * @return BOOL8 column of `n` rows with the result of the lookup */ std::unique_ptr contains( cudf::lists_column_view const& lists, cudf::scalar const& search_key, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -63,13 +65,15 @@ std::unique_ptr contains( * 2. The list row `lists[i]` is null * * @param lists Lists column whose `n` rows are to be searched - * @param search_keys Column of elements to be looked up in each list row + * @param search_keys Column of elements to be looked up in each list row. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory * @return BOOL8 column of `n` rows with the result of the lookup */ std::unique_ptr contains( cudf::lists_column_view const& lists, cudf::column_view const& search_keys, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -84,12 +88,14 @@ std::unique_ptr contains( * A row with an empty list will always return false. * Nulls inside non-null nested elements (such as lists or structs) are not considered. * - * @param lists Lists column whose `n` rows are to be searched + * @param lists Lists column whose `n` rows are to be searched. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory * @return BOOL8 column of `n` rows with the result of the lookup */ std::unique_ptr contains_nulls( cudf::lists_column_view const& lists, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -125,6 +131,7 @@ enum class duplicate_find_option : int32_t { * @param search_key The scalar key to be looked up in each list row * @param find_option Whether to return the position of the first match (`FIND_FIRST`) or * last (`FIND_LAST`) + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return column of `n` rows with the location of the `search_key` */ @@ -132,6 +139,7 @@ std::unique_ptr index_of( cudf::lists_column_view const& lists, cudf::scalar const& search_key, duplicate_find_option find_option = duplicate_find_option::FIND_FIRST, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -160,6 +168,7 @@ std::unique_ptr index_of( * `lists` * @param find_option Whether to return the position of the first match (`FIND_FIRST`) or * last (`FIND_LAST`) + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return column of `n` rows with the location of the `search_key` */ @@ -167,6 +176,7 @@ std::unique_ptr index_of( cudf::lists_column_view const& lists, cudf::column_view const& search_keys, duplicate_find_option find_option = duplicate_find_option::FIND_FIRST, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/lists/count_elements.hpp b/cpp/include/cudf/lists/count_elements.hpp index 552ba058b93..e4bd0dca9ae 100644 --- a/cpp/include/cudf/lists/count_elements.hpp +++ b/cpp/include/cudf/lists/count_elements.hpp @@ -45,11 +45,13 @@ namespace lists { * in the output column. * * @param input Input lists column + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New column with the number of elements for each row */ std::unique_ptr count_elements( lists_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of lists_elements group diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index c0932b81dc3..e94dfea9dcf 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -1281,6 +1282,11 @@ class dictionary_column_wrapper : public detail::column_wrapper { template class lists_column_wrapper : public detail::column_wrapper { public: + /** + * @brief Cast to lists_column_view + */ + operator lists_column_view() const { return cudf::lists_column_view{wrapped->view()}; } + /** * @brief Construct a lists column containing a single list of fixed-width * type from an initializer list of values. @@ -1542,8 +1548,12 @@ class lists_column_wrapper : public detail::column_wrapper { rmm::device_buffer&& null_mask) { // construct the list column - wrapped = make_lists_column( - num_rows, std::move(offsets), std::move(values), null_count, std::move(null_mask)); + wrapped = make_lists_column(num_rows, + std::move(offsets), + std::move(values), + null_count, + std::move(null_mask), + cudf::test::get_default_stream()); } /** @@ -1618,8 +1628,12 @@ class lists_column_wrapper : public detail::column_wrapper { }(); // construct the list column - wrapped = make_lists_column( - cols.size(), std::move(offsets), std::move(data), null_count, std::move(null_mask)); + wrapped = make_lists_column(cols.size(), + std::move(offsets), + std::move(data), + null_count, + std::move(null_mask), + cudf::test::get_default_stream()); } /** @@ -1647,8 +1661,12 @@ class lists_column_wrapper : public detail::column_wrapper { depth = 0; size_type num_elements = offsets->size() == 0 ? 0 : offsets->size() - 1; - wrapped = - make_lists_column(num_elements, std::move(offsets), std::move(c), 0, rmm::device_buffer{}); + wrapped = make_lists_column(num_elements, + std::move(offsets), + std::move(c), + 0, + rmm::device_buffer{}, + cudf::test::get_default_stream()); } /** @@ -1697,12 +1715,15 @@ class lists_column_wrapper : public detail::column_wrapper { } lists_column_view lcv(col); - return make_lists_column(col.size(), - std::make_unique(lcv.offsets()), - normalize_column(lists_column_view(col).child(), - lists_column_view(expected_hierarchy).child()), - col.null_count(), - copy_bitmask(col)); + return make_lists_column( + col.size(), + std::make_unique(lcv.offsets()), + normalize_column(lists_column_view(col).child(), + lists_column_view(expected_hierarchy).child()), + col.null_count(), + cudf::detail::copy_bitmask( + col, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()), + cudf::test::get_default_stream()); } std::pair, std::vector>> preprocess_columns( diff --git a/cpp/src/lists/combine/concatenate_list_elements.cu b/cpp/src/lists/combine/concatenate_list_elements.cu index fbe297765f8..99dbd55678b 100644 --- a/cpp/src/lists/combine/concatenate_list_elements.cu +++ b/cpp/src/lists/combine/concatenate_list_elements.cu @@ -271,10 +271,11 @@ std::unique_ptr concatenate_list_elements(column_view const& input, */ std::unique_ptr concatenate_list_elements(column_view const& input, concatenate_null_policy null_policy, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate_list_elements(input, null_policy, cudf::get_default_stream(), mr); + return detail::concatenate_list_elements(input, null_policy, stream, mr); } } // namespace lists diff --git a/cpp/src/lists/combine/concatenate_rows.cu b/cpp/src/lists/combine/concatenate_rows.cu index 658538b0195..49be7b5ff17 100644 --- a/cpp/src/lists/combine/concatenate_rows.cu +++ b/cpp/src/lists/combine/concatenate_rows.cu @@ -305,10 +305,11 @@ std::unique_ptr concatenate_rows(table_view const& input, */ std::unique_ptr concatenate_rows(table_view const& input, concatenate_null_policy null_policy, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate_rows(input, null_policy, cudf::get_default_stream(), mr); + return detail::concatenate_rows(input, null_policy, stream, mr); } } // namespace lists diff --git a/cpp/src/lists/contains.cu b/cpp/src/lists/contains.cu index df1d043bdb6..4733a5d63a8 100644 --- a/cpp/src/lists/contains.cu +++ b/cpp/src/lists/contains.cu @@ -287,7 +287,7 @@ std::unique_ptr index_of(lists_column_view const& lists, } auto search_key_col = cudf::make_column_from_scalar(search_key, lists.size(), stream, mr); - return index_of(lists, search_key_col->view(), find_option, stream, mr); + return detail::index_of(lists, search_key_col->view(), find_option, stream, mr); } std::unique_ptr index_of(lists_column_view const& lists, @@ -306,11 +306,11 @@ std::unique_ptr contains(lists_column_view const& lists, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto key_indices = index_of(lists, - search_key, - duplicate_find_option::FIND_FIRST, - stream, - rmm::mr::get_current_device_resource()); + auto key_indices = detail::index_of(lists, + search_key, + duplicate_find_option::FIND_FIRST, + stream, + rmm::mr::get_current_device_resource()); return to_contains(std::move(key_indices), stream, mr); } @@ -322,11 +322,11 @@ std::unique_ptr contains(lists_column_view const& lists, CUDF_EXPECTS(search_keys.size() == lists.size(), "Number of search keys must match list column size."); - auto key_indices = index_of(lists, - search_keys, - duplicate_find_option::FIND_FIRST, - stream, - rmm::mr::get_current_device_resource()); + auto key_indices = detail::index_of(lists, + search_keys, + duplicate_find_option::FIND_FIRST, + stream, + rmm::mr::get_current_device_resource()); return to_contains(std::move(key_indices), stream, mr); } @@ -364,43 +364,48 @@ std::unique_ptr contains_nulls(lists_column_view const& lists, std::unique_ptr contains(lists_column_view const& lists, cudf::scalar const& search_key, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contains(lists, search_key, cudf::get_default_stream(), mr); + return detail::contains(lists, search_key, stream, mr); } std::unique_ptr contains(lists_column_view const& lists, column_view const& search_keys, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contains(lists, search_keys, cudf::get_default_stream(), mr); + return detail::contains(lists, search_keys, stream, mr); } std::unique_ptr contains_nulls(lists_column_view const& lists, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contains_nulls(lists, cudf::get_default_stream(), mr); + return detail::contains_nulls(lists, stream, mr); } std::unique_ptr index_of(lists_column_view const& lists, cudf::scalar const& search_key, duplicate_find_option find_option, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::index_of(lists, search_key, find_option, cudf::get_default_stream(), mr); + return detail::index_of(lists, search_key, find_option, stream, mr); } std::unique_ptr index_of(lists_column_view const& lists, column_view const& search_keys, duplicate_find_option find_option, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::index_of(lists, search_keys, find_option, cudf::get_default_stream(), mr); + return detail::index_of(lists, search_keys, find_option, stream, mr); } } // namespace cudf::lists diff --git a/cpp/src/lists/copying/concatenate.cu b/cpp/src/lists/copying/concatenate.cu index ddd0dfbe084..5407b88236f 100644 --- a/cpp/src/lists/copying/concatenate.cu +++ b/cpp/src/lists/copying/concatenate.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -123,8 +124,8 @@ std::unique_ptr concatenate(host_span columns, // if any of the input columns have nulls, construct the output mask bool const has_nulls = std::any_of(columns.begin(), columns.end(), [](auto const& col) { return col.has_nulls(); }); - rmm::device_buffer null_mask = create_null_mask( - total_list_count, has_nulls ? mask_state::UNINITIALIZED : mask_state::UNALLOCATED); + rmm::device_buffer null_mask = cudf::detail::create_null_mask( + total_list_count, has_nulls ? mask_state::UNINITIALIZED : mask_state::UNALLOCATED, stream, mr); auto null_mask_data = static_cast(null_mask.data()); auto const null_count = has_nulls ? cudf::detail::concatenate_masks(columns, null_mask_data, stream) : size_type{0}; diff --git a/cpp/src/lists/count_elements.cu b/cpp/src/lists/count_elements.cu index 40a14d805e1..2fd0851067a 100644 --- a/cpp/src/lists/count_elements.cu +++ b/cpp/src/lists/count_elements.cu @@ -73,10 +73,11 @@ std::unique_ptr count_elements(lists_column_view const& input, // external APIS std::unique_ptr count_elements(lists_column_view const& input, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::count_elements(input, cudf::get_default_stream(), mr); + return detail::count_elements(input, stream, mr); } } // namespace lists diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index ac13c121530..ffaba7d6fa7 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -638,6 +638,7 @@ ConfigureTest( ) ConfigureTest(STREAM_SORTING_TEST streams/sorting_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_TEXT_TEST streams/text/ngrams_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_LISTS_TEST streams/lists_test.cpp STREAM_MODE testing) # ################################################################################################## # Install tests #################################################################################### diff --git a/cpp/tests/streams/lists_test.cpp b/cpp/tests/streams/lists_test.cpp new file mode 100644 index 00000000000..e292b551d83 --- /dev/null +++ b/cpp/tests/streams/lists_test.cpp @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include + +class ListTest : public cudf::test::BaseFixture {}; + +TEST_F(ListTest, ConcatenateRows) +{ + cudf::test::lists_column_wrapper list_col_1{{0, 1}, {2, 3}, {4, 5}}; + cudf::test::lists_column_wrapper list_col_2{{0, 1}, {2, 3}, {4, 5}}; + cudf::table_view lists_table({list_col_1, list_col_2}); + cudf::lists::concatenate_rows( + lists_table, cudf::lists::concatenate_null_policy::IGNORE, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, ConcatenateListElements) +{ + cudf::test::lists_column_wrapper ll_column{{{0, 1}, {2, 3}}, {{4, 5}, {6, 7}}}; + cudf::lists::concatenate_list_elements( + ll_column, cudf::lists::concatenate_null_policy::IGNORE, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, ContainsNulls) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3}, {4, 5}}; + cudf::lists::contains_nulls(list_col, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, ContainsSearchKey) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3}, {4, 5}}; + cudf::numeric_scalar search_key(2, true, cudf::test::get_default_stream()); + cudf::lists::contains(list_col, search_key, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, ContainsSearchKeys) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3}, {4, 5}}; + cudf::test::fixed_width_column_wrapper search_keys({1, 2, 3}); + cudf::lists::contains(list_col, search_keys, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, IndexOfSearchKey) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3}, {4, 5}}; + cudf::numeric_scalar search_key(2, true, cudf::test::get_default_stream()); + cudf::lists::index_of(list_col, + search_key, + cudf::lists::duplicate_find_option::FIND_FIRST, + cudf::test::get_default_stream()); +} + +TEST_F(ListTest, IndexOfSearchKeys) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3}, {4, 5}}; + cudf::test::fixed_width_column_wrapper search_keys({1, 2, 3}); + cudf::lists::index_of(list_col, + search_keys, + cudf::lists::duplicate_find_option::FIND_FIRST, + cudf::test::get_default_stream()); +} + +TEST_F(ListTest, CountElements) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3, 7}, {4, 5}}; + cudf::lists::count_elements(list_col, cudf::test::get_default_stream()); +} From b4fd77b30311f3b1de39cac22423f2c3a32ec72d Mon Sep 17 00:00:00 2001 From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com> Date: Tue, 10 Oct 2023 12:20:42 -0500 Subject: [PATCH 03/11] Centralize chunked reading code in the parquet reader to reader_impl_chunking.cu (#14262) As a precursor to further chunked reader work, this PR centralizes chunk-related code (mostly from the `reader::impl` class) into `reader_impl_chunking.cu` and `reader_impl_chunking.hpp`. Also cleans up some variable naming and locations in `reader::impl` and the `file_intermediate_data` and `pass_intermediate_data classes`. Authors: - https://github.com/nvdbaranec Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Robert Maynard (https://github.com/robertmaynard) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/14262 --- cpp/CMakeLists.txt | 1 + cpp/src/io/parquet/parquet_gpu.hpp | 73 --- cpp/src/io/parquet/reader_impl.cpp | 12 +- cpp/src/io/parquet/reader_impl.hpp | 49 +- cpp/src/io/parquet/reader_impl_chunking.cu | 598 +++++++++++++++++++ cpp/src/io/parquet/reader_impl_chunking.hpp | 87 +++ cpp/src/io/parquet/reader_impl_helpers.hpp | 17 + cpp/src/io/parquet/reader_impl_preprocess.cu | 558 +---------------- cpp/src/io/utilities/column_buffer.cpp | 10 +- 9 files changed, 751 insertions(+), 654 deletions(-) create mode 100644 cpp/src/io/parquet/reader_impl_chunking.cu create mode 100644 cpp/src/io/parquet/reader_impl_chunking.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 000f80065ab..f8b9762f1d4 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -401,6 +401,7 @@ add_library( src/io/parquet/predicate_pushdown.cpp src/io/parquet/reader.cpp src/io/parquet/reader_impl.cpp + src/io/parquet/reader_impl_chunking.cu src/io/parquet/reader_impl_helpers.cpp src/io/parquet/reader_impl_preprocess.cu src/io/parquet/writer_impl.cu diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 767668cc65e..6a93fec0c46 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -318,79 +318,6 @@ struct ColumnChunkDesc { int32_t src_col_schema{}; // my schema index in the file }; -/** - * @brief The row_group_info class - */ -struct row_group_info { - size_type index; // row group index within a file. aggregate_reader_metadata::get_row_group() is - // called with index and source_index - size_t start_row; - size_type source_index; // file index. - - row_group_info() = default; - - row_group_info(size_type index, size_t start_row, size_type source_index) - : index{index}, start_row{start_row}, source_index{source_index} - { - } -}; - -/** - * @brief Struct to store file-level data that remains constant for - * all passes/chunks for the file. - */ -struct file_intermediate_data { - // all row groups to read - std::vector row_groups{}; - - // all chunks from the selected row groups. We may end up reading these chunks progressively - // instead of all at once - std::vector chunks{}; - - // skip_rows/num_rows values for the entire file. these need to be adjusted per-pass because we - // may not be visiting every row group that contains these bounds - size_t global_skip_rows; - size_t global_num_rows; -}; - -/** - * @brief Structs to identify the reading row range for each chunk of rows in chunked reading. - */ -struct chunk_read_info { - size_t skip_rows; - size_t num_rows; -}; - -/** - * @brief Struct to store pass-level data that remains constant for a single pass. - */ -struct pass_intermediate_data { - std::vector> raw_page_data; - rmm::device_buffer decomp_page_data; - - // rowgroup, chunk and page information for the current pass. - std::vector row_groups{}; - cudf::detail::hostdevice_vector chunks{}; - cudf::detail::hostdevice_vector pages_info{}; - cudf::detail::hostdevice_vector page_nesting_info{}; - cudf::detail::hostdevice_vector page_nesting_decode_info{}; - - rmm::device_uvector page_keys{0, rmm::cuda_stream_default}; - rmm::device_uvector page_index{0, rmm::cuda_stream_default}; - rmm::device_uvector str_dict_index{0, rmm::cuda_stream_default}; - - std::vector output_chunk_read_info; - std::size_t current_output_chunk{0}; - - rmm::device_buffer level_decode_data{}; - int level_type_size{0}; - - // skip_rows and num_rows values for this particular pass. these may be adjusted values from the - // global values stored in file_intermediate_data. - size_t skip_rows; - size_t num_rows; -}; - /** * @brief Struct describing an encoder column */ diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 26ec83d5946..db81222157a 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -349,14 +349,14 @@ void reader::impl::prepare_data(int64_t skip_rows, not _input_columns.empty()) { // fills in chunk information without physically loading or decompressing // the associated data - load_global_chunk_info(); + create_global_chunk_info(); // compute schedule of input reads. Each rowgroup contains 1 chunk per column. For now // we will read an entire row group at a time. However, it is possible to do // sub-rowgroup reads if we made some estimates on individual chunk sizes (tricky) and // changed the high level structure such that we weren't always reading an entire table's // worth of columns at once. - compute_input_pass_row_group_info(); + compute_input_passes(); } _file_preprocessed = true; @@ -364,7 +364,7 @@ void reader::impl::prepare_data(int64_t skip_rows, // if we have to start a new pass, do that now if (!_pass_preprocessed) { - auto const num_passes = _input_pass_row_group_offsets.size() - 1; + auto const num_passes = _file_itm_data.input_pass_row_group_offsets.size() - 1; // always create the pass struct, even if we end up with no passes. // this will also cause the previous pass information to be deleted @@ -373,7 +373,7 @@ void reader::impl::prepare_data(int64_t skip_rows, if (_file_itm_data.global_num_rows > 0 && not _file_itm_data.row_groups.empty() && not _input_columns.empty() && _current_input_pass < num_passes) { // setup the pass_intermediate_info for this pass. - setup_pass(); + setup_next_pass(); load_and_decompress_data(); preprocess_pages(uses_custom_row_bounds, _output_chunk_read_limit); @@ -541,8 +541,8 @@ bool reader::impl::has_next() {} /*row_group_indices, empty means read all row groups*/, std::nullopt /*filter*/); - auto const num_input_passes = - _input_pass_row_group_offsets.size() == 0 ? 0 : _input_pass_row_group_offsets.size() - 1; + size_t const num_input_passes = std::max( + int64_t{0}, static_cast(_file_itm_data.input_pass_row_group_offsets.size()) - 1); return (_pass_itm_data->current_output_chunk < _pass_itm_data->output_chunk_read_info.size()) || (_current_input_pass < num_input_passes); } diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index 6003b931b04..cea4ba35606 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -22,6 +22,7 @@ #pragma once #include "parquet_gpu.hpp" +#include "reader_impl_chunking.hpp" #include "reader_impl_helpers.hpp" #include @@ -136,10 +137,6 @@ class reader::impl { host_span const> row_group_indices, std::optional> filter); - void load_global_chunk_info(); - void compute_input_pass_row_group_info(); - void setup_pass(); - /** * @brief Create chunk information and start file reads * @@ -250,6 +247,31 @@ class reader::impl { */ void decode_page_data(size_t skip_rows, size_t num_rows); + /** + * @brief Creates file-wide parquet chunk information. + * + * Creates information about all chunks in the file, storing it in + * the file-wide _file_itm_data structure. + */ + void create_global_chunk_info(); + + /** + * @brief Computes all of the passes we will perform over the file. + */ + void compute_input_passes(); + + /** + * @brief Close out the existing pass (if any) and prepare for the next pass. + */ + void setup_next_pass(); + + /** + * @brief Given a set of pages that have had their sizes computed by nesting level and + * a limit on total read size, generate a set of {skip_rows, num_rows} pairs representing + * a set of reads that will generate output columns of total size <= `chunk_read_limit` bytes. + */ + void compute_splits_for_pass(); + private: rmm::cuda_stream_view _stream; rmm::mr::device_memory_resource* _mr = nullptr; @@ -278,7 +300,7 @@ class reader::impl { // chunked reading happens in 2 parts: // - // At the top level there is the "pass" in which we try and limit the + // At the top level, the entire file is divided up into "passes" omn which we try and limit the // total amount of temporary memory (compressed data, decompressed data) in use // via _input_pass_read_limit. // @@ -286,19 +308,16 @@ class reader::impl { // byte size is controlled by _output_chunk_read_limit. file_intermediate_data _file_itm_data; + bool _file_preprocessed{false}; + std::unique_ptr _pass_itm_data; + bool _pass_preprocessed{false}; - // an array of offsets into _file_itm_data::global_chunks. Each pair of offsets represents - // the start/end of the chunks to be loaded for a given pass. - std::vector _input_pass_row_group_offsets{}; - std::vector _input_pass_row_count{}; - std::size_t _current_input_pass{0}; - std::size_t _chunk_count{0}; + std::size_t _output_chunk_read_limit{0}; // output chunk size limit in bytes + std::size_t _input_pass_read_limit{0}; // input pass memory usage limit in bytes - std::size_t _output_chunk_read_limit{0}; - std::size_t _input_pass_read_limit{0}; - bool _pass_preprocessed{false}; - bool _file_preprocessed{false}; + std::size_t _current_input_pass{0}; // current input pass index + std::size_t _chunk_count{0}; // how many output chunks we have produced }; } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu new file mode 100644 index 00000000000..ad52a7dfcc1 --- /dev/null +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -0,0 +1,598 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "reader_impl.hpp" +#include "reader_impl_chunking.hpp" + +#include +#include + +#include + +#include + +#include +#include +#include +#include + +namespace cudf::io::parquet::detail { + +namespace { + +struct cumulative_row_info { + size_t row_count; // cumulative row count + size_t size_bytes; // cumulative size in bytes + int key; // schema index +}; + +#if defined(CHUNKING_DEBUG) +void print_cumulative_page_info(cudf::detail::hostdevice_vector& pages, + rmm::device_uvector const& page_index, + rmm::device_uvector const& c_info, + rmm::cuda_stream_view stream) +{ + pages.device_to_host_sync(stream); + + printf("------------\nCumulative sizes by page\n"); + + std::vector schemas(pages.size()); + std::vector h_page_index(pages.size()); + CUDF_CUDA_TRY(cudaMemcpy( + h_page_index.data(), page_index.data(), sizeof(int) * pages.size(), cudaMemcpyDefault)); + std::vector h_cinfo(pages.size()); + CUDF_CUDA_TRY(cudaMemcpy( + h_cinfo.data(), c_info.data(), sizeof(cumulative_row_info) * pages.size(), cudaMemcpyDefault)); + auto schema_iter = cudf::detail::make_counting_transform_iterator( + 0, [&](size_type i) { return pages[h_page_index[i]].src_col_schema; }); + thrust::copy(thrust::seq, schema_iter, schema_iter + pages.size(), schemas.begin()); + auto last = thrust::unique(thrust::seq, schemas.begin(), schemas.end()); + schemas.resize(last - schemas.begin()); + printf("Num schemas: %lu\n", schemas.size()); + + for (size_t idx = 0; idx < schemas.size(); idx++) { + printf("Schema %d\n", schemas[idx]); + for (size_t pidx = 0; pidx < pages.size(); pidx++) { + auto const& page = pages[h_page_index[pidx]]; + if (page.flags & PAGEINFO_FLAGS_DICTIONARY || page.src_col_schema != schemas[idx]) { + continue; + } + printf("\tP: {%lu, %lu}\n", h_cinfo[pidx].row_count, h_cinfo[pidx].size_bytes); + } + } +} + +void print_cumulative_row_info(host_span sizes, + std::string const& label, + std::optional> splits = std::nullopt) +{ + if (splits.has_value()) { + printf("------------\nSplits\n"); + for (size_t idx = 0; idx < splits->size(); idx++) { + printf("{%lu, %lu}\n", splits.value()[idx].skip_rows, splits.value()[idx].num_rows); + } + } + + printf("------------\nCumulative sizes %s\n", label.c_str()); + for (size_t idx = 0; idx < sizes.size(); idx++) { + printf("{%lu, %lu, %d}", sizes[idx].row_count, sizes[idx].size_bytes, sizes[idx].key); + if (splits.has_value()) { + // if we have a split at this row count and this is the last instance of this row count + auto start = thrust::make_transform_iterator( + splits->begin(), [](chunk_read_info const& i) { return i.skip_rows; }); + auto end = start + splits->size(); + auto split = std::find(start, end, sizes[idx].row_count); + auto const split_index = [&]() -> int { + if (split != end && + ((idx == sizes.size() - 1) || (sizes[idx + 1].row_count > sizes[idx].row_count))) { + return static_cast(std::distance(start, split)); + } + return idx == 0 ? 0 : -1; + }(); + if (split_index >= 0) { + printf(" <-- split {%lu, %lu}", + splits.value()[split_index].skip_rows, + splits.value()[split_index].num_rows); + } + } + printf("\n"); + } +} +#endif // CHUNKING_DEBUG + +/** + * @brief Functor which reduces two cumulative_row_info structs of the same key. + */ +struct cumulative_row_sum { + cumulative_row_info operator() + __device__(cumulative_row_info const& a, cumulative_row_info const& b) const + { + return cumulative_row_info{a.row_count + b.row_count, a.size_bytes + b.size_bytes, a.key}; + } +}; + +/** + * @brief Functor which computes the total data size for a given type of cudf column. + * + * In the case of strings, the return size does not include the chars themselves. That + * information is tracked separately (see PageInfo::str_bytes). + */ +struct row_size_functor { + __device__ size_t validity_size(size_t num_rows, bool nullable) + { + return nullable ? (cudf::util::div_rounding_up_safe(num_rows, size_t{32}) * 4) : 0; + } + + template + __device__ size_t operator()(size_t num_rows, bool nullable) + { + auto const element_size = sizeof(device_storage_type_t); + return (element_size * num_rows) + validity_size(num_rows, nullable); + } +}; + +template <> +__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) +{ + auto const offset_size = sizeof(size_type); + // NOTE: Adding the + 1 offset here isn't strictly correct. There will only be 1 extra offset + // for the entire column, whereas this is adding an extra offset per page. So we will get a + // small over-estimate of the real size of the order : # of pages * 4 bytes. It seems better + // to overestimate size somewhat than to underestimate it and potentially generate chunks + // that are too large. + return (offset_size * (num_rows + 1)) + validity_size(num_rows, nullable); +} + +template <> +__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) +{ + return validity_size(num_rows, nullable); +} + +template <> +__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) +{ + // only returns the size of offsets and validity. the size of the actual string chars + // is tracked separately. + auto const offset_size = sizeof(size_type); + // see note about offsets in the list_view template. + return (offset_size * (num_rows + 1)) + validity_size(num_rows, nullable); +} + +/** + * @brief Functor which computes the total output cudf data size for all of + * the data in this page. + * + * Sums across all nesting levels. + */ +struct get_cumulative_row_info { + PageInfo const* const pages; + + __device__ cumulative_row_info operator()(size_type index) + { + auto const& page = pages[index]; + if (page.flags & PAGEINFO_FLAGS_DICTIONARY) { + return cumulative_row_info{0, 0, page.src_col_schema}; + } + + // total nested size, not counting string data + auto iter = + cudf::detail::make_counting_transform_iterator(0, [page, index] __device__(size_type i) { + auto const& pni = page.nesting[i]; + return cudf::type_dispatcher( + data_type{pni.type}, row_size_functor{}, pni.size, pni.nullable); + }); + + size_t const row_count = static_cast(page.nesting[0].size); + return { + row_count, + thrust::reduce(thrust::seq, iter, iter + page.num_output_nesting_levels) + page.str_bytes, + page.src_col_schema}; + } +}; + +/** + * @brief Functor which computes the effective size of all input columns by page. + * + * For a given row, we want to find the cost of all pages for all columns involved + * in loading up to that row. The complication here is that not all pages are the + * same size between columns. Example: + * + * page row counts + * Column A: 0 <----> 100 <----> 200 + * Column B: 0 <---------------> 200 <--------> 400 + | + * if we decide to split at row 100, we don't really know the actual amount of bytes in column B + * at that point. So we have to proceed as if we are taking the bytes from all 200 rows of that + * page. Essentially, a conservative over-estimate of the real size. + */ +struct row_total_size { + cumulative_row_info const* c_info; + size_type const* key_offsets; + size_t num_keys; + + __device__ cumulative_row_info operator()(cumulative_row_info const& i) + { + // sum sizes for each input column at this row + size_t sum = 0; + for (int idx = 0; idx < num_keys; idx++) { + auto const start = key_offsets[idx]; + auto const end = key_offsets[idx + 1]; + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [&] __device__(size_type i) { return c_info[i].row_count; }); + auto const page_index = + thrust::lower_bound(thrust::seq, iter + start, iter + end, i.row_count) - iter; + sum += c_info[page_index].size_bytes; + } + return {i.row_count, sum, i.key}; + } +}; + +/** + * @brief Given a vector of cumulative {row_count, byte_size} pairs and a chunk read + * limit, determine the set of splits. + * + * @param sizes Vector of cumulative {row_count, byte_size} pairs + * @param num_rows Total number of rows to read + * @param chunk_read_limit Limit on total number of bytes to be returned per read, for all columns + */ +std::vector find_splits(std::vector const& sizes, + size_t num_rows, + size_t chunk_read_limit) +{ + // now we have an array of {row_count, real output bytes}. just walk through it and generate + // splits. + // TODO: come up with a clever way to do this entirely in parallel. For now, as long as batch + // sizes are reasonably large, this shouldn't iterate too many times + std::vector splits; + { + size_t cur_pos = 0; + size_t cur_cumulative_size = 0; + size_t cur_row_count = 0; + auto start = thrust::make_transform_iterator(sizes.begin(), [&](cumulative_row_info const& i) { + return i.size_bytes - cur_cumulative_size; + }); + auto end = start + sizes.size(); + while (cur_row_count < num_rows) { + int64_t split_pos = + thrust::lower_bound(thrust::seq, start + cur_pos, end, chunk_read_limit) - start; + + // if we're past the end, or if the returned bucket is > than the chunk_read_limit, move back + // one. + if (static_cast(split_pos) >= sizes.size() || + (sizes[split_pos].size_bytes - cur_cumulative_size > chunk_read_limit)) { + split_pos--; + } + + // best-try. if we can't find something that'll fit, we have to go bigger. we're doing this in + // a loop because all of the cumulative sizes for all the pages are sorted into one big list. + // so if we had two columns, both of which had an entry {1000, 10000}, that entry would be in + // the list twice. so we have to iterate until we skip past all of them. The idea is that we + // either do this, or we have to call unique() on the input first. + while (split_pos < (static_cast(sizes.size()) - 1) && + (split_pos < 0 || sizes[split_pos].row_count == cur_row_count)) { + split_pos++; + } + + auto const start_row = cur_row_count; + cur_row_count = sizes[split_pos].row_count; + splits.push_back(chunk_read_info{start_row, cur_row_count - start_row}); + cur_pos = split_pos; + cur_cumulative_size = sizes[split_pos].size_bytes; + } + } + // print_cumulative_row_info(sizes, "adjusted", splits); + + return splits; +} + +/** + * @brief Converts cuDF units to Parquet units. + * + * @return A tuple of Parquet type width, Parquet clock rate and Parquet decimal type. + */ +[[nodiscard]] std::tuple conversion_info(type_id column_type_id, + type_id timestamp_type_id, + Type physical, + int8_t converted, + int32_t length) +{ + int32_t type_width = (physical == FIXED_LEN_BYTE_ARRAY) ? length : 0; + int32_t clock_rate = 0; + if (column_type_id == type_id::INT8 or column_type_id == type_id::UINT8) { + type_width = 1; // I32 -> I8 + } else if (column_type_id == type_id::INT16 or column_type_id == type_id::UINT16) { + type_width = 2; // I32 -> I16 + } else if (column_type_id == type_id::INT32) { + type_width = 4; // str -> hash32 + } else if (is_chrono(data_type{column_type_id})) { + clock_rate = to_clockrate(timestamp_type_id); + } + + int8_t converted_type = converted; + if (converted_type == DECIMAL && column_type_id != type_id::FLOAT64 && + not cudf::is_fixed_point(data_type{column_type_id})) { + converted_type = UNKNOWN; // Not converting to float64 or decimal + } + return std::make_tuple(type_width, clock_rate, converted_type); +} + +/** + * @brief Return the required number of bits to store a value. + */ +template +[[nodiscard]] T required_bits(uint32_t max_level) +{ + return static_cast(CompactProtocolReader::NumRequiredBits(max_level)); +} + +struct row_count_compare { + __device__ bool operator()(cumulative_row_info const& a, cumulative_row_info const& b) + { + return a.row_count < b.row_count; + } +}; + +} // anonymous namespace + +void reader::impl::create_global_chunk_info() +{ + auto const num_rows = _file_itm_data.global_num_rows; + auto const& row_groups_info = _file_itm_data.row_groups; + auto& chunks = _file_itm_data.chunks; + + // Descriptors for all the chunks that make up the selected columns + auto const num_input_columns = _input_columns.size(); + auto const num_chunks = row_groups_info.size() * num_input_columns; + + // Initialize column chunk information + auto remaining_rows = num_rows; + for (auto const& rg : row_groups_info) { + auto const& row_group = _metadata->get_row_group(rg.index, rg.source_index); + auto const row_group_start = rg.start_row; + auto const row_group_rows = std::min(remaining_rows, row_group.num_rows); + + // generate ColumnChunkDesc objects for everything to be decoded (all input columns) + for (size_t i = 0; i < num_input_columns; ++i) { + auto col = _input_columns[i]; + // look up metadata + auto& col_meta = _metadata->get_column_metadata(rg.index, rg.source_index, col.schema_idx); + auto& schema = _metadata->get_schema(col.schema_idx); + + auto [type_width, clock_rate, converted_type] = + conversion_info(to_type_id(schema, _strings_to_categorical, _timestamp_type.id()), + _timestamp_type.id(), + schema.type, + schema.converted_type, + schema.type_length); + + chunks.push_back(ColumnChunkDesc(col_meta.total_compressed_size, + nullptr, + col_meta.num_values, + schema.type, + type_width, + row_group_start, + row_group_rows, + schema.max_definition_level, + schema.max_repetition_level, + _metadata->get_output_nesting_depth(col.schema_idx), + required_bits(schema.max_definition_level), + required_bits(schema.max_repetition_level), + col_meta.codec, + converted_type, + schema.logical_type, + schema.decimal_precision, + clock_rate, + i, + col.schema_idx)); + } + + remaining_rows -= row_group_rows; + } +} + +void reader::impl::compute_input_passes() +{ + // at this point, row_groups has already been filtered down to just the row groups we need to + // handle optional skip_rows/num_rows parameters. + auto const& row_groups_info = _file_itm_data.row_groups; + + // if the user hasn't specified an input size limit, read everything in a single pass. + if (_input_pass_read_limit == 0) { + _file_itm_data.input_pass_row_group_offsets.push_back(0); + _file_itm_data.input_pass_row_group_offsets.push_back(row_groups_info.size()); + return; + } + + // generate passes. make sure to account for the case where a single row group doesn't fit within + // + std::size_t const read_limit = + _input_pass_read_limit > 0 ? _input_pass_read_limit : std::numeric_limits::max(); + std::size_t cur_pass_byte_size = 0; + std::size_t cur_rg_start = 0; + std::size_t cur_row_count = 0; + _file_itm_data.input_pass_row_group_offsets.push_back(0); + _file_itm_data.input_pass_row_count.push_back(0); + + for (size_t cur_rg_index = 0; cur_rg_index < row_groups_info.size(); cur_rg_index++) { + auto const& rgi = row_groups_info[cur_rg_index]; + auto const& row_group = _metadata->get_row_group(rgi.index, rgi.source_index); + + // can we add this row group + if (cur_pass_byte_size + row_group.total_byte_size >= read_limit) { + // A single row group (the current one) is larger than the read limit: + // We always need to include at least one row group, so end the pass at the end of the current + // row group + if (cur_rg_start == cur_rg_index) { + _file_itm_data.input_pass_row_group_offsets.push_back(cur_rg_index + 1); + _file_itm_data.input_pass_row_count.push_back(cur_row_count + row_group.num_rows); + cur_rg_start = cur_rg_index + 1; + cur_pass_byte_size = 0; + } + // End the pass at the end of the previous row group + else { + _file_itm_data.input_pass_row_group_offsets.push_back(cur_rg_index); + _file_itm_data.input_pass_row_count.push_back(cur_row_count); + cur_rg_start = cur_rg_index; + cur_pass_byte_size = row_group.total_byte_size; + } + } else { + cur_pass_byte_size += row_group.total_byte_size; + } + cur_row_count += row_group.num_rows; + } + // add the last pass if necessary + if (_file_itm_data.input_pass_row_group_offsets.back() != row_groups_info.size()) { + _file_itm_data.input_pass_row_group_offsets.push_back(row_groups_info.size()); + _file_itm_data.input_pass_row_count.push_back(cur_row_count); + } +} + +void reader::impl::setup_next_pass() +{ + // this will also cause the previous pass information to be deleted + _pass_itm_data = std::make_unique(); + + // setup row groups to be loaded for this pass + auto const row_group_start = _file_itm_data.input_pass_row_group_offsets[_current_input_pass]; + auto const row_group_end = _file_itm_data.input_pass_row_group_offsets[_current_input_pass + 1]; + auto const num_row_groups = row_group_end - row_group_start; + _pass_itm_data->row_groups.resize(num_row_groups); + std::copy(_file_itm_data.row_groups.begin() + row_group_start, + _file_itm_data.row_groups.begin() + row_group_end, + _pass_itm_data->row_groups.begin()); + + auto const num_passes = _file_itm_data.input_pass_row_group_offsets.size() - 1; + CUDF_EXPECTS(_current_input_pass < num_passes, "Encountered an invalid read pass index"); + + auto const chunks_per_rowgroup = _input_columns.size(); + auto const num_chunks = chunks_per_rowgroup * num_row_groups; + + auto chunk_start = _file_itm_data.chunks.begin() + (row_group_start * chunks_per_rowgroup); + auto chunk_end = _file_itm_data.chunks.begin() + (row_group_end * chunks_per_rowgroup); + + _pass_itm_data->chunks = cudf::detail::hostdevice_vector(num_chunks, _stream); + std::copy(chunk_start, chunk_end, _pass_itm_data->chunks.begin()); + + // adjust skip_rows and num_rows by what's available in the row groups we are processing + if (num_passes == 1) { + _pass_itm_data->skip_rows = _file_itm_data.global_skip_rows; + _pass_itm_data->num_rows = _file_itm_data.global_num_rows; + } else { + auto const global_start_row = _file_itm_data.global_skip_rows; + auto const global_end_row = global_start_row + _file_itm_data.global_num_rows; + auto const start_row = + std::max(_file_itm_data.input_pass_row_count[_current_input_pass], global_start_row); + auto const end_row = + std::min(_file_itm_data.input_pass_row_count[_current_input_pass + 1], global_end_row); + + // skip_rows is always global in the sense that it is relative to the first row of + // everything we will be reading, regardless of what pass we are on. + // num_rows is how many rows we are reading this pass. + _pass_itm_data->skip_rows = + global_start_row + _file_itm_data.input_pass_row_count[_current_input_pass]; + _pass_itm_data->num_rows = end_row - start_row; + } +} + +void reader::impl::compute_splits_for_pass() +{ + auto const skip_rows = _pass_itm_data->skip_rows; + auto const num_rows = _pass_itm_data->num_rows; + + // simple case : no chunk size, no splits + if (_output_chunk_read_limit <= 0) { + _pass_itm_data->output_chunk_read_info = std::vector{{skip_rows, num_rows}}; + return; + } + + auto& pages = _pass_itm_data->pages_info; + + auto const& page_keys = _pass_itm_data->page_keys; + auto const& page_index = _pass_itm_data->page_index; + + // generate cumulative row counts and sizes + rmm::device_uvector c_info(page_keys.size(), _stream); + // convert PageInfo to cumulative_row_info + auto page_input = thrust::make_transform_iterator(page_index.begin(), + get_cumulative_row_info{pages.device_ptr()}); + thrust::inclusive_scan_by_key(rmm::exec_policy(_stream), + page_keys.begin(), + page_keys.end(), + page_input, + c_info.begin(), + thrust::equal_to{}, + cumulative_row_sum{}); + // print_cumulative_page_info(pages, page_index, c_info, stream); + + // sort by row count + rmm::device_uvector c_info_sorted{c_info, _stream}; + thrust::sort( + rmm::exec_policy(_stream), c_info_sorted.begin(), c_info_sorted.end(), row_count_compare{}); + + // std::vector h_c_info_sorted(c_info_sorted.size()); + // CUDF_CUDA_TRY(cudaMemcpy(h_c_info_sorted.data(), + // c_info_sorted.data(), + // sizeof(cumulative_row_info) * c_info_sorted.size(), + // cudaMemcpyDefault)); + // print_cumulative_row_info(h_c_info_sorted, "raw"); + + // generate key offsets (offsets to the start of each partition of keys). worst case is 1 page per + // key + rmm::device_uvector key_offsets(page_keys.size() + 1, _stream); + auto const key_offsets_end = thrust::reduce_by_key(rmm::exec_policy(_stream), + page_keys.begin(), + page_keys.end(), + thrust::make_constant_iterator(1), + thrust::make_discard_iterator(), + key_offsets.begin()) + .second; + size_t const num_unique_keys = key_offsets_end - key_offsets.begin(); + thrust::exclusive_scan( + rmm::exec_policy(_stream), key_offsets.begin(), key_offsets.end(), key_offsets.begin()); + + // adjust the cumulative info such that for each row count, the size includes any pages that span + // that row count. this is so that if we have this case: + // page row counts + // Column A: 0 <----> 100 <----> 200 + // Column B: 0 <---------------> 200 <--------> 400 + // | + // if we decide to split at row 100, we don't really know the actual amount of bytes in column B + // at that point. So we have to proceed as if we are taking the bytes from all 200 rows of that + // page. + // + rmm::device_uvector aggregated_info(c_info.size(), _stream); + thrust::transform(rmm::exec_policy(_stream), + c_info_sorted.begin(), + c_info_sorted.end(), + aggregated_info.begin(), + row_total_size{c_info.data(), key_offsets.data(), num_unique_keys}); + + // bring back to the cpu + std::vector h_aggregated_info(aggregated_info.size()); + CUDF_CUDA_TRY(cudaMemcpyAsync(h_aggregated_info.data(), + aggregated_info.data(), + sizeof(cumulative_row_info) * c_info.size(), + cudaMemcpyDefault, + _stream.value())); + _stream.synchronize(); + + // generate the actual splits + _pass_itm_data->output_chunk_read_info = + find_splits(h_aggregated_info, num_rows, _output_chunk_read_limit); +} + +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_chunking.hpp b/cpp/src/io/parquet/reader_impl_chunking.hpp new file mode 100644 index 00000000000..dfc239d8451 --- /dev/null +++ b/cpp/src/io/parquet/reader_impl_chunking.hpp @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "reader_impl_helpers.hpp" + +#include + +namespace cudf::io::parquet::detail { + +/** + * @brief Struct to store file-level data that remains constant for + * all passes/chunks in the file. + */ +struct file_intermediate_data { + // all row groups to read + std::vector row_groups{}; + + // all chunks from the selected row groups. We may end up reading these chunks progressively + // instead of all at once + std::vector chunks{}; + + // an array of offsets into _file_itm_data::global_chunks. Each pair of offsets represents + // the start/end of the chunks to be loaded for a given pass. + std::vector input_pass_row_group_offsets{}; + // row counts per input-pass + std::vector input_pass_row_count{}; + + // skip_rows/num_rows values for the entire file. these need to be adjusted per-pass because we + // may not be visiting every row group that contains these bounds + size_t global_skip_rows; + size_t global_num_rows; +}; + +/** + * @brief Struct to identify the range for each chunk of rows during a chunked reading pass. + */ +struct chunk_read_info { + size_t skip_rows; + size_t num_rows; +}; + +/** + * @brief Struct to store pass-level data that remains constant for a single pass. + */ +struct pass_intermediate_data { + std::vector> raw_page_data; + rmm::device_buffer decomp_page_data; + + // rowgroup, chunk and page information for the current pass. + std::vector row_groups{}; + cudf::detail::hostdevice_vector chunks{}; + cudf::detail::hostdevice_vector pages_info{}; + cudf::detail::hostdevice_vector page_nesting_info{}; + cudf::detail::hostdevice_vector page_nesting_decode_info{}; + + rmm::device_uvector page_keys{0, rmm::cuda_stream_default}; + rmm::device_uvector page_index{0, rmm::cuda_stream_default}; + rmm::device_uvector str_dict_index{0, rmm::cuda_stream_default}; + + std::vector output_chunk_read_info; + std::size_t current_output_chunk{0}; + + rmm::device_buffer level_decode_data{}; + int level_type_size{0}; + + // skip_rows and num_rows values for this particular pass. these may be adjusted values from the + // global values stored in file_intermediate_data. + size_t skip_rows; + size_t num_rows; +}; + +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp index 1a73e2f55ac..8d8ab8707be 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.hpp +++ b/cpp/src/io/parquet/reader_impl_helpers.hpp @@ -34,6 +34,23 @@ namespace cudf::io::parquet::detail { +/** + * @brief The row_group_info class + */ +struct row_group_info { + size_type index; // row group index within a file. aggregate_reader_metadata::get_row_group() is + // called with index and source_index + size_t start_row; + size_type source_index; // file index. + + row_group_info() = default; + + row_group_info(size_type index, size_t start_row, size_type source_index) + : index{index}, start_row{start_row}, source_index{source_index} + { + } +}; + /** * @brief Function that translates Parquet datatype to cuDF type enum */ diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 4bc6bb6f43b..ce45f709ee1 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -18,7 +18,6 @@ #include #include -#include #include #include @@ -44,7 +43,6 @@ #include namespace cudf::io::parquet::detail { - namespace { /** @@ -170,46 +168,6 @@ void generate_depth_remappings(std::map, std::ve } } -/** - * @brief Return the required number of bits to store a value. - */ -template -[[nodiscard]] T required_bits(uint32_t max_level) -{ - return static_cast(CompactProtocolReader::NumRequiredBits(max_level)); -} - -/** - * @brief Converts cuDF units to Parquet units. - * - * @return A tuple of Parquet type width, Parquet clock rate and Parquet decimal type. - */ -[[nodiscard]] std::tuple conversion_info(type_id column_type_id, - type_id timestamp_type_id, - Type physical, - int8_t converted, - int32_t length) -{ - int32_t type_width = (physical == FIXED_LEN_BYTE_ARRAY) ? length : 0; - int32_t clock_rate = 0; - if (column_type_id == type_id::INT8 or column_type_id == type_id::UINT8) { - type_width = 1; // I32 -> I8 - } else if (column_type_id == type_id::INT16 or column_type_id == type_id::UINT16) { - type_width = 2; // I32 -> I16 - } else if (column_type_id == type_id::INT32) { - type_width = 4; // str -> hash32 - } else if (is_chrono(data_type{column_type_id})) { - clock_rate = to_clockrate(timestamp_type_id); - } - - int8_t converted_type = converted; - if (converted_type == DECIMAL && column_type_id != type_id::FLOAT64 && - not cudf::is_fixed_point(data_type{column_type_id})) { - converted_type = UNKNOWN; // Not converting to float64 or decimal - } - return std::make_tuple(type_width, clock_rate, converted_type); -} - /** * @brief Reads compressed page data to device memory. * @@ -790,163 +748,6 @@ std::pair>> reader::impl::read_and_decompres return {total_decompressed_size > 0, std::move(read_chunk_tasks)}; } -void reader::impl::load_global_chunk_info() -{ - auto const num_rows = _file_itm_data.global_num_rows; - auto const& row_groups_info = _file_itm_data.row_groups; - auto& chunks = _file_itm_data.chunks; - - // Descriptors for all the chunks that make up the selected columns - auto const num_input_columns = _input_columns.size(); - auto const num_chunks = row_groups_info.size() * num_input_columns; - - // Initialize column chunk information - auto remaining_rows = num_rows; - for (auto const& rg : row_groups_info) { - auto const& row_group = _metadata->get_row_group(rg.index, rg.source_index); - auto const row_group_start = rg.start_row; - auto const row_group_rows = std::min(remaining_rows, row_group.num_rows); - - // generate ColumnChunkDesc objects for everything to be decoded (all input columns) - for (size_t i = 0; i < num_input_columns; ++i) { - auto col = _input_columns[i]; - // look up metadata - auto& col_meta = _metadata->get_column_metadata(rg.index, rg.source_index, col.schema_idx); - auto& schema = _metadata->get_schema(col.schema_idx); - - auto [type_width, clock_rate, converted_type] = - conversion_info(to_type_id(schema, _strings_to_categorical, _timestamp_type.id()), - _timestamp_type.id(), - schema.type, - schema.converted_type, - schema.type_length); - - chunks.push_back(ColumnChunkDesc(col_meta.total_compressed_size, - nullptr, - col_meta.num_values, - schema.type, - type_width, - row_group_start, - row_group_rows, - schema.max_definition_level, - schema.max_repetition_level, - _metadata->get_output_nesting_depth(col.schema_idx), - required_bits(schema.max_definition_level), - required_bits(schema.max_repetition_level), - col_meta.codec, - converted_type, - schema.logical_type, - schema.decimal_precision, - clock_rate, - i, - col.schema_idx)); - } - - remaining_rows -= row_group_rows; - } -} - -void reader::impl::compute_input_pass_row_group_info() -{ - // at this point, row_groups has already been filtered down to just the row groups we need to - // handle optional skip_rows/num_rows parameters. - auto const& row_groups_info = _file_itm_data.row_groups; - - // if the user hasn't specified an input size limit, read everything in a single pass. - if (_input_pass_read_limit == 0) { - _input_pass_row_group_offsets.push_back(0); - _input_pass_row_group_offsets.push_back(row_groups_info.size()); - return; - } - - // generate passes. make sure to account for the case where a single row group doesn't fit within - // - std::size_t const read_limit = - _input_pass_read_limit > 0 ? _input_pass_read_limit : std::numeric_limits::max(); - std::size_t cur_pass_byte_size = 0; - std::size_t cur_rg_start = 0; - std::size_t cur_row_count = 0; - _input_pass_row_group_offsets.push_back(0); - _input_pass_row_count.push_back(0); - - for (size_t cur_rg_index = 0; cur_rg_index < row_groups_info.size(); cur_rg_index++) { - auto const& rgi = row_groups_info[cur_rg_index]; - auto const& row_group = _metadata->get_row_group(rgi.index, rgi.source_index); - - // can we add this row group - if (cur_pass_byte_size + row_group.total_byte_size >= read_limit) { - // A single row group (the current one) is larger than the read limit: - // We always need to include at least one row group, so end the pass at the end of the current - // row group - if (cur_rg_start == cur_rg_index) { - _input_pass_row_group_offsets.push_back(cur_rg_index + 1); - _input_pass_row_count.push_back(cur_row_count + row_group.num_rows); - cur_rg_start = cur_rg_index + 1; - cur_pass_byte_size = 0; - } - // End the pass at the end of the previous row group - else { - _input_pass_row_group_offsets.push_back(cur_rg_index); - _input_pass_row_count.push_back(cur_row_count); - cur_rg_start = cur_rg_index; - cur_pass_byte_size = row_group.total_byte_size; - } - } else { - cur_pass_byte_size += row_group.total_byte_size; - } - cur_row_count += row_group.num_rows; - } - // add the last pass if necessary - if (_input_pass_row_group_offsets.back() != row_groups_info.size()) { - _input_pass_row_group_offsets.push_back(row_groups_info.size()); - _input_pass_row_count.push_back(cur_row_count); - } -} - -void reader::impl::setup_pass() -{ - // this will also cause the previous pass information to be deleted - _pass_itm_data = std::make_unique(); - - // setup row groups to be loaded for this pass - auto const row_group_start = _input_pass_row_group_offsets[_current_input_pass]; - auto const row_group_end = _input_pass_row_group_offsets[_current_input_pass + 1]; - auto const num_row_groups = row_group_end - row_group_start; - _pass_itm_data->row_groups.resize(num_row_groups); - std::copy(_file_itm_data.row_groups.begin() + row_group_start, - _file_itm_data.row_groups.begin() + row_group_end, - _pass_itm_data->row_groups.begin()); - - auto const num_passes = _input_pass_row_group_offsets.size() - 1; - CUDF_EXPECTS(_current_input_pass < num_passes, "Encountered an invalid read pass index"); - - auto const chunks_per_rowgroup = _input_columns.size(); - auto const num_chunks = chunks_per_rowgroup * num_row_groups; - - auto chunk_start = _file_itm_data.chunks.begin() + (row_group_start * chunks_per_rowgroup); - auto chunk_end = _file_itm_data.chunks.begin() + (row_group_end * chunks_per_rowgroup); - - _pass_itm_data->chunks = cudf::detail::hostdevice_vector(num_chunks, _stream); - std::copy(chunk_start, chunk_end, _pass_itm_data->chunks.begin()); - - // adjust skip_rows and num_rows by what's available in the row groups we are processing - if (num_passes == 1) { - _pass_itm_data->skip_rows = _file_itm_data.global_skip_rows; - _pass_itm_data->num_rows = _file_itm_data.global_num_rows; - } else { - auto const global_start_row = _file_itm_data.global_skip_rows; - auto const global_end_row = global_start_row + _file_itm_data.global_num_rows; - auto const start_row = std::max(_input_pass_row_count[_current_input_pass], global_start_row); - auto const end_row = std::min(_input_pass_row_count[_current_input_pass + 1], global_end_row); - - // skip_rows is always global in the sense that it is relative to the first row of - // everything we will be reading, regardless of what pass we are on. - // num_rows is how many rows we are reading this pass. - _pass_itm_data->skip_rows = global_start_row + _input_pass_row_count[_current_input_pass]; - _pass_itm_data->num_rows = end_row - start_row; - } -} - void reader::impl::load_and_decompress_data() { // This function should never be called if `num_rows == 0`. @@ -1034,359 +835,8 @@ void print_pages(cudf::detail::hostdevice_vector& pages, rmm::cuda_str p.str_bytes); } } - -void print_cumulative_page_info(cudf::detail::hostdevice_vector& pages, - rmm::device_uvector const& page_index, - rmm::device_uvector const& c_info, - rmm::cuda_stream_view stream) -{ - pages.device_to_host_sync(stream); - - printf("------------\nCumulative sizes by page\n"); - - std::vector schemas(pages.size()); - std::vector h_page_index(pages.size()); - CUDF_CUDA_TRY(cudaMemcpy( - h_page_index.data(), page_index.data(), sizeof(int) * pages.size(), cudaMemcpyDefault)); - std::vector h_cinfo(pages.size()); - CUDF_CUDA_TRY(cudaMemcpy( - h_cinfo.data(), c_info.data(), sizeof(cumulative_row_info) * pages.size(), cudaMemcpyDefault)); - auto schema_iter = cudf::detail::make_counting_transform_iterator( - 0, [&](size_type i) { return pages[h_page_index[i]].src_col_schema; }); - thrust::copy(thrust::seq, schema_iter, schema_iter + pages.size(), schemas.begin()); - auto last = thrust::unique(thrust::seq, schemas.begin(), schemas.end()); - schemas.resize(last - schemas.begin()); - printf("Num schemas: %lu\n", schemas.size()); - - for (size_t idx = 0; idx < schemas.size(); idx++) { - printf("Schema %d\n", schemas[idx]); - for (size_t pidx = 0; pidx < pages.size(); pidx++) { - auto const& page = pages[h_page_index[pidx]]; - if (page.flags & PAGEINFO_FLAGS_DICTIONARY || page.src_col_schema != schemas[idx]) { - continue; - } - printf("\tP: {%lu, %lu}\n", h_cinfo[pidx].row_count, h_cinfo[pidx].size_bytes); - } - } -} - -void print_cumulative_row_info(host_span sizes, - std::string const& label, - std::optional> splits = std::nullopt) -{ - if (splits.has_value()) { - printf("------------\nSplits\n"); - for (size_t idx = 0; idx < splits->size(); idx++) { - printf("{%lu, %lu}\n", splits.value()[idx].skip_rows, splits.value()[idx].num_rows); - } - } - - printf("------------\nCumulative sizes %s\n", label.c_str()); - for (size_t idx = 0; idx < sizes.size(); idx++) { - printf("{%lu, %lu, %d}", sizes[idx].row_count, sizes[idx].size_bytes, sizes[idx].key); - if (splits.has_value()) { - // if we have a split at this row count and this is the last instance of this row count - auto start = thrust::make_transform_iterator( - splits->begin(), [](chunk_read_info const& i) { return i.skip_rows; }); - auto end = start + splits->size(); - auto split = std::find(start, end, sizes[idx].row_count); - auto const split_index = [&]() -> int { - if (split != end && - ((idx == sizes.size() - 1) || (sizes[idx + 1].row_count > sizes[idx].row_count))) { - return static_cast(std::distance(start, split)); - } - return idx == 0 ? 0 : -1; - }(); - if (split_index >= 0) { - printf(" <-- split {%lu, %lu}", - splits.value()[split_index].skip_rows, - splits.value()[split_index].num_rows); - } - } - printf("\n"); - } -} #endif // PREPROCESS_DEBUG -/** - * @brief Functor which reduces two cumulative_row_info structs of the same key. - */ -struct cumulative_row_sum { - cumulative_row_info operator() - __device__(cumulative_row_info const& a, cumulative_row_info const& b) const - { - return cumulative_row_info{a.row_count + b.row_count, a.size_bytes + b.size_bytes, a.key}; - } -}; - -/** - * @brief Functor which computes the total data size for a given type of cudf column. - * - * In the case of strings, the return size does not include the chars themselves. That - * information is tracked separately (see PageInfo::str_bytes). - */ -struct row_size_functor { - __device__ size_t validity_size(size_t num_rows, bool nullable) - { - return nullable ? (cudf::util::div_rounding_up_safe(num_rows, size_t{32}) * 4) : 0; - } - - template - __device__ size_t operator()(size_t num_rows, bool nullable) - { - auto const element_size = sizeof(device_storage_type_t); - return (element_size * num_rows) + validity_size(num_rows, nullable); - } -}; - -template <> -__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) -{ - auto const offset_size = sizeof(size_type); - // NOTE: Adding the + 1 offset here isn't strictly correct. There will only be 1 extra offset - // for the entire column, whereas this is adding an extra offset per page. So we will get a - // small over-estimate of the real size of the order : # of pages * 4 bytes. It seems better - // to overestimate size somewhat than to underestimate it and potentially generate chunks - // that are too large. - return (offset_size * (num_rows + 1)) + validity_size(num_rows, nullable); -} - -template <> -__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) -{ - return validity_size(num_rows, nullable); -} - -template <> -__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) -{ - // only returns the size of offsets and validity. the size of the actual string chars - // is tracked separately. - auto const offset_size = sizeof(size_type); - // see note about offsets in the list_view template. - return (offset_size * (num_rows + 1)) + validity_size(num_rows, nullable); -} - -/** - * @brief Functor which computes the total output cudf data size for all of - * the data in this page. - * - * Sums across all nesting levels. - */ -struct get_cumulative_row_info { - PageInfo const* const pages; - - __device__ cumulative_row_info operator()(size_type index) - { - auto const& page = pages[index]; - if (page.flags & PAGEINFO_FLAGS_DICTIONARY) { - return cumulative_row_info{0, 0, page.src_col_schema}; - } - - // total nested size, not counting string data - auto iter = - cudf::detail::make_counting_transform_iterator(0, [page, index] __device__(size_type i) { - auto const& pni = page.nesting[i]; - return cudf::type_dispatcher( - data_type{pni.type}, row_size_functor{}, pni.size, pni.nullable); - }); - - size_t const row_count = static_cast(page.nesting[0].size); - return { - row_count, - thrust::reduce(thrust::seq, iter, iter + page.num_output_nesting_levels) + page.str_bytes, - page.src_col_schema}; - } -}; - -/** - * @brief Functor which computes the effective size of all input columns by page. - * - * For a given row, we want to find the cost of all pages for all columns involved - * in loading up to that row. The complication here is that not all pages are the - * same size between columns. Example: - * - * page row counts - * Column A: 0 <----> 100 <----> 200 - * Column B: 0 <---------------> 200 <--------> 400 - | - * if we decide to split at row 100, we don't really know the actual amount of bytes in column B - * at that point. So we have to proceed as if we are taking the bytes from all 200 rows of that - * page. Essentially, a conservative over-estimate of the real size. - */ -struct row_total_size { - cumulative_row_info const* c_info; - size_type const* key_offsets; - size_t num_keys; - - __device__ cumulative_row_info operator()(cumulative_row_info const& i) - { - // sum sizes for each input column at this row - size_t sum = 0; - for (int idx = 0; idx < num_keys; idx++) { - auto const start = key_offsets[idx]; - auto const end = key_offsets[idx + 1]; - auto iter = cudf::detail::make_counting_transform_iterator( - 0, [&] __device__(size_type i) { return c_info[i].row_count; }); - auto const page_index = - thrust::lower_bound(thrust::seq, iter + start, iter + end, i.row_count) - iter; - sum += c_info[page_index].size_bytes; - } - return {i.row_count, sum, i.key}; - } -}; - -/** - * @brief Given a vector of cumulative {row_count, byte_size} pairs and a chunk read - * limit, determine the set of splits. - * - * @param sizes Vector of cumulative {row_count, byte_size} pairs - * @param num_rows Total number of rows to read - * @param chunk_read_limit Limit on total number of bytes to be returned per read, for all columns - */ -std::vector find_splits(std::vector const& sizes, - size_t num_rows, - size_t chunk_read_limit) -{ - // now we have an array of {row_count, real output bytes}. just walk through it and generate - // splits. - // TODO: come up with a clever way to do this entirely in parallel. For now, as long as batch - // sizes are reasonably large, this shouldn't iterate too many times - std::vector splits; - { - size_t cur_pos = 0; - size_t cur_cumulative_size = 0; - size_t cur_row_count = 0; - auto start = thrust::make_transform_iterator(sizes.begin(), [&](cumulative_row_info const& i) { - return i.size_bytes - cur_cumulative_size; - }); - auto end = start + sizes.size(); - while (cur_row_count < num_rows) { - int64_t split_pos = - thrust::lower_bound(thrust::seq, start + cur_pos, end, chunk_read_limit) - start; - - // if we're past the end, or if the returned bucket is > than the chunk_read_limit, move back - // one. - if (static_cast(split_pos) >= sizes.size() || - (sizes[split_pos].size_bytes - cur_cumulative_size > chunk_read_limit)) { - split_pos--; - } - - // best-try. if we can't find something that'll fit, we have to go bigger. we're doing this in - // a loop because all of the cumulative sizes for all the pages are sorted into one big list. - // so if we had two columns, both of which had an entry {1000, 10000}, that entry would be in - // the list twice. so we have to iterate until we skip past all of them. The idea is that we - // either do this, or we have to call unique() on the input first. - while (split_pos < (static_cast(sizes.size()) - 1) && - (split_pos < 0 || sizes[split_pos].row_count == cur_row_count)) { - split_pos++; - } - - auto const start_row = cur_row_count; - cur_row_count = sizes[split_pos].row_count; - splits.push_back(chunk_read_info{start_row, cur_row_count - start_row}); - cur_pos = split_pos; - cur_cumulative_size = sizes[split_pos].size_bytes; - } - } - // print_cumulative_row_info(sizes, "adjusted", splits); - - return splits; -} - -/** - * @brief Given a set of pages that have had their sizes computed by nesting level and - * a limit on total read size, generate a set of {skip_rows, num_rows} pairs representing - * a set of reads that will generate output columns of total size <= `chunk_read_limit` bytes. - * - * @param pages All pages in the file - * @param id Additional intermediate information required to process the pages - * @param num_rows Total number of rows to read - * @param chunk_read_limit Limit on total number of bytes to be returned per read, for all columns - * @param stream CUDA stream to use - */ -std::vector compute_splits(cudf::detail::hostdevice_vector& pages, - pass_intermediate_data const& id, - size_t num_rows, - size_t chunk_read_limit, - rmm::cuda_stream_view stream) -{ - auto const& page_keys = id.page_keys; - auto const& page_index = id.page_index; - - // generate cumulative row counts and sizes - rmm::device_uvector c_info(page_keys.size(), stream); - // convert PageInfo to cumulative_row_info - auto page_input = thrust::make_transform_iterator(page_index.begin(), - get_cumulative_row_info{pages.device_ptr()}); - thrust::inclusive_scan_by_key(rmm::exec_policy(stream), - page_keys.begin(), - page_keys.end(), - page_input, - c_info.begin(), - thrust::equal_to{}, - cumulative_row_sum{}); - // print_cumulative_page_info(pages, page_index, c_info, stream); - - // sort by row count - rmm::device_uvector c_info_sorted{c_info, stream}; - thrust::sort(rmm::exec_policy(stream), - c_info_sorted.begin(), - c_info_sorted.end(), - [] __device__(cumulative_row_info const& a, cumulative_row_info const& b) { - return a.row_count < b.row_count; - }); - - // std::vector h_c_info_sorted(c_info_sorted.size()); - // CUDF_CUDA_TRY(cudaMemcpy(h_c_info_sorted.data(), - // c_info_sorted.data(), - // sizeof(cumulative_row_info) * c_info_sorted.size(), - // cudaMemcpyDefault)); - // print_cumulative_row_info(h_c_info_sorted, "raw"); - - // generate key offsets (offsets to the start of each partition of keys). worst case is 1 page per - // key - rmm::device_uvector key_offsets(page_keys.size() + 1, stream); - auto const key_offsets_end = thrust::reduce_by_key(rmm::exec_policy(stream), - page_keys.begin(), - page_keys.end(), - thrust::make_constant_iterator(1), - thrust::make_discard_iterator(), - key_offsets.begin()) - .second; - size_t const num_unique_keys = key_offsets_end - key_offsets.begin(); - thrust::exclusive_scan( - rmm::exec_policy(stream), key_offsets.begin(), key_offsets.end(), key_offsets.begin()); - - // adjust the cumulative info such that for each row count, the size includes any pages that span - // that row count. this is so that if we have this case: - // page row counts - // Column A: 0 <----> 100 <----> 200 - // Column B: 0 <---------------> 200 <--------> 400 - // | - // if we decide to split at row 100, we don't really know the actual amount of bytes in column B - // at that point. So we have to proceed as if we are taking the bytes from all 200 rows of that - // page. - // - rmm::device_uvector aggregated_info(c_info.size(), stream); - thrust::transform(rmm::exec_policy(stream), - c_info_sorted.begin(), - c_info_sorted.end(), - aggregated_info.begin(), - row_total_size{c_info.data(), key_offsets.data(), num_unique_keys}); - - // bring back to the cpu - std::vector h_aggregated_info(aggregated_info.size()); - CUDF_CUDA_TRY(cudaMemcpyAsync(h_aggregated_info.data(), - aggregated_info.data(), - sizeof(cumulative_row_info) * c_info.size(), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); - - return find_splits(h_aggregated_info, num_rows, chunk_read_limit); -} - struct get_page_chunk_idx { __device__ size_type operator()(PageInfo const& page) { return page.chunk_idx; } }; @@ -1822,12 +1272,8 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re _pass_itm_data->page_keys = std::move(page_keys); _pass_itm_data->page_index = std::move(page_index); - // compute splits if necessary. otherwise return a single split representing - // the whole file. - _pass_itm_data->output_chunk_read_info = - _output_chunk_read_limit > 0 - ? compute_splits(pages, *_pass_itm_data, num_rows, chunk_read_limit, _stream) - : std::vector{{skip_rows, num_rows}}; + // compute splits for the pass + compute_splits_for_pass(); } void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses_custom_row_bounds) diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp index f3a43cbc63c..dd049d401cf 100644 --- a/cpp/src/io/utilities/column_buffer.cpp +++ b/cpp/src/io/utilities/column_buffer.cpp @@ -51,19 +51,21 @@ std::unique_ptr gather_column_buffer::make_string_column_impl(rmm::cuda_ return make_strings_column(*_strings, stream, _mr); } -void inline_column_buffer::allocate_strings_data(rmm::cuda_stream_view stream) +void cudf::io::detail::inline_column_buffer::allocate_strings_data(rmm::cuda_stream_view stream) { CUDF_EXPECTS(type.id() == type_id::STRING, "allocate_strings_data called for non-string column"); // size + 1 for final offset. _string_data will be initialized later. _data = create_data(data_type{type_id::INT32}, size + 1, stream, _mr); } -void inline_column_buffer::create_string_data(size_t num_bytes, rmm::cuda_stream_view stream) +void cudf::io::detail::inline_column_buffer::create_string_data(size_t num_bytes, + rmm::cuda_stream_view stream) { _string_data = rmm::device_buffer(num_bytes, stream, _mr); } -std::unique_ptr inline_column_buffer::make_string_column_impl(rmm::cuda_stream_view stream) +std::unique_ptr cudf::io::detail::inline_column_buffer::make_string_column_impl( + rmm::cuda_stream_view stream) { // no need for copies, just transfer ownership of the data_buffers to the columns auto const state = mask_state::UNALLOCATED; @@ -324,7 +326,7 @@ std::unique_ptr empty_like(column_buffer_base& buffer, } using pointer_type = gather_column_buffer; -using string_type = inline_column_buffer; +using string_type = cudf::io::detail::inline_column_buffer; using pointer_column_buffer = column_buffer_base; using string_column_buffer = column_buffer_base; From 053da82810ad78286602cfd09e37f8a22cb0a15b Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Tue, 10 Oct 2023 13:28:01 -0400 Subject: [PATCH 04/11] Make parquet schema index type consistent (#14256) While working on parquet schema issue I noticed that the parent and child index didn't match. Discussion ensued and `size_type` was decided. Authors: - Mike Wilson (https://github.com/hyperbolic2346) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - MithunR (https://github.com/mythrocks) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/14256 --- cpp/src/io/parquet/parquet.hpp | 6 ++++-- cpp/src/io/parquet/reader_impl_helpers.cpp | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index c5993d73dec..dbec59670c7 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -18,6 +18,8 @@ #include "parquet_common.hpp" +#include + #include #include @@ -152,8 +154,8 @@ struct SchemaElement { // The following fields are filled in later during schema initialization int max_definition_level = 0; int max_repetition_level = 0; - int parent_idx = 0; - std::vector children_idx; + size_type parent_idx = 0; + std::vector children_idx; bool operator==(SchemaElement const& other) const { diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index 171cf07da3e..040c6403f57 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -213,7 +213,7 @@ void metadata::sanitize_schema() // add a struct child and move this element's children to the struct schema_elem.converted_type = LIST; schema_elem.repetition_type = OPTIONAL; - auto const struct_node_idx = schema.size(); + auto const struct_node_idx = static_cast(schema.size()); SchemaElement struct_elem; struct_elem.name = "struct_node"; From 5039d043a08e7ea7e5656bab60a6fced4dfa2f1d Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 10 Oct 2023 15:06:24 -0400 Subject: [PATCH 05/11] Expose stream parameter in public strings APIs (#14260) Add stream parameter to public APIs: - `cudf::strings::strip()` - `cudf::strings::slice_strings()` - `cudf::strings::pad()` - `cudf::strings::zfill()` - `cudf::strings::wrap()` Also cleaned up some of the doxygen comments and added stream-tests. Reference #13744 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/14260 --- cpp/include/cudf/strings/padding.hpp | 14 ++-- cpp/include/cudf/strings/slice.hpp | 30 +++++---- cpp/include/cudf/strings/strip.hpp | 4 +- cpp/include/cudf/strings/wrap.hpp | 14 ++-- cpp/src/strings/padding.cu | 6 +- cpp/src/strings/slice.cu | 7 +- cpp/src/strings/strip.cu | 5 +- cpp/src/strings/wrap.cu | 8 +-- cpp/tests/CMakeLists.txt | 4 +- cpp/tests/streams/strings/strings_tests.cpp | 71 +++++++++++++++++++++ 10 files changed, 125 insertions(+), 38 deletions(-) create mode 100644 cpp/tests/streams/strings/strings_tests.cpp diff --git a/cpp/include/cudf/strings/padding.hpp b/cpp/include/cudf/strings/padding.hpp index 7699159fbea..f0cb351eeda 100644 --- a/cpp/include/cudf/strings/padding.hpp +++ b/cpp/include/cudf/strings/padding.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -51,6 +51,7 @@ namespace strings { * Default is pad right (left justify) * @param fill_char Single UTF-8 character to use for padding; * Default is the space character + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New column with padded strings */ @@ -59,6 +60,7 @@ std::unique_ptr pad( size_type width, side_type side = side_type::RIGHT, std::string_view fill_char = " ", + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -79,14 +81,16 @@ std::unique_ptr pad( * r is now ['001234','-09876','+00.34','-342567', '0002+2'] * @endcode * - * @param input Strings instance for this operation. - * @param width The minimum number of characters for each string. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of strings. + * @param input Strings instance for this operation + * @param width The minimum number of characters for each string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column of strings */ std::unique_ptr zfill( strings_column_view const& input, size_type width, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/slice.hpp b/cpp/include/cudf/strings/slice.hpp index 5f2c71725eb..f106663be9b 100644 --- a/cpp/include/cudf/strings/slice.hpp +++ b/cpp/include/cudf/strings/slice.hpp @@ -50,18 +50,20 @@ namespace strings { * r2 is now ["lo","ob"] * @endcode * - * @param strings Strings column for this operation. - * @param start First character position to begin the substring. - * @param stop Last character position (exclusive) to end the substring. - * @param step Distance between input characters retrieved. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column with sorted elements of this instance. + * @param input Strings column for this operation + * @param start First character position to begin the substring + * @param stop Last character position (exclusive) to end the substring + * @param step Distance between input characters retrieved + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column with sorted elements of this instance */ std::unique_ptr slice_strings( - strings_column_view const& strings, + strings_column_view const& input, numeric_scalar const& start = numeric_scalar(0, false), numeric_scalar const& stop = numeric_scalar(0, false), numeric_scalar const& step = numeric_scalar(1), + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -95,16 +97,18 @@ std::unique_ptr slice_strings( * @throw cudf::logic_error if starts and stops are not same integer type. * @throw cudf::logic_error if starts or stops contains nulls. * - * @param strings Strings column for this operation. - * @param starts First character positions to begin the substring. - * @param stops Last character (exclusive) positions to end the substring. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column with sorted elements of this instance. + * @param input Strings column for this operation + * @param starts First character positions to begin the substring + * @param stops Last character (exclusive) positions to end the substring + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column with sorted elements of this instance */ std::unique_ptr slice_strings( - strings_column_view const& strings, + strings_column_view const& input, column_view const& starts, column_view const& stops, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/strip.hpp b/cpp/include/cudf/strings/strip.hpp index adf3b291144..556d6805ac3 100644 --- a/cpp/include/cudf/strings/strip.hpp +++ b/cpp/include/cudf/strings/strip.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -57,6 +57,7 @@ namespace strings { * string; Default is both * @param to_strip UTF-8 encoded characters to strip from each string; * Default is empty string which indicates strip whitespace characters + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column. */ @@ -64,6 +65,7 @@ std::unique_ptr strip( strings_column_view const& input, side_type side = side_type::BOTH, string_scalar const& to_strip = string_scalar(""), + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/wrap.hpp b/cpp/include/cudf/strings/wrap.hpp index 8d2d43c7f0f..efdc3e62aff 100644 --- a/cpp/include/cudf/strings/wrap.hpp +++ b/cpp/include/cudf/strings/wrap.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -57,14 +57,16 @@ namespace strings { * wrapped_string_tbl = ["the quick\nbrown fox\njumped over\nthe lazy\nbrown dog", "hello, world"] * ``` * - * @param[in] strings String column. - * @param[in] width Maximum character width of a line within each string. - * @param[in] mr Device memory resource used to allocate the returned column's device memory - * @return Column of wrapped strings. + * @param input String column + * @param width Maximum character width of a line within each string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return Column of wrapped strings */ std::unique_ptr wrap( - strings_column_view const& strings, + strings_column_view const& input, size_type width, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/src/strings/padding.cu b/cpp/src/strings/padding.cu index c501a8bf7b4..850ccaa4535 100644 --- a/cpp/src/strings/padding.cu +++ b/cpp/src/strings/padding.cu @@ -168,18 +168,20 @@ std::unique_ptr pad(strings_column_view const& input, size_type width, side_type side, std::string_view fill_char, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::pad(input, width, side, fill_char, cudf::get_default_stream(), mr); + return detail::pad(input, width, side, fill_char, stream, mr); } std::unique_ptr zfill(strings_column_view const& input, size_type width, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::zfill(input, width, cudf::get_default_stream(), mr); + return detail::zfill(input, width, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/slice.cu b/cpp/src/strings/slice.cu index cce6a19a5a6..5a1fee92c7d 100644 --- a/cpp/src/strings/slice.cu +++ b/cpp/src/strings/slice.cu @@ -248,20 +248,21 @@ std::unique_ptr slice_strings(strings_column_view const& strings, numeric_scalar const& start, numeric_scalar const& stop, numeric_scalar const& step, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::slice_strings(strings, start, stop, step, cudf::get_default_stream(), mr); + return detail::slice_strings(strings, start, stop, step, stream, mr); } std::unique_ptr slice_strings(strings_column_view const& strings, column_view const& starts_column, column_view const& stops_column, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::slice_strings( - strings, starts_column, stops_column, cudf::get_default_stream(), mr); + return detail::slice_strings(strings, starts_column, stops_column, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/strip.cu b/cpp/src/strings/strip.cu index 6fb7c671a87..26df76850f7 100644 --- a/cpp/src/strings/strip.cu +++ b/cpp/src/strings/strip.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -86,10 +86,11 @@ std::unique_ptr strip(strings_column_view const& input, std::unique_ptr strip(strings_column_view const& input, side_type side, string_scalar const& to_strip, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::strip(input, side, to_strip, cudf::get_default_stream(), mr); + return detail::strip(input, side, to_strip, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/wrap.cu b/cpp/src/strings/wrap.cu index 335908d65d1..aa87a663964 100644 --- a/cpp/src/strings/wrap.cu +++ b/cpp/src/strings/wrap.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,10 +19,9 @@ #include #include #include -#include -#include #include #include +#include #include #include @@ -133,10 +132,11 @@ std::unique_ptr wrap(strings_column_view const& strings, std::unique_ptr wrap(strings_column_view const& strings, size_type width, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::wrap(strings, width, cudf::get_default_stream(), mr); + return detail::wrap(strings, width, stream, mr); } } // namespace strings diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index ffaba7d6fa7..b15a6c41d39 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -633,8 +633,8 @@ ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_DICTIONARY_TEST streams/dictionary_test.cpp STREAM_MODE testing) ConfigureTest( - STREAM_STRINGS_TEST streams/strings/case_test.cpp streams/strings/find_test.cpp STREAM_MODE - testing + STREAM_STRINGS_TEST streams/strings/case_test.cpp streams/strings/find_test.cpp + streams/strings/strings_tests.cpp STREAM_MODE testing ) ConfigureTest(STREAM_SORTING_TEST streams/sorting_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_TEXT_TEST streams/text/ngrams_test.cpp STREAM_MODE testing) diff --git a/cpp/tests/streams/strings/strings_tests.cpp b/cpp/tests/streams/strings/strings_tests.cpp new file mode 100644 index 00000000000..0db467a6895 --- /dev/null +++ b/cpp/tests/streams/strings/strings_tests.cpp @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include +#include + +#include + +class StringsTest : public cudf::test::BaseFixture {}; + +TEST_F(StringsTest, Strip) +{ + auto input = cudf::test::strings_column_wrapper({" aBc ", " ", "aaaa ", "\tb"}); + auto view = cudf::strings_column_view(input); + + auto const strip = cudf::string_scalar(" ", true, cudf::test::get_default_stream()); + auto const side = cudf::strings::side_type::BOTH; + cudf::strings::strip(view, side, strip, cudf::test::get_default_stream()); +} + +TEST_F(StringsTest, Pad) +{ + auto input = cudf::test::strings_column_wrapper({"333", "", "4444", "1"}); + auto view = cudf::strings_column_view(input); + + auto const side = cudf::strings::side_type::BOTH; + cudf::strings::pad(view, 6, side, " ", cudf::test::get_default_stream()); + cudf::strings::zfill(view, 6, cudf::test::get_default_stream()); +} + +TEST_F(StringsTest, Wrap) +{ + auto input = cudf::test::strings_column_wrapper({"the quick brown fox jumped"}); + auto view = cudf::strings_column_view(input); + + cudf::strings::wrap(view, 6, cudf::test::get_default_stream()); +} + +TEST_F(StringsTest, Slice) +{ + auto input = cudf::test::strings_column_wrapper({"hello", "these", "are test strings"}); + auto view = cudf::strings_column_view(input); + + auto start = cudf::numeric_scalar(2, true, cudf::test::get_default_stream()); + auto stop = cudf::numeric_scalar(5, true, cudf::test::get_default_stream()); + auto step = cudf::numeric_scalar(1, true, cudf::test::get_default_stream()); + cudf::strings::slice_strings(view, start, stop, step, cudf::test::get_default_stream()); + + auto starts = cudf::test::fixed_width_column_wrapper({1, 2, 3}); + auto stops = cudf::test::fixed_width_column_wrapper({4, 5, 6}); + cudf::strings::slice_strings(view, starts, stops, cudf::test::get_default_stream()); +} From c0c7ed8405c679752439081ee1b42b22658264c9 Mon Sep 17 00:00:00 2001 From: Martin Marenz Date: Wed, 11 Oct 2023 00:04:58 +0200 Subject: [PATCH 06/11] Add `bytes_per_second` to transpose benchmark (#14170) This patch relates to #13735. Benchmark: [transpose_benchmark.txt](https://github.com/rapidsai/cudf/files/12699834/transpose_benchmark.txt) Authors: - Martin Marenz (https://github.com/Blonck) - Mark Harris (https://github.com/harrism) Approvers: - Mark Harris (https://github.com/harrism) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/14170 --- cpp/benchmarks/transpose/transpose.cpp | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/cpp/benchmarks/transpose/transpose.cpp b/cpp/benchmarks/transpose/transpose.cpp index 2f41bda4b88..c2737325462 100644 --- a/cpp/benchmarks/transpose/transpose.cpp +++ b/cpp/benchmarks/transpose/transpose.cpp @@ -20,17 +20,19 @@ #include #include #include +#include #include #include static void BM_transpose(benchmark::State& state) { - auto count = state.range(0); + auto count = state.range(0); + constexpr auto column_type_id = cudf::type_id::INT32; auto int_column_generator = thrust::make_transform_iterator(thrust::counting_iterator(0), [count](int i) { return cudf::make_numeric_column( - cudf::data_type{cudf::type_id::INT32}, count, cudf::mask_state::ALL_VALID); + cudf::data_type{column_type_id}, count, cudf::mask_state::ALL_VALID); }); auto input_table = cudf::table(std::vector(int_column_generator, int_column_generator + count)); @@ -40,6 +42,17 @@ static void BM_transpose(benchmark::State& state) cuda_event_timer raii(state, true); auto output = cudf::transpose(input); } + + // Collect memory statistics. + auto const bytes_read = static_cast(input.num_columns()) * input.num_rows() * + sizeof(cudf::id_to_type); + auto const bytes_written = bytes_read; + // Account for nullability in input and output. + auto const null_bytes = 2 * static_cast(input.num_columns()) * + cudf::bitmask_allocation_size_bytes(input.num_rows()); + + state.SetBytesProcessed(static_cast(state.iterations()) * + (bytes_read + bytes_written + null_bytes)); } class Transpose : public cudf::benchmark {}; From 0ed7725416879a824ee5b96292eda2d1048a9ada Mon Sep 17 00:00:00 2001 From: Martin Marenz Date: Wed, 11 Oct 2023 00:06:23 +0200 Subject: [PATCH 07/11] Add `bytes_per_second` to shift benchmark (#13950) Adds `bytes_per_second` to `SHIFT_BENCH`. This patch relates to #13735. Authors: - Martin Marenz (https://github.com/Blonck) Approvers: - Karthikeyan (https://github.com/karthikeyann) - Nghia Truong (https://github.com/ttnghia) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cudf/pull/13950 --- cpp/benchmarks/copying/shift.cu | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/cpp/benchmarks/copying/shift.cu b/cpp/benchmarks/copying/shift.cu index 460100a8fe9..e1169e3bcd6 100644 --- a/cpp/benchmarks/copying/shift.cu +++ b/cpp/benchmarks/copying/shift.cu @@ -56,18 +56,32 @@ static void BM_shift(benchmark::State& state) cudf::size_type size = state.range(0); cudf::size_type offset = size * (static_cast(shift_factor) / 100.0); - auto const input_table = - create_sequence_table({cudf::type_to_id()}, - row_count{size}, - use_validity ? std::optional{1.0} : std::nullopt); + auto constexpr column_type_id = cudf::type_id::INT32; + using column_type = cudf::id_to_type; + + auto const input_table = create_sequence_table( + {column_type_id}, row_count{size}, use_validity ? std::optional{1.0} : std::nullopt); cudf::column_view input{input_table->get_column(0)}; - auto fill = use_validity ? make_scalar() : make_scalar(777); + auto fill = use_validity ? make_scalar() : make_scalar(777); for (auto _ : state) { cuda_event_timer raii(state, true); auto output = cudf::shift(input, offset, *fill); } + + auto const elems_read = (size - offset); + auto const bytes_read = elems_read * sizeof(column_type); + + // If 'use_validity' is false, the fill value is a number, and the entire column + // (excluding the null bitmask) needs to be written. On the other hand, if 'use_validity' + // is true, only the elements that can be shifted are written, along with the full null bitmask. + auto const elems_written = use_validity ? (size - offset) : size; + auto const bytes_written = elems_written * sizeof(column_type); + auto const null_bytes = use_validity ? 2 * cudf::bitmask_allocation_size_bytes(size) : 0; + + state.SetBytesProcessed(static_cast(state.iterations()) * + (bytes_written + bytes_read + null_bytes)); } class Shift : public cudf::benchmark {}; From aa8b0f8e4e71a8e2b076656e0a8bf00bfc15ecb8 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 10 Oct 2023 16:14:51 -0700 Subject: [PATCH 08/11] Handle empty string correctly in Parquet statistics (#14257) An empty string should be a valid minimum value for a string column, but the current parquet writer considers an empty string to have no value when writing the column chunk statistics. This PR changes all fields in the Statistics struct to be `thrust::optional` to help distinguish between a valid empty string and no value. Authors: - Ed Seidl (https://github.com/etseidl) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/14257 --- .../io/parquet/compact_protocol_reader.cpp | 15 ++-- .../io/parquet/compact_protocol_writer.cpp | 12 +-- cpp/src/io/parquet/parquet.hpp | 18 ++-- cpp/src/io/parquet/predicate_pushdown.cpp | 14 +-- cpp/tests/io/parquet_test.cpp | 85 +++++++++++++++---- 5 files changed, 104 insertions(+), 40 deletions(-) diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index 81d1be64a45..1a345ee0750 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -767,12 +767,15 @@ bool CompactProtocolReader::read(ColumnIndex* c) bool CompactProtocolReader::read(Statistics* s) { - auto op = std::make_tuple(parquet_field_binary(1, s->max), - parquet_field_binary(2, s->min), - parquet_field_int64(3, s->null_count), - parquet_field_int64(4, s->distinct_count), - parquet_field_binary(5, s->max_value), - parquet_field_binary(6, s->min_value)); + using optional_binary = parquet_field_optional, parquet_field_binary>; + using optional_int64 = parquet_field_optional; + + auto op = std::make_tuple(optional_binary(1, s->max), + optional_binary(2, s->min), + optional_int64(3, s->null_count), + optional_int64(4, s->distinct_count), + optional_binary(5, s->max_value), + optional_binary(6, s->min_value)); return function_builder(this, op); } diff --git a/cpp/src/io/parquet/compact_protocol_writer.cpp b/cpp/src/io/parquet/compact_protocol_writer.cpp index 9adc8767880..00810269d3c 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.cpp +++ b/cpp/src/io/parquet/compact_protocol_writer.cpp @@ -195,12 +195,12 @@ size_t CompactProtocolWriter::write(ColumnChunkMetaData const& s) size_t CompactProtocolWriter::write(Statistics const& s) { CompactProtocolFieldWriter c(*this); - if (not s.max.empty()) { c.field_binary(1, s.max); } - if (not s.min.empty()) { c.field_binary(2, s.min); } - if (s.null_count != -1) { c.field_int(3, s.null_count); } - if (s.distinct_count != -1) { c.field_int(4, s.distinct_count); } - if (not s.max_value.empty()) { c.field_binary(5, s.max_value); } - if (not s.min_value.empty()) { c.field_binary(6, s.min_value); } + if (s.max.has_value()) { c.field_binary(1, s.max.value()); } + if (s.min.has_value()) { c.field_binary(2, s.min.value()); } + if (s.null_count.has_value()) { c.field_int(3, s.null_count.value()); } + if (s.distinct_count.has_value()) { c.field_int(4, s.distinct_count.value()); } + if (s.max_value.has_value()) { c.field_binary(5, s.max_value.value()); } + if (s.min_value.has_value()) { c.field_binary(6, s.min_value.value()); } return c.value(); } diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index dbec59670c7..1cd16ac6102 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -215,12 +215,18 @@ struct SchemaElement { * @brief Thrift-derived struct describing column chunk statistics */ struct Statistics { - std::vector max; // deprecated max value in signed comparison order - std::vector min; // deprecated min value in signed comparison order - int64_t null_count = -1; // count of null values in the column - int64_t distinct_count = -1; // count of distinct values occurring - std::vector max_value; // max value for column determined by ColumnOrder - std::vector min_value; // min value for column determined by ColumnOrder + // deprecated max value in signed comparison order + thrust::optional> max; + // deprecated min value in signed comparison order + thrust::optional> min; + // count of null values in the column + thrust::optional null_count; + // count of distinct values occurring + thrust::optional distinct_count; + // max value for column determined by ColumnOrder + thrust::optional> max_value; + // min value for column determined by ColumnOrder + thrust::optional> min_value; }; /** diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index 9083be1c2dd..a5851de3c20 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -150,12 +150,14 @@ struct stats_caster { { } - void set_index(size_type index, std::vector const& binary_value, Type const type) + void set_index(size_type index, + thrust::optional> const& binary_value, + Type const type) { - if (!binary_value.empty()) { - val[index] = convert(binary_value.data(), binary_value.size(), type); + if (binary_value.has_value()) { + val[index] = convert(binary_value.value().data(), binary_value.value().size(), type); } - if (binary_value.empty()) { + if (not binary_value.has_value()) { clear_bit_unsafe(null_mask.data(), index); null_count++; } @@ -210,10 +212,10 @@ struct stats_caster { auto const& row_group = per_file_metadata[src_idx].row_groups[rg_idx]; auto const& colchunk = row_group.columns[col_idx]; // To support deprecated min, max fields. - auto const& min_value = colchunk.meta_data.statistics.min_value.size() > 0 + auto const& min_value = colchunk.meta_data.statistics.min_value.has_value() ? colchunk.meta_data.statistics.min_value : colchunk.meta_data.statistics.min; - auto const& max_value = colchunk.meta_data.statistics.max_value.size() > 0 + auto const& max_value = colchunk.meta_data.statistics.max_value.has_value() ? colchunk.meta_data.statistics.max_value : colchunk.meta_data.statistics.max; // translate binary data to Type then to diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 3e5d7033e60..fa85e3a4a1d 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -4161,8 +4161,10 @@ TEST_P(ParquetV2Test, LargeColumnIndex) // check trunc(page.min) <= stats.min && trun(page.max) >= stats.max auto const ptype = fmd.schema[c + 1].type; auto const ctype = fmd.schema[c + 1].converted_type; - EXPECT_TRUE(compare_binary(ci.min_values[0], stats.min_value, ptype, ctype) <= 0); - EXPECT_TRUE(compare_binary(ci.max_values[0], stats.max_value, ptype, ctype) >= 0); + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + EXPECT_TRUE(compare_binary(ci.min_values[0], stats.min_value.value(), ptype, ctype) <= 0); + EXPECT_TRUE(compare_binary(ci.max_values[0], stats.max_value.value(), ptype, ctype) >= 0); } } } @@ -4242,6 +4244,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndex) auto const ci = read_column_index(source, chunk); auto const stats = get_statistics(chunk); + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + // schema indexing starts at 1 auto const ptype = fmd.schema[c + 1].type; auto const ctype = fmd.schema[c + 1].converted_type; @@ -4250,10 +4255,10 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndex) EXPECT_FALSE(ci.null_pages[p]); // null_counts should always be 0 EXPECT_EQ(ci.null_counts[p], 0); - EXPECT_TRUE(compare_binary(stats.min_value, ci.min_values[p], ptype, ctype) <= 0); + EXPECT_TRUE(compare_binary(stats.min_value.value(), ci.min_values[p], ptype, ctype) <= 0); } for (size_t p = 0; p < ci.max_values.size(); p++) - EXPECT_TRUE(compare_binary(stats.max_value, ci.max_values[p], ptype, ctype) >= 0); + EXPECT_TRUE(compare_binary(stats.max_value.value(), ci.max_values[p], ptype, ctype) >= 0); } } } @@ -4344,7 +4349,10 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) auto const stats = get_statistics(chunk); // should be half nulls, except no nulls in column 0 - EXPECT_EQ(stats.null_count, c == 0 ? 0 : num_rows / 2); + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + ASSERT_TRUE(stats.null_count.has_value()); + EXPECT_EQ(stats.null_count.value(), c == 0 ? 0 : num_rows / 2); // schema indexing starts at 1 auto const ptype = fmd.schema[c + 1].type; @@ -4356,10 +4364,10 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) } else { EXPECT_EQ(ci.null_counts[p], 0); } - EXPECT_TRUE(compare_binary(stats.min_value, ci.min_values[p], ptype, ctype) <= 0); + EXPECT_TRUE(compare_binary(stats.min_value.value(), ci.min_values[p], ptype, ctype) <= 0); } for (size_t p = 0; p < ci.max_values.size(); p++) { - EXPECT_TRUE(compare_binary(stats.max_value, ci.max_values[p], ptype, ctype) >= 0); + EXPECT_TRUE(compare_binary(stats.max_value.value(), ci.max_values[p], ptype, ctype) >= 0); } } } @@ -4436,7 +4444,12 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) auto const stats = get_statistics(chunk); // there should be no nulls except column 1 which is all nulls - EXPECT_EQ(stats.null_count, c == 1 ? num_rows : 0); + if (c != 1) { + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + } + ASSERT_TRUE(stats.null_count.has_value()); + EXPECT_EQ(stats.null_count.value(), c == 1 ? num_rows : 0); // schema indexing starts at 1 auto const ptype = fmd.schema[c + 1].type; @@ -4449,12 +4462,12 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) } if (not ci.null_pages[p]) { EXPECT_EQ(ci.null_counts[p], 0); - EXPECT_TRUE(compare_binary(stats.min_value, ci.min_values[p], ptype, ctype) <= 0); + EXPECT_TRUE(compare_binary(stats.min_value.value(), ci.min_values[p], ptype, ctype) <= 0); } } for (size_t p = 0; p < ci.max_values.size(); p++) { if (not ci.null_pages[p]) { - EXPECT_TRUE(compare_binary(stats.max_value, ci.max_values[p], ptype, ctype) >= 0); + EXPECT_TRUE(compare_binary(stats.max_value.value(), ci.max_values[p], ptype, ctype) >= 0); } } } @@ -4533,13 +4546,16 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStruct) auto const ci = read_column_index(source, chunk); auto const stats = get_statistics(chunk); + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + auto const ptype = fmd.schema[colidx].type; auto const ctype = fmd.schema[colidx].converted_type; for (size_t p = 0; p < ci.min_values.size(); p++) { - EXPECT_TRUE(compare_binary(stats.min_value, ci.min_values[p], ptype, ctype) <= 0); + EXPECT_TRUE(compare_binary(stats.min_value.value(), ci.min_values[p], ptype, ctype) <= 0); } for (size_t p = 0; p < ci.max_values.size(); p++) { - EXPECT_TRUE(compare_binary(stats.max_value, ci.max_values[p], ptype, ctype) >= 0); + EXPECT_TRUE(compare_binary(stats.max_value.value(), ci.max_values[p], ptype, ctype) >= 0); } } } @@ -4829,11 +4845,14 @@ TEST_F(ParquetWriterTest, CheckColumnIndexTruncation) auto const ci = read_column_index(source, chunk); auto const stats = get_statistics(chunk); + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + // check trunc(page.min) <= stats.min && trun(page.max) >= stats.max auto const ptype = fmd.schema[c + 1].type; auto const ctype = fmd.schema[c + 1].converted_type; - EXPECT_TRUE(compare_binary(ci.min_values[0], stats.min_value, ptype, ctype) <= 0); - EXPECT_TRUE(compare_binary(ci.max_values[0], stats.max_value, ptype, ctype) >= 0); + EXPECT_TRUE(compare_binary(ci.min_values[0], stats.min_value.value(), ptype, ctype) <= 0); + EXPECT_TRUE(compare_binary(ci.max_values[0], stats.max_value.value(), ptype, ctype) >= 0); // check that truncated values == expected EXPECT_EQ(memcmp(ci.min_values[0].data(), truncated_min[c], ci.min_values[0].size()), 0); @@ -4890,8 +4909,10 @@ TEST_F(ParquetWriterTest, BinaryColumnIndexTruncation) // check trunc(page.min) <= stats.min && trun(page.max) >= stats.max auto const ptype = fmd.schema[c + 1].type; auto const ctype = fmd.schema[c + 1].converted_type; - EXPECT_TRUE(compare_binary(ci.min_values[0], stats.min_value, ptype, ctype) <= 0); - EXPECT_TRUE(compare_binary(ci.max_values[0], stats.max_value, ptype, ctype) >= 0); + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + EXPECT_TRUE(compare_binary(ci.min_values[0], stats.min_value.value(), ptype, ctype) <= 0); + EXPECT_TRUE(compare_binary(ci.max_values[0], stats.max_value.value(), ptype, ctype) >= 0); // check that truncated values == expected EXPECT_EQ(ci.min_values[0], truncated_min[c]); @@ -6737,6 +6758,38 @@ TEST_P(ParquetV2Test, CheckEncodings) } } +TEST_F(ParquetWriterTest, EmptyMinStringStatistics) +{ + char const* const min_val = ""; + char const* const max_val = "zzz"; + std::vector strings{min_val, max_val, "pining", "for", "the", "fjords"}; + + column_wrapper string_col{strings.begin(), strings.end()}; + auto const output = table_view{{string_col}}; + auto const filepath = temp_env->get_temp_filepath("EmptyMinStringStatistics.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, output); + cudf::io::write_parquet(out_opts); + + auto const source = cudf::io::datasource::create(filepath); + cudf::io::parquet::detail::FileMetaData fmd; + read_footer(source, &fmd); + + ASSERT_TRUE(fmd.row_groups.size() > 0); + ASSERT_TRUE(fmd.row_groups[0].columns.size() > 0); + auto const& chunk = fmd.row_groups[0].columns[0]; + auto const stats = get_statistics(chunk); + + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + auto const min_value = std::string{reinterpret_cast(stats.min_value.value().data()), + stats.min_value.value().size()}; + auto const max_value = std::string{reinterpret_cast(stats.max_value.value().data()), + stats.max_value.value().size()}; + EXPECT_EQ(min_value, std::string(min_val)); + EXPECT_EQ(max_value, std::string(max_val)); +} + TEST_F(ParquetReaderTest, RepeatedNoAnnotations) { constexpr unsigned char repeated_bytes[] = { From b17904dbaa4de1a162fcb4a0f64862f9f83b976f Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Tue, 10 Oct 2023 19:51:02 -0500 Subject: [PATCH 09/11] Add in java bindings for DataSource (#14254) This PR adds DataSource Java bindings. It also fixes a small bug in CUDF that made it so the bindings would not work for anything but CSV. Authors: - Robert (Bobby) Evans (https://github.com/revans2) Approvers: - Jason Lowe (https://github.com/jlowe) - Vukasin Milovanovic (https://github.com/vuule) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/14254 --- cpp/src/io/utilities/datasource.cpp | 8 + java/src/main/java/ai/rapids/cudf/Cuda.java | 24 +- .../main/java/ai/rapids/cudf/DataSource.java | 189 ++++++++++++++ .../java/ai/rapids/cudf/DataSourceHelper.java | 44 ++++ .../ai/rapids/cudf/DeviceMemoryBuffer.java | 6 +- .../ai/rapids/cudf/MultiBufferDataSource.java | 230 +++++++++++++++++ .../ai/rapids/cudf/ParquetChunkedReader.java | 45 +++- java/src/main/java/ai/rapids/cudf/Table.java | 99 +++++++- java/src/main/native/CMakeLists.txt | 1 + java/src/main/native/src/ChunkedReaderJni.cpp | 36 ++- java/src/main/native/src/CudfJni.cpp | 8 + .../main/native/src/DataSourceHelperJni.cpp | 237 ++++++++++++++++++ java/src/main/native/src/TableJni.cpp | 212 +++++++++++++++- java/src/main/native/src/cudf_jni_apis.hpp | 8 + .../test/java/ai/rapids/cudf/TableTest.java | 225 +++++++++++++++++ 15 files changed, 1358 insertions(+), 14 deletions(-) create mode 100644 java/src/main/java/ai/rapids/cudf/DataSource.java create mode 100644 java/src/main/java/ai/rapids/cudf/DataSourceHelper.java create mode 100644 java/src/main/java/ai/rapids/cudf/MultiBufferDataSource.java create mode 100644 java/src/main/native/src/DataSourceHelperJni.cpp diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 7a7121aa91d..5cdd92ce3b7 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -375,6 +375,14 @@ class user_datasource_wrapper : public datasource { return source->device_read(offset, size, stream); } + std::future device_read_async(size_t offset, + size_t size, + uint8_t* dst, + rmm::cuda_stream_view stream) override + { + return source->device_read_async(offset, size, dst, stream); + } + [[nodiscard]] size_t size() const override { return source->size(); } private: diff --git a/java/src/main/java/ai/rapids/cudf/Cuda.java b/java/src/main/java/ai/rapids/cudf/Cuda.java index e1298e29925..7cc3d30a9cf 100755 --- a/java/src/main/java/ai/rapids/cudf/Cuda.java +++ b/java/src/main/java/ai/rapids/cudf/Cuda.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,9 +15,6 @@ */ package ai.rapids.cudf; -import ai.rapids.cudf.NvtxColor; -import ai.rapids.cudf.NvtxRange; - import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -90,6 +87,21 @@ private Stream() { this.id = -1; } + private Stream(long id) { + this.cleaner = null; + this.id = id; + } + + /** + * Wrap a given stream ID to make it accessible. + */ + static Stream wrap(long id) { + if (id == -1) { + return DEFAULT_STREAM; + } + return new Stream(id); + } + /** * Have this stream not execute new work until the work recorded in event completes. * @param event the event to wait on. @@ -122,7 +134,9 @@ public synchronized void close() { cleaner.delRef(); } if (closed) { - cleaner.logRefCountDebug("double free " + this); + if (cleaner != null) { + cleaner.logRefCountDebug("double free " + this); + } throw new IllegalStateException("Close called too many times " + this); } if (cleaner != null) { diff --git a/java/src/main/java/ai/rapids/cudf/DataSource.java b/java/src/main/java/ai/rapids/cudf/DataSource.java new file mode 100644 index 00000000000..1e5893235df --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/DataSource.java @@ -0,0 +1,189 @@ +/* + * + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.HashMap; + +/** + * Base class that can be used to provide data dynamically to CUDF. This follows somewhat + * closely with cudf::io::datasource. There are a few main differences. + *
+ * First this does not expose async device reads. It will call the non-async device read API + * instead. This might be added in the future, but there was no direct use case for it in java + * right now to warrant the added complexity. + *
+ * Second there is no implementation of the device read API that returns a buffer instead of + * writing into one. This is not used by CUDF yet so testing an implementation that isn't used + * didn't feel ideal. If it is needed we will add one in the future. + */ +public abstract class DataSource implements AutoCloseable { + private static final Logger log = LoggerFactory.getLogger(DataSource.class); + + /** + * This is used to keep track of the HostMemoryBuffers in java land so the C++ layer + * does not have to do it. + */ + private final HashMap cachedBuffers = new HashMap<>(); + + @Override + public void close() { + if (!cachedBuffers.isEmpty()) { + throw new IllegalStateException("DataSource closed before all returned host buffers were closed"); + } + } + + /** + * Get the size of the source in bytes. + */ + public abstract long size(); + + /** + * Read data from the source at the given offset. Return a HostMemoryBuffer for the data + * that was read. + * @param offset where to start reading from. + * @param amount the maximum number of bytes to read. + * @return a buffer that points to the data. + * @throws IOException on any error. + */ + public abstract HostMemoryBuffer hostRead(long offset, long amount) throws IOException; + + + /** + * Called when the buffer returned from hostRead is done. The default is to close the buffer. + */ + protected void onHostBufferDone(HostMemoryBuffer buffer) { + if (buffer != null) { + buffer.close(); + } + } + + /** + * Read data from the source at the given offset into dest. Note that dest should not be closed, + * and no reference to it can outlive the call to hostRead. The target amount to read is + * dest's length. + * @param offset the offset to start reading from in the source. + * @param dest where to write the data. + * @return the actual number of bytes written to dest. + */ + public abstract long hostRead(long offset, HostMemoryBuffer dest) throws IOException; + + /** + * Return true if this supports reading directly to the device else false. The default is + * no device support. This cannot change dynamically. It is typically read just once. + */ + public boolean supportsDeviceRead() { + return false; + } + + /** + * Get the size cutoff between device reads and host reads when device reads are supported. + * Anything larger than the cutoff will be a device read and anything smaller will be a + * host read. By default, the cutoff is 0 so all reads will be device reads if device reads + * are supported. + */ + public long getDeviceReadCutoff() { + return 0; + } + + /** + * Read data from the source at the given offset into dest. Note that dest should not be closed, + * and no reference to it can outlive the call to hostRead. The target amount to read is + * dest's length. + * @param offset the offset to start reading from + * @param dest where to write the data. + * @param stream the stream to do the copy on. + * @return the actual number of bytes written to dest. + */ + public long deviceRead(long offset, DeviceMemoryBuffer dest, + Cuda.Stream stream) throws IOException { + throw new IllegalStateException("Device read is not implemented"); + } + + ///////////////////////////////////////////////// + // Internal methods called from JNI + ///////////////////////////////////////////////// + + private static class NoopCleaner extends MemoryBuffer.MemoryBufferCleaner { + @Override + protected boolean cleanImpl(boolean logErrorIfNotClean) { + return true; + } + + @Override + public boolean isClean() { + return true; + } + } + private static final NoopCleaner cleaner = new NoopCleaner(); + + // Called from JNI + private void onHostBufferDone(long bufferId) { + HostMemoryBuffer hmb = cachedBuffers.remove(bufferId); + if (hmb != null) { + onHostBufferDone(hmb); + } else { + // Called from C++ destructor so avoid throwing... + log.warn("Got a close callback for a buffer we could not find " + bufferId); + } + } + + // Called from JNI + private long hostRead(long offset, long amount, long dst) throws IOException { + if (amount < 0) { + throw new IllegalArgumentException("Cannot allocate more than " + Long.MAX_VALUE + " bytes"); + } + try (HostMemoryBuffer dstBuffer = new HostMemoryBuffer(dst, amount, cleaner)) { + return hostRead(offset, dstBuffer); + } + } + + // Called from JNI + private long[] hostReadBuff(long offset, long amount) throws IOException { + if (amount < 0) { + throw new IllegalArgumentException("Cannot read more than " + Long.MAX_VALUE + " bytes"); + } + HostMemoryBuffer buff = hostRead(offset, amount); + long[] ret = new long[3]; + if (buff != null) { + long id = buff.id; + if (cachedBuffers.put(id, buff) != null) { + throw new IllegalStateException("Already had a buffer cached for " + buff); + } + ret[0] = buff.address; + ret[1] = buff.length; + ret[2] = id; + } // else they are all 0 because java does that already + return ret; + } + + // Called from JNI + private long deviceRead(long offset, long amount, long dst, long stream) throws IOException { + if (amount < 0) { + throw new IllegalArgumentException("Cannot read more than " + Long.MAX_VALUE + " bytes"); + } + Cuda.Stream strm = Cuda.Stream.wrap(stream); + try (DeviceMemoryBuffer dstBuffer = new DeviceMemoryBuffer(dst, amount, cleaner)) { + return deviceRead(offset, dstBuffer, strm); + } + } +} diff --git a/java/src/main/java/ai/rapids/cudf/DataSourceHelper.java b/java/src/main/java/ai/rapids/cudf/DataSourceHelper.java new file mode 100644 index 00000000000..5d4dcb8e4e7 --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/DataSourceHelper.java @@ -0,0 +1,44 @@ +/* + * + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +/** + * This is here because we need some JNI methods to work with a DataSource, but + * we also want to cache callback methods at startup for performance reasons. If + * we put both in the same class we will get a deadlock because of how we load + * the JNI. We have a static block that blocks loading the class until the JNI + * library is loaded and the JNI library cannot load until the class is loaded + * and cached. This breaks the loop. + */ +class DataSourceHelper { + static { + NativeDepsLoader.loadNativeDeps(); + } + + static long createWrapperDataSource(DataSource ds) { + return createWrapperDataSource(ds, ds.size(), ds.supportsDeviceRead(), + ds.getDeviceReadCutoff()); + } + + private static native long createWrapperDataSource(DataSource ds, long size, + boolean deviceReadSupport, + long deviceReadCutoff); + + static native void destroyWrapperDataSource(long handle); +} diff --git a/java/src/main/java/ai/rapids/cudf/DeviceMemoryBuffer.java b/java/src/main/java/ai/rapids/cudf/DeviceMemoryBuffer.java index c4d9bdb8f91..9eab607ed0b 100644 --- a/java/src/main/java/ai/rapids/cudf/DeviceMemoryBuffer.java +++ b/java/src/main/java/ai/rapids/cudf/DeviceMemoryBuffer.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -112,6 +112,10 @@ public static DeviceMemoryBuffer fromRmm(long address, long lengthInBytes, long return new DeviceMemoryBuffer(address, lengthInBytes, rmmBufferAddress); } + DeviceMemoryBuffer(long address, long lengthInBytes, MemoryBufferCleaner cleaner) { + super(address, lengthInBytes, cleaner); + } + DeviceMemoryBuffer(long address, long lengthInBytes, long rmmBufferAddress) { super(address, lengthInBytes, new RmmDeviceBufferCleaner(rmmBufferAddress)); } diff --git a/java/src/main/java/ai/rapids/cudf/MultiBufferDataSource.java b/java/src/main/java/ai/rapids/cudf/MultiBufferDataSource.java new file mode 100644 index 00000000000..6986b6a7fec --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/MultiBufferDataSource.java @@ -0,0 +1,230 @@ +/* + * + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +/** + * This is a DataSource that can take multiple HostMemoryBuffers. They + * are treated as if they are all part of a single file connected end to end. + */ +public class MultiBufferDataSource extends DataSource { + private final long sizeInBytes; + private final HostMemoryBuffer[] hostBuffers; + private final long[] startOffsets; + private final HostMemoryAllocator allocator; + + // Metrics + private long hostReads = 0; + private long hostReadBytes = 0; + private long devReads = 0; + private long devReadBytes = 0; + + /** + * Create a new data source backed by multiple buffers. + * @param buffers the buffers that will back the data source. + */ + public MultiBufferDataSource(HostMemoryBuffer ... buffers) { + this(DefaultHostMemoryAllocator.get(), buffers); + } + + /** + * Create a new data source backed by multiple buffers. + * @param allocator the allocator to use for host buffers, if needed. + * @param buffers the buffers that will back the data source. + */ + public MultiBufferDataSource(HostMemoryAllocator allocator, HostMemoryBuffer ... buffers) { + int numBuffers = buffers.length; + hostBuffers = new HostMemoryBuffer[numBuffers]; + startOffsets = new long[numBuffers]; + + long currentOffset = 0; + for (int i = 0; i < numBuffers; i++) { + HostMemoryBuffer hmb = buffers[i]; + hmb.incRefCount(); + hostBuffers[i] = hmb; + startOffsets[i] = currentOffset; + currentOffset += hmb.getLength(); + } + sizeInBytes = currentOffset; + this.allocator = allocator; + } + + @Override + public long size() { + return sizeInBytes; + } + + private int getStartBufferIndexForOffset(long offset) { + assert (offset >= 0); + + // It is super common to read from the start or end of a file (the header or footer) + // so special case them + if (offset == 0) { + return 0; + } + int startIndex = 0; + int endIndex = startOffsets.length - 1; + if (offset >= startOffsets[endIndex]) { + return endIndex; + } + while (startIndex != endIndex) { + int midIndex = (int)(((long)startIndex + endIndex) / 2); + long midStartOffset = startOffsets[midIndex]; + if (offset >= midStartOffset) { + // It is either in mid or after mid. + if (midIndex == endIndex || offset <= startOffsets[midIndex + 1]) { + // We found it in mid + return midIndex; + } else { + // It is after mid + startIndex = midIndex + 1; + } + } else { + // It is before mid + endIndex = midIndex - 1; + } + } + return startIndex; + } + + + interface DoCopy { + void copyFromHostBuffer(T dest, long destOffset, HostMemoryBuffer src, + long srcOffset, long srcAmount); + } + + private long read(long offset, T dest, DoCopy doCopy) { + assert (offset >= 0); + long realOffset = Math.min(offset, sizeInBytes); + long realAmount = Math.min(sizeInBytes - realOffset, dest.getLength()); + + int index = getStartBufferIndexForOffset(realOffset); + + HostMemoryBuffer buffer = hostBuffers[index]; + long bufferOffset = realOffset - startOffsets[index]; + long bufferAmount = Math.min(buffer.length - bufferOffset, realAmount); + long remainingAmount = realAmount; + long currentOffset = realOffset; + long outputOffset = 0; + + while (remainingAmount > 0) { + doCopy.copyFromHostBuffer(dest, outputOffset, buffer, + bufferOffset, bufferAmount); + remainingAmount -= bufferAmount; + outputOffset += bufferAmount; + currentOffset += bufferAmount; + index++; + if (index < hostBuffers.length) { + buffer = hostBuffers[index]; + bufferOffset = currentOffset - startOffsets[index]; + bufferAmount = Math.min(buffer.length - bufferOffset, remainingAmount); + } + } + + return realAmount; + } + + @Override + public HostMemoryBuffer hostRead(long offset, long amount) { + assert (offset >= 0); + assert (amount >= 0); + long realOffset = Math.min(offset, sizeInBytes); + long realAmount = Math.min(sizeInBytes - realOffset, amount); + + int index = getStartBufferIndexForOffset(realOffset); + + HostMemoryBuffer buffer = hostBuffers[index]; + long bufferOffset = realOffset - startOffsets[index]; + long bufferAmount = Math.min(buffer.length - bufferOffset, realAmount); + if (bufferAmount == realAmount) { + hostReads += 1; + hostReadBytes += realAmount; + // It all fits in a single buffer, so do a zero copy operation + return buffer.slice(bufferOffset, bufferAmount); + } else { + // We will have to allocate a new buffer and copy data into it. + boolean success = false; + HostMemoryBuffer ret = allocator.allocate(realAmount, true); + try { + long amountRead = read(offset, ret, HostMemoryBuffer::copyFromHostBuffer); + assert(amountRead == realAmount); + hostReads += 1; + hostReadBytes += amountRead; + success = true; + return ret; + } finally { + if (!success) { + ret.close(); + } + } + } + } + + @Override + public long hostRead(long offset, HostMemoryBuffer dest) { + long ret = read(offset, dest, HostMemoryBuffer::copyFromHostBuffer); + hostReads += 1; + hostReadBytes += ret; + return ret; + } + + @Override + public boolean supportsDeviceRead() { + return true; + } + + @Override + public long deviceRead(long offset, DeviceMemoryBuffer dest, + Cuda.Stream stream) { + long ret = read(offset, dest, (destParam, destOffset, src, srcOffset, srcAmount) -> + destParam.copyFromHostBufferAsync(destOffset, src, srcOffset, srcAmount, stream)); + devReads += 1; + devReadBytes += ret; + return ret; + } + + + @Override + public void close() { + try { + super.close(); + } finally { + for (HostMemoryBuffer hmb: hostBuffers) { + if (hmb != null) { + hmb.close(); + } + } + } + } + + public long getHostReads() { + return hostReads; + } + + public long getHostReadBytes() { + return hostReadBytes; + } + + public long getDevReads() { + return devReads; + } + + public long getDevReadBytes() { + return devReadBytes; + } +} diff --git a/java/src/main/java/ai/rapids/cudf/ParquetChunkedReader.java b/java/src/main/java/ai/rapids/cudf/ParquetChunkedReader.java index c34336ac73f..17d59b757c3 100644 --- a/java/src/main/java/ai/rapids/cudf/ParquetChunkedReader.java +++ b/java/src/main/java/ai/rapids/cudf/ParquetChunkedReader.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -51,7 +51,7 @@ public ParquetChunkedReader(long chunkSizeByteLimit, ParquetOptions opts, File f handle = create(chunkSizeByteLimit, opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), filePath.getAbsolutePath(), 0, 0, opts.timeUnit().typeId.getNativeId()); - if(handle == 0) { + if (handle == 0) { throw new IllegalStateException("Cannot create native chunked Parquet reader object."); } } @@ -71,18 +71,45 @@ public ParquetChunkedReader(long chunkSizeByteLimit, ParquetOptions opts, HostMe handle = create(chunkSizeByteLimit, opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), null, buffer.getAddress() + offset, len, opts.timeUnit().typeId.getNativeId()); - if(handle == 0) { + if (handle == 0) { throw new IllegalStateException("Cannot create native chunked Parquet reader object."); } } + /** + * Construct a reader instance from a DataSource + * @param chunkSizeByteLimit Limit on total number of bytes to be returned per read, + * or 0 if there is no limit. + * @param opts The options for Parquet reading. + * @param ds the data source to read from + */ + public ParquetChunkedReader(long chunkSizeByteLimit, ParquetOptions opts, DataSource ds) { + dataSourceHandle = DataSourceHelper.createWrapperDataSource(ds); + if (dataSourceHandle == 0) { + throw new IllegalStateException("Cannot create native datasource object"); + } + + boolean passed = false; + try { + handle = createWithDataSource(chunkSizeByteLimit, opts.getIncludeColumnNames(), + opts.getReadBinaryAsString(), opts.timeUnit().typeId.getNativeId(), + dataSourceHandle); + passed = true; + } finally { + if (!passed) { + DataSourceHelper.destroyWrapperDataSource(dataSourceHandle); + dataSourceHandle = 0; + } + } + } + /** * Check if the given file has anything left to read. * * @return A boolean value indicating if there is more data to read from file. */ public boolean hasNext() { - if(handle == 0) { + if (handle == 0) { throw new IllegalStateException("Native chunked Parquet reader object may have been closed."); } @@ -104,7 +131,7 @@ public boolean hasNext() { * @return A table of new rows reading from the given file. */ public Table readChunk() { - if(handle == 0) { + if (handle == 0) { throw new IllegalStateException("Native chunked Parquet reader object may have been closed."); } @@ -118,6 +145,10 @@ public void close() { close(handle); handle = 0; } + if (dataSourceHandle != 0) { + DataSourceHelper.destroyWrapperDataSource(dataSourceHandle); + dataSourceHandle = 0; + } } @@ -131,6 +162,7 @@ public void close() { */ private long handle; + private long dataSourceHandle = 0; /** * Create a native chunked Parquet reader object on heap and return its memory address. @@ -147,6 +179,9 @@ public void close() { private static native long create(long chunkSizeByteLimit, String[] filterColumnNames, boolean[] binaryToString, String filePath, long bufferAddrs, long length, int timeUnit); + private static native long createWithDataSource(long chunkedSizeByteLimit, + String[] filterColumnNames, boolean[] binaryToString, int timeUnit, long dataSourceHandle); + private static native boolean hasNext(long handle); private static native long[] readChunk(long handle); diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 51a33ebb72f..3bd1e3f25a7 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -235,6 +235,14 @@ private static native long[] readCSV(String[] columnNames, byte comment, String[] nullValues, String[] trueValues, String[] falseValues) throws CudfException; + private static native long[] readCSVFromDataSource(String[] columnNames, + int[] dTypeIds, int[] dTypeScales, + String[] filterColumnNames, + int headerRow, byte delim, int quoteStyle, byte quote, + byte comment, String[] nullValues, + String[] trueValues, String[] falseValues, + long dataSourceHandle) throws CudfException; + /** * read JSON data and return a pointer to a TableWithMeta object. */ @@ -244,6 +252,12 @@ private static native long readJSON(String[] columnNames, boolean dayFirst, boolean lines, boolean recoverWithNulls) throws CudfException; + private static native long readJSONFromDataSource(String[] columnNames, + int[] dTypeIds, int[] dTypeScales, + boolean dayFirst, boolean lines, + boolean recoverWithNulls, + long dsHandle) throws CudfException; + private static native long readAndInferJSON(long address, long length, boolean dayFirst, boolean lines, boolean recoverWithNulls) throws CudfException; @@ -260,6 +274,10 @@ private static native long readAndInferJSON(long address, long length, private static native long[] readParquet(String[] filterColumnNames, boolean[] binaryToString, String filePath, long address, long length, int timeUnit) throws CudfException; + private static native long[] readParquetFromDataSource(String[] filterColumnNames, + boolean[] binaryToString, int timeUnit, + long dataSourceHandle) throws CudfException; + /** * Read in Avro formatted data. * @param filterColumnNames name of the columns to read, or an empty array if we want to read @@ -271,6 +289,9 @@ private static native long[] readParquet(String[] filterColumnNames, boolean[] b private static native long[] readAvro(String[] filterColumnNames, String filePath, long address, long length) throws CudfException; + private static native long[] readAvroFromDataSource(String[] filterColumnNames, + long dataSourceHandle) throws CudfException; + /** * Setup everything to write parquet formatted data to a file. * @param columnNames names that correspond to the table columns @@ -372,6 +393,11 @@ private static native long[] readORC(String[] filterColumnNames, boolean usingNumPyTypes, int timeUnit, String[] decimal128Columns) throws CudfException; + private static native long[] readORCFromDataSource(String[] filterColumnNames, + boolean usingNumPyTypes, int timeUnit, + String[] decimal128Columns, + long dataSourceHandle) throws CudfException; + /** * Setup everything to write ORC formatted data to a file. * @param columnNames names that correspond to the table columns @@ -881,6 +907,27 @@ public static Table readCSV(Schema schema, CSVOptions opts, HostMemoryBuffer buf opts.getFalseValues())); } + public static Table readCSV(Schema schema, CSVOptions opts, DataSource ds) { + long dsHandle = DataSourceHelper.createWrapperDataSource(ds); + try { + return new Table(readCSVFromDataSource(schema.getColumnNames(), + schema.getTypeIds(), + schema.getTypeScales(), + opts.getIncludeColumnNames(), + opts.getHeaderRow(), + opts.getDelim(), + opts.getQuoteStyle().nativeId, + opts.getQuote(), + opts.getComment(), + opts.getNullValues(), + opts.getTrueValues(), + opts.getFalseValues(), + dsHandle)); + } finally { + DataSourceHelper.destroyWrapperDataSource(dsHandle); + } + } + private static native void writeCSVToFile(long table, String[] columnNames, boolean includeHeader, @@ -1128,6 +1175,24 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b } } + /** + * Read JSON formatted data. + * @param schema the schema of the data. You may use Schema.INFERRED to infer the schema. + * @param opts various JSON parsing options. + * @param ds the DataSource to read from. + * @return the data parsed as a table on the GPU. + */ + public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds) { + long dsHandle = DataSourceHelper.createWrapperDataSource(ds); + try (TableWithMeta twm = new TableWithMeta(readJSONFromDataSource(schema.getColumnNames(), + schema.getTypeIds(), schema.getTypeScales(), opts.isDayFirst(), opts.isLines(), + opts.isRecoverWithNull(), dsHandle))) { + return gatherJSONColumns(schema, twm); + } finally { + DataSourceHelper.destroyWrapperDataSource(dsHandle); + } + } + /** * Read a Parquet file using the default ParquetOptions. * @param path the local file to read. @@ -1214,6 +1279,17 @@ public static Table readParquet(ParquetOptions opts, HostMemoryBuffer buffer, null, buffer.getAddress() + offset, len, opts.timeUnit().typeId.getNativeId())); } + public static Table readParquet(ParquetOptions opts, DataSource ds) { + long dataSourceHandle = DataSourceHelper.createWrapperDataSource(ds); + try { + return new Table(readParquetFromDataSource(opts.getIncludeColumnNames(), + opts.getReadBinaryAsString(), opts.timeUnit().typeId.getNativeId(), + dataSourceHandle)); + } finally { + DataSourceHelper.destroyWrapperDataSource(dataSourceHandle); + } + } + /** * Read an Avro file using the default AvroOptions. * @param path the local file to read. @@ -1297,6 +1373,16 @@ public static Table readAvro(AvroOptions opts, HostMemoryBuffer buffer, null, buffer.getAddress() + offset, len)); } + public static Table readAvro(AvroOptions opts, DataSource ds) { + long dataSourceHandle = DataSourceHelper.createWrapperDataSource(ds); + try { + return new Table(readAvroFromDataSource(opts.getIncludeColumnNames(), + dataSourceHandle)); + } finally { + DataSourceHelper.destroyWrapperDataSource(dataSourceHandle); + } + } + /** * Read a ORC file using the default ORCOptions. * @param path the local file to read. @@ -1388,6 +1474,17 @@ public static Table readORC(ORCOptions opts, HostMemoryBuffer buffer, opts.getDecimal128Columns())); } + public static Table readORC(ORCOptions opts, DataSource ds) { + long dataSourceHandle = DataSourceHelper.createWrapperDataSource(ds); + try { + return new Table(readORCFromDataSource(opts.getIncludeColumnNames(), + opts.usingNumPyTypes(), opts.timeUnit().typeId.getNativeId(), + opts.getDecimal128Columns(), dataSourceHandle)); + } finally { + DataSourceHelper.destroyWrapperDataSource(dataSourceHandle); + } + } + private static class ParquetTableWriter extends TableWriter { HostBufferConsumer consumer; @@ -2262,7 +2359,7 @@ public Table dropDuplicates(int[] keyColumns, DuplicateKeepOption keep, boolean /** * Count how many rows in the table are distinct from one another. - * @param nullEqual if nulls should be considered equal to each other or not. + * @param nullsEqual if nulls should be considered equal to each other or not. */ public int distinctCount(NullEquality nullsEqual) { return distinctCount(nativeHandle, nullsEqual.nullsEqual); diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index 0dcfee2cffe..01161a03dd4 100644 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -135,6 +135,7 @@ add_library( src/ColumnViewJni.cu src/CompiledExpression.cpp src/ContiguousTableJni.cpp + src/DataSourceHelperJni.cpp src/HashJoinJni.cpp src/HostMemoryBufferNativeUtilsJni.cpp src/NvcompJni.cpp diff --git a/java/src/main/native/src/ChunkedReaderJni.cpp b/java/src/main/native/src/ChunkedReaderJni.cpp index 8d0a8bdbfe7..0044385f267 100644 --- a/java/src/main/native/src/ChunkedReaderJni.cpp +++ b/java/src/main/native/src/ChunkedReaderJni.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -85,6 +85,40 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_create( CATCH_STD(env, 0); } +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_createWithDataSource( + JNIEnv *env, jclass, jlong chunk_read_limit, jobjectArray filter_col_names, + jbooleanArray j_col_binary_read, jint unit, jlong ds_handle) { + JNI_NULL_CHECK(env, j_col_binary_read, "Null col_binary_read", 0); + JNI_NULL_CHECK(env, ds_handle, "Null DataSouurce", 0); + + try { + cudf::jni::auto_set_device(env); + + cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names); + + // TODO: This variable is unused now, but we still don't know what to do with it yet. + // As such, it needs to stay here for a little more time before we decide to use it again, + // or remove it completely. + cudf::jni::native_jbooleanArray n_col_binary_read(env, j_col_binary_read); + (void)n_col_binary_read; + + auto ds = reinterpret_cast(ds_handle); + cudf::io::source_info source{ds}; + + auto opts_builder = cudf::io::parquet_reader_options::builder(source); + if (n_filter_col_names.size() > 0) { + opts_builder = opts_builder.columns(n_filter_col_names.as_cpp_vector()); + } + auto const read_opts = opts_builder.convert_strings_to_categories(false) + .timestamp_type(cudf::data_type(static_cast(unit))) + .build(); + + return reinterpret_cast(new cudf::io::chunked_parquet_reader( + static_cast(chunk_read_limit), read_opts)); + } + CATCH_STD(env, 0); +} + JNIEXPORT jboolean JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_hasNext(JNIEnv *env, jclass, jlong handle) { JNI_NULL_CHECK(env, handle, "handle is null", false); diff --git a/java/src/main/native/src/CudfJni.cpp b/java/src/main/native/src/CudfJni.cpp index 0f143086451..d0a25d449a6 100644 --- a/java/src/main/native/src/CudfJni.cpp +++ b/java/src/main/native/src/CudfJni.cpp @@ -175,6 +175,14 @@ JNIEXPORT jint JNI_OnLoad(JavaVM *vm, void *) { return JNI_ERR; } + if (!cudf::jni::cache_data_source_jni(env)) { + if (!env->ExceptionCheck()) { + env->ThrowNew(env->FindClass("java/lang/RuntimeException"), + "Unable to locate data source helper methods needed by JNI"); + } + return JNI_ERR; + } + return cudf::jni::MINIMUM_JNI_VERSION; } diff --git a/java/src/main/native/src/DataSourceHelperJni.cpp b/java/src/main/native/src/DataSourceHelperJni.cpp new file mode 100644 index 00000000000..8d0e4d36413 --- /dev/null +++ b/java/src/main/native/src/DataSourceHelperJni.cpp @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "cudf_jni_apis.hpp" +#include "jni_utils.hpp" + +namespace { + +#define DATA_SOURCE_CLASS "ai/rapids/cudf/DataSource" + +jclass DataSource_jclass; +jmethodID hostRead_method; +jmethodID hostReadBuff_method; +jmethodID onHostBufferDone_method; +jmethodID deviceRead_method; + +} // anonymous namespace + +namespace cudf { +namespace jni { +bool cache_data_source_jni(JNIEnv *env) { + jclass cls = env->FindClass(DATA_SOURCE_CLASS); + if (cls == nullptr) { + return false; + } + + hostRead_method = env->GetMethodID(cls, "hostRead", "(JJJ)J"); + if (hostRead_method == nullptr) { + return false; + } + + hostReadBuff_method = env->GetMethodID(cls, "hostReadBuff", "(JJ)[J"); + if (hostReadBuff_method == nullptr) { + return false; + } + + onHostBufferDone_method = env->GetMethodID(cls, "onHostBufferDone", "(J)V"); + if (onHostBufferDone_method == nullptr) { + return false; + } + + deviceRead_method = env->GetMethodID(cls, "deviceRead", "(JJJJ)J"); + if (deviceRead_method == nullptr) { + return false; + } + + // Convert local reference to global so it cannot be garbage collected. + DataSource_jclass = static_cast(env->NewGlobalRef(cls)); + if (DataSource_jclass == nullptr) { + return false; + } + return true; +} + +void release_data_source_jni(JNIEnv *env) { + DataSource_jclass = cudf::jni::del_global_ref(env, DataSource_jclass); +} + +class host_buffer_done_callback { +public: + explicit host_buffer_done_callback(JavaVM *jvm, jobject ds, long id) : jvm(jvm), ds(ds), id(id) {} + + host_buffer_done_callback(host_buffer_done_callback const &other) = delete; + host_buffer_done_callback(host_buffer_done_callback &&other) + : jvm(other.jvm), ds(other.ds), id(other.id) { + other.jvm = nullptr; + other.ds = nullptr; + other.id = -1; + } + + host_buffer_done_callback &operator=(host_buffer_done_callback &&other) = delete; + host_buffer_done_callback &operator=(host_buffer_done_callback const &other) = delete; + + ~host_buffer_done_callback() { + // because we are in a destructor we cannot throw an exception, so for now we are + // just going to keep the java exceptions around and have them be thrown when this + // thread returns to the JVM. It might be kind of confusing, but we will not lose + // them. + if (jvm != nullptr) { + // We cannot throw an exception in the destructor, so this is really best effort + JNIEnv *env = nullptr; + if (jvm->GetEnv(reinterpret_cast(&env), cudf::jni::MINIMUM_JNI_VERSION) == JNI_OK) { + env->CallVoidMethod(this->ds, onHostBufferDone_method, id); + } + } + } + +private: + JavaVM *jvm; + jobject ds; + long id; +}; + +class jni_datasource : public cudf::io::datasource { +public: + explicit jni_datasource(JNIEnv *env, jobject ds, size_t ds_size, bool device_read_supported, + size_t device_read_cutoff) + : ds_size(ds_size), device_read_supported(device_read_supported), + device_read_cutoff(device_read_cutoff) { + if (env->GetJavaVM(&jvm) < 0) { + throw std::runtime_error("GetJavaVM failed"); + } + this->ds = add_global_ref(env, ds); + } + + virtual ~jni_datasource() { + JNIEnv *env = nullptr; + if (jvm->GetEnv(reinterpret_cast(&env), cudf::jni::MINIMUM_JNI_VERSION) == JNI_OK) { + ds = del_global_ref(env, ds); + } + ds = nullptr; + } + + std::unique_ptr host_read(size_t offset, size_t size) override { + JNIEnv *env = nullptr; + if (jvm->GetEnv(reinterpret_cast(&env), cudf::jni::MINIMUM_JNI_VERSION) != JNI_OK) { + throw cudf::jni::jni_exception("Could not load JNIEnv"); + } + + jlongArray jbuffer_info = + static_cast(env->CallObjectMethod(this->ds, hostReadBuff_method, offset, size)); + if (env->ExceptionOccurred()) { + throw cudf::jni::jni_exception("Java exception in hostRead"); + } + + cudf::jni::native_jlongArray buffer_info(env, jbuffer_info); + auto ptr = reinterpret_cast(buffer_info[0]); + size_t length = buffer_info[1]; + long id = buffer_info[2]; + + cudf::jni::host_buffer_done_callback cb(this->jvm, this->ds, id); + return std::make_unique>(std::move(cb), ptr, + length); + } + + size_t host_read(size_t offset, size_t size, uint8_t *dst) override { + JNIEnv *env = nullptr; + if (jvm->GetEnv(reinterpret_cast(&env), cudf::jni::MINIMUM_JNI_VERSION) != JNI_OK) { + throw cudf::jni::jni_exception("Could not load JNIEnv"); + } + + jlong amount_read = + env->CallLongMethod(this->ds, hostRead_method, offset, size, reinterpret_cast(dst)); + if (env->ExceptionOccurred()) { + throw cudf::jni::jni_exception("Java exception in hostRead"); + } + return amount_read; + } + + size_t size() const override { return ds_size; } + + bool supports_device_read() const override { return device_read_supported; } + + bool is_device_read_preferred(size_t size) const override { + return device_read_supported && size >= device_read_cutoff; + } + + size_t device_read(size_t offset, size_t size, uint8_t *dst, + rmm::cuda_stream_view stream) override { + JNIEnv *env = nullptr; + if (jvm->GetEnv(reinterpret_cast(&env), cudf::jni::MINIMUM_JNI_VERSION) != JNI_OK) { + throw cudf::jni::jni_exception("Could not load JNIEnv"); + } + + jlong amount_read = + env->CallLongMethod(this->ds, deviceRead_method, offset, size, reinterpret_cast(dst), + reinterpret_cast(stream.value())); + if (env->ExceptionOccurred()) { + throw cudf::jni::jni_exception("Java exception in deviceRead"); + } + return amount_read; + } + + std::future device_read_async(size_t offset, size_t size, uint8_t *dst, + rmm::cuda_stream_view stream) override { + auto amount_read = device_read(offset, size, dst, stream); + // This is a bit ugly, but we don't have a good way or a need to return + // a future for the read + std::promise ret; + ret.set_value(amount_read); + return ret.get_future(); + } + +private: + size_t ds_size; + bool device_read_supported; + size_t device_read_cutoff; + JavaVM *jvm; + jobject ds; +}; +} // namespace jni +} // namespace cudf + +extern "C" { + +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_DataSourceHelper_createWrapperDataSource( + JNIEnv *env, jclass, jobject ds, jlong ds_size, jboolean device_read_supported, + jlong device_read_cutoff) { + JNI_NULL_CHECK(env, ds, "Null data source", 0); + try { + cudf::jni::auto_set_device(env); + auto source = + new cudf::jni::jni_datasource(env, ds, ds_size, device_read_supported, device_read_cutoff); + return reinterpret_cast(source); + } + CATCH_STD(env, 0); +} + +JNIEXPORT void JNICALL Java_ai_rapids_cudf_DataSourceHelper_destroyWrapperDataSource(JNIEnv *env, + jclass, + jlong handle) { + try { + cudf::jni::auto_set_device(env); + if (handle != 0) { + auto source = reinterpret_cast(handle); + delete (source); + } + } + CATCH_STD(env, ); +} + +} // extern "C" diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index b208ef8f381..fad19bdf895 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1135,6 +1135,67 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_merge(JNIEnv *env, jclass CATCH_STD(env, NULL); } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readCSVFromDataSource( + JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales, + jobjectArray filter_col_names, jint header_row, jbyte delim, jint j_quote_style, jbyte quote, + jbyte comment, jobjectArray null_values, jobjectArray true_values, jobjectArray false_values, + jlong ds_handle) { + JNI_NULL_CHECK(env, null_values, "null_values must be supplied, even if it is empty", NULL); + JNI_NULL_CHECK(env, ds_handle, "no data source handle given", NULL); + + try { + cudf::jni::auto_set_device(env); + cudf::jni::native_jstringArray n_col_names(env, col_names); + cudf::jni::native_jintArray n_types(env, j_types); + cudf::jni::native_jintArray n_scales(env, j_scales); + if (n_types.is_null() != n_scales.is_null()) { + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "types and scales must match null", + NULL); + } + std::vector data_types; + if (!n_types.is_null()) { + if (n_types.size() != n_scales.size()) { + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "types and scales must match size", + NULL); + } + data_types.reserve(n_types.size()); + std::transform(n_types.begin(), n_types.end(), n_scales.begin(), + std::back_inserter(data_types), [](auto type, auto scale) { + return cudf::data_type{static_cast(type), scale}; + }); + } + + cudf::jni::native_jstringArray n_null_values(env, null_values); + cudf::jni::native_jstringArray n_true_values(env, true_values); + cudf::jni::native_jstringArray n_false_values(env, false_values); + cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names); + + auto ds = reinterpret_cast(ds_handle); + cudf::io::source_info source{ds}; + + auto const quote_style = static_cast(j_quote_style); + + cudf::io::csv_reader_options opts = cudf::io::csv_reader_options::builder(source) + .delimiter(delim) + .header(header_row) + .names(n_col_names.as_cpp_vector()) + .dtypes(data_types) + .use_cols_names(n_filter_col_names.as_cpp_vector()) + .true_values(n_true_values.as_cpp_vector()) + .false_values(n_false_values.as_cpp_vector()) + .na_values(n_null_values.as_cpp_vector()) + .keep_default_na(false) + .na_filter(n_null_values.size() > 0) + .quoting(quote_style) + .quotechar(quote) + .comment(comment) + .build(); + + return convert_table_for_return(env, cudf::io::read_csv(opts).tbl); + } + CATCH_STD(env, NULL); +} + JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readCSV( JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales, jobjectArray filter_col_names, jstring inputfilepath, jlong buffer, jlong buffer_length, @@ -1407,6 +1468,72 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_TableWithMeta_releaseTable(JNIE CATCH_STD(env, nullptr); } +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource( + JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales, + jboolean day_first, jboolean lines, jboolean recover_with_null, jlong ds_handle) { + + JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0); + + try { + cudf::jni::auto_set_device(env); + cudf::jni::native_jstringArray n_col_names(env, col_names); + cudf::jni::native_jintArray n_types(env, j_types); + cudf::jni::native_jintArray n_scales(env, j_scales); + if (n_types.is_null() != n_scales.is_null()) { + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "types and scales must match null", + 0); + } + std::vector data_types; + if (!n_types.is_null()) { + if (n_types.size() != n_scales.size()) { + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "types and scales must match size", + 0); + } + data_types.reserve(n_types.size()); + std::transform(n_types.begin(), n_types.end(), n_scales.begin(), + std::back_inserter(data_types), [](auto const &type, auto const &scale) { + return cudf::data_type{static_cast(type), scale}; + }); + } + + auto ds = reinterpret_cast(ds_handle); + cudf::io::source_info source{ds}; + + cudf::io::json_recovery_mode_t recovery_mode = + recover_with_null ? cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL : + cudf::io::json_recovery_mode_t::FAIL; + cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source) + .dayfirst(static_cast(day_first)) + .lines(static_cast(lines)) + .recovery_mode(recovery_mode); + + if (!n_col_names.is_null() && data_types.size() > 0) { + if (n_col_names.size() != n_types.size()) { + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", + "types and column names must match size", 0); + } + + std::map map; + + auto col_names_vec = n_col_names.as_cpp_vector(); + std::transform(col_names_vec.begin(), col_names_vec.end(), data_types.begin(), + std::inserter(map, map.end()), + [](std::string a, cudf::data_type b) { return std::make_pair(a, b); }); + opts.dtypes(map); + } else if (data_types.size() > 0) { + opts.dtypes(data_types); + } else { + // should infer the types + } + + auto result = + std::make_unique(cudf::io::read_json(opts.build())); + + return reinterpret_cast(result.release()); + } + CATCH_STD(env, 0); +} + JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON( JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales, jstring inputfilepath, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines, @@ -1489,6 +1616,36 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON( CATCH_STD(env, 0); } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquetFromDataSource( + JNIEnv *env, jclass, jobjectArray filter_col_names, jbooleanArray j_col_binary_read, jint unit, + jlong ds_handle) { + + JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0); + JNI_NULL_CHECK(env, j_col_binary_read, "null col_binary_read", 0); + + try { + cudf::jni::auto_set_device(env); + + cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names); + cudf::jni::native_jbooleanArray n_col_binary_read(env, j_col_binary_read); + + auto ds = reinterpret_cast(ds_handle); + cudf::io::source_info source{ds}; + + auto builder = cudf::io::parquet_reader_options::builder(source); + if (n_filter_col_names.size() > 0) { + builder = builder.columns(n_filter_col_names.as_cpp_vector()); + } + + cudf::io::parquet_reader_options opts = + builder.convert_strings_to_categories(false) + .timestamp_type(cudf::data_type(static_cast(unit))) + .build(); + return convert_table_for_return(env, cudf::io::read_parquet(opts).tbl); + } + CATCH_STD(env, NULL); +} + JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet( JNIEnv *env, jclass, jobjectArray filter_col_names, jbooleanArray j_col_binary_read, jstring inputfilepath, jlong buffer, jlong buffer_length, jint unit) { @@ -1535,10 +1692,31 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet( CATCH_STD(env, NULL); } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readAvroFromDataSource( + JNIEnv *env, jclass, jobjectArray filter_col_names, jlong ds_handle) { + + JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0); + + try { + cudf::jni::auto_set_device(env); + + cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names); + + auto ds = reinterpret_cast(ds_handle); + cudf::io::source_info source{ds}; + + cudf::io::avro_reader_options opts = cudf::io::avro_reader_options::builder(source) + .columns(n_filter_col_names.as_cpp_vector()) + .build(); + return convert_table_for_return(env, cudf::io::read_avro(opts).tbl); + } + CATCH_STD(env, NULL); +} + JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readAvro(JNIEnv *env, jclass, jobjectArray filter_col_names, jstring inputfilepath, jlong buffer, - jlong buffer_length, jint unit) { + jlong buffer_length) { const bool read_buffer = (buffer != 0); if (!read_buffer) { @@ -1715,6 +1893,38 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeParquetEnd(JNIEnv *env, jc CATCH_STD(env, ) } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readORCFromDataSource( + JNIEnv *env, jclass, jobjectArray filter_col_names, jboolean usingNumPyTypes, jint unit, + jobjectArray dec128_col_names, jlong ds_handle) { + + JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0); + + try { + cudf::jni::auto_set_device(env); + + cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names); + + cudf::jni::native_jstringArray n_dec128_col_names(env, dec128_col_names); + + auto ds = reinterpret_cast(ds_handle); + cudf::io::source_info source{ds}; + + auto builder = cudf::io::orc_reader_options::builder(source); + if (n_filter_col_names.size() > 0) { + builder = builder.columns(n_filter_col_names.as_cpp_vector()); + } + + cudf::io::orc_reader_options opts = + builder.use_index(false) + .use_np_dtypes(static_cast(usingNumPyTypes)) + .timestamp_type(cudf::data_type(static_cast(unit))) + .decimal128_columns(n_dec128_col_names.as_cpp_vector()) + .build(); + return convert_table_for_return(env, cudf::io::read_orc(opts).tbl); + } + CATCH_STD(env, NULL); +} + JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readORC( JNIEnv *env, jclass, jobjectArray filter_col_names, jstring inputfilepath, jlong buffer, jlong buffer_length, jboolean usingNumPyTypes, jint unit, jobjectArray dec128_col_names) { diff --git a/java/src/main/native/src/cudf_jni_apis.hpp b/java/src/main/native/src/cudf_jni_apis.hpp index 867df80b722..bd82bbd2899 100644 --- a/java/src/main/native/src/cudf_jni_apis.hpp +++ b/java/src/main/native/src/cudf_jni_apis.hpp @@ -134,5 +134,13 @@ void auto_set_device(JNIEnv *env); */ void device_memset_async(JNIEnv *env, rmm::device_buffer &buf, char value); +// +// DataSource APIs +// + +bool cache_data_source_jni(JNIEnv *env); + +void release_data_source_jni(JNIEnv *env); + } // namespace jni } // namespace cudf diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index faa73ac4322..b0dd4122b0e 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -327,6 +327,25 @@ void testReadJSONFile() { } } + @Test + void testReadJSONFromDataSource() throws IOException { + Schema schema = Schema.builder() + .column(DType.STRING, "name") + .column(DType.INT32, "age") + .build(); + JSONOptions opts = JSONOptions.builder() + .withLines(true) + .build(); + try (Table expected = new Table.TestBuilder() + .column("Michael", "Andy", "Justin") + .column(null, 30, 19) + .build(); + MultiBufferDataSource source = sourceFrom(TEST_SIMPLE_JSON_FILE); + Table table = Table.readJSON(schema, opts, source)) { + assertTablesAreEqual(expected, table); + } + } + @Test void testReadJSONFileWithInvalidLines() { Schema schema = Schema.builder() @@ -560,6 +579,126 @@ void testReadCSVBuffer() { } } + byte[][] sliceBytes(byte[] data, int slices) { + slices = Math.min(data.length, slices); + // We are not going to worry about making it super even here. + // The last one gets the extras. + int bytesPerSlice = data.length / slices; + byte[][] ret = new byte[slices][]; + int startingAt = 0; + for (int i = 0; i < (slices - 1); i++) { + ret[i] = new byte[bytesPerSlice]; + System.arraycopy(data, startingAt, ret[i], 0, bytesPerSlice); + startingAt += bytesPerSlice; + } + // Now for the last one + ret[slices - 1] = new byte[data.length - startingAt]; + System.arraycopy(data, startingAt, ret[slices - 1], 0, data.length - startingAt); + return ret; + } + + @Test + void testReadCSVBufferMultiBuffer() { + CSVOptions opts = CSVOptions.builder() + .includeColumn("A") + .includeColumn("B") + .hasHeader() + .withDelim('|') + .withQuote('\'') + .withNullValue("NULL") + .build(); + byte[][] data = sliceBytes(CSV_DATA_BUFFER, 10); + try (Table expected = new Table.TestBuilder() + .column(0, 1, 2, 3, 4, 5, 6, 7, 8, 9) + .column(110.0, 111.0, 112.0, 113.0, 114.0, 115.0, 116.0, null, 118.2, 119.8) + .build(); + MultiBufferDataSource source = sourceFrom(data); + Table table = Table.readCSV(TableTest.CSV_DATA_BUFFER_SCHEMA, opts, source)) { + assertTablesAreEqual(expected, table); + } + } + + public static byte[] arrayFrom(File f) throws IOException { + long len = f.length(); + if (len > Integer.MAX_VALUE) { + throw new IllegalArgumentException("Sorry cannot read " + f + + " into an array it does not fit"); + } + int remaining = (int)len; + byte[] ret = new byte[remaining]; + try (java.io.FileInputStream fin = new java.io.FileInputStream(f)) { + int at = 0; + while (remaining > 0) { + int amount = fin.read(ret, at, remaining); + at += amount; + remaining -= amount; + } + } + return ret; + } + + public static MultiBufferDataSource sourceFrom(File f) throws IOException { + long len = f.length(); + byte[] tmp = new byte[(int)Math.min(32 * 1024, len)]; + try (HostMemoryBuffer buffer = HostMemoryBuffer.allocate(len)) { + try (java.io.FileInputStream fin = new java.io.FileInputStream(f)) { + long at = 0; + while (at < len) { + int amount = fin.read(tmp); + buffer.setBytes(at, tmp, 0, amount); + at += amount; + } + } + return new MultiBufferDataSource(buffer); + } + } + + public static MultiBufferDataSource sourceFrom(byte[] data) { + long len = data.length; + try (HostMemoryBuffer buffer = HostMemoryBuffer.allocate(len)) { + buffer.setBytes(0, data, 0, len); + return new MultiBufferDataSource(buffer); + } + } + + public static MultiBufferDataSource sourceFrom(byte[][] data) { + HostMemoryBuffer[] buffers = new HostMemoryBuffer[data.length]; + try { + for (int i = 0; i < data.length; i++) { + byte[] subData = data[i]; + buffers[i] = HostMemoryBuffer.allocate(subData.length); + buffers[i].setBytes(0, subData, 0, subData.length); + } + return new MultiBufferDataSource(buffers); + } finally { + for (HostMemoryBuffer buffer: buffers) { + if (buffer != null) { + buffer.close(); + } + } + } + } + + @Test + void testReadCSVDataSource() { + CSVOptions opts = CSVOptions.builder() + .includeColumn("A") + .includeColumn("B") + .hasHeader() + .withDelim('|') + .withQuote('\'') + .withNullValue("NULL") + .build(); + try (Table expected = new Table.TestBuilder() + .column(0, 1, 2, 3, 4, 5, 6, 7, 8, 9) + .column(110.0, 111.0, 112.0, 113.0, 114.0, 115.0, 116.0, null, 118.2, 119.8) + .build(); + MultiBufferDataSource source = sourceFrom(TableTest.CSV_DATA_BUFFER); + Table table = Table.readCSV(TableTest.CSV_DATA_BUFFER_SCHEMA, opts, source)) { + assertTablesAreEqual(expected, table); + } + } + @Test void testReadCSVWithOffset() { CSVOptions opts = CSVOptions.builder() @@ -864,6 +1003,37 @@ void testReadParquet() { } } + @Test + void testReadParquetFromDataSource() throws IOException { + ParquetOptions opts = ParquetOptions.builder() + .includeColumn("loan_id") + .includeColumn("zip") + .includeColumn("num_units") + .build(); + try (MultiBufferDataSource source = sourceFrom(TEST_PARQUET_FILE); + Table table = Table.readParquet(opts, source)) { + long rows = table.getRowCount(); + assertEquals(1000, rows); + assertTableTypes(new DType[]{DType.INT64, DType.INT32, DType.INT32}, table); + } + } + + @Test + void testReadParquetMultiBuffer() throws IOException { + ParquetOptions opts = ParquetOptions.builder() + .includeColumn("loan_id") + .includeColumn("zip") + .includeColumn("num_units") + .build(); + byte [][] data = sliceBytes(arrayFrom(TEST_PARQUET_FILE), 10); + try (MultiBufferDataSource source = sourceFrom(data); + Table table = Table.readParquet(opts, source)) { + long rows = table.getRowCount(); + assertEquals(1000, rows); + assertTableTypes(new DType[]{DType.INT64, DType.INT32, DType.INT32}, table); + } + } + @Test void testReadParquetBinary() { ParquetOptions opts = ParquetOptions.builder() @@ -1018,6 +1188,23 @@ void testChunkedReadParquet() { } } + @Test + void testChunkedReadParquetFromDataSource() throws IOException { + try (MultiBufferDataSource source = sourceFrom(TEST_PARQUET_FILE_CHUNKED_READ); + ParquetChunkedReader reader = new ParquetChunkedReader(240000, ParquetOptions.DEFAULT, source)) { + int numChunks = 0; + long totalRows = 0; + while(reader.hasNext()) { + ++numChunks; + try(Table chunk = reader.readChunk()) { + totalRows += chunk.getRowCount(); + } + } + assertEquals(2, numChunks); + assertEquals(40000, totalRows); + } + } + @Test void testReadAvro() { AvroOptions opts = AvroOptions.builder() @@ -1037,6 +1224,26 @@ void testReadAvro() { } } + @Test + void testReadAvroFromDataSource() throws IOException { + AvroOptions opts = AvroOptions.builder() + .includeColumn("bool_col") + .includeColumn("int_col") + .includeColumn("timestamp_col") + .build(); + + try (Table expected = new Table.TestBuilder() + .column(true, false, true, false, true, false, true, false) + .column(0, 1, 0, 1, 0, 1, 0, 1) + .column(1235865600000000L, 1235865660000000L, 1238544000000000L, 1238544060000000L, + 1233446400000000L, 1233446460000000L, 1230768000000000L, 1230768060000000L) + .build(); + MultiBufferDataSource source = sourceFrom(TEST_ALL_TYPES_PLAIN_AVRO_FILE); + Table table = Table.readAvro(opts, source)) { + assertTablesAreEqual(expected, table); + } + } + @Test void testReadAvroBuffer() throws IOException{ AvroOptions opts = AvroOptions.builder() @@ -1094,6 +1301,24 @@ void testReadORC() { } } + @Test + void testReadORCFromDataSource() throws IOException { + ORCOptions opts = ORCOptions.builder() + .includeColumn("string1") + .includeColumn("float1") + .includeColumn("int1") + .build(); + try (Table expected = new Table.TestBuilder() + .column("hi","bye") + .column(1.0f,2.0f) + .column(65536,65536) + .build(); + MultiBufferDataSource source = sourceFrom(TEST_ORC_FILE); + Table table = Table.readORC(opts, source)) { + assertTablesAreEqual(expected, table); + } + } + @Test void testReadORCBuffer() throws IOException { ORCOptions opts = ORCOptions.builder() From 15baa00693ab4aa59f99ccb417c613880789d047 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 11 Oct 2023 11:31:39 +0200 Subject: [PATCH 10/11] Fixes behaviour for incomplete lines when `recover_with_nulls` is enabled (#14252) Closes https://github.com/rapidsai/cudf/issues/14227. Adapts the behaviour of the JSON finite-state transducer (FST) when `recover_with_nulls` is `true` to be more strict and reject lines that contain incomplete JSON objects (aka records) or JSON arrays (aka lists). Authors: - Elias Stehle (https://github.com/elstehle) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/14252 --- cpp/src/io/json/nested_json_gpu.cu | 703 +++++++++++++++-------------- cpp/tests/io/json_test.cpp | 45 +- cpp/tests/io/nested_json_test.cpp | 23 +- 3 files changed, 401 insertions(+), 370 deletions(-) diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 06ac11485cb..c9107357239 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -660,13 +660,13 @@ auto get_transition_table(json_format_cfg_t format) PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_PVL, PD_BOV, PD_LON, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_ERR, PD_PVL, PD_BOV, PD_LON}; pda_tt[static_cast(pda_state_t::PD_STR)] = { - PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, - PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, - PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR}; + PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_BOV, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_BOV, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_BOV, PD_STR}; pda_tt[static_cast(pda_state_t::PD_SCE)] = { - PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, - PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, - PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR}; + PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_BOV, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_BOV, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_BOV, PD_STR}; pda_tt[static_cast(pda_state_t::PD_PVL)] = { PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_BOV, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_PVL, PD_BOV, PD_ERR, @@ -680,9 +680,9 @@ auto get_transition_table(json_format_cfg_t format) PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_PFN, PD_FNE, PD_FLN, PD_FLN, PD_FLN, PD_BOV, PD_FLN}; pda_tt[static_cast(pda_state_t::PD_FNE)] = { - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN}; + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_ERR, + PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_BOV, PD_FLN}; pda_tt[static_cast(pda_state_t::PD_PFN)] = { PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_ERR, @@ -697,8 +697,11 @@ auto get_transition_table(json_format_cfg_t format) /** * @brief Getting the translation table + * @param recover_from_error Whether or not the tokenizer should recover from invalid lines. If + * `recover_from_error` is true, invalid JSON lines end with the token sequence (`ErrorBegin`, + * `LineEn`) and incomplete JSON lines (e.g., `{"a":123\n`) are treated as invalid lines. */ -auto get_translation_table(bool include_line_delimiter) +auto get_translation_table(bool recover_from_error) { constexpr auto StructBegin = token_t::StructBegin; constexpr auto StructEnd = token_t::StructEnd; @@ -715,76 +718,83 @@ auto get_translation_table(bool include_line_delimiter) constexpr auto ErrorBegin = token_t::ErrorBegin; /** - * @brief Appends token_t::LineEnd token to the given token sequence, if and only if - * `include_line_delimiter` is true. + * @brief Instead of specifying the verbose translation tables twice (i.e., once when + * `recover_from_error` is true and once when it is false), we use `nl_tokens` to specialize the + * translation table where it differs depending on the `recover_from_error` option. If and only if + * `recover_from_error` is true, `recovering_tokens` are returned along with a token_t::LineEnd + * token, otherwise `regular_tokens` is returned. */ - auto nl_tokens = [include_line_delimiter](std::vector tokens) { - if (include_line_delimiter) { tokens.push_back(token_t::LineEnd); } - return tokens; + auto nl_tokens = [recover_from_error](std::vector regular_tokens, + std::vector recovering_tokens) { + if (recover_from_error) { + recovering_tokens.push_back(token_t::LineEnd); + return recovering_tokens; + } + return regular_tokens; }; std::array, NUM_PDA_SGIDS>, PD_NUM_STATES> pda_tlt; - pda_tlt[static_cast(pda_state_t::PD_BOV)] = {{ /*ROOT*/ + pda_tlt[static_cast(pda_state_t::PD_BOV)] = {{ /*ROOT*/ + {StructBegin}, // OPENING_BRACE + {ListBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {StringBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {}), // LINE_BREAK + {ValueBegin}, // OTHER + /*LIST*/ {StructBegin}, // OPENING_BRACE {ListBegin}, // OPENING_BRACKET {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET + {ListEnd}, // CLOSING_BRACKET {StringBegin}, // QUOTE {ErrorBegin}, // ESCAPE {ErrorBegin}, // COMMA {ErrorBegin}, // COLON {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {ValueBegin}, // OTHER - /*LIST*/ + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {ValueBegin}, // OTHER + /*STRUCT*/ {StructBegin}, // OPENING_BRACE {ListBegin}, // OPENING_BRACKET {ErrorBegin}, // CLOSING_BRACE - {ListEnd}, // CLOSING_BRACKET + {ErrorBegin}, // CLOSING_BRACKET {StringBegin}, // QUOTE {ErrorBegin}, // ESCAPE {ErrorBegin}, // COMMA {ErrorBegin}, // COLON {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {ValueBegin}, // OTHER - /*STRUCT*/ - {StructBegin}, // OPENING_BRACE - {ListBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {StringBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {ValueBegin}}}; // OTHER + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {ValueBegin}}}; // OTHER pda_tlt[static_cast(pda_state_t::PD_BOA)] = { - { /*ROOT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - nl_tokens({ErrorBegin}), // LINE_BREAK - {ErrorBegin}, // OTHER + { /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}, {ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER /*LIST*/ - {StructBegin}, // OPENING_BRACE - {ListBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ListEnd}, // CLOSING_BRACKET - {StringBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {ValueBegin}, // OTHER + {StructBegin}, // OPENING_BRACE + {ListBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ListEnd}, // CLOSING_BRACKET + {StringBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {ValueBegin}, // OTHER /*STRUCT*/ {ErrorBegin}, // OPENING_BRACE {ErrorBegin}, // OPENING_BRACKET @@ -795,33 +805,33 @@ auto get_translation_table(bool include_line_delimiter) {ErrorBegin}, // COMMA {ErrorBegin}, // COLON {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK + nl_tokens({}, {ErrorBegin}), // LINE_BREAK {ErrorBegin}}}; // OTHER pda_tlt[static_cast(pda_state_t::PD_LON)] = { - { /*ROOT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ValueEnd}, // WHITE_SPACE - nl_tokens({ValueEnd}), // LINE_BREAK - {}, // OTHER + { /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ValueEnd}, // WHITE_SPACE + nl_tokens({ValueEnd}, {ErrorBegin}), // LINE_BREAK + {}, // OTHER /*LIST*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ValueEnd, ListEnd}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ValueEnd}, // COMMA - {ErrorBegin}, // COLON - {ValueEnd}, // WHITE_SPACE - nl_tokens({ValueEnd}), // LINE_BREAK - {}, // OTHER + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ValueEnd, ListEnd}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ValueEnd}, // COMMA + {ErrorBegin}, // COLON + {ValueEnd}, // WHITE_SPACE + nl_tokens({ValueEnd}, {ErrorBegin}), // LINE_BREAK + {}, // OTHER /*STRUCT*/ {ErrorBegin}, // OPENING_BRACE {ErrorBegin}, // OPENING_BRACKET @@ -832,108 +842,108 @@ auto get_translation_table(bool include_line_delimiter) {ValueEnd, StructMemberEnd}, // COMMA {ErrorBegin}, // COLON {ValueEnd}, // WHITE_SPACE - nl_tokens({ValueEnd}), // LINE_BREAK + nl_tokens({ValueEnd}, {ErrorBegin}), // LINE_BREAK {}}}; // OTHER - pda_tlt[static_cast(pda_state_t::PD_STR)] = {{ /*ROOT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {StringEnd}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {}, // OTHER + pda_tlt[static_cast(pda_state_t::PD_STR)] = {{ /*ROOT*/ + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {StringEnd}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {}, // OTHER /*LIST*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {StringEnd}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {}, // OTHER + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {StringEnd}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {}, // OTHER /*STRUCT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {StringEnd}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {}}}; // OTHER - - pda_tlt[static_cast(pda_state_t::PD_SCE)] = {{ /*ROOT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {}, // OTHER + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {StringEnd}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {}}}; // OTHER + + pda_tlt[static_cast(pda_state_t::PD_SCE)] = {{ /*ROOT*/ + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {}, // OTHER /*LIST*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {}, // OTHER + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {}, // OTHER /*STRUCT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {}}}; // OTHER + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {}}}; // OTHER pda_tlt[static_cast(pda_state_t::PD_PVL)] = { - { /*ROOT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {ErrorBegin}, // OTHER + { /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {}), // LINE_BREAK + {ErrorBegin}, // OTHER /*LIST*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ListEnd}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {}, // COMMA - {ErrorBegin}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {ErrorBegin}, // OTHER + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ListEnd}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {}, // COMMA + {ErrorBegin}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER /*STRUCT*/ {ErrorBegin}, // OPENING_BRACE {ErrorBegin}, // OPENING_BRACKET @@ -944,34 +954,34 @@ auto get_translation_table(bool include_line_delimiter) {StructMemberEnd}, // COMMA {ErrorBegin}, // COLON {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK + nl_tokens({}, {ErrorBegin}), // LINE_BREAK {ErrorBegin}}}; // OTHER pda_tlt[static_cast(pda_state_t::PD_BFN)] = { - { /*ROOT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - nl_tokens({ErrorBegin}), // LINE_BREAK - {ErrorBegin}, // OTHER + { /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}, {ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER /*LIST*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - nl_tokens({ErrorBegin}), // LINE_BREAK - {ErrorBegin}, // OTHER + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}, {ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER /*STRUCT*/ {ErrorBegin}, // OPENING_BRACE {ErrorBegin}, // OPENING_BRACKET @@ -982,156 +992,159 @@ auto get_translation_table(bool include_line_delimiter) {ErrorBegin}, // COMMA {ErrorBegin}, // COLON {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK + nl_tokens({}, {ErrorBegin}), // LINE_BREAK {ErrorBegin}}}; // OTHER - pda_tlt[static_cast(pda_state_t::PD_FLN)] = {{ /*ROOT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - nl_tokens({ErrorBegin}), // LINE_BREAK - {ErrorBegin}, // OTHER - /*LIST*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - nl_tokens({ErrorBegin}), // LINE_BREAK - {ErrorBegin}, // OTHER - /*STRUCT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {FieldNameEnd}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {}}}; // OTHER - - pda_tlt[static_cast(pda_state_t::PD_FNE)] = {{ /*ROOT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - nl_tokens({ErrorBegin}), // LINE_BREAK - {ErrorBegin}, // OTHER - /*LIST*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - nl_tokens({ErrorBegin}), // LINE_BREAK - {ErrorBegin}, // OTHER - /*STRUCT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {}}}; // OTHER - - pda_tlt[static_cast(pda_state_t::PD_PFN)] = {{ /*ROOT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - nl_tokens({ErrorBegin}), // LINE_BREAK - {ErrorBegin}, // OTHER - /*LIST*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - nl_tokens({ErrorBegin}), // LINE_BREAK - {ErrorBegin}, // OTHER - /*STRUCT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {ErrorBegin}}}; // OTHER - - pda_tlt[static_cast(pda_state_t::PD_ERR)] = {{ /*ROOT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {}, // OTHER + pda_tlt[static_cast(pda_state_t::PD_FLN)] = { + { /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}, {ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER + /*LIST*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}, {ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER + /*STRUCT*/ + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {FieldNameEnd}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {}}}; // OTHER + + pda_tlt[static_cast(pda_state_t::PD_FNE)] = { + { /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}, {ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER + /*LIST*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}, {ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER + /*STRUCT*/ + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {}}}; // OTHER + + pda_tlt[static_cast(pda_state_t::PD_PFN)] = { + { /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}, {ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER + /*LIST*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}, {ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER + /*STRUCT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {ErrorBegin}}}; // OTHER + + pda_tlt[static_cast(pda_state_t::PD_ERR)] = {{ /*ROOT*/ + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {}), // LINE_BREAK + {}, // OTHER /*LIST*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {}, // OTHER + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {}), // LINE_BREAK + {}, // OTHER /*STRUCT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {}}}; // OTHER + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {}), // LINE_BREAK + {}}}; // OTHER return pda_tlt; } diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 7c911ac2e04..2ddb0b76544 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -1962,7 +1962,31 @@ TEST_F(JsonReaderTest, JSONLinesRecovering) "\n" "\n" // 4 -> a: 123 (valid) - R"({"a":123})"; + R"({"a":4})" + "\n" + // 5 -> (invalid) + R"({"a":5)" + "\n" + // 6 -> (invalid) + R"({"a":6 )" + "\n" + // 7 -> (invalid) + R"({"b":[7 )" + "\n" + // 8 -> a: 8 (valid) + R"({"a":8})" + "\n" + // 9 -> (invalid) + R"({"d":{"unterminated_field_name)" + "\n" + // 10 -> (invalid) + R"({"d":{)" + "\n" + // 11 -> (invalid) + R"({"d":{"123",)" + "\n" + // 12 -> a: 12 (valid) + R"({"a":12})"; auto filepath = temp_env->get_temp_dir() + "RecoveringLines.json"; { @@ -1978,17 +2002,22 @@ TEST_F(JsonReaderTest, JSONLinesRecovering) cudf::io::table_with_metadata result = cudf::io::read_json(in_options); EXPECT_EQ(result.tbl->num_columns(), 2); - EXPECT_EQ(result.tbl->num_rows(), 5); + EXPECT_EQ(result.tbl->num_rows(), 13); EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT64); EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::FLOAT64); - std::vector a_validity{true, false, false, false, true}; - std::vector c_validity{false, false, false, true, false}; + std::vector a_validity{ + true, false, false, false, true, false, false, false, true, false, false, false, true}; + std::vector c_validity{ + false, false, false, true, false, false, false, false, false, false, false, false, false}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), - int64_wrapper{{-2, 0, 0, 0, 123}, a_validity.cbegin()}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), - float64_wrapper{{0.0, 0.0, 0.0, 1.2, 0.0}, c_validity.cbegin()}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + result.tbl->get_column(0), + int64_wrapper{{-2, 0, 0, 0, 4, 0, 0, 0, 8, 0, 0, 0, 12}, a_validity.cbegin()}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + result.tbl->get_column(1), + float64_wrapper{{0.0, 0.0, 0.0, 1.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + c_validity.cbegin()}); } CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/nested_json_test.cpp index 00d657108b8..3cb7e1f287a 100644 --- a/cpp/tests/io/nested_json_test.cpp +++ b/cpp/tests/io/nested_json_test.cpp @@ -569,23 +569,12 @@ TEST_F(JsonTest, RecoveringTokenStream) // Line 0 (invalid) {0, token_t::StructBegin}, {0, token_t::StructEnd}, - // Line 1 (valid) - {10, token_t::StructBegin}, - {11, token_t::StructMemberBegin}, - {11, token_t::FieldNameBegin}, - {13, token_t::FieldNameEnd}, - // Line 2 (valid) - {16, token_t::StructBegin}, - {17, token_t::StructMemberBegin}, - {17, token_t::FieldNameBegin}, - {19, token_t::FieldNameEnd}, - {21, token_t::StructBegin}, - {22, token_t::StructMemberBegin}, - {22, token_t::FieldNameBegin}, - {24, token_t::FieldNameEnd}, - {26, token_t::ListBegin}, - {27, token_t::ValueBegin}, - {30, token_t::ValueEnd}, + // Line 1 (invalid) + {0, token_t::StructBegin}, + {0, token_t::StructEnd}, + // Line 2 (invalid) + {0, token_t::StructBegin}, + {0, token_t::StructEnd}, // Line 3 (valid) {31, token_t::StructBegin}, {32, token_t::StructMemberBegin}, From 135879368a8fcecda0a1d85bcf18b7e15cd0269d Mon Sep 17 00:00:00 2001 From: Ray Douglass Date: Wed, 11 Oct 2023 10:28:59 -0400 Subject: [PATCH 11/11] Update Changelog [skip ci] --- CHANGELOG.md | 262 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 262 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 76abf241d96..ecd547ab5b3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,265 @@ +# cuDF 23.10.00 (11 Oct 2023) + +## 🚨 Breaking Changes + +- Expose stream parameter in public nvtext ngram APIs ([#14061](https://github.com/rapidsai/cudf/pull/14061)) [@davidwendt](https://github.com/davidwendt) +- Raise `MixedTypeError` when a column of mixed-dtype is being constructed ([#14050](https://github.com/rapidsai/cudf/pull/14050)) [@galipremsagar](https://github.com/galipremsagar) +- Raise `NotImplementedError` for `MultiIndex.to_series` ([#14049](https://github.com/rapidsai/cudf/pull/14049)) [@galipremsagar](https://github.com/galipremsagar) +- Create table_input_metadata from a table_metadata ([#13920](https://github.com/rapidsai/cudf/pull/13920)) [@etseidl](https://github.com/etseidl) +- Enable RLE boolean encoding for v2 Parquet files ([#13886](https://github.com/rapidsai/cudf/pull/13886)) [@etseidl](https://github.com/etseidl) +- Change `NA` to `NaT` for `datetime` and `timedelta` types ([#13868](https://github.com/rapidsai/cudf/pull/13868)) [@galipremsagar](https://github.com/galipremsagar) +- Fix `any`, `all` reduction behavior for `axis=None` and warn for other reductions ([#13831](https://github.com/rapidsai/cudf/pull/13831)) [@galipremsagar](https://github.com/galipremsagar) +- Add minhash support for MurmurHash3_x64_128 ([#13796](https://github.com/rapidsai/cudf/pull/13796)) [@davidwendt](https://github.com/davidwendt) +- Remove the libcudf cudf::offset_type type ([#13788](https://github.com/rapidsai/cudf/pull/13788)) [@davidwendt](https://github.com/davidwendt) +- Raise error when trying to join `datetime` and `timedelta` types with other types ([#13786](https://github.com/rapidsai/cudf/pull/13786)) [@galipremsagar](https://github.com/galipremsagar) +- Update to Cython 3.0.0 ([#13777](https://github.com/rapidsai/cudf/pull/13777)) [@vyasr](https://github.com/vyasr) +- Raise error on constructing an array from mixed type inputs ([#13768](https://github.com/rapidsai/cudf/pull/13768)) [@galipremsagar](https://github.com/galipremsagar) +- Enforce deprecations in `23.10` ([#13732](https://github.com/rapidsai/cudf/pull/13732)) [@galipremsagar](https://github.com/galipremsagar) +- Upgrade to arrow 12 ([#13728](https://github.com/rapidsai/cudf/pull/13728)) [@galipremsagar](https://github.com/galipremsagar) +- Remove Arrow dependency from the `datasource.hpp` public header ([#13698](https://github.com/rapidsai/cudf/pull/13698)) [@vuule](https://github.com/vuule) + +## 🐛 Bug Fixes + +- Fix inaccurate ceil/floor and inaccurate rescaling casts of fixed-point values. ([#14242](https://github.com/rapidsai/cudf/pull/14242)) [@bdice](https://github.com/bdice) +- Fix inaccuracy in decimal128 rounding. ([#14233](https://github.com/rapidsai/cudf/pull/14233)) [@bdice](https://github.com/bdice) +- Workaround for illegal instruction error in sm90 for warp instrinsics with mask ([#14201](https://github.com/rapidsai/cudf/pull/14201)) [@karthikeyann](https://github.com/karthikeyann) +- Fix pytorch related pytest ([#14198](https://github.com/rapidsai/cudf/pull/14198)) [@galipremsagar](https://github.com/galipremsagar) +- Pin to `aws-sdk-cpp<1.11` ([#14173](https://github.com/rapidsai/cudf/pull/14173)) [@pentschev](https://github.com/pentschev) +- Fix assert failure for range window functions ([#14168](https://github.com/rapidsai/cudf/pull/14168)) [@mythrocks](https://github.com/mythrocks) +- Fix Memcheck error found in JSON_TEST JsonReaderTest.ErrorStrings ([#14164](https://github.com/rapidsai/cudf/pull/14164)) [@karthikeyann](https://github.com/karthikeyann) +- Fix calls to copy_bitmask to pass stream parameter ([#14158](https://github.com/rapidsai/cudf/pull/14158)) [@davidwendt](https://github.com/davidwendt) +- Fix DataFrame from Series with different CategoricalIndexes ([#14157](https://github.com/rapidsai/cudf/pull/14157)) [@mroeschke](https://github.com/mroeschke) +- Pin to numpy<1.25 and numba<0.58 to avoid errors and deprecation warnings-as-errors. ([#14156](https://github.com/rapidsai/cudf/pull/14156)) [@bdice](https://github.com/bdice) +- Fix kernel launch error for cudf::io::orc::gpu::rowgroup_char_counts_kernel ([#14139](https://github.com/rapidsai/cudf/pull/14139)) [@davidwendt](https://github.com/davidwendt) +- Don't sort columns for DataFrame init from list of Series ([#14136](https://github.com/rapidsai/cudf/pull/14136)) [@mroeschke](https://github.com/mroeschke) +- Fix DataFrame.values with no columns but index ([#14134](https://github.com/rapidsai/cudf/pull/14134)) [@mroeschke](https://github.com/mroeschke) +- Avoid circular cimports in _lib/cpp/reduce.pxd ([#14125](https://github.com/rapidsai/cudf/pull/14125)) [@vyasr](https://github.com/vyasr) +- Add support for nested dict in `DataFrame` constructor ([#14119](https://github.com/rapidsai/cudf/pull/14119)) [@galipremsagar](https://github.com/galipremsagar) +- Restrict iterables of `DataFrame`'s as input to `DataFrame` constructor ([#14118](https://github.com/rapidsai/cudf/pull/14118)) [@galipremsagar](https://github.com/galipremsagar) +- Allow `numeric_only=True` for reduction operations on numeric types ([#14111](https://github.com/rapidsai/cudf/pull/14111)) [@galipremsagar](https://github.com/galipremsagar) +- Preserve name of the column while initializing a `DataFrame` ([#14110](https://github.com/rapidsai/cudf/pull/14110)) [@galipremsagar](https://github.com/galipremsagar) +- Correct numerous 20054-D: dynamic initialization errors found on arm+12.2 ([#14108](https://github.com/rapidsai/cudf/pull/14108)) [@robertmaynard](https://github.com/robertmaynard) +- Drop `kwargs` from `Series.count` ([#14106](https://github.com/rapidsai/cudf/pull/14106)) [@galipremsagar](https://github.com/galipremsagar) +- Fix naming issues with `Index.to_frame` and `MultiIndex.to_frame` APIs ([#14105](https://github.com/rapidsai/cudf/pull/14105)) [@galipremsagar](https://github.com/galipremsagar) +- Only use memory resources that haven't been freed ([#14103](https://github.com/rapidsai/cudf/pull/14103)) [@robertmaynard](https://github.com/robertmaynard) +- Add support for `__round__` in `Series` and `DataFrame` ([#14099](https://github.com/rapidsai/cudf/pull/14099)) [@galipremsagar](https://github.com/galipremsagar) +- Validate ignore_index type in drop_duplicates ([#14098](https://github.com/rapidsai/cudf/pull/14098)) [@mroeschke](https://github.com/mroeschke) +- Fix renaming `Series` and `Index` ([#14080](https://github.com/rapidsai/cudf/pull/14080)) [@galipremsagar](https://github.com/galipremsagar) +- Raise NotImplementedError in to_datetime if Z (or tz component) in string ([#14074](https://github.com/rapidsai/cudf/pull/14074)) [@mroeschke](https://github.com/mroeschke) +- Raise NotImplementedError for datetime strings with UTC offset ([#14070](https://github.com/rapidsai/cudf/pull/14070)) [@mroeschke](https://github.com/mroeschke) +- Update pyarrow-related dispatch logic in dask_cudf ([#14069](https://github.com/rapidsai/cudf/pull/14069)) [@rjzamora](https://github.com/rjzamora) +- Use `conda mambabuild` rather than `mamba mambabuild` ([#14067](https://github.com/rapidsai/cudf/pull/14067)) [@wence-](https://github.com/wence-) +- Raise NotImplementedError in to_datetime with dayfirst without infer_format ([#14058](https://github.com/rapidsai/cudf/pull/14058)) [@mroeschke](https://github.com/mroeschke) +- Fix various issues in `Index.intersection` ([#14054](https://github.com/rapidsai/cudf/pull/14054)) [@galipremsagar](https://github.com/galipremsagar) +- Fix `Index.difference` to match with pandas ([#14053](https://github.com/rapidsai/cudf/pull/14053)) [@galipremsagar](https://github.com/galipremsagar) +- Fix empty string column construction ([#14052](https://github.com/rapidsai/cudf/pull/14052)) [@galipremsagar](https://github.com/galipremsagar) +- Fix `IntervalIndex.union` to preserve type-metadata ([#14051](https://github.com/rapidsai/cudf/pull/14051)) [@galipremsagar](https://github.com/galipremsagar) +- Raise `MixedTypeError` when a column of mixed-dtype is being constructed ([#14050](https://github.com/rapidsai/cudf/pull/14050)) [@galipremsagar](https://github.com/galipremsagar) +- Raise `NotImplementedError` for `MultiIndex.to_series` ([#14049](https://github.com/rapidsai/cudf/pull/14049)) [@galipremsagar](https://github.com/galipremsagar) +- Ignore compile_commands.json ([#14048](https://github.com/rapidsai/cudf/pull/14048)) [@harrism](https://github.com/harrism) +- Raise TypeError for any non-parseable argument in to_datetime ([#14044](https://github.com/rapidsai/cudf/pull/14044)) [@mroeschke](https://github.com/mroeschke) +- Raise NotImplementedError for to_datetime with z format ([#14037](https://github.com/rapidsai/cudf/pull/14037)) [@mroeschke](https://github.com/mroeschke) +- Implement `sort_remaining` for `sort_index` ([#14033](https://github.com/rapidsai/cudf/pull/14033)) [@wence-](https://github.com/wence-) +- Raise NotImplementedError for Categoricals with timezones ([#14032](https://github.com/rapidsai/cudf/pull/14032)) [@mroeschke](https://github.com/mroeschke) +- Temporary fix Parquet metadata with empty value string being ignored from writing ([#14026](https://github.com/rapidsai/cudf/pull/14026)) [@ttnghia](https://github.com/ttnghia) +- Preserve types of scalar being returned when possible in `quantile` ([#14014](https://github.com/rapidsai/cudf/pull/14014)) [@galipremsagar](https://github.com/galipremsagar) +- Fix return type of `MultiIndex.difference` ([#14009](https://github.com/rapidsai/cudf/pull/14009)) [@galipremsagar](https://github.com/galipremsagar) +- Raise an error when timezone subtypes are encountered in `pd.IntervalDtype` ([#14006](https://github.com/rapidsai/cudf/pull/14006)) [@galipremsagar](https://github.com/galipremsagar) +- Fix map column can not be non-nullable for java ([#14003](https://github.com/rapidsai/cudf/pull/14003)) [@res-life](https://github.com/res-life) +- Fix `name` selection in `Index.difference` and `Index.intersection` ([#13986](https://github.com/rapidsai/cudf/pull/13986)) [@galipremsagar](https://github.com/galipremsagar) +- Restore column type metadata with `dropna` to fix `factorize` API ([#13980](https://github.com/rapidsai/cudf/pull/13980)) [@galipremsagar](https://github.com/galipremsagar) +- Use thread_index_type to avoid out of bounds accesses in conditional joins ([#13971](https://github.com/rapidsai/cudf/pull/13971)) [@vyasr](https://github.com/vyasr) +- Fix `MultiIndex.to_numpy` to return numpy array with tuples ([#13966](https://github.com/rapidsai/cudf/pull/13966)) [@galipremsagar](https://github.com/galipremsagar) +- Use cudf::thread_index_type in get_json_object and tdigest kernels ([#13962](https://github.com/rapidsai/cudf/pull/13962)) [@nvdbaranec](https://github.com/nvdbaranec) +- Fix an issue with `IntervalIndex.repr` when null values are present ([#13958](https://github.com/rapidsai/cudf/pull/13958)) [@galipremsagar](https://github.com/galipremsagar) +- Fix type metadata issue preservation with `Column.unique` ([#13957](https://github.com/rapidsai/cudf/pull/13957)) [@galipremsagar](https://github.com/galipremsagar) +- Handle `Interval` scalars when passed in list-like inputs to `cudf.Index` ([#13956](https://github.com/rapidsai/cudf/pull/13956)) [@galipremsagar](https://github.com/galipremsagar) +- Fix setting of categories order when `dtype` is passed to a `CategoricalColumn` ([#13955](https://github.com/rapidsai/cudf/pull/13955)) [@galipremsagar](https://github.com/galipremsagar) +- Handle `as_index` in `GroupBy.apply` ([#13951](https://github.com/rapidsai/cudf/pull/13951)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Raise error for string types in `nsmallest` and `nlargest` ([#13946](https://github.com/rapidsai/cudf/pull/13946)) [@galipremsagar](https://github.com/galipremsagar) +- Fix `index` of `Groupby.apply` results when it is performed on empty objects ([#13944](https://github.com/rapidsai/cudf/pull/13944)) [@galipremsagar](https://github.com/galipremsagar) +- Fix integer overflow in shim `device_sum` functions ([#13943](https://github.com/rapidsai/cudf/pull/13943)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Fix type mismatch in groupby reduction for empty objects ([#13942](https://github.com/rapidsai/cudf/pull/13942)) [@galipremsagar](https://github.com/galipremsagar) +- Fixed processed bytes calculation in APPLY_BOOLEAN_MASK benchmark. ([#13937](https://github.com/rapidsai/cudf/pull/13937)) [@Blonck](https://github.com/Blonck) +- Fix construction of `Grouping` objects ([#13932](https://github.com/rapidsai/cudf/pull/13932)) [@galipremsagar](https://github.com/galipremsagar) +- Fix an issue with `loc` when column names is `MultiIndex` ([#13929](https://github.com/rapidsai/cudf/pull/13929)) [@galipremsagar](https://github.com/galipremsagar) +- Fix handling of typecasting in `searchsorted` ([#13925](https://github.com/rapidsai/cudf/pull/13925)) [@galipremsagar](https://github.com/galipremsagar) +- Preserve index `name` in `reindex` ([#13917](https://github.com/rapidsai/cudf/pull/13917)) [@galipremsagar](https://github.com/galipremsagar) +- Use `cudf::thread_index_type` in cuIO to prevent overflow in row indexing ([#13910](https://github.com/rapidsai/cudf/pull/13910)) [@vuule](https://github.com/vuule) +- Fix for encodings listed in the Parquet column chunk metadata ([#13907](https://github.com/rapidsai/cudf/pull/13907)) [@etseidl](https://github.com/etseidl) +- Use cudf::thread_index_type in concatenate.cu. ([#13906](https://github.com/rapidsai/cudf/pull/13906)) [@bdice](https://github.com/bdice) +- Use cudf::thread_index_type in replace.cu. ([#13905](https://github.com/rapidsai/cudf/pull/13905)) [@bdice](https://github.com/bdice) +- Add noSanitizer tag to Java reduction tests failing with sanitizer in CUDA 12 ([#13904](https://github.com/rapidsai/cudf/pull/13904)) [@jlowe](https://github.com/jlowe) +- Remove the internal use of the cudf's default stream in cuIO ([#13903](https://github.com/rapidsai/cudf/pull/13903)) [@vuule](https://github.com/vuule) +- Use cuda-nvtx-dev CUDA 12 package. ([#13901](https://github.com/rapidsai/cudf/pull/13901)) [@bdice](https://github.com/bdice) +- Use `thread_index_type` to avoid index overflow in grid-stride loops ([#13895](https://github.com/rapidsai/cudf/pull/13895)) [@PointKernel](https://github.com/PointKernel) +- Fix memory access error in cudf::shift for sliced strings ([#13894](https://github.com/rapidsai/cudf/pull/13894)) [@davidwendt](https://github.com/davidwendt) +- Raise error when trying to construct a `DataFrame` with mixed types ([#13889](https://github.com/rapidsai/cudf/pull/13889)) [@galipremsagar](https://github.com/galipremsagar) +- Return `nan` when one variable to be correlated has zero variance in JIT GroupBy Apply ([#13884](https://github.com/rapidsai/cudf/pull/13884)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Correctly detect the BOM mark in `read_csv` with compressed input ([#13881](https://github.com/rapidsai/cudf/pull/13881)) [@vuule](https://github.com/vuule) +- Check for the presence of all values in `MultiIndex.isin` ([#13879](https://github.com/rapidsai/cudf/pull/13879)) [@galipremsagar](https://github.com/galipremsagar) +- Fix nvtext::generate_character_ngrams performance regression for longer strings ([#13874](https://github.com/rapidsai/cudf/pull/13874)) [@davidwendt](https://github.com/davidwendt) +- Fix return type of `MultiIndex.levels` ([#13870](https://github.com/rapidsai/cudf/pull/13870)) [@galipremsagar](https://github.com/galipremsagar) +- Fix List's missing children metadata in JSON writer ([#13869](https://github.com/rapidsai/cudf/pull/13869)) [@karthikeyann](https://github.com/karthikeyann) +- Disable construction of Index when `freq` is set in pandas-compatibility mode ([#13857](https://github.com/rapidsai/cudf/pull/13857)) [@galipremsagar](https://github.com/galipremsagar) +- Fix an issue with fetching `NA` from a `TimedeltaColumn` ([#13853](https://github.com/rapidsai/cudf/pull/13853)) [@galipremsagar](https://github.com/galipremsagar) +- Simplify implementation of interval_range() and fix behaviour for floating `freq` ([#13844](https://github.com/rapidsai/cudf/pull/13844)) [@shwina](https://github.com/shwina) +- Fix binary operations between `Series` and `Index` ([#13842](https://github.com/rapidsai/cudf/pull/13842)) [@galipremsagar](https://github.com/galipremsagar) +- Update make_lists_column_from_scalar to use make_offsets_child_column utility ([#13841](https://github.com/rapidsai/cudf/pull/13841)) [@davidwendt](https://github.com/davidwendt) +- Fix read out of bounds in string concatenate ([#13838](https://github.com/rapidsai/cudf/pull/13838)) [@pentschev](https://github.com/pentschev) +- Raise error for more cases when `timezone-aware` data is passed to `as_column` ([#13835](https://github.com/rapidsai/cudf/pull/13835)) [@galipremsagar](https://github.com/galipremsagar) +- Fix `any`, `all` reduction behavior for `axis=None` and warn for other reductions ([#13831](https://github.com/rapidsai/cudf/pull/13831)) [@galipremsagar](https://github.com/galipremsagar) +- Raise error when trying to construct time-zone aware timestamps ([#13830](https://github.com/rapidsai/cudf/pull/13830)) [@galipremsagar](https://github.com/galipremsagar) +- Fix cuFile I/O factories ([#13829](https://github.com/rapidsai/cudf/pull/13829)) [@vuule](https://github.com/vuule) +- DataFrame with namedtuples uses ._field as column names ([#13824](https://github.com/rapidsai/cudf/pull/13824)) [@mroeschke](https://github.com/mroeschke) +- Branch 23.10 merge 23.08 ([#13822](https://github.com/rapidsai/cudf/pull/13822)) [@vyasr](https://github.com/vyasr) +- Return a Series from JIT GroupBy apply, rather than a DataFrame ([#13820](https://github.com/rapidsai/cudf/pull/13820)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- No need to dlsym EnsureS3Finalized we can call it directly ([#13819](https://github.com/rapidsai/cudf/pull/13819)) [@robertmaynard](https://github.com/robertmaynard) +- Raise error when mixed types are being constructed ([#13816](https://github.com/rapidsai/cudf/pull/13816)) [@galipremsagar](https://github.com/galipremsagar) +- Fix unbounded sequence issue in `DataFrame` constructor ([#13811](https://github.com/rapidsai/cudf/pull/13811)) [@galipremsagar](https://github.com/galipremsagar) +- Fix Byte-Pair-Encoding usage of cuco static-map for storing merge-pairs ([#13807](https://github.com/rapidsai/cudf/pull/13807)) [@davidwendt](https://github.com/davidwendt) +- Fix for Parquet writer when requested pages per row is smaller than fragment size ([#13806](https://github.com/rapidsai/cudf/pull/13806)) [@etseidl](https://github.com/etseidl) +- Remove hangs from trying to construct un-bounded sequences ([#13799](https://github.com/rapidsai/cudf/pull/13799)) [@galipremsagar](https://github.com/galipremsagar) +- Bug/update libcudf to handle arrow12 changes ([#13794](https://github.com/rapidsai/cudf/pull/13794)) [@robertmaynard](https://github.com/robertmaynard) +- Update get_arrow to arrows 12 CMake target name of arrow::xsimd ([#13790](https://github.com/rapidsai/cudf/pull/13790)) [@robertmaynard](https://github.com/robertmaynard) +- Raise error when trying to join `datetime` and `timedelta` types with other types ([#13786](https://github.com/rapidsai/cudf/pull/13786)) [@galipremsagar](https://github.com/galipremsagar) +- Fix negative unary operation for boolean type ([#13780](https://github.com/rapidsai/cudf/pull/13780)) [@galipremsagar](https://github.com/galipremsagar) +- Fix contains(`in`) method for `Series` ([#13779](https://github.com/rapidsai/cudf/pull/13779)) [@galipremsagar](https://github.com/galipremsagar) +- Fix binary operation column ordering and missing column issues ([#13778](https://github.com/rapidsai/cudf/pull/13778)) [@galipremsagar](https://github.com/galipremsagar) +- Cast only time of day to nanos to avoid an overflow in Parquet INT96 write ([#13776](https://github.com/rapidsai/cudf/pull/13776)) [@gerashegalov](https://github.com/gerashegalov) +- Preserve names of column object in various APIs ([#13772](https://github.com/rapidsai/cudf/pull/13772)) [@galipremsagar](https://github.com/galipremsagar) +- Raise error on constructing an array from mixed type inputs ([#13768](https://github.com/rapidsai/cudf/pull/13768)) [@galipremsagar](https://github.com/galipremsagar) +- Fix construction of DataFrames from dict when columns are provided ([#13766](https://github.com/rapidsai/cudf/pull/13766)) [@wence-](https://github.com/wence-) +- Provide our own Cython declaration for make_unique ([#13746](https://github.com/rapidsai/cudf/pull/13746)) [@wence-](https://github.com/wence-) + +## 📖 Documentation + +- Fix typo in docstring: metadata. ([#14025](https://github.com/rapidsai/cudf/pull/14025)) [@bdice](https://github.com/bdice) +- Fix typo in parquet/page_decode.cuh ([#13849](https://github.com/rapidsai/cudf/pull/13849)) [@XinyuZeng](https://github.com/XinyuZeng) +- Simplify Python doc configuration ([#13826](https://github.com/rapidsai/cudf/pull/13826)) [@vyasr](https://github.com/vyasr) +- Update documentation to reflect recent changes in JSON reader and writer ([#13791](https://github.com/rapidsai/cudf/pull/13791)) [@vuule](https://github.com/vuule) +- Fix all warnings in Python docs ([#13789](https://github.com/rapidsai/cudf/pull/13789)) [@vyasr](https://github.com/vyasr) + +## 🚀 New Features + +- [Java] Add JNI bindings for `integers_to_hex` ([#14205](https://github.com/rapidsai/cudf/pull/14205)) [@razajafri](https://github.com/razajafri) +- Propagate errors from Parquet reader kernels back to host ([#14167](https://github.com/rapidsai/cudf/pull/14167)) [@vuule](https://github.com/vuule) +- JNI for `HISTOGRAM` and `MERGE_HISTOGRAM` aggregations ([#14154](https://github.com/rapidsai/cudf/pull/14154)) [@ttnghia](https://github.com/ttnghia) +- Expose streams in all public sorting APIs ([#14146](https://github.com/rapidsai/cudf/pull/14146)) [@vyasr](https://github.com/vyasr) +- Enable direct ingestion and production of Arrow scalars ([#14121](https://github.com/rapidsai/cudf/pull/14121)) [@vyasr](https://github.com/vyasr) +- Implement `GroupBy.value_counts` to match pandas API ([#14114](https://github.com/rapidsai/cudf/pull/14114)) [@stmio](https://github.com/stmio) +- Refactor parquet thrift reader ([#14097](https://github.com/rapidsai/cudf/pull/14097)) [@etseidl](https://github.com/etseidl) +- Refactor `hash_reduce_by_row` ([#14095](https://github.com/rapidsai/cudf/pull/14095)) [@ttnghia](https://github.com/ttnghia) +- Support negative preceding/following for ROW window functions ([#14093](https://github.com/rapidsai/cudf/pull/14093)) [@mythrocks](https://github.com/mythrocks) +- Support for progressive parquet chunked reading. ([#14079](https://github.com/rapidsai/cudf/pull/14079)) [@nvdbaranec](https://github.com/nvdbaranec) +- Implement `HISTOGRAM` and `MERGE_HISTOGRAM` aggregations ([#14045](https://github.com/rapidsai/cudf/pull/14045)) [@ttnghia](https://github.com/ttnghia) +- Expose streams in public search APIs ([#14034](https://github.com/rapidsai/cudf/pull/14034)) [@vyasr](https://github.com/vyasr) +- Expose streams in public replace APIs ([#14010](https://github.com/rapidsai/cudf/pull/14010)) [@vyasr](https://github.com/vyasr) +- Add stream parameter to public cudf::strings::split APIs ([#13997](https://github.com/rapidsai/cudf/pull/13997)) [@davidwendt](https://github.com/davidwendt) +- Expose streams in public filling APIs ([#13990](https://github.com/rapidsai/cudf/pull/13990)) [@vyasr](https://github.com/vyasr) +- Expose streams in public concatenate APIs ([#13987](https://github.com/rapidsai/cudf/pull/13987)) [@vyasr](https://github.com/vyasr) +- Use HostMemoryAllocator in jni::allocate_host_buffer ([#13975](https://github.com/rapidsai/cudf/pull/13975)) [@gerashegalov](https://github.com/gerashegalov) +- Enable fractional null probability for hashing benchmark ([#13967](https://github.com/rapidsai/cudf/pull/13967)) [@Blonck](https://github.com/Blonck) +- Switch pylibcudf-enabled types to use enum class in Cython ([#13931](https://github.com/rapidsai/cudf/pull/13931)) [@vyasr](https://github.com/vyasr) +- Add nvtext::tokenize_with_vocabulary API ([#13930](https://github.com/rapidsai/cudf/pull/13930)) [@davidwendt](https://github.com/davidwendt) +- Rewrite `DataFrame.stack` to support multi level column names ([#13927](https://github.com/rapidsai/cudf/pull/13927)) [@isVoid](https://github.com/isVoid) +- Add HostMemoryAllocator interface ([#13924](https://github.com/rapidsai/cudf/pull/13924)) [@gerashegalov](https://github.com/gerashegalov) +- Global stream pool ([#13922](https://github.com/rapidsai/cudf/pull/13922)) [@etseidl](https://github.com/etseidl) +- Create table_input_metadata from a table_metadata ([#13920](https://github.com/rapidsai/cudf/pull/13920)) [@etseidl](https://github.com/etseidl) +- Translate column size overflow exception to JNI ([#13911](https://github.com/rapidsai/cudf/pull/13911)) [@mythrocks](https://github.com/mythrocks) +- Enable RLE boolean encoding for v2 Parquet files ([#13886](https://github.com/rapidsai/cudf/pull/13886)) [@etseidl](https://github.com/etseidl) +- Exclude some tests from running with the compute sanitizer ([#13872](https://github.com/rapidsai/cudf/pull/13872)) [@firestarman](https://github.com/firestarman) +- Expand statistics support in ORC writer ([#13848](https://github.com/rapidsai/cudf/pull/13848)) [@vuule](https://github.com/vuule) +- Register the memory mapped buffer in `datasource` to improve H2D throughput ([#13814](https://github.com/rapidsai/cudf/pull/13814)) [@vuule](https://github.com/vuule) +- Add cudf::strings::find function with target per row ([#13808](https://github.com/rapidsai/cudf/pull/13808)) [@davidwendt](https://github.com/davidwendt) +- Add minhash support for MurmurHash3_x64_128 ([#13796](https://github.com/rapidsai/cudf/pull/13796)) [@davidwendt](https://github.com/davidwendt) +- Remove unnecessary pointer copying in JIT GroupBy Apply ([#13792](https://github.com/rapidsai/cudf/pull/13792)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Add 'poll' function to custreamz kafka consumer ([#13782](https://github.com/rapidsai/cudf/pull/13782)) [@jdye64](https://github.com/jdye64) +- Support `corr` in `GroupBy.apply` through the jit engine ([#13767](https://github.com/rapidsai/cudf/pull/13767)) [@shwina](https://github.com/shwina) +- Optionally write version 2 page headers in Parquet writer ([#13751](https://github.com/rapidsai/cudf/pull/13751)) [@etseidl](https://github.com/etseidl) +- Support more numeric types in `Groupby.apply` with `engine='jit'` ([#13729](https://github.com/rapidsai/cudf/pull/13729)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- [FEA] Add DELTA_BINARY_PACKED decoding support to Parquet reader ([#13637](https://github.com/rapidsai/cudf/pull/13637)) [@etseidl](https://github.com/etseidl) +- Read FIXED_LEN_BYTE_ARRAY as binary in parquet reader ([#13437](https://github.com/rapidsai/cudf/pull/13437)) [@PointKernel](https://github.com/PointKernel) + +## 🛠️ Improvements + +- Pin `dask` and `distributed` for `23.10` release ([#14225](https://github.com/rapidsai/cudf/pull/14225)) [@galipremsagar](https://github.com/galipremsagar) +- update rmm tag path ([#14195](https://github.com/rapidsai/cudf/pull/14195)) [@AyodeAwe](https://github.com/AyodeAwe) +- Disable `Recently Updated` Check ([#14193](https://github.com/rapidsai/cudf/pull/14193)) [@ajschmidt8](https://github.com/ajschmidt8) +- Move cpp/src/hash/hash_allocator.cuh to include/cudf/hashing/detail ([#14163](https://github.com/rapidsai/cudf/pull/14163)) [@davidwendt](https://github.com/davidwendt) +- Add Parquet reader benchmarks for row selection ([#14147](https://github.com/rapidsai/cudf/pull/14147)) [@vuule](https://github.com/vuule) +- Update image names ([#14145](https://github.com/rapidsai/cudf/pull/14145)) [@AyodeAwe](https://github.com/AyodeAwe) +- Support callables in DataFrame.assign ([#14142](https://github.com/rapidsai/cudf/pull/14142)) [@wence-](https://github.com/wence-) +- Reduce memory usage of as_categorical_column ([#14138](https://github.com/rapidsai/cudf/pull/14138)) [@wence-](https://github.com/wence-) +- Replace Python scalar conversions with libcudf ([#14124](https://github.com/rapidsai/cudf/pull/14124)) [@vyasr](https://github.com/vyasr) +- Update to clang 16.0.6. ([#14120](https://github.com/rapidsai/cudf/pull/14120)) [@bdice](https://github.com/bdice) +- Fix type of empty `Index` and raise warning in `Series` constructor ([#14116](https://github.com/rapidsai/cudf/pull/14116)) [@galipremsagar](https://github.com/galipremsagar) +- Add stream parameter to external dict APIs ([#14115](https://github.com/rapidsai/cudf/pull/14115)) [@SurajAralihalli](https://github.com/SurajAralihalli) +- Add fallback matrix for nvcomp. ([#14082](https://github.com/rapidsai/cudf/pull/14082)) [@bdice](https://github.com/bdice) +- [Java] Add recoverWithNull to JSONOptions and pass to Table.readJSON ([#14078](https://github.com/rapidsai/cudf/pull/14078)) [@andygrove](https://github.com/andygrove) +- Remove header tests ([#14072](https://github.com/rapidsai/cudf/pull/14072)) [@ajschmidt8](https://github.com/ajschmidt8) +- Refactor `contains_table` with cuco::static_set ([#14064](https://github.com/rapidsai/cudf/pull/14064)) [@PointKernel](https://github.com/PointKernel) +- Remove debug print in a Parquet test ([#14063](https://github.com/rapidsai/cudf/pull/14063)) [@vuule](https://github.com/vuule) +- Expose stream parameter in public nvtext ngram APIs ([#14061](https://github.com/rapidsai/cudf/pull/14061)) [@davidwendt](https://github.com/davidwendt) +- Expose stream parameter in public strings find APIs ([#14060](https://github.com/rapidsai/cudf/pull/14060)) [@davidwendt](https://github.com/davidwendt) +- Update doxygen to 1.9.1 ([#14059](https://github.com/rapidsai/cudf/pull/14059)) [@vyasr](https://github.com/vyasr) +- Remove the mr from the base fixture ([#14057](https://github.com/rapidsai/cudf/pull/14057)) [@vyasr](https://github.com/vyasr) +- Expose streams in public strings case APIs ([#14056](https://github.com/rapidsai/cudf/pull/14056)) [@davidwendt](https://github.com/davidwendt) +- Refactor libcudf indexalator to typed normalator ([#14043](https://github.com/rapidsai/cudf/pull/14043)) [@davidwendt](https://github.com/davidwendt) +- Use cudf::make_empty_column instead of column_view constructor ([#14030](https://github.com/rapidsai/cudf/pull/14030)) [@davidwendt](https://github.com/davidwendt) +- Remove quadratic runtime due to accessing Frame._dtypes in loop ([#14028](https://github.com/rapidsai/cudf/pull/14028)) [@wence-](https://github.com/wence-) +- Explicitly depend on zlib in conda recipes ([#14018](https://github.com/rapidsai/cudf/pull/14018)) [@wence-](https://github.com/wence-) +- Use grid_stride for stride computations. ([#13996](https://github.com/rapidsai/cudf/pull/13996)) [@bdice](https://github.com/bdice) +- Fix an issue where casting null-array to `object` dtype will result in a failure ([#13994](https://github.com/rapidsai/cudf/pull/13994)) [@galipremsagar](https://github.com/galipremsagar) +- Add tab as literal to cudf::test::to_string output ([#13993](https://github.com/rapidsai/cudf/pull/13993)) [@davidwendt](https://github.com/davidwendt) +- Enable `codes` dtype parity in pandas-compatibility mode for `factorize` API ([#13982](https://github.com/rapidsai/cudf/pull/13982)) [@galipremsagar](https://github.com/galipremsagar) +- Fix `CategoricalIndex` ordering in `Groupby.agg` when pandas-compatibility mode is enabled ([#13978](https://github.com/rapidsai/cudf/pull/13978)) [@galipremsagar](https://github.com/galipremsagar) +- Produce a fatal error if cudf is unable to find pyarrow include directory ([#13976](https://github.com/rapidsai/cudf/pull/13976)) [@cwharris](https://github.com/cwharris) +- Use `thread_index_type` in `partitioning.cu` ([#13973](https://github.com/rapidsai/cudf/pull/13973)) [@divyegala](https://github.com/divyegala) +- Use `cudf::thread_index_type` in `merge.cu` ([#13972](https://github.com/rapidsai/cudf/pull/13972)) [@divyegala](https://github.com/divyegala) +- Use `copy-pr-bot` ([#13970](https://github.com/rapidsai/cudf/pull/13970)) [@ajschmidt8](https://github.com/ajschmidt8) +- Use cudf::thread_index_type in strings custom kernels ([#13968](https://github.com/rapidsai/cudf/pull/13968)) [@davidwendt](https://github.com/davidwendt) +- Add `bytes_per_second` to hash_partition benchmark ([#13965](https://github.com/rapidsai/cudf/pull/13965)) [@Blonck](https://github.com/Blonck) +- Added pinned pool reservation API for java ([#13964](https://github.com/rapidsai/cudf/pull/13964)) [@revans2](https://github.com/revans2) +- Simplify wheel build scripts and allow alphas of RAPIDS dependencies ([#13963](https://github.com/rapidsai/cudf/pull/13963)) [@vyasr](https://github.com/vyasr) +- Add `bytes_per_second` to copy_if_else benchmark ([#13960](https://github.com/rapidsai/cudf/pull/13960)) [@Blonck](https://github.com/Blonck) +- Add pandas compatible output to `Series.unique` ([#13959](https://github.com/rapidsai/cudf/pull/13959)) [@galipremsagar](https://github.com/galipremsagar) +- Add `bytes_per_second` to compiled binaryop benchmark ([#13938](https://github.com/rapidsai/cudf/pull/13938)) [@Blonck](https://github.com/Blonck) +- Unpin `dask` and `distributed` for `23.10` development ([#13935](https://github.com/rapidsai/cudf/pull/13935)) [@galipremsagar](https://github.com/galipremsagar) +- Make HostColumnVector.getRefCount public ([#13934](https://github.com/rapidsai/cudf/pull/13934)) [@abellina](https://github.com/abellina) +- Use cuco::static_set in JSON tree algorithm ([#13928](https://github.com/rapidsai/cudf/pull/13928)) [@karthikeyann](https://github.com/karthikeyann) +- Add java API to get size of host memory needed to copy column view ([#13919](https://github.com/rapidsai/cudf/pull/13919)) [@revans2](https://github.com/revans2) +- Use cudf::size_type instead of int32 where appropriate in nvtext functions ([#13915](https://github.com/rapidsai/cudf/pull/13915)) [@davidwendt](https://github.com/davidwendt) +- Enable hugepage for arrow host allocations ([#13914](https://github.com/rapidsai/cudf/pull/13914)) [@madsbk](https://github.com/madsbk) +- Improve performance of nvtext::edit_distance ([#13912](https://github.com/rapidsai/cudf/pull/13912)) [@davidwendt](https://github.com/davidwendt) +- Ensure cudf internals use pylibcudf in pure Python mode ([#13909](https://github.com/rapidsai/cudf/pull/13909)) [@vyasr](https://github.com/vyasr) +- Use `empty()` instead of `size()` where possible ([#13908](https://github.com/rapidsai/cudf/pull/13908)) [@vuule](https://github.com/vuule) +- [JNI] Adds HostColumnVector.EventHandler for spillability checks ([#13898](https://github.com/rapidsai/cudf/pull/13898)) [@abellina](https://github.com/abellina) +- Return `Timestamp` & `Timedelta` for fetching scalars in `DatetimeIndex` & `TimedeltaIndex` ([#13896](https://github.com/rapidsai/cudf/pull/13896)) [@galipremsagar](https://github.com/galipremsagar) +- Allow explicit `shuffle="p2p"` within dask-cudf API ([#13893](https://github.com/rapidsai/cudf/pull/13893)) [@rjzamora](https://github.com/rjzamora) +- Disable creation of `DatetimeIndex` when `freq` is passed to `cudf.date_range` ([#13890](https://github.com/rapidsai/cudf/pull/13890)) [@galipremsagar](https://github.com/galipremsagar) +- Bring parity with pandas for `datetime` & `timedelta` comparison operations ([#13877](https://github.com/rapidsai/cudf/pull/13877)) [@galipremsagar](https://github.com/galipremsagar) +- Change `NA` to `NaT` for `datetime` and `timedelta` types ([#13868](https://github.com/rapidsai/cudf/pull/13868)) [@galipremsagar](https://github.com/galipremsagar) +- Raise error when `astype(object)` is called in pandas compatibility mode ([#13862](https://github.com/rapidsai/cudf/pull/13862)) [@galipremsagar](https://github.com/galipremsagar) +- Fixes a performance regression in FST ([#13850](https://github.com/rapidsai/cudf/pull/13850)) [@elstehle](https://github.com/elstehle) +- Set native handles to null on close in Java wrapper classes ([#13818](https://github.com/rapidsai/cudf/pull/13818)) [@jlowe](https://github.com/jlowe) +- Avoid use of CUDF_EXPECTS in libcudf unit tests outside of helper functions with return values ([#13812](https://github.com/rapidsai/cudf/pull/13812)) [@vuule](https://github.com/vuule) +- Update `lists::contains` to experimental row comparator ([#13810](https://github.com/rapidsai/cudf/pull/13810)) [@divyegala](https://github.com/divyegala) +- Reduce `lists::contains` dispatches for scalars ([#13805](https://github.com/rapidsai/cudf/pull/13805)) [@divyegala](https://github.com/divyegala) +- Long string optimization for string column parsing in JSON reader ([#13803](https://github.com/rapidsai/cudf/pull/13803)) [@karthikeyann](https://github.com/karthikeyann) +- Raise NotImplementedError for pd.SparseDtype ([#13798](https://github.com/rapidsai/cudf/pull/13798)) [@mroeschke](https://github.com/mroeschke) +- Remove the libcudf cudf::offset_type type ([#13788](https://github.com/rapidsai/cudf/pull/13788)) [@davidwendt](https://github.com/davidwendt) +- Move Spark-indpendent Table debug to cudf Java ([#13783](https://github.com/rapidsai/cudf/pull/13783)) [@gerashegalov](https://github.com/gerashegalov) +- Update to Cython 3.0.0 ([#13777](https://github.com/rapidsai/cudf/pull/13777)) [@vyasr](https://github.com/vyasr) +- Refactor Parquet reader handling of V2 page header info ([#13775](https://github.com/rapidsai/cudf/pull/13775)) [@etseidl](https://github.com/etseidl) +- Branch 23.10 merge 23.08 ([#13773](https://github.com/rapidsai/cudf/pull/13773)) [@vyasr](https://github.com/vyasr) +- Restructure JSON code to correctly reflect legacy/experimental status ([#13757](https://github.com/rapidsai/cudf/pull/13757)) [@vuule](https://github.com/vuule) +- Branch 23.10 merge 23.08 ([#13753](https://github.com/rapidsai/cudf/pull/13753)) [@vyasr](https://github.com/vyasr) +- Enforce deprecations in `23.10` ([#13732](https://github.com/rapidsai/cudf/pull/13732)) [@galipremsagar](https://github.com/galipremsagar) +- Upgrade to arrow 12 ([#13728](https://github.com/rapidsai/cudf/pull/13728)) [@galipremsagar](https://github.com/galipremsagar) +- Refactors JSON reader's pushdown automaton ([#13716](https://github.com/rapidsai/cudf/pull/13716)) [@elstehle](https://github.com/elstehle) +- Remove Arrow dependency from the `datasource.hpp` public header ([#13698](https://github.com/rapidsai/cudf/pull/13698)) [@vuule](https://github.com/vuule) + # cuDF 23.08.00 (9 Aug 2023) ## 🚨 Breaking Changes