diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 8976a4b14cb..2c79cbb6b6c 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -40,7 +40,7 @@ dependencies: - hypothesis - identify>=2.5.20 - ipython -- libarrow==12.0.1.* +- libarrow==12.0.0.* - libcufile-dev=1.4.0.31 - libcufile=1.4.0.31 - libcurand-dev=10.3.0.86 @@ -69,7 +69,7 @@ dependencies: - pre-commit - protobuf>=4.21,<5 - ptxcompiler -- pyarrow==12.0.1.* +- pyarrow==12.0.0.* - pydata-sphinx-theme - pyorc - pytest diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index f54d78593c3..c96b7428882 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -17,6 +17,7 @@ dependencies: - cachetools - cmake>=3.26.4 - cuda-cudart-dev +- cuda-gdb - cuda-nvcc - cuda-nvrtc-dev - cuda-nvtx-dev @@ -41,7 +42,7 @@ dependencies: - hypothesis - identify>=2.5.20 - ipython -- libarrow==12.0.1.* +- libarrow==12.0.0.* - libcufile-dev - libcurand-dev - libkvikio==23.12.* @@ -66,7 +67,7 @@ dependencies: - pip - pre-commit - protobuf>=4.21,<5 -- pyarrow==12.0.1.* +- pyarrow==12.0.0.* - pydata-sphinx-theme - pyorc - pytest diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 54b687faa69..16b064a262e 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -61,7 +61,7 @@ requirements: - scikit-build >=0.13.1 - setuptools - dlpack >=0.5,<0.6.0a0 - - pyarrow =12 + - pyarrow =12.0.0 - libcudf ={{ version }} - rmm ={{ minor_version }} {% if cuda_major == "11" %} diff --git a/conda/recipes/cudf_kafka/build.sh b/conda/recipes/cudf_kafka/build.sh index 5d8720f1c98..f4bb6e1bc91 100644 --- a/conda/recipes/cudf_kafka/build.sh +++ b/conda/recipes/cudf_kafka/build.sh @@ -1,4 +1,16 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. # This assumes the script is executed from the root of the repo directory +# Need to set CUDA_HOME inside conda environments because the hacked together +# setup.py for cudf-kafka searches that way. +# TODO: Remove after https://github.com/rapidsai/cudf/pull/14292 updates +# cudf_kafka to use scikit-build +CUDA_MAJOR=${RAPIDS_CUDA_VERSION%%.*} +if [[ ${CUDA_MAJOR} == "12" ]]; then + target_name="x86_64-linux" + if [[ ! 
$(arch) == "x86_64" ]]; then + target_name="sbsa-linux" + fi + export CUDA_HOME="${PREFIX}/targets/${target_name}/" +fi ./build.sh -v cudf_kafka diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml index ec0cc402511..a79c23b7d98 100644 --- a/conda/recipes/cudf_kafka/meta.yaml +++ b/conda/recipes/cudf_kafka/meta.yaml @@ -33,6 +33,9 @@ build: - SCCACHE_S3_KEY_PREFIX=cudf-kafka-linux64 # [linux64] - SCCACHE_S3_USE_SSL - SCCACHE_S3_NO_CREDENTIALS + # TODO: Remove after https://github.com/rapidsai/cudf/pull/14292 updates + # cudf_kafka to use scikit-build + - RAPIDS_CUDA_VERSION requirements: build: @@ -41,6 +44,11 @@ requirements: - {{ compiler('cxx') }} - ninja - sysroot_{{ target_platform }} {{ sysroot_version }} + # TODO: Remove after https://github.com/rapidsai/cudf/pull/14292 updates + # cudf_kafka to use scikit-build + {% if cuda_major == "12" %} + - cuda-gdb + {% endif %} host: - python - cython >=3.0.0 @@ -48,6 +56,9 @@ requirements: - cudf ={{ version }} - libcudf_kafka ={{ version }} - setuptools + {% if cuda_major == "12" %} + - cuda-cudart-dev + {% endif %} run: - python - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml index b1f5b083e06..4d33bb89220 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -25,8 +25,8 @@ gtest_version: aws_sdk_cpp_version: - "<1.11" -libarrow_version: - - "=12" +libarrow: + - "==12.0.0" dlpack_version: - ">=0.5,<0.6.0a0" diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 28357f0d96d..b9aff2a9c82 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -65,7 +65,7 @@ requirements: {% endif %} - cuda-version ={{ cuda_version }} - nvcomp {{ nvcomp_version }} - - libarrow {{ libarrow_version }} + - libarrow {{ libarrow }} - dlpack {{ dlpack_version }} - librdkafka {{ librdkafka_version }} - fmt {{ fmt_version }} @@ -104,7 +104,7 @@ outputs: - nvcomp {{ nvcomp_version }} - librmm ={{ minor_version }} - libkvikio ={{ minor_version }} - - libarrow {{ libarrow_version }} + - libarrow {{ libarrow }} - dlpack {{ dlpack_version }} - gtest {{ gtest_version }} - gmock {{ gtest_version }} diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index 10d3145a36f..c2d5cfbaf78 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -411,7 +411,7 @@ if(NOT DEFINED CUDF_VERSION_Arrow) set(CUDF_VERSION_Arrow # This version must be kept in sync with the libarrow version pinned for builds in # dependencies.yaml. - 12.0.1 + 12.0.0 CACHE STRING "The version of Arrow to find (or build)" ) endif() diff --git a/cpp/include/cudf/lists/detail/scatter.cuh b/cpp/include/cudf/lists/detail/scatter.cuh index f04b2fda2bf..ff148c59a23 100644 --- a/cpp/include/cudf/lists/detail/scatter.cuh +++ b/cpp/include/cudf/lists/detail/scatter.cuh @@ -20,9 +20,9 @@ #include #include #include +#include #include #include -#include #include #include #include @@ -130,8 +130,8 @@ std::unique_ptr scatter_impl(rmm::device_uvector cons std::vector> children; children.emplace_back(std::move(offsets_column)); children.emplace_back(std::move(child_column)); - auto null_mask = - target.has_nulls() ? copy_bitmask(target, stream, mr) : rmm::device_buffer{0, stream, mr}; + auto null_mask = target.has_nulls() ? 
cudf::detail::copy_bitmask(target, stream, mr) + : rmm::device_buffer{0, stream, mr}; // The output column from this function only has null masks copied from the target columns. // That is still not a correct final null mask for the scatter result. diff --git a/cpp/include/cudf/null_mask.hpp b/cpp/include/cudf/null_mask.hpp index 672f479ad53..524296e60ca 100644 --- a/cpp/include/cudf/null_mask.hpp +++ b/cpp/include/cudf/null_mask.hpp @@ -16,6 +16,7 @@ #pragma once #include +#include #include #include @@ -80,6 +81,7 @@ size_type num_bitmask_words(size_type number_of_bits); * * @param size The number of elements to be represented by the mask * @param state The desired state of the mask + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned device_buffer * @return A `device_buffer` for use as a null bitmask * satisfying the desired size and state @@ -87,6 +89,7 @@ size_type num_bitmask_words(size_type number_of_bits); rmm::device_buffer create_null_mask( size_type size, mask_state state, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -100,8 +103,13 @@ rmm::device_buffer create_null_mask( * @param begin_bit Index of the first bit to set (inclusive) * @param end_bit Index of the last bit to set (exclusive) * @param valid If true set all entries to valid; otherwise, set all to null + * @param stream CUDA stream used for device memory operations and kernel launches */ -void set_null_mask(bitmask_type* bitmask, size_type begin_bit, size_type end_bit, bool valid); +void set_null_mask(bitmask_type* bitmask, + size_type begin_bit, + size_type end_bit, + bool valid, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Creates a `device_buffer` from a slice of bitmask defined by a range @@ -115,6 +123,7 @@ void set_null_mask(bitmask_type* bitmask, size_type begin_bit, size_type end_bit * @param mask Bitmask residing in device memory whose bits will be copied * @param begin_bit Index of the first bit to be copied (inclusive) * @param end_bit Index of the last bit to be copied (exclusive) + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned device_buffer * @return A `device_buffer` containing the bits * `[begin_bit, end_bit)` from `mask`. @@ -123,6 +132,7 @@ rmm::device_buffer copy_bitmask( bitmask_type const* mask, size_type begin_bit, size_type end_bit, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -132,12 +142,14 @@ rmm::device_buffer copy_bitmask( * Returns empty `device_buffer` if the column is not nullable * * @param view Column view whose bitmask needs to be copied + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned device_buffer * @return A `device_buffer` containing the bits * `[view.offset(), view.offset() + view.size())` from `view`'s bitmask. */ rmm::device_buffer copy_bitmask( column_view const& view, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -148,11 +160,13 @@ rmm::device_buffer copy_bitmask( * If no column in the table is nullable, an empty bitmask is returned. 
* * @param view The table of columns + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned device_buffer * @return A pair of resulting bitmask and count of unset bits */ std::pair bitmask_and( table_view const& view, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -163,11 +177,13 @@ std::pair bitmask_and( * If no column in the table is nullable, an empty bitmask is returned. * * @param view The table of columns + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned device_buffer * @return A pair of resulting bitmask and count of unset bits */ std::pair bitmask_or( table_view const& view, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -183,8 +199,12 @@ std::pair bitmask_or( * @param bitmask Validity bitmask residing in device memory. * @param start Index of the first bit to count (inclusive). * @param stop Index of the last bit to count (exclusive). + * @param stream CUDA stream used for device memory operations and kernel launches * @return The number of null elements in the specified range. */ -cudf::size_type null_count(bitmask_type const* bitmask, size_type start, size_type stop); +cudf::size_type null_count(bitmask_type const* bitmask, + size_type start, + size_type stop, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** @} */ // end of group } // namespace cudf diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp index 6b413ab2be4..53b04c4ca80 100644 --- a/cpp/src/binaryop/binaryop.cpp +++ b/cpp/src/binaryop/binaryop.cpp @@ -366,7 +366,7 @@ std::unique_ptr binary_operation(column_view const& lhs, CUDF_EXPECTS((lhs.size() == rhs.size()), "Column sizes don't match"); - auto [new_mask, null_count] = bitmask_and(table_view({lhs, rhs}), stream, mr); + auto [new_mask, null_count] = cudf::detail::bitmask_and(table_view({lhs, rhs}), stream, mr); auto out = make_fixed_width_column(output_type, lhs.size(), std::move(new_mask), null_count, stream, mr); diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu index 5a0d3e4f120..3ff56eabe1e 100644 --- a/cpp/src/bitmask/null_mask.cu +++ b/cpp/src/bitmask/null_mask.cu @@ -157,16 +157,21 @@ void set_null_mask(bitmask_type* bitmask, // Create a device_buffer for a null mask rmm::device_buffer create_null_mask(size_type size, mask_state state, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return detail::create_null_mask(size, state, cudf::get_default_stream(), mr); + return detail::create_null_mask(size, state, stream, mr); } // Set pre-allocated null mask of given bit range [begin_bit, end_bit) to valid, if valid==true, // or null, otherwise; -void set_null_mask(bitmask_type* bitmask, size_type begin_bit, size_type end_bit, bool valid) +void set_null_mask(bitmask_type* bitmask, + size_type begin_bit, + size_type end_bit, + bool valid, + rmm::cuda_stream_view stream) { - return detail::set_null_mask(bitmask, begin_bit, end_bit, valid, cudf::get_default_stream()); + return detail::set_null_mask(bitmask, begin_bit, end_bit, valid, stream); } namespace detail { @@ -511,33 +516,46 @@ std::pair bitmask_or(table_view const& view, rmm::device_buffer copy_bitmask(bitmask_type const* mask, size_type 
begin_bit, size_type end_bit, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return detail::copy_bitmask(mask, begin_bit, end_bit, cudf::get_default_stream(), mr); + CUDF_FUNC_RANGE(); + return detail::copy_bitmask(mask, begin_bit, end_bit, stream, mr); } // Create a bitmask from a column view -rmm::device_buffer copy_bitmask(column_view const& view, rmm::mr::device_memory_resource* mr) +rmm::device_buffer copy_bitmask(column_view const& view, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - return detail::copy_bitmask(view, cudf::get_default_stream(), mr); + CUDF_FUNC_RANGE(); + return detail::copy_bitmask(view, stream, mr); } std::pair bitmask_and(table_view const& view, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return detail::bitmask_and(view, cudf::get_default_stream(), mr); + CUDF_FUNC_RANGE(); + return detail::bitmask_and(view, stream, mr); } std::pair bitmask_or(table_view const& view, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return detail::bitmask_or(view, cudf::get_default_stream(), mr); + CUDF_FUNC_RANGE(); + return detail::bitmask_or(view, stream, mr); } // Count non-zero bits in the specified range -cudf::size_type null_count(bitmask_type const* bitmask, size_type start, size_type stop) +cudf::size_type null_count(bitmask_type const* bitmask, + size_type start, + size_type stop, + rmm::cuda_stream_view stream) { - return detail::null_count(bitmask, start, stop, cudf::get_default_stream()); + CUDF_FUNC_RANGE(); + return detail::null_count(bitmask, start, stop, stream); } } // namespace cudf diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index d08c3025553..9b9e780965a 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -563,7 +563,7 @@ rmm::device_buffer concatenate_masks(host_span views, }); rmm::device_buffer null_mask = - create_null_mask(total_element_count, mask_state::UNINITIALIZED, mr); + cudf::detail::create_null_mask(total_element_count, mask_state::UNINITIALIZED, stream, mr); detail::concatenate_masks(views, static_cast(null_mask.data()), stream); diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu index 11c27fc86e3..879ddb5048e 100644 --- a/cpp/src/copying/scatter.cu +++ b/cpp/src/copying/scatter.cu @@ -268,8 +268,9 @@ struct column_scalar_scatterer_impl { // Compute null mask rmm::device_buffer null_mask = - target.nullable() ? copy_bitmask(target, stream, mr) - : create_null_mask(target.size(), mask_state::UNALLOCATED, stream, mr); + target.nullable() + ? detail::copy_bitmask(target, stream, mr) + : detail::create_null_mask(target.size(), mask_state::UNALLOCATED, stream, mr); column null_mask_stub(data_type{type_id::STRUCT}, target.size(), rmm::device_buffer{}, diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 506832881a9..195c8924c9a 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -410,7 +410,8 @@ void sparse_to_dense_results(table_view const& keys, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto row_bitmask = bitmask_and(keys, stream, rmm::mr::get_current_device_resource()).first; + auto row_bitmask = + cudf::detail::bitmask_and(keys, stream, rmm::mr::get_current_device_resource()).first; bool skip_key_rows_with_nulls = keys_have_nulls and include_null_keys == null_policy::EXCLUDE; bitmask_type const* row_bitmask_ptr = skip_key_rows_with_nulls ? 
static_cast(row_bitmask.data()) : nullptr; diff --git a/cpp/src/io/fst/lookup_tables.cuh b/cpp/src/io/fst/lookup_tables.cuh index 37c99453361..42036b79751 100644 --- a/cpp/src/io/fst/lookup_tables.cuh +++ b/cpp/src/io/fst/lookup_tables.cuh @@ -753,7 +753,7 @@ class TranslationOp { RelativeOffsetT const relative_offset, SymbolT const read_symbol) const { - return translation_op(*this, state_id, match_id, relative_offset, read_symbol); + return translation_op(state_id, match_id, relative_offset, read_symbol); } template @@ -761,7 +761,7 @@ class TranslationOp { SymbolIndexT const match_id, SymbolT const read_symbol) const { - return translation_op(*this, state_id, match_id, read_symbol); + return translation_op(state_id, match_id, read_symbol); } }; diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index c9107357239..3702d94fd2b 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -91,6 +91,98 @@ void check_input_size(std::size_t input_size) namespace cudf::io::json { +// FST to help fix the stack context of characters that follow the first record on each JSON line +namespace fix_stack_of_excess_chars { + +// Type used to represent the target state in the transition table +using StateT = char; + +// Type used to represent a symbol group id +using SymbolGroupT = uint8_t; + +/** + * @brief Definition of the DFA's states. + */ +enum class dfa_states : StateT { + // Before the first record on the JSON line + BEFORE, + // Within the first record on the JSON line + WITHIN, + // Excess data that follows the first record on the JSON line + EXCESS, + // Total number of states + NUM_STATES +}; + +/** + * @brief Definition of the symbol groups + */ +enum class dfa_symbol_group_id : SymbolGroupT { + ROOT, ///< Symbol for root stack context + DELIMITER, ///< Line delimiter symbol group + OTHER, ///< Symbol group that implicitly matches all other tokens + NUM_SYMBOL_GROUPS ///< Total number of symbol groups +}; + +constexpr auto TT_NUM_STATES = static_cast(dfa_states::NUM_STATES); +constexpr auto NUM_SYMBOL_GROUPS = static_cast(dfa_symbol_group_id::NUM_SYMBOL_GROUPS); + +/** + * @brief Function object to map (input_symbol,stack_context) tuples to a symbol group. + */ +struct SymbolPairToSymbolGroupId { + CUDF_HOST_DEVICE SymbolGroupT operator()(thrust::tuple symbol) const + { + auto const input_symbol = thrust::get<0>(symbol); + auto const stack_symbol = thrust::get<1>(symbol); + return static_cast( + input_symbol == '\n' + ? dfa_symbol_group_id::DELIMITER + : (stack_symbol == '_' ? dfa_symbol_group_id::ROOT : dfa_symbol_group_id::OTHER)); + } +}; + +/** + * @brief Translation function object that fixes the stack context of excess data that follows + * the first JSON record on each line.
+ */ +struct TransduceInputOp { + template + constexpr CUDF_HOST_DEVICE StackSymbolT operator()(StateT const state_id, + SymbolGroupT const match_id, + RelativeOffsetT const relative_offset, + SymbolT const read_symbol) const + { + if (state_id == static_cast(dfa_states::EXCESS)) { return '_'; } + return thrust::get<1>(read_symbol); + } + + template + constexpr CUDF_HOST_DEVICE int32_t operator()(StateT const state_id, + SymbolGroupT const match_id, + SymbolT const read_symbol) const + { + constexpr int32_t single_output_item = 1; + return single_output_item; + } +}; + +// Aliases for readability of the transition table +constexpr auto TT_BEFORE = dfa_states::BEFORE; +constexpr auto TT_INSIDE = dfa_states::WITHIN; +constexpr auto TT_EXCESS = dfa_states::EXCESS; + +// Transition table +std::array, TT_NUM_STATES> constexpr transition_table{ + {/* IN_STATE ROOT NEWLINE OTHER */ + /* TT_BEFORE */ {{TT_BEFORE, TT_BEFORE, TT_INSIDE}}, + /* TT_INSIDE */ {{TT_EXCESS, TT_BEFORE, TT_INSIDE}}, + /* TT_EXCESS */ {{TT_EXCESS, TT_BEFORE, TT_EXCESS}}}}; + +// The DFA's starting state +constexpr auto start_state = static_cast(dfa_states::BEFORE); +} // namespace fix_stack_of_excess_chars + // FST to prune tokens of invalid lines for recovering JSON lines format namespace token_filter { @@ -146,9 +238,8 @@ struct UnwrapTokenFromSymbolOp { * invalid lines. */ struct TransduceToken { - template - constexpr CUDF_HOST_DEVICE SymbolT operator()(TransducerTableT const&, - StateT const state_id, + template + constexpr CUDF_HOST_DEVICE SymbolT operator()(StateT const state_id, SymbolGroupT const match_id, RelativeOffsetT const relative_offset, SymbolT const read_symbol) const @@ -165,9 +256,8 @@ struct TransduceToken { } } - template - constexpr CUDF_HOST_DEVICE int32_t operator()(TransducerTableT const&, - StateT const state_id, + template + constexpr CUDF_HOST_DEVICE int32_t operator()(StateT const state_id, SymbolGroupT const match_id, SymbolT const read_symbol) const { @@ -643,6 +733,11 @@ auto get_transition_table(json_format_cfg_t format) // PD_ANL describes the target state after a new line after encountering error state auto const PD_ANL = (format == json_format_cfg_t::JSON_LINES_RECOVER) ? PD_BOV : PD_ERR; + // Target state after having parsed the first JSON value on a JSON line + // Spark has the special need to ignore everything that comes after the first JSON object + // on a JSON line instead of marking those as invalid + auto const PD_AFS = (format == json_format_cfg_t::JSON_LINES_RECOVER) ? 
PD_PVL : PD_ERR; + // First row: empty stack ("root" level of the JSON) // Second row: '[' on top of stack (we're parsing a list value) // Third row: '{' on top of stack (we're parsing a struct value) @@ -668,7 +763,7 @@ auto get_transition_table(json_format_cfg_t format) PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_BOV, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_BOV, PD_STR}; pda_tt[static_cast(pda_state_t::PD_PVL)] = { - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_BOV, PD_ERR, + PD_AFS, PD_AFS, PD_AFS, PD_AFS, PD_AFS, PD_AFS, PD_AFS, PD_AFS, PD_PVL, PD_BOV, PD_AFS, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_PVL, PD_BOV, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_ERR, PD_PVL, PD_BOV, PD_ERR}; pda_tt[static_cast(pda_state_t::PD_BFN)] = { @@ -733,6 +828,18 @@ auto get_translation_table(bool recover_from_error) return regular_tokens; }; + /** + * @brief Helper function that returns `recovering_tokens` if `recover_from_error` is true and + * returns `regular_tokens` otherwise. This is used to ignore excess characters after the first + * value in the case of JSON lines that recover from invalid lines, as Spark ignores any excess + * characters that follow the first record on a JSON line. + */ + auto alt_tokens = [recover_from_error](std::vector regular_tokens, + std::vector recovering_tokens) { + if (recover_from_error) { return recovering_tokens; } + return regular_tokens; + }; + std::array, NUM_PDA_SGIDS>, PD_NUM_STATES> pda_tlt; pda_tlt[static_cast(pda_state_t::PD_BOV)] = {{ /*ROOT*/ {StructBegin}, // OPENING_BRACE @@ -920,18 +1027,18 @@ auto get_translation_table(bool recover_from_error) {}}}; // OTHER pda_tlt[static_cast(pda_state_t::PD_PVL)] = { - { /*ROOT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {}, // WHITE_SPACE - nl_tokens({}, {}), // LINE_BREAK - {ErrorBegin}, // OTHER + { /*ROOT*/ + {alt_tokens({ErrorBegin}, {})}, // OPENING_BRACE + {alt_tokens({ErrorBegin}, {})}, // OPENING_BRACKET + {alt_tokens({ErrorBegin}, {})}, // CLOSING_BRACE + {alt_tokens({ErrorBegin}, {})}, // CLOSING_BRACKET + {alt_tokens({ErrorBegin}, {})}, // QUOTE + {alt_tokens({ErrorBegin}, {})}, // ESCAPE + {alt_tokens({ErrorBegin}, {})}, // COMMA + {alt_tokens({ErrorBegin}, {})}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {}), // LINE_BREAK + {alt_tokens({ErrorBegin}, {})}, // OTHER /*LIST*/ {ErrorBegin}, // OPENING_BRACE {ErrorBegin}, // OPENING_BRACKET @@ -1446,6 +1553,26 @@ std::pair, rmm::device_uvector> ge // character. auto zip_in = thrust::make_zip_iterator(json_in.data(), stack_symbols.data()); + // Spark, as the main stakeholder in the `recover_from_error` option, has the specific need to + // ignore any characters that follow the first value on each JSON line. This is an FST that + // fixes the stack context for those excess characters. 
That is, all those excess characters + // will be interpreted in the root stack context + if (recover_from_error) { + auto fix_stack_of_excess_chars = fst::detail::make_fst( + fst::detail::make_symbol_group_lookup_op( + fix_stack_of_excess_chars::SymbolPairToSymbolGroupId{}), + fst::detail::make_transition_table(fix_stack_of_excess_chars::transition_table), + fst::detail::make_translation_functor(fix_stack_of_excess_chars::TransduceInputOp{}), + stream); + fix_stack_of_excess_chars.Transduce(zip_in, + static_cast(json_in.size()), + stack_symbols.data(), + thrust::make_discard_iterator(), + thrust::make_discard_iterator(), + fix_stack_of_excess_chars::start_state, + stream); + } + constexpr auto max_translation_table_size = tokenizer_pda::NUM_PDA_SGIDS * static_cast(tokenizer_pda::pda_state_t::PD_NUM_STATES); diff --git a/cpp/src/io/parquet/delta_binary.cuh b/cpp/src/io/parquet/delta_binary.cuh index a513e6674b4..e3b23f4c0a0 100644 --- a/cpp/src/io/parquet/delta_binary.cuh +++ b/cpp/src/io/parquet/delta_binary.cuh @@ -46,12 +46,6 @@ namespace cudf::io::parquet::detail { // encoded with DELTA_LENGTH_BYTE_ARRAY encoding, which is a DELTA_BINARY_PACKED list of suffix // lengths, followed by the concatenated suffix data. -// TODO: The delta encodings use ULEB128 integers, but for now we're only -// using max 64 bits. Need to see what the performance impact is of using -// __int128_t rather than int64_t. -using uleb128_t = uint64_t; -using zigzag128_t = int64_t; - // we decode one mini-block at a time. max mini-block size seen is 64. constexpr int delta_rolling_buf_size = 128; diff --git a/cpp/src/io/parquet/delta_enc.cuh b/cpp/src/io/parquet/delta_enc.cuh new file mode 100644 index 00000000000..28f8cdfe2c1 --- /dev/null +++ b/cpp/src/io/parquet/delta_enc.cuh @@ -0,0 +1,290 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "parquet_gpu.hpp" + +#include +#include + +#include + +namespace cudf::io::parquet::detail { + +namespace delta { + +inline __device__ void put_uleb128(uint8_t*& p, uleb128_t v) +{ + while (v > 0x7f) { + *(p++) = v | 0x80; + v >>= 7; + } + *(p++) = v; +} + +inline __device__ void put_zz128(uint8_t*& p, zigzag128_t v) +{ + zigzag128_t s = (v < 0); + put_uleb128(p, (v ^ -s) * 2 + s); +} + +// A block size of 128, with 4 mini-blocks of 32 values each fits nicely without consuming +// too much shared memory. +// The parquet spec requires block_size to be a multiple of 128, and values_per_mini_block +// to be a multiple of 32. +constexpr int block_size = 128; +constexpr int num_mini_blocks = 4; +constexpr int values_per_mini_block = block_size / num_mini_blocks; +constexpr int buffer_size = 2 * block_size; + +// Extra sanity checks to enforce compliance with the parquet specification.
+static_assert(block_size % 128 == 0); +static_assert(values_per_mini_block % 32 == 0); + +using block_reduce = cub::BlockReduce; +using warp_reduce = cub::WarpReduce; +using index_scan = cub::BlockScan; + +constexpr int rolling_idx(int index) { return rolling_index(index); } + +// Version of bit packer that can handle up to 64 bits values. +// T is the type to use for processing. if nbits <= 32 use uint32_t, otherwise unsigned long long +// (not uint64_t because of atomicOr's typing). allowing this to be selectable since there's a +// measurable impact to using the wider types. +template +inline __device__ void bitpack_mini_block( + uint8_t* dst, uleb128_t val, uint32_t count, uint8_t nbits, void* temp_space) +{ + using wide_type = + std::conditional_t, __uint128_t, uint64_t>; + using cudf::detail::warp_size; + scratch_type constexpr mask = sizeof(scratch_type) * 8 - 1; + auto constexpr div = sizeof(scratch_type) * 8; + + auto const lane_id = threadIdx.x % warp_size; + auto const warp_id = threadIdx.x / warp_size; + + auto const scratch = reinterpret_cast(temp_space) + warp_id * warp_size; + + // zero out scratch + scratch[lane_id] = 0; + __syncwarp(); + + // TODO: see if there is any savings using special packing for easy bitwidths (1,2,4,8,16...) + // like what's done for the RLE encoder. + if (nbits == div) { + if (lane_id < count) { + for (int i = 0; i < sizeof(scratch_type); i++) { + dst[lane_id * sizeof(scratch_type) + i] = val & 0xff; + val >>= 8; + } + } + return; + } + + if (lane_id <= count) { + // Shift symbol left by up to mask bits. + wide_type v2 = val; + v2 <<= (lane_id * nbits) & mask; + + // Copy N bit word into two N/2 bit words while following C++ strict aliasing rules. + scratch_type v1[2]; + memcpy(&v1, &v2, sizeof(wide_type)); + + // Atomically write result to scratch. + if (v1[0]) { atomicOr(scratch + ((lane_id * nbits) / div), v1[0]); } + if (v1[1]) { atomicOr(scratch + ((lane_id * nbits) / div) + 1, v1[1]); } + } + __syncwarp(); + + // Copy scratch data to final destination. + auto const available_bytes = util::div_rounding_up_safe(count * nbits, 8U); + auto const scratch_bytes = reinterpret_cast(scratch); + + for (uint32_t i = lane_id; i < available_bytes; i += warp_size) { + dst[i] = scratch_bytes[i]; + } + __syncwarp(); +} + +} // namespace delta + +// Object used to turn a stream of integers into a DELTA_BINARY_PACKED stream. This takes as input +// 128 values with validity at a time, saving them until there are enough values for a block +// to be written. +// T is the input data type (either zigzag128_t or uleb128_t). +template +class delta_binary_packer { + private: + uint8_t* _dst; // sink to dump encoded values to + T* _buffer; // buffer to store values to be encoded + size_type _current_idx; // index of first value in buffer + uint32_t _num_values; // total number of values to encode + size_type _values_in_buffer; // current number of values stored in _buffer + uint8_t _mb_bits[delta::num_mini_blocks]; // bitwidth for each mini-block + + // pointers to shared scratch memory for the warp and block scans/reduces + delta::index_scan::TempStorage* _scan_tmp; + delta::warp_reduce::TempStorage* _warp_tmp; + delta::block_reduce::TempStorage* _block_tmp; + + void* _bitpack_tmp; // pointer to shared scratch memory used in bitpacking + + // Write the delta binary header. Only call from thread 0. 
+ inline __device__ void write_header() + { + delta::put_uleb128(_dst, delta::block_size); + delta::put_uleb128(_dst, delta::num_mini_blocks); + delta::put_uleb128(_dst, _num_values); + delta::put_zz128(_dst, _buffer[0]); + } + + // Write the block header. Only call from thread 0. + inline __device__ void write_block_header(zigzag128_t block_min) + { + delta::put_zz128(_dst, block_min); + memcpy(_dst, _mb_bits, 4); + _dst += 4; + } + + // Signed subtraction with defined wrapping behavior. + inline __device__ zigzag128_t subtract(zigzag128_t a, zigzag128_t b) + { + return static_cast(static_cast(a) - static_cast(b)); + } + + public: + inline __device__ auto num_values() const { return _num_values; } + + // Initialize the object. Only call from thread 0. + inline __device__ void init(uint8_t* dest, uint32_t num_values, T* buffer, void* temp_storage) + { + _dst = dest; + _num_values = num_values; + _buffer = buffer; + _scan_tmp = reinterpret_cast(temp_storage); + _warp_tmp = reinterpret_cast(temp_storage); + _block_tmp = reinterpret_cast(temp_storage); + _bitpack_tmp = _buffer + delta::buffer_size; + _current_idx = 0; + _values_in_buffer = 0; + } + + // Each thread calls this to add its current value. + inline __device__ void add_value(T value, bool is_valid) + { + // Figure out the correct position for the given value. + size_type const valid = is_valid; + size_type pos; + size_type num_valid; + delta::index_scan(*_scan_tmp).ExclusiveSum(valid, pos, num_valid); + + if (is_valid) { _buffer[delta::rolling_idx(pos + _current_idx + _values_in_buffer)] = value; } + __syncthreads(); + + if (threadIdx.x == 0) { + _values_in_buffer += num_valid; + // if first pass write header + if (_current_idx == 0) { + write_header(); + _current_idx = 1; + _values_in_buffer -= 1; + } + } + __syncthreads(); + + if (_values_in_buffer >= delta::block_size) { flush(); } + } + + // Called by each thread to flush data to the sink. + inline __device__ uint8_t const* flush() + { + using cudf::detail::warp_size; + __shared__ zigzag128_t block_min; + + int const t = threadIdx.x; + int const warp_id = t / warp_size; + int const lane_id = t % warp_size; + + if (_values_in_buffer <= 0) { return _dst; } + + // Calculate delta for this thread. + size_type const idx = _current_idx + t; + zigzag128_t const delta = idx < _num_values ? subtract(_buffer[delta::rolling_idx(idx)], + _buffer[delta::rolling_idx(idx - 1)]) + : std::numeric_limits::max(); + + // Find min delta for the block. + auto const min_delta = delta::block_reduce(*_block_tmp).Reduce(delta, cub::Min()); + + if (t == 0) { block_min = min_delta; } + __syncthreads(); + + // Compute frame of reference for the block. + uleb128_t const norm_delta = idx < _num_values ? subtract(delta, block_min) : 0; + + // Get max normalized delta for each warp, and use that to determine how many bits to use + // for the bitpacking of this warp. + zigzag128_t const warp_max = + delta::warp_reduce(_warp_tmp[warp_id]).Reduce(norm_delta, cub::Max()); + __syncwarp(); + + if (lane_id == 0) { _mb_bits[warp_id] = sizeof(zigzag128_t) * 8 - __clzll(warp_max); } + __syncthreads(); + + // write block header + if (t == 0) { write_block_header(block_min); } + __syncthreads(); + + // Now each warp encodes its data...can calculate starting offset with _mb_bits. + // NOTE: using a switch here rather than a loop because the compiler produces code that + // uses fewer registers. 
+ int cumulative_bits = 0; + switch (warp_id) { + case 3: cumulative_bits += _mb_bits[2]; [[fallthrough]]; + case 2: cumulative_bits += _mb_bits[1]; [[fallthrough]]; + case 1: cumulative_bits += _mb_bits[0]; + } + uint8_t* const mb_ptr = _dst + cumulative_bits * delta::values_per_mini_block / 8; + + // encoding happens here + auto const warp_idx = _current_idx + warp_id * delta::values_per_mini_block; + if (warp_idx < _num_values) { + auto const num_enc = min(delta::values_per_mini_block, _num_values - warp_idx); + if (_mb_bits[warp_id] > 32) { + delta::bitpack_mini_block( + mb_ptr, norm_delta, num_enc, _mb_bits[warp_id], _bitpack_tmp); + } else { + delta::bitpack_mini_block( + mb_ptr, norm_delta, num_enc, _mb_bits[warp_id], _bitpack_tmp); + } + } + __syncthreads(); + + // Last warp updates global delta ptr. + if (warp_id == delta::num_mini_blocks - 1 && lane_id == 0) { + _dst = mb_ptr + _mb_bits[warp_id] * delta::values_per_mini_block / 8; + _current_idx = min(warp_idx + delta::values_per_mini_block, _num_values); + _values_in_buffer = max(_values_in_buffer - delta::block_size, 0U); + } + __syncthreads(); + + return _dst; + } +}; + +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 78873d5e8ca..1e4f061d2e0 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "delta_enc.cuh" #include "parquet_gpu.cuh" #include @@ -21,6 +22,7 @@ #include #include #include +#include #include #include @@ -41,13 +43,19 @@ #include #include +#include + namespace cudf::io::parquet::detail { namespace { using ::cudf::detail::device_2dspan; -constexpr uint32_t rle_buffer_size = (1 << 9); +constexpr int encode_block_size = 128; +constexpr int rle_buffer_size = 2 * encode_block_size; +constexpr int num_encode_warps = encode_block_size / cudf::detail::warp_size; + +constexpr int rolling_idx(int pos) { return rolling_index(pos); } // do not truncate statistics constexpr int32_t NO_TRUNC_STATS = 0; @@ -69,6 +77,7 @@ struct frag_init_state_s { PageFragment frag; }; +template struct page_enc_state_s { uint8_t* cur; //!< current output ptr uint8_t* rle_out; //!< current RLE write ptr @@ -81,14 +90,15 @@ struct page_enc_state_s { uint32_t rle_rpt_count; uint32_t page_start_val; uint32_t chunk_start_val; - volatile uint32_t rpt_map[4]; - volatile uint32_t scratch_red[32]; + volatile uint32_t rpt_map[num_encode_warps]; EncPage page; EncColumnChunk ck; parquet_column_device_view col; - uint32_t vals[rle_buffer_size]; + uint32_t vals[rle_buf_size]; }; +using rle_page_enc_state_s = page_enc_state_s; + /** * @brief Returns the size of the type in the Parquet file. */ @@ -205,6 +215,12 @@ void __device__ calculate_frag_size(frag_init_state_s* const s, int t) } } +/** + * @brief Determine the correct page encoding for the given page parameters. + * + * This is only used by the plain and dictionary encoders. Delta encoders will set the page + * encoding directly. + */ Encoding __device__ determine_encoding(PageType page_type, Type physical_type, bool use_dictionary, @@ -216,7 +232,6 @@ Encoding __device__ determine_encoding(PageType page_type, switch (page_type) { case PageType::DATA_PAGE: return use_dictionary ? Encoding::PLAIN_DICTIONARY : Encoding::PLAIN; case PageType::DATA_PAGE_V2: - // TODO need to work in delta encodings here when they're added return physical_type == BOOLEAN ? Encoding::RLE : use_dictionary ? 
Encoding::RLE_DICTIONARY : Encoding::PLAIN; @@ -236,6 +251,50 @@ struct BitwiseOr { } }; +// I is the column type from the input table +template +__device__ uint8_t const* delta_encode(page_enc_state_s<0>* s, + uint32_t valid_count, + uint64_t* buffer, + void* temp_space) +{ + using output_type = std::conditional_t, zigzag128_t, uleb128_t>; + __shared__ delta_binary_packer packer; + + auto const t = threadIdx.x; + if (t == 0) { + packer.init(s->cur, valid_count, reinterpret_cast(buffer), temp_space); + } + __syncthreads(); + + // TODO(ets): in the plain encoder the scaling is a little different for INT32 than INT64. + // might need to modify this if there's a big performance hit in the 32-bit case. + int32_t const scale = s->col.ts_scale == 0 ? 1 : s->col.ts_scale; + for (uint32_t cur_val_idx = 0; cur_val_idx < s->page.num_leaf_values;) { + uint32_t const nvals = min(s->page.num_leaf_values - cur_val_idx, delta::block_size); + + size_type const val_idx_in_block = cur_val_idx + t; + size_type const val_idx = s->page_start_val + val_idx_in_block; + + bool const is_valid = + (val_idx < s->col.leaf_column->size() && val_idx_in_block < s->page.num_leaf_values) + ? s->col.leaf_column->is_valid(val_idx) + : false; + + cur_val_idx += nvals; + + output_type v = s->col.leaf_column->element(val_idx); + if (scale < 0) { + v /= -scale; + } else { + v *= scale; + } + packer.add_value(v, is_valid); + } + + return packer.flush(); +} + } // anonymous namespace // blockDim {512,1,1} @@ -323,6 +382,29 @@ __global__ void __launch_bounds__(128) } } +__device__ size_t delta_data_len(Type physical_type, cudf::type_id type_id, uint32_t num_values) +{ + auto const dtype_len_out = physical_type_len(physical_type, type_id); + auto const dtype_len = [&]() -> uint32_t { + if (physical_type == INT32) { return int32_logical_len(type_id); } + if (physical_type == INT96) { return sizeof(int64_t); } + return dtype_len_out; + }(); + + auto const vals_per_block = delta::block_size; + size_t const num_blocks = util::div_rounding_up_unsafe(num_values, vals_per_block); + // need max dtype_len + 1 bytes for min_delta + // one byte per mini block for the bitwidth + // and block_size * dtype_len bytes for the actual encoded data + auto const block_size = dtype_len + 1 + delta::num_mini_blocks + vals_per_block * dtype_len; + + // delta header is 2 bytes for the block_size, 1 byte for number of mini-blocks, + // max 5 bytes for number of values, and max dtype_len + 1 for first value. + auto const header_size = 2 + 1 + 5 + dtype_len + 1; + + return header_size + num_blocks * block_size; +} + // blockDim {128,1,1} __global__ void __launch_bounds__(128) gpuInitPages(device_2dspan chunks, @@ -354,6 +436,14 @@ __global__ void __launch_bounds__(128) page_g = {}; } __syncthreads(); + + // if writing delta encoded values, we're going to need to know the data length to get a guess + // at the worst case number of bytes needed to encode. 
+ auto const physical_type = col_g.physical_type; + auto const type_id = col_g.leaf_column->type().id(); + auto const is_use_delta = + write_v2_headers && !ck_g.use_dictionary && (physical_type == INT32 || physical_type == INT64); + if (t < 32) { uint32_t fragments_in_chunk = 0; uint32_t rows_in_page = 0; @@ -403,9 +493,12 @@ __global__ void __launch_bounds__(128) } __syncwarp(); if (t == 0) { - if (not pages.empty()) pages[ck_g.first_page] = page_g; - if (not page_sizes.empty()) page_sizes[ck_g.first_page] = page_g.max_data_size; - if (page_grstats) page_grstats[ck_g.first_page] = pagestats_g; + if (not pages.empty()) { + page_g.kernel_mask = encode_kernel_mask::PLAIN; + pages[ck_g.first_page] = page_g; + } + if (not page_sizes.empty()) { page_sizes[ck_g.first_page] = page_g.max_data_size; } + if (page_grstats) { page_grstats[ck_g.first_page] = pagestats_g; } } num_pages = 1; } @@ -505,7 +598,12 @@ __global__ void __launch_bounds__(128) page_g.num_values = values_in_page; auto const def_level_size = max_RLE_page_size(col_g.num_def_level_bits(), values_in_page); auto const rep_level_size = max_RLE_page_size(col_g.num_rep_level_bits(), values_in_page); - auto const max_data_size = page_size + def_level_size + rep_level_size + rle_pad; + // get a different bound if using delta encoding + if (is_use_delta) { + page_size = + max(page_size, delta_data_len(physical_type, type_id, page_g.num_leaf_values)); + } + auto const max_data_size = page_size + def_level_size + rep_level_size + rle_pad; // page size must fit in 32-bit signed integer if (max_data_size > std::numeric_limits::max()) { CUDF_UNREACHABLE("page size exceeds maximum for i32"); @@ -525,7 +623,16 @@ __global__ void __launch_bounds__(128) } __syncwarp(); if (t == 0) { - if (not pages.empty()) { pages[ck_g.first_page + num_pages] = page_g; } + if (not pages.empty()) { + if (is_use_delta) { + page_g.kernel_mask = encode_kernel_mask::DELTA_BINARY; + } else if (ck_g.use_dictionary || physical_type == BOOLEAN) { + page_g.kernel_mask = encode_kernel_mask::DICTIONARY; + } else { + page_g.kernel_mask = encode_kernel_mask::PLAIN; + } + pages[ck_g.first_page + num_pages] = page_g; + } if (not page_sizes.empty()) { page_sizes[ck_g.first_page + num_pages] = page_g.max_data_size; } @@ -789,8 +896,12 @@ inline __device__ void PackLiterals( * @param[in] t thread id (0..127) */ static __device__ void RleEncode( - page_enc_state_s* s, uint32_t numvals, uint32_t nbits, uint32_t flush, uint32_t t) + rle_page_enc_state_s* s, uint32_t numvals, uint32_t nbits, uint32_t flush, uint32_t t) { + using cudf::detail::warp_size; + auto const lane_id = t % warp_size; + auto const warp_id = t / warp_size; + uint32_t rle_pos = s->rle_pos; uint32_t rle_run = s->rle_run; @@ -798,20 +909,20 @@ static __device__ void RleEncode( uint32_t pos = rle_pos + t; if (rle_run > 0 && !(rle_run & 1)) { // Currently in a long repeat run - uint32_t mask = ballot(pos < numvals && s->vals[pos & (rle_buffer_size - 1)] == s->run_val); + uint32_t mask = ballot(pos < numvals && s->vals[rolling_idx(pos)] == s->run_val); uint32_t rle_rpt_count, max_rpt_count; - if (!(t & 0x1f)) { s->rpt_map[t >> 5] = mask; } + if (lane_id == 0) { s->rpt_map[warp_id] = mask; } __syncthreads(); - if (t < 32) { + if (t < warp_size) { uint32_t c32 = ballot(t >= 4 || s->rpt_map[t] != 0xffff'ffffu); - if (!t) { + if (t == 0) { uint32_t last_idx = __ffs(c32) - 1; s->rle_rpt_count = - last_idx * 32 + ((last_idx < 4) ? __ffs(~s->rpt_map[last_idx]) - 1 : 0); + last_idx * warp_size + ((last_idx < 4) ? 
__ffs(~s->rpt_map[last_idx]) - 1 : 0); } } __syncthreads(); - max_rpt_count = min(numvals - rle_pos, 128); + max_rpt_count = min(numvals - rle_pos, encode_block_size); rle_rpt_count = s->rle_rpt_count; rle_run += rle_rpt_count << 1; rle_pos += rle_rpt_count; @@ -828,17 +939,17 @@ static __device__ void RleEncode( } } else { // New run or in a literal run - uint32_t v0 = s->vals[pos & (rle_buffer_size - 1)]; - uint32_t v1 = s->vals[(pos + 1) & (rle_buffer_size - 1)]; + uint32_t v0 = s->vals[rolling_idx(pos)]; + uint32_t v1 = s->vals[rolling_idx(pos + 1)]; uint32_t mask = ballot(pos + 1 < numvals && v0 == v1); - uint32_t maxvals = min(numvals - rle_pos, 128); + uint32_t maxvals = min(numvals - rle_pos, encode_block_size); uint32_t rle_lit_count, rle_rpt_count; - if (!(t & 0x1f)) { s->rpt_map[t >> 5] = mask; } + if (lane_id == 0) { s->rpt_map[warp_id] = mask; } __syncthreads(); - if (t < 32) { + if (t < warp_size) { // Repeat run can only start on a multiple of 8 values - uint32_t idx8 = (t * 8) >> 5; - uint32_t pos8 = (t * 8) & 0x1f; + uint32_t idx8 = (t * 8) / warp_size; + uint32_t pos8 = (t * 8) % warp_size; uint32_t m0 = (idx8 < 4) ? s->rpt_map[idx8] : 0; uint32_t m1 = (idx8 < 3) ? s->rpt_map[idx8 + 1] : 0; uint32_t needed_mask = kRleRunMask[nbits - 1]; @@ -847,8 +958,8 @@ static __device__ void RleEncode( uint32_t rle_run_start = (mask != 0) ? min((__ffs(mask) - 1) * 8, maxvals) : maxvals; uint32_t rpt_len = 0; if (rle_run_start < maxvals) { - uint32_t idx_cur = rle_run_start >> 5; - uint32_t idx_ofs = rle_run_start & 0x1f; + uint32_t idx_cur = rle_run_start / warp_size; + uint32_t idx_ofs = rle_run_start % warp_size; while (idx_cur < 4) { m0 = (idx_cur < 4) ? s->rpt_map[idx_cur] : 0; m1 = (idx_cur < 3) ? s->rpt_map[idx_cur + 1] : 0; @@ -857,7 +968,7 @@ static __device__ void RleEncode( rpt_len += __ffs(mask) - 1; break; } - rpt_len += 32; + rpt_len += warp_size; idx_cur++; } } @@ -928,7 +1039,7 @@ static __device__ void RleEncode( * @param[in] flush nonzero if last batch in block * @param[in] t thread id (0..127) */ -static __device__ void PlainBoolEncode(page_enc_state_s* s, +static __device__ void PlainBoolEncode(rle_page_enc_state_s* s, uint32_t numvals, uint32_t flush, uint32_t t) @@ -938,7 +1049,7 @@ static __device__ void PlainBoolEncode(page_enc_state_s* s, while (rle_pos < numvals) { uint32_t pos = rle_pos + t; - uint32_t v = (pos < numvals) ? s->vals[pos & (rle_buffer_size - 1)] : 0; + uint32_t v = (pos < numvals) ? s->vals[rolling_idx(pos)] : 0; uint32_t n = min(numvals - rle_pos, 128); uint32_t nbytes = (n + ((flush) ? 7 : 0)) >> 3; if (!nbytes) { break; } @@ -992,28 +1103,22 @@ __device__ auto julian_days_with_time(int64_t v) return std::make_pair(dur_time_of_day_nanos, julian_days); } +// this has been split out into its own kernel because of the amount of shared memory required +// for the state buffer. encode kernels that don't use the RLE buffer can get started while +// the level data is encoded. 
// blockDim(128, 1, 1) template -__global__ void __launch_bounds__(128, 8) - gpuEncodePages(device_span pages, - device_span> comp_in, - device_span> comp_out, - device_span comp_results, - bool write_v2_headers) +__global__ void __launch_bounds__(block_size, 8) gpuEncodePageLevels(device_span pages, + bool write_v2_headers, + encode_kernel_mask kernel_mask) { - __shared__ __align__(8) page_enc_state_s state_g; - using block_reduce = cub::BlockReduce; - using block_scan = cub::BlockScan; - __shared__ union { - typename block_reduce::TempStorage reduce_storage; - typename block_scan::TempStorage scan_storage; - } temp_storage; + __shared__ __align__(8) rle_page_enc_state_s state_g; - page_enc_state_s* const s = &state_g; - auto const t = threadIdx.x; + auto* const s = &state_g; + uint32_t const t = threadIdx.x; if (t == 0) { - state_g = page_enc_state_s{}; + state_g = rle_page_enc_state_s{}; s->page = pages[blockIdx.x]; s->ck = *s->page.chunk; s->col = *s->ck.col_desc; @@ -1026,6 +1131,8 @@ __global__ void __launch_bounds__(128, 8) } __syncthreads(); + if (BitAnd(s->page.kernel_mask, kernel_mask) == 0) { return; } + auto const is_v2 = s->page.page_type == PageType::DATA_PAGE_V2; // Encode Repetition and Definition levels @@ -1078,23 +1185,24 @@ __global__ void __launch_bounds__(128, 8) } while (is_col_struct); return def; }(); - s->vals[(rle_numvals + t) & (rle_buffer_size - 1)] = def_lvl; + s->vals[rolling_idx(rle_numvals + t)] = def_lvl; __syncthreads(); rle_numvals += nrows; RleEncode(s, rle_numvals, def_lvl_bits, (rle_numvals == s->page.num_rows), t); __syncthreads(); } if (t < 32) { - uint8_t* const cur = s->cur; - uint8_t* const rle_out = s->rle_out; - uint32_t const rle_bytes = static_cast(rle_out - cur) - (is_v2 ? 0 : 4); - if (is_v2 && t == 0) { + uint8_t* const cur = s->cur; + uint8_t* const rle_out = s->rle_out; + // V2 does not write the RLE length field + uint32_t const rle_bytes = + static_cast(rle_out - cur) - (is_v2 ? 0 : RLE_LENGTH_FIELD_LEN); + if (not is_v2 && t < RLE_LENGTH_FIELD_LEN) { cur[t] = rle_bytes >> (t * 8); } + __syncwarp(); + if (t == 0) { + s->cur = rle_out; s->page.def_lvl_bytes = rle_bytes; - } else if (not is_v2 && t < 4) { - cur[t] = rle_bytes >> (t * 8); } - __syncwarp(); - if (t == 0) { s->cur = rle_out; } } } } else if (s->page.page_type != PageType::DICTIONARY_PAGE && @@ -1121,29 +1229,121 @@ __global__ void __launch_bounds__(128, 8) uint32_t idx = page_first_val_idx + rle_numvals + t; uint32_t lvl_val = (rle_numvals + t < s->page.num_values && idx < col_last_val_idx) ? lvl_val_data[idx] : 0; - s->vals[(rle_numvals + t) & (rle_buffer_size - 1)] = lvl_val; + s->vals[rolling_idx(rle_numvals + t)] = lvl_val; __syncthreads(); rle_numvals += nvals; RleEncode(s, rle_numvals, nbits, (rle_numvals == s->page.num_values), t); __syncthreads(); } if (t < 32) { - uint8_t* const cur = s->cur; - uint8_t* const rle_out = s->rle_out; - uint32_t const rle_bytes = static_cast(rle_out - cur) - (is_v2 ? 0 : 4); - if (is_v2 && t == 0) { + uint8_t* const cur = s->cur; + uint8_t* const rle_out = s->rle_out; + // V2 does not write the RLE length field + uint32_t const rle_bytes = + static_cast(rle_out - cur) - (is_v2 ? 
0 : RLE_LENGTH_FIELD_LEN); + if (not is_v2 && t < RLE_LENGTH_FIELD_LEN) { cur[t] = rle_bytes >> (t * 8); } + __syncwarp(); + if (t == 0) { + s->cur = rle_out; lvl_bytes = rle_bytes; - } else if (not is_v2 && t < 4) { - cur[t] = rle_bytes >> (t * 8); } - __syncwarp(); - if (t == 0) { s->cur = rle_out; } } }; encode_levels(s->col.rep_values, s->col.num_rep_level_bits(), s->page.rep_lvl_bytes); __syncthreads(); encode_levels(s->col.def_values, s->col.num_def_level_bits(), s->page.def_lvl_bytes); } + + if (t == 0) { pages[blockIdx.x] = s->page; } +} + +template +__device__ void finish_page_encode(state_buf* s, + uint32_t valid_count, + uint8_t const* end_ptr, + device_span pages, + device_span> comp_in, + device_span> comp_out, + device_span comp_results, + bool write_v2_headers) +{ + auto const t = threadIdx.x; + + // V2 does not compress rep and def level data + size_t const skip_comp_size = + write_v2_headers ? s->page.def_lvl_bytes + s->page.rep_lvl_bytes : 0; + + if (t == 0) { + // only need num_nulls for v2 data page headers + if (write_v2_headers) { s->page.num_nulls = s->page.num_values - valid_count; } + uint8_t const* const base = s->page.page_data + s->page.max_hdr_size; + auto const actual_data_size = static_cast(end_ptr - base); + if (actual_data_size > s->page.max_data_size) { + CUDF_UNREACHABLE("detected possible page data corruption"); + } + s->page.max_data_size = actual_data_size; + if (not comp_in.empty()) { + comp_in[blockIdx.x] = {base + skip_comp_size, actual_data_size - skip_comp_size}; + comp_out[blockIdx.x] = {s->page.compressed_data + s->page.max_hdr_size + skip_comp_size, + 0}; // size is unused + } + pages[blockIdx.x] = s->page; + if (not comp_results.empty()) { + comp_results[blockIdx.x] = {0, compression_status::FAILURE}; + pages[blockIdx.x].comp_res = &comp_results[blockIdx.x]; + } + } + + // copy uncompressed bytes over + if (skip_comp_size != 0 && not comp_in.empty()) { + uint8_t* const src = s->page.page_data + s->page.max_hdr_size; + uint8_t* const dst = s->page.compressed_data + s->page.max_hdr_size; + for (int i = t; i < skip_comp_size; i += block_size) { + dst[i] = src[i]; + } + } +} + +// PLAIN page data encoder +// blockDim(128, 1, 1) +template +__global__ void __launch_bounds__(block_size, 8) + gpuEncodePages(device_span pages, + device_span> comp_in, + device_span> comp_out, + device_span comp_results, + bool write_v2_headers) +{ + __shared__ __align__(8) page_enc_state_s<0> state_g; + using block_reduce = cub::BlockReduce; + using block_scan = cub::BlockScan; + __shared__ union { + typename block_reduce::TempStorage reduce_storage; + typename block_scan::TempStorage scan_storage; + } temp_storage; + + auto* const s = &state_g; + uint32_t t = threadIdx.x; + + if (t == 0) { + state_g = page_enc_state_s<0>{}; + s->page = pages[blockIdx.x]; + s->ck = *s->page.chunk; + s->col = *s->ck.col_desc; + s->rle_len_pos = nullptr; + // get s->cur back to where it was at the end of encoding the rep and def level data + s->cur = + s->page.page_data + s->page.max_hdr_size + s->page.def_lvl_bytes + s->page.rep_lvl_bytes; + // if V1 data page, need space for the RLE length fields + if (s->page.page_type == PageType::DATA_PAGE) { + if (s->col.num_def_level_bits() != 0) { s->cur += RLE_LENGTH_FIELD_LEN; } + if (s->col.num_rep_level_bits() != 0) { s->cur += RLE_LENGTH_FIELD_LEN; } + } + } + __syncthreads(); + + if (BitAnd(s->page.kernel_mask, encode_kernel_mask::PLAIN) == 0) { return; } + // Encode data values __syncthreads(); auto const physical_type = 
s->col.physical_type; @@ -1155,10 +1355,6 @@ __global__ void __launch_bounds__(128, 8) return dtype_len_out; }(); - auto const dict_bits = (physical_type == BOOLEAN) ? 1 - : (s->ck.use_dictionary and s->page.page_type != PageType::DICTIONARY_PAGE) - ? s->ck.dict_rle_bits - : -1; if (t == 0) { uint8_t* dst = s->cur; s->rle_run = 0; @@ -1167,219 +1363,315 @@ __global__ void __launch_bounds__(128, 8) s->rle_out = dst; s->page.encoding = determine_encoding(s->page.page_type, physical_type, s->ck.use_dictionary, write_v2_headers); - if (dict_bits >= 0 && physical_type != BOOLEAN) { - dst[0] = dict_bits; - s->rle_out = dst + 1; - } else if (is_v2 && physical_type == BOOLEAN) { - // save space for RLE length. we don't know the total length yet. - s->rle_out = dst + RLE_LENGTH_FIELD_LEN; - s->rle_len_pos = dst; - } s->page_start_val = row_to_value_idx(s->page.start_row, s->col); s->chunk_start_val = row_to_value_idx(s->ck.start_row, s->col); } __syncthreads(); + uint32_t num_valid = 0; for (uint32_t cur_val_idx = 0; cur_val_idx < s->page.num_leaf_values;) { - uint32_t nvals = min(s->page.num_leaf_values - cur_val_idx, 128); + uint32_t nvals = min(s->page.num_leaf_values - cur_val_idx, block_size); uint32_t len, pos; auto [is_valid, val_idx] = [&]() { uint32_t val_idx; uint32_t is_valid; - size_type val_idx_in_block = cur_val_idx + t; + size_type const val_idx_in_block = cur_val_idx + t; if (s->page.page_type == PageType::DICTIONARY_PAGE) { val_idx = val_idx_in_block; is_valid = (val_idx < s->page.num_leaf_values); if (is_valid) { val_idx = s->ck.dict_data[val_idx]; } } else { - size_type val_idx_in_leaf_col = s->page_start_val + val_idx_in_block; + size_type const val_idx_in_leaf_col = s->page_start_val + val_idx_in_block; is_valid = (val_idx_in_leaf_col < s->col.leaf_column->size() && val_idx_in_block < s->page.num_leaf_values) ? s->col.leaf_column->is_valid(val_idx_in_leaf_col) : 0; - val_idx = - (s->ck.use_dictionary) ? 
val_idx_in_leaf_col - s->chunk_start_val : val_idx_in_leaf_col; + val_idx = val_idx_in_leaf_col; } return std::make_tuple(is_valid, val_idx); }(); - if (is_valid) num_valid++; - + if (is_valid) { num_valid++; } cur_val_idx += nvals; - if (dict_bits >= 0) { - // Dictionary encoding - if (dict_bits > 0) { - uint32_t rle_numvals; - uint32_t rle_numvals_in_block; - block_scan(temp_storage.scan_storage).ExclusiveSum(is_valid, pos, rle_numvals_in_block); - rle_numvals = s->rle_numvals; - if (is_valid) { - uint32_t v; - if (physical_type == BOOLEAN) { - v = s->col.leaf_column->element(val_idx); - } else { - v = s->ck.dict_index[val_idx]; - } - s->vals[(rle_numvals + pos) & (rle_buffer_size - 1)] = v; - } - rle_numvals += rle_numvals_in_block; - __syncthreads(); - if (!is_v2 && physical_type == BOOLEAN) { - PlainBoolEncode(s, rle_numvals, (cur_val_idx == s->page.num_leaf_values), t); - } else { - RleEncode(s, rle_numvals, dict_bits, (cur_val_idx == s->page.num_leaf_values), t); + + // Non-dictionary encoding + uint8_t* dst = s->cur; + + if (is_valid) { + len = dtype_len_out; + if (physical_type == BYTE_ARRAY) { + if (type_id == type_id::STRING) { + len += s->col.leaf_column->element(val_idx).size_bytes(); + } else if (s->col.output_as_byte_array && type_id == type_id::LIST) { + len += + get_element(*s->col.leaf_column, val_idx).size_bytes(); } - __syncthreads(); } - if (t == 0) { s->cur = s->rle_out; } - __syncthreads(); } else { - // Non-dictionary encoding - uint8_t* dst = s->cur; - - if (is_valid) { - len = dtype_len_out; - if (physical_type == BYTE_ARRAY) { - if (type_id == type_id::STRING) { - len += s->col.leaf_column->element(val_idx).size_bytes(); - } else if (s->col.output_as_byte_array && type_id == type_id::LIST) { - len += - get_element(*s->col.leaf_column, val_idx).size_bytes(); + len = 0; + } + uint32_t total_len = 0; + block_scan(temp_storage.scan_storage).ExclusiveSum(len, pos, total_len); + __syncthreads(); + if (t == 0) { s->cur = dst + total_len; } + if (is_valid) { + switch (physical_type) { + case INT32: [[fallthrough]]; + case FLOAT: { + auto const v = [dtype_len = dtype_len_in, + idx = val_idx, + col = s->col.leaf_column, + scale = s->col.ts_scale == 0 ? 
1 : s->col.ts_scale]() -> int32_t { + switch (dtype_len) { + case 8: return col->element(idx) * scale; + case 4: return col->element(idx) * scale; + case 2: return col->element(idx) * scale; + default: return col->element(idx) * scale; + } + }(); + + dst[pos + 0] = v; + dst[pos + 1] = v >> 8; + dst[pos + 2] = v >> 16; + dst[pos + 3] = v >> 24; + } break; + case INT64: { + int64_t v = s->col.leaf_column->element(val_idx); + int32_t ts_scale = s->col.ts_scale; + if (ts_scale != 0) { + if (ts_scale < 0) { + v /= -ts_scale; + } else { + v *= ts_scale; + } + } + dst[pos + 0] = v; + dst[pos + 1] = v >> 8; + dst[pos + 2] = v >> 16; + dst[pos + 3] = v >> 24; + dst[pos + 4] = v >> 32; + dst[pos + 5] = v >> 40; + dst[pos + 6] = v >> 48; + dst[pos + 7] = v >> 56; + } break; + case INT96: { + int64_t v = s->col.leaf_column->element(val_idx); + int32_t ts_scale = s->col.ts_scale; + if (ts_scale != 0) { + if (ts_scale < 0) { + v /= -ts_scale; + } else { + v *= ts_scale; + } } - } - } else { - len = 0; - } - uint32_t total_len = 0; - block_scan(temp_storage.scan_storage).ExclusiveSum(len, pos, total_len); - __syncthreads(); - if (t == 0) { s->cur = dst + total_len; } - if (is_valid) { - switch (physical_type) { - case INT32: [[fallthrough]]; - case FLOAT: { - auto const v = [dtype_len = dtype_len_in, - idx = val_idx, - col = s->col.leaf_column, - scale = s->col.ts_scale == 0 ? 1 : s->col.ts_scale]() -> int32_t { - switch (dtype_len) { - case 8: return col->element(idx) * scale; - case 4: return col->element(idx) * scale; - case 2: return col->element(idx) * scale; - default: return col->element(idx) * scale; - } - }(); - dst[pos + 0] = v; - dst[pos + 1] = v >> 8; - dst[pos + 2] = v >> 16; - dst[pos + 3] = v >> 24; - } break; - case INT64: { - int64_t v = s->col.leaf_column->element(val_idx); - int32_t ts_scale = s->col.ts_scale; - if (ts_scale != 0) { - if (ts_scale < 0) { - v /= -ts_scale; - } else { - v *= ts_scale; - } + auto const [last_day_nanos, julian_days] = [&] { + using namespace cuda::std::chrono; + switch (s->col.leaf_column->type().id()) { + case type_id::TIMESTAMP_SECONDS: + case type_id::TIMESTAMP_MILLISECONDS: { + return julian_days_with_time(v); + } break; + case type_id::TIMESTAMP_MICROSECONDS: + case type_id::TIMESTAMP_NANOSECONDS: { + return julian_days_with_time(v); + } break; } - dst[pos + 0] = v; - dst[pos + 1] = v >> 8; - dst[pos + 2] = v >> 16; - dst[pos + 3] = v >> 24; - dst[pos + 4] = v >> 32; - dst[pos + 5] = v >> 40; - dst[pos + 6] = v >> 48; - dst[pos + 7] = v >> 56; - } break; - case INT96: { - int64_t v = s->col.leaf_column->element(val_idx); - int32_t ts_scale = s->col.ts_scale; - if (ts_scale != 0) { - if (ts_scale < 0) { - v /= -ts_scale; - } else { - v *= ts_scale; - } + return julian_days_with_time(0); + }(); + + // the 12 bytes of fixed length data. 
+ v = last_day_nanos.count(); + dst[pos + 0] = v; + dst[pos + 1] = v >> 8; + dst[pos + 2] = v >> 16; + dst[pos + 3] = v >> 24; + dst[pos + 4] = v >> 32; + dst[pos + 5] = v >> 40; + dst[pos + 6] = v >> 48; + dst[pos + 7] = v >> 56; + uint32_t w = julian_days.count(); + dst[pos + 8] = w; + dst[pos + 9] = w >> 8; + dst[pos + 10] = w >> 16; + dst[pos + 11] = w >> 24; + } break; + + case DOUBLE: { + auto v = s->col.leaf_column->element(val_idx); + memcpy(dst + pos, &v, 8); + } break; + case BYTE_ARRAY: { + auto const bytes = [](cudf::type_id const type_id, + column_device_view const* leaf_column, + uint32_t const val_idx) -> void const* { + switch (type_id) { + case type_id::STRING: + return reinterpret_cast( + leaf_column->element(val_idx).data()); + case type_id::LIST: + return reinterpret_cast( + get_element(*(leaf_column), val_idx).data()); + default: CUDF_UNREACHABLE("invalid type id for byte array writing!"); } + }(type_id, s->col.leaf_column, val_idx); + uint32_t v = len - 4; // string length + dst[pos + 0] = v; + dst[pos + 1] = v >> 8; + dst[pos + 2] = v >> 16; + dst[pos + 3] = v >> 24; + if (v != 0) memcpy(dst + pos + 4, bytes, v); + } break; + case FIXED_LEN_BYTE_ARRAY: { + if (type_id == type_id::DECIMAL128) { + // When using FIXED_LEN_BYTE_ARRAY for decimals, the rep is encoded in big-endian + auto const v = s->col.leaf_column->element(val_idx).value(); + auto const v_char_ptr = reinterpret_cast(&v); + thrust::copy(thrust::seq, + thrust::make_reverse_iterator(v_char_ptr + sizeof(v)), + thrust::make_reverse_iterator(v_char_ptr), + dst + pos); + } + } break; + } + } + __syncthreads(); + } - auto const [last_day_nanos, julian_days] = [&] { - using namespace cuda::std::chrono; - switch (s->col.leaf_column->type().id()) { - case type_id::TIMESTAMP_SECONDS: - case type_id::TIMESTAMP_MILLISECONDS: { - return julian_days_with_time(v); - } break; - case type_id::TIMESTAMP_MICROSECONDS: - case type_id::TIMESTAMP_NANOSECONDS: { - return julian_days_with_time(v); - } break; - } - return julian_days_with_time(0); - }(); - - // the 12 bytes of fixed length data. 
- v = last_day_nanos.count(); - dst[pos + 0] = v; - dst[pos + 1] = v >> 8; - dst[pos + 2] = v >> 16; - dst[pos + 3] = v >> 24; - dst[pos + 4] = v >> 32; - dst[pos + 5] = v >> 40; - dst[pos + 6] = v >> 48; - dst[pos + 7] = v >> 56; - uint32_t w = julian_days.count(); - dst[pos + 8] = w; - dst[pos + 9] = w >> 8; - dst[pos + 10] = w >> 16; - dst[pos + 11] = w >> 24; - } break; + uint32_t const valid_count = block_reduce(temp_storage.reduce_storage).Sum(num_valid); - case DOUBLE: { - auto v = s->col.leaf_column->element(val_idx); - memcpy(dst + pos, &v, 8); - } break; - case BYTE_ARRAY: { - auto const bytes = [](cudf::type_id const type_id, - column_device_view const* leaf_column, - uint32_t const val_idx) -> void const* { - switch (type_id) { - case type_id::STRING: - return reinterpret_cast( - leaf_column->element(val_idx).data()); - case type_id::LIST: - return reinterpret_cast( - get_element(*(leaf_column), val_idx).data()); - default: CUDF_UNREACHABLE("invalid type id for byte array writing!"); - } - }(type_id, s->col.leaf_column, val_idx); - uint32_t v = len - 4; // string length - dst[pos + 0] = v; - dst[pos + 1] = v >> 8; - dst[pos + 2] = v >> 16; - dst[pos + 3] = v >> 24; - if (v != 0) memcpy(dst + pos + 4, bytes, v); - } break; - case FIXED_LEN_BYTE_ARRAY: { - if (type_id == type_id::DECIMAL128) { - // When using FIXED_LEN_BYTE_ARRAY for decimals, the rep is encoded in big-endian - auto const v = s->col.leaf_column->element(val_idx).value(); - auto const v_char_ptr = reinterpret_cast(&v); - thrust::copy(thrust::seq, - thrust::make_reverse_iterator(v_char_ptr + sizeof(v)), - thrust::make_reverse_iterator(v_char_ptr), - dst + pos); - } - } break; + finish_page_encode( + s, valid_count, s->cur, pages, comp_in, comp_out, comp_results, write_v2_headers); +} + +// DICTIONARY page data encoder +// blockDim(128, 1, 1) +template +__global__ void __launch_bounds__(block_size, 8) + gpuEncodeDictPages(device_span pages, + device_span> comp_in, + device_span> comp_out, + device_span comp_results, + bool write_v2_headers) +{ + __shared__ __align__(8) rle_page_enc_state_s state_g; + using block_reduce = cub::BlockReduce; + using block_scan = cub::BlockScan; + __shared__ union { + typename block_reduce::TempStorage reduce_storage; + typename block_scan::TempStorage scan_storage; + } temp_storage; + + auto* const s = &state_g; + uint32_t t = threadIdx.x; + + if (t == 0) { + state_g = rle_page_enc_state_s{}; + s->page = pages[blockIdx.x]; + s->ck = *s->page.chunk; + s->col = *s->ck.col_desc; + s->rle_len_pos = nullptr; + // get s->cur back to where it was at the end of encoding the rep and def level data + s->cur = + s->page.page_data + s->page.max_hdr_size + s->page.def_lvl_bytes + s->page.rep_lvl_bytes; + // if V1 data page, need space for the RLE length fields + if (s->page.page_type == PageType::DATA_PAGE) { + if (s->col.num_def_level_bits() != 0) { s->cur += RLE_LENGTH_FIELD_LEN; } + if (s->col.num_rep_level_bits() != 0) { s->cur += RLE_LENGTH_FIELD_LEN; } + } + } + __syncthreads(); + + if (BitAnd(s->page.kernel_mask, encode_kernel_mask::DICTIONARY) == 0) { return; } + + // Encode data values + __syncthreads(); + auto const physical_type = s->col.physical_type; + auto const type_id = s->col.leaf_column->type().id(); + auto const dtype_len_out = physical_type_len(physical_type, type_id); + auto const dtype_len_in = [&]() -> uint32_t { + if (physical_type == INT32) { return int32_logical_len(type_id); } + if (physical_type == INT96) { return sizeof(int64_t); } + return dtype_len_out; + }(); + + // 
TODO assert dict_bits >= 0 + auto const dict_bits = (physical_type == BOOLEAN) ? 1 + : (s->ck.use_dictionary and s->page.page_type != PageType::DICTIONARY_PAGE) + ? s->ck.dict_rle_bits + : -1; + if (t == 0) { + uint8_t* dst = s->cur; + s->rle_run = 0; + s->rle_pos = 0; + s->rle_numvals = 0; + s->rle_out = dst; + s->page.encoding = + determine_encoding(s->page.page_type, physical_type, s->ck.use_dictionary, write_v2_headers); + if (dict_bits >= 0 && physical_type != BOOLEAN) { + dst[0] = dict_bits; + s->rle_out = dst + 1; + } else if (write_v2_headers && physical_type == BOOLEAN) { + // save space for RLE length. we don't know the total length yet. + s->rle_out = dst + RLE_LENGTH_FIELD_LEN; + s->rle_len_pos = dst; + } + s->page_start_val = row_to_value_idx(s->page.start_row, s->col); + s->chunk_start_val = row_to_value_idx(s->ck.start_row, s->col); + } + __syncthreads(); + + uint32_t num_valid = 0; + for (uint32_t cur_val_idx = 0; cur_val_idx < s->page.num_leaf_values;) { + uint32_t nvals = min(s->page.num_leaf_values - cur_val_idx, block_size); + + auto [is_valid, val_idx] = [&]() { + size_type const val_idx_in_block = cur_val_idx + t; + size_type const val_idx_in_leaf_col = s->page_start_val + val_idx_in_block; + + uint32_t const is_valid = (val_idx_in_leaf_col < s->col.leaf_column->size() && + val_idx_in_block < s->page.num_leaf_values) + ? s->col.leaf_column->is_valid(val_idx_in_leaf_col) + : 0; + // need to test for use_dictionary because it might be boolean + uint32_t const val_idx = + (s->ck.use_dictionary) ? val_idx_in_leaf_col - s->chunk_start_val : val_idx_in_leaf_col; + return std::make_tuple(is_valid, val_idx); + }(); + + if (is_valid) { num_valid++; } + cur_val_idx += nvals; + + // Dictionary encoding + if (dict_bits > 0) { + uint32_t rle_numvals; + uint32_t rle_numvals_in_block; + uint32_t pos; + block_scan(temp_storage.scan_storage).ExclusiveSum(is_valid, pos, rle_numvals_in_block); + rle_numvals = s->rle_numvals; + if (is_valid) { + uint32_t v; + if (physical_type == BOOLEAN) { + v = s->col.leaf_column->element(val_idx); + } else { + v = s->ck.dict_index[val_idx]; } + s->vals[rolling_idx(rle_numvals + pos)] = v; + } + rle_numvals += rle_numvals_in_block; + __syncthreads(); + if ((!write_v2_headers) && (physical_type == BOOLEAN)) { + PlainBoolEncode(s, rle_numvals, (cur_val_idx == s->page.num_leaf_values), t); + } else { + RleEncode(s, rle_numvals, dict_bits, (cur_val_idx == s->page.num_leaf_values), t); } __syncthreads(); } + if (t == 0) { s->cur = s->rle_out; } + __syncthreads(); } uint32_t const valid_count = block_reduce(temp_storage.reduce_storage).Sum(num_valid); @@ -1392,37 +1684,137 @@ __global__ void __launch_bounds__(128, 8) __syncwarp(); } - // V2 does not compress rep and def level data - size_t const skip_comp_size = s->page.def_lvl_bytes + s->page.rep_lvl_bytes; + finish_page_encode( + s, valid_count, s->cur, pages, comp_in, comp_out, comp_results, write_v2_headers); +} + +// DELTA_BINARY_PACKED page data encoder +// blockDim(128, 1, 1) +template +__global__ void __launch_bounds__(block_size, 8) + gpuEncodeDeltaBinaryPages(device_span pages, + device_span> comp_in, + device_span> comp_out, + device_span comp_results) +{ + // block of shared memory for value storage and bit packing + __shared__ uleb128_t delta_shared[delta::buffer_size + delta::block_size]; + __shared__ __align__(8) page_enc_state_s<0> state_g; + using block_reduce = cub::BlockReduce; + __shared__ union { + typename block_reduce::TempStorage reduce_storage; + typename 
delta::index_scan::TempStorage delta_index_tmp; + typename delta::block_reduce::TempStorage delta_reduce_tmp; + typename delta::warp_reduce::TempStorage delta_warp_red_tmp[delta::num_mini_blocks]; + } temp_storage; + + auto* const s = &state_g; + uint32_t t = threadIdx.x; if (t == 0) { - s->page.num_nulls = s->page.num_values - valid_count; - uint8_t* const base = s->page.page_data + s->page.max_hdr_size; - auto const actual_data_size = static_cast(s->cur - base); - if (actual_data_size > s->page.max_data_size) { - CUDF_UNREACHABLE("detected possible page data corruption"); - } - s->page.max_data_size = actual_data_size; - if (not comp_in.empty()) { - comp_in[blockIdx.x] = {base + skip_comp_size, actual_data_size - skip_comp_size}; - comp_out[blockIdx.x] = {s->page.compressed_data + s->page.max_hdr_size + skip_comp_size, - 0}; // size is unused - } - pages[blockIdx.x] = s->page; - if (not comp_results.empty()) { - comp_results[blockIdx.x] = {0, compression_status::FAILURE}; - pages[blockIdx.x].comp_res = &comp_results[blockIdx.x]; + state_g = page_enc_state_s<0>{}; + s->page = pages[blockIdx.x]; + s->ck = *s->page.chunk; + s->col = *s->ck.col_desc; + s->rle_len_pos = nullptr; + // get s->cur back to where it was at the end of encoding the rep and def level data + s->cur = + s->page.page_data + s->page.max_hdr_size + s->page.def_lvl_bytes + s->page.rep_lvl_bytes; + } + __syncthreads(); + + if (BitAnd(s->page.kernel_mask, encode_kernel_mask::DELTA_BINARY) == 0) { return; } + + // Encode data values + __syncthreads(); + auto const physical_type = s->col.physical_type; + auto const type_id = s->col.leaf_column->type().id(); + auto const dtype_len_out = physical_type_len(physical_type, type_id); + auto const dtype_len_in = [&]() -> uint32_t { + if (physical_type == INT32) { return int32_logical_len(type_id); } + if (physical_type == INT96) { return sizeof(int64_t); } + return dtype_len_out; + }(); + + if (t == 0) { + uint8_t* dst = s->cur; + s->rle_run = 0; + s->rle_pos = 0; + s->rle_numvals = 0; + s->rle_out = dst; + s->page.encoding = Encoding::DELTA_BINARY_PACKED; + s->page_start_val = row_to_value_idx(s->page.start_row, s->col); + s->chunk_start_val = row_to_value_idx(s->ck.start_row, s->col); + } + __syncthreads(); + + // need to know the number of valid values for the null values calculation and to size + // the delta binary encoder. 
+ uint32_t valid_count = 0; + if (not s->col.leaf_column->nullable()) { + valid_count = s->page.num_leaf_values; + } else { + uint32_t num_valid = 0; + for (uint32_t cur_val_idx = 0; cur_val_idx < s->page.num_leaf_values;) { + uint32_t const nvals = min(s->page.num_leaf_values - cur_val_idx, block_size); + size_type const val_idx_in_block = cur_val_idx + t; + size_type const val_idx_in_leaf_col = s->page_start_val + val_idx_in_block; + + if (val_idx_in_leaf_col < s->col.leaf_column->size() && + val_idx_in_block < s->page.num_leaf_values && + s->col.leaf_column->is_valid(val_idx_in_leaf_col)) { + num_valid++; + } + cur_val_idx += nvals; } + valid_count = block_reduce(temp_storage.reduce_storage).Sum(num_valid); } - // copy over uncompressed data - if (skip_comp_size != 0 && not comp_in.empty()) { - uint8_t const* const src = s->page.page_data + s->page.max_hdr_size; - uint8_t* const dst = s->page.compressed_data + s->page.max_hdr_size; - for (int i = t; i < skip_comp_size; i += block_size) { - dst[i] = src[i]; + uint8_t const* delta_ptr = nullptr; // this will be the end of delta block pointer + + if (physical_type == INT32) { + switch (dtype_len_in) { + case 8: { + // only DURATIONS map to 8 bytes, so safe to just use signed here? + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); + break; + } + case 4: { + if (type_id == type_id::UINT32) { + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); + } else { + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); + } + break; + } + case 2: { + if (type_id == type_id::UINT16) { + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); + } else { + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); + } + break; + } + case 1: { + if (type_id == type_id::UINT8) { + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); + } else { + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); + } + break; + } + default: CUDF_UNREACHABLE("invalid dtype_len_in when encoding DELTA_BINARY_PACKED"); + } + } else { + if (type_id == type_id::UINT64) { + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); + } else { + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); } } + + finish_page_encode( + s, valid_count, delta_ptr, pages, comp_in, comp_out, comp_results, true); } constexpr int decide_compression_warps_in_block = 4; @@ -1457,7 +1849,8 @@ __global__ void __launch_bounds__(decide_compression_block_size) for (auto page_id = lane_id; page_id < num_pages; page_id += cudf::detail::warp_size) { auto const& curr_page = ck_g[warp_id].pages[page_id]; auto const page_data_size = curr_page.max_data_size; - auto const lvl_bytes = curr_page.def_lvl_bytes + curr_page.rep_lvl_bytes; + auto const is_v2 = curr_page.page_type == PageType::DATA_PAGE_V2; + auto const lvl_bytes = is_v2 ? curr_page.def_lvl_bytes + curr_page.rep_lvl_bytes : 0; uncompressed_data_size += page_data_size; if (auto comp_res = curr_page.comp_res; comp_res != nullptr) { compressed_data_size += comp_res->bytes_written + lvl_bytes; @@ -1920,7 +2313,8 @@ __global__ void __launch_bounds__(128) } uncompressed_page_size = page_g.max_data_size; if (ck_g.is_compressed) { - auto const lvl_bytes = page_g.def_lvl_bytes + page_g.rep_lvl_bytes; + auto const is_v2 = page_g.page_type == PageType::DATA_PAGE_V2; + auto const lvl_bytes = is_v2 ? 
page_g.def_lvl_bytes + page_g.rep_lvl_bytes : 0; hdr_start = page_g.compressed_data; compressed_page_size = static_cast(comp_results[blockIdx.x].bytes_written) + lvl_bytes; @@ -2155,6 +2549,10 @@ constexpr __device__ void* align8(void* ptr) return static_cast(ptr) - algn; } +struct mask_tform { + __device__ uint32_t operator()(EncPage const& p) { return static_cast(p.kernel_mask); } +}; + } // namespace // blockDim(1, 1, 1) @@ -2257,8 +2655,9 @@ void InitFragmentStatistics(device_span groups, rmm::cuda_stream_view stream) { int const num_fragments = fragments.size(); - int const dim = util::div_rounding_up_safe(num_fragments, 128 / cudf::detail::warp_size); - gpuInitFragmentStats<<>>(groups, fragments); + int const dim = + util::div_rounding_up_safe(num_fragments, encode_block_size / cudf::detail::warp_size); + gpuInitFragmentStats<<>>(groups, fragments); } void InitEncoderPages(device_2dspan chunks, @@ -2277,18 +2676,18 @@ void InitEncoderPages(device_2dspan chunks, { auto num_rowgroups = chunks.size().first; dim3 dim_grid(num_columns, num_rowgroups); // 1 threadblock per rowgroup - gpuInitPages<<>>(chunks, - pages, - page_sizes, - comp_page_sizes, - col_desc, - page_grstats, - chunk_grstats, - num_columns, - max_page_size_bytes, - max_page_size_rows, - page_align, - write_v2_headers); + gpuInitPages<<>>(chunks, + pages, + page_sizes, + comp_page_sizes, + col_desc, + page_grstats, + chunk_grstats, + num_columns, + max_page_size_bytes, + max_page_size_rows, + page_align, + write_v2_headers); } void EncodePages(device_span pages, @@ -2299,10 +2698,43 @@ void EncodePages(device_span pages, rmm::cuda_stream_view stream) { auto num_pages = pages.size(); + + // determine which kernels to invoke + auto mask_iter = thrust::make_transform_iterator(pages.begin(), mask_tform{}); + uint32_t kernel_mask = thrust::reduce( + rmm::exec_policy(stream), mask_iter, mask_iter + pages.size(), 0U, thrust::bit_or{}); + + // get the number of streams we need from the pool + int nkernels = std::bitset<32>(kernel_mask).count(); + auto streams = cudf::detail::fork_streams(stream, nkernels); + // A page is part of one column. This is launching 1 block per page. 1 block will exclusively // deal with one datatype. - gpuEncodePages<128><<>>( - pages, comp_in, comp_out, comp_results, write_v2_headers); + + int s_idx = 0; + if (BitAnd(kernel_mask, encode_kernel_mask::PLAIN) != 0) { + auto const strm = streams[s_idx++]; + gpuEncodePageLevels<<>>( + pages, write_v2_headers, encode_kernel_mask::PLAIN); + gpuEncodePages<<>>( + pages, comp_in, comp_out, comp_results, write_v2_headers); + } + if (BitAnd(kernel_mask, encode_kernel_mask::DELTA_BINARY) != 0) { + auto const strm = streams[s_idx++]; + gpuEncodePageLevels<<>>( + pages, write_v2_headers, encode_kernel_mask::DELTA_BINARY); + gpuEncodeDeltaBinaryPages + <<>>(pages, comp_in, comp_out, comp_results); + } + if (BitAnd(kernel_mask, encode_kernel_mask::DICTIONARY) != 0) { + auto const strm = streams[s_idx++]; + gpuEncodePageLevels<<>>( + pages, write_v2_headers, encode_kernel_mask::DICTIONARY); + gpuEncodeDictPages<<>>( + pages, comp_in, comp_out, comp_results, write_v2_headers); + } + + cudf::detail::join_streams(streams, stream); } void DecideCompression(device_span chunks, rmm::cuda_stream_view stream) @@ -2320,7 +2752,7 @@ void EncodePageHeaders(device_span pages, { // TODO: single thread task. No need for 128 threads/block. 
Earlier it used to employ rest of the // threads to coop load structs - gpuEncodePageHeaders<<>>( + gpuEncodePageHeaders<<>>( pages, comp_results, page_stats, chunk_stats); } diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 6a93fec0c46..048f1a73a9c 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -88,6 +88,37 @@ struct input_column_info { auto nesting_depth() const { return nesting.size(); } }; +// The delta encodings use ULEB128 integers, but parquet only uses max 64 bits. +using uleb128_t = uint64_t; +using zigzag128_t = int64_t; + +// this is in C++23 +#if !defined(__cpp_lib_is_scoped_enum) +template > +struct is_scoped_enum { + static const bool value = not std::is_convertible_v>; +}; + +template +struct is_scoped_enum { + static const bool value = false; +}; +#else +using std::is_scoped_enum; +#endif + +// helpers to do bit operations on scoped enums +template ::value and std::is_same_v) or + (is_scoped_enum::value and std::is_same_v) or + (is_scoped_enum::value and std::is_same_v)>* = + nullptr> +constexpr uint32_t BitAnd(T1 a, T2 b) +{ + return static_cast(a) & static_cast(b); +} + /** * @brief Enums for the flags in the page header */ @@ -371,6 +402,17 @@ constexpr uint32_t encoding_to_mask(Encoding encoding) return 1 << static_cast(encoding); } +/** + * @brief Enum of mask bits for the EncPage kernel_mask + * + * Used to control which encode kernels to run. + */ +enum class encode_kernel_mask { + PLAIN = (1 << 0), // Run plain encoding kernel + DICTIONARY = (1 << 1), // Run dictionary encoding kernel + DELTA_BINARY = (1 << 2) // Run DELTA_BINARY_PACKED encoding kernel +}; + /** * @brief Struct describing an encoder column chunk */ @@ -429,10 +471,11 @@ struct EncPage { uint32_t num_leaf_values; //!< Values in page. Different from num_rows in case of nested types uint32_t num_values; //!< Number of def/rep level values in page. 
Includes null/empty elements in //!< non-leaf levels - uint32_t def_lvl_bytes; //!< Number of bytes of encoded definition level data (V2 only) - uint32_t rep_lvl_bytes; //!< Number of bytes of encoded repetition level data (V2 only) - compression_result* comp_res; //!< Ptr to compression result - uint32_t num_nulls; //!< Number of null values (V2 only) (down here for alignment) + uint32_t def_lvl_bytes; //!< Number of bytes of encoded definition level data (V2 only) + uint32_t rep_lvl_bytes; //!< Number of bytes of encoded repetition level data (V2 only) + compression_result* comp_res; //!< Ptr to compression result + uint32_t num_nulls; //!< Number of null values (V2 only) (down here for alignment) + encode_kernel_mask kernel_mask; //!< Mask used to control which encoding kernels to run }; /** diff --git a/cpp/src/lists/contains.cu b/cpp/src/lists/contains.cu index 4733a5d63a8..cd2bc493bc7 100644 --- a/cpp/src/lists/contains.cu +++ b/cpp/src/lists/contains.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -274,12 +275,13 @@ std::unique_ptr index_of(lists_column_view const& lists, rmm::mr::device_memory_resource* mr) { if (!search_key.is_valid(stream)) { - return make_numeric_column(data_type{cudf::type_to_id()}, - lists.size(), - cudf::create_null_mask(lists.size(), mask_state::ALL_NULL, mr), - lists.size(), - stream, - mr); + return make_numeric_column( + data_type{cudf::type_to_id()}, + lists.size(), + cudf::detail::create_null_mask(lists.size(), mask_state::ALL_NULL, stream, mr), + lists.size(), + stream, + mr); } if (lists.size() == 0) { return make_numeric_column( @@ -337,7 +339,7 @@ std::unique_ptr contains_nulls(lists_column_view const& lists, auto const lists_cv = lists.parent(); auto output = make_numeric_column(data_type{type_to_id()}, lists.size(), - copy_bitmask(lists_cv, stream, mr), + cudf::detail::copy_bitmask(lists_cv, stream, mr), lists_cv.null_count(), stream, mr); diff --git a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu index c0765b48205..00a2f0bee8f 100644 --- a/cpp/src/merge/merge.cu +++ b/cpp/src/merge/merge.cu @@ -381,7 +381,7 @@ std::unique_ptr column_merger::operator()( // materialize the output buffer rmm::device_buffer validity = lcol.has_nulls() || rcol.has_nulls() - ? create_null_mask(merged_size, mask_state::UNINITIALIZED, stream, mr) + ? 
detail::create_null_mask(merged_size, mask_state::UNINITIALIZED, stream, mr) : rmm::device_buffer{}; if (lcol.has_nulls() || rcol.has_nulls()) { materialize_bitmask(lcol, diff --git a/cpp/src/round/round.cu b/cpp/src/round/round.cu index 41cce57d55b..8a6367a1f87 100644 --- a/cpp/src/round/round.cu +++ b/cpp/src/round/round.cu @@ -219,8 +219,12 @@ std::unique_ptr round_with(column_view const& input, if (decimal_places >= 0 && std::is_integral_v) return std::make_unique(input, stream, mr); - auto result = cudf::make_fixed_width_column( - input.type(), input.size(), copy_bitmask(input, stream, mr), input.null_count(), stream, mr); + auto result = cudf::make_fixed_width_column(input.type(), + input.size(), + detail::copy_bitmask(input, stream, mr), + input.null_count(), + stream, + mr); auto out_view = result->mutable_view(); T const n = std::pow(10, std::abs(decimal_places)); @@ -256,8 +260,12 @@ std::unique_ptr round_with(column_view const& input, if (input.type().scale() > -decimal_places) return cudf::detail::cast(input, result_type, stream, mr); - auto result = cudf::make_fixed_width_column( - result_type, input.size(), copy_bitmask(input, stream, mr), input.null_count(), stream, mr); + auto result = cudf::make_fixed_width_column(result_type, + input.size(), + detail::copy_bitmask(input, stream, mr), + input.null_count(), + stream, + mr); auto out_view = result->mutable_view(); diff --git a/cpp/src/search/contains_column.cu b/cpp/src/search/contains_column.cu index 85971647434..b8c7d058535 100644 --- a/cpp/src/search/contains_column.cu +++ b/cpp/src/search/contains_column.cu @@ -42,7 +42,7 @@ struct contains_column_dispatch { stream, mr); return std::make_unique( - std::move(result_v), copy_bitmask(needles, stream, mr), needles.null_count()); + std::move(result_v), detail::copy_bitmask(needles, stream, mr), needles.null_count()); } }; diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu index ee47932100a..f80ace57c69 100644 --- a/cpp/src/strings/replace/multi.cu +++ b/cpp/src/strings/replace/multi.cu @@ -383,7 +383,7 @@ std::unique_ptr replace_character_parallel(strings_column_view const& in std::move(offsets), std::move(chars->release().children.back()), input.null_count(), - copy_bitmask(input.parent(), stream, mr)); + cudf::detail::copy_bitmask(input.parent(), stream, mr)); } /** diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu index 913aec79758..045aac279e6 100644 --- a/cpp/src/strings/split/split_re.cu +++ b/cpp/src/strings/split/split_re.cu @@ -290,7 +290,7 @@ std::unique_ptr split_record_re(strings_column_view const& input, std::move(offsets), std::move(strings_output), input.null_count(), - copy_bitmask(input.parent(), stream, mr), + cudf::detail::copy_bitmask(input.parent(), stream, mr), stream, mr); } diff --git a/cpp/src/strings/split/split_record.cu b/cpp/src/strings/split/split_record.cu index 52f27c68111..7a0cfb9ef41 100644 --- a/cpp/src/strings/split/split_record.cu +++ b/cpp/src/strings/split/split_record.cu @@ -57,7 +57,7 @@ std::unique_ptr split_record_fn(strings_column_view const& input, std::move(offsets), std::move(results), input.null_count(), - copy_bitmask(input.parent(), stream, mr), + cudf::detail::copy_bitmask(input.parent(), stream, mr), stream, mr); } @@ -72,7 +72,7 @@ std::unique_ptr split_record_fn(strings_column_view const& input, std::move(offsets), std::move(strings_child), input.null_count(), - copy_bitmask(input.parent(), stream, mr), + cudf::detail::copy_bitmask(input.parent(), stream, mr), 
stream, mr); } @@ -160,7 +160,7 @@ std::unique_ptr whitespace_split_record_fn(strings_column_view const& in std::move(offsets), std::move(strings_output), input.null_count(), - copy_bitmask(input.parent(), stream, mr), + cudf::detail::copy_bitmask(input.parent(), stream, mr), stream, mr); } diff --git a/cpp/src/unary/cast_ops.cu b/cpp/src/unary/cast_ops.cu index 1c81f266200..6fa87b1f709 100644 --- a/cpp/src/unary/cast_ops.cu +++ b/cpp/src/unary/cast_ops.cu @@ -194,7 +194,7 @@ std::unique_ptr rescale(column_view input, auto const scalar = make_fixed_point_scalar(0, scale_type{scale}, stream); auto output_column = make_column_from_scalar(*scalar, input.size(), stream, mr); if (input.nullable()) { - auto const null_mask = copy_bitmask(input, stream, mr); + auto const null_mask = detail::copy_bitmask(input, stream, mr); output_column->set_null_mask(std::move(null_mask), input.null_count()); } return output_column; @@ -255,7 +255,7 @@ struct dispatch_unary_cast_to { std::make_unique(type, size, rmm::device_buffer{size * cudf::size_of(type), stream, mr}, - copy_bitmask(input, stream, mr), + detail::copy_bitmask(input, stream, mr), input.null_count()); mutable_column_view output_mutable = *output; @@ -285,7 +285,7 @@ struct dispatch_unary_cast_to { std::make_unique(type, size, rmm::device_buffer{size * cudf::size_of(type), stream, mr}, - copy_bitmask(input, stream, mr), + detail::copy_bitmask(input, stream, mr), input.null_count()); mutable_column_view output_mutable = *output; @@ -334,7 +334,7 @@ struct dispatch_unary_cast_to { auto output = std::make_unique(cudf::data_type{type.id(), input.type().scale()}, size, rmm::device_buffer{size * cudf::size_of(type), stream}, - copy_bitmask(input, stream, mr), + detail::copy_bitmask(input, stream, mr), input.null_count()); mutable_column_view output_mutable = *output; diff --git a/cpp/src/unary/math_ops.cu b/cpp/src/unary/math_ops.cu index d0cae81a9c8..d84e0171b49 100644 --- a/cpp/src/unary/math_ops.cu +++ b/cpp/src/unary/math_ops.cu @@ -291,8 +291,12 @@ std::unique_ptr unary_op_with(column_view const& input, std::is_same_v>)) return std::make_unique(input, stream, mr); - auto result = cudf::make_fixed_width_column( - input.type(), input.size(), copy_bitmask(input, stream, mr), input.null_count(), stream, mr); + auto result = cudf::make_fixed_width_column(input.type(), + input.size(), + detail::copy_bitmask(input, stream, mr), + input.null_count(), + stream, + mr); auto out_view = result->mutable_view(); diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 3e30db7abcb..16e7239ebd8 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -629,6 +629,7 @@ ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_GROUPBY_TEST streams/groupby_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_HASHING_TEST streams/hash_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_INTEROP_TEST streams/interop_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_NULL_MASK_TEST streams/null_mask_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_DICTIONARY_TEST streams/dictionary_test.cpp STREAM_MODE testing) diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 2ddb0b76544..0149a467c32 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -1957,11 +1957,11 @@ TEST_F(JsonReaderTest, 
JSONLinesRecovering) // 2 -> (invalid) R"({"b":{"a":[321})" "\n" - // 3 -> c: [1] (valid) + // 3 -> c: 1.2 (valid) R"({"c":1.2})" "\n" "\n" - // 4 -> a: 123 (valid) + // 4 -> a: 4 (valid) R"({"a":4})" "\n" // 5 -> (invalid) @@ -2020,4 +2020,71 @@ TEST_F(JsonReaderTest, JSONLinesRecovering) c_validity.cbegin()}); } +TEST_F(JsonReaderTest, JSONLinesRecoveringIgnoreExcessChars) +{ + /** + * @brief Spark has the specific need to ignore extra characters that come after the first record + * on a JSON line + */ + std::string data = + // 0 -> a: -2 (valid) + R"({"a":-2}{})" + "\n" + // 1 -> (invalid) + R"({"b":{}should_be_invalid})" + "\n" + // 2 -> b (valid) + R"({"b":{"a":3} })" + "\n" + // 3 -> c: (valid) + R"({"c":1.2 } )" + "\n" + "\n" + // 4 -> (valid) + R"({"a":4} 123)" + "\n" + // 5 -> (valid) + R"({"a":5}//Comment after record)" + "\n" + // 6 -> (valid) + R"({"a":6} //Comment after whitespace)" + "\n" + // 7 -> (invalid) + R"({"a":5 //Invalid Comment within record})"; + + auto filepath = temp_env->get_temp_dir() + "RecoveringLinesExcessChars.json"; + { + std::ofstream outfile(filepath, std::ofstream::out); + outfile << data; + } + + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}) + .lines(true) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL); + + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + + EXPECT_EQ(result.tbl->num_columns(), 3); + EXPECT_EQ(result.tbl->num_rows(), 8); + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT64); + EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::STRUCT); + EXPECT_EQ(result.tbl->get_column(2).type().id(), cudf::type_id::FLOAT64); + + std::vector a_validity{true, false, false, false, true, true, true, false}; + std::vector b_validity{false, false, true, false, false, false, false, false}; + std::vector c_validity{false, false, false, true, false, false, false, false}; + + // Child column b->a + auto b_a_col = int64_wrapper({0, 0, 3, 0, 0, 0, 0, 0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), + int64_wrapper{{-2, 0, 0, 0, 4, 5, 6, 0}, a_validity.cbegin()}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + result.tbl->get_column(1), cudf::test::structs_column_wrapper({b_a_col}, b_validity.cbegin())); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + result.tbl->get_column(2), + float64_wrapper{{0.0, 0.0, 0.0, 1.2, 0.0, 0.0, 0.0, 0.0}, c_validity.cbegin()}); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/nested_json_test.cpp index 3cb7e1f287a..5f79d5b862b 100644 --- a/cpp/tests/io/nested_json_test.cpp +++ b/cpp/tests/io/nested_json_test.cpp @@ -543,7 +543,7 @@ TEST_F(JsonTest, RecoveringTokenStream) { // Test input. Inline comments used to indicate character indexes // 012345678 <= line 0 - std::string const input = R"({"a":-2},)" + std::string const input = R"({"a":2 {})" // 9 "\n" // 01234 <= line 1 diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index fa85e3a4a1d..2a654bd7e8c 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -353,6 +353,9 @@ struct ParquetWriterSchemaTest : public ParquetWriterTest { template struct ParquetReaderSourceTest : public ParquetReaderTest {}; +template +struct ParquetWriterDeltaTest : public ParquetWriterTest {}; + // Declare typed test cases // TODO: Replace with `NumericTypes` when unsigned support is added. 
Issue #5352 using SupportedTypes = cudf::test::Types; @@ -384,7 +387,6 @@ TYPED_TEST_SUITE(ParquetChunkedWriterNumericTypeTest, SupportedTypes); class ParquetSizedTest : public ::cudf::test::BaseFixtureWithParam {}; // test the allowed bit widths for dictionary encoding -// values chosen to trigger 1, 2, 3, 4, 5, 6, 8, 10, 12, 16, 20, and 24 bit dictionaries INSTANTIATE_TEST_SUITE_P(ParquetDictionaryTest, ParquetSizedTest, testing::Range(1, 25), @@ -6698,7 +6700,7 @@ TEST_P(ParquetV2Test, CheckEncodings) // data should be PLAIN for v1, RLE for V2 auto col0_data = cudf::detail::make_counting_transform_iterator(0, [](auto i) -> bool { return i % 2 == 0; }); - // data should be PLAIN for both + // data should be PLAIN for v1, DELTA_BINARY_PACKED for v2 auto col1_data = random_values(num_rows); // data should be PLAIN_DICTIONARY for v1, PLAIN and RLE_DICTIONARY for v2 auto col2_data = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return 1; }); @@ -6733,10 +6735,10 @@ TEST_P(ParquetV2Test, CheckEncodings) // col0 should have RLE for rep/def and data EXPECT_TRUE(chunk0_enc.size() == 1); EXPECT_TRUE(contains(chunk0_enc, Encoding::RLE)); - // col1 should have RLE for rep/def and PLAIN for data + // col1 should have RLE for rep/def and DELTA_BINARY_PACKED for data EXPECT_TRUE(chunk1_enc.size() == 2); EXPECT_TRUE(contains(chunk1_enc, Encoding::RLE)); - EXPECT_TRUE(contains(chunk1_enc, Encoding::PLAIN)); + EXPECT_TRUE(contains(chunk1_enc, Encoding::DELTA_BINARY_PACKED)); // col2 should have RLE for rep/def, PLAIN for dict, and RLE_DICTIONARY for data EXPECT_TRUE(chunk2_enc.size() == 3); EXPECT_TRUE(contains(chunk2_enc, Encoding::RLE)); @@ -6758,6 +6760,104 @@ TEST_P(ParquetV2Test, CheckEncodings) } } +// removing duration_D, duration_s, and timestamp_s as they don't appear to be supported properly. +// see definition of UnsupportedChronoTypes above. 
+using DeltaDecimalTypes = cudf::test::Types; +using DeltaBinaryTypes = + cudf::test::Concat; +using SupportedDeltaTestTypes = + cudf::test::RemoveIf, DeltaBinaryTypes>; +TYPED_TEST_SUITE(ParquetWriterDeltaTest, SupportedDeltaTestTypes); + +TYPED_TEST(ParquetWriterDeltaTest, SupportedDeltaTestTypes) +{ + using T = TypeParam; + auto col0 = testdata::ascending(); + auto col1 = testdata::unordered(); + + auto const expected = table_view{{col0, col1}}; + + auto const filepath = temp_env->get_temp_filepath("DeltaBinaryPacked.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .write_v2_headers(true) + .dictionary_policy(cudf::io::dictionary_policy::NEVER); + cudf::io::write_parquet(out_opts); + + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(in_opts); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); +} + +TYPED_TEST(ParquetWriterDeltaTest, SupportedDeltaTestTypesSliced) +{ + using T = TypeParam; + constexpr int num_rows = 4'000; + auto col0 = testdata::ascending(); + auto col1 = testdata::unordered(); + + auto const expected = table_view{{col0, col1}}; + auto expected_slice = cudf::slice(expected, {num_rows, 2 * num_rows}); + ASSERT_EQ(expected_slice[0].num_rows(), num_rows); + + auto const filepath = temp_env->get_temp_filepath("DeltaBinaryPackedSliced.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected_slice) + .write_v2_headers(true) + .dictionary_policy(cudf::io::dictionary_policy::NEVER); + cudf::io::write_parquet(out_opts); + + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(in_opts); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_slice, result.tbl->view()); +} + +TYPED_TEST(ParquetWriterDeltaTest, SupportedDeltaListSliced) +{ + using T = TypeParam; + + constexpr int num_slice = 4'000; + constexpr int num_rows = 32 * 1024; + + std::mt19937 gen(6542); + std::bernoulli_distribution bn(0.7f); + auto valids = + cudf::detail::make_counting_transform_iterator(0, [&](int index) { return bn(gen); }); + auto values = thrust::make_counting_iterator(0); + + // list + constexpr int vals_per_row = 4; + auto c1_offset_iter = cudf::detail::make_counting_transform_iterator( + 0, [vals_per_row](cudf::size_type idx) { return idx * vals_per_row; }); + cudf::test::fixed_width_column_wrapper c1_offsets(c1_offset_iter, + c1_offset_iter + num_rows + 1); + cudf::test::fixed_width_column_wrapper c1_vals( + values, values + (num_rows * vals_per_row), valids); + auto [null_mask, null_count] = cudf::test::detail::make_null_mask(valids, valids + num_rows); + + auto _c1 = cudf::make_lists_column( + num_rows, c1_offsets.release(), c1_vals.release(), null_count, std::move(null_mask)); + auto c1 = cudf::purge_nonempty_nulls(*_c1); + + auto const expected = table_view{{*c1}}; + auto expected_slice = cudf::slice(expected, {num_slice, 2 * num_slice}); + ASSERT_EQ(expected_slice[0].num_rows(), num_slice); + + auto const filepath = temp_env->get_temp_filepath("DeltaBinaryPackedListSliced.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected_slice) + .write_v2_headers(true) + .dictionary_policy(cudf::io::dictionary_policy::NEVER); + 
cudf::io::write_parquet(out_opts); + + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(in_opts); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_slice, result.tbl->view()); +} + TEST_F(ParquetWriterTest, EmptyMinStringStatistics) { char const* const min_val = ""; diff --git a/cpp/tests/streams/null_mask_test.cpp b/cpp/tests/streams/null_mask_test.cpp new file mode 100644 index 00000000000..7e59201c8cf --- /dev/null +++ b/cpp/tests/streams/null_mask_test.cpp @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include + +#include +#include +#include + +class NullMaskTest : public cudf::test::BaseFixture {}; + +TEST_F(NullMaskTest, CreateNullMask) +{ + cudf::create_null_mask(10, cudf::mask_state::ALL_VALID, cudf::test::get_default_stream()); +} + +TEST_F(NullMaskTest, SetNullMask) +{ + cudf::test::fixed_width_column_wrapper col({0, 1, 0, 1, 1}, + {true, false, true, false, false}); + + cudf::set_null_mask(static_cast(col).null_mask(), + 0, + 3, + false, + cudf::test::get_default_stream()); +} + +TEST_F(NullMaskTest, CopyBitmask) +{ + cudf::test::fixed_width_column_wrapper const col({0, 1, 0, 1, 1}, + {true, false, true, false, false}); + + cudf::copy_bitmask( + static_cast(col).null_mask(), 0, 3, cudf::test::get_default_stream()); +} + +TEST_F(NullMaskTest, CopyBitmaskFromColumn) +{ + cudf::test::fixed_width_column_wrapper const col({0, 1, 0, 1, 1}, + {true, false, true, false, false}); + + cudf::copy_bitmask(col, cudf::test::get_default_stream()); +} + +TEST_F(NullMaskTest, BitMaskAnd) +{ + cudf::test::fixed_width_column_wrapper const col1({0, 1, 0, 1, 1}, + {true, false, true, false, false}); + cudf::test::fixed_width_column_wrapper const col2({0, 1, 0, 1, 1}, + {true, true, false, false, true}); + + auto tbl = cudf::table_view{{col1, col2}}; + cudf::bitmask_and(tbl, cudf::test::get_default_stream()); +} + +TEST_F(NullMaskTest, BitMaskOr) +{ + cudf::test::fixed_width_column_wrapper const col1({0, 1, 0, 1, 1}, + {true, false, true, false, false}); + cudf::test::fixed_width_column_wrapper const col2({0, 1, 0, 1, 1}, + {true, true, false, false, true}); + + auto tbl = cudf::table_view{{col1, col2}}; + cudf::bitmask_or(tbl, cudf::test::get_default_stream()); +} + +TEST_F(NullMaskTest, NullCount) +{ + cudf::test::fixed_width_column_wrapper const col({0, 1, 0, 1, 1}, + {true, true, false, false, true}); + + cudf::null_count( + static_cast(col).null_mask(), 0, 4, cudf::test::get_default_stream()); +} diff --git a/dependencies.yaml b/dependencies.yaml index c19e8765be3..e8114fa5615 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -62,6 +62,7 @@ files: includes: - cudatoolkit - docs + - libarrow_run - py_version py_build_cudf: output: pyproject @@ -225,7 +226,7 @@ dependencies: - &gmock gmock>=1.13.0 # Hard pin the patch version used during the build. 
This must be kept # in sync with the version pinned in get_arrow.cmake. - - libarrow==12.0.1.* + - &libarrow libarrow==12.0.0.* - librdkafka>=1.9.0,<1.10.0a0 # Align nvcomp version with rapids-cmake - nvcomp==2.6.1 @@ -243,7 +244,7 @@ dependencies: - cython>=3.0.0 # Hard pin the patch version used during the build. This must be kept # in sync with the version pinned in get_arrow.cmake. - - pyarrow==12.0.1.* + - &pyarrow pyarrow==12.0.0.* # TODO: Pin to numpy<1.25 until cudf requires pandas 2 - &numpy numpy>=1.21,<1.25 build_python: @@ -260,16 +261,14 @@ dependencies: - protoc-wheel libarrow_run: common: - - output_types: [conda, requirements] + - output_types: conda packages: - # Allow runtime version to float up to minor version - - libarrow==12.* + - *libarrow pyarrow_run: common: - output_types: [conda, requirements, pyproject] packages: - # Allow runtime version to float up to minor version - - pyarrow==12.* + - *pyarrow cudatoolkit: specific: - output_types: conda @@ -282,6 +281,9 @@ dependencies: - cuda-nvrtc-dev - cuda-nvtx-dev - libcurand-dev + # TODO: Remove after https://github.com/rapidsai/cudf/pull/14292 updates + # cudf_kafka to use scikit-build + - cuda-gdb - matrix: cuda: "11.8" packages: diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 39a8dca0267..ccb5d5d4416 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -8,7 +8,7 @@ requires = [ "ninja", "numpy>=1.21,<1.25", "protoc-wheel", - "pyarrow==12.0.1.*", + "pyarrow==12.0.0.*", "rmm==23.12.*", "scikit-build>=0.13.1", "setuptools", @@ -38,7 +38,7 @@ dependencies = [ "pandas>=1.3,<1.6.0dev0", "protobuf>=4.21,<5", "ptxcompiler", - "pyarrow==12.*", + "pyarrow==12.0.0.*", "rmm==23.12.*", "typing_extensions>=4.0.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index 78a7a83ac3a..ff475e5a72e 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -5,7 +5,7 @@ requires = [ "cython>=3.0.0", "numpy>=1.21,<1.25", - "pyarrow==12.0.1.*", + "pyarrow==12.0.0.*", "setuptools", "wheel", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
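
The gpuEncodeDeltaBinaryPages kernel added above emits Parquet DELTA_BINARY_PACKED data, whose headers store unsigned quantities as ULEB128 varints and signed quantities (the first value and each block's minimum delta) zigzag-encoded; this is why parquet_gpu.hpp introduces the uleb128_t and zigzag128_t aliases. Below is a minimal host-side sketch of those two primitives, included for reference only and not taken from the patch itself.

#include <cstdint>
#include <vector>

// Zigzag-encode a signed 64-bit value so small magnitudes map to small
// unsigned values (0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, ...).
inline uint64_t zigzag_encode(int64_t v)
{
  return (static_cast<uint64_t>(v) << 1) ^ static_cast<uint64_t>(v >> 63);
}

// ULEB128: emit 7 bits per byte, least-significant group first, with the
// high bit set on every byte except the last.
inline void uleb128_encode(uint64_t v, std::vector<uint8_t>& out)
{
  while (v > 0x7f) {
    out.push_back(static_cast<uint8_t>(v) | 0x80);
    v >>= 7;
  }
  out.push_back(static_cast<uint8_t>(v));
}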
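
EncodePages now ORs the per-page kernel_mask values into one bitmask and launches only the encoder kernels whose bit is set, each on its own forked stream. The sketch below restates that dispatch pattern in plain host code as an aid to reading the hunk; the names (page, encode_pages, launch_*) are illustrative stand-ins, not the cudf API, and the real code performs the reduction with thrust and joins the forked streams afterwards.

#include <cstdint>
#include <functional>
#include <vector>

enum class encode_kernel_mask : uint32_t {
  PLAIN        = 1u << 0,
  DICTIONARY   = 1u << 1,
  DELTA_BINARY = 1u << 2,
};

struct page { encode_kernel_mask kernel_mask; };

void encode_pages(std::vector<page> const& pages,
                  std::function<void()> launch_plain,
                  std::function<void()> launch_dict,
                  std::function<void()> launch_delta)
{
  // Reduce the per-page masks into a single bitmask of required encoders.
  uint32_t mask = 0;
  for (auto const& p : pages) { mask |= static_cast<uint32_t>(p.kernel_mask); }

  // One launch per encoding family whose bit is set.
  if (mask & static_cast<uint32_t>(encode_kernel_mask::PLAIN)) { launch_plain(); }
  if (mask & static_cast<uint32_t>(encode_kernel_mask::DICTIONARY)) { launch_dict(); }
  if (mask & static_cast<uint32_t>(encode_kernel_mask::DELTA_BINARY)) { launch_delta(); }
}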
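
In the INT96 branch of the encoder, a timestamp is written as 12 bytes: nanoseconds within the day (8 bytes, little-endian) followed by the Julian day number (4 bytes), which is the pair that julian_days_with_time produces on the device. As a rough illustration of that split only, assuming a non-negative timestamp in whole seconds since the Unix epoch (whose Julian day number is 2440588):

#include <cstdint>
#include <utility>

// Returns {nanoseconds within the day, Julian day number} for a timestamp
// given in whole seconds since the Unix epoch (non-negative for simplicity).
inline std::pair<int64_t, uint32_t> to_int96_parts(int64_t unix_seconds)
{
  constexpr int64_t  seconds_per_day   = 86'400;
  constexpr uint32_t unix_epoch_julian = 2'440'588;  // 1970-01-01 as a Julian day

  int64_t const days  = unix_seconds / seconds_per_day;
  int64_t const nanos = (unix_seconds % seconds_per_day) * 1'000'000'000;
  return {nanos, unix_epoch_julian + static_cast<uint32_t>(days)};
}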