diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 49ca5ca0fb9..9d79733703c 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -13,6 +13,7 @@ jobs: # Please keep pr-builder as the top job here pr-builder: needs: + - check-nightly-ci - changed-files - checks - conda-cpp-build @@ -54,6 +55,18 @@ jobs: - name: Telemetry setup if: ${{ vars.TELEMETRY_ENABLED == 'true' }} uses: rapidsai/shared-actions/telemetry-dispatch-stash-base-env-vars@main + check-nightly-ci: + # Switch to ubuntu-latest once it defaults to a version of Ubuntu that + # provides at least Python 3.11 (see + # https://docs.python.org/3/library/datetime.html#datetime.date.fromisoformat) + runs-on: ubuntu-24.04 + env: + RAPIDS_GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + steps: + - name: Check if nightly CI is passing + uses: rapidsai/shared-actions/check_nightly_success/dispatch@main + with: + repo: cudf changed-files: secrets: inherit needs: telemetry-setup @@ -328,16 +341,11 @@ jobs: run_script: "ci/cudf_pandas_scripts/pandas-tests/diff.sh" telemetry-summarize: - runs-on: ubuntu-latest + # This job must use a self-hosted runner to record telemetry traces. 
+ runs-on: linux-amd64-cpu4 needs: pr-builder if: ${{ vars.TELEMETRY_ENABLED == 'true' && !cancelled() }} continue-on-error: true steps: - - name: Load stashed telemetry env vars - uses: rapidsai/shared-actions/telemetry-dispatch-load-base-env-vars@main - with: - load_service_name: true - name: Telemetry summarize - uses: rapidsai/shared-actions/telemetry-dispatch-write-summary@main - with: - cert_concat: "${{ secrets.OTEL_EXPORTER_OTLP_CA_CERTIFICATE }};${{ secrets.OTEL_EXPORTER_OTLP_CLIENT_CERTIFICATE }};${{ secrets.OTEL_EXPORTER_OTLP_CLIENT_KEY }}" + uses: rapidsai/shared-actions/telemetry-dispatch-summarize@main diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 890b01e99a8..6c3db891de0 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -446,7 +446,6 @@ add_library( src/groupby/sort/group_quantiles.cu src/groupby/sort/group_std.cu src/groupby/sort/group_sum.cu - src/groupby/sort/scan.cpp src/groupby/sort/group_count_scan.cu src/groupby/sort/group_max_scan.cu src/groupby/sort/group_min_scan.cu @@ -454,6 +453,8 @@ add_library( src/groupby/sort/group_rank_scan.cu src/groupby/sort/group_replace_nulls.cu src/groupby/sort/group_sum_scan.cu + src/groupby/sort/host_udf_aggregation.cpp + src/groupby/sort/scan.cpp src/groupby/sort/sort_helper.cu src/hash/md5_hash.cu src/hash/murmurhash3_x86_32.cu diff --git a/cpp/cmake/thirdparty/get_nanoarrow.cmake b/cpp/cmake/thirdparty/get_nanoarrow.cmake index c440643037b..b0c48e04710 100644 --- a/cpp/cmake/thirdparty/get_nanoarrow.cmake +++ b/cpp/cmake/thirdparty/get_nanoarrow.cmake @@ -14,11 +14,6 @@ # This function finds nanoarrow and sets any additional necessary environment variables. 
function(find_and_configure_nanoarrow) - include(${rapids-cmake-dir}/cpm/package_override.cmake) - - set(cudf_patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches") - rapids_cpm_package_override("${cudf_patch_dir}/nanoarrow_override.json") - if(NOT BUILD_SHARED_LIBS) set(_exclude_from_all EXCLUDE_FROM_ALL FALSE) else() @@ -31,6 +26,9 @@ function(find_and_configure_nanoarrow) nanoarrow 0.6.0.dev GLOBAL_TARGETS nanoarrow CPM_ARGS + GIT_REPOSITORY https://github.com/apache/arrow-nanoarrow.git + GIT_TAG 1e2664a70ec14907409cadcceb14d79b9670bcdb + GIT_SHALLOW FALSE OPTIONS "BUILD_SHARED_LIBS OFF" "NANOARROW_NAMESPACE cudf" ${_exclude_from_all} ) set_target_properties(nanoarrow PROPERTIES POSITION_INDEPENDENT_CODE ON) diff --git a/cpp/cmake/thirdparty/patches/nanoarrow_clang_tidy_compliance.diff b/cpp/cmake/thirdparty/patches/nanoarrow_clang_tidy_compliance.diff deleted file mode 100644 index e9a36fcb567..00000000000 --- a/cpp/cmake/thirdparty/patches/nanoarrow_clang_tidy_compliance.diff +++ /dev/null @@ -1,38 +0,0 @@ -diff --git a/src/nanoarrow/common/inline_buffer.h b/src/nanoarrow/common/inline_buffer.h -index caa6be4..70ec8a2 100644 ---- a/src/nanoarrow/common/inline_buffer.h -+++ b/src/nanoarrow/common/inline_buffer.h -@@ -347,7 +347,7 @@ static inline void _ArrowBitsUnpackInt32(const uint8_t word, int32_t* out) { - } - - static inline void _ArrowBitmapPackInt8(const int8_t* values, uint8_t* out) { -- *out = (uint8_t)(values[0] | ((values[1] + 0x1) & 0x2) | ((values[2] + 0x3) & 0x4) | -+ *out = (uint8_t)(values[0] | ((values[1] + 0x1) & 0x2) | ((values[2] + 0x3) & 0x4) | // NOLINT - ((values[3] + 0x7) & 0x8) | ((values[4] + 0xf) & 0x10) | - ((values[5] + 0x1f) & 0x20) | ((values[6] + 0x3f) & 0x40) | - ((values[7] + 0x7f) & 0x80)); -@@ -471,13 +471,13 @@ static inline void ArrowBitsSetTo(uint8_t* bits, int64_t start_offset, int64_t l - // set bits within a single byte - const uint8_t only_byte_mask = - i_end % 8 == 0 ? 
first_byte_mask : (uint8_t)(first_byte_mask | last_byte_mask); -- bits[bytes_begin] &= only_byte_mask; -+ bits[bytes_begin] &= only_byte_mask; // NOLINT - bits[bytes_begin] |= (uint8_t)(fill_byte & ~only_byte_mask); - return; - } - - // set/clear trailing bits of first byte -- bits[bytes_begin] &= first_byte_mask; -+ bits[bytes_begin] &= first_byte_mask; // NOLINT - bits[bytes_begin] |= (uint8_t)(fill_byte & ~first_byte_mask); - - if (bytes_end - bytes_begin > 2) { -@@ -637,7 +637,7 @@ static inline void ArrowBitmapAppendInt8Unsafe(struct ArrowBitmap* bitmap, - n_remaining -= n_full_bytes * 8; - if (n_remaining > 0) { - // Zero out the last byte -- *out_cursor = 0x00; -+ *out_cursor = 0x00; // NOLINT - for (int i = 0; i < n_remaining; i++) { - ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, values_cursor[i]); - } diff --git a/cpp/cmake/thirdparty/patches/nanoarrow_override.json b/cpp/cmake/thirdparty/patches/nanoarrow_override.json deleted file mode 100644 index d529787e7c8..00000000000 --- a/cpp/cmake/thirdparty/patches/nanoarrow_override.json +++ /dev/null @@ -1,18 +0,0 @@ - -{ - "packages" : { - "nanoarrow" : { - "version" : "0.6.0.dev", - "git_url" : "https://github.com/apache/arrow-nanoarrow.git", - "git_tag" : "1e2664a70ec14907409cadcceb14d79b9670bcdb", - "git_shallow" : false, - "patches" : [ - { - "file" : "${current_json_dir}/nanoarrow_clang_tidy_compliance.diff", - "issue" : "https://github.com/apache/arrow-nanoarrow/issues/537", - "fixed_in" : "" - } - ] - } - } -} diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp index f5f514d26d9..a1b7db5e08a 100644 --- a/cpp/include/cudf/aggregation.hpp +++ b/cpp/include/cudf/aggregation.hpp @@ -110,8 +110,9 @@ class aggregation { COLLECT_SET, ///< collect values into a list without duplicate entries LEAD, ///< window function, accesses row at specified offset following current row LAG, ///< window function, accesses row at specified offset preceding current row - PTX, ///< PTX UDF 
based reduction - CUDA, ///< CUDA UDF based reduction + PTX, ///< PTX based UDF aggregation + CUDA, ///< CUDA based UDF aggregation + HOST_UDF, ///< host based UDF aggregation MERGE_LISTS, ///< merge multiple lists values into one list MERGE_SETS, ///< merge multiple lists values into one list then drop duplicate entries MERGE_M2, ///< merge partial values of M2 aggregation, @@ -120,7 +121,7 @@ class aggregation { TDIGEST, ///< create a tdigest from a set of input values MERGE_TDIGEST, ///< create a tdigest by merging multiple tdigests together HISTOGRAM, ///< compute frequency of each element - MERGE_HISTOGRAM ///< merge partial values of HISTOGRAM aggregation, + MERGE_HISTOGRAM ///< merge partial values of HISTOGRAM aggregation }; aggregation() = delete; @@ -599,6 +600,18 @@ std::unique_ptr make_udf_aggregation(udf_type type, std::string const& user_defined_aggregator, data_type output_type); +// Forward declaration of `host_udf_base` for the factory function of `HOST_UDF` aggregation. +struct host_udf_base; + +/** + * @brief Factory to create a HOST_UDF aggregation. + * + * @param host_udf An instance of a class derived from `host_udf_base` to perform aggregation + * @return A HOST_UDF aggregation object + */ +template +std::unique_ptr make_host_udf_aggregation(std::unique_ptr host_udf); + /** * @brief Factory to create a MERGE_LISTS aggregation. * diff --git a/cpp/include/cudf/aggregation/host_udf.hpp b/cpp/include/cudf/aggregation/host_udf.hpp new file mode 100644 index 00000000000..bbce76dc5f3 --- /dev/null +++ b/cpp/include/cudf/aggregation/host_udf.hpp @@ -0,0 +1,294 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +/** + * @file host_udf.hpp + * @brief Declare the base class for host-side user-defined function (`HOST_UDF`) and example of + * subclass implementation. + */ + +namespace CUDF_EXPORT cudf { +/** + * @addtogroup aggregation_factories + * @{ + */ + +/** + * @brief The interface for host-based UDF implementation. + * + * An implementation of host-based UDF needs to be derived from this base class, defining + * its own version of the required functions. In particular: + * - The derived class is required to implement `get_empty_output`, `operator()`, `is_equal`, + * and `clone` functions. + * - If necessary, the derived class can also override `do_hash` to compute hashing for its + * instance, and `get_required_data` to selectively access the input data as well as + * intermediate data provided by libcudf. + * + * Example of such implementation: + * @code{.cpp} + * struct my_udf_aggregation : cudf::host_udf_base { + * my_udf_aggregation() = default; + * + * // This UDF aggregation needs `GROUPED_VALUES` and `GROUP_OFFSETS`, + * // and the result from groupby `MAX` aggregation.
+ * [[nodiscard]] data_attribute_set_t get_required_data() const override + * { + * return {groupby_data_attribute::GROUPED_VALUES, + * groupby_data_attribute::GROUP_OFFSETS, + * cudf::make_max_aggregation()}; + * } + * + * [[nodiscard]] output_t get_empty_output( + * [[maybe_unused]] std::optional output_dtype, + * [[maybe_unused]] rmm::cuda_stream_view stream, + * [[maybe_unused]] rmm::device_async_resource_ref mr) const override + * { + * // This UDF aggregation always returns a column of type INT32. + * return cudf::make_empty_column(cudf::data_type{cudf::type_id::INT32}); + * } + * + * [[nodiscard]] output_t operator()(input_map_t const& input, + * rmm::cuda_stream_view stream, + * rmm::device_async_resource_ref mr) const override + * { + * // Perform UDF computation using the input data and return the result. + * } + * + * [[nodiscard]] bool is_equal(host_udf_base const& other) const override + * { + * // Check if the other object is also instance of this class. + * return dynamic_cast(&other) != nullptr; + * } + * + * [[nodiscard]] std::unique_ptr clone() const override + * { + * return std::make_unique(); + * } + * }; + * @endcode + */ +struct host_udf_base { + host_udf_base() = default; + virtual ~host_udf_base() = default; + + /** + * @brief Define the possible data needed for groupby aggregations. + * + * Note that only sort-based groupby aggregations are supported. + */ + enum class groupby_data_attribute : int32_t { + INPUT_VALUES, ///< The input values column. + GROUPED_VALUES, ///< The input values grouped according to the input `keys` for which the + ///< values within each group maintain their original order. + SORTED_GROUPED_VALUES, ///< The input values grouped according to the input `keys` and + ///< sorted within each group. + NUM_GROUPS, ///< The number of groups (i.e., number of distinct keys). + GROUP_OFFSETS, ///< The offsets separating groups. + GROUP_LABELS ///< Group labels (which is also the same as group indices). 
+ }; + + /** + * @brief Describe possible data that may be needed in the derived class for its operations. + * + * Such data can be either intermediate data such as sorted values or group labels etc, or the + * results of other aggregations. + * + * Each derived host-based UDF class may need a different set of data. It is inefficient to + * evaluate and pass down all these possible data at once from libcudf. A solution for that is, + * the derived class can define a subset of data that it needs and libcudf will evaluate + * and pass down only data requested from that set. + */ + struct data_attribute { + /** + * @brief Hold all possible data types for the input of the aggregation in the derived class. + */ + using value_type = std::variant>; + value_type value; ///< The actual data attribute, wrapped by this struct + ///< as a wrapper is needed to define `hash` and `equal_to` functors. + + data_attribute() = default; ///< Default constructor + data_attribute(data_attribute&&) = default; ///< Move constructor + + /** + * @brief Construct a new data attribute from an aggregation attribute. + * @param value_ An aggregation attribute + */ + template )> + data_attribute(T value_) : value{value_} + { + } + + /** + * @brief Construct a new data attribute from another aggregation request. + * @param value_ An aggregation request + */ + template || + std::is_same_v)> + data_attribute(std::unique_ptr value_) : value{std::move(value_)} + { + CUDF_EXPECTS(std::get>(value) != nullptr, + "Invalid aggregation request."); + if constexpr (std::is_same_v) { + CUDF_EXPECTS( + dynamic_cast(std::get>(value).get()) != nullptr, + "Requesting results from other aggregations is only supported in groupby " + "aggregations."); + } + } + + /** + * @brief Copy constructor. + * @param other The other data attribute to copy from + */ + data_attribute(data_attribute const& other); + + /** + * @brief Hash functor for `data_attribute`. 
+ */ + struct hash { + /** + * @brief Compute the hash value of a data attribute. + * @param attr The data attribute to hash + * @return The hash value of the data attribute + */ + std::size_t operator()(data_attribute const& attr) const; + }; // struct hash + + /** + * @brief Equality comparison functor for `data_attribute`. + */ + struct equal_to { + /** + * @brief Check if two data attributes are equal. + * @param lhs The left-hand side data attribute + * @param rhs The right-hand side data attribute + * @return True if the two data attributes are equal + */ + bool operator()(data_attribute const& lhs, data_attribute const& rhs) const; + }; // struct equal_to + }; // struct data_attribute + + /** + * @brief Set of attributes for the input data that is needed for computing the aggregation. + */ + using data_attribute_set_t = + std::unordered_set; + + /** + * @brief Return a set of attributes for the data that is needed for computing the aggregation. + * + * The derived class should return the attributes corresponding to only the data that it needs to + * avoid unnecessary computation performed in libcudf. If this function is not overridden, an + * empty set is returned. That means all the data attributes (except results from other + * aggregations in groupby) will be needed. + * + * @return A set of `data_attribute` + */ + [[nodiscard]] virtual data_attribute_set_t get_required_data() const { return {}; } + + /** + * @brief Hold all possible types of the data that is passed to the derived class for executing + * the aggregation. + */ + using input_data_t = std::variant>; + + /** + * @brief Input to the aggregation, mapping from each data attribute to its actual data. + */ + using input_map_t = std:: + unordered_map; + + /** + * @brief Output type of the aggregation. + * + * Currently only a single type is supported as the output of the aggregation, but it will hold + * more types in the future when reduction is supported.
+ */ + using output_t = std::variant>; + + /** + * @brief Get the output when the input values column is empty. + * + * This is called in libcudf when the input values column is empty. In such situations libcudf + * tries to generate the output directly without unnecessarily evaluating the intermediate data. + * + * @param output_dtype The expected output data type + * @param stream The CUDA stream to use for any kernel launches + * @param mr Device memory resource to use for any allocations + * @return The output result of the aggregation when input values is empty + */ + [[nodiscard]] virtual output_t get_empty_output(std::optional output_dtype, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const = 0; + + /** + * @brief Perform the main computation for the host-based UDF. + * + * @param input The input data needed for performing all computation + * @param stream The CUDA stream to use for any kernel launches + * @param mr Device memory resource to use for any allocations + * @return The output result of the aggregation + */ + [[nodiscard]] virtual output_t operator()(input_map_t const& input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const = 0; + + /** + * @brief Computes hash value of the class's instance. + * @return The hash value of the instance + */ + [[nodiscard]] virtual std::size_t do_hash() const + { + return std::hash{}(static_cast(aggregation::Kind::HOST_UDF)); + } + + /** + * @brief Compares two instances of the derived class for equality. + * @param other The other derived class's instance to compare with + * @return True if the two instances are equal + */ + [[nodiscard]] virtual bool is_equal(host_udf_base const& other) const = 0; + + /** + * @brief Clones the instance. + * + * A class derived from `host_udf_base` should not store too much data such that its instances + * remain lightweight for efficient cloning. 
+ * + * @return A new instance cloned from this + */ + [[nodiscard]] virtual std::unique_ptr clone() const = 0; +}; + +/** @} */ // end of group +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh index c30c3d6f4bd..59011f7b138 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.cuh +++ b/cpp/include/cudf/detail/aggregation/aggregation.cuh @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -31,7 +32,6 @@ #include #include -#include namespace cudf { namespace detail { @@ -216,12 +216,12 @@ struct identity_initializer { * @throw cudf::logic_error if column type is not fixed-width * * @param table The table of columns to initialize. - * @param aggs A vector of aggregation operations corresponding to the table + * @param aggs A span of aggregation operations corresponding to the table * columns. The aggregations determine the identity value for each column. * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ void initialize_with_identity(mutable_table_view& table, - std::vector const& aggs, + host_span aggs, rmm::cuda_stream_view stream); } // namespace detail diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index 6661a461b8b..d873e93bd20 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -88,6 +89,8 @@ class simple_aggregations_collector { // Declares the interface for the simple class lead_lag_aggregation const& agg); virtual std::vector> visit(data_type col_type, class udf_aggregation const& agg); + virtual std::vector> visit(data_type col_type, + class host_udf_aggregation const& agg); virtual std::vector> visit(data_type col_type, class merge_lists_aggregation const& agg); virtual std::vector> visit(data_type col_type, @@ -135,6 +138,7 @@ class aggregation_finalizer { // Declares the interface for the finalizer virtual void visit(class collect_set_aggregation const& agg); virtual void visit(class lead_lag_aggregation const& agg); virtual void visit(class udf_aggregation const& agg); + virtual void visit(class host_udf_aggregation const& agg); virtual void visit(class merge_lists_aggregation const& agg); virtual void visit(class merge_sets_aggregation const& agg); virtual void visit(class merge_m2_aggregation const& agg); @@ -960,6 +964,35 @@ class udf_aggregation final : public rolling_aggregation { } }; +/** + * @brief Derived class for specifying host-based UDF aggregation. + */ +class host_udf_aggregation final : public groupby_aggregation { + public: + std::unique_ptr udf_ptr; + + host_udf_aggregation() = delete; + host_udf_aggregation(host_udf_aggregation const&) = delete; + + // Need to define the constructor and destructor in a separate source file where we have the + // complete declaration of `host_udf_base`. 
+ explicit host_udf_aggregation(std::unique_ptr udf_ptr_); + ~host_udf_aggregation() override; + + [[nodiscard]] bool is_equal(aggregation const& _other) const override; + + [[nodiscard]] size_t do_hash() const override; + + [[nodiscard]] std::unique_ptr clone() const override; + + std::vector> get_simple_aggregations( + data_type col_type, simple_aggregations_collector& collector) const override + { + return collector.visit(col_type, *this); + } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } +}; + /** * @brief Derived aggregation class for specifying MERGE_LISTS aggregation */ @@ -1462,6 +1495,12 @@ struct target_type_impl +struct target_type_impl { + // Just a placeholder. The actual return type is unknown. + using type = struct_view; +}; + /** * @brief Helper alias to get the accumulator type for performing aggregation * `k` on elements of type `Source` @@ -1579,6 +1618,8 @@ CUDF_HOST_DEVICE inline decltype(auto) aggregation_dispatcher(aggregation::Kind return f.template operator()(std::forward(args)...); case aggregation::EWMA: return f.template operator()(std::forward(args)...); + case aggregation::HOST_UDF: + return f.template operator()(std::forward(args)...); default: { #ifndef __CUDA_ARCH__ CUDF_FAIL("Unsupported aggregation."); diff --git a/cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh b/cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh index e0c7ce840d7..69edf38e359 100644 --- a/cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh +++ b/cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh @@ -57,62 +57,71 @@ struct MurmurHash3_x86_32 { }; template <> -hash_value_type __device__ inline MurmurHash3_x86_32::operator()(bool const& key) const +MurmurHash3_x86_32::result_type __device__ inline MurmurHash3_x86_32::operator()( + bool const& key) const { return this->compute(static_cast(key)); } template <> -hash_value_type __device__ inline MurmurHash3_x86_32::operator()(float const& key) const 
+MurmurHash3_x86_32::result_type __device__ inline MurmurHash3_x86_32::operator()( + float const& key) const { return this->compute(normalize_nans_and_zeros(key)); } template <> -hash_value_type __device__ inline MurmurHash3_x86_32::operator()(double const& key) const +MurmurHash3_x86_32::result_type __device__ inline MurmurHash3_x86_32::operator()( + double const& key) const { return this->compute(normalize_nans_and_zeros(key)); } template <> -hash_value_type __device__ inline MurmurHash3_x86_32::operator()( - cudf::string_view const& key) const +MurmurHash3_x86_32::result_type + __device__ inline MurmurHash3_x86_32::operator()( + cudf::string_view const& key) const { return this->compute_bytes(reinterpret_cast(key.data()), key.size_bytes()); } template <> -hash_value_type __device__ inline MurmurHash3_x86_32::operator()( - numeric::decimal32 const& key) const +MurmurHash3_x86_32::result_type + __device__ inline MurmurHash3_x86_32::operator()( + numeric::decimal32 const& key) const { return this->compute(key.value()); } template <> -hash_value_type __device__ inline MurmurHash3_x86_32::operator()( - numeric::decimal64 const& key) const +MurmurHash3_x86_32::result_type + __device__ inline MurmurHash3_x86_32::operator()( + numeric::decimal64 const& key) const { return this->compute(key.value()); } template <> -hash_value_type __device__ inline MurmurHash3_x86_32::operator()( - numeric::decimal128 const& key) const +MurmurHash3_x86_32::result_type + __device__ inline MurmurHash3_x86_32::operator()( + numeric::decimal128 const& key) const { return this->compute(key.value()); } template <> -hash_value_type __device__ inline MurmurHash3_x86_32::operator()( - cudf::list_view const& key) const +MurmurHash3_x86_32::result_type + __device__ inline MurmurHash3_x86_32::operator()( + cudf::list_view const& key) const { CUDF_UNREACHABLE("List column hashing is not supported"); } template <> -hash_value_type __device__ inline MurmurHash3_x86_32::operator()( - cudf::struct_view 
const& key) const +MurmurHash3_x86_32::result_type + __device__ inline MurmurHash3_x86_32::operator()( + cudf::struct_view const& key) const { CUDF_UNREACHABLE("Direct hashing of struct_view is not supported"); } diff --git a/cpp/include/cudf/io/nvcomp_adapter.hpp b/cpp/include/cudf/io/nvcomp_adapter.hpp index 0d74a4158ad..4ad760d278f 100644 --- a/cpp/include/cudf/io/nvcomp_adapter.hpp +++ b/cpp/include/cudf/io/nvcomp_adapter.hpp @@ -22,7 +22,7 @@ #include namespace CUDF_EXPORT cudf { -namespace io::nvcomp { +namespace io::detail::nvcomp { enum class compression_type { SNAPPY, ZSTD, DEFLATE, LZ4, GZIP }; @@ -88,5 +88,5 @@ inline bool operator==(feature_status_parameters const& lhs, feature_status_para [[nodiscard]] std::optional is_decompression_disabled( compression_type compression, feature_status_parameters params = feature_status_parameters()); -} // namespace io::nvcomp +} // namespace io::detail::nvcomp } // namespace CUDF_EXPORT cudf diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp index a60a7f63882..0d4400b891b 100644 --- a/cpp/src/aggregation/aggregation.cpp +++ b/cpp/src/aggregation/aggregation.cpp @@ -237,6 +237,12 @@ std::vector> simple_aggregations_collector::visit( return visit(col_type, static_cast(agg)); } +std::vector> simple_aggregations_collector::visit( + data_type col_type, host_udf_aggregation const& agg) +{ + return visit(col_type, static_cast(agg)); +} + // aggregation_finalizer ---------------------------------------- void aggregation_finalizer::visit(aggregation const& agg) {} @@ -410,6 +416,11 @@ void aggregation_finalizer::visit(merge_tdigest_aggregation const& agg) visit(static_cast(agg)); } +void aggregation_finalizer::visit(host_udf_aggregation const& agg) +{ + visit(static_cast(agg)); +} + } // namespace detail std::vector> aggregation::get_simple_aggregations( diff --git a/cpp/src/aggregation/aggregation.cu b/cpp/src/aggregation/aggregation.cu index d915c85bf85..3a6ff36c424 100644 --- 
a/cpp/src/aggregation/aggregation.cu +++ b/cpp/src/aggregation/aggregation.cu @@ -17,15 +17,14 @@ #include #include #include +#include #include -#include - namespace cudf { namespace detail { void initialize_with_identity(mutable_table_view& table, - std::vector const& aggs, + host_span aggs, rmm::cuda_stream_view stream) { // TODO: Initialize all the columns in a single kernel instead of invoking one diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index c42038026e5..4c90cd0eef5 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include #include @@ -32,7 +33,6 @@ #include #include #include -#include #include #include #include @@ -99,6 +99,8 @@ namespace { struct empty_column_constructor { column_view values; aggregation const& agg; + rmm::cuda_stream_view stream; + rmm::device_async_resource_ref mr; template std::unique_ptr operator()() const @@ -108,7 +110,7 @@ struct empty_column_constructor { if constexpr (k == aggregation::Kind::COLLECT_LIST || k == aggregation::Kind::COLLECT_SET) { return make_lists_column( - 0, make_empty_column(type_to_id()), empty_like(values), 0, {}); + 0, make_empty_column(type_to_id()), empty_like(values), 0, {}, stream, mr); } if constexpr (k == aggregation::Kind::HISTOGRAM) { @@ -116,7 +118,9 @@ struct empty_column_constructor { make_empty_column(type_to_id()), cudf::reduction::detail::make_empty_histogram_like(values), 0, - {}); + {}, + stream, + mr); } if constexpr (k == aggregation::Kind::MERGE_HISTOGRAM) { return empty_like(values); } @@ -140,31 +144,41 @@ struct empty_column_constructor { return empty_like(values); } + if constexpr (k == aggregation::Kind::HOST_UDF) { + auto const& udf_ptr = dynamic_cast(agg).udf_ptr; + return std::get>(udf_ptr->get_empty_output(std::nullopt, stream, mr)); + } + return make_empty_column(target_type(values.type(), k)); } }; /// Make an empty table with appropriate types for requested 
aggs template -auto empty_results(host_span requests) +auto empty_results(host_span requests, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { std::vector empty_results; - std::transform( - requests.begin(), requests.end(), std::back_inserter(empty_results), [](auto const& request) { - std::vector> results; - - std::transform( - request.aggregations.begin(), - request.aggregations.end(), - std::back_inserter(results), - [&request](auto const& agg) { - return cudf::detail::dispatch_type_and_aggregation( - request.values.type(), agg->kind, empty_column_constructor{request.values, *agg}); - }); - - return aggregation_result{std::move(results)}; - }); + std::transform(requests.begin(), + requests.end(), + std::back_inserter(empty_results), + [stream, mr](auto const& request) { + std::vector> results; + + std::transform(request.aggregations.begin(), + request.aggregations.end(), + std::back_inserter(results), + [&request, stream, mr](auto const& agg) { + return cudf::detail::dispatch_type_and_aggregation( + request.values.type(), + agg->kind, + empty_column_constructor{request.values, *agg, stream, mr}); + }); + + return aggregation_result{std::move(results)}; + }); return empty_results; } @@ -206,7 +220,7 @@ std::pair, std::vector> groupby::aggr verify_valid_requests(requests); - if (_keys.num_rows() == 0) { return {empty_like(_keys), empty_results(requests)}; } + if (_keys.num_rows() == 0) { return {empty_like(_keys), empty_results(requests, stream, mr)}; } return dispatch_aggregation(requests, stream, mr); } @@ -226,7 +240,9 @@ std::pair, std::vector> groupby::scan verify_valid_requests(requests); - if (_keys.num_rows() == 0) { return std::pair(empty_like(_keys), empty_results(requests)); } + if (_keys.num_rows() == 0) { + return std::pair(empty_like(_keys), empty_results(requests, stream, mr)); + } return sort_scan(requests, stream, mr); } diff --git a/cpp/src/groupby/hash/compute_aggregations.cuh b/cpp/src/groupby/hash/compute_aggregations.cuh 
index e8b29a0e7a8..9c9a4c97bff 100644 --- a/cpp/src/groupby/hash/compute_aggregations.cuh +++ b/cpp/src/groupby/hash/compute_aggregations.cuh @@ -60,7 +60,7 @@ rmm::device_uvector compute_aggregations( rmm::cuda_stream_view stream) { // flatten the aggs to a table that can be operated on by aggregate_row - auto [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests); + auto [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests, stream); auto const d_agg_kinds = cudf::detail::make_device_uvector_async( agg_kinds, stream, rmm::mr::get_current_device_resource()); diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.cu b/cpp/src/groupby/hash/compute_global_memory_aggs.cu index 6025686953e..d2830f7d905 100644 --- a/cpp/src/groupby/hash/compute_global_memory_aggs.cu +++ b/cpp/src/groupby/hash/compute_global_memory_aggs.cu @@ -24,7 +24,7 @@ template rmm::device_uvector compute_global_memory_aggs const& agg_kinds, + host_span agg_kinds, global_set_t& global_set, std::vector>& aggregations, cudf::detail::result_cache* sparse_results, diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.cuh b/cpp/src/groupby/hash/compute_global_memory_aggs.cuh index 00db149c6d9..671ee2ea31f 100644 --- a/cpp/src/groupby/hash/compute_global_memory_aggs.cuh +++ b/cpp/src/groupby/hash/compute_global_memory_aggs.cuh @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -44,7 +45,7 @@ rmm::device_uvector compute_global_memory_aggs( bitmask_type const* row_bitmask, cudf::table_view const& flattened_values, cudf::aggregation::Kind const* d_agg_kinds, - std::vector const& agg_kinds, + host_span agg_kinds, SetType& global_set, std::vector>& aggregations, cudf::detail::result_cache* sparse_results, diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.hpp b/cpp/src/groupby/hash/compute_global_memory_aggs.hpp index 0777b9ffd93..437823a3fea 100644 --- a/cpp/src/groupby/hash/compute_global_memory_aggs.hpp +++ 
b/cpp/src/groupby/hash/compute_global_memory_aggs.hpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -34,7 +35,7 @@ rmm::device_uvector compute_global_memory_aggs( bitmask_type const* row_bitmask, cudf::table_view const& flattened_values, cudf::aggregation::Kind const* d_agg_kinds, - std::vector const& agg_kinds, + host_span agg_kinds, SetType& global_set, std::vector>& aggregations, cudf::detail::result_cache* sparse_results, diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu b/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu index 209e2b7f20a..7cb3f8f190b 100644 --- a/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu +++ b/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu @@ -24,7 +24,7 @@ template rmm::device_uvector compute_global_memory_aggs const& agg_kinds, + host_span agg_kinds, nullable_global_set_t& global_set, std::vector>& aggregations, cudf::detail::result_cache* sparse_results, diff --git a/cpp/src/groupby/hash/compute_groupby.cu b/cpp/src/groupby/hash/compute_groupby.cu index e1dbf2a3d9e..9648d942513 100644 --- a/cpp/src/groupby/hash/compute_groupby.cu +++ b/cpp/src/groupby/hash/compute_groupby.cu @@ -61,7 +61,7 @@ std::unique_ptr compute_groupby(table_view const& keys, d_row_equal, probing_scheme_t{d_row_hash}, cuco::thread_scope_device, - cuco::storage{}, + cuco::storage{}, cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, stream.value()}; diff --git a/cpp/src/groupby/hash/compute_mapping_indices.cuh b/cpp/src/groupby/hash/compute_mapping_indices.cuh index d353830780f..f86a93109be 100644 --- a/cpp/src/groupby/hash/compute_mapping_indices.cuh +++ b/cpp/src/groupby/hash/compute_mapping_indices.cuh @@ -106,15 +106,15 @@ CUDF_KERNEL void mapping_indices_kernel(cudf::size_type num_input_rows, __shared__ cudf::size_type shared_set_indices[GROUPBY_SHM_MAX_ELEMENTS]; // Shared set initialization - __shared__ cuco::window windows[window_extent.value()]; + __shared__ 
cuco::bucket buckets[bucket_extent.value()]; auto raw_set = cuco::static_set_ref{ cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, global_set.key_eq(), probing_scheme_t{global_set.hash_function()}, cuco::thread_scope_block, - cuco::aow_storage_ref{ - window_extent, windows}}; + cuco::bucket_storage_ref{ + bucket_extent, buckets}}; auto shared_set = raw_set.rebind_operators(cuco::insert_and_find); auto const block = cooperative_groups::this_thread_block(); diff --git a/cpp/src/groupby/hash/create_sparse_results_table.cu b/cpp/src/groupby/hash/create_sparse_results_table.cu index bc32e306b3f..a835736235c 100644 --- a/cpp/src/groupby/hash/create_sparse_results_table.cu +++ b/cpp/src/groupby/hash/create_sparse_results_table.cu @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -48,7 +49,7 @@ void extract_populated_keys(SetType const& key_set, template cudf::table create_sparse_results_table(cudf::table_view const& flattened_values, cudf::aggregation::Kind const* d_agg_kinds, - std::vector agg_kinds, + host_span agg_kinds, bool direct_aggregations, GlobalSetType const& global_set, rmm::device_uvector& populated_keys, @@ -107,7 +108,7 @@ template void extract_populated_keys( template cudf::table create_sparse_results_table( cudf::table_view const& flattened_values, cudf::aggregation::Kind const* d_agg_kinds, - std::vector agg_kinds, + host_span agg_kinds, bool direct_aggregations, global_set_t const& global_set, rmm::device_uvector& populated_keys, @@ -116,7 +117,7 @@ template cudf::table create_sparse_results_table( template cudf::table create_sparse_results_table( cudf::table_view const& flattened_values, cudf::aggregation::Kind const* d_agg_kinds, - std::vector agg_kinds, + host_span agg_kinds, bool direct_aggregations, nullable_global_set_t const& global_set, rmm::device_uvector& populated_keys, diff --git a/cpp/src/groupby/hash/create_sparse_results_table.hpp b/cpp/src/groupby/hash/create_sparse_results_table.hpp index 
8155ce852e0..4e2fa81bdb7 100644 --- a/cpp/src/groupby/hash/create_sparse_results_table.hpp +++ b/cpp/src/groupby/hash/create_sparse_results_table.hpp @@ -20,12 +20,11 @@ #include #include #include +#include #include #include -#include - namespace cudf::groupby::detail::hash { /** * @brief Computes and returns a device vector containing all populated keys in @@ -47,7 +46,7 @@ void extract_populated_keys(SetType const& key_set, template cudf::table create_sparse_results_table(cudf::table_view const& flattened_values, cudf::aggregation::Kind const* d_agg_kinds, - std::vector agg_kinds, + host_span agg_kinds, bool direct_aggregations, GlobalSetType const& global_set, rmm::device_uvector& populated_keys, diff --git a/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp b/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp index b2048a9fbb8..a533f7a6448 100644 --- a/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp +++ b/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -102,12 +103,15 @@ class groupby_simple_aggregations_collector final }; // flatten aggs to filter in single pass aggs -std::tuple, std::vector>> -flatten_single_pass_aggs(host_span requests) +std::tuple, + std::vector>> +flatten_single_pass_aggs(host_span requests, + rmm::cuda_stream_view stream) { std::vector columns; std::vector> aggs; - std::vector agg_kinds; + auto agg_kinds = cudf::detail::make_empty_host_vector(requests.size(), stream); for (auto const& request : requests) { auto const& agg_v = request.aggregations; diff --git a/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp b/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp index dfad51f27d4..e3c17ca972c 100644 --- a/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp +++ b/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp @@ -26,7 +26,10 @@ namespace cudf::groupby::detail::hash { // flatten aggs to filter in single pass aggs -std::tuple, std::vector>> 
-flatten_single_pass_aggs(host_span requests); +std::tuple, + std::vector>> +flatten_single_pass_aggs(host_span requests, + rmm::cuda_stream_view stream); } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu index 37a61c1a22c..b71e20938d6 100644 --- a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu +++ b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu @@ -170,7 +170,8 @@ void hash_compound_agg_finalizer::visit(cudf::detail::var_aggregation c cudf::detail::target_type(result_type, agg.kind), col.size(), mask_state::ALL_NULL, stream); auto var_result_view = mutable_column_device_view::create(var_result->mutable_view(), stream); mutable_table_view var_table_view{{var_result->mutable_view()}}; - cudf::detail::initialize_with_identity(var_table_view, {agg.kind}, stream); + cudf::detail::initialize_with_identity( + var_table_view, host_span(&agg.kind, 1), stream); thrust::for_each_n( rmm::exec_policy_nosync(stream), diff --git a/cpp/src/groupby/hash/helpers.cuh b/cpp/src/groupby/hash/helpers.cuh index f950e03e0fb..92925e11bac 100644 --- a/cpp/src/groupby/hash/helpers.cuh +++ b/cpp/src/groupby/hash/helpers.cuh @@ -27,7 +27,7 @@ namespace cudf::groupby::detail::hash { CUDF_HOST_DEVICE auto constexpr GROUPBY_CG_SIZE = 1; /// Number of slots per thread -CUDF_HOST_DEVICE auto constexpr GROUPBY_WINDOW_SIZE = 1; +CUDF_HOST_DEVICE auto constexpr GROUPBY_BUCKET_SIZE = 1; /// Thread block size CUDF_HOST_DEVICE auto constexpr GROUPBY_BLOCK_SIZE = 128; @@ -48,9 +48,9 @@ using shmem_extent_t = cuco::extent(static_cast(GROUPBY_SHM_MAX_ELEMENTS) * 1.43)>; -/// Number of windows needed by each shared memory hash set -CUDF_HOST_DEVICE auto constexpr window_extent = - cuco::make_window_extent(shmem_extent_t{}); +/// Number of buckets needed by each shared memory hash set +CUDF_HOST_DEVICE auto constexpr bucket_extent = + cuco::make_bucket_extent(shmem_extent_t{}); using 
row_hash_t = cudf::experimental::row::hash::device_row_hasher, - cuco::storage>; + cuco::storage>; using nullable_global_set_t = cuco::static_set, @@ -83,7 +83,7 @@ using nullable_global_set_t = cuco::static_set, - cuco::storage>; + cuco::storage>; template using hash_set_ref_t = cuco::static_set_ref< @@ -91,7 +91,7 @@ using hash_set_ref_t = cuco::static_set_ref< cuda::thread_scope_device, row_comparator_t, probing_scheme_t, - cuco::aow_storage_ref>, + cuco::bucket_storage_ref>, Op>; template @@ -100,6 +100,6 @@ using nullable_hash_set_ref_t = cuco::static_set_ref< cuda::thread_scope_device, nullable_row_comparator_t, probing_scheme_t, - cuco::aow_storage_ref>, + cuco::bucket_storage_ref>, Op>; } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index 7a8a1883ed4..6480070e85a 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -19,6 +19,7 @@ #include "groupby/sort/group_reductions.hpp" #include +#include #include #include #include @@ -208,10 +209,7 @@ void aggregate_result_functor::operator()(aggregation const& a operator()(*argmin_agg); column_view const argmin_result = cache.get_result(values, *argmin_agg); - // We make a view of ARGMIN result without a null mask and gather using - // this mask. The values in data buffer of ARGMIN result corresponding - // to null values was initialized to ARGMIN_SENTINEL which is an out of - // bounds index value and causes the gathered value to be null. + // Compute the ARGMIN result without the null mask in the gather map. column_view const null_removed_map( data_type(type_to_id()), argmin_result.size(), @@ -250,10 +248,7 @@ void aggregate_result_functor::operator()(aggregation const& a operator()(*argmax_agg); column_view const argmax_result = cache.get_result(values, *argmax_agg); - // We make a view of ARGMAX result without a null mask and gather using - // this mask. 
The values in data buffer of ARGMAX result corresponding - // to null values was initialized to ARGMAX_SENTINEL which is an out of - // bounds index value and causes the gathered value to be null. + // Compute the ARGMAX result without the null mask in the gather map. column_view const null_removed_map( data_type(type_to_id()), argmax_result.size(), @@ -795,6 +790,65 @@ void aggregate_result_functor::operator()(aggregatio mr)); } +template <> +void aggregate_result_functor::operator()(aggregation const& agg) +{ + if (cache.has_result(values, agg)) { return; } + + auto const& udf_ptr = dynamic_cast(agg).udf_ptr; + auto const data_attrs = [&]() -> host_udf_base::data_attribute_set_t { + if (auto tmp = udf_ptr->get_required_data(); !tmp.empty()) { return tmp; } + // Empty attribute set means everything. + return {host_udf_base::groupby_data_attribute::INPUT_VALUES, + host_udf_base::groupby_data_attribute::GROUPED_VALUES, + host_udf_base::groupby_data_attribute::SORTED_GROUPED_VALUES, + host_udf_base::groupby_data_attribute::NUM_GROUPS, + host_udf_base::groupby_data_attribute::GROUP_OFFSETS, + host_udf_base::groupby_data_attribute::GROUP_LABELS}; + }(); + + // Do not cache udf_input, as the actual input data may change from run to run. 
+ host_udf_base::input_map_t udf_input; + for (auto const& attr : data_attrs) { + CUDF_EXPECTS(std::holds_alternative(attr.value) || + std::holds_alternative>(attr.value), + "Invalid input data attribute for HOST_UDF groupby aggregation."); + if (std::holds_alternative(attr.value)) { + switch (std::get(attr.value)) { + case host_udf_base::groupby_data_attribute::INPUT_VALUES: + udf_input.emplace(attr, values); + break; + case host_udf_base::groupby_data_attribute::GROUPED_VALUES: + udf_input.emplace(attr, get_grouped_values()); + break; + case host_udf_base::groupby_data_attribute::SORTED_GROUPED_VALUES: + udf_input.emplace(attr, get_sorted_values()); + break; + case host_udf_base::groupby_data_attribute::NUM_GROUPS: + udf_input.emplace(attr, helper.num_groups(stream)); + break; + case host_udf_base::groupby_data_attribute::GROUP_OFFSETS: + udf_input.emplace(attr, helper.group_offsets(stream)); + break; + case host_udf_base::groupby_data_attribute::GROUP_LABELS: + udf_input.emplace(attr, helper.group_labels(stream)); + break; + default: CUDF_UNREACHABLE("Invalid input data attribute for HOST_UDF groupby aggregation."); + } + } else { // data is result from another aggregation + auto other_agg = std::get>(attr.value)->clone(); + cudf::detail::aggregation_dispatcher(other_agg->kind, *this, *other_agg); + auto result = cache.get_result(values, *other_agg); + udf_input.emplace(std::move(other_agg), std::move(result)); + } + } + + auto output = (*udf_ptr)(udf_input, stream, mr); + CUDF_EXPECTS(std::holds_alternative>(output), + "Invalid output type from HOST_UDF groupby aggregation."); + cache.add_result(values, agg, std::get>(std::move(output))); +} + } // namespace detail // Sort-based groupby diff --git a/cpp/src/groupby/sort/group_argmax.cu b/cpp/src/groupby/sort/group_argmax.cu index 7dce341130e..329c7c4eb32 100644 --- a/cpp/src/groupby/sort/group_argmax.cu +++ b/cpp/src/groupby/sort/group_argmax.cu @@ -42,22 +42,21 @@ std::unique_ptr group_argmax(column_view 
const& values, stream, mr); - // The functor returns the index of maximum in the sorted values. - // We need the index of maximum in the original unsorted values. - // So use indices to gather the sort order used to sort `values`. - // Gather map cannot be null so we make a view with the mask removed. - // The values in data buffer of indices corresponding to null values was - // initialized to ARGMAX_SENTINEL. Using gather_if. - // This can't use gather because nulls in gathered column will not store ARGMAX_SENTINEL. - auto indices_view = indices->mutable_view(); - thrust::gather_if(rmm::exec_policy(stream), - indices_view.begin(), // map first - indices_view.end(), // map last - indices_view.begin(), // stencil - key_sort_order.begin(), // input - indices_view.begin(), // result - [] __device__(auto i) { return (i != cudf::detail::ARGMAX_SENTINEL); }); - return indices; + // The functor returns the indices of maximums based on the sorted keys. + // We need the indices of maximums from the original unsorted keys + // so we use these indices and the key_sort_order to map to the correct indices. + // We do not use cudf::gather since we can move the null-mask separately. 
+ auto indices_view = indices->view(); + auto output = rmm::device_uvector(indices_view.size(), stream, mr); + thrust::gather(rmm::exec_policy_nosync(stream), + indices_view.begin(), // map first + indices_view.end(), // map last + key_sort_order.begin(), // input + output.data() // result (must not overlap map) + ); + auto null_count = indices_view.null_count(); + auto null_mask = indices->release().null_mask.release(); + return std::make_unique(std::move(output), std::move(*null_mask), null_count); } } // namespace detail diff --git a/cpp/src/groupby/sort/group_argmin.cu b/cpp/src/groupby/sort/group_argmin.cu index c4bed330b9f..dbfc375fc20 100644 --- a/cpp/src/groupby/sort/group_argmin.cu +++ b/cpp/src/groupby/sort/group_argmin.cu @@ -21,6 +21,7 @@ #include #include +#include #include @@ -42,22 +43,21 @@ std::unique_ptr group_argmin(column_view const& values, stream, mr); - // The functor returns the index of minimum in the sorted values. - // We need the index of minimum in the original unsorted values. - // So use indices to gather the sort order used to sort `values`. - // The values in data buffer of indices corresponding to null values was - // initialized to ARGMIN_SENTINEL. Using gather_if. - // This can't use gather because nulls in gathered column will not store ARGMIN_SENTINEL. - auto indices_view = indices->mutable_view(); - thrust::gather_if(rmm::exec_policy(stream), - indices_view.begin(), // map first - indices_view.end(), // map last - indices_view.begin(), // stencil - key_sort_order.begin(), // input - indices_view.begin(), // result - [] __device__(auto i) { return (i != cudf::detail::ARGMIN_SENTINEL); }); - - return indices; + // The functor returns the indices of minimums based on the sorted keys. + // We need the indices of minimums from the original unsorted keys + // so we use these and the key_sort_order to map to the correct indices. + // We do not use cudf::gather since we can move the null-mask separately. 
+ auto indices_view = indices->view(); + auto output = rmm::device_uvector(indices_view.size(), stream, mr); + thrust::gather(rmm::exec_policy_nosync(stream), + indices_view.begin(), // map first + indices_view.end(), // map last + key_sort_order.begin(), // input + output.data() // result (must not overlap map) + ); + auto null_count = indices_view.null_count(); + auto null_mask = indices->release().null_mask.release(); + return std::make_unique(std::move(output), std::move(*null_mask), null_count); } } // namespace detail diff --git a/cpp/src/groupby/sort/group_scan_util.cuh b/cpp/src/groupby/sort/group_scan_util.cuh index 86835ea8a67..5082ad01327 100644 --- a/cpp/src/groupby/sort/group_scan_util.cuh +++ b/cpp/src/groupby/sort/group_scan_util.cuh @@ -107,7 +107,10 @@ struct group_scan_functor() if (values.is_empty()) { return result; } auto result_table = mutable_table_view({*result}); - cudf::detail::initialize_with_identity(result_table, {K}, stream); + // Need an address of the aggregation kind to pass to the span + auto const kind = K; + cudf::detail::initialize_with_identity( + result_table, host_span(&kind, 1), stream); auto result_view = mutable_column_device_view::create(result->mutable_view(), stream); auto values_view = column_device_view::create(values, stream); diff --git a/cpp/src/groupby/sort/host_udf_aggregation.cpp b/cpp/src/groupby/sort/host_udf_aggregation.cpp new file mode 100644 index 00000000000..0da47e17f48 --- /dev/null +++ b/cpp/src/groupby/sort/host_udf_aggregation.cpp @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +namespace cudf { + +host_udf_base::data_attribute::data_attribute(data_attribute const& other) + : value{std::visit(cudf::detail::visitor_overload{[](auto const& val) { return value_type{val}; }, + [](std::unique_ptr const& val) { + return value_type{val->clone()}; + }}, + other.value)} +{ +} + +std::size_t host_udf_base::data_attribute::hash::operator()(data_attribute const& attr) const +{ + auto const hash_value = + std::visit(cudf::detail::visitor_overload{ + [](auto const& val) { return std::hash{}(static_cast(val)); }, + [](std::unique_ptr const& val) { return val->do_hash(); }}, + attr.value); + return std::hash{}(attr.value.index()) ^ hash_value; +} + +bool host_udf_base::data_attribute::equal_to::operator()(data_attribute const& lhs, + data_attribute const& rhs) const +{ + auto const& lhs_val = lhs.value; + auto const& rhs_val = rhs.value; + if (lhs_val.index() != rhs_val.index()) { return false; } + return std::visit( + cudf::detail::visitor_overload{ + [](auto const& lhs_val, auto const& rhs_val) { + if constexpr (std::is_same_v) { + return lhs_val == rhs_val; + } else { + return false; + } + }, + [](std::unique_ptr const& lhs_val, std::unique_ptr const& rhs_val) { + return lhs_val->is_equal(*rhs_val); + }}, + lhs_val, + rhs_val); +} + +namespace detail { + +host_udf_aggregation::host_udf_aggregation(std::unique_ptr udf_ptr_) + : aggregation{HOST_UDF}, udf_ptr{std::move(udf_ptr_)} +{ + CUDF_EXPECTS(udf_ptr != nullptr, "Invalid host_udf_base instance."); +} + 
+host_udf_aggregation::~host_udf_aggregation() = default; + +bool host_udf_aggregation::is_equal(aggregation const& _other) const +{ + if (!this->aggregation::is_equal(_other)) { return false; } + auto const& other = dynamic_cast(_other); + return udf_ptr->is_equal(*other.udf_ptr); +} + +size_t host_udf_aggregation::do_hash() const +{ + return this->aggregation::do_hash() ^ udf_ptr->do_hash(); +} + +std::unique_ptr host_udf_aggregation::clone() const +{ + return std::make_unique(udf_ptr->clone()); +} + +} // namespace detail + +template +std::unique_ptr make_host_udf_aggregation(std::unique_ptr udf_ptr_) +{ + return std::make_unique(std::move(udf_ptr_)); +} +template CUDF_EXPORT std::unique_ptr make_host_udf_aggregation( + std::unique_ptr); +template CUDF_EXPORT std::unique_ptr + make_host_udf_aggregation(std::unique_ptr); + +} // namespace cudf diff --git a/cpp/src/io/comp/common.hpp b/cpp/src/io/comp/common.hpp new file mode 100644 index 00000000000..a81ac60e03a --- /dev/null +++ b/cpp/src/io/comp/common.hpp @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace cudf::io::detail { + +/** + * @brief The size used for padding a data buffer's size to a multiple of the padding. + * + * Padding is necessary for input/output buffers of several compression/decompression kernels + * (inflate_kernel and nvcomp snappy). 
Such kernels operate on aligned data pointers, which require + * padding to the buffers so that the pointers can shift along the address space to satisfy their + * alignment requirement. + * + * In the meantime, it is not entirely clear why such padding is needed. We need to further + * investigate and implement a better fix rather than just padding the buffer. + * See https://github.com/rapidsai/cudf/issues/13605. + */ +constexpr std::size_t BUFFER_PADDING_MULTIPLE{8}; + +} // namespace cudf::io::detail diff --git a/cpp/src/io/comp/comp.cpp b/cpp/src/io/comp/comp.cpp index 2dda2287e09..26535bed43b 100644 --- a/cpp/src/io/comp/comp.cpp +++ b/cpp/src/io/comp/comp.cpp @@ -87,15 +87,14 @@ std::vector compress_snappy(host_span src, outputs[0] = d_dst; outputs.host_to_device_async(stream); - cudf::detail::hostdevice_vector hd_status(1, stream); + cudf::detail::hostdevice_vector hd_status(1, stream); hd_status[0] = {}; hd_status.host_to_device_async(stream); nvcomp::batched_compress(nvcomp::compression_type::SNAPPY, inputs, outputs, hd_status, stream); hd_status.device_to_host_sync(stream); - CUDF_EXPECTS(hd_status[0].status == cudf::io::compression_status::SUCCESS, - "snappy compression failed"); + CUDF_EXPECTS(hd_status[0].status == compression_status::SUCCESS, "snappy compression failed"); return cudf::detail::make_std_vector_sync(d_dst, stream); } diff --git a/cpp/src/io/comp/comp.hpp b/cpp/src/io/comp/comp.hpp index 652abbbeda6..e16f26e1f06 100644 --- a/cpp/src/io/comp/comp.hpp +++ b/cpp/src/io/comp/comp.hpp @@ -16,16 +16,34 @@ #pragma once +#include "common.hpp" + #include #include -#include -#include #include namespace CUDF_EXPORT cudf { namespace io::detail { +/** + * @brief Status of a compression/decompression operation. + */ +enum class compression_status : uint8_t { + SUCCESS, ///< Successful, output is valid + FAILURE, ///< Failed, output is invalid (e.g. 
input is unsupported in some way) + SKIPPED, ///< Operation skipped (if conversion, uncompressed data can be used) + OUTPUT_OVERFLOW, ///< Output buffer is too small; operation can succeed with larger output +}; + +/** + * @brief Descriptor of compression/decompression result. + */ +struct compression_result { + uint64_t bytes_written; + compression_status status; +}; + /** * @brief Compresses a system memory buffer. * diff --git a/cpp/src/io/comp/debrotli.cu b/cpp/src/io/comp/debrotli.cu index 72649dbe427..151f72d262e 100644 --- a/cpp/src/io/comp/debrotli.cu +++ b/cpp/src/io/comp/debrotli.cu @@ -63,8 +63,8 @@ THE SOFTWARE. #include -namespace cudf { -namespace io { +namespace cudf::io::detail { + constexpr uint32_t huffman_lookup_table_width = 8; constexpr int8_t brotli_code_length_codes = 18; constexpr uint32_t brotli_num_distance_short_codes = 16; @@ -2020,7 +2020,6 @@ CUDF_KERNEL void __launch_bounds__(block_size, 2) results[block_id].status = (s->error == 0) ? compression_status::SUCCESS : compression_status::FAILURE; // Return ext heap used by last block (statistics) - results[block_id].reserved = s->fb_size; } } @@ -2115,5 +2114,4 @@ void gpu_debrotli(device_span const> inputs, #endif } -} // namespace io -} // namespace cudf +} // namespace cudf::io::detail diff --git a/cpp/src/io/comp/gpuinflate.cu b/cpp/src/io/comp/gpuinflate.cu index 090ea1430b5..6e5ce4ce6c3 100644 --- a/cpp/src/io/comp/gpuinflate.cu +++ b/cpp/src/io/comp/gpuinflate.cu @@ -49,8 +49,7 @@ Mark Adler madler@alumni.caltech.edu #include -namespace cudf { -namespace io { +namespace cudf::io::detail { constexpr int max_bits = 15; // maximum bits in a code constexpr int max_l_codes = 286; // maximum number of literal/length codes @@ -1139,7 +1138,6 @@ CUDF_KERNEL void __launch_bounds__(block_size) default: return compression_status::FAILURE; } }(); - results[z].reserved = (int)(state->end - state->cur); // Here mainly for debug purposes } } @@ -1224,5 +1222,4 @@ void 
gpu_copy_uncompressed_blocks(device_span const> } } -} // namespace io -} // namespace cudf +} // namespace cudf::io::detail diff --git a/cpp/src/io/comp/gpuinflate.hpp b/cpp/src/io/comp/gpuinflate.hpp index 8bfca2b30df..4b09bd5a84c 100644 --- a/cpp/src/io/comp/gpuinflate.hpp +++ b/cpp/src/io/comp/gpuinflate.hpp @@ -16,6 +16,8 @@ #pragma once +#include "io/comp/comp.hpp" + #include #include #include @@ -24,44 +26,10 @@ #include -namespace cudf { -namespace io { - -/** - * @brief Status of a compression/decompression operation. - */ -enum class compression_status : uint8_t { - SUCCESS, ///< Successful, output is valid - FAILURE, ///< Failed, output is invalid (e.g. input is unsupported in some way) - SKIPPED, ///< Operation skipped (if conversion, uncompressed data can be used) - OUTPUT_OVERFLOW, ///< Output buffer is too small; operation can succeed with larger output -}; - -/** - * @brief Descriptor of compression/decompression result. - */ -struct compression_result { - uint64_t bytes_written; - compression_status status; - uint32_t reserved; -}; +namespace cudf::io::detail { enum class gzip_header_included { NO, YES }; -/** - * @brief The value used for padding a data buffer such that its size will be multiple of it. - * - * Padding is necessary for input/output buffers of several compression/decompression kernels - * (inflate_kernel and nvcomp snappy). Such kernels operate on aligned data pointers, which require - * padding to the buffers so that the pointers can shift along the address space to satisfy their - * alignment requirement. - * - * In the meantime, it is not entirely clear why such padding is needed. We need to further - * investigate and implement a better fix rather than just padding the buffer. - * See https://github.com/rapidsai/cudf/issues/13605. 
- */ -constexpr std::size_t BUFFER_PADDING_MULTIPLE{8}; - /** * @brief Interface for decompressing GZIP-compressed data * @@ -169,5 +137,4 @@ void gpu_snap(device_span const> inputs, device_span results, rmm::cuda_stream_view stream); -} // namespace io -} // namespace cudf +} // namespace cudf::io::detail diff --git a/cpp/src/io/comp/io_uncomp.hpp b/cpp/src/io/comp/io_uncomp.hpp index ca722a9b7ee..711a1c3274f 100644 --- a/cpp/src/io/comp/io_uncomp.hpp +++ b/cpp/src/io/comp/io_uncomp.hpp @@ -16,15 +16,13 @@ #pragma once +#include "common.hpp" + #include #include -#include -#include #include -using cudf::host_span; - namespace CUDF_EXPORT cudf { namespace io::detail { diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index d45c02f374f..3a4e315348c 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -30,7 +30,7 @@ #include -namespace cudf::io::nvcomp { +namespace cudf::io::detail::nvcomp { namespace { // Dispatcher for nvcompBatchedDecompressGetTempSizeEx @@ -478,4 +478,4 @@ std::optional compress_max_allowed_chunk_size(compression_type compressi } } -} // namespace cudf::io::nvcomp +} // namespace cudf::io::detail::nvcomp diff --git a/cpp/src/io/comp/nvcomp_adapter.cu b/cpp/src/io/comp/nvcomp_adapter.cu index 794d452ebf2..cf5996dfd93 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cu +++ b/cpp/src/io/comp/nvcomp_adapter.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -23,7 +23,7 @@ #include #include -namespace cudf::io::nvcomp { +namespace cudf::io::detail::nvcomp { batched_args create_batched_nvcomp_args(device_span const> inputs, device_span const> outputs, @@ -127,4 +127,4 @@ std::pair max_chunk_and_total_input_size(device_span @@ -27,7 +27,7 @@ #include -namespace cudf::io::nvcomp { +namespace cudf::io::detail::nvcomp { struct batched_args { rmm::device_uvector input_data_ptrs; @@ -76,4 +76,4 @@ void skip_unsupported_inputs(device_span input_sizes, std::pair max_chunk_and_total_input_size(device_span input_sizes, rmm::cuda_stream_view stream); -} // namespace cudf::io::nvcomp +} // namespace cudf::io::detail::nvcomp diff --git a/cpp/src/io/comp/nvcomp_adapter.hpp b/cpp/src/io/comp/nvcomp_adapter.hpp index 2e1cda2d6b7..5c402523168 100644 --- a/cpp/src/io/comp/nvcomp_adapter.hpp +++ b/cpp/src/io/comp/nvcomp_adapter.hpp @@ -16,7 +16,7 @@ #pragma once -#include "gpuinflate.hpp" +#include "io/comp/comp.hpp" #include #include @@ -25,7 +25,7 @@ #include -namespace cudf::io::nvcomp { +namespace cudf::io::detail::nvcomp { /** * @brief Device batch decompression of given type. * @@ -103,4 +103,4 @@ void batched_compress(compression_type compression, device_span results, rmm::cuda_stream_view stream); -} // namespace cudf::io::nvcomp +} // namespace cudf::io::detail::nvcomp diff --git a/cpp/src/io/comp/snap.cu b/cpp/src/io/comp/snap.cu index 7d4dcffa713..1443bfd38a2 100644 --- a/cpp/src/io/comp/snap.cu +++ b/cpp/src/io/comp/snap.cu @@ -19,8 +19,7 @@ #include -namespace cudf { -namespace io { +namespace cudf::io::detail { constexpr int hash_bits = 12; // TBD: Tentatively limits to 2-byte codes to prevent long copy search followed by long literal @@ -329,7 +328,6 @@ CUDF_KERNEL void __launch_bounds__(128) results[blockIdx.x].bytes_written = s->dst - s->dst_base; results[blockIdx.x].status = (s->dst > s->end) ? 
compression_status::FAILURE : compression_status::SUCCESS; - results[blockIdx.x].reserved = 0; } } @@ -345,5 +343,4 @@ void gpu_snap(device_span const> inputs, } } -} // namespace io -} // namespace cudf +} // namespace cudf::io::detail diff --git a/cpp/src/io/comp/statistics.cu b/cpp/src/io/comp/statistics.cu index faf967041bc..caee9145d2c 100644 --- a/cpp/src/io/comp/statistics.cu +++ b/cpp/src/io/comp/statistics.cu @@ -21,7 +21,7 @@ #include #include -namespace cudf::io { +namespace cudf::io::detail { writer_compression_statistics collect_compression_statistics( device_span const> inputs, @@ -61,4 +61,4 @@ writer_compression_statistics collect_compression_statistics( output_size_successful}; } -} // namespace cudf::io +} // namespace cudf::io::detail diff --git a/cpp/src/io/comp/unsnap.cu b/cpp/src/io/comp/unsnap.cu index 9b01272ac70..cf841c435a3 100644 --- a/cpp/src/io/comp/unsnap.cu +++ b/cpp/src/io/comp/unsnap.cu @@ -21,12 +21,10 @@ #include -namespace cudf { -namespace io { +namespace cudf::io::detail { constexpr int32_t batch_size = (1 << 5); constexpr int32_t batch_count = (1 << 2); constexpr int32_t prefetch_size = (1 << 9); // 512B, in 32B chunks -constexpr bool log_cyclecount = false; void __device__ busy_wait(size_t cycles) { @@ -647,7 +645,6 @@ CUDF_KERNEL void __launch_bounds__(block_size) auto cur = s->src.begin(); auto const end = s->src.end(); s->error = 0; - if (log_cyclecount) { s->tstart = clock(); } if (cur < end) { // Read uncompressed size (varint), limited to 32-bit uint32_t uncompressed_size = *cur++; @@ -705,11 +702,6 @@ CUDF_KERNEL void __launch_bounds__(block_size) results[strm_id].bytes_written = s->uncompressed_size - s->bytes_left; results[strm_id].status = (s->error == 0) ? 
compression_status::SUCCESS : compression_status::FAILURE; - if (log_cyclecount) { - results[strm_id].reserved = clock() - s->tstart; - } else { - results[strm_id].reserved = 0; - } } } @@ -724,5 +716,4 @@ void gpu_unsnap(device_span const> inputs, unsnap_kernel<128><<>>(inputs, outputs, results); } -} // namespace io -} // namespace cudf +} // namespace cudf::io::detail diff --git a/cpp/src/io/fst/logical_stack.cuh b/cpp/src/io/fst/logical_stack.cuh index 0f1fc7d572b..98641f2c893 100644 --- a/cpp/src/io/fst/logical_stack.cuh +++ b/cpp/src/io/fst/logical_stack.cuh @@ -513,6 +513,12 @@ void sparse_stack_op_to_top_of_stack(StackSymbolItT d_symbols, stream)); } + // Check if the last element of d_kv_operations is 0. If not, then we have a problem. + if (num_symbols_in && !supports_reset_op) { + StackOpT last_symbol = d_kv_ops_current.element(num_symbols_in - 1, stream); + CUDF_EXPECTS(last_symbol.stack_level == 0, "The logical stack is not empty!"); + } + // Stable radix sort, sorting by stack level of the operations d_kv_operations_unsigned = cub::DoubleBuffer{ reinterpret_cast(d_kv_operations.Current()), diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index f1c2826c62a..30a28a1cf98 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1473,10 +1473,11 @@ void get_stack_context(device_span json_in, to_stack_op::start_state, stream); - auto stack_ops_bufsize = d_num_stack_ops.value(stream); + // Copy back to actual number of stack operations + auto num_stack_ops = d_num_stack_ops.value(stream); // Sequence of stack symbols and their position in the original input (sparse representation) - rmm::device_uvector stack_ops{stack_ops_bufsize, stream}; - rmm::device_uvector stack_op_indices{stack_ops_bufsize, stream}; + rmm::device_uvector stack_ops{num_stack_ops, stream}; + rmm::device_uvector stack_op_indices{num_stack_ops, stream}; // Run bracket-brace FST to retrieve starting positions of structs 
and lists json_to_stack_ops_fst.Transduce(json_in.begin(), @@ -1487,9 +1488,6 @@ void get_stack_context(device_span json_in, to_stack_op::start_state, stream); - // Copy back to actual number of stack operations - auto const num_stack_ops = d_num_stack_ops.value(stream); - // Stack operations with indices are converted to top of the stack for each character in the input if (stack_behavior == stack_behavior_t::ResetOnDelimiter) { fst::sparse_stack_op_to_top_of_stack( diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu index 6dc3a9d793c..55a0ff7972f 100644 --- a/cpp/src/io/orc/dict_enc.cu +++ b/cpp/src/io/orc/dict_enc.cu @@ -181,9 +181,9 @@ CUDF_KERNEL void __launch_bounds__(block_size) for (size_type i = 0; i < dict.map_slots.size(); i += block_size) { if (t + i < dict.map_slots.size()) { - auto window = dict.map_slots.begin() + t + i; - // Collect all slots from each window. - for (auto& slot : *window) { + auto bucket = dict.map_slots.begin() + t + i; + // Collect all slots from each bucket. + for (auto& slot : *bucket) { auto const key = slot.first; if (key != KEY_SENTINEL) { auto loc = counter.fetch_add(1, memory_order_relaxed); diff --git a/cpp/src/io/orc/orc_gpu.hpp b/cpp/src/io/orc/orc_gpu.hpp index 0949fafe9a4..f4e75f78dec 100644 --- a/cpp/src/io/orc/orc_gpu.hpp +++ b/cpp/src/io/orc/orc_gpu.hpp @@ -16,7 +16,7 @@ #pragma once -#include "io/comp/gpuinflate.hpp" +#include "io/comp/comp.hpp" #include "io/statistics/statistics.cuh" #include "io/utilities/column_buffer.hpp" #include "orc.hpp" @@ -47,16 +47,16 @@ using slot_type = cuco::pair; auto constexpr map_cg_size = 1; ///< A CUDA Cooperative Group of 1 thread (set for best performance) to handle each subset. ///< Note: Adjust insert and find loops to use `cg::tile` if increasing this. -auto constexpr window_size = +auto constexpr bucket_size = 1; ///< Number of concurrent slots (set for best performance) handled by each thread. 
auto constexpr occupancy_factor = 1.43f; ///< cuCollections suggests using a hash map of size ///< N * (1/0.7) = 1.43 to target a 70% occupancy factor. -using storage_type = cuco::aow_storage, - cudf::detail::cuco_allocator>; +using storage_type = cuco::bucket_storage, + cudf::detail::cuco_allocator>; using storage_ref_type = typename storage_type::ref_type; -using window_type = typename storage_type::window_type; +using bucket_type = typename storage_type::bucket_type; using slot_type = cuco::pair; auto constexpr KEY_SENTINEL = size_type{-1}; @@ -73,14 +73,14 @@ struct CompressedStreamInfo { uint8_t const* compressed_data{}; // [in] base ptr to compressed stream data uint8_t* uncompressed_data{}; // [in] base ptr to uncompressed stream data or NULL if not known yet - size_t compressed_data_size{}; // [in] compressed data size for this stream - device_span* dec_in_ctl{}; // [in] input buffer to decompress - device_span* dec_out_ctl{}; // [in] output buffer to decompress into - device_span dec_res{}; // [in] results of decompression - device_span* copy_in_ctl{}; // [out] input buffer to copy - device_span* copy_out_ctl{}; // [out] output buffer to copy to - uint32_t num_compressed_blocks{}; // [in,out] number of entries in decctl(in), number of - // compressed blocks(out) + size_t compressed_data_size{}; // [in] compressed data size for this stream + device_span* dec_in_ctl{}; // [in] input buffer to decompress + device_span* dec_out_ctl{}; // [in] output buffer to decompress into + device_span dec_res{}; // [in] results of decompression + device_span* copy_in_ctl{}; // [out] input buffer to copy + device_span* copy_out_ctl{}; // [out] output buffer to copy to + uint32_t num_compressed_blocks{}; // [in,out] number of entries in decctl(in), number of + // compressed blocks(out) uint32_t num_uncompressed_blocks{}; // [in,out] number of entries in dec_in_ctl(in), number of // uncompressed blocks(out) uint64_t max_uncompressed_size{}; // [out] maximum uncompressed data 
size of stream @@ -193,7 +193,7 @@ struct StripeStream { */ struct stripe_dictionary { // input - device_span map_slots; // hash map (windows) storage + device_span map_slots; // hash map (buckets) storage uint32_t column_idx = 0; // column index size_type start_row = 0; // first row in the stripe size_type start_rowgroup = 0; // first rowgroup in the stripe @@ -414,7 +414,7 @@ std::optional CompressOrcDataStreams( bool collect_statistics, device_2dspan strm_desc, device_2dspan enc_streams, - device_span comp_res, + device_span comp_res, rmm::cuda_stream_view stream); /** diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index 0081ed30d17..b661bb4ff90 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -269,7 +269,7 @@ rmm::device_buffer decompress_stripe_data( num_uncompressed_blocks}; device_span> copy_out_view{inflate_out.data() + num_compressed_blocks, num_uncompressed_blocks}; - gpu_copy_uncompressed_blocks(copy_in_view, copy_out_view, stream); + cudf::io::detail::gpu_copy_uncompressed_blocks(copy_in_view, copy_out_view, stream); } // Copy without stream sync, thus need to wait for stream sync below to access. diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 07172b6b7f7..79ecca0ca99 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -14,6 +14,7 @@ * limitations under the License. 
*/ +#include "io/comp/gpuinflate.hpp" #include "io/comp/nvcomp_adapter.hpp" #include "io/utilities/block_utils.cuh" #include "io/utilities/time_utils.cuh" @@ -44,7 +45,11 @@ namespace io { namespace orc { namespace gpu { +namespace nvcomp = cudf::io::detail::nvcomp; + using cudf::detail::device_2dspan; +using cudf::io::detail::compression_result; +using cudf::io::detail::compression_status; constexpr int scratch_buffer_size = 512 * 4; constexpr int compact_streams_block_size = 1024; @@ -1385,7 +1390,7 @@ std::optional CompressOrcDataStreams( if (compression == SNAPPY) { try { if (nvcomp::is_compression_disabled(nvcomp::compression_type::SNAPPY)) { - gpu_snap(comp_in, comp_out, comp_res, stream); + cudf::io::detail::gpu_snap(comp_in, comp_out, comp_res, stream); } else { nvcomp::batched_compress( nvcomp::compression_type::SNAPPY, comp_in, comp_out, comp_res, stream); @@ -1429,7 +1434,7 @@ std::optional CompressOrcDataStreams( strm_desc, comp_in, comp_out, comp_res, compressed_data, comp_blk_size, max_comp_blk_size); if (collect_statistics) { - return cudf::io::collect_compression_statistics(comp_in, comp_res, stream); + return cudf::io::detail::collect_compression_statistics(comp_in, comp_res, stream); } else { return std::nullopt; } diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 6b9c19368dc..ce868b83c04 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -71,6 +71,8 @@ namespace cudf::io::orc::detail { +namespace nvcomp = cudf::io::detail::nvcomp; + template [[nodiscard]] constexpr int varint_size(T val) { @@ -2023,8 +2025,8 @@ size_t max_compression_output_size(CompressionKind compression_kind, uint32_t co { if (compression_kind == NONE) return 0; - return compress_max_output_chunk_size(to_nvcomp_compression_type(compression_kind), - compression_blocksize); + return nvcomp::compress_max_output_chunk_size(to_nvcomp_compression_type(compression_kind), + compression_blocksize); } std::unique_ptr 
make_table_meta(table_view const& input) diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index 5ca94a0bec7..1f45356d36f 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -211,7 +211,7 @@ struct map_find_fn { template CUDF_KERNEL void __launch_bounds__(block_size) - populate_chunk_hash_maps_kernel(device_span const map_storage, + populate_chunk_hash_maps_kernel(device_span const map_storage, cudf::detail::device_2dspan frags) { auto const col_idx = blockIdx.y; @@ -240,7 +240,7 @@ CUDF_KERNEL void __launch_bounds__(block_size) template CUDF_KERNEL void __launch_bounds__(block_size) - collect_map_entries_kernel(device_span const map_storage, + collect_map_entries_kernel(device_span const map_storage, device_span chunks) { auto& chunk = chunks[blockIdx.x]; @@ -252,11 +252,11 @@ CUDF_KERNEL void __launch_bounds__(block_size) if (t == 0) { new (&counter) cuda::atomic{0}; } __syncthreads(); - // Iterate over all windows in the map. + // Iterate over all buckets in the map. for (; t < chunk.dict_map_size; t += block_size) { - auto window = map_storage.data() + chunk.dict_map_offset + t; - // Collect all slots from each window. - for (auto& slot : *window) { + auto bucket = map_storage.data() + chunk.dict_map_offset + t; + // Collect all slots from each bucket. 
+ for (auto& slot : *bucket) { auto const key = slot.first; if (key != KEY_SENTINEL) { auto const loc = counter.fetch_add(1, memory_order_relaxed); @@ -273,7 +273,7 @@ CUDF_KERNEL void __launch_bounds__(block_size) template CUDF_KERNEL void __launch_bounds__(block_size) - get_dictionary_indices_kernel(device_span const map_storage, + get_dictionary_indices_kernel(device_span const map_storage, cudf::detail::device_2dspan frags) { auto const col_idx = blockIdx.y; @@ -303,7 +303,7 @@ CUDF_KERNEL void __launch_bounds__(block_size) s_ck_start_val_idx); } -void populate_chunk_hash_maps(device_span const map_storage, +void populate_chunk_hash_maps(device_span const map_storage, cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream) { @@ -312,7 +312,7 @@ void populate_chunk_hash_maps(device_span const map_storage, <<>>(map_storage, frags); } -void collect_map_entries(device_span const map_storage, +void collect_map_entries(device_span const map_storage, device_span chunks, rmm::cuda_stream_view stream) { @@ -321,7 +321,7 @@ void collect_map_entries(device_span const map_storage, <<>>(map_storage, chunks); } -void get_dictionary_indices(device_span const map_storage, +void get_dictionary_indices(device_span const map_storage, cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream) { diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index e9558735929..a1edd21f8a2 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -51,6 +51,9 @@ namespace { using ::cudf::detail::device_2dspan; +using cudf::io::detail::compression_result; +using cudf::io::detail::compression_status; + constexpr int encode_block_size = 128; constexpr int rle_buffer_size = 2 * encode_block_size; constexpr int num_encode_warps = encode_block_size / cudf::detail::warp_size; diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh index 7c09764da2d..800875f7448 100644 --- a/cpp/src/io/parquet/parquet_gpu.cuh 
+++ b/cpp/src/io/parquet/parquet_gpu.cuh @@ -34,7 +34,7 @@ using slot_type = cuco::pair; auto constexpr map_cg_size = 1; ///< A CUDA Cooperative Group of 1 thread (set for best performance) to handle each subset. ///< Note: Adjust insert and find loops to use `cg::tile` if increasing this. -auto constexpr window_size = +auto constexpr bucket_size = 1; ///< Number of concurrent slots (set for best performance) handled by each thread. auto constexpr occupancy_factor = 1.43f; ///< cuCollections suggests using a hash map of size ///< N * (1/0.7) = 1.43 to target a 70% occupancy factor. @@ -43,12 +43,12 @@ auto constexpr KEY_SENTINEL = key_type{-1}; auto constexpr VALUE_SENTINEL = mapped_type{-1}; auto constexpr SCOPE = cuda::thread_scope_block; -using storage_type = cuco::aow_storage, - cudf::detail::cuco_allocator>; +using storage_type = cuco::bucket_storage, + cudf::detail::cuco_allocator>; using storage_ref_type = typename storage_type::ref_type; -using window_type = typename storage_type::window_type; +using bucket_type = typename storage_type::bucket_type; /** * @brief Return the byte length of parquet dtypes that are physically represented by INT32 @@ -100,7 +100,7 @@ inline size_type __device__ row_to_value_idx(size_type idx, * @param frags Column fragments * @param stream CUDA stream to use */ -void populate_chunk_hash_maps(device_span const map_storage, +void populate_chunk_hash_maps(device_span const map_storage, cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream); @@ -111,7 +111,7 @@ void populate_chunk_hash_maps(device_span const map_storage, * @param chunks Flat span of chunks to compact hash maps for * @param stream CUDA stream to use */ -void collect_map_entries(device_span const map_storage, +void collect_map_entries(device_span const map_storage, device_span chunks, rmm::cuda_stream_view stream); @@ -128,7 +128,7 @@ void collect_map_entries(device_span const map_storage, * @param frags Column fragments * @param stream CUDA stream to use */ 
-void get_dictionary_indices(device_span const map_storage, +void get_dictionary_indices(device_span const map_storage, cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream); diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index ce9d48693ec..b2563ab5065 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -17,7 +17,7 @@ #pragma once #include "error.hpp" -#include "io/comp/gpuinflate.hpp" +#include "io/comp/comp.hpp" #include "io/parquet/parquet.hpp" #include "io/parquet/parquet_common.hpp" #include "io/statistics/statistics.cuh" @@ -599,12 +599,12 @@ struct EncColumnChunk { */ struct EncPage { // all pointers at the top to keep things properly aligned - uint8_t* page_data; //!< Ptr to uncompressed page - uint8_t* compressed_data; //!< Ptr to compressed page - EncColumnChunk* chunk; //!< Chunk that this page belongs to - compression_result* comp_res; //!< Ptr to compression result - uint32_t* def_histogram; //!< Histogram of counts for each definition level - uint32_t* rep_histogram; //!< Histogram of counts for each repetition level + uint8_t* page_data; //!< Ptr to uncompressed page + uint8_t* compressed_data; //!< Ptr to compressed page + EncColumnChunk* chunk; //!< Chunk that this page belongs to + cudf::io::detail::compression_result* comp_res; //!< Ptr to compression result + uint32_t* def_histogram; //!< Histogram of counts for each definition level + uint32_t* rep_histogram; //!< Histogram of counts for each repetition level // put this here in case it's ever made 64-bit encode_kernel_mask kernel_mask; //!< Mask used to control which encoding kernels to run // the rest can be 4 byte aligned @@ -1023,7 +1023,7 @@ void EncodePages(device_span pages, bool write_v2_headers, device_span> comp_in, device_span> comp_out, - device_span comp_res, + device_span comp_res, rmm::cuda_stream_view stream); /** @@ -1046,7 +1046,7 @@ void DecideCompression(device_span chunks, 
rmm::cuda_stream_view * @param[in] stream CUDA stream to use */ void EncodePageHeaders(device_span pages, - device_span comp_res, + device_span comp_res, device_span page_stats, statistics_chunk const* chunk_stats, rmm::cuda_stream_view stream); diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index b0cbabf1c12..9047ff9169b 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -265,7 +265,6 @@ class stats_expression_converter : public ast::detail::expression_transformer { */ std::reference_wrapper visit(ast::literal const& expr) override { - _stats_expr = std::reference_wrapper(expr); return expr; } @@ -278,7 +277,6 @@ class stats_expression_converter : public ast::detail::expression_transformer { "Statistics AST supports only left table"); CUDF_EXPECTS(expr.get_column_index() < _num_columns, "Column index cannot be more than number of columns in the table"); - _stats_expr = std::reference_wrapper(expr); return expr; } @@ -307,6 +305,9 @@ class stats_expression_converter : public ast::detail::expression_transformer { CUDF_EXPECTS(dynamic_cast(&operands[1].get()) != nullptr, "Second operand of binary operation with column reference must be a literal"); v->accept(*this); + // Push literal into the ast::tree + auto const& literal = + _stats_expr.push(*dynamic_cast(&operands[1].get())); auto const col_index = v->get_column_index(); switch (op) { /* transform to stats conditions. 
op(col, literal) @@ -318,34 +319,33 @@ class stats_expression_converter : public ast::detail::expression_transformer { col1 <= val --> vmin <= val */ case ast_operator::EQUAL: { - auto const& vmin = _col_ref.emplace_back(col_index * 2); - auto const& vmax = _col_ref.emplace_back(col_index * 2 + 1); - auto const& op1 = - _operators.emplace_back(ast_operator::LESS_EQUAL, vmin, operands[1].get()); - auto const& op2 = - _operators.emplace_back(ast_operator::GREATER_EQUAL, vmax, operands[1].get()); - _operators.emplace_back(ast::ast_operator::LOGICAL_AND, op1, op2); + auto const& vmin = _stats_expr.push(ast::column_reference{col_index * 2}); + auto const& vmax = _stats_expr.push(ast::column_reference{col_index * 2 + 1}); + _stats_expr.push(ast::operation{ + ast::ast_operator::LOGICAL_AND, + _stats_expr.push(ast::operation{ast_operator::GREATER_EQUAL, vmax, literal}), + _stats_expr.push(ast::operation{ast_operator::LESS_EQUAL, vmin, literal})}); break; } case ast_operator::NOT_EQUAL: { - auto const& vmin = _col_ref.emplace_back(col_index * 2); - auto const& vmax = _col_ref.emplace_back(col_index * 2 + 1); - auto const& op1 = _operators.emplace_back(ast_operator::NOT_EQUAL, vmin, vmax); - auto const& op2 = - _operators.emplace_back(ast_operator::NOT_EQUAL, vmax, operands[1].get()); - _operators.emplace_back(ast_operator::LOGICAL_OR, op1, op2); + auto const& vmin = _stats_expr.push(ast::column_reference{col_index * 2}); + auto const& vmax = _stats_expr.push(ast::column_reference{col_index * 2 + 1}); + _stats_expr.push(ast::operation{ + ast_operator::LOGICAL_OR, + _stats_expr.push(ast::operation{ast_operator::NOT_EQUAL, vmin, vmax}), + _stats_expr.push(ast::operation{ast_operator::NOT_EQUAL, vmax, literal})}); break; } case ast_operator::LESS: [[fallthrough]]; case ast_operator::LESS_EQUAL: { - auto const& vmin = _col_ref.emplace_back(col_index * 2); - _operators.emplace_back(op, vmin, operands[1].get()); + auto const& vmin = _stats_expr.push(ast::column_reference{col_index 
* 2}); + _stats_expr.push(ast::operation{op, vmin, literal}); break; } case ast_operator::GREATER: [[fallthrough]]; case ast_operator::GREATER_EQUAL: { - auto const& vmax = _col_ref.emplace_back(col_index * 2 + 1); - _operators.emplace_back(op, vmax, operands[1].get()); + auto const& vmax = _stats_expr.push(ast::column_reference{col_index * 2 + 1}); + _stats_expr.push(ast::operation{op, vmax, literal}); break; } default: CUDF_FAIL("Unsupported operation in Statistics AST"); @@ -353,13 +353,12 @@ class stats_expression_converter : public ast::detail::expression_transformer { } else { auto new_operands = visit_operands(operands); if (cudf::ast::detail::ast_operator_arity(op) == 2) { - _operators.emplace_back(op, new_operands.front(), new_operands.back()); + _stats_expr.push(ast::operation{op, new_operands.front(), new_operands.back()}); } else if (cudf::ast::detail::ast_operator_arity(op) == 1) { - _operators.emplace_back(op, new_operands.front()); + _stats_expr.push(ast::operation{op, new_operands.front()}); } } - _stats_expr = std::reference_wrapper(_operators.back()); - return std::reference_wrapper(_operators.back()); + return _stats_expr.back(); } /** @@ -369,7 +368,7 @@ class stats_expression_converter : public ast::detail::expression_transformer { */ [[nodiscard]] std::reference_wrapper get_stats_expr() const { - return _stats_expr.value().get(); + return _stats_expr.back(); } private: @@ -383,10 +382,8 @@ class stats_expression_converter : public ast::detail::expression_transformer { } return transformed_operands; } - std::optional> _stats_expr; + ast::tree _stats_expr; size_type _num_columns; - std::list _col_ref; - std::list _operators; }; } // namespace diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 27312a4da89..933be889b1a 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -15,6 +15,8 @@ */ #include "compact_protocol_reader.hpp" +#include 
"io/comp/comp.hpp" +#include "io/comp/gpuinflate.hpp" #include "io/comp/nvcomp_adapter.hpp" #include "io/utilities/time_utils.cuh" #include "reader_impl.hpp" @@ -44,6 +46,10 @@ namespace cudf::io::parquet::detail { namespace { +namespace nvcomp = cudf::io::detail::nvcomp; +using cudf::io::detail::compression_result; +using cudf::io::detail::compression_status; + struct split_info { row_range rows; int64_t split_pos; @@ -795,14 +801,16 @@ std::vector compute_page_splits_by_row(device_span 0) { - debrotli_scratch.resize(get_gpu_debrotli_scratch_size(codec.num_pages), stream); + debrotli_scratch.resize(cudf::io::detail::get_gpu_debrotli_scratch_size(codec.num_pages), + stream); } } // Dispatch batches of pages to decompress for each codec. // Buffer needs to be padded, required by `gpuDecodePageData`. rmm::device_buffer decomp_pages( - cudf::util::round_up_safe(total_decomp_size, BUFFER_PADDING_MULTIPLE), stream); + cudf::util::round_up_safe(total_decomp_size, cudf::io::detail::BUFFER_PADDING_MULTIPLE), + stream); auto comp_in = cudf::detail::make_empty_host_vector>(num_comp_pages, stream); @@ -874,8 +882,11 @@ std::vector compute_page_splits_by_row(device_span compute_page_splits_by_row(device_span @@ -251,8 +252,8 @@ void generate_depth_remappings( if (source->is_device_read_preferred(io_size)) { // Buffer needs to be padded. // Required by `gpuDecodePageData`. - page_data[chunk] = - rmm::device_buffer(cudf::util::round_up_safe(io_size, BUFFER_PADDING_MULTIPLE), stream); + page_data[chunk] = rmm::device_buffer( + cudf::util::round_up_safe(io_size, cudf::io::detail::BUFFER_PADDING_MULTIPLE), stream); auto fut_read_size = source->device_read_async( io_offset, io_size, static_cast(page_data[chunk].data()), stream); read_tasks.emplace_back(std::move(fut_read_size)); @@ -261,7 +262,8 @@ void generate_depth_remappings( // Buffer needs to be padded. // Required by `gpuDecodePageData`. 
page_data[chunk] = rmm::device_buffer( - cudf::util::round_up_safe(read_buffer->size(), BUFFER_PADDING_MULTIPLE), stream); + cudf::util::round_up_safe(read_buffer->size(), cudf::io::detail::BUFFER_PADDING_MULTIPLE), + stream); CUDF_CUDA_TRY(cudaMemcpyAsync(page_data[chunk].data(), read_buffer->data(), read_buffer->size(), @@ -550,7 +552,7 @@ void decode_page_headers(pass_intermediate_data& pass, { CUDF_FUNC_RANGE(); - auto iter = thrust::make_counting_iterator(0); + auto iter = thrust::counting_iterator(0); rmm::device_uvector chunk_page_counts(pass.chunks.size() + 1, stream); thrust::transform_exclusive_scan( rmm::exec_policy_nosync(stream), @@ -562,7 +564,7 @@ void decode_page_headers(pass_intermediate_data& pass, return static_cast( i >= num_chunks ? 0 : chunks[i].num_data_pages + chunks[i].num_dict_pages); }), - 0, + size_t{0}, thrust::plus{}); rmm::device_uvector d_chunk_page_info(pass.chunks.size(), stream); thrust::for_each(rmm::exec_policy_nosync(stream), diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 188e6a8c0d8..6b1a20701f9 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -23,6 +23,7 @@ #include "compact_protocol_reader.hpp" #include "compact_protocol_writer.hpp" #include "interop/decimal_conversion_utilities.cuh" +#include "io/comp/gpuinflate.hpp" #include "io/comp/nvcomp_adapter.hpp" #include "io/parquet/parquet.hpp" #include "io/parquet/parquet_gpu.hpp" @@ -1302,7 +1303,7 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, } else { chunk.use_dictionary = true; chunk.dict_map_size = - static_cast(cuco::make_window_extent( + static_cast(cuco::make_bucket_extent( static_cast(occupancy_factor * chunk.num_values))); chunk.dict_map_offset = total_map_storage_size; total_map_storage_size += chunk.dict_map_size; @@ -1317,7 +1318,7 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, total_map_storage_size, cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, 
stream}}; // Create a span of non-const map_storage as map_storage_ref takes in a non-const pointer. - device_span const map_storage_data{map_storage.data(), total_map_storage_size}; + device_span const map_storage_data{map_storage.data(), total_map_storage_size}; // Synchronize chunks.host_to_device_async(stream); diff --git a/cpp/src/io/parquet/writer_impl_helpers.cpp b/cpp/src/io/parquet/writer_impl_helpers.cpp index 396d44c0763..f15ea1f3c37 100644 --- a/cpp/src/io/parquet/writer_impl_helpers.cpp +++ b/cpp/src/io/parquet/writer_impl_helpers.cpp @@ -21,6 +21,8 @@ #include "writer_impl_helpers.hpp" +#include "io/comp/nvcomp_adapter.hpp" + #include #include #include diff --git a/cpp/src/io/parquet/writer_impl_helpers.hpp b/cpp/src/io/parquet/writer_impl_helpers.hpp index a85411594e9..14a9a0ed5b7 100644 --- a/cpp/src/io/parquet/writer_impl_helpers.hpp +++ b/cpp/src/io/parquet/writer_impl_helpers.hpp @@ -20,11 +20,11 @@ */ #pragma once -#include "io/comp/nvcomp_adapter.hpp" #include "parquet_common.hpp" #include #include +#include namespace cudf::io::parquet::detail { @@ -42,7 +42,7 @@ Compression to_parquet_compression(compression_type compression); * @param codec Compression codec * @return Translated nvcomp compression type */ -nvcomp::compression_type to_nvcomp_compression_type(Compression codec); +cudf::io::detail::nvcomp::compression_type to_nvcomp_compression_type(Compression codec); /** * @brief Function that computes input alignment requirements for the given compression type. diff --git a/cpp/src/io/text/bgzip_data_chunk_source.cu b/cpp/src/io/text/bgzip_data_chunk_source.cu index 06069630685..162da62ef03 100644 --- a/cpp/src/io/text/bgzip_data_chunk_source.cu +++ b/cpp/src/io/text/bgzip_data_chunk_source.cu @@ -14,6 +14,7 @@ * limitations under the License. 
*/ +#include "io/comp/gpuinflate.hpp" #include "io/comp/nvcomp_adapter.hpp" #include "io/text/device_data_chunks.hpp" @@ -41,6 +42,8 @@ namespace cudf::io::text { namespace { +namespace nvcomp = cudf::io::detail::nvcomp; + /** * @brief Transforms offset tuples of the form [compressed_begin, compressed_end, * decompressed_begin, decompressed_end] into span tuples of the form [compressed_device_span, @@ -73,7 +76,8 @@ class bgzip_data_chunk_reader : public data_chunk_reader { { // Buffer needs to be padded. // Required by `inflate_kernel`. - device.resize(cudf::util::round_up_safe(host.size(), BUFFER_PADDING_MULTIPLE), stream); + device.resize(cudf::util::round_up_safe(host.size(), cudf::io::detail::BUFFER_PADDING_MULTIPLE), + stream); cudf::detail::cuda_memcpy_async( device_span{device}.subspan(0, host.size()), host, stream); } @@ -94,7 +98,7 @@ class bgzip_data_chunk_reader : public data_chunk_reader { rmm::device_uvector d_decompressed_offsets; rmm::device_uvector> d_compressed_spans; rmm::device_uvector> d_decompressed_spans; - rmm::device_uvector d_decompression_results; + rmm::device_uvector d_decompression_results; std::size_t compressed_size_with_headers{}; std::size_t max_decompressed_size{}; // this is usually equal to decompressed_size() @@ -152,16 +156,16 @@ class bgzip_data_chunk_reader : public data_chunk_reader { gpuinflate(d_compressed_spans, d_decompressed_spans, d_decompression_results, - gzip_header_included::NO, + cudf::io::detail::gzip_header_included::NO, stream); } else { - cudf::io::nvcomp::batched_decompress(cudf::io::nvcomp::compression_type::DEFLATE, - d_compressed_spans, - d_decompressed_spans, - d_decompression_results, - max_decompressed_size, - decompressed_size(), - stream); + nvcomp::batched_decompress(nvcomp::compression_type::DEFLATE, + d_compressed_spans, + d_decompressed_spans, + d_decompression_results, + max_decompressed_size, + decompressed_size(), + stream); } } is_decompressed = true; diff --git 
a/cpp/src/join/mixed_join_size_kernel.cuh b/cpp/src/join/mixed_join_size_kernel.cuh index 4049ccf35e1..98170ed719a 100644 --- a/cpp/src/join/mixed_join_size_kernel.cuh +++ b/cpp/src/join/mixed_join_size_kernel.cuh @@ -62,8 +62,8 @@ CUDF_KERNEL void __launch_bounds__(block_size) intermediate_storage + (threadIdx.x * device_expression_data.num_intermediates); std::size_t thread_counter{0}; - cudf::size_type const start_idx = threadIdx.x + blockIdx.x * block_size; - cudf::size_type const stride = block_size * gridDim.x; + auto const start_idx = cudf::detail::grid_1d::global_thread_id(); + auto const stride = cudf::detail::grid_1d::grid_stride(); cudf::size_type const left_num_rows = left_table.num_rows(); cudf::size_type const right_num_rows = right_table.num_rows(); auto const outer_num_rows = (swap_tables ? right_num_rows : left_num_rows); @@ -80,7 +80,7 @@ CUDF_KERNEL void __launch_bounds__(block_size) auto count_equality = pair_expression_equality{ evaluator, thread_intermediate_storage, swap_tables, equality_probe}; - for (cudf::size_type outer_row_index = start_idx; outer_row_index < outer_num_rows; + for (auto outer_row_index = start_idx; outer_row_index < outer_num_rows; outer_row_index += stride) { auto query_pair = pair_func(outer_row_index); if (join_type == join_kind::LEFT_JOIN || join_type == join_kind::FULL_JOIN) { diff --git a/cpp/src/json/json_path.cu b/cpp/src/json/json_path.cu index fd8629ed6f3..e6e01b9c9fe 100644 --- a/cpp/src/json/json_path.cu +++ b/cpp/src/json/json_path.cu @@ -928,7 +928,7 @@ __launch_bounds__(block_size) CUDF_KERNEL get_json_object_options options) { auto tid = cudf::detail::grid_1d::global_thread_id(); - auto const stride = cudf::thread_index_type{blockDim.x} * cudf::thread_index_type{gridDim.x}; + auto const stride = cudf::detail::grid_1d::grid_stride(); size_type warp_valid_count{0}; diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 6984b10eefd..2cc50a4c08b 100644 --- a/cpp/tests/CMakeLists.txt +++ 
b/cpp/tests/CMakeLists.txt @@ -132,6 +132,8 @@ ConfigureTest( groupby/groupby_test_util.cpp groupby/groups_tests.cpp groupby/histogram_tests.cpp + groupby/host_udf_example_tests.cu + groupby/host_udf_tests.cpp groupby/keys_tests.cpp groupby/lists_tests.cpp groupby/m2_tests.cpp diff --git a/cpp/tests/groupby/host_udf_example_tests.cu b/cpp/tests/groupby/host_udf_example_tests.cu new file mode 100644 index 00000000000..a454bd692fc --- /dev/null +++ b/cpp/tests/groupby/host_udf_example_tests.cu @@ -0,0 +1,245 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +namespace { +/** + * @brief A host-based UDF implementation for groupby. + * + * For each group of values, the aggregation computes + * `(group_idx + 1) * group_sum_of_squares - group_max * group_sum`. + */ +struct host_udf_groupby_example : cudf::host_udf_base { + host_udf_groupby_example() = default; + + [[nodiscard]] data_attribute_set_t get_required_data() const override + { + // We need grouped values, group offsets, group labels, and also results from groups' + // MAX and SUM aggregations. 
+ return {groupby_data_attribute::GROUPED_VALUES, + groupby_data_attribute::GROUP_OFFSETS, + groupby_data_attribute::GROUP_LABELS, + cudf::make_max_aggregation(), + cudf::make_sum_aggregation()}; + } + + [[nodiscard]] output_t get_empty_output( + [[maybe_unused]] std::optional output_dtype, + [[maybe_unused]] rmm::cuda_stream_view stream, + [[maybe_unused]] rmm::device_async_resource_ref mr) const override + { + return cudf::make_empty_column( + cudf::data_type{cudf::type_to_id()}); + } + + [[nodiscard]] output_t operator()(input_map_t const& input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const override + { + auto const& values = + std::get(input.at(groupby_data_attribute::GROUPED_VALUES)); + return cudf::type_dispatcher(values.type(), groupby_fn{this}, input, stream, mr); + } + + [[nodiscard]] std::size_t do_hash() const override + { + // Just return the same hash for all instances of this class. + return std::size_t{12345}; + } + + [[nodiscard]] bool is_equal(host_udf_base const& other) const override + { + // Just check if the other object is also instance of this class. + return dynamic_cast(&other) != nullptr; + } + + [[nodiscard]] std::unique_ptr clone() const override + { + return std::make_unique(); + } + + struct groupby_fn { + // Store pointer to the parent class so we can call its functions. + host_udf_groupby_example const* parent; + + // For simplicity, this example only accepts double input and always produces double output. + using InputType = double; + using OutputType = double; + + template )> + output_t operator()(Args...) 
const + { + CUDF_FAIL("Unsupported input type."); + } + + template )> + output_t operator()(input_map_t const& input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const + { + auto const& values = + std::get(input.at(groupby_data_attribute::GROUPED_VALUES)); + if (values.size() == 0) { return parent->get_empty_output(std::nullopt, stream, mr); } + + auto const offsets = std::get>( + input.at(groupby_data_attribute::GROUP_OFFSETS)); + CUDF_EXPECTS(offsets.size() > 0, "Invalid offsets."); + auto const num_groups = static_cast(offsets.size()) - 1; + auto const group_indices = std::get>( + input.at(groupby_data_attribute::GROUP_LABELS)); + auto const group_max = std::get( + input.at(cudf::make_max_aggregation())); + auto const group_sum = std::get( + input.at(cudf::make_sum_aggregation())); + + auto const values_dv_ptr = cudf::column_device_view::create(values, stream); + auto const output = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, + num_groups, + cudf::mask_state::UNALLOCATED, + stream, + mr); + + // Store row index if it is valid, otherwise store a negative value denoting a null row. 
+ rmm::device_uvector valid_idx(num_groups, stream); + + thrust::transform( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_groups), + thrust::make_zip_iterator(output->mutable_view().begin(), valid_idx.begin()), + transform_fn{*values_dv_ptr, + offsets, + group_indices, + group_max.begin(), + group_sum.begin()}); + + auto const valid_idx_cv = cudf::column_view{ + cudf::data_type{cudf::type_id::INT32}, num_groups, valid_idx.begin(), nullptr, 0}; + return std::move(cudf::gather(cudf::table_view{{output->view()}}, + valid_idx_cv, + cudf::out_of_bounds_policy::NULLIFY, + stream, + mr) + ->release() + .front()); + } + + struct transform_fn { + cudf::column_device_view values; + cudf::device_span offsets; + cudf::device_span group_indices; + InputType const* group_max; + InputType const* group_sum; + + thrust::tuple __device__ operator()(cudf::size_type idx) const + { + auto const start = offsets[idx]; + auto const end = offsets[idx + 1]; + + auto constexpr invalid_idx = cuda::std::numeric_limits::lowest(); + if (start == end) { return {OutputType{0}, invalid_idx}; } + + auto sum_sqr = OutputType{0}; + bool has_valid{false}; + for (auto i = start; i < end; ++i) { + if (values.is_null(i)) { continue; } + has_valid = true; + auto const val = static_cast(values.element(i)); + sum_sqr += val * val; + } + + if (!has_valid) { return {OutputType{0}, invalid_idx}; } + return {static_cast(group_indices[start] + 1) * sum_sqr - + static_cast(group_max[idx]) * static_cast(group_sum[idx]), + idx}; + } + }; + }; +}; + +} // namespace + +using doubles_col = cudf::test::fixed_width_column_wrapper; +using int32s_col = cudf::test::fixed_width_column_wrapper; + +struct HostUDFGroupbyExampleTest : cudf::test::BaseFixture {}; + +TEST_F(HostUDFGroupbyExampleTest, SimpleInput) +{ + double constexpr null = 0.0; + auto const keys = int32s_col{0, 1, 2, 0, 1, 2, 0, 1, 2, 0}; + auto const vals = doubles_col{{0.0, null, 2.0, 3.0, null, 5.0, 
null, null, 8.0, 9.0}, + {true, false, true, true, false, true, false, false, true, true}}; + auto agg = cudf::make_host_udf_aggregation( + std::make_unique()); + + std::vector requests; + requests.emplace_back(); + requests[0].values = vals; + requests[0].aggregations.push_back(std::move(agg)); + cudf::groupby::groupby gb_obj( + cudf::table_view({keys}), cudf::null_policy::INCLUDE, cudf::sorted::NO, {}, {}); + + auto const grp_result = gb_obj.aggregate(requests, cudf::test::get_default_stream()); + auto const& result = grp_result.second[0].results[0]; + + // Output type of groupby is double. + // Values grouped by keys: [ {0, 3, null, 9}, {null, null, null}, {2, 5, 8} ] + // Group sum_sqr: [ 90, null, 93 ] + // Group max: [ 9, null, 8 ] + // Group sum: [ 12, null, 15 ] + // Output: [ 1 * 90 - 9 * 12, null, 3 * 93 - 8 * 15 ] + auto const expected = doubles_col{{-18.0, null, 159.0}, {true, false, true}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); +} + +TEST_F(HostUDFGroupbyExampleTest, EmptyInput) +{ + auto const keys = int32s_col{}; + auto const vals = doubles_col{}; + auto agg = cudf::make_host_udf_aggregation( + std::make_unique()); + + std::vector requests; + requests.emplace_back(); + requests[0].values = vals; + requests[0].aggregations.push_back(std::move(agg)); + cudf::groupby::groupby gb_obj( + cudf::table_view({keys}), cudf::null_policy::INCLUDE, cudf::sorted::NO, {}, {}); + + auto const grp_result = gb_obj.aggregate(requests, cudf::test::get_default_stream()); + auto const& result = grp_result.second[0].results[0]; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(vals, *result); +} diff --git a/cpp/tests/groupby/host_udf_tests.cpp b/cpp/tests/groupby/host_udf_tests.cpp new file mode 100644 index 00000000000..1a0f68c0c6c --- /dev/null +++ b/cpp/tests/groupby/host_udf_tests.cpp @@ -0,0 +1,241 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include + +#include +#include + +namespace { +/** + * @brief A host-based UDF implementation used for unit tests. + */ +struct host_udf_test_base : cudf::host_udf_base { + int test_location_line; // the location where testing is called + bool* test_run; // to check if the test is accidentally skipped + data_attribute_set_t input_attrs; + + host_udf_test_base(int test_location_line_, bool* test_run_, data_attribute_set_t input_attrs_) + : test_location_line{test_location_line_}, + test_run{test_run_}, + input_attrs(std::move(input_attrs_)) + { + } + + [[nodiscard]] data_attribute_set_t get_required_data() const override { return input_attrs; } + + // This is the main testing function, which checks for the correctness of input data. + // The rests are just to satisfy the interface. 
+ [[nodiscard]] output_t operator()(input_map_t const& input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const override + { + SCOPED_TRACE("Test instance created at line: " + std::to_string(test_location_line)); + + test_data_attributes(input, stream, mr); + + *test_run = true; // test is run successfully + return get_empty_output(std::nullopt, stream, mr); + } + + [[nodiscard]] output_t get_empty_output( + [[maybe_unused]] std::optional output_dtype, + [[maybe_unused]] rmm::cuda_stream_view stream, + [[maybe_unused]] rmm::device_async_resource_ref mr) const override + { + // Unused function - dummy output. + return cudf::make_empty_column(cudf::data_type{cudf::type_id::INT32}); + } + + [[nodiscard]] std::size_t do_hash() const override { return 0; } + [[nodiscard]] bool is_equal(host_udf_base const& other) const override { return true; } + + // The main test function, which must be implemented for each kind of aggregations + // (groupby/reduction/segmented_reduction). + virtual void test_data_attributes(input_map_t const& input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const = 0; +}; + +/** + * @brief A host-based UDF implementation used for unit tests for groupby aggregation. 
+ */ +struct host_udf_groupby_test : host_udf_test_base { + host_udf_groupby_test(int test_location_line_, + bool* test_run_, + data_attribute_set_t input_attrs_ = {}) + : host_udf_test_base(test_location_line_, test_run_, std::move(input_attrs_)) + { + } + + [[nodiscard]] std::unique_ptr clone() const override + { + return std::make_unique(test_location_line, test_run, input_attrs); + } + + void test_data_attributes(input_map_t const& input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const override + { + data_attribute_set_t check_attrs = input_attrs; + if (check_attrs.empty()) { + check_attrs = data_attribute_set_t{groupby_data_attribute::INPUT_VALUES, + groupby_data_attribute::GROUPED_VALUES, + groupby_data_attribute::SORTED_GROUPED_VALUES, + groupby_data_attribute::NUM_GROUPS, + groupby_data_attribute::GROUP_OFFSETS, + groupby_data_attribute::GROUP_LABELS}; + } + EXPECT_EQ(input.size(), check_attrs.size()); + for (auto const& attr : check_attrs) { + EXPECT_TRUE(input.count(attr) > 0); + EXPECT_TRUE(std::holds_alternative(attr.value) || + std::holds_alternative>(attr.value)); + if (std::holds_alternative(attr.value)) { + switch (std::get(attr.value)) { + case groupby_data_attribute::INPUT_VALUES: + EXPECT_TRUE(std::holds_alternative(input.at(attr))); + break; + case groupby_data_attribute::GROUPED_VALUES: + EXPECT_TRUE(std::holds_alternative(input.at(attr))); + break; + case groupby_data_attribute::SORTED_GROUPED_VALUES: + EXPECT_TRUE(std::holds_alternative(input.at(attr))); + break; + case groupby_data_attribute::NUM_GROUPS: + EXPECT_TRUE(std::holds_alternative(input.at(attr))); + break; + case groupby_data_attribute::GROUP_OFFSETS: + EXPECT_TRUE( + std::holds_alternative>(input.at(attr))); + break; + case groupby_data_attribute::GROUP_LABELS: + EXPECT_TRUE( + std::holds_alternative>(input.at(attr))); + break; + default:; + } + } else { // std::holds_alternative>(attr.value) + EXPECT_TRUE(std::holds_alternative(input.at(attr))); + } + 
} + } +}; + +/** + * @brief Get a random subset of input data attributes. + */ +cudf::host_udf_base::data_attribute_set_t get_subset( + cudf::host_udf_base::data_attribute_set_t const& attrs) +{ + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution size_distr(1, attrs.size() - 1); + auto const subset_size = size_distr(gen); + auto const elements = + std::vector(attrs.begin(), attrs.end()); + std::uniform_int_distribution idx_distr(0, attrs.size() - 1); + cudf::host_udf_base::data_attribute_set_t output; + while (output.size() < subset_size) { + output.insert(elements[idx_distr(gen)]); + } + return output; +} + +/** + * @brief Generate a random aggregation object from {min, max, sum, product}. + */ +std::unique_ptr get_random_agg() +{ + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution distr(1, 4); + switch (distr(gen)) { + case 1: return cudf::make_min_aggregation(); + case 2: return cudf::make_max_aggregation(); + case 3: return cudf::make_sum_aggregation(); + case 4: return cudf::make_product_aggregation(); + default: CUDF_UNREACHABLE("This should not be reached."); + } + return nullptr; +} + +} // namespace + +using int32s_col = cudf::test::fixed_width_column_wrapper; + +// Number of randomly testing on the input data attributes. +// For each test, a subset of data attributes will be randomly generated from all the possible input +// data attributes. The input data corresponding to that subset passed from libcudf will be tested +// for correctness. 
+constexpr int NUM_RANDOM_TESTS = 20; + +struct HostUDFTest : cudf::test::BaseFixture {}; + +TEST_F(HostUDFTest, GroupbyAllInput) +{ + bool test_run = false; + auto const keys = int32s_col{0, 1, 2}; + auto const vals = int32s_col{0, 1, 2}; + auto agg = cudf::make_host_udf_aggregation( + std::make_unique(__LINE__, &test_run)); + + std::vector requests; + requests.emplace_back(); + requests[0].values = vals; + requests[0].aggregations.push_back(std::move(agg)); + cudf::groupby::groupby gb_obj( + cudf::table_view({keys}), cudf::null_policy::INCLUDE, cudf::sorted::NO, {}, {}); + [[maybe_unused]] auto const grp_result = + gb_obj.aggregate(requests, cudf::test::get_default_stream()); + EXPECT_TRUE(test_run); +} + +TEST_F(HostUDFTest, GroupbySomeInput) +{ + auto const keys = int32s_col{0, 1, 2}; + auto const vals = int32s_col{0, 1, 2}; + auto const all_attrs = cudf::host_udf_base::data_attribute_set_t{ + cudf::host_udf_base::groupby_data_attribute::INPUT_VALUES, + cudf::host_udf_base::groupby_data_attribute::GROUPED_VALUES, + cudf::host_udf_base::groupby_data_attribute::SORTED_GROUPED_VALUES, + cudf::host_udf_base::groupby_data_attribute::NUM_GROUPS, + cudf::host_udf_base::groupby_data_attribute::GROUP_OFFSETS, + cudf::host_udf_base::groupby_data_attribute::GROUP_LABELS}; + for (int i = 0; i < NUM_RANDOM_TESTS; ++i) { + bool test_run = false; + auto input_attrs = get_subset(all_attrs); + input_attrs.insert(get_random_agg()); + auto agg = cudf::make_host_udf_aggregation( + std::make_unique(__LINE__, &test_run, std::move(input_attrs))); + + std::vector requests; + requests.emplace_back(); + requests[0].values = vals; + requests[0].aggregations.push_back(std::move(agg)); + cudf::groupby::groupby gb_obj( + cudf::table_view({keys}), cudf::null_policy::INCLUDE, cudf::sorted::NO, {}, {}); + [[maybe_unused]] auto const grp_result = + gb_obj.aggregate(requests, cudf::test::get_default_stream()); + EXPECT_TRUE(test_run); + } +} diff --git a/cpp/tests/io/comp/decomp_test.cpp 
b/cpp/tests/io/comp/decomp_test.cpp index 54262dc3b44..5bbe8b63c47 100644 --- a/cpp/tests/io/comp/decomp_test.cpp +++ b/cpp/tests/io/comp/decomp_test.cpp @@ -30,6 +30,9 @@ #include using cudf::device_span; +using cudf::io::detail::compression_result; +using cudf::io::detail::compression_status; +namespace nvcomp = cudf::io::detail::nvcomp; /** * @brief Base test fixture for decompression @@ -61,7 +64,7 @@ struct DecompressTest : public cudf::test::BaseFixture { inf_out[0] = dst; inf_out.host_to_device_async(stream); - cudf::detail::hostdevice_vector inf_stat(1, stream); + cudf::detail::hostdevice_vector inf_stat(1, stream); inf_stat[0] = {}; inf_stat.host_to_device_async(stream); @@ -69,7 +72,7 @@ struct DecompressTest : public cudf::test::BaseFixture { CUDF_CUDA_TRY(cudaMemcpyAsync( decompressed.data(), dst.data(), dst.size(), cudaMemcpyDefault, stream.value())); inf_stat.device_to_host_sync(stream); - ASSERT_EQ(inf_stat[0].status, cudf::io::compression_status::SUCCESS); + ASSERT_EQ(inf_stat[0].status, compression_status::SUCCESS); } }; @@ -79,13 +82,13 @@ struct DecompressTest : public cudf::test::BaseFixture { struct GzipDecompressTest : public DecompressTest { void dispatch(device_span> d_inf_in, device_span> d_inf_out, - device_span d_inf_stat) + device_span d_inf_stat) { - cudf::io::gpuinflate(d_inf_in, - d_inf_out, - d_inf_stat, - cudf::io::gzip_header_included::YES, - cudf::get_default_stream()); + cudf::io::detail::gpuinflate(d_inf_in, + d_inf_out, + d_inf_stat, + cudf::io::detail::gzip_header_included::YES, + cudf::get_default_stream()); } }; @@ -95,9 +98,9 @@ struct GzipDecompressTest : public DecompressTest { struct SnappyDecompressTest : public DecompressTest { void dispatch(device_span> d_inf_in, device_span> d_inf_out, - device_span d_inf_stat) + device_span d_inf_stat) { - cudf::io::gpu_unsnap(d_inf_in, d_inf_out, d_inf_stat, cudf::get_default_stream()); + cudf::io::detail::gpu_unsnap(d_inf_in, d_inf_out, d_inf_stat, cudf::get_default_stream()); } 
}; @@ -107,17 +110,17 @@ struct SnappyDecompressTest : public DecompressTest { struct BrotliDecompressTest : public DecompressTest { void dispatch(device_span> d_inf_in, device_span> d_inf_out, - device_span d_inf_stat) + device_span d_inf_stat) { - rmm::device_buffer d_scratch{cudf::io::get_gpu_debrotli_scratch_size(1), + rmm::device_buffer d_scratch{cudf::io::detail::get_gpu_debrotli_scratch_size(1), cudf::get_default_stream()}; - cudf::io::gpu_debrotli(d_inf_in, - d_inf_out, - d_inf_stat, - d_scratch.data(), - d_scratch.size(), - cudf::get_default_stream()); + cudf::io::detail::gpu_debrotli(d_inf_in, + d_inf_out, + d_inf_stat, + d_scratch.data(), + d_scratch.size(), + cudf::get_default_stream()); } }; @@ -181,8 +184,8 @@ TEST_F(BrotliDecompressTest, HelloWorld) TEST_F(NvcompConfigTest, Compression) { - using cudf::io::nvcomp::compression_type; - auto const& comp_disabled = cudf::io::nvcomp::is_compression_disabled; + using nvcomp::compression_type; + auto const& comp_disabled = nvcomp::is_compression_disabled; EXPECT_FALSE(comp_disabled(compression_type::DEFLATE, {true, true})); // all integrations enabled required @@ -201,8 +204,8 @@ TEST_F(NvcompConfigTest, Compression) TEST_F(NvcompConfigTest, Decompression) { - using cudf::io::nvcomp::compression_type; - auto const& decomp_disabled = cudf::io::nvcomp::is_decompression_disabled; + using nvcomp::compression_type; + auto const& decomp_disabled = nvcomp::is_decompression_disabled; EXPECT_FALSE(decomp_disabled(compression_type::DEFLATE, {true, true})); // all integrations enabled required diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp index 37a750330fa..23ca5734ded 100644 --- a/cpp/tests/io/json/json_test.cpp +++ b/cpp/tests/io/json/json_test.cpp @@ -3450,4 +3450,15 @@ TEST_P(JsonCompressedIOTest, BasicJsonLines) CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), float64_wrapper{{1.1, 2.2, 3.3}}); } +TEST_F(JsonReaderTest, MismatchedBeginEndTokens) +{ + std::string data = 
R"({"not_valid": "json)"; + auto opts = + cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()}) + .lines(true) + .recovery_mode(cudf::io::json_recovery_mode_t::FAIL) + .build(); + EXPECT_THROW(cudf::io::read_json(opts), cudf::logic_error); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index fce99187516..2209a30149d 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -40,6 +40,8 @@ #include #include +namespace nvcomp = cudf::io::detail::nvcomp; + template using column_wrapper = std::conditional_t, @@ -1135,7 +1137,7 @@ TEST_F(OrcReaderTest, SingleInputs) TEST_F(OrcReaderTest, zstdCompressionRegression) { - if (cudf::io::nvcomp::is_decompression_disabled(cudf::io::nvcomp::compression_type::ZSTD)) { + if (nvcomp::is_decompression_disabled(nvcomp::compression_type::ZSTD)) { GTEST_SKIP() << "Newer nvCOMP version is required"; } @@ -1700,8 +1702,8 @@ TEST_F(OrcMetadataReaderTest, TestNested) TEST_F(OrcReaderTest, ZstdMaxCompressionRate) { - if (cudf::io::nvcomp::is_decompression_disabled(cudf::io::nvcomp::compression_type::ZSTD) or - cudf::io::nvcomp::is_compression_disabled(cudf::io::nvcomp::compression_type::ZSTD)) { + if (nvcomp::is_decompression_disabled(nvcomp::compression_type::ZSTD) or + nvcomp::is_compression_disabled(nvcomp::compression_type::ZSTD)) { GTEST_SKIP() << "Newer nvCOMP version is required"; } diff --git a/java/src/main/java/ai/rapids/cudf/Aggregation.java b/java/src/main/java/ai/rapids/cudf/Aggregation.java index 379750bb0b7..2276b223740 100644 --- a/java/src/main/java/ai/rapids/cudf/Aggregation.java +++ b/java/src/main/java/ai/rapids/cudf/Aggregation.java @@ -62,15 +62,16 @@ enum Kind { LAG(23), PTX(24), CUDA(25), - M2(26), - MERGE_M2(27), - RANK(28), - DENSE_RANK(29), - PERCENT_RANK(30), - TDIGEST(31), // This can take a delta argument for accuracy level - MERGE_TDIGEST(32), // This can take a delta argument for accuracy level - HISTOGRAM(33), 
- MERGE_HISTOGRAM(34); + HOST_UDF(26), + M2(27), + MERGE_M2(28), + RANK(29), + DENSE_RANK(30), + PERCENT_RANK(31), + TDIGEST(32), // This can take a delta argument for accuracy level + MERGE_TDIGEST(33), // This can take a delta argument for accuracy level + HISTOGRAM(34), + MERGE_HISTOGRAM(35); final int nativeId; @@ -385,6 +386,35 @@ public boolean equals(Object other) { } } + static final class HostUDFAggregation extends Aggregation { + private final HostUDFWrapper wrapper; + + private HostUDFAggregation(HostUDFWrapper wrapper) { + super(Kind.HOST_UDF); + this.wrapper = wrapper; + } + + @Override + long createNativeInstance() { + return Aggregation.createHostUDFAgg(wrapper.udfNativeHandle); + } + + @Override + public int hashCode() { + return 31 * kind.hashCode() + wrapper.hashCode(); + } + + @Override + public boolean equals(Object other) { + if (this == other) { + return true; + } else if (other instanceof HostUDFAggregation) { + return wrapper.equals(((HostUDFAggregation) other).wrapper); + } + return false; + } + } + protected final Kind kind; protected Aggregation(Kind kind) { @@ -837,6 +867,15 @@ static MergeSetsAggregation mergeSets(NullEquality nullEquality, NaNEquality nan return new MergeSetsAggregation(nullEquality, nanEquality); } + /** + * Host UDF aggregation, to execute a host-side user-defined function (UDF). + * @param wrapper The wrapper for the native host UDF instance. + * @return A new HostUDFAggregation instance + */ + static HostUDFAggregation hostUDF(HostUDFWrapper wrapper) { + return new HostUDFAggregation(wrapper); + } + static final class LeadAggregation extends LeadLagAggregation { private LeadAggregation(int offset, ColumnVector defaultOutput) { super(Kind.LEAD, offset, defaultOutput); @@ -990,4 +1029,9 @@ static MergeHistogramAggregation mergeHistogram() { * Create a TDigest aggregation. */ private static native long createTDigestAgg(int kind, int delta); + + /** + * Create a HOST_UDF aggregation. 
+ */ + private static native long createHostUDFAgg(long udfNativeHandle); } diff --git a/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java b/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java index 0fae33927b6..27966ddfdd4 100644 --- a/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java +++ b/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java @@ -277,6 +277,15 @@ public static GroupByAggregation mergeSets() { return new GroupByAggregation(Aggregation.mergeSets()); } + /** + * Execute an aggregation using a host-side user-defined function (UDF). + * @param wrapper The wrapper for the native host UDF instance. + * @return A new GroupByAggregation instance + */ + public static GroupByAggregation hostUDF(HostUDFWrapper wrapper) { + return new GroupByAggregation(Aggregation.hostUDF(wrapper)); + } + /** * Merge the partial sets produced by multiple CollectSetAggregations. * diff --git a/java/src/main/java/ai/rapids/cudf/HostUDFWrapper.java b/java/src/main/java/ai/rapids/cudf/HostUDFWrapper.java new file mode 100644 index 00000000000..0b6ecf2e140 --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/HostUDFWrapper.java @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package ai.rapids.cudf; + +/** + * A wrapper around native host UDF aggregations. + *

+ * This class is used to store the native handle of a host UDF aggregation and is used as + * a proxy object to compute hash code and compare two host UDF aggregations for equality. + *

+ * A new host UDF aggregation implementation must extend this class and override the + * {@code hashCode} and {@code equals} methods for such purposes. + */ +public abstract class HostUDFWrapper { + public final long udfNativeHandle; + + public HostUDFWrapper(long udfNativeHandle) { + this.udfNativeHandle = udfNativeHandle; + } +} diff --git a/java/src/main/native/src/AggregationJni.cpp b/java/src/main/native/src/AggregationJni.cpp index c40f1c55500..dd41c677761 100644 --- a/java/src/main/native/src/AggregationJni.cpp +++ b/java/src/main/native/src/AggregationJni.cpp @@ -17,6 +17,7 @@ #include "cudf_jni_apis.hpp" #include +#include extern "C" { @@ -80,25 +81,28 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation_createNoParamAgg(JNIEnv* // case 23: LAG // case 24: PTX // case 25: CUDA - case 26: // M2 + // case 26: HOST_UDF + case 27: // M2 return cudf::make_m2_aggregation(); - case 27: // MERGE_M2 + case 28: // MERGE_M2 return cudf::make_merge_m2_aggregation(); - case 28: // RANK + case 29: // RANK return cudf::make_rank_aggregation( cudf::rank_method::MIN, {}, cudf::null_policy::INCLUDE); - case 29: // DENSE_RANK + case 30: // DENSE_RANK return cudf::make_rank_aggregation( cudf::rank_method::DENSE, {}, cudf::null_policy::INCLUDE); - case 30: // ANSI SQL PERCENT_RANK + case 31: // ANSI SQL PERCENT_RANK return cudf::make_rank_aggregation(cudf::rank_method::MIN, {}, cudf::null_policy::INCLUDE, {}, cudf::rank_percentage::ONE_NORMALIZED); - case 33: // HISTOGRAM + // case 32: TDIGEST + // case 33: MERGE_TDIGEST + case 34: // HISTOGRAM return cudf::make_histogram_aggregation(); - case 34: // MERGE_HISTOGRAM + case 35: // MERGE_HISTOGRAM return cudf::make_merge_histogram_aggregation(); default: throw std::logic_error("Unsupported No Parameter Aggregation Operation"); @@ -160,10 +164,10 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation_createTDigestAgg(JNIEnv* std::unique_ptr ret; // These numbers come from Aggregation.java and must stay in sync switch 
(kind) { - case 31: // TDIGEST + case 32: // TDIGEST ret = cudf::make_tdigest_aggregation(delta); break; - case 32: // MERGE_TDIGEST + case 33: // MERGE_TDIGEST ret = cudf::make_merge_tdigest_aggregation(delta); break; default: throw std::logic_error("Unsupported TDigest Aggregation Operation"); @@ -296,4 +300,18 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation_createMergeSetsAgg(JNIEn CATCH_STD(env, 0); } +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation_createHostUDFAgg(JNIEnv* env, + jclass class_object, + jlong udf_native_handle) +{ + JNI_NULL_CHECK(env, udf_native_handle, "udf_native_handle is null", 0); + try { + cudf::jni::auto_set_device(env); + auto const udf_ptr = reinterpret_cast(udf_native_handle); + auto output = cudf::make_host_udf_aggregation(udf_ptr->clone()); + return reinterpret_cast(output.release()); + } + CATCH_STD(env, 0); +} + } // extern "C" diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 410fd57691e..ff6fba1c3e8 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. # ============================================================================= -set(cython_sources column.pyx groupby.pyx scalar.pyx strings_udf.pyx types.pyx utils.pyx) +set(cython_sources column.pyx scalar.pyx strings_udf.pyx types.pyx) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/cudf/cudf/_lib/__init__.pxd b/python/cudf/cudf/_lib/__init__.pxd deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index 6b5a7814e48..11473d60698 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -1,12 +1,2 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -import numpy as np - -from . 
import ( - groupby, - strings_udf, -) - -MAX_COLUMN_SIZE = np.iinfo(np.int32).max -MAX_COLUMN_SIZE_STR = "INT32_MAX" -MAX_STRING_COLUMN_BYTES = np.iinfo(np.int32).max -MAX_STRING_COLUMN_BYTES_STR = "INT32_MAX" +from . import strings_udf diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 245a5d03981..f7dcd89ea48 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -31,12 +31,12 @@ from rmm.pylibrmm.device_buffer cimport DeviceBuffer from cudf._lib.types cimport ( dtype_from_column_view, - dtype_to_data_type, dtype_to_pylibcudf_type, ) from cudf._lib.types import dtype_from_pylibcudf_column +from pylibcudf cimport DataType as plc_DataType cimport pylibcudf.libcudf.copying as cpp_copying cimport pylibcudf.libcudf.types as libcudf_types cimport pylibcudf.libcudf.unary as libcudf_unary @@ -361,7 +361,7 @@ cdef class Column: col = self data_dtype = col.dtype - cdef libcudf_types.data_type dtype = dtype_to_data_type(data_dtype) + cdef plc_DataType dtype = dtype_to_pylibcudf_type(data_dtype) cdef libcudf_types.size_type offset = self.offset cdef vector[mutable_column_view] children cdef void* data @@ -398,7 +398,7 @@ cdef class Column: self._data = None return mutable_column_view( - dtype, + dtype.c_obj, self.size, data, mask, @@ -424,7 +424,7 @@ cdef class Column: col = self data_dtype = col.dtype - cdef libcudf_types.data_type dtype = dtype_to_data_type(data_dtype) + cdef plc_DataType dtype = dtype_to_pylibcudf_type(data_dtype) cdef libcudf_types.size_type offset = self.offset cdef vector[column_view] children cdef void* data @@ -450,7 +450,7 @@ cdef class Column: cdef libcudf_types.size_type c_null_count = null_count return column_view( - dtype, + dtype.c_obj, self.size, data, mask, diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx deleted file mode 100644 index 80a77ef2267..00000000000 --- a/python/cudf/cudf/_lib/groupby.pyx +++ /dev/null @@ -1,281 +0,0 @@ -# 
Copyright (c) 2020-2024, NVIDIA CORPORATION. -from functools import singledispatch - -from pandas.errors import DataError - -from cudf.api.types import _is_categorical_dtype, is_string_dtype -from cudf.core.buffer import acquire_spill_lock -from cudf.core.dtypes import ( - CategoricalDtype, - DecimalDtype, - IntervalDtype, - ListDtype, - StructDtype, -) - -from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.utils cimport columns_from_pylibcudf_table - -from cudf._lib.scalar import as_device_scalar - -import pylibcudf - -from cudf.core._internals.aggregation import make_aggregation - -# The sets below define the possible aggregations that can be performed on -# different dtypes. These strings must be elements of the AggregationKind enum. -# The libcudf infrastructure exists for "COLLECT" support on -# categoricals, but the dtype support in python does not. -_CATEGORICAL_AGGS = {"COUNT", "NUNIQUE", "SIZE", "UNIQUE"} -_STRING_AGGS = { - "COLLECT", - "COUNT", - "MAX", - "MIN", - "NTH", - "NUNIQUE", - "SIZE", - "UNIQUE", -} -_LIST_AGGS = {"COLLECT"} -_STRUCT_AGGS = {"COLLECT", "CORRELATION", "COVARIANCE"} -_INTERVAL_AGGS = {"COLLECT"} -_DECIMAL_AGGS = { - "ARGMIN", - "ARGMAX", - "COLLECT", - "COUNT", - "MAX", - "MIN", - "NTH", - "NUNIQUE", - "SUM", -} - - -@singledispatch -def get_valid_aggregation(dtype): - if is_string_dtype(dtype): - return _STRING_AGGS - return "ALL" - - -@get_valid_aggregation.register -def _(dtype: ListDtype): - return _LIST_AGGS - - -@get_valid_aggregation.register -def _(dtype: CategoricalDtype): - return _CATEGORICAL_AGGS - - -@get_valid_aggregation.register -def _(dtype: ListDtype): - return _LIST_AGGS - - -@get_valid_aggregation.register -def _(dtype: StructDtype): - return _STRUCT_AGGS - - -@get_valid_aggregation.register -def _(dtype: IntervalDtype): - return _INTERVAL_AGGS - - -@get_valid_aggregation.register -def _(dtype: DecimalDtype): - return _DECIMAL_AGGS - - -cdef class GroupBy: - cdef dict __dict__ - - def __init__(self, 
keys, dropna=True): - with acquire_spill_lock() as spill_lock: - self._groupby = pylibcudf.groupby.GroupBy( - pylibcudf.table.Table([c.to_pylibcudf(mode="read") for c in keys]), - pylibcudf.types.NullPolicy.EXCLUDE if dropna - else pylibcudf.types.NullPolicy.INCLUDE - ) - - # We spill lock the columns while this GroupBy instance is alive. - self._spill_lock = spill_lock - - def groups(self, list values): - """ - Perform a sort groupby, using the keys used to construct the Groupby as the key - columns and ``values`` as the value columns. - - Parameters - ---------- - values: list of Columns - The value columns - - Returns - ------- - offsets: list of integers - Integer offsets such that offsets[i+1] - offsets[i] - represents the size of group `i`. - grouped_keys: list of Columns - The grouped key columns - grouped_values: list of Columns - The grouped value columns - """ - offsets, grouped_keys, grouped_values = self._groupby.get_groups( - pylibcudf.table.Table([c.to_pylibcudf(mode="read") for c in values]) - if values else None - ) - - return ( - offsets, - columns_from_pylibcudf_table(grouped_keys), - ( - columns_from_pylibcudf_table(grouped_values) - if grouped_values is not None else [] - ), - ) - - def aggregate(self, values, aggregations): - """ - Parameters - ---------- - values : Frame - aggregations - A dict mapping column names in `Frame` to a list of aggregations - to perform on that column - - Each aggregation may be specified as: - - a string (e.g., "max") - - a lambda/function - - Returns - ------- - Frame of aggregated values - """ - included_aggregations = [] - column_included = [] - requests = [] - for i, (col, aggs) in enumerate(zip(values, aggregations)): - valid_aggregations = get_valid_aggregation(col.dtype) - included_aggregations_i = [] - col_aggregations = [] - for agg in aggs: - str_agg = str(agg) - if ( - is_string_dtype(col) - and agg not in _STRING_AGGS - and - ( - str_agg in {"cumsum", "cummin", "cummax"} - or not ( - any(a in str_agg 
for a in { - "count", - "max", - "min", - "first", - "last", - "nunique", - "unique", - "nth" - }) - or (agg is list) - ) - ) - ): - raise TypeError( - f"function is not supported for this dtype: {agg}" - ) - elif ( - _is_categorical_dtype(col) - and agg not in _CATEGORICAL_AGGS - and ( - str_agg in {"cumsum", "cummin", "cummax"} - or - not ( - any(a in str_agg for a in {"count", "max", "min", "unique"}) - ) - ) - ): - raise TypeError( - f"{col.dtype} type does not support {agg} operations" - ) - - agg_obj = make_aggregation(agg) - if valid_aggregations == "ALL" or agg_obj.kind in valid_aggregations: - included_aggregations_i.append((agg, agg_obj.kind)) - col_aggregations.append(agg_obj.c_obj) - included_aggregations.append(included_aggregations_i) - if col_aggregations: - requests.append(pylibcudf.groupby.GroupByRequest( - col.to_pylibcudf(mode="read"), col_aggregations - )) - column_included.append(i) - - if not requests and any(len(v) > 0 for v in aggregations): - raise DataError("All requested aggregations are unsupported.") - - keys, results = self._groupby.scan(requests) if \ - _is_all_scan_aggregate(aggregations) else self._groupby.aggregate(requests) - - result_columns = [[] for _ in range(len(values))] - for i, result in zip(column_included, results): - result_columns[i] = columns_from_pylibcudf_table(result) - - return result_columns, columns_from_pylibcudf_table(keys), included_aggregations - - def shift(self, list values, int periods, list fill_values): - keys, shifts = self._groupby.shift( - pylibcudf.table.Table([c.to_pylibcudf(mode="read") for c in values]), - [periods] * len(values), - [ - ( as_device_scalar(val, dtype=col.dtype)).c_value - for val, col in zip(fill_values, values) - ], - ) - - return columns_from_pylibcudf_table(shifts), columns_from_pylibcudf_table(keys) - - def replace_nulls(self, list values, object method): - _, replaced = self._groupby.replace_nulls( - pylibcudf.table.Table([c.to_pylibcudf(mode="read") for c in values]), - [ - 
pylibcudf.replace.ReplacePolicy.PRECEDING - if method == 'ffill' else pylibcudf.replace.ReplacePolicy.FOLLOWING - ] * len(values), - ) - - return columns_from_pylibcudf_table(replaced) - - -_GROUPBY_SCANS = {"cumcount", "cumsum", "cummin", "cummax", "cumprod", "rank"} - - -def _is_all_scan_aggregate(all_aggs): - """ - Returns true if all are scan aggregations. - Raises - ------ - NotImplementedError - If both reduction aggregations and scan aggregations are present. - """ - - def get_name(agg): - return agg.__name__ if callable(agg) else agg - - all_scan = all( - get_name(agg_name) in _GROUPBY_SCANS for aggs in all_aggs - for agg_name in aggs - ) - any_scan = any( - get_name(agg_name) in _GROUPBY_SCANS for aggs in all_aggs - for agg_name in aggs - ) - - if not all_scan and any_scan: - raise NotImplementedError( - "Cannot perform both aggregation and scan in one operation" - ) - return all_scan and any_scan diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 3d3bdd730a8..40bd50acf16 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -10,24 +10,22 @@ from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.utility cimport move -import pylibcudf +import pylibcudf as plc import cudf -from cudf._lib.types import LIBCUDF_TO_SUPPORTED_NUMPY_TYPES from cudf.core.dtypes import ListDtype, StructDtype +from cudf._lib.types import PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES +from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id from cudf.core.missing import NA, NaT -cimport pylibcudf.libcudf.types as libcudf_types # We currently need this cimport because some of the implementations here # access the c_obj of the scalar, and because we need to be able to call # pylibcudf.Scalar.from_libcudf. Both of those are temporarily acceptable until # DeviceScalar is phased out entirely from cuDF Cython (at which point # cudf.Scalar will be directly backed by pylibcudf.Scalar). 
-from pylibcudf cimport Scalar as plc_Scalar +from pylibcudf cimport Scalar as plc_Scalar, type_id as plc_TypeID from pylibcudf.libcudf.scalar.scalar cimport list_scalar, scalar, struct_scalar -from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id - def _replace_nested(obj, check, replacement): if isinstance(obj, list): @@ -62,12 +60,12 @@ def gather_metadata(dtypes): """ out = [] for name, dtype in dtypes.items(): - v = pylibcudf.interop.ColumnMetadata(name) + v = plc.interop.ColumnMetadata(name) if isinstance(dtype, cudf.StructDtype): v.children_meta = gather_metadata(dtype.fields) elif isinstance(dtype, cudf.ListDtype): # Offsets column is unnamed and has no children - v.children_meta.append(pylibcudf.interop.ColumnMetadata("")) + v.children_meta.append(plc.interop.ColumnMetadata("")) v.children_meta.extend( gather_metadata({"": dtype.element_type}) ) @@ -81,7 +79,7 @@ cdef class DeviceScalar: # that from_unique_ptr is implemented is probably dereferencing this in an # invalid state. See what the best way to fix that is. 
def __cinit__(self, *args, **kwargs): - self.c_value = pylibcudf.Scalar.__new__(pylibcudf.Scalar) + self.c_value = plc.Scalar.__new__(plc.Scalar) def __init__(self, value, dtype): """ @@ -127,20 +125,20 @@ cdef class DeviceScalar: pa_array = pa.array([pa.scalar(value, type=pa_type)]) pa_table = pa.Table.from_arrays([pa_array], names=[""]) - table = pylibcudf.interop.from_arrow(pa_table) + table = plc.interop.from_arrow(pa_table) column = table.columns()[0] if isinstance(dtype, cudf.core.dtypes.DecimalDtype): if isinstance(dtype, cudf.core.dtypes.Decimal32Dtype): - column = pylibcudf.unary.cast( - column, pylibcudf.DataType(pylibcudf.TypeId.DECIMAL32, -dtype.scale) + column = plc.unary.cast( + column, plc.DataType(plc.TypeId.DECIMAL32, -dtype.scale) ) elif isinstance(dtype, cudf.core.dtypes.Decimal64Dtype): - column = pylibcudf.unary.cast( - column, pylibcudf.DataType(pylibcudf.TypeId.DECIMAL64, -dtype.scale) + column = plc.unary.cast( + column, plc.DataType(plc.TypeId.DECIMAL64, -dtype.scale) ) - self.c_value = pylibcudf.copying.get_element(column, 0) + self.c_value = plc.copying.get_element(column, 0) self._dtype = dtype def _to_host_scalar(self): @@ -150,7 +148,7 @@ cdef class DeviceScalar: null_type = NaT if is_datetime or is_timedelta else NA metadata = gather_metadata({"": self.dtype})[0] - ps = pylibcudf.interop.to_arrow(self.c_value, metadata) + ps = plc.interop.to_arrow(self.c_value, metadata) if not ps.is_valid: return null_type @@ -225,34 +223,33 @@ cdef class DeviceScalar: return s cdef void _set_dtype(self, dtype=None): - cdef libcudf_types.data_type cdtype = self.get_raw_ptr()[0].type() - + cdef plc_TypeID cdtype_id = self.c_value.type().id() if dtype is not None: self._dtype = dtype - elif cdtype.id() in { - libcudf_types.type_id.DECIMAL32, - libcudf_types.type_id.DECIMAL64, - libcudf_types.type_id.DECIMAL128, + elif cdtype_id in { + plc_TypeID.DECIMAL32, + plc_TypeID.DECIMAL64, + plc_TypeID.DECIMAL128, }: raise TypeError( "Must pass a dtype when 
constructing from a fixed-point scalar" ) - elif cdtype.id() == libcudf_types.type_id.STRUCT: + elif cdtype_id == plc_TypeID.STRUCT: struct_table_view = (self.get_raw_ptr())[0].view() self._dtype = StructDtype({ str(i): dtype_from_column_view(struct_table_view.column(i)) for i in range(struct_table_view.num_columns()) }) - elif cdtype.id() == libcudf_types.type_id.LIST: + elif cdtype_id == plc_TypeID.LIST: if ( self.get_raw_ptr() - )[0].view().type().id() == libcudf_types.type_id.LIST: + )[0].view().type().id() == plc_TypeID.LIST: self._dtype = dtype_from_column_view( (self.get_raw_ptr())[0].view() ) else: self._dtype = ListDtype( - LIBCUDF_TO_SUPPORTED_NUMPY_TYPES[ + PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[ ( (self.get_raw_ptr())[0] .view().type().id() @@ -260,8 +257,8 @@ cdef class DeviceScalar: ] ) else: - self._dtype = LIBCUDF_TO_SUPPORTED_NUMPY_TYPES[ - (cdtype.id()) + self._dtype = PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[ + (cdtype_id) ] diff --git a/python/cudf/cudf/_lib/types.pxd b/python/cudf/cudf/_lib/types.pxd index c2b760490c1..18b1d26e4db 100644 --- a/python/cudf/cudf/_lib/types.pxd +++ b/python/cudf/cudf/_lib/types.pxd @@ -1,16 +1,11 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. from libc.stdint cimport int32_t -from libcpp cimport bool -cimport pylibcudf.libcudf.types as libcudf_types from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view ctypedef int32_t underlying_type_t_type_id cdef dtype_from_column_view(column_view cv) -cdef libcudf_types.data_type dtype_to_data_type(dtype) except * cpdef dtype_to_pylibcudf_type(dtype) -cdef bool is_decimal_type_id(libcudf_types.type_id tid) except * diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx index f169ea12b10..777bd070b32 100644 --- a/python/cudf/cudf/_lib/types.pyx +++ b/python/cudf/cudf/_lib/types.pyx @@ -1,7 +1,5 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
-from enum import IntEnum - import numpy as np import pandas as pd @@ -11,138 +9,46 @@ cimport pylibcudf.libcudf.types as libcudf_types from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view -import pylibcudf +import pylibcudf as plc import cudf -class TypeId(IntEnum): - EMPTY = libcudf_types.type_id.EMPTY - INT8 = libcudf_types.type_id.INT8 - INT16 = libcudf_types.type_id.INT16 - INT32 = libcudf_types.type_id.INT32 - INT64 = libcudf_types.type_id.INT64 - UINT8 = libcudf_types.type_id.UINT8 - UINT16 = libcudf_types.type_id.UINT16 - UINT32 = libcudf_types.type_id.UINT32 - UINT64 = libcudf_types.type_id.UINT64 - FLOAT32 = libcudf_types.type_id.FLOAT32 - FLOAT64 = libcudf_types.type_id.FLOAT64 - BOOL8 = libcudf_types.type_id.BOOL8 - TIMESTAMP_DAYS = ( - libcudf_types.type_id.TIMESTAMP_DAYS - ) - TIMESTAMP_SECONDS = ( - libcudf_types.type_id.TIMESTAMP_SECONDS - ) - TIMESTAMP_MILLISECONDS = ( - ( - libcudf_types.type_id.TIMESTAMP_MILLISECONDS - ) - ) - TIMESTAMP_MICROSECONDS = ( - ( - libcudf_types.type_id.TIMESTAMP_MICROSECONDS - ) - ) - TIMESTAMP_NANOSECONDS = ( - libcudf_types.type_id.TIMESTAMP_NANOSECONDS - ) - DURATION_SECONDS = ( - libcudf_types.type_id.DURATION_SECONDS - ) - DURATION_MILLISECONDS = ( - libcudf_types.type_id.DURATION_MILLISECONDS - ) - DURATION_MICROSECONDS = ( - libcudf_types.type_id.DURATION_MICROSECONDS - ) - DURATION_NANOSECONDS = ( - libcudf_types.type_id.DURATION_NANOSECONDS - ) - STRING = libcudf_types.type_id.STRING - DECIMAL32 = libcudf_types.type_id.DECIMAL32 - DECIMAL64 = libcudf_types.type_id.DECIMAL64 - DECIMAL128 = libcudf_types.type_id.DECIMAL128 - STRUCT = libcudf_types.type_id.STRUCT - - -SUPPORTED_NUMPY_TO_LIBCUDF_TYPES = { - np.dtype("int8"): TypeId.INT8, - np.dtype("int16"): TypeId.INT16, - np.dtype("int32"): TypeId.INT32, - np.dtype("int64"): TypeId.INT64, - np.dtype("uint8"): TypeId.UINT8, - np.dtype("uint16"): TypeId.UINT16, - np.dtype("uint32"): 
TypeId.UINT32, - np.dtype("uint64"): TypeId.UINT64, - np.dtype("float32"): TypeId.FLOAT32, - np.dtype("float64"): TypeId.FLOAT64, - np.dtype("datetime64[s]"): TypeId.TIMESTAMP_SECONDS, - np.dtype("datetime64[ms]"): TypeId.TIMESTAMP_MILLISECONDS, - np.dtype("datetime64[us]"): TypeId.TIMESTAMP_MICROSECONDS, - np.dtype("datetime64[ns]"): TypeId.TIMESTAMP_NANOSECONDS, - np.dtype("object"): TypeId.STRING, - np.dtype("bool"): TypeId.BOOL8, - np.dtype("timedelta64[s]"): TypeId.DURATION_SECONDS, - np.dtype("timedelta64[ms]"): TypeId.DURATION_MILLISECONDS, - np.dtype("timedelta64[us]"): TypeId.DURATION_MICROSECONDS, - np.dtype("timedelta64[ns]"): TypeId.DURATION_NANOSECONDS, -} - SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES = { - k: pylibcudf.TypeId(v).value - for k, v in SUPPORTED_NUMPY_TO_LIBCUDF_TYPES.items() + np.dtype("int8"): plc.types.TypeId.INT8, + np.dtype("int16"): plc.types.TypeId.INT16, + np.dtype("int32"): plc.types.TypeId.INT32, + np.dtype("int64"): plc.types.TypeId.INT64, + np.dtype("uint8"): plc.types.TypeId.UINT8, + np.dtype("uint16"): plc.types.TypeId.UINT16, + np.dtype("uint32"): plc.types.TypeId.UINT32, + np.dtype("uint64"): plc.types.TypeId.UINT64, + np.dtype("float32"): plc.types.TypeId.FLOAT32, + np.dtype("float64"): plc.types.TypeId.FLOAT64, + np.dtype("datetime64[s]"): plc.types.TypeId.TIMESTAMP_SECONDS, + np.dtype("datetime64[ms]"): plc.types.TypeId.TIMESTAMP_MILLISECONDS, + np.dtype("datetime64[us]"): plc.types.TypeId.TIMESTAMP_MICROSECONDS, + np.dtype("datetime64[ns]"): plc.types.TypeId.TIMESTAMP_NANOSECONDS, + np.dtype("object"): plc.types.TypeId.STRING, + np.dtype("bool"): plc.types.TypeId.BOOL8, + np.dtype("timedelta64[s]"): plc.types.TypeId.DURATION_SECONDS, + np.dtype("timedelta64[ms]"): plc.types.TypeId.DURATION_MILLISECONDS, + np.dtype("timedelta64[us]"): plc.types.TypeId.DURATION_MICROSECONDS, + np.dtype("timedelta64[ns]"): plc.types.TypeId.DURATION_NANOSECONDS, } - -LIBCUDF_TO_SUPPORTED_NUMPY_TYPES = { - # There's no equivalent to EMPTY in cudf. 
We translate EMPTY - # columns from libcudf to ``int8`` columns of all nulls in Python. - # ``int8`` is chosen because it uses the least amount of memory. - TypeId.EMPTY: np.dtype("int8"), - TypeId.INT8: np.dtype("int8"), - TypeId.INT16: np.dtype("int16"), - TypeId.INT32: np.dtype("int32"), - TypeId.INT64: np.dtype("int64"), - TypeId.UINT8: np.dtype("uint8"), - TypeId.UINT16: np.dtype("uint16"), - TypeId.UINT32: np.dtype("uint32"), - TypeId.UINT64: np.dtype("uint64"), - TypeId.FLOAT32: np.dtype("float32"), - TypeId.FLOAT64: np.dtype("float64"), - TypeId.BOOL8: np.dtype("bool"), - TypeId.TIMESTAMP_SECONDS: np.dtype("datetime64[s]"), - TypeId.TIMESTAMP_MILLISECONDS: np.dtype("datetime64[ms]"), - TypeId.TIMESTAMP_MICROSECONDS: np.dtype("datetime64[us]"), - TypeId.TIMESTAMP_NANOSECONDS: np.dtype("datetime64[ns]"), - TypeId.DURATION_SECONDS: np.dtype("timedelta64[s]"), - TypeId.DURATION_MILLISECONDS: np.dtype("timedelta64[ms]"), - TypeId.DURATION_MICROSECONDS: np.dtype("timedelta64[us]"), - TypeId.DURATION_NANOSECONDS: np.dtype("timedelta64[ns]"), - TypeId.STRING: np.dtype("object"), - TypeId.STRUCT: np.dtype("object"), -} - PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES = { - pylibcudf.TypeId(k).value: v - for k, v in LIBCUDF_TO_SUPPORTED_NUMPY_TYPES.items() + plc_type: np_type + for np_type, plc_type in SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES.items() } +# There's no equivalent to EMPTY in cudf. We translate EMPTY +# columns from libcudf to ``int8`` columns of all nulls in Python. +# ``int8`` is chosen because it uses the least amount of memory. 
+PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.EMPTY] = np.dtype("int8") +PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.STRUCT] = np.dtype("object") +PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.LIST] = np.dtype("object") -duration_unit_map = { - TypeId.DURATION_SECONDS: "s", - TypeId.DURATION_MILLISECONDS: "ms", - TypeId.DURATION_MICROSECONDS: "us", - TypeId.DURATION_NANOSECONDS: "ns" -} - -datetime_unit_map = { - TypeId.TIMESTAMP_SECONDS: "s", - TypeId.TIMESTAMP_MILLISECONDS: "ms", - TypeId.TIMESTAMP_MICROSECONDS: "us", - TypeId.TIMESTAMP_NANOSECONDS: "ns", -} -size_type_dtype = LIBCUDF_TO_SUPPORTED_NUMPY_TYPES[pylibcudf.types.SIZE_TYPE_ID] +size_type_dtype = PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.SIZE_TYPE_ID] cdef dtype_from_lists_column_view(column_view cv): @@ -190,71 +96,40 @@ cdef dtype_from_column_view(column_view cv): scale=-cv.type().scale() ) else: - return LIBCUDF_TO_SUPPORTED_NUMPY_TYPES[ + return PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[ (tid) ] -cdef libcudf_types.data_type dtype_to_data_type(dtype) except *: - # Note: This function is to be phased out in favor of - # dtype_to_pylibcudf_type which will return a pylibcudf - # DataType object - cdef libcudf_types.type_id tid - if isinstance(dtype, cudf.ListDtype): - tid = libcudf_types.type_id.LIST - elif isinstance(dtype, cudf.StructDtype): - tid = libcudf_types.type_id.STRUCT - elif isinstance(dtype, cudf.Decimal128Dtype): - tid = libcudf_types.type_id.DECIMAL128 - elif isinstance(dtype, cudf.Decimal64Dtype): - tid = libcudf_types.type_id.DECIMAL64 - elif isinstance(dtype, cudf.Decimal32Dtype): - tid = libcudf_types.type_id.DECIMAL32 - else: - tid = ( - ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[np.dtype(dtype)])) - - if is_decimal_type_id(tid): - return libcudf_types.data_type(tid, -dtype.scale) - else: - return libcudf_types.data_type(tid) cpdef dtype_to_pylibcudf_type(dtype): if isinstance(dtype, cudf.ListDtype): - return pylibcudf.DataType(pylibcudf.TypeId.LIST) + return 
plc.DataType(plc.TypeId.LIST) elif isinstance(dtype, cudf.StructDtype): - return pylibcudf.DataType(pylibcudf.TypeId.STRUCT) + return plc.DataType(plc.TypeId.STRUCT) elif isinstance(dtype, cudf.Decimal128Dtype): - tid = pylibcudf.TypeId.DECIMAL128 - return pylibcudf.DataType(tid, -dtype.scale) + tid = plc.TypeId.DECIMAL128 + return plc.DataType(tid, -dtype.scale) elif isinstance(dtype, cudf.Decimal64Dtype): - tid = pylibcudf.TypeId.DECIMAL64 - return pylibcudf.DataType(tid, -dtype.scale) + tid = plc.TypeId.DECIMAL64 + return plc.DataType(tid, -dtype.scale) elif isinstance(dtype, cudf.Decimal32Dtype): - tid = pylibcudf.TypeId.DECIMAL32 - return pylibcudf.DataType(tid, -dtype.scale) - # libcudf types don't support localization so convert to the base type + tid = plc.TypeId.DECIMAL32 + return plc.DataType(tid, -dtype.scale) + # libcudf types don't support timezones so convert to the base type elif isinstance(dtype, pd.DatetimeTZDtype): dtype = np.dtype(f"(tid) - ] + return PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[tid] diff --git a/python/cudf/cudf/_lib/utils.pxd b/python/cudf/cudf/_lib/utils.pxd deleted file mode 100644 index 900be721c9a..00000000000 --- a/python/cudf/cudf/_lib/utils.pxd +++ /dev/null @@ -1,6 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -cpdef data_from_pylibcudf_table(tbl, column_names, index_names=*) -cpdef data_from_pylibcudf_io(tbl_with_meta, column_names = *, index_names = *) -cpdef columns_from_pylibcudf_table(tbl) -cpdef _data_from_columns(columns, column_names, index_names=*) diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx deleted file mode 100644 index 975c9eb741c..00000000000 --- a/python/cudf/cudf/_lib/utils.pyx +++ /dev/null @@ -1,94 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. -import cudf - -from cudf._lib.column cimport Column - - -cpdef columns_from_pylibcudf_table(tbl): - """Convert a pylibcudf table into list of columns. 
- - Parameters - ---------- - tbl : pylibcudf.Table - The pylibcudf table whose columns will be extracted - - Returns - ------- - list[Column] - A list of columns. - """ - return [Column.from_pylibcudf(plc) for plc in tbl.columns()] - - -cpdef _data_from_columns(columns, column_names, index_names=None): - """Convert a list of columns into a dict with an index. - - This method is intended to provide the bridge between the columns returned - from calls to libcudf or pylibcudf APIs and the cuDF Python Frame objects, which - require named columns and a separate index. - - Since cuDF Python has an independent representation of a table as a - collection of columns, this function simply returns a dict of columns - suitable for conversion into data to be passed to cuDF constructors. - This method returns the columns of the table in the order they are - stored in libcudf, but calling code is responsible for partitioning and - labeling them as needed. - - Parameters - ---------- - columns : list[Column] - The columns to be extracted - column_names : iterable - The keys associated with the columns in the output data. - index_names : iterable, optional - If provided, an iterable of strings that will be used to label the - corresponding first set of columns into a (Multi)Index. If this - argument is omitted, all columns are assumed to be part of the output - table and no index is constructed. - """ - # First construct the index, if any - index = ( - # TODO: For performance, the _from_data methods of Frame types assume - # that the passed index object is already an Index because cudf.Index - # and cudf.as_index are expensive. As a result, this function is - # currently somewhat inconsistent in returning a dict of columns for - # the data while actually constructing the Index object here (instead - # of just returning a dict for that as well). As we clean up the - # Frame factories we may want to look for a less dissonant approach - # that does not impose performance penalties. 
- cudf.core.index._index_from_data( - { - name: columns[i] - for i, name in enumerate(index_names) - } - ) - if index_names is not None - else None - ) - n_index_columns = len(index_names) if index_names is not None else 0 - data = { - name: columns[i + n_index_columns] - for i, name in enumerate(column_names) - } - return data, index - - -cpdef data_from_pylibcudf_table(tbl, column_names, index_names=None): - return _data_from_columns( - columns_from_pylibcudf_table(tbl), - column_names, - index_names - ) - -cpdef data_from_pylibcudf_io(tbl_with_meta, column_names=None, index_names=None): - """ - Unpacks the TableWithMetadata from libcudf I/O - into a dict of columns and an Index (cuDF format) - """ - if column_names is None: - column_names = tbl_with_meta.column_names(include_children=False) - return _data_from_columns( - columns=[Column.from_pylibcudf(plc) for plc in tbl_with_meta.columns], - column_names=column_names, - index_names=index_names - ) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index f4543bc6156..c2f3c782d10 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1447,7 +1447,7 @@ def _union(self, other, sort=None): other_df["order"] = other_df.index res = self_df.merge(other_df, on=[0], how="outer") res = res.sort_values( - by=res._data.to_pandas_index()[1:], ignore_index=True + by=res._data.to_pandas_index[1:], ignore_index=True ) union_result = cudf.core.index._index_from_data({0: res._data[0]}) diff --git a/python/cudf/cudf/core/_internals/aggregation.py b/python/cudf/cudf/core/_internals/aggregation.py index fe8ea5a947a..1d21d34b1bf 100644 --- a/python/cudf/cudf/core/_internals/aggregation.py +++ b/python/cudf/cudf/core/_internals/aggregation.py @@ -29,11 +29,11 @@ class Aggregation: def __init__(self, agg: plc.aggregation.Aggregation) -> None: - self.c_obj = agg + self.plc_obj = agg @property def kind(self) -> str: - name = self.c_obj.kind().name + name = 
self.plc_obj.kind().name return _agg_name_map.get(name, name) @classmethod diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index d9b54008e85..b10b8dfe207 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1192,10 +1192,10 @@ def _concat( codes = [o.codes for o in objs] newsize = sum(map(len, codes)) - if newsize > libcudf.MAX_COLUMN_SIZE: + if newsize > np.iinfo(libcudf.types.size_type_dtype).max: raise MemoryError( f"Result of concat cannot have " - f"size > {libcudf.MAX_COLUMN_SIZE_STR}" + f"size > {libcudf.types.size_type_dtype}_MAX" ) elif newsize == 0: codes_col = column.column_empty(0, head.codes.dtype) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index cccafaeba88..31efe267c96 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1605,7 +1605,7 @@ def scan(self, scan_op: str, inclusive: bool, **kwargs) -> Self: return type(self).from_pylibcudf( # type: ignore[return-value] plc.reduce.scan( self.to_pylibcudf(mode="read"), - aggregation.make_aggregation(scan_op, kwargs).c_obj, + aggregation.make_aggregation(scan_op, kwargs).plc_obj, plc.reduce.ScanType.INCLUSIVE if inclusive else plc.reduce.ScanType.EXCLUSIVE, @@ -1637,7 +1637,7 @@ def reduce(self, reduction_op: str, dtype=None, **kwargs) -> ScalarLike: with acquire_spill_lock(): plc_scalar = plc.reduce.reduce( self.to_pylibcudf(mode="read"), - aggregation.make_aggregation(reduction_op, kwargs).c_obj, + aggregation.make_aggregation(reduction_op, kwargs).plc_obj, dtype_to_pylibcudf_type(col_dtype), ) result_col = type(self).from_pylibcudf( @@ -2537,10 +2537,10 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: ) newsize = sum(map(len, objs)) - if newsize > libcudf.MAX_COLUMN_SIZE: + if newsize > np.iinfo(libcudf.types.size_type_dtype).max: raise MemoryError( f"Result of concat 
cannot have " - f"size > {libcudf.MAX_COLUMN_SIZE_STR}" + f"size > {libcudf.types.size_type_dtype}_MAX" ) elif newsize == 0: return column_empty(0, head.dtype) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index e4fd82e819b..aaf7d071dff 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -207,11 +207,16 @@ def _from_columns_like_self( @property def level_names(self) -> tuple[abc.Hashable, ...]: + if self.is_cached("to_pandas_index"): + return self.to_pandas_index.names if self._level_names is None or len(self._level_names) == 0: return tuple((None,) * max(1, self.nlevels)) else: return self._level_names + def is_cached(self, attr_name: str) -> bool: + return attr_name in self.__dict__ + @property def nlevels(self) -> int: if len(self) == 0: @@ -262,7 +267,12 @@ def _clear_cache(self, old_ncols: int, new_ncols: int) -> None: new_ncols: int len(self) after self._data was modified """ - cached_properties = ("columns", "names", "_grouped_data") + cached_properties = ( + "columns", + "names", + "_grouped_data", + "to_pandas_index", + ) for attr in cached_properties: try: self.__delattr__(attr) @@ -276,6 +286,7 @@ def _clear_cache(self, old_ncols: int, new_ncols: int) -> None: except AttributeError: pass + @cached_property def to_pandas_index(self) -> pd.Index: """Convert the keys of the ColumnAccessor to a Pandas Index object.""" if self.multiindex and len(self.level_names) > 0: @@ -726,10 +737,10 @@ def droplevel(self, level: int) -> None: } new_ncols = len(self) self._level_names = ( - self._level_names[:level] + self._level_names[level + 1 :] + self.level_names[:level] + self.level_names[level + 1 :] ) - if len(self._level_names) == 1: + if len(self.level_names) == 1: # can't use nlevels, as it depends on multiindex self.multiindex = False self._clear_cache(old_ncols, new_ncols) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py 
index e66e4f41642..3334b57ce1b 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -961,7 +961,7 @@ def _init_from_series_list(self, data, columns, index): warnings.simplefilter("ignore", FutureWarning) concat_df = cudf.concat(data, axis=1) - cols = concat_df._data.to_pandas_index() + cols = concat_df._data.to_pandas_index if cols.dtype == "object": concat_df.columns = cols.astype("str") @@ -2092,7 +2092,7 @@ def _make_operands_and_index_for_binop( equal_columns = True elif isinstance(other, Series): if ( - not (self_pd_columns := self._data.to_pandas_index()).equals( + not (self_pd_columns := self._data.to_pandas_index).equals( other_pd_index := other.index.to_pandas() ) and not can_reindex @@ -2117,8 +2117,8 @@ def _make_operands_and_index_for_binop( and fn in cudf.utils.utils._EQUALITY_OPS and ( not self.index.equals(other.index) - or not self._data.to_pandas_index().equals( - other._data.to_pandas_index() + or not self._data.to_pandas_index.equals( + other._data.to_pandas_index ) ) ): @@ -2162,11 +2162,11 @@ def _make_operands_and_index_for_binop( if not equal_columns: if isinstance(other, DataFrame): - column_names_list = self._data.to_pandas_index().join( - other._data.to_pandas_index(), how="outer" + column_names_list = self._data.to_pandas_index.join( + other._data.to_pandas_index, how="outer" ) elif isinstance(other, Series): - column_names_list = self._data.to_pandas_index().join( + column_names_list = self._data.to_pandas_index.join( other.index.to_pandas(), how="outer" ) else: @@ -2626,8 +2626,8 @@ def update( if not isinstance(other, DataFrame): other = DataFrame(other) - self_cols = self._data.to_pandas_index() - if not self_cols.equals(other._data.to_pandas_index()): + self_cols = self._data.to_pandas_index + if not self_cols.equals(other._data.to_pandas_index): other = other.reindex(self_cols, axis=1) if not self.index.equals(other.index): other = other.reindex(self.index, axis=0) @@ -2663,7 +2663,7 @@ def 
__iter__(self): def __contains__(self, item): # This must check against containment in the pandas Index and not # self._column_names to handle NA, None, nan, etc. correctly. - return item in self._data.to_pandas_index() + return item in self._data.to_pandas_index @_performance_tracking def items(self): @@ -2700,14 +2700,14 @@ def at(self): @property # type: ignore @_external_only_api( - "Use _column_names instead, or _data.to_pandas_index() if a pandas " + "Use _column_names instead, or _data.to_pandas_index if a pandas " "index is absolutely necessary. For checking if the columns are a " "MultiIndex, use _data.multiindex." ) @_performance_tracking def columns(self): """Returns a tuple of columns""" - return self._data.to_pandas_index() + return self._data.to_pandas_index @columns.setter # type: ignore @_performance_tracking @@ -2916,7 +2916,7 @@ def reindex( df = self else: columns = cudf.Index(columns) - intersection = self._data.to_pandas_index().intersection( + intersection = self._data.to_pandas_index.intersection( columns.to_pandas() ) df = self.loc[:, intersection] @@ -3430,7 +3430,7 @@ def axes(self): Index(['key', 'k2', 'val', 'temp'], dtype='object')] """ - return [self.index, self._data.to_pandas_index()] + return [self.index, self._data.to_pandas_index] def diff(self, periods=1, axis=0): """ @@ -4129,7 +4129,7 @@ def transpose(self): Not supporting *copy* because default and only behavior is copy=True """ - index = self._data.to_pandas_index() + index = self._data.to_pandas_index columns = self.index.copy(deep=False) if self._num_columns == 0 or self._num_rows == 0: return DataFrame(index=index, columns=columns) @@ -5535,7 +5535,7 @@ def to_pandas( } out_df = pd.DataFrame(out_data, index=out_index) - out_df.columns = self._data.to_pandas_index() + out_df.columns = self._data.to_pandas_index return out_df @@ -6487,7 +6487,7 @@ def _reduce( source = self._get_columns_by_label(numeric_cols) if source.empty: return Series( - 
index=self._data.to_pandas_index()[:0] + index=self._data.to_pandas_index[:0] if axis == 0 else source.index, dtype="float64", @@ -6540,7 +6540,7 @@ def _reduce( "Columns must all have the same dtype to " f"perform {op=} with {axis=}" ) - pd_index = source._data.to_pandas_index() + pd_index = source._data.to_pandas_index if source._data.multiindex: idx = MultiIndex.from_pandas(pd_index) else: @@ -7242,7 +7242,7 @@ def stack( ] has_unnamed_levels = len(unnamed_levels_indices) > 0 - column_name_idx = self._data.to_pandas_index() + column_name_idx = self._data.to_pandas_index # Construct new index from the levels specified by `level` named_levels = pd.MultiIndex.from_arrays( [column_name_idx.get_level_values(lv) for lv in level_indices] @@ -7432,7 +7432,7 @@ def cov(self, min_periods=None, ddof: int = 1, numeric_only: bool = False): ) cov = cupy.cov(self.values, ddof=ddof, rowvar=False) - cols = self._data.to_pandas_index() + cols = self._data.to_pandas_index df = DataFrame(cupy.asfortranarray(cov), index=cols) df._set_columns_like(self._data) return df @@ -7475,7 +7475,7 @@ def corr( ) corr = cupy.corrcoef(values, rowvar=False) - cols = self._data.to_pandas_index() + cols = self._data.to_pandas_index df = DataFrame(cupy.asfortranarray(corr), index=cols) df._set_columns_like(self._data) return df @@ -7544,7 +7544,7 @@ def keys(self): >>> df.keys() Index([0, 1, 2, 3], dtype='int64') """ - return self._data.to_pandas_index() + return self._data.to_pandas_index def itertuples(self, index=True, name="Pandas"): """ @@ -7778,7 +7778,7 @@ def nunique(self, axis=0, dropna: bool = True) -> Series: raise NotImplementedError("axis parameter is not supported yet.") counts = [col.distinct_count(dropna=dropna) for col in self._columns] return self._constructor_sliced( - counts, index=self._data.to_pandas_index() + counts, index=self._data.to_pandas_index ) def _sample_axis_1( diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 
971f0be77f8..8ed233ba737 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -56,7 +56,9 @@ def dtype(arbitrary): else: if np_dtype.kind in set("OU"): return np.dtype("object") - elif np_dtype not in cudf._lib.types.SUPPORTED_NUMPY_TO_LIBCUDF_TYPES: + elif ( + np_dtype not in cudf._lib.types.SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES + ): raise TypeError(f"Unsupported type {np_dtype}") return np_dtype diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 9aadbf8f47a..8f45c6f0115 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -861,7 +861,9 @@ def _quantile_table( column_order, null_precedence, ) - columns = libcudf.utils.columns_from_pylibcudf_table(plc_table) + columns = [ + ColumnBase.from_pylibcudf(col) for col in plc_table.columns() + ] return self._from_columns_like_self( columns, column_names=self._column_names, diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 6cd8e11695f..be3cc410174 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -4,9 +4,10 @@ import copy import itertools import textwrap +import types import warnings from collections import abc -from functools import cached_property +from functools import cached_property, singledispatch from typing import TYPE_CHECKING, Any, Literal import cupy as cp @@ -18,17 +19,27 @@ import cudf import cudf.core._internals from cudf import _lib as libcudf -from cudf._lib import groupby as libgroupby from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default -from cudf.api.types import is_list_like, is_numeric_dtype +from cudf.api.types import ( + is_list_like, + is_numeric_dtype, + is_string_dtype, +) from cudf.core._compat import PANDAS_LT_300 -from cudf.core._internals import sorting +from cudf.core._internals import aggregation, sorting from cudf.core.abc import Serializable from 
cudf.core.buffer import acquire_spill_lock -from cudf.core.column.column import ColumnBase, StructDtype, as_column +from cudf.core.column.column import ColumnBase, as_column from cudf.core.column_accessor import ColumnAccessor from cudf.core.copy_types import GatherMap +from cudf.core.dtypes import ( + CategoricalDtype, + DecimalDtype, + IntervalDtype, + ListDtype, + StructDtype, +) from cudf.core.join._join_helpers import _match_join_keys from cudf.core.mixins import Reducible, Scannable from cudf.core.multiindex import MultiIndex @@ -37,7 +48,7 @@ from cudf.utils.utils import GetAttrGetItemMixin if TYPE_CHECKING: - from collections.abc import Iterable + from collections.abc import Generator, Iterable from cudf._typing import ( AggType, @@ -46,6 +57,152 @@ ScalarLike, ) +# The sets below define the possible aggregations that can be performed on +# different dtypes. These strings must be elements of the AggregationKind enum. +# The libcudf infrastructure exists for "COLLECT" support on +# categoricals, but the dtype support in python does not. 
+_CATEGORICAL_AGGS = {"COUNT", "NUNIQUE", "SIZE", "UNIQUE"} +_STRING_AGGS = { + "COLLECT", + "COUNT", + "MAX", + "MIN", + "NTH", + "NUNIQUE", + "SIZE", + "UNIQUE", +} +_LIST_AGGS = {"COLLECT"} +_STRUCT_AGGS = {"COLLECT", "CORRELATION", "COVARIANCE"} +_INTERVAL_AGGS = {"COLLECT"} +_DECIMAL_AGGS = { + "ARGMIN", + "ARGMAX", + "COLLECT", + "COUNT", + "MAX", + "MIN", + "NTH", + "NUNIQUE", + "SUM", +} + + +@singledispatch +def get_valid_aggregation(dtype): + if is_string_dtype(dtype): + return _STRING_AGGS + return "ALL" + + +@get_valid_aggregation.register +def _(dtype: ListDtype): + return _LIST_AGGS + + +@get_valid_aggregation.register +def _(dtype: CategoricalDtype): + return _CATEGORICAL_AGGS + + +@get_valid_aggregation.register +def _(dtype: ListDtype): + return _LIST_AGGS + + +@get_valid_aggregation.register +def _(dtype: StructDtype): + return _STRUCT_AGGS + + +@get_valid_aggregation.register +def _(dtype: IntervalDtype): + return _INTERVAL_AGGS + + +@get_valid_aggregation.register +def _(dtype: DecimalDtype): + return _DECIMAL_AGGS + + +@singledispatch +def _is_unsupported_agg_for_type(dtype, str_agg: str) -> bool: + return False + + +@_is_unsupported_agg_for_type.register +def _(dtype: np.dtype, str_agg: str) -> bool: + # string specifically + cumulative_agg = str_agg in {"cumsum", "cummin", "cummax"} + basic_agg = any( + a in str_agg + for a in ( + "count", + "max", + "min", + "first", + "last", + "nunique", + "unique", + "nth", + ) + ) + return ( + dtype.kind == "O" + and str_agg not in _STRING_AGGS + and (cumulative_agg or not (basic_agg or str_agg == "")) + ) + + +@_is_unsupported_agg_for_type.register +def _(dtype: CategoricalDtype, str_agg: str) -> bool: + cumulative_agg = str_agg in {"cumsum", "cummin", "cummax"} + not_basic_agg = not any( + a in str_agg for a in ("count", "max", "min", "unique") + ) + return str_agg not in _CATEGORICAL_AGGS and ( + cumulative_agg or not_basic_agg + ) + + +def _is_all_scan_aggregate(all_aggs: list[list[str]]) -> bool: + 
""" + Returns True if all are scan aggregations. + + Raises + ------ + NotImplementedError + If both reduction aggregations and scan aggregations are present. + """ + groupby_scans = { + "cumcount", + "cumsum", + "cummin", + "cummax", + "cumprod", + "rank", + } + + def get_name(agg): + return agg.__name__ if callable(agg) else agg + + all_scan = all( + get_name(agg_name) in groupby_scans + for aggs in all_aggs + for agg_name in aggs + ) + any_scan = any( + get_name(agg_name) in groupby_scans + for aggs in all_aggs + for agg_name in aggs + ) + + if not all_scan and any_scan: + raise NotImplementedError( + "Cannot perform both aggregation and scan in one operation" + ) + return all_scan and any_scan + def _deprecate_collect(): warnings.warn( @@ -423,7 +580,7 @@ def indices(self) -> dict[ScalarLike, cp.ndarray]: >>> df.groupby(by=["a"]).indices {10: array([0, 1]), 40: array([2])} """ - offsets, group_keys, (indices,) = self._groupby.groups( + offsets, group_keys, (indices,) = self._groups( [ cudf.core.column.as_column( range(len(self.obj)), dtype=size_type_dtype @@ -582,11 +739,137 @@ def rank(x): return result @cached_property - def _groupby(self): - return libgroupby.GroupBy( - [*self.grouping.keys._columns], dropna=self._dropna + def _groupby(self) -> types.SimpleNamespace: + with acquire_spill_lock() as spill_lock: + plc_groupby = plc.groupby.GroupBy( + plc.Table( + [ + col.to_pylibcudf(mode="read") + for col in self.grouping.keys._columns + ] + ), + plc.types.NullPolicy.EXCLUDE + if self._dropna + else plc.types.NullPolicy.INCLUDE, + ) + # Do we need this because we just check _spill_locks in test_spillable_df_groupby? 
+ return types.SimpleNamespace( + plc_groupby=plc_groupby, _spill_locks=spill_lock + ) + + def _groups( + self, values: Iterable[ColumnBase] + ) -> tuple[list[int], list[ColumnBase], list[ColumnBase]]: + plc_columns = [col.to_pylibcudf(mode="read") for col in values] + if not plc_columns: + plc_table = None + else: + plc_table = plc.Table(plc_columns) + offsets, grouped_keys, grouped_values = ( + self._groupby.plc_groupby.get_groups(plc_table) + ) + + return ( + offsets, + [ColumnBase.from_pylibcudf(col) for col in grouped_keys.columns()], + ( + [ + ColumnBase.from_pylibcudf(col) + for col in grouped_values.columns() + ] + if grouped_values is not None + else [] + ), + ) + + def _aggregate( + self, values: tuple[ColumnBase, ...], aggregations + ) -> tuple[ + list[list[ColumnBase]], + list[ColumnBase], + list[list[tuple[str, str]]], + ]: + included_aggregations = [] + column_included = [] + requests = [] + result_columns: list[list[ColumnBase]] = [] + for i, (col, aggs) in enumerate(zip(values, aggregations)): + valid_aggregations = get_valid_aggregation(col.dtype) + included_aggregations_i = [] + col_aggregations = [] + for agg in aggs: + str_agg = str(agg) + if _is_unsupported_agg_for_type(col.dtype, str_agg): + raise TypeError( + f"{col.dtype} type does not support {agg} operations" + ) + agg_obj = aggregation.make_aggregation(agg) + if ( + valid_aggregations == "ALL" + or agg_obj.kind in valid_aggregations + ): + included_aggregations_i.append((agg, agg_obj.kind)) + col_aggregations.append(agg_obj.plc_obj) + included_aggregations.append(included_aggregations_i) + result_columns.append([]) + if col_aggregations: + requests.append( + plc.groupby.GroupByRequest( + col.to_pylibcudf(mode="read"), col_aggregations + ) + ) + column_included.append(i) + + if not requests and any(len(v) > 0 for v in aggregations): + raise pd.errors.DataError( + "All requested aggregations are unsupported." 
+ ) + + keys, results = ( + self._groupby.plc_groupby.scan(requests) + if _is_all_scan_aggregate(aggregations) + else self._groupby.plc_groupby.aggregate(requests) ) + for i, result in zip(column_included, results): + result_columns[i] = [ + ColumnBase.from_pylibcudf(col) for col in result.columns() + ] + + return ( + result_columns, + [ColumnBase.from_pylibcudf(key) for key in keys.columns()], + included_aggregations, + ) + + def _shift( + self, values: tuple[ColumnBase, ...], periods: int, fill_values: list + ) -> Generator[ColumnBase]: + _, shifts = self._groupby.plc_groupby.shift( + plc.table.Table([col.to_pylibcudf(mode="read") for col in values]), + [periods] * len(values), + [ + cudf.Scalar(val, dtype=col.dtype).device_value.c_value + for val, col in zip(fill_values, values) + ], + ) + return (ColumnBase.from_pylibcudf(col) for col in shifts.columns()) + + def _replace_nulls( + self, values: tuple[ColumnBase, ...], method: str + ) -> Generator[ColumnBase]: + _, replaced = self._groupby.plc_groupby.replace_nulls( + plc.Table([col.to_pylibcudf(mode="read") for col in values]), + [ + plc.replace.ReplacePolicy.PRECEDING + if method == "ffill" + else plc.replace.ReplacePolicy.FOLLOWING + ] + * len(values), + ) + + return (ColumnBase.from_pylibcudf(col) for col in replaced.columns()) + @_performance_tracking def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): """ @@ -702,7 +985,7 @@ def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): result_columns, grouped_key_cols, included_aggregations, - ) = self._groupby.aggregate(columns, normalized_aggs) + ) = self._aggregate(columns, normalized_aggs) result_index = self.grouping.keys._from_columns_like_self( grouped_key_cols, @@ -761,7 +1044,7 @@ def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): else: if cudf.get_option( "mode.pandas_compatible" - ) and not libgroupby._is_all_scan_aggregate(normalized_aggs): + ) and not 
_is_all_scan_aggregate(normalized_aggs): # Even with `sort=False`, pandas guarantees that # groupby preserves the order of rows within each group. left_cols = list(self.grouping.keys.drop_duplicates()._columns) @@ -810,7 +1093,7 @@ def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): if not self._as_index: result = result.reset_index() - if libgroupby._is_all_scan_aggregate(normalized_aggs): + if _is_all_scan_aggregate(normalized_aggs): # Scan aggregations return rows in original index order return self._mimic_pandas_order(result) @@ -920,7 +1203,7 @@ def _head_tail(self, n, *, take_head: bool, preserve_order: bool): # Can't use _mimic_pandas_order because we need to # subsample the gather map from the full input ordering, # rather than permuting the gather map of the output. - _, _, (ordering,) = self._groupby.groups( + _, _, (ordering,) = self._groups( [as_column(range(0, len(self.obj)))] ) # Invert permutation from original order to groups on the @@ -1312,8 +1595,8 @@ def deserialize(cls, header, frames): return cls(obj, grouping, **kwargs) def _grouped(self, *, include_groups: bool = True): - offsets, grouped_key_cols, grouped_value_cols = self._groupby.groups( - [*self.obj.index._columns, *self.obj._columns] + offsets, grouped_key_cols, grouped_value_cols = self._groups( + itertools.chain(self.obj.index._columns, self.obj._columns) ) grouped_keys = cudf.core.index._index_from_data( dict(enumerate(grouped_key_cols)) @@ -1945,7 +2228,7 @@ def transform( "Currently, `transform()` supports only aggregations." 
) from e # If the aggregation is a scan, don't broadcast - if libgroupby._is_all_scan_aggregate([[func]]): + if _is_all_scan_aggregate([[func]]): if len(result) != len(self.obj): raise AssertionError( "Unexpected result length for scan transform" @@ -2409,7 +2692,7 @@ def _scan_fill(self, method: str, limit: int) -> DataFrameOrSeries: dict( zip( values._column_names, - self._groupby.replace_nulls([*values._columns], method), + self._replace_nulls(values._columns, method), ) ) ) @@ -2513,7 +2796,7 @@ def fillna( @_performance_tracking def shift( self, - periods=1, + periods: int = 1, freq=None, axis=0, fill_value=None, @@ -2560,7 +2843,7 @@ def shift( if freq is not None: raise NotImplementedError("Parameter freq is unsupported.") - if not axis == 0: + if axis != 0: raise NotImplementedError("Only axis=0 is supported.") if suffix is not None: @@ -2568,20 +2851,18 @@ def shift( values = self.grouping.values if is_list_like(fill_value): - if len(fill_value) != len(values._data): + if len(fill_value) != values._num_columns: raise ValueError( "Mismatched number of columns and values to fill." ) else: - fill_value = [fill_value] * len(values._data) + fill_value = [fill_value] * values._num_columns result = self.obj.__class__._from_data( dict( zip( values._column_names, - self._groupby.shift( - [*values._columns], periods, fill_value - )[0], + self._shift(values._columns, periods, fill_value), ) ) ) @@ -2680,9 +2961,7 @@ def _mimic_pandas_order( # result coming back from libcudf has null_count few rows than # the input, so we must produce an ordering from the full # input range. 
- _, _, (ordering,) = self._groupby.groups( - [as_column(range(0, len(self.obj)))] - ) + _, _, (ordering,) = self._groups([as_column(range(0, len(self.obj)))]) if self._dropna and any( c.has_nulls(include_nan=True) > 0 for c in self.grouping._key_columns @@ -3087,7 +3366,7 @@ def agg(self, func, *args, engine=None, engine_kwargs=None, **kwargs): # drop the first level if we have a multiindex if result._data.nlevels > 1: - result.columns = result._data.to_pandas_index().droplevel(0) + result.columns = result._data.to_pandas_index.droplevel(0) return result diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index a288eb245e0..6ee24283491 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1106,13 +1106,11 @@ def dot(self, other, reflect=False): lhs = self.reindex(index=common, copy=False).values rhs = other.reindex(index=common, copy=False).values if isinstance(other, cudf.DataFrame): - result_index = other._data.to_pandas_index() + result_index = other._data.to_pandas_index elif isinstance(self, cudf.DataFrame) and isinstance( other, (cudf.Series, cudf.DataFrame) ): - common = self._data.to_pandas_index().union( - other.index.to_pandas() - ) + common = self._data.to_pandas_index.union(other.index.to_pandas()) if len(common) > self._num_columns or len(common) > len( other.index ): @@ -1124,7 +1122,7 @@ def dot(self, other, reflect=False): rhs = other.reindex(index=common, copy=False).values lhs = lhs.values if isinstance(other, cudf.DataFrame): - result_cols = other._data.to_pandas_index() + result_cols = other._data.to_pandas_index elif isinstance( other, (cp.ndarray, np.ndarray) @@ -2244,7 +2242,7 @@ def truncate(self, before=None, after=None, axis=0, copy=True): if not copy: raise ValueError("Truncating with copy=False is not supported.") axis = self._get_axis_from_axis_arg(axis) - ax = self.index if axis == 0 else self._data.to_pandas_index() + ax = self.index if axis == 0 
else self._data.to_pandas_index if not ax.is_monotonic_increasing and not ax.is_monotonic_decreasing: raise ValueError("truncate requires a sorted index") @@ -6778,7 +6776,7 @@ def _drop_rows_by_labels( return obj.__class__._from_data( join_res.iloc[:, idx_nlv:]._data, index=midx, - columns=obj._data.to_pandas_index(), + columns=obj._data.to_pandas_index, ) else: diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index d2afe643dc4..1e613e49ffc 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1123,7 +1123,7 @@ def _concat(cls, objs) -> Self: # TODO: Verify if this is really necessary or if we can rely on # DataFrame._concat. if len(source_data) > 1: - colnames = source_data[0]._data.to_pandas_index() + colnames = source_data[0]._data.to_pandas_index for obj in source_data[1:]: obj.columns = colnames @@ -2068,7 +2068,7 @@ def _union(self, other, sort=None) -> Self: result_df = self_df.merge(other_df, on=col_names, how="outer") result_df = result_df.sort_values( - by=result_df._data.to_pandas_index()[self.nlevels :], + by=result_df._data.to_pandas_index[self.nlevels :], ignore_index=True, ) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 3ab6ed306b6..0abd42d4d4e 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -431,8 +431,9 @@ def concat( result_columns = ( objs[0] - ._data.to_pandas_index() - .append([obj._data.to_pandas_index() for obj in objs[1:]]) + ._data.to_pandas_index.append( + [obj._data.to_pandas_index for obj in objs[1:]] + ) .unique() ) @@ -689,7 +690,7 @@ def _tile(A, reps): if not value_vars: # TODO: Use frame._data.label_dtype when it's more consistently set var_data = cudf.Series( - value_vars, dtype=frame._data.to_pandas_index().dtype + value_vars, dtype=frame._data.to_pandas_index.dtype ) else: var_data = ( @@ -1273,7 +1274,7 @@ def unstack(df, level, fill_value=None, sort: bool = True): res = 
df.T.stack(future_stack=False) # Result's index is a multiindex res.index.names = ( - tuple(df._data.to_pandas_index().names) + df.index.names + tuple(df._data.to_pandas_index.names) + df.index.names ) return res else: diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index bfe716f0afc..4bd5a1e7040 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -9,7 +9,7 @@ import cupy as cp import llvmlite.binding as ll import numpy as np -from cuda import cudart +from cuda.bindings import runtime from numba import cuda, typeof from numba.core.datamodel import default_manager, models from numba.core.errors import TypingError @@ -356,8 +356,8 @@ def set_malloc_heap_size(size=None): if size is None: size = _STRINGS_UDF_DEFAULT_HEAP_SIZE if size != _heap_size: - (ret,) = cudart.cudaDeviceSetLimit( - cudart.cudaLimit.cudaLimitMallocHeapSize, size + (ret,) = runtime.cudaDeviceSetLimit( + runtime.cudaLimit.cudaLimitMallocHeapSize, size ) if ret.value != 0: raise RuntimeError("Unable to set cudaMalloc heap size") diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index a580c35ccbf..2f8a6d9e5e7 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -315,7 +315,7 @@ def _apply_agg_column(self, source_column, agg_name): {"dtype": source_column.dtype} if callable(agg_name) else self.agg_params, - ).c_obj, + ).plc_obj, ) ) diff --git a/python/cudf/cudf/io/avro.py b/python/cudf/cudf/io/avro.py index 4966cdb86e1..dcbdd4423fc 100644 --- a/python/cudf/cudf/io/avro.py +++ b/python/cudf/cudf/io/avro.py @@ -3,7 +3,7 @@ import pylibcudf as plc import cudf -from cudf._lib.utils import data_from_pylibcudf_io +from cudf._lib.column import Column from cudf.utils import ioutils @@ -46,5 +46,12 @@ def read_avro( options.set_columns(columns) plc_result = plc.io.avro.read_avro(options) - - return 
cudf.DataFrame._from_data(*data_from_pylibcudf_io(plc_result)) + data = { + name: Column.from_pylibcudf(col) + for name, col in zip( + plc_result.column_names(include_children=False), + plc_result.columns, + strict=True, + ) + } + return cudf.DataFrame._from_data(data) diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index da9a66f3874..6d617cbf38e 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -15,8 +15,8 @@ import pylibcudf as plc import cudf +from cudf._lib.column import Column from cudf._lib.types import dtype_to_pylibcudf_type -from cudf._lib.utils import data_from_pylibcudf_io from cudf.api.types import is_hashable, is_scalar from cudf.core.buffer import acquire_spill_lock from cudf.utils import ioutils @@ -251,9 +251,17 @@ def read_csv( if na_values is not None: options.set_na_values([str(val) for val in na_values]) - df = cudf.DataFrame._from_data( - *data_from_pylibcudf_io(plc.io.csv.read_csv(options)) - ) + table_w_meta = plc.io.csv.read_csv(options) + data = { + name: Column.from_pylibcudf(col) + for name, col in zip( + table_w_meta.column_names(include_children=False), + table_w_meta.columns, + strict=True, + ) + } + + df = cudf.DataFrame._from_data(data) if isinstance(dtype, abc.Mapping): for k, v in dtype.items(): diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index 4f0709ec985..ff326e09315 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -15,7 +15,6 @@ import cudf from cudf._lib.column import Column from cudf._lib.types import dtype_to_pylibcudf_type -from cudf._lib.utils import _data_from_columns, data_from_pylibcudf_io from cudf.core.buffer import acquire_spill_lock from cudf.utils import ioutils from cudf.utils.dtypes import _maybe_convert_to_default_type @@ -54,6 +53,22 @@ def _get_cudf_schema_element_from_dtype( return lib_type, child_types +def _to_plc_compression( + compression: Literal["infer", "gzip", "bz2", "zip", "xz", None], +) -> 
plc.io.types.CompressionType: + if compression is not None: + if compression == "gzip": + return plc.io.types.CompressionType.GZIP + elif compression == "bz2": + return plc.io.types.CompressionType.BZIP2 + elif compression == "zip": + return plc.io.types.CompressionType.ZIP + else: + return plc.io.types.CompressionType.AUTO + else: + return plc.io.types.CompressionType.NONE + + @ioutils.doc_read_json() def read_json( path_or_buf, @@ -115,17 +130,7 @@ def read_json( if isinstance(source, str) and not os.path.isfile(source): filepaths_or_buffers[idx] = source.encode() - if compression is not None: - if compression == "gzip": - c_compression = plc.io.types.CompressionType.GZIP - elif compression == "bz2": - c_compression = plc.io.types.CompressionType.BZIP2 - elif compression == "zip": - c_compression = plc.io.types.CompressionType.ZIP - else: - c_compression = plc.io.types.CompressionType.AUTO - else: - c_compression = plc.io.types.CompressionType.NONE + c_compression = _to_plc_compression(compression) if on_bad_lines.lower() == "error": c_on_bad_lines = plc.io.types.JSONRecoveryMode.FAIL @@ -161,44 +166,53 @@ def read_json( if cudf.get_option("io.json.low_memory") and lines: res_cols, res_col_names, res_child_names = ( plc.io.json.chunked_read_json( + plc.io.json._setup_json_reader_options( + plc.io.SourceInfo(filepaths_or_buffers), + processed_dtypes, + c_compression, + keep_quotes=keep_quotes, + mixed_types_as_string=mixed_types_as_string, + prune_columns=prune_columns, + recovery_mode=c_on_bad_lines, + ) + ) + ) + data = { + name: Column.from_pylibcudf(col) + for name, col in zip(res_col_names, res_cols, strict=True) + } + df = cudf.DataFrame._from_data(data) + ioutils._add_df_col_struct_names(df, res_child_names) + return df + else: + table_w_meta = plc.io.json.read_json( + plc.io.json._setup_json_reader_options( plc.io.SourceInfo(filepaths_or_buffers), processed_dtypes, c_compression, + lines, + byte_range_offset=byte_range[0] + if byte_range is not None + else 
0, + byte_range_size=byte_range[1] + if byte_range is not None + else 0, keep_quotes=keep_quotes, mixed_types_as_string=mixed_types_as_string, prune_columns=prune_columns, recovery_mode=c_on_bad_lines, + extra_parameters=kwargs, ) ) - df = cudf.DataFrame._from_data( - *_data_from_columns( - columns=[Column.from_pylibcudf(col) for col in res_cols], - column_names=res_col_names, - index_names=None, + data = { + name: Column.from_pylibcudf(col) + for name, col in zip( + table_w_meta.column_names(include_children=False), + table_w_meta.columns, + strict=True, ) - ) - ioutils._add_df_col_struct_names(df, res_child_names) - return df - else: - table_w_meta = plc.io.json.read_json( - plc.io.SourceInfo(filepaths_or_buffers), - processed_dtypes, - c_compression, - lines, - byte_range_offset=byte_range[0] - if byte_range is not None - else 0, - byte_range_size=byte_range[1] if byte_range is not None else 0, - keep_quotes=keep_quotes, - mixed_types_as_string=mixed_types_as_string, - prune_columns=prune_columns, - recovery_mode=c_on_bad_lines, - extra_parameters=kwargs, - ) - - df = cudf.DataFrame._from_data( - *data_from_pylibcudf_io(table_w_meta) - ) + } + df = cudf.DataFrame._from_data(data) # Post-processing to add in struct column names ioutils._add_df_col_struct_names(df, table_w_meta.child_names) @@ -285,23 +299,29 @@ def _plc_write_json( include_nulls: bool = True, lines: bool = False, rows_per_chunk: int = 1024 * 64, # 64K rows + compression: Literal["infer", "gzip", "bz2", "zip", "xz", None] = None, ) -> None: try: - plc.io.json.write_json( - plc.io.SinkInfo([path_or_buf]), - plc.io.TableWithMetadata( - plc.Table( - [col.to_pylibcudf(mode="read") for col in table._columns] - ), - colnames, + tbl_w_meta = plc.io.TableWithMetadata( + plc.Table( + [col.to_pylibcudf(mode="read") for col in table._columns] ), - na_rep, - include_nulls, - lines, - rows_per_chunk, - true_value="true", - false_value="false", + colnames, + ) + options = ( + 
plc.io.json.JsonWriterOptions.builder( + plc.io.SinkInfo([path_or_buf]), tbl_w_meta.tbl + ) + .metadata(tbl_w_meta) + .na_rep(na_rep) + .include_nulls(include_nulls) + .lines(lines) + .compression(_to_plc_compression(compression)) + .build() ) + if rows_per_chunk != np.iinfo(np.int32).max: + options.set_rows_per_chunk(rows_per_chunk) + plc.io.json.write_json(options) except OverflowError as err: raise OverflowError( f"Writing JSON file with rows_per_chunk={rows_per_chunk} failed. " diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index 5616413b7e4..f3124552fd1 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -10,10 +10,11 @@ import pylibcudf as plc import cudf +from cudf._lib.column import Column from cudf._lib.types import dtype_to_pylibcudf_type -from cudf._lib.utils import data_from_pylibcudf_io from cudf.api.types import is_list_like from cudf.core.buffer import acquire_spill_lock +from cudf.core.index import _index_from_data from cudf.utils import ioutils try: @@ -240,15 +241,27 @@ def read_orc( elif not isinstance(num_rows, int) or num_rows < -1: raise TypeError("num_rows must be an int >= -1") - tbl_w_meta = plc.io.orc.read_orc( - plc.io.SourceInfo(filepaths_or_buffers), - columns, - stripes, - skiprows, - num_rows, - use_index, - dtype_to_pylibcudf_type(cudf.dtype(timestamp_type)), + options = ( + plc.io.orc.OrcReaderOptions.builder( + plc.io.types.SourceInfo(filepaths_or_buffers) + ) + .use_index(use_index) + .build() ) + if num_rows >= 0: + options.set_num_rows(num_rows) + if skiprows >= 0: + options.set_skip_rows(skiprows) + if stripes is not None and len(stripes) > 0: + options.set_stripes(stripes) + if timestamp_type is not None: + options.set_timestamp_type( + dtype_to_pylibcudf_type(cudf.dtype(timestamp_type)) + ) + if columns is not None and len(columns) > 0: + options.set_columns(columns) + + tbl_w_meta = plc.io.orc.read_orc(options) if isinstance(columns, list) and len(columns) == 0: # When 
`columns=[]`, index needs to be @@ -311,11 +324,35 @@ def read_orc( actual_index_names = list(index_col_names.values()) col_names = names[len(actual_index_names) :] - data, index = data_from_pylibcudf_io( - tbl_w_meta, - col_names if columns is None else names, - actual_index_names, - ) + result_col_names = col_names if columns is None else names + if actual_index_names is None: + index = None + data = { + name: Column.from_pylibcudf(col) + for name, col in zip( + result_col_names, tbl_w_meta.columns, strict=True + ) + } + else: + result_columns = [ + Column.from_pylibcudf(col) for col in tbl_w_meta.columns + ] + index = _index_from_data( + dict( + zip( + actual_index_names, + result_columns[: len(actual_index_names)], + strict=True, + ) + ) + ) + data = dict( + zip( + result_col_names, + result_columns[len(actual_index_names) :], + strict=True, + ) + ) if is_range_index: index = range_idx diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index c13489630a3..feb6e12da8c 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -23,10 +23,6 @@ import cudf from cudf._lib.column import Column -from cudf._lib.utils import ( - _data_from_columns, - data_from_pylibcudf_io, -) from cudf.api.types import is_list_like from cudf.core.buffer import acquire_spill_lock from cudf.core.column import as_column, column_empty @@ -1238,16 +1234,11 @@ def _read_parquet( # Drop residual columns to save memory tbl._columns[i] = None - df = cudf.DataFrame._from_data( - *_data_from_columns( - columns=[ - Column.from_pylibcudf(plc) - for plc in concatenated_columns - ], - column_names=column_names, - index_names=None, - ) - ) + data = { + name: Column.from_pylibcudf(col) + for name, col in zip(column_names, concatenated_columns) + } + df = cudf.DataFrame._from_data(data) df = _process_metadata( df, column_names, @@ -1287,8 +1278,16 @@ def _read_parquet( options.set_filter(filters) tbl_w_meta = plc.io.parquet.read_parquet(options) + data = 
{ + name: Column.from_pylibcudf(col) + for name, col in zip( + tbl_w_meta.column_names(include_children=False), + tbl_w_meta.columns, + strict=True, + ) + } - df = cudf.DataFrame._from_data(*data_from_pylibcudf_io(tbl_w_meta)) + df = cudf.DataFrame._from_data(data) df = _process_metadata( df, diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index 0b09cf7dc34..a1df2c7d857 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -692,8 +692,8 @@ def assert_frame_equal( ) pd.testing.assert_index_equal( - left._data.to_pandas_index(), - right._data.to_pandas_index(), + left._data.to_pandas_index, + right._data.to_pandas_index, exact=check_column_type, check_names=check_names, check_exact=check_exact, diff --git a/python/cudf/cudf/tests/test_column_accessor.py b/python/cudf/cudf/tests/test_column_accessor.py index 5cef077c18d..27ec4fcd1f3 100644 --- a/python/cudf/cudf/tests/test_column_accessor.py +++ b/python/cudf/cudf/tests/test_column_accessor.py @@ -64,7 +64,7 @@ def test_to_pandas_simple(simple_data): # Index([], dtype='object'), and `integer` for RangeIndex() # to ignore this `inferred_type` comparison, we pass exact=False. 
assert_eq( - ca.to_pandas_index(), + ca.to_pandas_index, pd.DataFrame( {key: value.values_host for key, value in simple_data.items()} ).columns, @@ -75,7 +75,7 @@ def test_to_pandas_simple(simple_data): def test_to_pandas_multiindex(mi_data): ca = ColumnAccessor(mi_data, multiindex=True) assert_eq( - ca.to_pandas_index(), + ca.to_pandas_index, pd.DataFrame( {key: value.values_host for key, value in mi_data.items()} ).columns, @@ -89,7 +89,7 @@ def test_to_pandas_multiindex_names(): level_names=("foo", "bar"), ) assert_eq( - ca.to_pandas_index(), + ca.to_pandas_index, pd.MultiIndex.from_tuples( (("a", "b"), ("c", "d")), names=("foo", "bar") ), diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 51de33576c0..45b39713038 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -11224,3 +11224,32 @@ def test_dataframe_init_column(): expect = cudf.DataFrame({"a": s}) actual = cudf.DataFrame._from_arrays(s._column, columns=["a"]) assert_eq(expect, actual) + + +@pytest.mark.parametrize("name", [None, "foo", 1, 1.0]) +def test_dataframe_column_name(name): + df = cudf.DataFrame({"a": [1, 2, 3]}) + pdf = df.to_pandas() + + df.columns.name = name + pdf.columns.name = name + + assert_eq(df, pdf) + assert_eq(df.columns.name, pdf.columns.name) + + +@pytest.mark.parametrize("names", [["abc", "def"], [1, 2], ["abc", 10]]) +def test_dataframe_multiindex_column_names(names): + arrays = [["A", "A", "B", "B"], ["one", "two", "one", "two"]] + tuples = list(zip(*arrays)) + index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"]) + + pdf = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=index) + df = cudf.from_pandas(pdf) + + assert_eq(df, pdf) + assert_eq(df.columns.names, pdf.columns.names) + pdf.columns.names = names + df.columns.names = names + assert_eq(df, pdf) + assert_eq(df.columns.names, pdf.columns.names) diff --git a/python/cudf/cudf/tests/test_groupby.py 
b/python/cudf/cudf/tests/test_groupby.py index d8a2528230e..db4f3cd3c9f 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -3960,8 +3960,8 @@ def test_group_by_value_counts_with_count_column(): def test_groupby_internal_groups_empty(gdf): # test that we don't segfault when calling the internal # .groups() method with an empty list: - gb = gdf.groupby("y")._groupby - _, _, grouped_vals = gb.groups([]) + gb = gdf.groupby("y") + _, _, grouped_vals = gb._groups([]) assert grouped_vals == [] diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index aaa8d7d07ee..db34329261f 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -1453,3 +1453,12 @@ def test_chunked_json_reader(): with cudf.option_context("io.json.low_memory", True): gdf = cudf.read_json(buf, lines=True) assert_eq(df, gdf) + + +@pytest.mark.parametrize("compression", ["gzip", None]) +def test_roundtrip_compression(compression, tmp_path): + expected = cudf.DataFrame({"a": 1, "b": "2"}) + fle = BytesIO() + expected.to_json(fle, engine="cudf", compression=compression) + result = cudf.read_json(fle, engine="cudf", compression=compression) + assert_eq(result, expected) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 57bf08e6eec..ca8f9cac2d0 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -151,7 +151,7 @@ def cudf_dtype_from_pydata_dtype(dtype): return cudf.core.dtypes.Decimal64Dtype elif cudf.api.types.is_decimal128_dtype(dtype): return cudf.core.dtypes.Decimal128Dtype - elif dtype in cudf._lib.types.SUPPORTED_NUMPY_TO_LIBCUDF_TYPES: + elif dtype in cudf._lib.types.SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES: return dtype.type return infer_dtype_from_object(dtype) diff --git a/python/cudf/cudf/utils/gpu_utils.py b/python/cudf/cudf/utils/gpu_utils.py index b5387ddeb5f..10ab3f6bb1e 100644 --- 
a/python/cudf/cudf/utils/gpu_utils.py +++ b/python/cudf/cudf/utils/gpu_utils.py @@ -15,7 +15,7 @@ def validate_setup(): import warnings - from cuda.cudart import cudaDeviceAttr, cudaError_t + from cuda.bindings.runtime import cudaDeviceAttr, cudaError_t from rmm._cuda.gpu import ( CUDARuntimeError, diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_xgboost.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_xgboost.py index 0fd632507a6..ba98273404d 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_xgboost.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_xgboost.py @@ -113,9 +113,6 @@ def test_with_external_memory( return predt -@pytest.mark.skip( - reason="TypeError: Implicit conversion to a NumPy array is not allowed. Please use `.get()` to construct a NumPy array explicitly." -) @pytest.mark.parametrize("device", ["cpu", "cuda"]) def test_predict(device: str) -> np.ndarray: reg = xgb.XGBRegressor(n_estimators=2, device=device) @@ -127,6 +124,11 @@ def test_predict(device: str) -> np.ndarray: predt0 = reg.predict(X_df) predt1 = booster.inplace_predict(X_df) + # After https://github.com/dmlc/xgboost/pull/11014, .inplace_predict() + # returns a real cupy array when called on a cudf.pandas proxy dataframe. + # So we need to ensure we have a valid numpy array. + if not isinstance(predt1, np.ndarray): + predt1 = predt1.get() np.testing.assert_allclose(predt0, predt1) predt2 = booster.predict(xgb.DMatrix(X_df)) diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 21c18ef0174..2fdf6b34b8f 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -98,8 +98,6 @@ filterwarnings = [ "error", "ignore:::.*xdist.*", "ignore:::.*pytest.*", - # https://github.com/rapidsai/build-planning/issues/116 - "ignore:.*cuda..* module is deprecated.*:DeprecationWarning", # some third-party dependencies (e.g. 
'boto3') still using datetime.datetime.utcnow() "ignore:.*datetime.*utcnow.*scheduled for removal.*:DeprecationWarning:botocore", # Deprecation warning from Pyarrow Table.to_pandas() with pandas-2.2+ diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index 29fcd161444..a9d937435e9 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -60,8 +60,6 @@ addopts = "--tb=native --strict-config --strict-markers" empty_parameter_set_mark = "fail_at_collect" filterwarnings = [ "error", - # https://github.com/rapidsai/build-planning/issues/116 - "ignore:.*cuda..* module is deprecated.*:DeprecationWarning", ] xfail_strict = true diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py index 29d3dc4ae79..074096446fd 100644 --- a/python/cudf_polars/cudf_polars/callback.py +++ b/python/cudf_polars/cudf_polars/callback.py @@ -231,7 +231,8 @@ def validate_config_options(config: dict) -> None: executor = config.get("executor", "pylibcudf") if executor == "dask-experimental": unsupported = config.get("executor_options", {}).keys() - { - "max_rows_per_partition" + "max_rows_per_partition", + "parquet_blocksize", } else: unsupported = config.get("executor_options", {}).keys() diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py index 624a9bd87ea..2ba483c7b2d 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py @@ -40,6 +40,7 @@ def __init__( self.dtype = dtype self.name = name self.options = options + self.is_pointwise = False self.children = children if name not in Agg._SUPPORTED: raise NotImplementedError( diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/base.py b/python/cudf_polars/cudf_polars/dsl/expressions/base.py index 4c7ae007070..8ba3f9f407c 100644 --- 
a/python/cudf_polars/cudf_polars/dsl/expressions/base.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/base.py @@ -36,9 +36,11 @@ class ExecutionContext(IntEnum): class Expr(Node["Expr"]): """An abstract expression object.""" - __slots__ = ("dtype",) + __slots__ = ("dtype", "is_pointwise") dtype: plc.DataType """Data type of the expression.""" + is_pointwise: bool + """Whether this expression acts pointwise on its inputs.""" # This annotation is needed because of https://github.com/python/mypy/issues/17981 _non_child: ClassVar[tuple[str, ...]] = ("dtype",) """Names of non-child data (not Exprs) for reconstruction.""" @@ -164,6 +166,7 @@ def __init__(self, dtype: plc.DataType, error: str) -> None: self.dtype = dtype self.error = error self.children = () + self.is_pointwise = True class NamedExpr: @@ -243,6 +246,7 @@ class Col(Expr): def __init__(self, dtype: plc.DataType, name: str) -> None: self.dtype = dtype self.name = name + self.is_pointwise = True self.children = () def do_evaluate( @@ -280,6 +284,7 @@ def __init__( self.dtype = dtype self.index = index self.table_ref = table_ref + self.is_pointwise = True self.children = (column,) def do_evaluate( diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py b/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py index 245bdbefe88..556847b4738 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py @@ -42,6 +42,7 @@ def __init__( op = BinOp._BOOL_KLEENE_MAPPING.get(op, op) self.op = op self.children = (left, right) + self.is_pointwise = True if not plc.binaryop.is_supported_operation( self.dtype, left.dtype, right.dtype, op ): diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py b/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py index 5aa35ead127..d5ca22dd8d5 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py +++ 
b/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py @@ -81,6 +81,14 @@ def __init__( self.options = options self.name = name self.children = children + self.is_pointwise = self.name not in ( + BooleanFunction.Name.All, + BooleanFunction.Name.Any, + BooleanFunction.Name.IsDuplicated, + BooleanFunction.Name.IsFirstDistinct, + BooleanFunction.Name.IsLastDistinct, + BooleanFunction.Name.IsUnique, + ) if self.name is BooleanFunction.Name.IsIn and not all( c.dtype == self.children[0].dtype for c in self.children ): diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py index c2dddfd9940..0c3159c73d6 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py @@ -114,6 +114,7 @@ def __init__( self.options = options self.name = name self.children = children + self.is_pointwise = True if self.name not in self._COMPONENT_MAP: raise NotImplementedError(f"Temporal function {self.name}") diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/literal.py b/python/cudf_polars/cudf_polars/dsl/expressions/literal.py index 7eba0c110ab..8528e66c69c 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/literal.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/literal.py @@ -38,6 +38,7 @@ def __init__(self, dtype: plc.DataType, value: pa.Scalar[Any]) -> None: assert value.type == plc.interop.to_arrow(dtype) self.value = value self.children = () + self.is_pointwise = True def do_evaluate( self, @@ -65,6 +66,7 @@ def __init__(self, dtype: plc.DataType, value: pl.Series) -> None: data = value.to_arrow() self.value = data.cast(dtypes.downcast_arrow_lists(data.type)) self.children = () + self.is_pointwise = True def get_hashable(self) -> Hashable: """Compute a hash of the column.""" diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py b/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py 
index 48c37d101f4..d4616d5d00a 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py @@ -24,6 +24,7 @@ def __init__(self, dtype: plc.DataType, options: Any, agg: Expr) -> None: self.dtype = dtype self.options = options self.children = (agg,) + self.is_pointwise = False raise NotImplementedError("Rolling window not implemented") @@ -35,4 +36,5 @@ def __init__(self, dtype: plc.DataType, options: Any, agg: Expr, *by: Expr) -> N self.dtype = dtype self.options = options self.children = (agg, *by) + self.is_pointwise = False raise NotImplementedError("Grouped rolling window not implemented") diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/selection.py b/python/cudf_polars/cudf_polars/dsl/expressions/selection.py index 12326740f74..93ecd026eaf 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/selection.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/selection.py @@ -30,6 +30,7 @@ class Gather(Expr): def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr) -> None: self.dtype = dtype self.children = (values, indices) + self.is_pointwise = False def do_evaluate( self, @@ -71,6 +72,7 @@ class Filter(Expr): def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr): self.dtype = dtype self.children = (values, indices) + self.is_pointwise = True def do_evaluate( self, diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/sorting.py b/python/cudf_polars/cudf_polars/dsl/expressions/sorting.py index 99512e2ef52..189f109e1a2 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/sorting.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/sorting.py @@ -32,6 +32,7 @@ def __init__( self.dtype = dtype self.options = options self.children = (column,) + self.is_pointwise = False def do_evaluate( self, @@ -71,6 +72,7 @@ def __init__( self.dtype = dtype self.options = options self.children = (column, *by) + self.is_pointwise = False def 
do_evaluate( self, diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/string.py b/python/cudf_polars/cudf_polars/dsl/expressions/string.py index 124a6e8d71c..256840c1f3d 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/string.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/string.py @@ -106,6 +106,7 @@ def __init__( self.options = options self.name = name self.children = children + self.is_pointwise = True self._validate_input() def _validate_input(self): diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py b/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py index d2b5d6bae29..120ca8edce0 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py @@ -34,6 +34,7 @@ def __init__( ) -> None: self.dtype = dtype self.children = (when, then, otherwise) + self.is_pointwise = True def do_evaluate( self, diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py index 10caaff6811..3336c901e7f 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py @@ -33,6 +33,7 @@ class Cast(Expr): def __init__(self, dtype: plc.DataType, value: Expr) -> None: self.dtype = dtype self.children = (value,) + self.is_pointwise = True if not dtypes.can_cast(value.dtype, self.dtype): raise NotImplementedError( f"Can't cast {value.dtype.id().name} to {self.dtype.id().name}" @@ -63,6 +64,7 @@ class Len(Expr): def __init__(self, dtype: plc.DataType) -> None: self.dtype = dtype self.children = () + self.is_pointwise = False def do_evaluate( self, @@ -147,6 +149,14 @@ def __init__( self.name = name self.options = options self.children = children + self.is_pointwise = self.name not in ( + "cum_min", + "cum_max", + "cum_prod", + "cum_sum", + "drop_nulls", + "unique", + ) if self.name not in UnaryFunction._supported_fns: raise 
NotImplementedError(f"Unary function {name=}") diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index b5af3bb80bf..1c1d4860eec 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -604,10 +604,12 @@ def slice_skip(tbl: plc.Table): (name, typ, []) for name, typ in schema.items() ] plc_tbl_w_meta = plc.io.json.read_json( - plc.io.SourceInfo(paths), - lines=True, - dtypes=json_schema, - prune_columns=True, + plc.io.json._setup_json_reader_options( + plc.io.SourceInfo(paths), + lines=True, + dtypes=json_schema, + prune_columns=True, + ) ) # TODO: I don't think cudf-polars supports nested types in general right now # (but when it does, we should pass child column names from nested columns in) diff --git a/python/cudf_polars/cudf_polars/dsl/traversal.py b/python/cudf_polars/cudf_polars/dsl/traversal.py index b3248dae93c..9c45a68812a 100644 --- a/python/cudf_polars/cudf_polars/dsl/traversal.py +++ b/python/cudf_polars/cudf_polars/dsl/traversal.py @@ -10,7 +10,7 @@ from cudf_polars.typing import U_contra, V_co if TYPE_CHECKING: - from collections.abc import Callable, Generator, Mapping, MutableMapping + from collections.abc import Callable, Generator, Mapping, MutableMapping, Sequence from cudf_polars.typing import GenericTransformer, NodeT @@ -23,22 +23,22 @@ ] -def traversal(node: NodeT) -> Generator[NodeT, None, None]: +def traversal(nodes: Sequence[NodeT]) -> Generator[NodeT, None, None]: """ Pre-order traversal of nodes in an expression. Parameters ---------- - node - Root of expression to traverse. + nodes + Roots of expressions to traverse. Yields ------ - Unique nodes in the expression, parent before child, children + Unique nodes in the expressions, parent before child, children in-order from left to right. 
""" - seen = {node} - lifo = [node] + seen = set(nodes) + lifo = list(nodes) while lifo: node = lifo.pop() diff --git a/python/cudf_polars/cudf_polars/experimental/io.py b/python/cudf_polars/cudf_polars/experimental/io.py index 3a1fec36079..2a5b400af4c 100644 --- a/python/cudf_polars/cudf_polars/experimental/io.py +++ b/python/cudf_polars/cudf_polars/experimental/io.py @@ -4,18 +4,24 @@ from __future__ import annotations +import enum import math -from typing import TYPE_CHECKING +import random +from enum import IntEnum +from typing import TYPE_CHECKING, Any -from cudf_polars.dsl.ir import DataFrameScan, Union +import pylibcudf as plc + +from cudf_polars.dsl.ir import IR, DataFrameScan, Scan, Union from cudf_polars.experimental.base import PartitionInfo from cudf_polars.experimental.dispatch import lower_ir_node if TYPE_CHECKING: from collections.abc import MutableMapping - from cudf_polars.dsl.ir import IR + from cudf_polars.dsl.expr import NamedExpr from cudf_polars.experimental.dispatch import LowerIRTransformer + from cudf_polars.typing import Schema @lower_ir_node.register(DataFrameScan) @@ -47,3 +53,274 @@ def _( } return ir, {ir: PartitionInfo(count=1)} + + +class ScanPartitionFlavor(IntEnum): + """Flavor of Scan partitioning.""" + + SINGLE_FILE = enum.auto() # 1:1 mapping between files and partitions + SPLIT_FILES = enum.auto() # Split each file into >1 partition + FUSED_FILES = enum.auto() # Fuse multiple files into each partition + + +class ScanPartitionPlan: + """ + Scan partitioning plan. + + Notes + ----- + The meaning of `factor` depends on the value of `flavor`: + - SINGLE_FILE: `factor` must be `1`. + - SPLIT_FILES: `factor` is the number of partitions per file. + - FUSED_FILES: `factor` is the number of files per partition. 
+ """ + + __slots__ = ("factor", "flavor") + factor: int + flavor: ScanPartitionFlavor + + def __init__(self, factor: int, flavor: ScanPartitionFlavor) -> None: + if ( + flavor == ScanPartitionFlavor.SINGLE_FILE and factor != 1 + ): # pragma: no cover + raise ValueError(f"Expected factor == 1 for {flavor}, got: {factor}") + self.factor = factor + self.flavor = flavor + + @staticmethod + def from_scan(ir: Scan) -> ScanPartitionPlan: + """Extract the partitioning plan of a Scan operation.""" + if ir.typ == "parquet": + # TODO: Use system info to set default blocksize + parallel_options = ir.config_options.get("executor_options", {}) + blocksize: int = parallel_options.get("parquet_blocksize", 1024**3) + stats = _sample_pq_statistics(ir) + file_size = sum(float(stats[column]) for column in ir.schema) + if file_size > 0: + if file_size > blocksize: + # Split large files + return ScanPartitionPlan( + math.ceil(file_size / blocksize), + ScanPartitionFlavor.SPLIT_FILES, + ) + else: + # Fuse small files + return ScanPartitionPlan( + max(blocksize // int(file_size), 1), + ScanPartitionFlavor.FUSED_FILES, + ) + + # TODO: Use file sizes for csv and json + return ScanPartitionPlan(1, ScanPartitionFlavor.SINGLE_FILE) + + +class SplitScan(IR): + """ + Input from a split file. + + This class wraps a single-file `Scan` object. At + IO/evaluation time, this class will only perform + a partial read of the underlying file. The range + (skip_rows and n_rows) is calculated at IO time. 
+ """ + + __slots__ = ( + "base_scan", + "schema", + "split_index", + "total_splits", + ) + _non_child = ( + "schema", + "base_scan", + "split_index", + "total_splits", + ) + base_scan: Scan + """Scan operation this node is based on.""" + split_index: int + """Index of the current split.""" + total_splits: int + """Total number of splits.""" + + def __init__( + self, schema: Schema, base_scan: Scan, split_index: int, total_splits: int + ): + self.schema = schema + self.base_scan = base_scan + self.split_index = split_index + self.total_splits = total_splits + self._non_child_args = ( + split_index, + total_splits, + *base_scan._non_child_args, + ) + self.children = () + if base_scan.typ not in ("parquet",): # pragma: no cover + raise NotImplementedError( + f"Unhandled Scan type for file splitting: {base_scan.typ}" + ) + + @classmethod + def do_evaluate( + cls, + split_index: int, + total_splits: int, + schema: Schema, + typ: str, + reader_options: dict[str, Any], + config_options: dict[str, Any], + paths: list[str], + with_columns: list[str] | None, + skip_rows: int, + n_rows: int, + row_index: tuple[str, int] | None, + predicate: NamedExpr | None, + ): + """Evaluate and return a dataframe.""" + if typ not in ("parquet",): # pragma: no cover + raise NotImplementedError(f"Unhandled Scan type for file splitting: {typ}") + + if len(paths) > 1: # pragma: no cover + raise ValueError(f"Expected a single path, got: {paths}") + + # Parquet logic: + # - We are one of "total_splits" SplitScan nodes + # assigned to the same file. + # - We know our index within this file ("split_index") + # - We can also use parquet metadata to query the + # total number of rows in each row-group of the file. + # - We can use all this information to calculate the + # "skip_rows" and "n_rows" options to use locally. 
+ + rowgroup_metadata = plc.io.parquet_metadata.read_parquet_metadata( + plc.io.SourceInfo(paths) + ).rowgroup_metadata() + total_row_groups = len(rowgroup_metadata) + if total_splits <= total_row_groups: + # We have enough row-groups in the file to align + # all "total_splits" of our reads with row-group + # boundaries. Calculate which row-groups to include + # in the current read, and use metadata to translate + # the row-group indices to "skip_rows" and "n_rows". + rg_stride = total_row_groups // total_splits + skip_rgs = rg_stride * split_index + skip_rows = sum(rg["num_rows"] for rg in rowgroup_metadata[:skip_rgs]) + n_rows = sum( + rg["num_rows"] + for rg in rowgroup_metadata[skip_rgs : skip_rgs + rg_stride] + ) + else: + # There are not enough row-groups to align + # all "total_splits" of our reads with row-group + # boundaries. Use metadata to directly calculate + # "skip_rows" and "n_rows" for the current read. + total_rows = sum(rg["num_rows"] for rg in rowgroup_metadata) + n_rows = total_rows // total_splits + skip_rows = n_rows * split_index + + # Last split should always read to end of file + if split_index == (total_splits - 1): + n_rows = -1 + + # Perform the partial read + return Scan.do_evaluate( + schema, + typ, + reader_options, + config_options, + paths, + with_columns, + skip_rows, + n_rows, + row_index, + predicate, + ) + + +def _sample_pq_statistics(ir: Scan) -> dict[str, float]: + import numpy as np + import pyarrow.dataset as pa_ds + + # Use average total_uncompressed_size of three files + # TODO: Use plc.io.parquet_metadata.read_parquet_metadata + n_sample = 3 + column_sizes = {} + ds = pa_ds.dataset(random.sample(ir.paths, n_sample), format="parquet") + for i, frag in enumerate(ds.get_fragments()): + md = frag.metadata + for rg in range(md.num_row_groups): + row_group = md.row_group(rg) + for col in range(row_group.num_columns): + column = row_group.column(col) + name = column.path_in_schema + if name not in column_sizes: + 
column_sizes[name] = np.zeros(n_sample, dtype="int64") + column_sizes[name][i] += column.total_uncompressed_size + + return {name: np.mean(sizes) for name, sizes in column_sizes.items()} + + +@lower_ir_node.register(Scan) +def _( + ir: Scan, rec: LowerIRTransformer +) -> tuple[IR, MutableMapping[IR, PartitionInfo]]: + partition_info: MutableMapping[IR, PartitionInfo] + if ir.typ in ("csv", "parquet", "ndjson") and ir.n_rows == -1 and ir.skip_rows == 0: + plan = ScanPartitionPlan.from_scan(ir) + paths = list(ir.paths) + if plan.flavor == ScanPartitionFlavor.SPLIT_FILES: + # Disable chunked reader when splitting files + config_options = ir.config_options.copy() + config_options["parquet_options"] = config_options.get( + "parquet_options", {} + ).copy() + config_options["parquet_options"]["chunked"] = False + + slices: list[SplitScan] = [] + for path in paths: + base_scan = Scan( + ir.schema, + ir.typ, + ir.reader_options, + ir.cloud_options, + config_options, + [path], + ir.with_columns, + ir.skip_rows, + ir.n_rows, + ir.row_index, + ir.predicate, + ) + slices.extend( + SplitScan(ir.schema, base_scan, sindex, plan.factor) + for sindex in range(plan.factor) + ) + new_node = Union(ir.schema, None, *slices) + partition_info = {slice: PartitionInfo(count=1) for slice in slices} | { + new_node: PartitionInfo(count=len(slices)) + } + else: + groups: list[Scan] = [ + Scan( + ir.schema, + ir.typ, + ir.reader_options, + ir.cloud_options, + ir.config_options, + paths[i : i + plan.factor], + ir.with_columns, + ir.skip_rows, + ir.n_rows, + ir.row_index, + ir.predicate, + ) + for i in range(0, len(paths), plan.factor) + ] + new_node = Union(ir.schema, None, *groups) + partition_info = {group: PartitionInfo(count=1) for group in groups} | { + new_node: PartitionInfo(count=len(groups)) + } + return new_node, partition_info + + return ir, {ir: PartitionInfo(count=1)} # pragma: no cover diff --git a/python/cudf_polars/cudf_polars/experimental/parallel.py 
b/python/cudf_polars/cudf_polars/experimental/parallel.py index e5884f1c574..6843ed9ee2e 100644 --- a/python/cudf_polars/cudf_polars/experimental/parallel.py +++ b/python/cudf_polars/cudf_polars/experimental/parallel.py @@ -9,8 +9,9 @@ from functools import reduce from typing import TYPE_CHECKING, Any -import cudf_polars.experimental.io # noqa: F401 -from cudf_polars.dsl.ir import IR, Cache, Projection, Union +import cudf_polars.experimental.io +import cudf_polars.experimental.select # noqa: F401 +from cudf_polars.dsl.ir import IR, Cache, Filter, HStack, Projection, Select, Union from cudf_polars.dsl.traversal import CachingVisitor, traversal from cudf_polars.experimental.base import PartitionInfo, _concat, get_key_name from cudf_polars.experimental.dispatch import ( @@ -112,7 +113,7 @@ def task_graph( """ graph = reduce( operator.or_, - (generate_ir_tasks(node, partition_info) for node in traversal(ir)), + (generate_ir_tasks(node, partition_info) for node in traversal([ir])), ) key_name = get_key_name(ir) @@ -226,6 +227,8 @@ def _lower_ir_pwise( lower_ir_node.register(Projection, _lower_ir_pwise) lower_ir_node.register(Cache, _lower_ir_pwise) +lower_ir_node.register(Filter, _lower_ir_pwise) +lower_ir_node.register(HStack, _lower_ir_pwise) def _generate_ir_tasks_pwise( @@ -245,3 +248,6 @@ def _generate_ir_tasks_pwise( generate_ir_tasks.register(Projection, _generate_ir_tasks_pwise) generate_ir_tasks.register(Cache, _generate_ir_tasks_pwise) +generate_ir_tasks.register(Filter, _generate_ir_tasks_pwise) +generate_ir_tasks.register(HStack, _generate_ir_tasks_pwise) +generate_ir_tasks.register(Select, _generate_ir_tasks_pwise) diff --git a/python/cudf_polars/cudf_polars/experimental/select.py b/python/cudf_polars/cudf_polars/experimental/select.py new file mode 100644 index 00000000000..5f79384b569 --- /dev/null +++ b/python/cudf_polars/cudf_polars/experimental/select.py @@ -0,0 +1,36 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +"""Parallel Select Logic.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +from cudf_polars.dsl.ir import Select +from cudf_polars.dsl.traversal import traversal +from cudf_polars.experimental.dispatch import lower_ir_node + +if TYPE_CHECKING: + from collections.abc import MutableMapping + + from cudf_polars.dsl.ir import IR + from cudf_polars.experimental.base import PartitionInfo + from cudf_polars.experimental.parallel import LowerIRTransformer + + +@lower_ir_node.register(Select) +def _( + ir: Select, rec: LowerIRTransformer +) -> tuple[IR, MutableMapping[IR, PartitionInfo]]: + child, partition_info = rec(ir.children[0]) + pi = partition_info[child] + if pi.count > 1 and not all( + expr.is_pointwise for expr in traversal([e.value for e in ir.exprs]) + ): + # TODO: Handle non-pointwise expressions. + raise NotImplementedError( + f"Selection {ir} does not support multiple partitions." + ) + new_node = ir.reconstruct([child]) + partition_info[new_node] = pi + return new_node, partition_info diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index b781b13ec10..5904942aea2 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -66,8 +66,6 @@ addopts = "--tb=native --strict-config --strict-markers" empty_parameter_set_mark = "fail_at_collect" filterwarnings = [ "error", - # https://github.com/rapidsai/build-planning/issues/116 - "ignore:.*cuda..* module is deprecated.*:DeprecationWarning", ] xfail_strict = true diff --git a/python/cudf_polars/tests/dsl/test_traversal.py b/python/cudf_polars/tests/dsl/test_traversal.py index 9755994c419..9fcca2e290e 100644 --- a/python/cudf_polars/tests/dsl/test_traversal.py +++ b/python/cudf_polars/tests/dsl/test_traversal.py @@ -32,21 +32,21 @@ def test_traversal_unique(): dt = plc.DataType(plc.TypeId.INT8) e1 = make_expr(dt, "a", "a") - unique_exprs = list(traversal(e1)) + unique_exprs = 
list(traversal([e1])) assert len(unique_exprs) == 2 assert set(unique_exprs) == {expr.Col(dt, "a"), e1} assert unique_exprs == [e1, expr.Col(dt, "a")] e2 = make_expr(dt, "a", "b") - unique_exprs = list(traversal(e2)) + unique_exprs = list(traversal([e2])) assert len(unique_exprs) == 3 assert set(unique_exprs) == {expr.Col(dt, "a"), expr.Col(dt, "b"), e2} assert unique_exprs == [e2, expr.Col(dt, "a"), expr.Col(dt, "b")] e3 = make_expr(dt, "b", "a") - unique_exprs = list(traversal(e3)) + unique_exprs = list(traversal([e3])) assert len(unique_exprs) == 3 assert set(unique_exprs) == {expr.Col(dt, "a"), expr.Col(dt, "b"), e3} diff --git a/python/cudf_polars/tests/experimental/test_scan.py b/python/cudf_polars/tests/experimental/test_scan.py new file mode 100644 index 00000000000..a26d751dc86 --- /dev/null +++ b/python/cudf_polars/tests/experimental/test_scan.py @@ -0,0 +1,80 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars import Translator +from cudf_polars.experimental.parallel import lower_ir_graph +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.fixture(scope="module") +def df(): + return pl.DataFrame( + { + "x": range(3_000), + "y": ["cat", "dog", "fish"] * 1_000, + "z": [1.0, 2.0, 3.0, 4.0, 5.0] * 600, + } + ) + + +def make_source(df, path, fmt, n_files=3): + n_rows = len(df) + stride = int(n_rows / n_files) + for i in range(n_files): + offset = stride * i + part = df.slice(offset, stride) + if fmt == "csv": + part.write_csv(path / f"part.{i}.csv") + elif fmt == "ndjson": + part.write_ndjson(path / f"part.{i}.ndjson") + else: + part.write_parquet( + path / f"part.{i}.parquet", + row_group_size=int(stride / 2), + ) + + +@pytest.mark.parametrize( + "fmt, scan_fn", + [ + ("csv", pl.scan_csv), + ("ndjson", pl.scan_ndjson), + ("parquet", pl.scan_parquet), + ], +) +def 
test_parallel_scan(tmp_path, df, fmt, scan_fn): + make_source(df, tmp_path, fmt) + q = scan_fn(tmp_path) + engine = pl.GPUEngine( + raise_on_fail=True, + executor="dask-experimental", + ) + assert_gpu_result_equal(q, engine=engine) + + +@pytest.mark.parametrize("blocksize", [1_000, 10_000, 1_000_000]) +def test_parquet_blocksize(tmp_path, df, blocksize): + n_files = 3 + make_source(df, tmp_path, "parquet", n_files) + q = pl.scan_parquet(tmp_path) + engine = pl.GPUEngine( + raise_on_fail=True, + executor="dask-experimental", + executor_options={"parquet_blocksize": blocksize}, + ) + assert_gpu_result_equal(q, engine=engine) + + # Check partitioning + qir = Translator(q._ldf.visit(), engine).translate_ir() + ir, info = lower_ir_graph(qir) + count = info[ir].count + if blocksize <= 12_000: + assert count > n_files + else: + assert count < n_files diff --git a/python/cudf_polars/tests/experimental/test_select.py b/python/cudf_polars/tests/experimental/test_select.py new file mode 100644 index 00000000000..7dfe6ead148 --- /dev/null +++ b/python/cudf_polars/tests/experimental/test_select.py @@ -0,0 +1,54 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.fixture(scope="module") +def engine(): + return pl.GPUEngine( + raise_on_fail=True, + executor="dask-experimental", + executor_options={"max_rows_per_partition": 3}, + ) + + +@pytest.fixture(scope="module") +def df(): + return pl.LazyFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7], + "b": [1, 1, 1, 1, 1, 1, 1], + } + ) + + +def test_select(df, engine): + query = df.select( + pl.col("a") + pl.col("b"), (pl.col("a") * 2 + pl.col("b")).alias("d") + ) + assert_gpu_result_equal(query, engine=engine) + + +def test_select_reduce_raises(df, engine): + query = df.select( + (pl.col("a") + pl.col("b")).max(), + (pl.col("a") * 2 + pl.col("b")).alias("d").mean(), + ) + with pytest.raises( + pl.exceptions.ComputeError, + match="NotImplementedError", + ): + assert_gpu_result_equal(query, engine=engine) + + +def test_select_with_cse_no_agg(df, engine): + expr = pl.col("a") + pl.col("a") + query = df.select(expr, (expr * 2).alias("b"), ((expr * 2) + 10).alias("c")) + assert_gpu_result_equal(query, engine=engine) diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml index ed43ab83d53..7820157d89b 100644 --- a/python/custreamz/pyproject.toml +++ b/python/custreamz/pyproject.toml @@ -93,8 +93,6 @@ addopts = "--tb=native --strict-config --strict-markers" empty_parameter_set_mark = "fail_at_collect" filterwarnings = [ "error", - # https://github.com/rapidsai/build-planning/issues/116 - "ignore:.*cuda..* module is deprecated.*:DeprecationWarning", "ignore:unclosed