diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 78f529a44d3..9cbacee8e8d 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -276,7 +276,7 @@ rapids_cpm_init()

 # Not using rapids-cmake since we never want to find, always download.
 CPMAddPackage(
-  NAME rapids_logger GITHUB_REPOSITORY rapidsai/rapids-logger GIT_SHALLOW TRUE GIT_TAG
+  NAME rapids_logger GITHUB_REPOSITORY rapidsai/rapids-logger GIT_SHALLOW FALSE GIT_TAG
   c510947ae9d3a67530cfe3e5eaccb5a3b8ea0e55 VERSION c510947ae9d3a67530cfe3e5eaccb5a3b8ea0e55
 )
 rapids_make_logger(cudf EXPORT_SET cudf-exports)
@@ -916,7 +916,9 @@ if(CUDF_LARGE_STRINGS_DISABLED)
 endif()

 # Define logging level
-target_compile_definitions(cudf PRIVATE "CUDF_LOG_ACTIVE_LEVEL=${LIBCUDF_LOGGING_LEVEL}")
+target_compile_definitions(
+  cudf PRIVATE "CUDF_LOG_ACTIVE_LEVEL=CUDF_LOG_LEVEL_${LIBCUDF_LOGGING_LEVEL}"
+)

 # Enable remote IO through KvikIO
 target_compile_definitions(cudf PRIVATE $<$:CUDF_KVIKIO_REMOTE_IO>)
diff --git a/cpp/cmake/thirdparty/get_nanoarrow.cmake b/cpp/cmake/thirdparty/get_nanoarrow.cmake
index c440643037b..b0c48e04710 100644
--- a/cpp/cmake/thirdparty/get_nanoarrow.cmake
+++ b/cpp/cmake/thirdparty/get_nanoarrow.cmake
@@ -14,11 +14,6 @@

 # This function finds nanoarrow and sets any additional necessary environment variables.
 function(find_and_configure_nanoarrow)
-  include(${rapids-cmake-dir}/cpm/package_override.cmake)
-
-  set(cudf_patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches")
-  rapids_cpm_package_override("${cudf_patch_dir}/nanoarrow_override.json")
-
   if(NOT BUILD_SHARED_LIBS)
     set(_exclude_from_all EXCLUDE_FROM_ALL FALSE)
   else()
@@ -31,6 +26,9 @@ function(find_and_configure_nanoarrow)
     nanoarrow 0.6.0.dev
     GLOBAL_TARGETS nanoarrow
     CPM_ARGS
+    GIT_REPOSITORY https://github.com/apache/arrow-nanoarrow.git
+    GIT_TAG 1e2664a70ec14907409cadcceb14d79b9670bcdb
+    GIT_SHALLOW FALSE
     OPTIONS "BUILD_SHARED_LIBS OFF" "NANOARROW_NAMESPACE cudf" ${_exclude_from_all}
   )
   set_target_properties(nanoarrow PROPERTIES POSITION_INDEPENDENT_CODE ON)
diff --git a/cpp/cmake/thirdparty/patches/nanoarrow_clang_tidy_compliance.diff b/cpp/cmake/thirdparty/patches/nanoarrow_clang_tidy_compliance.diff
deleted file mode 100644
index e9a36fcb567..00000000000
--- a/cpp/cmake/thirdparty/patches/nanoarrow_clang_tidy_compliance.diff
+++ /dev/null
@@ -1,38 +0,0 @@
-diff --git a/src/nanoarrow/common/inline_buffer.h b/src/nanoarrow/common/inline_buffer.h
-index caa6be4..70ec8a2 100644
---- a/src/nanoarrow/common/inline_buffer.h
-+++ b/src/nanoarrow/common/inline_buffer.h
-@@ -347,7 +347,7 @@ static inline void _ArrowBitsUnpackInt32(const uint8_t word, int32_t* out) {
- }
-
- static inline void _ArrowBitmapPackInt8(const int8_t* values, uint8_t* out) {
--  *out = (uint8_t)(values[0] | ((values[1] + 0x1) & 0x2) | ((values[2] + 0x3) & 0x4) |
-+  *out = (uint8_t)(values[0] | ((values[1] + 0x1) & 0x2) | ((values[2] + 0x3) & 0x4) |  // NOLINT
-                    ((values[3] + 0x7) & 0x8) | ((values[4] + 0xf) & 0x10) |
-                    ((values[5] + 0x1f) & 0x20) | ((values[6] + 0x3f) & 0x40) |
-                    ((values[7] + 0x7f) & 0x80));
-@@ -471,13 +471,13 @@ static inline void ArrowBitsSetTo(uint8_t* bits, int64_t start_offset, int64_t l
-     // set bits within a single byte
-     const uint8_t only_byte_mask =
-         i_end % 8 == 0 ?
first_byte_mask : (uint8_t)(first_byte_mask | last_byte_mask); -- bits[bytes_begin] &= only_byte_mask; -+ bits[bytes_begin] &= only_byte_mask; // NOLINT - bits[bytes_begin] |= (uint8_t)(fill_byte & ~only_byte_mask); - return; - } - - // set/clear trailing bits of first byte -- bits[bytes_begin] &= first_byte_mask; -+ bits[bytes_begin] &= first_byte_mask; // NOLINT - bits[bytes_begin] |= (uint8_t)(fill_byte & ~first_byte_mask); - - if (bytes_end - bytes_begin > 2) { -@@ -637,7 +637,7 @@ static inline void ArrowBitmapAppendInt8Unsafe(struct ArrowBitmap* bitmap, - n_remaining -= n_full_bytes * 8; - if (n_remaining > 0) { - // Zero out the last byte -- *out_cursor = 0x00; -+ *out_cursor = 0x00; // NOLINT - for (int i = 0; i < n_remaining; i++) { - ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, values_cursor[i]); - } diff --git a/cpp/cmake/thirdparty/patches/nanoarrow_override.json b/cpp/cmake/thirdparty/patches/nanoarrow_override.json deleted file mode 100644 index d529787e7c8..00000000000 --- a/cpp/cmake/thirdparty/patches/nanoarrow_override.json +++ /dev/null @@ -1,18 +0,0 @@ - -{ - "packages" : { - "nanoarrow" : { - "version" : "0.6.0.dev", - "git_url" : "https://github.com/apache/arrow-nanoarrow.git", - "git_tag" : "1e2664a70ec14907409cadcceb14d79b9670bcdb", - "git_shallow" : false, - "patches" : [ - { - "file" : "${current_json_dir}/nanoarrow_clang_tidy_compliance.diff", - "issue" : "https://github.com/apache/arrow-nanoarrow/issues/537", - "fixed_in" : "" - } - ] - } - } -} diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index ea480b133dc..aacb5ccfede 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -444,7 +444,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { * @return string_view instance representing this element at this index */ template )> - __device__ [[nodiscard]] T element(size_type element_index) const noexcept + [[nodiscard]] __device__ T element(size_type element_index) const noexcept { size_type index = element_index + offset(); // account for this view's _offset char const* d_strings = static_cast(_data); @@ -503,7 +503,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { * @return dictionary32 instance representing this element at this index */ template )> - __device__ [[nodiscard]] T element(size_type element_index) const noexcept + [[nodiscard]] __device__ T element(size_type element_index) const noexcept { size_type index = element_index + offset(); // account for this view's _offset auto const indices = d_children[0]; @@ -521,7 +521,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { * @return numeric::fixed_point representing the element at this index */ template ())> - __device__ [[nodiscard]] T element(size_type element_index) const noexcept + [[nodiscard]] __device__ T element(size_type element_index) const noexcept { using namespace numeric; using rep = typename T::rep; @@ -1034,7 +1034,7 @@ class alignas(16) mutable_column_device_view : public detail::column_device_view * @return Reference to the element at the specified index */ template ())> - __device__ [[nodiscard]] T& element(size_type element_index) const noexcept + [[nodiscard]] __device__ T& element(size_type element_index) const noexcept { return data()[element_index]; } @@ -1427,13 +1427,13 @@ struct pair_rep_accessor { private: template , void>* = nullptr> - __device__ 
[[nodiscard]] inline auto get_rep(cudf::size_type i) const + [[nodiscard]] __device__ inline auto get_rep(cudf::size_type i) const { return col.element(i); } template , void>* = nullptr> - __device__ [[nodiscard]] inline auto get_rep(cudf::size_type i) const + [[nodiscard]] __device__ inline auto get_rep(cudf::size_type i) const { return col.element(i).value(); } diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh index c30c3d6f4bd..59011f7b138 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.cuh +++ b/cpp/include/cudf/detail/aggregation/aggregation.cuh @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -31,7 +32,6 @@ #include #include -#include namespace cudf { namespace detail { @@ -216,12 +216,12 @@ struct identity_initializer { * @throw cudf::logic_error if column type is not fixed-width * * @param table The table of columns to initialize. - * @param aggs A vector of aggregation operations corresponding to the table + * @param aggs A span of aggregation operations corresponding to the table * columns. The aggregations determine the identity value for each column. * @param stream CUDA stream used for device memory operations and kernel launches. */ void initialize_with_identity(mutable_table_view& table, - std::vector const& aggs, + host_span aggs, rmm::cuda_stream_view stream); } // namespace detail diff --git a/cpp/include/cudf/detail/device_scalar.hpp b/cpp/include/cudf/detail/device_scalar.hpp index 16ca06c6561..090dc8b62b6 100644 --- a/cpp/include/cudf/detail/device_scalar.hpp +++ b/cpp/include/cudf/detail/device_scalar.hpp @@ -78,7 +78,7 @@ class device_scalar : public rmm::device_scalar { [[nodiscard]] T value(rmm::cuda_stream_view stream) const { cuda_memcpy(bounce_buffer, device_span{this->data(), 1}, stream); - return bounce_buffer[0]; + return std::move(bounce_buffer[0]); } void set_value_async(T const& value, rmm::cuda_stream_view stream) diff --git a/cpp/include/cudf/strings/string_view.hpp b/cpp/include/cudf/strings/string_view.hpp index 504c31057ae..33f3176d2c6 100644 --- a/cpp/include/cudf/strings/string_view.hpp +++ b/cpp/include/cudf/strings/string_view.hpp @@ -54,7 +54,7 @@ class string_view { * * @return The number of characters in this string */ - __device__ [[nodiscard]] inline size_type length() const; + [[nodiscard]] __device__ inline size_type length() const; /** * @brief Return a pointer to the internal device array * @@ -119,13 +119,13 @@ class string_view { * * @return new iterator pointing to the beginning of this string */ - __device__ [[nodiscard]] inline const_iterator begin() const; + [[nodiscard]] __device__ inline const_iterator begin() const; /** * @brief Return new iterator pointing past the end of this string * * @return new iterator pointing past the end of this string */ - __device__ [[nodiscard]] inline const_iterator end() const; + [[nodiscard]] __device__ inline const_iterator end() const; /** * @brief Return single UTF-8 character at the given character position @@ -140,7 +140,7 @@ class string_view { * @param pos Character position * @return Byte offset from data() for a given character position */ - __device__ [[nodiscard]] inline size_type byte_offset(size_type pos) const; + [[nodiscard]] __device__ inline size_type byte_offset(size_type pos) const; /** * @brief Comparing target string with this string. 
Each character is compared @@ -155,7 +155,7 @@ class string_view { * not match is greater in the arg string, or all compared characters * match but the arg string is longer. */ - __device__ [[nodiscard]] inline int compare(string_view const& str) const; + [[nodiscard]] __device__ inline int compare(string_view const& str) const; /** * @brief Comparing target string with this string. Each character is compared * as a UTF-8 code-point value. @@ -225,7 +225,7 @@ class string_view { * Specify -1 to indicate to the end of the string. * @return npos if str is not found in this string. */ - __device__ [[nodiscard]] inline size_type find(string_view const& str, + [[nodiscard]] __device__ inline size_type find(string_view const& str, size_type pos = 0, size_type count = -1) const; /** @@ -253,7 +253,7 @@ class string_view { * Specify -1 to indicate to the end of the string. * @return npos if arg string is not found in this string. */ - __device__ [[nodiscard]] inline size_type find(char_utf8 character, + [[nodiscard]] __device__ inline size_type find(char_utf8 character, size_type pos = 0, size_type count = -1) const; /** @@ -266,7 +266,7 @@ class string_view { * Specify -1 to indicate to the end of the string. * @return npos if arg string is not found in this string. */ - __device__ [[nodiscard]] inline size_type rfind(string_view const& str, + [[nodiscard]] __device__ inline size_type rfind(string_view const& str, size_type pos = 0, size_type count = -1) const; /** @@ -294,7 +294,7 @@ class string_view { * Specify -1 to indicate to the end of the string. * @return npos if arg string is not found in this string. */ - __device__ [[nodiscard]] inline size_type rfind(char_utf8 character, + [[nodiscard]] __device__ inline size_type rfind(char_utf8 character, size_type pos = 0, size_type count = -1) const; @@ -306,7 +306,7 @@ class string_view { * @param length Number of characters from start to include in the sub-string. * @return New instance pointing to a subset of the characters within this instance. */ - __device__ [[nodiscard]] inline string_view substr(size_type start, size_type length) const; + [[nodiscard]] __device__ inline string_view substr(size_type start, size_type length) const; /** * @brief Return minimum value associated with the string type @@ -386,7 +386,7 @@ class string_view { * @param bytepos Byte position from start of _data. * @return The character position for the specified byte. */ - __device__ [[nodiscard]] inline size_type character_offset(size_type bytepos) const; + [[nodiscard]] __device__ inline size_type character_offset(size_type bytepos) const; /** * @brief Common internal implementation for string_view::find and string_view::rfind. 
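The `__device__ [[nodiscard]]` to `[[nodiscard]] __device__` reorderings above all apply one C++ rule: a standard attribute that appertains to a declaration must prefix it, ahead of any decl-specifiers, including CUDA's execution-space specifiers; clang-based tooling such as clang-tidy diagnoses the attribute when it trails `__device__`. A minimal sketch of the rule, using a hypothetical `widget` type that is not part of this diff:

```cpp
// Hypothetical illustration of attribute ordering; compiles under nvcc.
struct widget {
  int v{};

  // Portable: the standard attribute comes first, then the
  // execution-space specifier, then the rest of the declaration.
  [[nodiscard]] __device__ int value() const noexcept { return v; }

  // The pre-change spelling `__device__ [[nodiscard]] int value() ...`
  // places the attribute after a decl-specifier, which clang-based
  // tools reject with "an attribute list cannot appear here".
};
```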
diff --git a/cpp/include/cudf/utilities/memory_resource.hpp b/cpp/include/cudf/utilities/memory_resource.hpp index b562574fd79..eaba466557b 100644 --- a/cpp/include/cudf/utilities/memory_resource.hpp +++ b/cpp/include/cudf/utilities/memory_resource.hpp @@ -16,8 +16,6 @@ #pragma once -#include - #include #include #include diff --git a/cpp/src/aggregation/aggregation.cu b/cpp/src/aggregation/aggregation.cu index d915c85bf85..3a6ff36c424 100644 --- a/cpp/src/aggregation/aggregation.cu +++ b/cpp/src/aggregation/aggregation.cu @@ -17,15 +17,14 @@ #include #include #include +#include #include -#include - namespace cudf { namespace detail { void initialize_with_identity(mutable_table_view& table, - std::vector const& aggs, + host_span aggs, rmm::cuda_stream_view stream) { // TODO: Initialize all the columns in a single kernel instead of invoking one diff --git a/cpp/src/bitmask/is_element_valid.cpp b/cpp/src/bitmask/is_element_valid.cpp index 7eb80c4249e..d36dacca739 100644 --- a/cpp/src/bitmask/is_element_valid.cpp +++ b/cpp/src/bitmask/is_element_valid.cpp @@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -30,15 +31,14 @@ bool is_element_valid_sync(column_view const& col_view, CUDF_EXPECTS(element_index >= 0 and element_index < col_view.size(), "invalid index."); if (!col_view.nullable()) { return true; } - bitmask_type word = 0; // null_mask() returns device ptr to bitmask without offset size_type const index = element_index + col_view.offset(); - CUDF_CUDA_TRY(cudaMemcpyAsync(&word, - col_view.null_mask() + word_index(index), - sizeof(bitmask_type), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); + + auto const word = + cudf::detail::make_host_vector_sync( + device_span{col_view.null_mask() + word_index(index), 1}, stream) + .front(); + return static_cast(word & (bitmask_type{1} << intra_word_index(index))); } diff --git a/cpp/src/column/column_device_view.cu b/cpp/src/column/column_device_view.cu index fc244521617..9dc39f01ab3 100644 --- a/cpp/src/column/column_device_view.cu +++ b/cpp/src/column/column_device_view.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -60,13 +61,12 @@ create_device_view_from_view(ColumnView const& source, rmm::cuda_stream_view str // A buffer of CPU memory is allocated to hold the ColumnDeviceView // objects. Once filled, the CPU memory is copied to device memory // and then set into the d_children member pointer. - std::vector staging_buffer(descendant_storage_bytes); + auto staging_buffer = detail::make_host_vector(descendant_storage_bytes, stream); // Each ColumnDeviceView instance may have child objects that // require setting some internal device pointers before being copied // from CPU to device. 
- rmm::device_buffer* const descendant_storage = - new rmm::device_buffer(descendant_storage_bytes, stream); + auto const descendant_storage = new rmm::device_uvector(descendant_storage_bytes, stream); auto deleter = [descendant_storage](ColumnDeviceView* v) { v->destroy(); @@ -77,13 +77,7 @@ create_device_view_from_view(ColumnView const& source, rmm::cuda_stream_view str new ColumnDeviceView(source, staging_buffer.data(), descendant_storage->data()), deleter}; // copy the CPU memory with all the children into device memory - CUDF_CUDA_TRY(cudaMemcpyAsync(descendant_storage->data(), - staging_buffer.data(), - descendant_storage->size(), - cudaMemcpyDefault, - stream.value())); - - stream.synchronize(); + detail::cuda_memcpy(*descendant_storage, staging_buffer, stream); return result; } diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index e3ed5b55415..3413f75357b 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -998,7 +998,8 @@ struct packed_split_indices_and_src_buf_info { src_buf_info_size( cudf::util::round_up_safe(num_src_bufs * sizeof(src_buf_info), split_align)), // host-side - h_indices_and_source_info(indices_size + src_buf_info_size), + h_indices_and_source_info{ + detail::make_host_vector(indices_size + src_buf_info_size, stream)}, h_indices{reinterpret_cast(h_indices_and_source_info.data())}, h_src_buf_info{ reinterpret_cast(h_indices_and_source_info.data() + indices_size)} @@ -1025,15 +1026,18 @@ struct packed_split_indices_and_src_buf_info { reinterpret_cast(reinterpret_cast(d_indices_and_source_info.data()) + indices_size + src_buf_info_size); - CUDF_CUDA_TRY(cudaMemcpyAsync( - d_indices, h_indices, indices_size + src_buf_info_size, cudaMemcpyDefault, stream.value())); + detail::cuda_memcpy_async( + device_span{static_cast(d_indices_and_source_info.data()), + h_indices_and_source_info.size()}, + h_indices_and_source_info, + stream); } size_type const indices_size; std::size_t const src_buf_info_size; std::size_t offset_stack_size; - std::vector h_indices_and_source_info; + detail::host_vector h_indices_and_source_info; rmm::device_buffer d_indices_and_source_info; size_type* const h_indices; @@ -1055,27 +1059,26 @@ struct packed_partition_buf_size_and_dst_buf_info { buf_sizes_size{cudf::util::round_up_safe(num_partitions * sizeof(std::size_t), split_align)}, dst_buf_info_size{cudf::util::round_up_safe(num_bufs * sizeof(dst_buf_info), split_align)}, // host-side - h_buf_sizes_and_dst_info(buf_sizes_size + dst_buf_info_size), + h_buf_sizes_and_dst_info{ + detail::make_host_vector(buf_sizes_size + dst_buf_info_size, stream)}, h_buf_sizes{reinterpret_cast(h_buf_sizes_and_dst_info.data())}, h_dst_buf_info{ - reinterpret_cast(h_buf_sizes_and_dst_info.data() + buf_sizes_size)}, + reinterpret_cast(h_buf_sizes_and_dst_info.data() + buf_sizes_size), + num_bufs, + h_buf_sizes_and_dst_info.get_allocator().is_device_accessible()}, // device-side - d_buf_sizes_and_dst_info(buf_sizes_size + dst_buf_info_size, stream, temp_mr), + d_buf_sizes_and_dst_info(h_buf_sizes_and_dst_info.size(), stream, temp_mr), d_buf_sizes{reinterpret_cast(d_buf_sizes_and_dst_info.data())}, // destination buffer info - d_dst_buf_info{reinterpret_cast( - static_cast(d_buf_sizes_and_dst_info.data()) + buf_sizes_size)} + d_dst_buf_info{ + reinterpret_cast(d_buf_sizes_and_dst_info.data() + buf_sizes_size), num_bufs} { } void copy_to_host() { // DtoH buf sizes and col info back to the host - CUDF_CUDA_TRY(cudaMemcpyAsync(h_buf_sizes, - 
d_buf_sizes, - buf_sizes_size + dst_buf_info_size, - cudaMemcpyDefault, - stream.value())); + detail::cuda_memcpy_async(h_buf_sizes_and_dst_info, d_buf_sizes_and_dst_info, stream); } rmm::cuda_stream_view const stream; @@ -1084,13 +1087,13 @@ struct packed_partition_buf_size_and_dst_buf_info { std::size_t const buf_sizes_size; std::size_t const dst_buf_info_size; - std::vector h_buf_sizes_and_dst_info; + detail::host_vector h_buf_sizes_and_dst_info; std::size_t* const h_buf_sizes; - dst_buf_info* const h_dst_buf_info; + host_span const h_dst_buf_info; - rmm::device_buffer d_buf_sizes_and_dst_info; + rmm::device_uvector d_buf_sizes_and_dst_info; std::size_t* const d_buf_sizes; - dst_buf_info* const d_dst_buf_info; + device_span const d_dst_buf_info; }; // Packed block of memory 3: @@ -1106,11 +1109,12 @@ struct packed_src_and_dst_pointers { src_bufs_size{cudf::util::round_up_safe(num_src_bufs * sizeof(uint8_t*), split_align)}, dst_bufs_size{cudf::util::round_up_safe(num_partitions * sizeof(uint8_t*), split_align)}, // host-side - h_src_and_dst_buffers(src_bufs_size + dst_bufs_size), + h_src_and_dst_buffers{ + detail::make_host_vector(src_bufs_size + dst_bufs_size, stream)}, h_src_bufs{reinterpret_cast(h_src_and_dst_buffers.data())}, h_dst_bufs{reinterpret_cast(h_src_and_dst_buffers.data() + src_bufs_size)}, // device-side - d_src_and_dst_buffers{rmm::device_buffer(src_bufs_size + dst_bufs_size, stream, temp_mr)}, + d_src_and_dst_buffers{h_src_and_dst_buffers.size(), stream, temp_mr}, d_src_bufs{reinterpret_cast(d_src_and_dst_buffers.data())}, d_dst_bufs{reinterpret_cast( reinterpret_cast(d_src_and_dst_buffers.data()) + src_bufs_size)} @@ -1121,18 +1125,18 @@ struct packed_src_and_dst_pointers { void copy_to_device() { - CUDF_CUDA_TRY(cudaMemcpyAsync(d_src_and_dst_buffers.data(), - h_src_and_dst_buffers.data(), - src_bufs_size + dst_bufs_size, - cudaMemcpyDefault, - stream.value())); + detail::cuda_memcpy_async( + device_span{static_cast(d_src_and_dst_buffers.data()), + d_src_and_dst_buffers.size()}, + h_src_and_dst_buffers, + stream); } rmm::cuda_stream_view const stream; std::size_t const src_bufs_size; std::size_t const dst_bufs_size; - std::vector h_src_and_dst_buffers; + detail::host_vector h_src_and_dst_buffers; uint8_t const** const h_src_bufs; uint8_t** const h_dst_bufs; @@ -1205,7 +1209,7 @@ std::unique_ptr compute_splits( std::make_unique( num_partitions, num_bufs, stream, temp_mr); - auto const d_dst_buf_info = partition_buf_size_and_dst_buf_info->d_dst_buf_info; + auto const d_dst_buf_info = partition_buf_size_and_dst_buf_info->d_dst_buf_info.begin(); auto const d_buf_sizes = partition_buf_size_and_dst_buf_info->d_buf_sizes; auto const split_indices_and_src_buf_info = packed_split_indices_and_src_buf_info( @@ -1518,26 +1522,19 @@ std::unique_ptr chunk_iteration_state::create( */ if (user_buffer_size != 0) { // copy the batch offsets back to host - std::vector h_offsets(num_batches + 1); - { - rmm::device_uvector offsets(h_offsets.size(), stream, temp_mr); + auto const h_offsets = [&] { + rmm::device_uvector offsets(num_batches + 1, stream, temp_mr); auto const batch_byte_size_iter = cudf::detail::make_counting_transform_iterator( 0, batch_byte_size_function{num_batches, d_batched_dst_buf_info.begin()}); - thrust::exclusive_scan(rmm::exec_policy(stream, temp_mr), + thrust::exclusive_scan(rmm::exec_policy_nosync(stream, temp_mr), batch_byte_size_iter, - batch_byte_size_iter + num_batches + 1, + batch_byte_size_iter + offsets.size(), offsets.begin()); - 
CUDF_CUDA_TRY(cudaMemcpyAsync(h_offsets.data(), - offsets.data(), - sizeof(std::size_t) * offsets.size(), - cudaMemcpyDefault, - stream.value())); - // the next part is working on the CPU, so we want to synchronize here - stream.synchronize(); - } + return detail::make_host_vector_sync(offsets, stream); + }(); std::vector num_batches_per_iteration; std::vector size_of_batches_per_iteration; @@ -1699,7 +1696,7 @@ void copy_data(int num_batches_to_copy, int starting_batch, uint8_t const** d_src_bufs, uint8_t** d_dst_bufs, - rmm::device_uvector& d_dst_buf_info, + device_span d_dst_buf_info, uint8_t* user_buffer, rmm::cuda_stream_view stream) { @@ -1833,15 +1830,9 @@ struct contiguous_split_state { keys + num_batches_total, values, thrust::make_discard_iterator(), - dst_valid_count_output_iterator{d_orig_dst_buf_info}); - - CUDF_CUDA_TRY(cudaMemcpyAsync(h_orig_dst_buf_info, - d_orig_dst_buf_info, - partition_buf_size_and_dst_buf_info->dst_buf_info_size, - cudaMemcpyDefault, - stream.value())); + dst_valid_count_output_iterator{d_orig_dst_buf_info.begin()}); - stream.synchronize(); + detail::cuda_memcpy(h_orig_dst_buf_info, d_orig_dst_buf_info, stream); // not necessary for the non-chunked case, but it makes it so further calls to has_next // return false, just in case @@ -1889,7 +1880,7 @@ struct contiguous_split_state { } auto& h_dst_buf_info = partition_buf_size_and_dst_buf_info->h_dst_buf_info; - auto cur_dst_buf_info = h_dst_buf_info; + auto cur_dst_buf_info = h_dst_buf_info.data(); detail::metadata_builder mb{input.num_columns()}; populate_metadata(input.begin(), input.end(), cur_dst_buf_info, mb); @@ -1927,7 +1918,7 @@ struct contiguous_split_state { // Second pass: uses `dst_buf_info` to break down the work into 1MB batches. chunk_iter_state = compute_batches(num_bufs, - partition_buf_size_and_dst_buf_info->d_dst_buf_info, + partition_buf_size_and_dst_buf_info->d_dst_buf_info.data(), partition_buf_size_and_dst_buf_info->h_buf_sizes, num_partitions, user_buffer_size, @@ -1963,7 +1954,7 @@ struct contiguous_split_state { auto& h_dst_buf_info = partition_buf_size_and_dst_buf_info->h_dst_buf_info; auto& h_dst_bufs = src_and_dst_pointers->h_dst_bufs; - auto cur_dst_buf_info = h_dst_buf_info; + auto cur_dst_buf_info = h_dst_buf_info.data(); detail::metadata_builder mb(input.num_columns()); for (std::size_t idx = 0; idx < num_partitions; idx++) { diff --git a/cpp/src/groupby/hash/compute_aggregations.cuh b/cpp/src/groupby/hash/compute_aggregations.cuh index e8b29a0e7a8..9c9a4c97bff 100644 --- a/cpp/src/groupby/hash/compute_aggregations.cuh +++ b/cpp/src/groupby/hash/compute_aggregations.cuh @@ -60,7 +60,7 @@ rmm::device_uvector compute_aggregations( rmm::cuda_stream_view stream) { // flatten the aggs to a table that can be operated on by aggregate_row - auto [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests); + auto [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests, stream); auto const d_agg_kinds = cudf::detail::make_device_uvector_async( agg_kinds, stream, rmm::mr::get_current_device_resource()); diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.cu b/cpp/src/groupby/hash/compute_global_memory_aggs.cu index 6025686953e..d2830f7d905 100644 --- a/cpp/src/groupby/hash/compute_global_memory_aggs.cu +++ b/cpp/src/groupby/hash/compute_global_memory_aggs.cu @@ -24,7 +24,7 @@ template rmm::device_uvector compute_global_memory_aggs const& agg_kinds, + host_span agg_kinds, global_set_t& global_set, std::vector>& aggregations, 
cudf::detail::result_cache* sparse_results, diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.cuh b/cpp/src/groupby/hash/compute_global_memory_aggs.cuh index 00db149c6d9..671ee2ea31f 100644 --- a/cpp/src/groupby/hash/compute_global_memory_aggs.cuh +++ b/cpp/src/groupby/hash/compute_global_memory_aggs.cuh @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -44,7 +45,7 @@ rmm::device_uvector compute_global_memory_aggs( bitmask_type const* row_bitmask, cudf::table_view const& flattened_values, cudf::aggregation::Kind const* d_agg_kinds, - std::vector const& agg_kinds, + host_span agg_kinds, SetType& global_set, std::vector>& aggregations, cudf::detail::result_cache* sparse_results, diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.hpp b/cpp/src/groupby/hash/compute_global_memory_aggs.hpp index 0777b9ffd93..437823a3fea 100644 --- a/cpp/src/groupby/hash/compute_global_memory_aggs.hpp +++ b/cpp/src/groupby/hash/compute_global_memory_aggs.hpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -34,7 +35,7 @@ rmm::device_uvector compute_global_memory_aggs( bitmask_type const* row_bitmask, cudf::table_view const& flattened_values, cudf::aggregation::Kind const* d_agg_kinds, - std::vector const& agg_kinds, + host_span agg_kinds, SetType& global_set, std::vector>& aggregations, cudf::detail::result_cache* sparse_results, diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu b/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu index 209e2b7f20a..7cb3f8f190b 100644 --- a/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu +++ b/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu @@ -24,7 +24,7 @@ template rmm::device_uvector compute_global_memory_aggs const& agg_kinds, + host_span agg_kinds, nullable_global_set_t& global_set, std::vector>& aggregations, cudf::detail::result_cache* sparse_results, diff --git a/cpp/src/groupby/hash/create_sparse_results_table.cu b/cpp/src/groupby/hash/create_sparse_results_table.cu index bc32e306b3f..a835736235c 100644 --- a/cpp/src/groupby/hash/create_sparse_results_table.cu +++ b/cpp/src/groupby/hash/create_sparse_results_table.cu @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -48,7 +49,7 @@ void extract_populated_keys(SetType const& key_set, template cudf::table create_sparse_results_table(cudf::table_view const& flattened_values, cudf::aggregation::Kind const* d_agg_kinds, - std::vector agg_kinds, + host_span agg_kinds, bool direct_aggregations, GlobalSetType const& global_set, rmm::device_uvector& populated_keys, @@ -107,7 +108,7 @@ template void extract_populated_keys( template cudf::table create_sparse_results_table( cudf::table_view const& flattened_values, cudf::aggregation::Kind const* d_agg_kinds, - std::vector agg_kinds, + host_span agg_kinds, bool direct_aggregations, global_set_t const& global_set, rmm::device_uvector& populated_keys, @@ -116,7 +117,7 @@ template cudf::table create_sparse_results_table( template cudf::table create_sparse_results_table( cudf::table_view const& flattened_values, cudf::aggregation::Kind const* d_agg_kinds, - std::vector agg_kinds, + host_span agg_kinds, bool direct_aggregations, nullable_global_set_t const& global_set, rmm::device_uvector& populated_keys, diff --git a/cpp/src/groupby/hash/create_sparse_results_table.hpp b/cpp/src/groupby/hash/create_sparse_results_table.hpp index 8155ce852e0..4e2fa81bdb7 100644 --- a/cpp/src/groupby/hash/create_sparse_results_table.hpp +++ 
b/cpp/src/groupby/hash/create_sparse_results_table.hpp @@ -20,12 +20,11 @@ #include #include #include +#include #include #include -#include - namespace cudf::groupby::detail::hash { /** * @brief Computes and returns a device vector containing all populated keys in @@ -47,7 +46,7 @@ void extract_populated_keys(SetType const& key_set, template cudf::table create_sparse_results_table(cudf::table_view const& flattened_values, cudf::aggregation::Kind const* d_agg_kinds, - std::vector agg_kinds, + host_span agg_kinds, bool direct_aggregations, GlobalSetType const& global_set, rmm::device_uvector& populated_keys, diff --git a/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp b/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp index b2048a9fbb8..a533f7a6448 100644 --- a/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp +++ b/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -102,12 +103,15 @@ class groupby_simple_aggregations_collector final }; // flatten aggs to filter in single pass aggs -std::tuple, std::vector>> -flatten_single_pass_aggs(host_span requests) +std::tuple, + std::vector>> +flatten_single_pass_aggs(host_span requests, + rmm::cuda_stream_view stream) { std::vector columns; std::vector> aggs; - std::vector agg_kinds; + auto agg_kinds = cudf::detail::make_empty_host_vector(requests.size(), stream); for (auto const& request : requests) { auto const& agg_v = request.aggregations; diff --git a/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp b/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp index dfad51f27d4..e3c17ca972c 100644 --- a/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp +++ b/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp @@ -26,7 +26,10 @@ namespace cudf::groupby::detail::hash { // flatten aggs to filter in single pass aggs -std::tuple, std::vector>> -flatten_single_pass_aggs(host_span requests); +std::tuple, + std::vector>> +flatten_single_pass_aggs(host_span requests, + rmm::cuda_stream_view stream); } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu index 37a61c1a22c..b71e20938d6 100644 --- a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu +++ b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu @@ -170,7 +170,8 @@ void hash_compound_agg_finalizer::visit(cudf::detail::var_aggregation c cudf::detail::target_type(result_type, agg.kind), col.size(), mask_state::ALL_NULL, stream); auto var_result_view = mutable_column_device_view::create(var_result->mutable_view(), stream); mutable_table_view var_table_view{{var_result->mutable_view()}}; - cudf::detail::initialize_with_identity(var_table_view, {agg.kind}, stream); + cudf::detail::initialize_with_identity( + var_table_view, host_span(&agg.kind, 1), stream); thrust::for_each_n( rmm::exec_policy_nosync(stream), diff --git a/cpp/src/groupby/sort/group_scan_util.cuh b/cpp/src/groupby/sort/group_scan_util.cuh index 86835ea8a67..5082ad01327 100644 --- a/cpp/src/groupby/sort/group_scan_util.cuh +++ b/cpp/src/groupby/sort/group_scan_util.cuh @@ -107,7 +107,10 @@ struct group_scan_functor() if (values.is_empty()) { return result; } auto result_table = mutable_table_view({*result}); - cudf::detail::initialize_with_identity(result_table, {K}, stream); + // Need an address of the aggregation kind to pass to the span + auto const kind = K; + cudf::detail::initialize_with_identity( + result_table, host_span(&kind, 1), stream); auto result_view 
= mutable_column_device_view::create(result->mutable_view(), stream); auto values_view = column_device_view::create(values, stream); diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 7f0b5e07b09..e05353ee822 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -21,13 +21,13 @@ #include "csv_common.hpp" #include "csv_gpu.hpp" -#include "cudf/detail/utilities/cuda_memcpy.hpp" #include "io/comp/io_uncomp.hpp" #include "io/utilities/column_buffer.hpp" #include "io/utilities/hostdevice_vector.hpp" #include "io/utilities/parsing_utils.cuh" #include +#include #include #include #include diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 8e532b01788..6b9c19368dc 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -19,7 +19,6 @@ * @brief cuDF-IO ORC writer class implementation */ -#include "cudf/detail/utilities/cuda_memcpy.hpp" #include "io/comp/nvcomp_adapter.hpp" #include "io/orc/orc_gpu.hpp" #include "io/statistics/column_statistics.cuh" @@ -30,6 +29,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/reductions/minmax.cu b/cpp/src/reductions/minmax.cu index 98fd9f679c8..21d8c95e199 100644 --- a/cpp/src/reductions/minmax.cu +++ b/cpp/src/reductions/minmax.cu @@ -218,9 +218,8 @@ struct minmax_functor { auto dev_result = reduce(col, stream); // copy the minmax_pair to the host; does not copy the strings using OutputType = minmax_pair; - OutputType host_result; - CUDF_CUDA_TRY(cudaMemcpyAsync( - &host_result, dev_result.data(), sizeof(OutputType), cudaMemcpyDefault, stream.value())); + + auto const host_result = dev_result.value(stream); // strings are copied to create the scalars here return {std::make_unique(host_result.min_val, true, stream, mr), std::make_unique(host_result.max_val, true, stream, mr)}; @@ -236,10 +235,8 @@ struct minmax_functor { // compute minimum and maximum values auto dev_result = reduce(col, stream); // copy the minmax_pair to the host to call get_element - using OutputType = minmax_pair; - OutputType host_result; - CUDF_CUDA_TRY(cudaMemcpyAsync( - &host_result, dev_result.data(), sizeof(OutputType), cudaMemcpyDefault, stream.value())); + using OutputType = minmax_pair; + OutputType host_result = dev_result.value(stream); // get the keys for those indexes auto const keys = dictionary_column_view(col).keys(); return {detail::get_element(keys, static_cast(host_result.min_val), stream, mr), diff --git a/cpp/src/scalar/scalar.cpp b/cpp/src/scalar/scalar.cpp index 4ec2174a96f..4b0b08fe251 100644 --- a/cpp/src/scalar/scalar.cpp +++ b/cpp/src/scalar/scalar.cpp @@ -114,11 +114,10 @@ string_scalar::operator std::string() const { return this->to_string(cudf::get_d std::string string_scalar::to_string(rmm::cuda_stream_view stream) const { - std::string result; - result.resize(_data.size()); - CUDF_CUDA_TRY( - cudaMemcpyAsync(&result[0], _data.data(), _data.size(), cudaMemcpyDefault, stream.value())); - stream.synchronize(); + std::string result(size(), '\0'); + detail::cuda_memcpy(host_span{result.data(), result.size()}, + device_span{data(), _data.size()}, + stream); return result; } diff --git a/cpp/src/strings/regex/regex.cuh b/cpp/src/strings/regex/regex.cuh index 2df404048f7..d22fb04696c 100644 --- a/cpp/src/strings/regex/regex.cuh +++ b/cpp/src/strings/regex/regex.cuh @@ -186,7 +186,7 @@ class reprog_device { * Specify -1 to match any virtual positions past the end of the string. 
* @return If match found, returns character positions of the matches. */ - __device__ [[nodiscard]] inline match_result find(int32_t const thread_idx, + [[nodiscard]] __device__ inline match_result find(int32_t const thread_idx, string_view const d_str, string_view::const_iterator begin, cudf::size_type end = -1) const; @@ -205,7 +205,7 @@ class reprog_device { * @param group_id The specific group to return its matching position values. * @return If valid, returns the character position of the matched group in the given string, */ - __device__ [[nodiscard]] inline match_result extract(int32_t const thread_idx, + [[nodiscard]] __device__ inline match_result extract(int32_t const thread_idx, string_view const d_str, string_view::const_iterator begin, cudf::size_type end, @@ -225,17 +225,17 @@ class reprog_device { /** * @brief Returns the regex instruction object for a given id. */ - __device__ [[nodiscard]] inline reinst get_inst(int32_t id) const; + [[nodiscard]] __device__ inline reinst get_inst(int32_t id) const; /** * @brief Returns the regex class object for a given id. */ - __device__ [[nodiscard]] inline reclass_device get_class(int32_t id) const; + [[nodiscard]] __device__ inline reclass_device get_class(int32_t id) const; /** * @brief Executes the regex pattern on the given string. */ - __device__ [[nodiscard]] inline match_result regexec(string_view const d_str, + [[nodiscard]] __device__ inline match_result regexec(string_view const d_str, reljunk jnk, string_view::const_iterator begin, cudf::size_type end, @@ -244,7 +244,7 @@ class reprog_device { /** * @brief Utility wrapper to setup state memory structures for calling regexec */ - __device__ [[nodiscard]] inline match_result call_regexec( + [[nodiscard]] __device__ inline match_result call_regexec( int32_t const thread_idx, string_view const d_str, string_view::const_iterator begin, diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl index e34a1e12015..906f09e4d82 100644 --- a/cpp/src/strings/regex/regex.inl +++ b/cpp/src/strings/regex/regex.inl @@ -81,11 +81,11 @@ struct alignas(8) relist { return true; } - __device__ [[nodiscard]] __forceinline__ restate get_state(int16_t idx) const + [[nodiscard]] __device__ __forceinline__ restate get_state(int16_t idx) const { return restate{ranges[idx * stride], inst_ids[idx * stride]}; } - __device__ [[nodiscard]] __forceinline__ int16_t get_size() const { return size; } + [[nodiscard]] __device__ __forceinline__ int16_t get_size() const { return size; } private: int16_t size{}; @@ -101,7 +101,7 @@ struct alignas(8) relist { mask[pos >> 3] |= uc; } - __device__ [[nodiscard]] __forceinline__ bool readMask(int32_t pos) const + [[nodiscard]] __device__ __forceinline__ bool readMask(int32_t pos) const { u_char const uc = mask[pos >> 3]; return static_cast((uc >> (pos & 7)) & 1); diff --git a/cpp/src/strings/regex/regexec.cpp b/cpp/src/strings/regex/regexec.cpp index 3d11b641b3f..902e13fe75e 100644 --- a/cpp/src/strings/regex/regexec.cpp +++ b/cpp/src/strings/regex/regexec.cpp @@ -17,7 +17,9 @@ #include "strings/regex/regcomp.h" #include "strings/regex/regex.cuh" +#include #include +#include #include #include @@ -66,10 +68,11 @@ std::unique_ptr> reprog_devic cudf::util::round_up_safe(classes_size, sizeof(char32_t)); // allocate memory to store all the prog data in a flat contiguous buffer - std::vector h_buffer(memsize); // copy everything into here; - auto h_ptr = h_buffer.data(); // this is our running host ptr; - auto d_buffer = new rmm::device_buffer(memsize, 
stream); // output device memory; - auto d_ptr = reinterpret_cast(d_buffer->data()); // running device pointer + auto h_buffer = + cudf::detail::make_host_vector(memsize, stream); // copy everything into here; + auto h_ptr = h_buffer.data(); // this is our running host ptr; + auto d_buffer = new rmm::device_uvector(memsize, stream); // output device memory; + auto d_ptr = d_buffer->data(); // running device pointer // create our device object; this is managed separately and returned to the caller auto* d_prog = new reprog_device(h_prog); @@ -113,8 +116,7 @@ std::unique_ptr> reprog_devic d_prog->_prog_size = memsize + sizeof(reprog_device); // copy flat prog to device memory - CUDF_CUDA_TRY( - cudaMemcpyAsync(d_buffer->data(), h_buffer.data(), memsize, cudaMemcpyDefault, stream.value())); + cudf::detail::cuda_memcpy_async(*d_buffer, h_buffer, stream); // build deleter to cleanup device memory auto deleter = [d_buffer](reprog_device* t) { diff --git a/cpp/src/text/subword/load_hash_file.cu b/cpp/src/text/subword/load_hash_file.cu index b13ad0a7de8..ee51a426eac 100644 --- a/cpp/src/text/subword/load_hash_file.cu +++ b/cpp/src/text/subword/load_hash_file.cu @@ -19,6 +19,8 @@ #include #include +#include +#include #include #include #include @@ -198,8 +200,8 @@ std::unique_ptr load_vocabulary_file( std::getline(hash_file, line); result.num_bins = str_to_uint32(line, line_no++); - std::vector bin_coefficients(result.num_bins); - std::vector bin_offsets(result.num_bins); + auto bin_coefficients = cudf::detail::make_host_vector(result.num_bins, stream); + auto bin_offsets = cudf::detail::make_host_vector(result.num_bins, stream); for (int i = 0; i < result.num_bins; ++i) { std::getline(hash_file, line); @@ -216,7 +218,7 @@ std::unique_ptr load_vocabulary_file( std::getline(hash_file, line); uint64_t hash_table_length = str_to_uint64(line, line_no++); - std::vector table(hash_table_length); + auto table = cudf::detail::make_host_vector(hash_table_length, stream); std::generate(table.begin(), table.end(), [&hash_file, &line_no]() { std::string line; @@ -239,33 +241,32 @@ std::unique_ptr load_vocabulary_file( cudf::mask_state::UNALLOCATED, stream, mr); - CUDF_CUDA_TRY(cudaMemcpyAsync(result.table->mutable_view().data(), - table.data(), - table.size() * sizeof(uint64_t), - cudaMemcpyDefault, - stream.value())); + cudf::detail::cuda_memcpy_async( + cudf::device_span(result.table->mutable_view().data(), table.size()), + table, + stream); result.bin_coefficients = cudf::make_numeric_column(cudf::data_type{cudf::type_id::UINT64}, bin_coefficients.size(), cudf::mask_state::UNALLOCATED, stream, mr); - CUDF_CUDA_TRY(cudaMemcpyAsync(result.bin_coefficients->mutable_view().data(), - bin_coefficients.data(), - bin_coefficients.size() * sizeof(uint64_t), - cudaMemcpyDefault, - stream.value())); + cudf::detail::cuda_memcpy_async( + cudf::device_span(result.bin_coefficients->mutable_view().data(), + bin_coefficients.size()), + bin_coefficients, + stream); result.bin_offsets = cudf::make_numeric_column(cudf::data_type{cudf::type_id::UINT16}, bin_offsets.size(), cudf::mask_state::UNALLOCATED, stream, mr); - CUDF_CUDA_TRY(cudaMemcpyAsync(result.bin_offsets->mutable_view().data(), - bin_offsets.data(), - bin_offsets.size() * sizeof(uint16_t), - cudaMemcpyDefault, - stream.value())); + cudf::detail::cuda_memcpy_async( + cudf::device_span(result.bin_offsets->mutable_view().data(), + bin_offsets.size()), + bin_offsets, + stream); auto cp_metadata = detail::get_codepoint_metadata(stream); auto const cp_metadata_size = 
static_cast(cp_metadata.size());
diff --git a/cpp/tests/streams/replace_test.cpp b/cpp/tests/streams/replace_test.cpp
index 89f76237de6..e3fdc177b50 100644
--- a/cpp/tests/streams/replace_test.cpp
+++ b/cpp/tests/streams/replace_test.cpp
@@ -104,9 +104,9 @@ TEST_F(ReplaceTest, NormalizeNansAndZeros)

 TEST_F(ReplaceTest, NormalizeNansAndZerosMutable)
 {
-  auto nan = std::numeric_limits::quiet_NaN();
-  auto input_column = cudf::test::make_type_param_vector({-0.0, 0.0, -nan, nan, nan});
-  cudf::test::fixed_width_column_wrapper input(input_column.begin(), input_column.end());
-  cudf::mutable_column_view mutable_view = cudf::column(input, cudf::test::get_default_stream());
-  cudf::normalize_nans_and_zeros(mutable_view, cudf::test::get_default_stream());
+  auto nan = std::numeric_limits::quiet_NaN();
+  auto data = cudf::test::make_type_param_vector({-0.0, 0.0, -nan, nan, nan});
+  auto input = cudf::test::fixed_width_column_wrapper(data.begin(), data.end()).release();
+  auto view = input->mutable_view();
+  cudf::normalize_nans_and_zeros(view, cudf::test::get_default_stream());
 }
diff --git a/dependencies.yaml b/dependencies.yaml
index 44767f1e9d3..7a83efc6e3d 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -878,7 +878,7 @@ dependencies:
       - matrix: {dependencies: "oldest"}
         packages:
           - numpy==1.23.*
-          - pyarrow==14.0.0
+          - pyarrow==14.*
       - matrix:
         packages:
   - output_types: conda
@@ -903,7 +903,7 @@
       - matrix: {dependencies: "oldest"}
         packages:
           - numpy==1.24.*
-          - pyarrow==14.0.1
+          - pyarrow==14.*
       - matrix:
         packages:
  test_python_cudf_polars:
diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt
index 427ffcc8c12..410fd57691e 100644
--- a/python/cudf/cudf/_lib/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/CMakeLists.txt
@@ -12,9 +12,7 @@
 # the License.
 # =============================================================================

-set(cython_sources column.pyx copying.pyx groupby.pyx interop.pyx scalar.pyx stream_compaction.pyx
-    string_casting.pyx strings_udf.pyx types.pyx utils.pyx
-)
+set(cython_sources column.pyx groupby.pyx scalar.pyx strings_udf.pyx types.pyx utils.pyx)
 set(linked_libraries cudf::cudf)
 rapids_cython_create_modules(
@@ -24,8 +22,3 @@ rapids_cython_create_modules(
 )

 target_link_libraries(strings_udf PUBLIC cudf_strings_udf)
-target_include_directories(interop PUBLIC "$")
-
-include(${rapids-cmake-dir}/export/find_package_root.cmake)
-include(../../../../cpp/cmake/thirdparty/get_nanoarrow.cmake)
-target_link_libraries(interop PUBLIC nanoarrow)
diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py
index 26afdd62caf..6b5a7814e48 100644
--- a/python/cudf/cudf/_lib/__init__.py
+++ b/python/cudf/cudf/_lib/__init__.py
@@ -2,11 +2,7 @@
 import numpy as np

 from . import (
-    copying,
     groupby,
-    interop,
-    stream_compaction,
-    string_casting,
     strings_udf,
 )
diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx
deleted file mode 100644
index ef544dc89eb..00000000000
--- a/python/cudf/cudf/_lib/copying.pyx
+++ /dev/null
@@ -1,451 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
- -from libcpp cimport bool -import pylibcudf - -import cudf -from cudf.core.buffer import acquire_spill_lock, as_buffer -from cudf.core.abc import Serializable -from cudf._lib.column cimport Column - -from cudf._lib.scalar import as_device_scalar - -from cudf._lib.scalar cimport DeviceScalar - -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.utils cimport columns_from_pylibcudf_table, data_from_pylibcudf_table -import pylibcudf as plc -from pylibcudf.contiguous_split cimport PackedColumns as PlcPackedColumns - - -def _gather_map_is_valid( - gather_map: "cudf.core.column.ColumnBase", - nrows: int, - check_bounds: bool, - nullify: bool, -) -> bool: - """Returns true if gather map is valid. - - A gather map is valid if empty or all indices are within the range - ``[-nrows, nrows)``, except when ``nullify`` is specified. - """ - if not check_bounds or nullify or len(gather_map) == 0: - return True - gm_min, gm_max = gather_map.minmax() - return gm_min >= -nrows and gm_max < nrows - - -@acquire_spill_lock() -def copy_column(Column input_column): - """ - Deep copies a column - - Parameters - ---------- - input_columns : column to be copied - - Returns - ------- - Deep copied column - """ - return Column.from_pylibcudf( - input_column.to_pylibcudf(mode="read").copy() - ) - - -@acquire_spill_lock() -def _copy_range_in_place(Column input_column, - Column target_column, - size_type input_begin, - size_type input_end, - size_type target_begin): - pylibcudf.copying.copy_range( - input_column.to_pylibcudf(mode="write"), - target_column.to_pylibcudf(mode="write"), - input_begin, - input_end, - target_begin - ) - - -def _copy_range(Column input_column, - Column target_column, - size_type input_begin, - size_type input_end, - size_type target_begin): - return Column.from_pylibcudf( - pylibcudf.copying.copy_range( - input_column.to_pylibcudf(mode="read"), - target_column.to_pylibcudf(mode="read"), - input_begin, - input_end, - target_begin - ) - ) - - -@acquire_spill_lock() -def copy_range(Column source_column, - Column target_column, - size_type source_begin, - size_type source_end, - size_type target_begin, - size_type target_end, - bool inplace): - """ - Copy a contiguous range from a source to a target column - - Notes - ----- - Expects the source and target ranges to have been sanitised to be - in-range for the source and target column respectively. For - example via ``slice.indices``. - """ - - msg = "Source and target ranges must be same length" - assert source_end - source_begin == target_end - target_begin, msg - if target_end >= target_begin and inplace: - # FIXME: Are we allowed to do this when inplace=False? - return target_column - - if inplace: - _copy_range_in_place(source_column, target_column, - source_begin, source_end, target_begin) - else: - return _copy_range(source_column, target_column, - source_begin, source_end, target_begin) - - -@acquire_spill_lock() -def gather( - list columns, - Column gather_map, - bool nullify=False -): - tbl = pylibcudf.copying.gather( - pylibcudf.Table([col.to_pylibcudf(mode="read") for col in columns]), - gather_map.to_pylibcudf(mode="read"), - pylibcudf.copying.OutOfBoundsPolicy.NULLIFY if nullify - else pylibcudf.copying.OutOfBoundsPolicy.DONT_CHECK - ) - return columns_from_pylibcudf_table(tbl) - - -@acquire_spill_lock() -def scatter(list sources, Column scatter_map, list target_columns, - bool bounds_check=True): - """ - Scattering source into target as per the scatter map. - `source` can be a list of scalars, or a list of columns. 
The number of - items in `sources` must equal the number of `target_columns` to scatter. - """ - # TODO: Only single column scatter is used, we should explore multi-column - # scatter for frames for performance increase. - - if len(sources) != len(target_columns): - raise ValueError("Mismatched number of source and target columns.") - - if len(sources) == 0: - return [] - - if bounds_check: - n_rows = len(target_columns[0]) - if not ( - (scatter_map >= -n_rows).all() - and (scatter_map < n_rows).all() - ): - raise IndexError( - f"index out of bounds for column of size {n_rows}" - ) - - tbl = pylibcudf.copying.scatter( - pylibcudf.Table([col.to_pylibcudf(mode="read") for col in sources]) - if isinstance(sources[0], Column) - else [( as_device_scalar(slr)).c_value for slr in sources], - scatter_map.to_pylibcudf(mode="read"), - pylibcudf.Table([col.to_pylibcudf(mode="read") for col in target_columns]), - ) - - return columns_from_pylibcudf_table(tbl) - - -@acquire_spill_lock() -def column_empty_like(Column input_column): - return Column.from_pylibcudf( - pylibcudf.copying.empty_like( - input_column.to_pylibcudf(mode="read") - ) - ) - - -@acquire_spill_lock() -def column_allocate_like(Column input_column, size=None): - return Column.from_pylibcudf( - pylibcudf.copying.allocate_like( - input_column.to_pylibcudf(mode="read"), - size, - ) - ) - - -@acquire_spill_lock() -def columns_empty_like(list input_columns): - return columns_from_pylibcudf_table( - pylibcudf.copying.empty_like( - pylibcudf.Table([col.to_pylibcudf(mode="read") for col in input_columns]) - ) - ) - - -@acquire_spill_lock() -def column_slice(Column input_column, object indices): - return [ - Column.from_pylibcudf(c) - for c in pylibcudf.copying.slice( - input_column.to_pylibcudf(mode="read"), - list(indices), - ) - ] - - -@acquire_spill_lock() -def columns_slice(list input_columns, object indices): - return [ - columns_from_pylibcudf_table(tbl) - for tbl in pylibcudf.copying.slice( - pylibcudf.Table([col.to_pylibcudf(mode="read") for col in input_columns]), - list(indices), - ) - ] - - -@acquire_spill_lock() -def column_split(Column input_column, object splits): - return [ - Column.from_pylibcudf(c) - for c in pylibcudf.copying.split( - input_column.to_pylibcudf(mode="read"), - list(splits), - ) - ] - - -@acquire_spill_lock() -def columns_split(list input_columns, object splits): - return [ - columns_from_pylibcudf_table(tbl) - for tbl in pylibcudf.copying.split( - pylibcudf.Table([col.to_pylibcudf(mode="read") for col in input_columns]), - list(splits), - ) - ] - - -@acquire_spill_lock() -def copy_if_else(object lhs, object rhs, Column boolean_mask): - return Column.from_pylibcudf( - pylibcudf.copying.copy_if_else( - lhs.to_pylibcudf(mode="read") if isinstance(lhs, Column) - else ( as_device_scalar(lhs)).c_value, - rhs.to_pylibcudf(mode="read") if isinstance(rhs, Column) - else ( as_device_scalar(rhs)).c_value, - boolean_mask.to_pylibcudf(mode="read"), - ) - ) - - -@acquire_spill_lock() -def boolean_mask_scatter(list input_, list target_columns, - Column boolean_mask): - """Copy the target columns, replacing masked rows with input data. - - The ``input_`` data can be a list of columns or as a list of scalars. - A list of input columns will be used to replace corresponding rows in the - target columns for which the boolean mask is ``True``. For the nth ``True`` - in the boolean mask, the nth row in ``input_`` is used to replace. 
A list - of input scalars will replace all rows in the target columns for which the - boolean mask is ``True``. - """ - if len(input_) != len(target_columns): - raise ValueError("Mismatched number of input and target columns.") - - if len(input_) == 0: - return [] - - tbl = pylibcudf.copying.boolean_mask_scatter( - pylibcudf.Table([col.to_pylibcudf(mode="read") for col in input_]) - if isinstance(input_[0], Column) - else [( as_device_scalar(i)).c_value for i in input_], - pylibcudf.Table([col.to_pylibcudf(mode="read") for col in target_columns]), - boolean_mask.to_pylibcudf(mode="read"), - ) - - return columns_from_pylibcudf_table(tbl) - - -@acquire_spill_lock() -def shift(Column input, int offset, object fill_value=None): - cdef DeviceScalar fill - - if isinstance(fill_value, DeviceScalar): - fill = fill_value - else: - fill = as_device_scalar(fill_value, input.dtype) - - col = pylibcudf.copying.shift( - input.to_pylibcudf(mode="read"), - offset, - fill.c_value, - ) - return Column.from_pylibcudf(col) - - -@acquire_spill_lock() -def get_element(Column input_column, size_type index): - return DeviceScalar.from_pylibcudf( - pylibcudf.copying.get_element( - input_column.to_pylibcudf(mode="read"), - index, - ), - dtype=input_column.dtype, - ) - - -class PackedColumns(Serializable): - """ - A packed representation of a Frame, with all columns residing - in a single GPU memory buffer. - """ - - def __init__( - self, - PlcPackedColumns data, - object column_names = None, - object index_names = None, - object column_dtypes = None - ): - self._metadata, self._gpu_data = data.release() - self.column_names=column_names - self.index_names=index_names - self.column_dtypes=column_dtypes - - def __reduce__(self): - return self.deserialize, self.serialize() - - @property - def __cuda_array_interface__(self): - return self._gpu_data.__cuda_array_interface__ - - def serialize(self): - header = {} - frames = [] - gpu_data = as_buffer( - data = self._gpu_data.obj.ptr, - size = self._gpu_data.obj.size, - owner=self, - exposed=True - ) - data_header, data_frames = gpu_data.serialize() - header["data"] = data_header - frames.extend(data_frames) - - header["column-names"] = self.column_names - header["index-names"] = self.index_names - header["metadata"] = self._metadata.tobytes() - for name, dtype in self.column_dtypes.items(): - dtype_header, dtype_frames = dtype.device_serialize() - self.column_dtypes[name] = ( - dtype_header, - (len(frames), len(frames) + len(dtype_frames)), - ) - frames.extend(dtype_frames) - header["column-dtypes"] = self.column_dtypes - return header, frames - - @classmethod - def deserialize(cls, header, frames): - column_dtypes = {} - for name, dtype in header["column-dtypes"].items(): - dtype_header, (start, stop) = dtype - column_dtypes[name] = Serializable.device_deserialize( - dtype_header, frames[start:stop] - ) - return cls( - plc.contiguous_split.pack( - plc.contiguous_split.unpack_from_memoryviews( - memoryview(header["metadata"]), - plc.gpumemoryview(frames[0]), - ) - ), - header["column-names"], - header["index-names"], - column_dtypes, - ) - - @classmethod - def from_py_table(cls, input_table, keep_index=True): - if keep_index and ( - not isinstance(input_table.index, cudf.RangeIndex) - or input_table.index.start != 0 - or input_table.index.stop != len(input_table) - or input_table.index.step != 1 - ): - columns = input_table._index._columns + input_table._columns - index_names = input_table._index_names - else: - columns = input_table._columns - index_names = None - - 
column_names = input_table._column_names - column_dtypes = {} - for name, col in input_table._column_labels_and_values: - if isinstance( - col.dtype, - (cudf.core.dtypes._BaseDtype, cudf.core.dtypes.CategoricalDtype) - ): - column_dtypes[name] = col.dtype - - return cls( - plc.contiguous_split.pack( - plc.Table( - [ - col.to_pylibcudf(mode="read") for col in columns - ] - ) - ), - column_names, - index_names, - column_dtypes, - ) - - def unpack(self): - output_table = cudf.DataFrame._from_data(*data_from_pylibcudf_table( - plc.contiguous_split.unpack_from_memoryviews( - self._metadata, - self._gpu_data - ), - self.column_names, - self.index_names - )) - for name, dtype in self.column_dtypes.items(): - output_table._data[name] = ( - output_table._data[name]._with_type_metadata(dtype) - ) - - return output_table - - -def pack(input_table, keep_index=True): - """ - Pack the columns of a cudf Frame into a single GPU memory buffer. - """ - return PackedColumns.from_py_table(input_table, keep_index) - - -def unpack(packed): - """ - Unpack the results of packing a cudf Frame returning a new - cudf Frame in the process. - """ - return packed.unpack() diff --git a/python/cudf/cudf/_lib/interop.pyx b/python/cudf/cudf/_lib/interop.pyx deleted file mode 100644 index 1c9d3a01b80..00000000000 --- a/python/cudf/cudf/_lib/interop.pyx +++ /dev/null @@ -1,111 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -import pylibcudf - -from cudf._lib.utils cimport columns_from_pylibcudf_table - -from cudf.core.buffer import acquire_spill_lock -from cudf.core.dtypes import ListDtype, StructDtype - - -def from_dlpack(object dlpack_capsule): - """ - Converts a DLPack Tensor PyCapsule into a list of columns. - - DLPack Tensor PyCapsule is expected to have the name "dltensor". - """ - return columns_from_pylibcudf_table( - pylibcudf.interop.from_dlpack(dlpack_capsule) - ) - - -def to_dlpack(list source_columns): - """ - Converts a list of columns into a DLPack Tensor PyCapsule. - - DLPack Tensor PyCapsule will have the name "dltensor". - """ - return pylibcudf.interop.to_dlpack( - pylibcudf.Table( - [col.to_pylibcudf(mode="read") for col in source_columns] - ) - ) - - -def gather_metadata(object cols_dtypes): - """ - Generates a ColumnMetadata vector for each column. - - Parameters - ---------- - cols_dtypes : iterable - An iterable of ``(column_name, dtype)`` pairs. 
- """ - cpp_metadata = [] - if cols_dtypes is not None: - for idx, (col_name, col_dtype) in enumerate(cols_dtypes): - cpp_metadata.append(pylibcudf.interop.ColumnMetadata(col_name)) - if isinstance(col_dtype, (ListDtype, StructDtype)): - _set_col_children_metadata(col_dtype, cpp_metadata[idx]) - else: - raise TypeError( - "An iterable of (column_name, dtype) pairs is required to " - "construct column_metadata" - ) - return cpp_metadata - - -def _set_col_children_metadata(dtype, col_meta): - if isinstance(dtype, StructDtype): - for name, value in dtype.fields.items(): - element_metadata = pylibcudf.interop.ColumnMetadata(name) - _set_col_children_metadata(value, element_metadata) - col_meta.children_meta.append(element_metadata) - elif isinstance(dtype, ListDtype): - # Offsets - child 0 - col_meta.children_meta.append(pylibcudf.interop.ColumnMetadata()) - - # Element column - child 1 - element_metadata = pylibcudf.interop.ColumnMetadata() - _set_col_children_metadata(dtype.element_type, element_metadata) - col_meta.children_meta.append(element_metadata) - else: - col_meta.children_meta.append(pylibcudf.interop.ColumnMetadata()) - - -@acquire_spill_lock() -def to_arrow(list source_columns, object column_dtypes): - """Convert a list of columns from - cudf Frame to a PyArrow Table. - - Parameters - ---------- - source_columns : a list of columns to convert - column_dtypes : Iterable of ``(column_name, column_dtype)`` pairs - - Returns - ------- - pyarrow table - """ - cpp_metadata = gather_metadata(column_dtypes) - return pylibcudf.interop.to_arrow( - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in source_columns]), - cpp_metadata, - ) - - -@acquire_spill_lock() -def from_arrow(object input_table): - """Convert from PyArrow Table to a list of columns. - - Parameters - ---------- - input_table : PyArrow table - - Returns - ------- - A list of columns to construct Frame object - """ - return columns_from_pylibcudf_table( - pylibcudf.interop.from_arrow(input_table) - ) diff --git a/python/cudf/cudf/_lib/scalar.pxd b/python/cudf/cudf/_lib/scalar.pxd index 0f9820ed1db..a3a8a14e70f 100644 --- a/python/cudf/cudf/_lib/scalar.pxd +++ b/python/cudf/cudf/_lib/scalar.pxd @@ -17,9 +17,6 @@ cdef class DeviceScalar: @staticmethod cdef DeviceScalar from_unique_ptr(unique_ptr[scalar] ptr, dtype=*) - @staticmethod - cdef DeviceScalar from_pylibcudf(pscalar, dtype=*) - cdef void _set_dtype(self, dtype=*) cpdef bool is_valid(DeviceScalar s) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 56712402919..3d3bdd730a8 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -218,7 +218,7 @@ cdef class DeviceScalar: return s @staticmethod - cdef DeviceScalar from_pylibcudf(pscalar, dtype=None): + def from_pylibcudf(pscalar, dtype=None): cdef DeviceScalar s = DeviceScalar.__new__(DeviceScalar) s.c_value = pscalar s._set_dtype(dtype) diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx deleted file mode 100644 index 1b8831940e3..00000000000 --- a/python/cudf/cudf/_lib/stream_compaction.pyx +++ /dev/null @@ -1,181 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -from cudf.core.buffer import acquire_spill_lock - -from libcpp cimport bool - -from cudf._lib.column cimport Column -from cudf._lib.utils cimport columns_from_pylibcudf_table - -import pylibcudf - - -@acquire_spill_lock() -def drop_nulls(list columns, how="any", keys=None, thresh=None): - """ - Drops null rows from cols depending on key columns. - - Parameters - ---------- - columns : list of columns - how : "any" or "all". If thresh is None, drops rows of cols that have any - nulls or all nulls (respectively) in subset (default: "any") - keys : List of column indices. If set, then these columns are checked for - nulls rather than all of columns (optional) - thresh : Minimum number of non-nulls required to keep a row (optional) - - Returns - ------- - columns with null rows dropped - """ - if how not in {"any", "all"}: - raise ValueError("how must be 'any' or 'all'") - - keys = list(keys if keys is not None else range(len(columns))) - - # Note: If how == "all" and thresh is specified this prioritizes thresh - if thresh is not None: - keep_threshold = thresh - elif how == "all": - keep_threshold = 1 - else: - keep_threshold = len(keys) - - return columns_from_pylibcudf_table( - pylibcudf.stream_compaction.drop_nulls( - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in columns]), - keys, - keep_threshold, - ) - ) - - -@acquire_spill_lock() -def apply_boolean_mask(list columns, Column boolean_mask): - """ - Drops the rows which correspond to False in boolean_mask. - - Parameters - ---------- - columns : list of columns whose rows are dropped as per boolean_mask - boolean_mask : a boolean column of same size as source_table - - Returns - ------- - columns obtained from applying mask - """ - return columns_from_pylibcudf_table( - pylibcudf.stream_compaction.apply_boolean_mask( - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in columns]), - boolean_mask.to_pylibcudf(mode="read"), - ) - ) - - -_keep_options = { - "first": pylibcudf.stream_compaction.DuplicateKeepOption.KEEP_FIRST, - "last": pylibcudf.stream_compaction.DuplicateKeepOption.KEEP_LAST, - False: pylibcudf.stream_compaction.DuplicateKeepOption.KEEP_NONE, -} - - -@acquire_spill_lock() -def drop_duplicates(list columns, - object keys=None, - object keep='first', - bool nulls_are_equal=True): - """ - Drops rows in source_table as per duplicate rows in keys. - - Parameters - ---------- - columns : List of columns - keys : List of column indices. If set, then these columns are checked for - duplicates rather than all of columns (optional) - keep : keep 'first' or 'last' or none of the duplicate rows - nulls_are_equal : if True, nulls are treated equal else not. - - Returns - ------- - columns with duplicate dropped - """ - if (keep_option := _keep_options.get(keep)) is None: - raise ValueError('keep must be either "first", "last" or False') - - return columns_from_pylibcudf_table( - pylibcudf.stream_compaction.stable_distinct( - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in columns]), - list(keys if keys is not None else range(len(columns))), - keep_option, - pylibcudf.types.NullEquality.EQUAL - if nulls_are_equal else pylibcudf.types.NullEquality.UNEQUAL, - pylibcudf.types.NanEquality.ALL_EQUAL, - ) - ) - - -@acquire_spill_lock() -def distinct_indices( - list columns, - object keep="first", - bool nulls_equal=True, - bool nans_equal=True, -): - """ - Return indices of the distinct rows in a table. 
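# Illustration only, not part of the patch: how drop_nulls above folds
# (how, thresh) into libcudf's single keep_threshold argument -- the minimum
# number of non-null key values a row needs in order to survive.
def _keep_threshold(num_keys, how="any", thresh=None):
    if thresh is not None:  # thresh always takes priority, as noted above
        return thresh
    if how == "all":        # drop a row only when every key is null
        return 1
    return num_keys         # "any": drop a row when any key is null

assert _keep_threshold(3, how="any") == 3
assert _keep_threshold(3, how="all") == 1
assert _keep_threshold(3, how="all", thresh=2) == 2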
- - Parameters - ---------- - columns : list of columns to check for duplicates - keep : treat "first", "last", or (False) none of any duplicate - rows as distinct - nulls_equal : Should nulls compare equal - nans_equal: Should nans compare equal - - Returns - ------- - Column of indices - - See Also - -------- - drop_duplicates - """ - if (keep_option := _keep_options.get(keep)) is None: - raise ValueError('keep must be either "first", "last" or False') - - return Column.from_pylibcudf( - pylibcudf.stream_compaction.distinct_indices( - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in columns]), - keep_option, - pylibcudf.types.NullEquality.EQUAL - if nulls_equal else pylibcudf.types.NullEquality.UNEQUAL, - pylibcudf.types.NanEquality.ALL_EQUAL - if nans_equal else pylibcudf.types.NanEquality.UNEQUAL, - ) - ) - - -@acquire_spill_lock() -def distinct_count(Column source_column, ignore_nulls=True, nan_as_null=False): - """ - Finds number of unique rows in `source_column` - - Parameters - ---------- - source_column : source table checked for unique rows - ignore_nulls : If True nulls are ignored, - else counted as one more distinct value - nan_as_null : If True, NAN is considered NULL, - else counted as one more distinct value - - Returns - ------- - Count of number of unique rows in `source_column` - """ - return pylibcudf.stream_compaction.distinct_count( - source_column.to_pylibcudf(mode="read"), - pylibcudf.types.NullPolicy.EXCLUDE - if ignore_nulls else pylibcudf.types.NullPolicy.INCLUDE, - pylibcudf.types.NanPolicy.NAN_IS_NULL - if nan_as_null else pylibcudf.types.NanPolicy.NAN_IS_VALID, - ) diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx deleted file mode 100644 index 06ee07d8e2b..00000000000 --- a/python/cudf/cudf/_lib/string_casting.pyx +++ /dev/null @@ -1,598 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
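# Illustration only, not part of the patch: the policy mapping used by
# distinct_count above, on an assumed small float column.
import pyarrow as pa
import pylibcudf

col = pylibcudf.interop.from_arrow(
    pa.table({"x": [1.0, 1.0, None, float("nan")]})
).columns()[0]
n = pylibcudf.stream_compaction.distinct_count(
    col,
    pylibcudf.types.NullPolicy.EXCLUDE,      # ignore_nulls=True
    pylibcudf.types.NanPolicy.NAN_IS_VALID,  # nan_as_null=False
)
# The null is excluded and NaN counts as its own value, so n == 2.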
- -from cudf._lib.column cimport Column - -import pylibcudf as plc -from pylibcudf.types cimport DataType - -from cudf._lib.scalar import as_device_scalar - -from cudf._lib.types cimport dtype_to_pylibcudf_type - - -def floating_to_string(Column input_col): - plc_column = plc.strings.convert.convert_floats.from_floats( - input_col.to_pylibcudf(mode="read"), - ) - return Column.from_pylibcudf(plc_column) - - -def string_to_floating(Column input_col, DataType out_type): - plc_column = plc.strings.convert.convert_floats.to_floats( - input_col.to_pylibcudf(mode="read"), - out_type - ) - return Column.from_pylibcudf(plc_column) - - -def dtos(Column input_col): - """ - Converting/Casting input column of type double to string column - - Parameters - ---------- - input_col : input column of type double - - Returns - ------- - A Column with double values cast to string - """ - - return floating_to_string(input_col) - - -def stod(Column input_col): - """ - Converting/Casting input column of type string to double - - Parameters - ---------- - input_col : input column of type string - - Returns - ------- - A Column with strings cast to double - """ - - return string_to_floating(input_col, plc.DataType(plc.TypeId.FLOAT64)) - - -def ftos(Column input_col): - """ - Converting/Casting input column of type float to string column - - Parameters - ---------- - input_col : input column of type double - - Returns - ------- - A Column with float values cast to string - """ - - return floating_to_string(input_col) - - -def stof(Column input_col): - """ - Converting/Casting input column of type string to float - - Parameters - ---------- - input_col : input column of type string - - Returns - ------- - A Column with strings cast to float - """ - - return string_to_floating(input_col, plc.DataType(plc.TypeId.FLOAT32)) - - -def integer_to_string(Column input_col): - plc_column = plc.strings.convert.convert_integers.from_integers( - input_col.to_pylibcudf(mode="read"), - ) - return Column.from_pylibcudf(plc_column) - - -def string_to_integer(Column input_col, DataType out_type): - plc_column = plc.strings.convert.convert_integers.to_integers( - input_col.to_pylibcudf(mode="read"), - out_type - ) - return Column.from_pylibcudf(plc_column) - - -def i8tos(Column input_col): - """ - Converting/Casting input column of type int8 to string column - - Parameters - ---------- - input_col : input column of type int8 - - Returns - ------- - A Column with int8 values cast to string - """ - - return integer_to_string(input_col) - - -def stoi8(Column input_col): - """ - Converting/Casting input column of type string to int8 - - Parameters - ---------- - input_col : input column of type string - - Returns - ------- - A Column with strings cast to int8 - """ - - return string_to_integer(input_col, plc.DataType(plc.TypeId.INT8)) - - -def i16tos(Column input_col): - """ - Converting/Casting input column of type int16 to string column - - Parameters - ---------- - input_col : input column of type int16 - - Returns - ------- - A Column with int16 values cast to string - """ - - return integer_to_string(input_col) - - -def stoi16(Column input_col): - """ - Converting/Casting input column of type string to int16 - - Parameters - ---------- - input_col : input column of type string - - Returns - ------- - A Column with strings cast to int16 - """ - - return string_to_integer(input_col, plc.DataType(plc.TypeId.INT16)) - - -def itos(Column input_col): - """ - Converting/Casting input column of type int32 to string column - - Parameters - 
---------- - input_col : input column of type int32 - - Returns - ------- - A Column with int32 values cast to string - """ - - return integer_to_string(input_col) - - -def stoi(Column input_col): - """ - Converting/Casting input column of type string to int32 - - Parameters - ---------- - input_col : input column of type string - - Returns - ------- - A Column with strings cast to int32 - """ - - return string_to_integer(input_col, plc.DataType(plc.TypeId.INT32)) - - -def ltos(Column input_col): - """ - Converting/Casting input column of type int64 to string column - - Parameters - ---------- - input_col : input column of type int64 - - Returns - ------- - A Column with int64 values cast to string - """ - - return integer_to_string(input_col) - - -def stol(Column input_col): - """ - Converting/Casting input column of type string to int64 - - Parameters - ---------- - input_col : input column of type string - - Returns - ------- - A Column with strings cast to int64 - """ - - return string_to_integer(input_col, plc.DataType(plc.TypeId.INT64)) - - -def ui8tos(Column input_col): - """ - Converting/Casting input column of type uint8 to string column - - Parameters - ---------- - input_col : input column of type uint8 - - Returns - ------- - A Column with uint8 values cast to string - """ - - return integer_to_string(input_col) - - -def stoui8(Column input_col): - """ - Converting/Casting input column of type string to uint8 - - Parameters - ---------- - input_col : input column of type string - - Returns - ------- - A Column with strings cast to uint8 - """ - - return string_to_integer(input_col, plc.DataType(plc.TypeId.UINT8)) - - -def ui16tos(Column input_col): - """ - Converting/Casting input column of type uint16 to string column - - Parameters - ---------- - input_col : input column of type uint16 - - Returns - ------- - A Column with uint16 values cast to string - """ - - return integer_to_string(input_col) - - -def stoui16(Column input_col): - """ - Converting/Casting input column of type string to uint16 - - Parameters - ---------- - input_col : input column of type string - - Returns - ------- - A Column with strings cast to uint16 - """ - - return string_to_integer(input_col, plc.DataType(plc.TypeId.UINT16)) - - -def uitos(Column input_col): - """ - Converting/Casting input column of type uint32 to string column - - Parameters - ---------- - input_col : input column of type uint32 - - Returns - ------- - A Column with uint32 values cast to string - """ - - return integer_to_string(input_col) - - -def stoui(Column input_col): - """ - Converting/Casting input column of type string to uint32 - - Parameters - ---------- - input_col : input column of type string - - Returns - ------- - A Column with strings cast to uint32 - """ - - return string_to_integer(input_col, plc.DataType(plc.TypeId.UINT32)) - - -def ultos(Column input_col): - """ - Converting/Casting input column of type uint64 to string column - - Parameters - ---------- - input_col : input column of type uint64 - - Returns - ------- - A Column with uint64 values cast to string - """ - - return integer_to_string(input_col) - - -def stoul(Column input_col): - """ - Converting/Casting input column of type string to uint64 - - Parameters - ---------- - input_col : input column of type string - - Returns - ------- - A Column with strings cast to uint64 - """ - - return string_to_integer(input_col, plc.DataType(plc.TypeId.UINT64)) - - -def to_booleans(Column input_col): - plc_column = 
plc.strings.convert.convert_booleans.to_booleans( - input_col.to_pylibcudf(mode="read"), - as_device_scalar("True").c_value, - ) - return Column.from_pylibcudf(plc_column) - - -def from_booleans(Column input_col): - plc_column = plc.strings.convert.convert_booleans.from_booleans( - input_col.to_pylibcudf(mode="read"), - as_device_scalar("True").c_value, - as_device_scalar("False").c_value, - ) - return Column.from_pylibcudf(plc_column) - - -def int2timestamp( - Column input_col, - str format, - Column names): - """ - Converting/Casting input date-time column to string - column with specified format - - Parameters - ---------- - input_col : input column of type timestamp in integer format - format : The string specifying output format - names : The string names to use for weekdays ("%a", "%A") and - months ("%b", "%B") - - Returns - ------- - A Column with date-time represented in string format - - """ - return Column.from_pylibcudf( - plc.strings.convert.convert_datetime.from_timestamps( - input_col.to_pylibcudf(mode="read"), - format, - names.to_pylibcudf(mode="read") - ) - ) - - -def timestamp2int(Column input_col, dtype, format): - """ - Converting/Casting input string column to date-time column with specified - timestamp_format - - Parameters - ---------- - input_col : input column of type string - - Returns - ------- - A Column with string represented in date-time format - - """ - dtype = dtype_to_pylibcudf_type(dtype) - return Column.from_pylibcudf( - plc.strings.convert.convert_datetime.to_timestamps( - input_col.to_pylibcudf(mode="read"), - dtype, - format - ) - ) - - -def istimestamp(Column input_col, str format): - """ - Check input string column matches the specified timestamp format - - Parameters - ---------- - input_col : input column of type string - - format : format string of timestamp specifiers - - Returns - ------- - A Column of boolean values identifying strings that matched the format. 
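# Illustration only, not part of the patch: the round trip implemented by the
# int2timestamp/timestamp2int helpers above, assuming a seconds-resolution
# column and a format that needs no weekday/month names (so names may be empty).
import pyarrow as pa
import pylibcudf as plc

ts = plc.interop.from_arrow(
    pa.table({"t": pa.array([0, 86400], type=pa.timestamp("s"))})
).columns()[0]
names = plc.interop.from_arrow(
    pa.table({"n": pa.array([], type=pa.string())})
).columns()[0]

strs = plc.strings.convert.convert_datetime.from_timestamps(ts, "%Y-%m-%d", names)
back = plc.strings.convert.convert_datetime.to_timestamps(
    strs, plc.DataType(plc.TypeId.TIMESTAMP_SECONDS), "%Y-%m-%d"
)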
- - """ - plc_column = plc.strings.convert.convert_datetime.is_timestamp( - input_col.to_pylibcudf(mode="read"), - format - ) - return Column.from_pylibcudf(plc_column) - - -def timedelta2int(Column input_col, dtype, format): - """ - Converting/Casting input string column to TimeDelta column with specified - format - - Parameters - ---------- - input_col : input column of type string - - Returns - ------- - A Column with string represented in TimeDelta format - - """ - dtype = dtype_to_pylibcudf_type(dtype) - return Column.from_pylibcudf( - plc.strings.convert.convert_durations.to_durations( - input_col.to_pylibcudf(mode="read"), - dtype, - format - ) - ) - - -def int2timedelta(Column input_col, str format): - """ - Converting/Casting input Timedelta column to string - column with specified format - - Parameters - ---------- - input_col : input column of type Timedelta in integer format - - Returns - ------- - A Column with Timedelta represented in string format - - """ - return Column.from_pylibcudf( - plc.strings.convert.convert_durations.from_durations( - input_col.to_pylibcudf(mode="read"), - format - ) - ) - - -def int2ip(Column input_col): - """ - Converting/Casting integer column to string column in ipv4 format - - Parameters - ---------- - input_col : input integer column - - Returns - ------- - A Column with integer represented in string ipv4 format - - """ - plc_column = plc.strings.convert.convert_ipv4.integers_to_ipv4( - input_col.to_pylibcudf(mode="read") - ) - return Column.from_pylibcudf(plc_column) - - -def ip2int(Column input_col): - """ - Converting string ipv4 column to integer column - - Parameters - ---------- - input_col : input string column - - Returns - ------- - A Column with ipv4 represented as integer - - """ - plc_column = plc.strings.convert.convert_ipv4.ipv4_to_integers( - input_col.to_pylibcudf(mode="read") - ) - return Column.from_pylibcudf(plc_column) - - -def is_ipv4(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that have strings in IPv4 format. This format is nnn.nnn.nnn.nnn - where nnn is integer digits in [0,255]. - """ - plc_column = plc.strings.convert.convert_ipv4.is_ipv4( - source_strings.to_pylibcudf(mode="read") - ) - return Column.from_pylibcudf(plc_column) - - -def htoi(Column input_col): - """ - Converting input column of type string having hex values - to integer of out_type - - Parameters - ---------- - input_col : input column of type string - - Returns - ------- - A Column of integers parsed from hexadecimal string values. - """ - plc_column = plc.strings.convert.convert_integers.hex_to_integers( - input_col.to_pylibcudf(mode="read"), - plc.DataType(plc.TypeId.INT64) - ) - return Column.from_pylibcudf(plc_column) - - -def is_hex(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that have hex characters. - """ - plc_column = plc.strings.convert.convert_integers.is_hex( - source_strings.to_pylibcudf(mode="read"), - ) - return Column.from_pylibcudf(plc_column) - - -def itoh(Column input_col): - """ - Converting input column of type integer to a string - column with hexadecimal character digits. - - Parameters - ---------- - input_col : input column of type integer - - Returns - ------- - A Column of strings with hexadecimal characters. 
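# Illustration only, not part of the patch: the htoi/itoh pair above on an
# assumed int64 column; the produced hex strings carry no "0x" prefix.
import pyarrow as pa
import pylibcudf as plc

ints = plc.interop.from_arrow(
    pa.table({"x": pa.array([255, 4096], type=pa.int64())})
).columns()[0]
hex_strs = plc.strings.convert.convert_integers.integers_to_hex(ints)
back = plc.strings.convert.convert_integers.hex_to_integers(
    hex_strs, plc.DataType(plc.TypeId.INT64)
)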
- """ - plc_column = plc.strings.convert.convert_integers.integers_to_hex( - input_col.to_pylibcudf(mode="read"), - ) - return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/utils.pxd b/python/cudf/cudf/_lib/utils.pxd index 6cc52d046af..900be721c9a 100644 --- a/python/cudf/cudf/_lib/utils.pxd +++ b/python/cudf/cudf/_lib/utils.pxd @@ -1,22 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.vector cimport vector - -from pylibcudf.libcudf.column.column cimport column_view -from pylibcudf.libcudf.table.table cimport table, table_view - - -cdef data_from_unique_ptr( - unique_ptr[table] c_tbl, column_names, index_names=*) cpdef data_from_pylibcudf_table(tbl, column_names, index_names=*) cpdef data_from_pylibcudf_io(tbl_with_meta, column_names = *, index_names = *) -cdef data_from_table_view( - table_view tv, object owner, object column_names, object index_names=*) -cdef table_view table_view_from_columns(columns) except * -cdef table_view table_view_from_table(tbl, ignore_index=*) except* -cdef columns_from_unique_ptr(unique_ptr[table] c_tbl) -cdef columns_from_table_view(table_view tv, object owners) cpdef columns_from_pylibcudf_table(tbl) cpdef _data_from_columns(columns, column_names, index_names=*) diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index ff032656f80..975c9eb741c 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -1,233 +1,7 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. - -import numpy as np -import pyarrow as pa - import cudf -from cython.operator cimport dereference -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move -from libcpp.vector cimport vector - -from pylibcudf.libcudf.column.column cimport column, column_view -from pylibcudf.libcudf.table.table cimport table -from pylibcudf.libcudf.table.table_view cimport table_view -from pylibcudf.libcudf.types cimport size_type - from cudf._lib.column cimport Column -from pylibcudf cimport Column as plc_Column -try: - import ujson as json -except ImportError: - import json - -from cudf.utils.dtypes import np_dtypes_to_pandas_dtypes, np_to_pa_dtype - -PARQUET_META_TYPE_MAP = { - str(cudf_dtype): str(pandas_dtype) - for cudf_dtype, pandas_dtype in np_dtypes_to_pandas_dtypes.items() -} - -cdef table_view table_view_from_columns(columns) except*: - """Create a cudf::table_view from an iterable of Columns.""" - cdef vector[column_view] column_views - - cdef Column col - for col in columns: - column_views.push_back(col.view()) - - return table_view(column_views) - - -cdef table_view table_view_from_table(tbl, ignore_index=False) except*: - """Create a cudf::table_view from a Table. - - Parameters - ---------- - ignore_index : bool, default False - If True, don't include the index in the columns. - """ - return table_view_from_columns( - tbl._index._columns + tbl._columns - if not ignore_index and tbl._index is not None - else tbl._columns - ) - - -cpdef generate_pandas_metadata(table, index): - col_names = [] - types = [] - index_levels = [] - index_descriptors = [] - columns_to_convert = list(table._columns) - # Columns - for name, col in table._column_labels_and_values: - if cudf.get_option("mode.pandas_compatible"): - # in pandas-compat mode, non-string column names are stringified. 
- col_names.append(str(name)) - else: - col_names.append(name) - - if isinstance(col.dtype, cudf.CategoricalDtype): - raise ValueError( - "'category' column dtypes are currently not " - + "supported by the gpu accelerated parquet writer" - ) - elif isinstance(col.dtype, ( - cudf.ListDtype, - cudf.StructDtype, - cudf.core.dtypes.DecimalDtype - )): - types.append(col.dtype.to_arrow()) - else: - # A boolean element takes 8 bits in cudf and 1 bit in - # pyarrow. To make sure the cudf format is interperable - # in arrow, we use `int8` type when converting from a - # cudf boolean array. - if col.dtype.type == np.bool_: - types.append(pa.int8()) - else: - types.append(np_to_pa_dtype(col.dtype)) - - # Indexes - materialize_index = False - if index is not False: - for level, name in enumerate(table._index.names): - if isinstance(table._index, cudf.MultiIndex): - idx = table.index.get_level_values(level) - else: - idx = table.index - - if isinstance(idx, cudf.RangeIndex): - if index is None: - descr = { - "kind": "range", - "name": table.index.name, - "start": table.index.start, - "stop": table.index.stop, - "step": table.index.step, - } - else: - materialize_index = True - # When `index=True`, RangeIndex needs to be materialized. - materialized_idx = idx._as_int_index() - descr = _index_level_name( - index_name=materialized_idx.name, - level=level, - column_names=col_names - ) - index_levels.append(materialized_idx) - columns_to_convert.append(materialized_idx._values) - col_names.append(descr) - types.append(np_to_pa_dtype(materialized_idx.dtype)) - else: - descr = _index_level_name( - index_name=idx.name, - level=level, - column_names=col_names - ) - columns_to_convert.append(idx._values) - col_names.append(descr) - if isinstance(idx.dtype, cudf.CategoricalDtype): - raise ValueError( - "'category' column dtypes are currently not " - + "supported by the gpu accelerated parquet writer" - ) - elif isinstance(idx.dtype, cudf.ListDtype): - types.append(col.dtype.to_arrow()) - else: - # A boolean element takes 8 bits in cudf and 1 bit in - # pyarrow. To make sure the cudf format is interperable - # in arrow, we use `int8` type when converting from a - # cudf boolean array. - if idx.dtype.type == np.bool_: - types.append(pa.int8()) - else: - types.append(np_to_pa_dtype(idx.dtype)) - - index_levels.append(idx) - index_descriptors.append(descr) - - df_meta = table.head(0) - if materialize_index: - df_meta.index = df_meta.index._as_int_index() - metadata = pa.pandas_compat.construct_metadata( - columns_to_convert=columns_to_convert, - # It is OKAY to do `.head(0).to_pandas()` because - # this method will extract `.columns` metadata only - df=df_meta.to_pandas(), - column_names=col_names, - index_levels=index_levels, - index_descriptors=index_descriptors, - preserve_index=index, - types=types, - ) - - md_dict = json.loads(metadata[b"pandas"]) - - # correct metadata for list and struct and nullable numeric types - for col_meta in md_dict["columns"]: - if ( - col_meta["name"] in table._column_names - and table._data[col_meta["name"]].nullable - and col_meta["numpy_type"] in PARQUET_META_TYPE_MAP - and col_meta["pandas_type"] != "decimal" - ): - col_meta["numpy_type"] = PARQUET_META_TYPE_MAP[ - col_meta["numpy_type"] - ] - if col_meta["numpy_type"] in ("list", "struct"): - col_meta["numpy_type"] = "object" - - return json.dumps(md_dict) - - -def _index_level_name(index_name, level, column_names): - """ - Return the name of an index level or a default name - if `index_name` is None or is already a column name. 
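# Illustration only, not part of the patch: the index descriptor that the
# generate_pandas_metadata helper above emits for a default (non-materialized)
# RangeIndex, assuming RangeIndex(start=0, stop=6, step=2, name="idx").
descr = {
    "kind": "range",
    "name": "idx",
    "start": 0,
    "stop": 6,
    "step": 2,
}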
- - Parameters - ---------- - index_name : name of an Index object - level : level of the Index object - - Returns - ------- - name : str - """ - if index_name is not None and index_name not in column_names: - return index_name - else: - return f"__index_level_{level}__" - - -cdef columns_from_unique_ptr( - unique_ptr[table] c_tbl -): - """Convert a libcudf table into list of columns. - - Parameters - ---------- - c_tbl : unique_ptr[cudf::table] - The libcudf table whose columns will be extracted - - Returns - ------- - list[Column] - A list of columns. - """ - cdef vector[unique_ptr[column]] c_columns = move(c_tbl.get().release()) - cdef vector[unique_ptr[column]].iterator it = c_columns.begin() - - cdef size_t i - - return [ - Column.from_pylibcudf( - plc_Column.from_libcudf(move(dereference(it+i))) - ) for i in range(c_columns.size()) - ] cpdef columns_from_pylibcudf_table(tbl): @@ -281,8 +55,7 @@ cpdef _data_from_columns(columns, column_names, index_names=None): # the data while actually constructing the Index object here (instead # of just returning a dict for that as well). As we clean up the # Frame factories we may want to look for a less dissonant approach - # that does not impose performance penalties. The same applies to - # data_from_table_view below. + # that does not impose performance penalties. cudf.core.index._index_from_data( { name: columns[i] @@ -300,16 +73,6 @@ cpdef _data_from_columns(columns, column_names, index_names=None): return data, index -cdef data_from_unique_ptr( - unique_ptr[table] c_tbl, column_names, index_names=None -): - return _data_from_columns( - columns_from_unique_ptr(move(c_tbl)), - column_names, - index_names - ) - - cpdef data_from_pylibcudf_table(tbl, column_names, index_names=None): return _data_from_columns( columns_from_pylibcudf_table(tbl), @@ -329,73 +92,3 @@ cpdef data_from_pylibcudf_io(tbl_with_meta, column_names=None, index_names=None) column_names=column_names, index_names=index_names ) - -cdef columns_from_table_view( - table_view tv, - object owners, -): - """ - Given a ``cudf::table_view``, constructs a list of columns from it, - along with referencing an owner Python object that owns the memory - lifetime. owner must be either None or a list of column. If owner - is a list of columns, the owner of the `i`th ``cudf::column_view`` - in the table view is ``owners[i]``. For more about memory ownership, - see ``Column.from_column_view``. - """ - - return [ - Column.from_column_view( - tv.column(i), owners[i] if isinstance(owners, list) else None - ) for i in range(tv.num_columns()) - ] - -cdef data_from_table_view( - table_view tv, - object owner, - object column_names, - object index_names=None -): - """ - Given a ``cudf::table_view``, constructs a Frame from it, - along with referencing an ``owner`` Python object that owns the memory - lifetime. 
If ``owner`` is a Frame we reach inside of it and - reach inside of each ``cudf.Column`` to make the owner of each newly - created ``Buffer`` underneath the ``cudf.Column`` objects of the - created Frame the respective ``Buffer`` from the relevant - ``cudf.Column`` of the ``owner`` Frame - """ - cdef size_type column_idx = 0 - table_owner = isinstance(owner, cudf.core.frame.Frame) - - # First construct the index, if any - index = None - if index_names is not None: - index_columns = [] - for _ in index_names: - column_owner = owner - if table_owner: - column_owner = owner._index._columns[column_idx] - index_columns.append( - Column.from_column_view( - tv.column(column_idx), - column_owner - ) - ) - column_idx += 1 - index = cudf.core.index._index_from_data( - dict(zip(index_names, index_columns))) - - # Construct the data dict - cdef size_type source_column_idx = 0 - data_columns = [] - for _ in column_names: - column_owner = owner - if table_owner: - column_owner = owner._columns[source_column_idx] - data_columns.append( - Column.from_column_view(tv.column(column_idx), column_owner) - ) - column_idx += 1 - source_column_idx += 1 - - return dict(zip(column_names, data_columns)), index diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index ed48fbf5c5a..c2f3c782d10 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -10,17 +10,18 @@ from typing_extensions import Self import cudf -from cudf._lib.copying import _gather_map_is_valid, gather -from cudf._lib.stream_compaction import ( +from cudf._lib.types import size_type_dtype +from cudf.api.extensions import no_default +from cudf.api.types import is_integer, is_list_like, is_scalar +from cudf.core._internals import copying +from cudf.core._internals.stream_compaction import ( apply_boolean_mask, drop_duplicates, drop_nulls, ) -from cudf._lib.types import size_type_dtype -from cudf.api.extensions import no_default -from cudf.api.types import is_integer, is_list_like, is_scalar from cudf.core.abc import Serializable from cudf.core.column import ColumnBase, column +from cudf.core.copy_types import GatherMap from cudf.errors import MixedTypeError from cudf.utils import ioutils from cudf.utils.dtypes import can_convert_to_column, is_mixed_with_object_dtype @@ -413,7 +414,7 @@ def hasnans(self): raise NotImplementedError @property - def nlevels(self): + def nlevels(self) -> int: """ Number of levels. 
""" @@ -1943,7 +1944,6 @@ def drop_duplicates( return self._from_columns_like_self( drop_duplicates( list(self._columns), - keys=range(len(self._columns)), keep=keep, nulls_are_equal=nulls_are_equal, ), @@ -2032,7 +2032,6 @@ def dropna(self, how="any"): drop_nulls( data_columns, how=how, - keys=range(len(data_columns)), ), self._column_names, ) @@ -2050,13 +2049,9 @@ def _gather(self, gather_map, nullify=False, check_bounds=True): if gather_map.dtype.kind not in "iu": gather_map = gather_map.astype(size_type_dtype) - if not _gather_map_is_valid( - gather_map, len(self), check_bounds, nullify - ): - raise IndexError("Gather map index is out of bounds.") - + GatherMap(gather_map, len(self), nullify=not check_bounds or nullify) return self._from_columns_like_self( - gather(list(self._columns), gather_map, nullify=nullify), + copying.gather(self._columns, gather_map, nullify=nullify), self._column_names, ) diff --git a/python/cudf/cudf/core/_internals/copying.py b/python/cudf/cudf/core/_internals/copying.py new file mode 100644 index 00000000000..34c1850cb72 --- /dev/null +++ b/python/cudf/cudf/core/_internals/copying.py @@ -0,0 +1,96 @@ +# Copyright (c) 2020-2024, NVIDIA CORPORATION. +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pylibcudf as plc + +import cudf +from cudf.core.buffer import acquire_spill_lock + +if TYPE_CHECKING: + from collections.abc import Iterable + + from cudf.core.column import ColumnBase + from cudf.core.column.numerical import NumericalColumn + + +@acquire_spill_lock() +def gather( + columns: Iterable[ColumnBase], + gather_map: NumericalColumn, + nullify: bool = False, +) -> list[ColumnBase]: + plc_tbl = plc.copying.gather( + plc.Table([col.to_pylibcudf(mode="read") for col in columns]), + gather_map.to_pylibcudf(mode="read"), + plc.copying.OutOfBoundsPolicy.NULLIFY + if nullify + else plc.copying.OutOfBoundsPolicy.DONT_CHECK, + ) + return [ + cudf._lib.column.Column.from_pylibcudf(col) + for col in plc_tbl.columns() + ] + + +@acquire_spill_lock() +def scatter( + sources: list[ColumnBase | cudf.Scalar], + scatter_map: NumericalColumn, + target_columns: list[ColumnBase], + bounds_check: bool = True, +): + """ + Scattering source into target as per the scatter map. + `source` can be a list of scalars, or a list of columns. The number of + items in `sources` must equal the number of `target_columns` to scatter. + """ + # TODO: Only single column scatter is used, we should explore multi-column + # scatter for frames for performance increase. 
+
+    if len(sources) != len(target_columns):
+        raise ValueError("Mismatched number of source and target columns.")
+
+    if len(sources) == 0:
+        return []
+
+    if bounds_check:
+        n_rows = len(target_columns[0])
+        if not (
+            (scatter_map >= -n_rows).all() and (scatter_map < n_rows).all()
+        ):
+            raise IndexError(
+                f"index out of bounds for column of size {n_rows}"
+            )
+
+    plc_tbl = plc.copying.scatter(
+        plc.Table([col.to_pylibcudf(mode="read") for col in sources])  # type: ignore[union-attr]
+        if isinstance(sources[0], cudf._lib.column.Column)
+        else [slr.device_value.c_value for slr in sources],  # type: ignore[union-attr]
+        scatter_map.to_pylibcudf(mode="read"),
+        plc.Table([col.to_pylibcudf(mode="read") for col in target_columns]),
+    )
+
+    return [
+        cudf._lib.column.Column.from_pylibcudf(col)
+        for col in plc_tbl.columns()
+    ]
+
+
+@acquire_spill_lock()
+def columns_split(
+    input_columns: Iterable[ColumnBase], splits: list[int]
+) -> list[list[ColumnBase]]:
+    return [
+        [
+            cudf._lib.column.Column.from_pylibcudf(col)
+            for col in plc_tbl.columns()
+        ]
+        for plc_tbl in plc.copying.split(
+            plc.Table(
+                [col.to_pylibcudf(mode="read") for col in input_columns]
+            ),
+            splits,
+        )
+    ]
diff --git a/python/cudf/cudf/core/_internals/stream_compaction.py b/python/cudf/cudf/core/_internals/stream_compaction.py
new file mode 100644
index 00000000000..4ccc26c2a1c
--- /dev/null
+++ b/python/cudf/cudf/core/_internals/stream_compaction.py
@@ -0,0 +1,121 @@
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Literal
+
+import pylibcudf as plc
+
+from cudf._lib.column import Column
+from cudf.core.buffer import acquire_spill_lock
+
+if TYPE_CHECKING:
+    from cudf.core.column import ColumnBase
+
+
+@acquire_spill_lock()
+def drop_nulls(
+    columns: list[ColumnBase],
+    how: Literal["any", "all"] = "any",
+    keys: list[int] | None = None,
+    thresh: int | None = None,
+) -> list[ColumnBase]:
+    """
+    Drops null rows from the columns depending on the key columns.
+
+    Parameters
+    ----------
+    columns : list of columns
+    how : "any" or "all". If thresh is None, drops rows that have any
+        null or all nulls (respectively) in the key columns (default: "any")
+    keys : List of column indices. If set, then these columns are checked for
+        nulls rather than all of the columns (optional)
+    thresh : Minimum number of non-nulls required to keep a row (optional)
+
+    Returns
+    -------
+    columns with null rows dropped
+    """
+    if how not in {"any", "all"}:
+        raise ValueError("how must be 'any' or 'all'")
+
+    keys = keys if keys is not None else list(range(len(columns)))
+
+    # Note: If how == "all" and thresh is specified, thresh takes priority
+    if thresh is not None:
+        keep_threshold = thresh
+    elif how == "all":
+        keep_threshold = 1
+    else:
+        keep_threshold = len(keys)
+
+    plc_table = plc.stream_compaction.drop_nulls(
+        plc.Table([col.to_pylibcudf(mode="read") for col in columns]),
+        keys,
+        keep_threshold,
+    )
+    return [Column.from_pylibcudf(col) for col in plc_table.columns()]
+
+
+@acquire_spill_lock()
+def apply_boolean_mask(
+    columns: list[ColumnBase], boolean_mask: ColumnBase
+) -> list[ColumnBase]:
+    """
+    Drops the rows which correspond to False in boolean_mask.
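# Illustration only, not part of the patch: columns_split above cuts a table
# at the given row offsets, assuming a 6-row column and splits=[2, 4].
import pyarrow as pa
import pylibcudf as plc

tbl = plc.interop.from_arrow(pa.table({"x": [0, 1, 2, 3, 4, 5]}))
pieces = plc.copying.split(tbl, [2, 4])
assert [piece.num_rows() for piece in pieces] == [2, 2, 2]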
+
+    Parameters
+    ----------
+    columns : list of columns whose rows are dropped as per boolean_mask
+    boolean_mask : a boolean column of the same size as the source columns
+
+    Returns
+    -------
+    columns obtained from applying the mask
+    """
+    plc_table = plc.stream_compaction.apply_boolean_mask(
+        plc.Table([col.to_pylibcudf(mode="read") for col in columns]),
+        boolean_mask.to_pylibcudf(mode="read"),
+    )
+    return [Column.from_pylibcudf(col) for col in plc_table.columns()]
+
+
+@acquire_spill_lock()
+def drop_duplicates(
+    columns: list[ColumnBase],
+    keys: list[int] | None = None,
+    keep: Literal["first", "last", False] = "first",
+    nulls_are_equal: bool = True,
+) -> list[ColumnBase]:
+    """
+    Drops duplicate rows from the columns, as determined by the key columns.
+
+    Parameters
+    ----------
+    columns : List of columns
+    keys : List of column indices. If set, then these columns are checked for
+        duplicates rather than all of the columns (optional)
+    keep : keep the 'first' or 'last' of the duplicate rows, or (False) none
+        of them
+    nulls_are_equal : if True, nulls are treated as equal; otherwise they
+        are not
+
+    Returns
+    -------
+    columns with duplicates dropped
+    """
+    _keep_options = {
+        "first": plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST,
+        "last": plc.stream_compaction.DuplicateKeepOption.KEEP_LAST,
+        False: plc.stream_compaction.DuplicateKeepOption.KEEP_NONE,
+    }
+    if (keep_option := _keep_options.get(keep)) is None:
+        raise ValueError('keep must be either "first", "last" or False')
+
+    plc_table = plc.stream_compaction.stable_distinct(
+        plc.Table([col.to_pylibcudf(mode="read") for col in columns]),
+        keys if keys is not None else list(range(len(columns))),
+        keep_option,
+        plc.types.NullEquality.EQUAL
+        if nulls_are_equal
+        else plc.types.NullEquality.UNEQUAL,
+        plc.types.NanEquality.ALL_EQUAL,
+    )
+    return [Column.from_pylibcudf(col) for col in plc_table.columns()]
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index a0cf38c6f51..d9b54008e85 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -1095,17 +1095,22 @@ def as_categorical_column(self, dtype: Dtype) -> Self:
             raise ValueError("dtype must be CategoricalDtype")

         if not isinstance(self.categories, type(dtype.categories._column)):
-            # If both categories are of different Column types,
-            # return a column full of Nulls.
-            codes = cast(
-                cudf.core.column.numerical.NumericalColumn,
-                column.as_column(
-                    _DEFAULT_CATEGORICAL_VALUE,
-                    length=self.size,
-                    dtype=self.codes.dtype,
-                ),
-            )
-            codes = as_unsigned_codes(len(dtype.categories), codes)
+            if isinstance(
+                self.categories.dtype, cudf.StructDtype
+            ) and isinstance(dtype.categories.dtype, cudf.IntervalDtype):
+                codes = self.codes
+            else:
+                # Otherwise, if the two categories are of different Column
+                # types, return a column full of nulls.
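                # Example (illustration only, not part of the patch): casting
                # string categories to integer categories lands here, so every
                # code becomes the null sentinel. The struct -> interval branch
                # above instead keeps self.codes, because interval categories
                # round-trip through arrow as structs (see the as_column change
                # further below).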
+ codes = cast( + cudf.core.column.numerical.NumericalColumn, + column.as_column( + _DEFAULT_CATEGORICAL_VALUE, + length=self.size, + dtype=self.codes.dtype, + ), + ) + codes = as_unsigned_codes(len(dtype.categories), codes) return type(self)( data=self.data, # type: ignore[arg-type] size=self.size, diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index cc07af0f669..cccafaeba88 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -26,12 +26,6 @@ from cudf import _lib as libcudf from cudf._lib.column import Column from cudf._lib.scalar import as_device_scalar -from cudf._lib.stream_compaction import ( - apply_boolean_mask, - distinct_count as cpp_distinct_count, - drop_duplicates, - drop_nulls, -) from cudf._lib.types import dtype_to_pylibcudf_type, size_type_dtype from cudf.api.types import ( _is_non_decimal_numeric_dtype, @@ -42,7 +36,12 @@ is_string_dtype, ) from cudf.core._compat import PANDAS_GE_210 -from cudf.core._internals import aggregation, sorting, unary +from cudf.core._internals import aggregation, copying, sorting, unary +from cudf.core._internals.stream_compaction import ( + apply_boolean_mask, + drop_duplicates, + drop_nulls, +) from cudf.core._internals.timezones import get_compatible_timezone from cudf.core.abc import Serializable from cudf.core.buffer import ( @@ -51,6 +50,7 @@ as_buffer, cuda_array_interface_wrapper, ) +from cudf.core.copy_types import GatherMap from cudf.core.dtypes import ( CategoricalDtype, DecimalDtype, @@ -77,6 +77,7 @@ import builtins from cudf._typing import ColumnLike, Dtype, ScalarLike + from cudf.core.column.numerical import NumericalColumn if PANDAS_GE_210: NumpyExtensionArray = pd.arrays.NumpyExtensionArray @@ -274,10 +275,11 @@ def any(self, skipna: bool = True) -> bool: def dropna(self) -> Self: if self.has_nulls(): - return drop_nulls([self])[0]._with_type_metadata(self.dtype) + return drop_nulls([self])[0]._with_type_metadata(self.dtype) # type: ignore[return-value] else: return self.copy() + @acquire_spill_lock() def to_arrow(self) -> pa.Array: """Convert to PyArrow Array @@ -294,9 +296,7 @@ def to_arrow(self) -> pa.Array: 4 ] """ - return libcudf.interop.to_arrow([self], [("None", self.dtype)])[ - "None" - ].chunk(0) + return plc.interop.to_arrow(self.to_pylibcudf(mode="read")).chunk(0) @classmethod def from_arrow(cls, array: pa.Array) -> ColumnBase: @@ -333,26 +333,33 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: if isinstance(array.type, pa.DictionaryType): indices_table = pa.table( - { - "None": pa.chunked_array( - [chunk.indices for chunk in data["None"].chunks], + [ + pa.chunked_array( + [chunk.indices for chunk in data.column(0).chunks], type=array.type.index_type, ) - } + ], + [None], ) dictionaries_table = pa.table( - { - "None": pa.chunked_array( - [chunk.dictionary for chunk in data["None"].chunks], + [ + pa.chunked_array( + [chunk.dictionary for chunk in data.column(0).chunks], type=array.type.value_type, ) - } + ], + [None], ) - - codes = libcudf.interop.from_arrow(indices_table)[0] - categories = libcudf.interop.from_arrow(dictionaries_table)[0] + with acquire_spill_lock(): + codes = cls.from_pylibcudf( + plc.interop.from_arrow(indices_table).columns()[0] + ) + categories = cls.from_pylibcudf( + plc.interop.from_arrow(dictionaries_table).columns()[0] + ) codes = cudf.core.column.categorical.as_unsigned_codes( - len(categories), codes + len(categories), + codes, # type: ignore[arg-type] ) return 
cudf.core.column.CategoricalColumn( data=None, @@ -363,10 +370,14 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: mask=codes.base_mask, children=(codes,), ) - - result = libcudf.interop.from_arrow(data)[0] - - return result._with_type_metadata(cudf_dtype_from_pa_type(array.type)) + else: + result = cls.from_pylibcudf( + plc.interop.from_arrow(data).columns()[0] + ) + # TODO: cudf_dtype_from_pa_type may be less necessary for some types + return result._with_type_metadata( + cudf_dtype_from_pa_type(array.type) + ) @acquire_spill_lock() def _get_mask_as_column(self) -> ColumnBase: @@ -431,8 +442,16 @@ def _fill( ) return self - def shift(self, offset: int, fill_value: ScalarLike) -> ColumnBase: - return libcudf.copying.shift(self, offset, fill_value) + @acquire_spill_lock() + def shift(self, offset: int, fill_value: ScalarLike) -> Self: + if not isinstance(fill_value, cudf.Scalar): + fill_value = cudf.Scalar(fill_value, dtype=self.dtype) + plc_col = plc.copying.shift( + self.to_pylibcudf(mode="read"), + offset, + fill_value.device_value.c_value, + ) + return type(self).from_pylibcudf(plc_col) # type: ignore[return-value] @property def nullmask(self) -> Buffer: @@ -460,8 +479,11 @@ def copy(self, deep: bool = True) -> Self: them. """ if deep: - result = libcudf.copying.copy_column(self) - return result._with_type_metadata(self.dtype) + with acquire_spill_lock(): + result = type(self).from_pylibcudf( + self.to_pylibcudf(mode="read").copy() + ) + return result._with_type_metadata(self.dtype) # type: ignore[return-value] else: return cast( Self, @@ -542,7 +564,15 @@ def element_indexing(self, index: int): idx = len(self) + idx if idx > len(self) - 1 or idx < 0: raise IndexError("single positional indexer is out-of-bounds") - return libcudf.copying.get_element(self, idx).value + with acquire_spill_lock(): + dscalar = libcudf.scalar.DeviceScalar.from_pylibcudf( + plc.copying.get_element( + self.to_pylibcudf(mode="read"), + idx, + ), + dtype=self.dtype, + ) + return dscalar.value def slice(self, start: int, stop: int, stride: int | None = None) -> Self: stride = 1 if stride is None else stride @@ -554,9 +584,15 @@ def slice(self, start: int, stop: int, stride: int | None = None) -> Self: return cast(Self, column_empty(0, self.dtype)) # compute mask slice if stride == 1: - return libcudf.copying.column_slice(self, [start, stop])[ - 0 - ]._with_type_metadata(self.dtype) + with acquire_spill_lock(): + result = [ + type(self).from_pylibcudf(col) + for col in plc.copying.slice( + self.to_pylibcudf(mode="read"), + [start, stop], + ) + ] + return result[0]._with_type_metadata(self.dtype) # type: ignore[return-value] else: # Need to create a gather map for given slice with stride gather_map = as_column( @@ -625,9 +661,16 @@ def _scatter_by_slice( if isinstance(value, cudf.core.scalar.Scalar): return self._fill(value, start, stop, inplace=True) else: - return libcudf.copying.copy_range( - value, self, 0, num_keys, start, stop, False - ) + with acquire_spill_lock(): + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.copying.copy_range( + value.to_pylibcudf(mode="read"), + self.to_pylibcudf(mode="read"), + 0, + num_keys, + start, + ) + ) # step != 1, create a scatter map with arange scatter_map = cast( @@ -671,11 +714,21 @@ def _scatter_by_column( self._check_scatter_key_length(num_keys, value) if key.dtype.kind == "b": - return libcudf.copying.boolean_mask_scatter([value], [self], key)[ - 0 - ]._with_type_metadata(self.dtype) + with acquire_spill_lock(): + plc_table = 
plc.copying.boolean_mask_scatter( + plc.Table([value.to_pylibcudf(mode="read")]) + if isinstance(value, Column) + else [value.device_value.c_value], + plc.Table([self.to_pylibcudf(mode="read")]), + key.to_pylibcudf(mode="read"), + ) + return ( + type(self) # type: ignore[return-value] + .from_pylibcudf(plc_table.columns()[0]) + ._with_type_metadata(self.dtype) + ) else: - return libcudf.copying.scatter([value], key, [self])[ + return copying.scatter([value], key, [self])[ 0 ]._with_type_metadata(self.dtype) @@ -805,7 +858,7 @@ def indices_of( else: value = as_column(value, dtype=self.dtype, length=1) mask = value.contains(self) - return apply_boolean_mask( + return apply_boolean_mask( # type: ignore[return-value] [as_column(range(0, len(self)), dtype=size_type_dtype)], mask )[0] @@ -887,14 +940,9 @@ def take( # be done by the caller. This check will be removed in future release. if indices.dtype.kind not in {"u", "i"}: indices = indices.astype(libcudf.types.size_type_dtype) - if not libcudf.copying._gather_map_is_valid( - indices, len(self), check_bounds, nullify - ): - raise IndexError("Gather map index is out of bounds.") - - return libcudf.copying.gather([self], indices, nullify=nullify)[ - 0 - ]._with_type_metadata(self.dtype) + GatherMap(indices, len(self), nullify=not check_bounds or nullify) + gathered = copying.gather([self], indices, nullify=nullify) # type: ignore[arg-type] + return gathered[0]._with_type_metadata(self.dtype) # type: ignore[return-value] def isin(self, values: Sequence) -> ColumnBase: """Check whether values are contained in the Column. @@ -1045,9 +1093,15 @@ def distinct_count(self, dropna: bool = True) -> int: try: return self._distinct_count[dropna] except KeyError: - self._distinct_count[dropna] = cpp_distinct_count( - self, ignore_nulls=dropna - ) + with acquire_spill_lock(): + result = plc.stream_compaction.distinct_count( + self.to_pylibcudf(mode="read"), + plc.types.NullPolicy.EXCLUDE + if dropna + else plc.types.NullPolicy.INCLUDE, + plc.types.NanPolicy.NAN_IS_VALID, + ) + self._distinct_count[dropna] = result return self._distinct_count[dropna] def can_cast_safely(self, to_dtype: Dtype) -> bool: @@ -1276,7 +1330,7 @@ def unique(self) -> Self: if self.is_unique: return self.copy() else: - return drop_duplicates([self], keep="first")[ + return drop_duplicates([self], keep="first")[ # type: ignore[return-value] 0 ]._with_type_metadata(self.dtype) @@ -1507,20 +1561,33 @@ def _return_sentinel_column(): left_gather_map = type(self).from_pylibcudf(left_rows) right_gather_map = type(self).from_pylibcudf(right_rows) - codes = libcudf.copying.gather( - [as_column(range(len(cats)), dtype=dtype)], - right_gather_map, - nullify=True, + codes = as_column(range(len(cats)), dtype=dtype).take( + right_gather_map, nullify=True ) del right_gather_map del right_rows # reorder `codes` so that its values correspond to the # values of `self`: (codes,) = sorting.sort_by_key( - codes, [left_gather_map], [True], ["last"], stable=True + [codes], [left_gather_map], [True], ["last"], stable=True ) return codes.fillna(na_sentinel.value) + @acquire_spill_lock() + def copy_if_else( + self, other: Self | cudf.Scalar, boolean_mask: NumericalColumn + ) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.copying.copy_if_else( + self.to_pylibcudf(mode="read"), + other.device_value.c_value + if isinstance(other, cudf.Scalar) + else other.to_pylibcudf(mode="read"), + boolean_mask.to_pylibcudf(mode="read"), + ) + ) + + @acquire_spill_lock() def one_hot_encode( 
self, categories: ColumnBase ) -> abc.Generator[ColumnBase]: @@ -2024,18 +2091,26 @@ def as_column( if isinstance(arbitrary.dtype, pd.DatetimeTZDtype): new_tz = get_compatible_timezone(arbitrary.dtype) arbitrary = arbitrary.astype(new_tz) - if isinstance(arbitrary.dtype, pd.CategoricalDtype) and isinstance( - arbitrary.dtype.categories.dtype, pd.DatetimeTZDtype - ): - new_tz = get_compatible_timezone( - arbitrary.dtype.categories.dtype - ) - new_cats = arbitrary.dtype.categories.astype(new_tz) - new_dtype = pd.CategoricalDtype( - categories=new_cats, ordered=arbitrary.dtype.ordered - ) - arbitrary = arbitrary.astype(new_dtype) - + if isinstance(arbitrary.dtype, pd.CategoricalDtype): + if isinstance( + arbitrary.dtype.categories.dtype, pd.DatetimeTZDtype + ): + new_tz = get_compatible_timezone( + arbitrary.dtype.categories.dtype + ) + new_cats = arbitrary.dtype.categories.astype(new_tz) + new_dtype = pd.CategoricalDtype( + categories=new_cats, ordered=arbitrary.dtype.ordered + ) + arbitrary = arbitrary.astype(new_dtype) + elif ( + isinstance( + arbitrary.dtype.categories.dtype, pd.IntervalDtype + ) + and dtype is None + ): + # Conversion to arrow converts IntervalDtype to StructDtype + dtype = cudf.CategoricalDtype.from_pandas(arbitrary.dtype) return as_column( pa.array(arbitrary, from_pandas=True), nan_as_null=nan_as_null, diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 81b82040b8d..b6a4122ebb9 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -19,7 +19,6 @@ import cudf import cudf.core.column.column as column -import cudf.core.column.string as string from cudf import _lib as libcudf from cudf.core._compat import PANDAS_GE_220 from cudf.core._internals import binaryop, unary @@ -354,9 +353,7 @@ def is_year_end(self) -> ColumnBase: leap = day_of_year == cudf.Scalar(366) non_leap = day_of_year == cudf.Scalar(365) - return libcudf.copying.copy_if_else(leap, non_leap, leap_dates).fillna( - False - ) + return leap.copy_if_else(non_leap, leap_dates).fillna(False) @property def is_leap_year(self) -> ColumnBase: @@ -604,9 +601,14 @@ def strftime(self, format: str) -> cudf.core.column.StringColumn: names = as_column(_DATETIME_NAMES) else: names = column.column_empty(0, dtype="object") - return string._datetime_to_str_typecast_functions[self.dtype]( - self, format, names - ) + with acquire_spill_lock(): + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.strings.convert.convert_datetime.from_timestamps( + self.to_pylibcudf(mode="read"), + format, + names.to_pylibcudf(mode="read"), + ) + ) def as_string_column(self) -> cudf.core.column.StringColumn: format = _dtype_to_format_conversion.get( @@ -1014,7 +1016,7 @@ def to_pandas( self.dtype.tz, ambiguous="NaT", nonexistent="NaT" ) - def to_arrow(self): + def to_arrow(self) -> pa.Array: return pa.compute.assume_timezone( self._local_time.to_arrow(), str(self.dtype.tz) ) diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 9e6a73f1a9c..09941665ba2 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -269,8 +269,8 @@ def from_arrow(cls, data: pa.Array): mask=mask, ) - def to_arrow(self): - data_buf_32 = np.array(self.base_data.memoryview()).view("int32") + def to_arrow(self) -> pa.Array: + data_buf_32 = np.array(self.base_data.memoryview()).view("int32") # type: ignore[union-attr] data_buf_128 = np.empty(len(data_buf_32) * 4, 
dtype="int32") # use striding to set the first 32 bits of each 128-bit chunk: @@ -337,7 +337,7 @@ def from_arrow(cls, data: pa.Array): result.dtype.precision = data.type.precision return result - def to_arrow(self): + def to_arrow(self) -> pa.Array: return super().to_arrow().cast(self.dtype.to_arrow()) def _with_type_metadata( @@ -396,8 +396,8 @@ def from_arrow(cls, data: pa.Array): mask=mask, ) - def to_arrow(self): - data_buf_64 = np.array(self.base_data.memoryview()).view("int64") + def to_arrow(self) -> pa.Array: + data_buf_64 = np.array(self.base_data.memoryview()).view("int64") # type: ignore[union-attr] data_buf_128 = np.empty(len(data_buf_64) * 2, dtype="int64") # use striding to set the first 64 bits of each 128-bit chunk: diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index ba98e28f6a2..3d9440cdf21 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -150,7 +150,7 @@ def offsets(self) -> NumericalColumn: """ return cast(NumericalColumn, self.children[0]) - def to_arrow(self): + def to_arrow(self) -> pa.Array: offsets = self.offsets.to_arrow() elements = ( pa.nulls(len(self.elements)) @@ -160,7 +160,7 @@ def to_arrow(self): pa_type = pa.list_(elements.type) if self.nullable: - nbuf = pa.py_buffer(self.mask.memoryview()) + nbuf = pa.py_buffer(self.mask.memoryview()) # type: ignore[union-attr] buffers = (nbuf, offsets.buffers()[1]) else: buffers = offsets.buffers() diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index f099cef3331..4405e153b0c 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -14,8 +14,6 @@ import cudf import cudf.core.column.column as column -import cudf.core.column.string as string -from cudf import _lib as libcudf from cudf.api.types import is_integer, is_scalar from cudf.core._internals import binaryop, unary from cudf.core.buffer import acquire_spill_lock, as_buffer @@ -366,22 +364,42 @@ def normalize_binop_value(self, other: ScalarLike) -> Self | cudf.Scalar: else: return NotImplemented - def int2ip(self) -> "cudf.core.column.StringColumn": - if self.dtype != cudf.dtype("uint32"): + @acquire_spill_lock() + def int2ip(self) -> cudf.core.column.StringColumn: + if self.dtype != np.dtype(np.uint32): raise TypeError("Only uint32 type can be converted to ip") - - return libcudf.string_casting.int2ip(self) + plc_column = plc.strings.convert.convert_ipv4.integers_to_ipv4( + self.to_pylibcudf(mode="read") + ) + return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] def as_string_column(self) -> cudf.core.column.StringColumn: - if len(self) > 0: - return string._numeric_to_str_typecast_functions[ - cudf.dtype(self.dtype) - ](self) - else: + if len(self) == 0: return cast( cudf.core.column.StringColumn, column.column_empty(0, dtype="object"), ) + elif self.dtype.kind == "b": + conv_func = functools.partial( + plc.strings.convert.convert_booleans.from_booleans, + true_string=cudf.Scalar( + "True", dtype="str" + ).device_value.c_value, + false_string=cudf.Scalar( + "False", dtype="str" + ).device_value.c_value, + ) + elif self.dtype.kind in {"i", "u"}: + conv_func = plc.strings.convert.convert_integers.from_integers + elif self.dtype.kind == "f": + conv_func = plc.strings.convert.convert_floats.from_floats + else: + raise ValueError(f"No string conversion from type {self.dtype}") + + with acquire_spill_lock(): + return type(self).from_pylibcudf( # type: 
ignore[return-value] + conv_func(self.to_pylibcudf(mode="read")) + ) def as_datetime_column( self, dtype: Dtype diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index d76caa5c3b8..fcdcb789f23 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -20,9 +20,8 @@ import cudf.core.column.column as column import cudf.core.column.datetime as datetime from cudf import _lib as libcudf -from cudf._lib import string_casting as str_cast from cudf._lib.column import Column -from cudf._lib.types import size_type_dtype +from cudf._lib.types import dtype_to_pylibcudf_type, size_type_dtype from cudf.api.types import is_integer, is_scalar, is_string_dtype from cudf.core._internals import binaryop from cudf.core.buffer import acquire_spill_lock @@ -49,62 +48,7 @@ from cudf.core.column.numerical import NumericalColumn -def str_to_boolean(column: StringColumn): - """Takes in string column and returns boolean column""" - with acquire_spill_lock(): - plc_column = plc.strings.attributes.count_characters( - column.to_pylibcudf(mode="read") - ) - result = Column.from_pylibcudf(plc_column) - return (result > cudf.Scalar(0, dtype="int8")).fillna(False) - - -_str_to_numeric_typecast_functions = { - cudf.api.types.dtype("int8"): str_cast.stoi8, - cudf.api.types.dtype("int16"): str_cast.stoi16, - cudf.api.types.dtype("int32"): str_cast.stoi, - cudf.api.types.dtype("int64"): str_cast.stol, - cudf.api.types.dtype("uint8"): str_cast.stoui8, - cudf.api.types.dtype("uint16"): str_cast.stoui16, - cudf.api.types.dtype("uint32"): str_cast.stoui, - cudf.api.types.dtype("uint64"): str_cast.stoul, - cudf.api.types.dtype("float32"): str_cast.stof, - cudf.api.types.dtype("float64"): str_cast.stod, - cudf.api.types.dtype("bool"): str_to_boolean, -} - -_numeric_to_str_typecast_functions = { - cudf.api.types.dtype("int8"): str_cast.i8tos, - cudf.api.types.dtype("int16"): str_cast.i16tos, - cudf.api.types.dtype("int32"): str_cast.itos, - cudf.api.types.dtype("int64"): str_cast.ltos, - cudf.api.types.dtype("uint8"): str_cast.ui8tos, - cudf.api.types.dtype("uint16"): str_cast.ui16tos, - cudf.api.types.dtype("uint32"): str_cast.uitos, - cudf.api.types.dtype("uint64"): str_cast.ultos, - cudf.api.types.dtype("float32"): str_cast.ftos, - cudf.api.types.dtype("float64"): str_cast.dtos, - cudf.api.types.dtype("bool"): str_cast.from_booleans, -} - -_datetime_to_str_typecast_functions = { - # TODO: support Date32 UNIX days - # cudf.api.types.dtype("datetime64[D]"): str_cast.int2timestamp, - cudf.api.types.dtype("datetime64[s]"): str_cast.int2timestamp, - cudf.api.types.dtype("datetime64[ms]"): str_cast.int2timestamp, - cudf.api.types.dtype("datetime64[us]"): str_cast.int2timestamp, - cudf.api.types.dtype("datetime64[ns]"): str_cast.int2timestamp, -} - -_timedelta_to_str_typecast_functions = { - cudf.api.types.dtype("timedelta64[s]"): str_cast.int2timedelta, - cudf.api.types.dtype("timedelta64[ms]"): str_cast.int2timedelta, - cudf.api.types.dtype("timedelta64[us]"): str_cast.int2timedelta, - cudf.api.types.dtype("timedelta64[ns]"): str_cast.int2timedelta, -} - - -def _is_supported_regex_flags(flags): +def _is_supported_regex_flags(flags: int) -> bool: return flags == 0 or ( (flags & (re.MULTILINE | re.DOTALL) != 0) and (flags & ~(re.MULTILINE | re.DOTALL) == 0) @@ -155,10 +99,7 @@ def htoi(self) -> SeriesOrIndex: 3 51966 dtype: int64 """ - - out = str_cast.htoi(self._column) - - return self._return_or_inplace(out, inplace=False) + return 
self._return_or_inplace(self._column.hex_to_integers()) hex_to_int = htoi @@ -188,10 +129,7 @@ def ip2int(self) -> SeriesOrIndex: 2 0 dtype: int64 """ - - out = str_cast.ip2int(self._column) - - return self._return_or_inplace(out, inplace=False) + return self._return_or_inplace(self._column.ipv4_to_integers()) ip_to_int = ip2int @@ -1380,7 +1318,7 @@ def ishex(self) -> SeriesOrIndex: 4 True dtype: bool """ - return self._return_or_inplace(str_cast.is_hex(self._column)) + return self._return_or_inplace(self._column.is_hex()) def istimestamp(self, format: str) -> SeriesOrIndex: """ @@ -1404,9 +1342,7 @@ def istimestamp(self, format: str) -> SeriesOrIndex: 3 False dtype: bool """ - return self._return_or_inplace( - str_cast.istimestamp(self._column, format) - ) + return self._return_or_inplace(self._column.is_timestamp(format)) def isfloat(self) -> SeriesOrIndex: r""" @@ -1957,7 +1893,7 @@ def isipv4(self) -> SeriesOrIndex: 3 False dtype: bool """ - return self._return_or_inplace(str_cast.is_ipv4(self._column)) + return self._return_or_inplace(self._column.is_ipv4()) def lower(self) -> SeriesOrIndex: """ @@ -4125,9 +4061,7 @@ def removesuffix(self, suffix: str) -> SeriesOrIndex: ends_column = self.endswith(suffix)._column # type: ignore[union-attr] removed_column = self.slice(0, -len(suffix), None)._column # type: ignore[union-attr] - result = cudf._lib.copying.copy_if_else( - removed_column, self._column, ends_column - ) + result = removed_column.copy_if_else(self._column, ends_column) return self._return_or_inplace(result) def removeprefix(self, prefix: str) -> SeriesOrIndex: @@ -4165,9 +4099,7 @@ def removeprefix(self, prefix: str) -> SeriesOrIndex: return self._return_or_inplace(self._column) starts_column = self.startswith(prefix)._column # type: ignore[union-attr] removed_column = self.slice(len(prefix), None, None)._column # type: ignore[union-attr] - result = cudf._lib.copying.copy_if_else( - removed_column, self._column, starts_column - ) + result = removed_column.copy_if_else(self._column, starts_column) return self._return_or_inplace(result) def _find( @@ -5826,26 +5758,38 @@ def __contains__(self, item: ScalarLike) -> bool: other = [item] if is_scalar(item) else item return self.contains(column.as_column(other, dtype=self.dtype)).any() - def as_numerical_column( - self, dtype: Dtype - ) -> "cudf.core.column.NumericalColumn": + def as_numerical_column(self, dtype: Dtype) -> NumericalColumn: out_dtype = cudf.api.types.dtype(dtype) - string_col = self - if out_dtype.kind in {"i", "u"}: - if not string_col.is_integer().all(): + if out_dtype.kind == "b": + with acquire_spill_lock(): + plc_column = plc.strings.attributes.count_characters( + self.to_pylibcudf(mode="read") + ) + result = Column.from_pylibcudf(plc_column) + return (result > cudf.Scalar(0, dtype="int8")).fillna(False) + elif out_dtype.kind in {"i", "u"}: + if not self.is_integer().all(): raise ValueError( "Could not convert strings to integer " "type due to presence of non-integer values." ) + cast_func = plc.strings.convert.convert_integers.to_integers elif out_dtype.kind == "f": - if not string_col.is_float().all(): + if not self.is_float().all(): raise ValueError( "Could not convert strings to float " "type due to presence of non-floating values." 
) - - result_col = _str_to_numeric_typecast_functions[out_dtype](string_col) - return result_col + cast_func = plc.strings.convert.convert_floats.to_floats + else: + raise ValueError( + f"dtype must be a numerical type, not {out_dtype}" + ) + plc_dtype = dtype_to_pylibcudf_type(out_dtype) + with acquire_spill_lock(): + return type(self).from_pylibcudf( # type: ignore[return-value] + cast_func(self.to_pylibcudf(mode="read"), plc_dtype) + ) def strptime( self, dtype: Dtype, format: str @@ -5880,23 +5824,27 @@ def strptime( raise NotImplementedError( "Cannot parse date-like strings with different formats" ) - valid_ts = str_cast.istimestamp(self, format) + valid_ts = self.is_timestamp(format) valid = valid_ts | is_nat if not valid.all(): raise ValueError(f"Column contains invalid data for {format=}") - casting_func = str_cast.timestamp2int + casting_func = plc.strings.convert.convert_datetime.to_timestamps add_back_nat = is_nat.any() elif dtype.kind == "m": # type: ignore[union-attr] - casting_func = str_cast.timedelta2int + casting_func = plc.strings.convert.convert_durations.to_durations add_back_nat = False - result_col = casting_func(self, dtype, format) + with acquire_spill_lock(): + plc_dtype = dtype_to_pylibcudf_type(dtype) + result_col = type(self).from_pylibcudf( + casting_func(self.to_pylibcudf(mode="read"), plc_dtype, format) + ) if add_back_nat: result_col[is_nat] = None - return result_col + return result_col # type: ignore[return-value] def as_datetime_column( self, dtype: Dtype @@ -6398,15 +6346,15 @@ def detokenize(self, indices: ColumnBase, separator: cudf.Scalar) -> Self: ) ) + @acquire_spill_lock() def _modify_characters( self, method: Callable[[plc.Column], plc.Column] ) -> Self: """ Helper function for methods that modify characters e.g. 
to_lower """ - with acquire_spill_lock(): - plc_column = method(self.to_pylibcudf(mode="read")) - return cast(Self, Column.from_pylibcudf(plc_column)) + plc_column = method(self.to_pylibcudf(mode="read")) + return cast(Self, Column.from_pylibcudf(plc_column)) def to_lower(self) -> Self: return self._modify_characters(plc.strings.case.to_lower) @@ -6435,6 +6383,46 @@ def replace_multiple(self, pattern: Self, replacements: Self) -> Self: ) return cast(Self, Column.from_pylibcudf(plc_result)) + @acquire_spill_lock() + def is_hex(self) -> NumericalColumn: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.strings.convert.convert_integers.is_hex( + self.to_pylibcudf(mode="read"), + ) + ) + + @acquire_spill_lock() + def hex_to_integers(self) -> NumericalColumn: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.strings.convert.convert_integers.hex_to_integers( + self.to_pylibcudf(mode="read"), plc.DataType(plc.TypeId.INT64) + ) + ) + + @acquire_spill_lock() + def is_ipv4(self) -> NumericalColumn: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.strings.convert.convert_ipv4.is_ipv4( + self.to_pylibcudf(mode="read"), + ) + ) + + @acquire_spill_lock() + def ipv4_to_integers(self) -> NumericalColumn: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.strings.convert.convert_ipv4.ipv4_to_integers( + self.to_pylibcudf(mode="read"), + ) + ) + + @acquire_spill_lock() + def is_timestamp(self, format: str) -> NumericalColumn: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.strings.convert.convert_datetime.is_timestamp( + self.to_pylibcudf(mode="read"), format + ) + ) + @acquire_spill_lock() def _split_record_re( self, diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 8b1515acae2..417fa99dac0 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -10,9 +10,10 @@ import pandas as pd import pyarrow as pa +import pylibcudf as plc + import cudf import cudf.core.column.column as column -import cudf.core.column.string as string from cudf.api.types import is_scalar from cudf.core._internals import binaryop, unary from cudf.core.buffer import Buffer, acquire_spill_lock @@ -297,9 +298,12 @@ def strftime(self, format: str) -> cudf.core.column.StringColumn: column.column_empty(0, dtype="object"), ) else: - return string._timedelta_to_str_typecast_functions[self.dtype]( - self, format=format - ) + with acquire_spill_lock(): + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.strings.convert.convert_durations.from_durations( + self.to_pylibcudf(mode="read"), format + ) + ) def as_string_column(self) -> cudf.core.column.StringColumn: return self.strftime("%D days %H:%M:%S") diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 89649a7b750..3334b57ce1b 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1802,13 +1802,37 @@ def _concat( ) for table in tables ] - - concatted = libcudf.utils.data_from_pylibcudf_table( - plc.concatenate.concatenate(plc_tables), - column_names=column_names, - index_names=index_names, - ) - out = cls._from_data(*concatted) + plc_result = plc.concatenate.concatenate(plc_tables) + if ignore: + index = None + data = { + col_name: ColumnBase.from_pylibcudf(col) + for col_name, col in zip( + column_names, plc_result.columns(), strict=True + ) + } + else: + result_columns = [ + 
ColumnBase.from_pylibcudf(col) + for col in plc_result.columns() + ] + index = _index_from_data( + dict( + zip( + index_names, + result_columns[: len(index_names)], + strict=True, + ) + ) + ) + data = dict( + zip( + column_names, + result_columns[len(index_names) :], + strict=True, + ) + ) + out = cls._from_data(data=data, index=index) # If ignore_index is True, all input frames are empty, and at # least one input frame has an index, assign a new RangeIndex @@ -3172,10 +3196,7 @@ def where(self, cond, other=None, inplace=False, axis=None, level=None): ) if cond_col := cond._data.get(name): - result = cudf._lib.copying.copy_if_else( - source_col, other_col, cond_col - ) - + result = source_col.copy_if_else(other_col, cond_col) out.append(result._with_type_metadata(col.dtype)) else: out_mask = as_buffer( @@ -7857,7 +7878,8 @@ def interleave_columns(self): return self._constructor_sliced._from_column(result_col) @acquire_spill_lock() - def _compute_columns(self, expr: str) -> ColumnBase: + def _compute_column(self, expr: str) -> ColumnBase: + """Helper function for eval""" plc_column = plc.transform.compute_column( plc.Table( [col.to_pylibcudf(mode="read") for col in self._columns] @@ -7993,7 +8015,7 @@ def eval(self, expr: str, inplace: bool = False, **kwargs): raise ValueError( "Cannot operate inplace if there is no assignment" ) - return Series._from_column(self._compute_columns(statements[0])) + return Series._from_column(self._compute_column(statements[0])) targets = [] exprs = [] @@ -8011,7 +8033,7 @@ def eval(self, expr: str, inplace: bool = False, **kwargs): ret = self if inplace else self.copy(deep=False) for name, expr in zip(targets, exprs): - ret._data[name] = self._compute_columns(expr) + ret._data[name] = self._compute_column(expr) if not inplace: return ret diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 4f40ba0bd92..9aadbf8f47a 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -22,7 +22,7 @@ from cudf import _lib as libcudf from cudf.api.types import is_dtype_equal, is_scalar from cudf.core._compat import PANDAS_LT_300 -from cudf.core._internals import sorting +from cudf.core._internals import copying, sorting from cudf.core._internals.search import search_sorted from cudf.core.abc import Serializable from cudf.core.buffer import acquire_spill_lock @@ -946,16 +946,17 @@ def from_arrow(cls, data: pa.Table) -> Self: if len(dict_indices): dict_indices_table = pa.table(dict_indices) data = data.drop(dict_indices_table.column_names) - indices_columns = libcudf.interop.from_arrow(dict_indices_table) + plc_indices = plc.interop.from_arrow(dict_indices_table) # as dictionary size can vary, it can't be a single table cudf_dictionaries_columns = { name: ColumnBase.from_arrow(dict_dictionaries[name]) for name in dict_dictionaries.keys() } - for name, codes in zip( - dict_indices_table.column_names, indices_columns + for name, plc_codes in zip( + dict_indices_table.column_names, plc_indices.columns() ): + codes = libcudf.column.Column.from_pylibcudf(plc_codes) categories = cudf_dictionaries_columns[name] codes = as_unsigned_codes(len(categories), codes) cudf_category_frame[name] = CategoricalColumn( @@ -971,9 +972,9 @@ def from_arrow(cls, data: pa.Table) -> Self: # Handle non-dict arrays cudf_non_category_frame = { - name: col - for name, col in zip( - data.column_names, libcudf.interop.from_arrow(data) + name: libcudf.column.Column.from_pylibcudf(plc_col) + for name, plc_col in zip( + data.column_names, 
plc.interop.from_arrow(data).columns() ) } @@ -1032,7 +1033,7 @@ def from_arrow(cls, data: pa.Table) -> Self: return cls._from_data({name: result[name] for name in column_names}) @_performance_tracking - def to_arrow(self): + def to_arrow(self) -> pa.Table: """ Convert to arrow Table @@ -1058,19 +1059,6 @@ def to_arrow(self): } ) - @_performance_tracking - def _positions_from_column_names(self, column_names) -> list[int]: - """Map each column name into their positions in the frame. - - The order of indices returned corresponds to the column order in this - Frame. - """ - return [ - i - for i, name in enumerate(self._column_names) - if name in set(column_names) - ] - @_performance_tracking def _copy_type_metadata(self: Self, other: Self) -> Self: """ @@ -1485,18 +1473,13 @@ def _get_sorted_inds( ) @_performance_tracking - def _split(self, splits): + def _split(self, splits: list[int]) -> list[Self]: """Split a frame with split points in ``splits``. Returns a list of Frames of length `len(splits) + 1`. """ return [ - self._from_columns_like_self( - libcudf.copying.columns_split(list(self._columns), splits)[ - split_idx - ], - self._column_names, - ) - for split_idx in range(len(splits) + 1) + self._from_columns_like_self(split, self._column_names) + for split in copying.columns_split(self._columns, splits) ] @_performance_tracking diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 965ba7face7..e8a9e599cb0 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -16,6 +16,7 @@ import pylibcudf as plc import cudf +import cudf.core._internals from cudf import _lib as libcudf from cudf._lib import groupby as libgroupby from cudf._lib.types import size_type_dtype @@ -430,7 +431,9 @@ def indices(self) -> dict[ScalarLike, cp.ndarray]: ] ) - group_keys = libcudf.stream_compaction.drop_duplicates(group_keys) + group_keys = cudf.core._internals.stream_compaction.drop_duplicates( + group_keys + ) if len(group_keys) > 1: index = cudf.MultiIndex.from_arrays(group_keys) else: diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 8d3ef1036d1..eac5b9d71ae 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -31,6 +31,7 @@ ) from cudf.core._base_index import BaseIndex, _return_get_indexer_result from cudf.core._compat import PANDAS_LT_300 +from cudf.core._internals import copying from cudf.core._internals.search import search_sorted from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ( @@ -1371,7 +1372,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): ) scatter_map = libcudf.column.Column.from_pylibcudf(left_plc) indices = libcudf.column.Column.from_pylibcudf(right_plc) - result = libcudf.copying.scatter([indices], scatter_map, [result])[0] + result = copying.scatter([indices], scatter_map, [result])[0] result_series = cudf.Series._from_column(result) if method in {"ffill", "bfill", "pad", "backfill"}: diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index e58a8bd179e..6854cb02aa5 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -27,6 +27,7 @@ import cudf import cudf._lib as libcudf import cudf.core +import cudf.core._internals import cudf.core.algorithms from cudf.api.extensions import no_default from cudf.api.types import ( @@ -37,6 +38,7 @@ ) from cudf.core._base_index import BaseIndex from 
cudf.core._compat import PANDAS_LT_300 +from cudf.core._internals import copying from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ColumnBase, NumericalColumn, as_column from cudf.core.column_accessor import ColumnAccessor @@ -2950,10 +2952,10 @@ def _gather( if not gather_map.nullify and len(self) != gather_map.nrows: raise IndexError("Gather map is out of bounds") return self._from_columns_like_self( - libcudf.copying.gather( - list(self.index._columns + self._columns) + copying.gather( + itertools.chain(self.index._columns, self._columns) if keep_index - else list(self._columns), + else self._columns, gather_map.column, nullify=gather_map.nullify, ), @@ -3033,16 +3035,24 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self: keep_index=keep_index, ) - columns_to_slice = [ - *( - self.index._columns - if keep_index and not has_range_index - else [] - ), - *self._columns, - ] + columns_to_slice = ( + itertools.chain(self.index._columns, self._columns) + if keep_index and not has_range_index + else self._columns + ) + with acquire_spill_lock(): + plc_tables = plc.copying.slice( + plc.Table( + [col.to_pylibcudf(mode="read") for col in columns_to_slice] + ), + [start, stop], + ) + sliced = [ + libcudf.column.Column.from_pylibcudf(col) + for col in plc_tables[0].columns() + ] result = self._from_columns_like_self( - libcudf.copying.columns_slice(columns_to_slice, [start, stop])[0], + sliced, self._column_names, None if has_range_index or not keep_index else self.index.names, ) @@ -3052,21 +3062,21 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self: return result def _positions_from_column_names( - self, column_names, offset_by_index_columns=False - ): + self, + column_names: set[abc.Hashable], + offset_by_index_columns: bool = True, + ) -> list[int]: """Map each column name into their positions in the frame. Return positions of the provided column names, offset by the number of index columns if `offset_by_index_columns` is True. The order of indices returned corresponds to the column order in this Frame. """ - num_index_columns = ( - len(self.index._data) if offset_by_index_columns else 0 - ) + start = self.index.nlevels if offset_by_index_columns else 0 return [ - i + num_index_columns - for i, name in enumerate(self._column_names) - if name in set(column_names) + i + for i, name in enumerate(self._column_names, start=start) + if name in column_names ] def drop_duplicates( @@ -3103,7 +3113,7 @@ def drop_duplicates( subset, offset_by_index_columns=not ignore_index ) return self._from_columns_like_self( - libcudf.stream_compaction.drop_duplicates( + cudf.core._internals.stream_compaction.drop_duplicates( list(self._columns) if ignore_index else list(self.index._columns + self._columns), @@ -3116,7 +3126,9 @@ def drop_duplicates( ) @_performance_tracking - def duplicated(self, subset=None, keep="first"): + def duplicated( + self, subset=None, keep: Literal["first", "last", False] = "first" + ) -> cudf.Series: """ Return boolean Series denoting duplicate rows. 
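
For reference, the `_slice` hunk above now routes through pylibcudf directly rather than a libcudf wrapper. A minimal sketch of that call pattern (illustrative data and bounds; assumes cudf and pylibcudf are installed, and omits the `acquire_spill_lock()` context the library wraps around such calls):

    import cudf
    import pylibcudf as plc
    from cudf._lib.column import Column

    # Build a one-column pylibcudf table from a cudf column (illustrative data).
    col = cudf.Series(range(10))._column
    plc_tables = plc.copying.slice(
        plc.Table([col.to_pylibcudf(mode="read")]),
        [2, 7],  # paired (begin, end) offsets; one output table per pair
    )
    # Rewrap the sliced pylibcudf columns as cudf columns.
    sliced = [Column.from_pylibcudf(c) for c in plc_tables[0].columns()]
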
@@ -3216,10 +3228,25 @@ def duplicated(self, subset=None, keep="first"): name = self.name else: columns = [self._data[n] for n in subset] - distinct = libcudf.stream_compaction.distinct_indices( - columns, keep=keep - ) - result = libcudf.copying.scatter( + + _keep_options = { + "first": plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST, + "last": plc.stream_compaction.DuplicateKeepOption.KEEP_LAST, + False: plc.stream_compaction.DuplicateKeepOption.KEEP_NONE, + } + + if (keep_option := _keep_options.get(keep)) is None: + raise ValueError('keep must be either "first", "last" or False') + + with acquire_spill_lock(): + plc_column = plc.stream_compaction.distinct_indices( + plc.Table([col.to_pylibcudf(mode="read") for col in columns]), + keep_option, + plc.types.NullEquality.EQUAL, + plc.types.NanEquality.ALL_EQUAL, + ) + distinct = libcudf.column.Column.from_pylibcudf(plc_column) + result = copying.scatter( [cudf.Scalar(False, dtype=bool)], distinct, [as_column(True, length=len(self), dtype=bool)], @@ -3228,14 +3255,26 @@ def duplicated(self, subset=None, keep="first"): return cudf.Series._from_column(result, index=self.index, name=name) @_performance_tracking - def _empty_like(self, keep_index=True) -> Self: + def _empty_like(self, keep_index: bool = True) -> Self: + with acquire_spill_lock(): + plc_table = plc.copying.empty_like( + plc.Table( + [ + col.to_pylibcudf(mode="read") + for col in ( + itertools.chain(self.index._columns, self._columns) + if keep_index + else self._columns + ) + ] + ) + ) + columns = [ + libcudf.column.Column.from_pylibcudf(col) + for col in plc_table.columns() + ] result = self._from_columns_like_self( - libcudf.copying.columns_empty_like( - [ - *(self.index._columns if keep_index else ()), - *self._columns, - ] - ), + columns, self._column_names, self.index.names if keep_index else None, ) @@ -3243,25 +3282,24 @@ def _empty_like(self, keep_index=True) -> Self: result._data.rangeindex = self._data.rangeindex return result - def _split(self, splits, keep_index=True): + def _split(self, splits, keep_index: bool = True) -> list[Self]: if self._num_rows == 0: return [] - columns_split = libcudf.copying.columns_split( - [ - *(self.index._columns if keep_index else []), - *self._columns, - ], + columns_split = copying.columns_split( + itertools.chain(self.index._columns, self._columns) + if keep_index + else self._columns, splits, ) return [ self._from_columns_like_self( - columns_split[i], + split, self._column_names, self.index.names if keep_index else None, ) - for i in range(len(splits) + 1) + for split in columns_split ] @_performance_tracking @@ -4331,12 +4369,10 @@ def _drop_na_rows(self, how="any", subset=None, thresh=None): data_columns = [col.nans_to_nulls() for col in self._columns] return self._from_columns_like_self( - libcudf.stream_compaction.drop_nulls( + cudf.core._internals.stream_compaction.drop_nulls( [*self.index._columns, *data_columns], how=how, - keys=self._positions_from_column_names( - subset, offset_by_index_columns=True - ), + keys=self._positions_from_column_names(subset), thresh=thresh, ), self._column_names, @@ -4356,7 +4392,7 @@ def _apply_boolean_mask(self, boolean_mask: BooleanMask, keep_index=True): f"{len(boolean_mask.column)} not {len(self)}" ) return self._from_columns_like_self( - libcudf.stream_compaction.apply_boolean_mask( + cudf.core._internals.stream_compaction.apply_boolean_mask( list(self.index._columns + self._columns) if keep_index else list(self._columns), @@ -6267,17 +6303,16 @@ def ge(self, other, axis="columns", 
level=None, fill_value=None): other=other, op="__ge__", fill_value=fill_value, can_reindex=True ) - def _preprocess_subset(self, subset): + def _preprocess_subset(self, subset) -> set[abc.Hashable]: if subset is None: subset = self._column_names elif ( - not np.iterable(subset) - or isinstance(subset, str) + is_scalar(subset) or isinstance(subset, tuple) and subset in self._column_names ): subset = (subset,) - diff = set(subset) - set(self._data) + diff = set(subset) - set(self._column_names) if len(diff) != 0: raise KeyError(f"columns {diff} do not exist") return subset diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index e7ea91c1f21..6e965ceca66 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -1,7 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. from __future__ import annotations -import itertools from typing import Any import pylibcudf as plc @@ -243,20 +242,12 @@ def _gather_maps(self, left_cols, right_cols): # To reorder maps so that they are in order of the input # tables, we gather from iota on both right and left, and then # sort the gather maps with those two columns as key. - key_order = list( - itertools.chain.from_iterable( - libcudf.copying.gather( - [ - cudf.core.column.as_column( - range(n), dtype=size_type_dtype - ) - ], - map_, - nullify=null, - ) - for map_, n, null in zip(maps, lengths, nullify) + key_order = [ + cudf.core.column.as_column(range(n), dtype=size_type_dtype).take( + map_, nullify=null, check_bounds=False ) - ) + for map_, n, null in zip(maps, lengths, nullify) + ] return sorting.sort_by_key( list(maps), # If how is right, right map is primary sort key. diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index bc280f3d213..1e613e49ffc 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -22,7 +22,7 @@ from cudf.api.types import is_integer, is_list_like, is_object_dtype, is_scalar from cudf.core import column from cudf.core._base_index import _return_get_indexer_result -from cudf.core._internals import sorting +from cudf.core._internals import copying, sorting from cudf.core.algorithms import factorize from cudf.core.buffer import acquire_spill_lock from cudf.core.column_accessor import ColumnAccessor @@ -200,10 +200,8 @@ def __init__( if lo == -1: # Now we can gather and insert null automatically code[code == -1] = np.iinfo(size_type_dtype).min - result_col = libcudf.copying.gather( - [level._column], code, nullify=True - ) - source_data[i] = result_col[0]._with_type_metadata(level.dtype) + result_col = level._column.take(code, nullify=True) + source_data[i] = result_col._with_type_metadata(level.dtype) super().__init__(ColumnAccessor(source_data)) self._levels = new_levels @@ -1934,7 +1932,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): ) scatter_map = libcudf.column.Column.from_pylibcudf(left_plc) indices = libcudf.column.Column.from_pylibcudf(right_plc) - result = libcudf.copying.scatter([indices], scatter_map, [result])[0] + result = copying.scatter([indices], scatter_map, [result])[0] result_series = cudf.Series._from_column(result) if method in {"ffill", "bfill", "pad", "backfill"}: diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index ffcb5d273a0..0abd42d4d4e 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -1030,7 +1030,8 @@ def as_tuple(x): { name: idx._column for name, idx in zip( - 
names, target._split(range(nrows, new_size, nrows)) + names, + target._split(list(range(nrows, new_size, nrows))), ) } ) diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index f6d0664758f..9c8da020ddc 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -380,7 +380,7 @@ def where(self, cond, other=None, inplace=False): source_col=self._column, other=other, inplace=inplace ) - result = cudf._lib.copying.copy_if_else(input_col, other, cond) + result = input_col.copy_if_else(other, cond) return result._with_type_metadata(self.dtype) @_performance_tracking diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 40348461f8c..6d3dc2dc7d9 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -8,7 +8,6 @@ import pandas as pd import cudf -from cudf import _lib as libcudf from cudf.api.types import _is_non_decimal_numeric_dtype, is_string_dtype from cudf.core._internals import unary from cudf.core.column import as_column @@ -251,9 +250,9 @@ def _convert_str_col( return converted_col.astype(dtype=cudf.dtype("float64")) # type: ignore[return-value] else: if errors == "coerce": - converted_col = libcudf.string_casting.stod(converted_col) non_numerics = is_float.unary_operator("not") converted_col[non_numerics] = None + converted_col = converted_col.astype(np.dtype(np.float64)) # type: ignore[assignment] return converted_col # type: ignore[return-value] else: raise ValueError("Unable to convert some strings to numerics.") diff --git a/python/cudf/cudf/io/avro.py b/python/cudf/cudf/io/avro.py index 11730e98c95..4966cdb86e1 100644 --- a/python/cudf/cudf/io/avro.py +++ b/python/cudf/cudf/io/avro.py @@ -33,11 +33,18 @@ def read_avro( if not isinstance(skip_rows, int) or skip_rows < 0: raise TypeError("skip_rows must be an int >= 0") - plc_result = plc.io.avro.read_avro( - plc.io.types.SourceInfo([filepath_or_buffer]), - columns, - skip_rows, - num_rows, + options = ( + plc.io.avro.AvroReaderOptions.builder( + plc.io.types.SourceInfo([filepath_or_buffer]) + ) + .skip_rows(skip_rows) + .num_rows(num_rows) + .build() ) + if columns is not None and len(columns) > 0: + options.set_columns(columns) + + plc_result = plc.io.avro.read_avro(options) + return cudf.DataFrame._from_data(*data_from_pylibcudf_io(plc_result)) diff --git a/python/cudf/cudf/io/dlpack.py b/python/cudf/cudf/io/dlpack.py index fe8e446f9c0..3b3fd5f7c56 100644 --- a/python/cudf/cudf/io/dlpack.py +++ b/python/cudf/cudf/io/dlpack.py @@ -1,13 +1,14 @@ # Copyright (c) 2019-2024, NVIDIA CORPORATION. +from __future__ import annotations +import pylibcudf as plc import cudf -from cudf._lib import interop as libdlpack from cudf.core.column import ColumnBase from cudf.utils import ioutils -def from_dlpack(pycapsule_obj): +def from_dlpack(pycapsule_obj) -> cudf.Series | cudf.DataFrame: """Converts from a DLPack tensor to a cuDF object. DLPack is an open-source memory tensor structure: @@ -33,18 +34,21 @@ def from_dlpack(pycapsule_obj): cuDF from_dlpack() assumes column-major (Fortran order) input. If the input tensor is row-major, transpose it before passing it to this function. 
""" + plc_table = plc.interop.from_dlpack(pycapsule_obj) + data = dict( + enumerate( + (ColumnBase.from_pylibcudf(col) for col in plc_table.columns()) + ) + ) - columns = libdlpack.from_dlpack(pycapsule_obj) - data = dict(enumerate(columns)) - - if len(columns) == 1: + if len(data) == 1: return cudf.Series._from_data(data) else: return cudf.DataFrame._from_data(data) @ioutils.doc_to_dlpack() -def to_dlpack(cudf_obj): +def to_dlpack(cudf_obj: cudf.Series | cudf.DataFrame | cudf.BaseIndex): """Converts a cuDF object to a DLPack tensor. DLPack is an open-source memory tensor structure: @@ -80,13 +84,14 @@ def to_dlpack(cudf_obj): if any( not cudf.api.types._is_non_decimal_numeric_dtype(dtype) - for _, dtype in gdf._dtypes + for _, dtype in gdf._dtypes # type: ignore[union-attr] ): raise TypeError("non-numeric data not yet supported") dtype = cudf.utils.dtypes.find_common_type( - [dtype for _, dtype in gdf._dtypes] + [dtype for _, dtype in gdf._dtypes] # type: ignore[union-attr] ) gdf = gdf.astype(dtype) - - return libdlpack.to_dlpack([*gdf._columns]) + return plc.interop.to_dlpack( + plc.Table([col.to_pylibcudf(mode="read") for col in gdf._columns]) + ) diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index 89af00c713d..e0c9e535e6f 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -91,11 +91,6 @@ def read_json( if dtype is None: dtype = True - if kwargs: - raise ValueError( - "cudf engine doesn't support the " - f"following keyword arguments: {list(kwargs.keys())}" - ) if args: raise ValueError( "cudf engine doesn't support the " @@ -198,6 +193,7 @@ def read_json( mixed_types_as_string=mixed_types_as_string, prune_columns=prune_columns, recovery_mode=c_on_bad_lines, + extra_parameters=kwargs, ) df = cudf.DataFrame._from_data( @@ -291,21 +287,25 @@ def _plc_write_json( rows_per_chunk: int = 1024 * 64, # 64K rows ) -> None: try: - plc.io.json.write_json( - plc.io.SinkInfo([path_or_buf]), - plc.io.TableWithMetadata( - plc.Table( - [col.to_pylibcudf(mode="read") for col in table._columns] - ), - colnames, + tbl_w_meta = plc.io.TableWithMetadata( + plc.Table( + [col.to_pylibcudf(mode="read") for col in table._columns] ), - na_rep, - include_nulls, - lines, - rows_per_chunk, - true_value="true", - false_value="false", + colnames, + ) + options = ( + plc.io.json.JsonWriterOptions.builder( + plc.io.SinkInfo([path_or_buf]), tbl_w_meta.tbl + ) + .metadata(tbl_w_meta) + .na_rep(na_rep) + .include_nulls(include_nulls) + .lines(lines) + .build() ) + if rows_per_chunk != np.iinfo(np.int32).max: + options.set_rows_per_chunk(rows_per_chunk) + plc.io.json.write_json(options) except OverflowError as err: raise OverflowError( f"Writing JSON file with rows_per_chunk={rows_per_chunk} failed. 
" diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 153ee0fa01a..c13489630a3 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -25,9 +25,7 @@ from cudf._lib.column import Column from cudf._lib.utils import ( _data_from_columns, - _index_level_name, data_from_pylibcudf_io, - generate_pandas_metadata, ) from cudf.api.types import is_list_like from cudf.core.buffer import acquire_spill_lock @@ -128,7 +126,7 @@ def _plc_write_parquet( tbl_meta = plc.io.types.TableInputMetadata(plc_table) for level, idx_name in enumerate(table.index.names): tbl_meta.column_metadata[level].set_name( - _index_level_name(idx_name, level, table._column_names) + ioutils._index_level_name(idx_name, level, table._column_names) ) num_index_cols_meta = len(table.index.names) else: @@ -162,7 +160,7 @@ def _plc_write_parquet( if partitions_info is not None: user_data = [ { - "pandas": generate_pandas_metadata( + "pandas": ioutils.generate_pandas_metadata( table.iloc[start_row : start_row + num_row].copy( deep=False ), @@ -172,7 +170,9 @@ def _plc_write_parquet( for start_row, num_row in partitions_info ] else: - user_data = [{"pandas": generate_pandas_metadata(table, index)}] + user_data = [ + {"pandas": ioutils.generate_pandas_metadata(table, index)} + ] if header_version not in ("1.0", "2.0"): raise ValueError( @@ -1737,7 +1737,7 @@ def _initialize_chunked_state( False if isinstance(table.index, cudf.RangeIndex) else self.index ) user_data = [ - {"pandas": generate_pandas_metadata(table, index)} + {"pandas": ioutils.generate_pandas_metadata(table, index)} ] * num_partitions comp_type = _get_comp_type(self.compression) stat_freq = _get_stat_freq(self.statistics) diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index db24fdd2a29..8e1dba858c3 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -950,3 +950,13 @@ def test_index_set_categories(ordered): expected = pd_ci.set_categories([1, 2, 3, 4], ordered=ordered) result = cudf_ci.set_categories([1, 2, 3, 4], ordered=ordered) assert_eq(result, expected) + + +def test_categorical_interval_pandas_roundtrip(): + expected = cudf.Series(cudf.interval_range(0, 5)).astype("category") + result = cudf.Series.from_pandas(expected.to_pandas()) + assert_eq(result, expected) + + expected = pd.Series(pd.interval_range(0, 5)).astype("category") + result = cudf.Series.from_pandas(expected).to_pandas() + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 260b481b933..da0aa5be6f5 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -10,7 +10,6 @@ import cudf from cudf import NA -from cudf._lib.copying import get_element from cudf.api.types import is_scalar from cudf.core.column.column import column_empty from cudf.testing import assert_eq @@ -715,9 +714,8 @@ def test_list_scalar_host_construction_null(elem_type, nesting_level): ], ) def test_list_scalar_device_construction(data): - col = cudf.Series([data])._column - slr = get_element(col, 0) - assert slr.value == data + res = cudf.Series([data])._column.element_indexing(0) + assert res == data @pytest.mark.parametrize("nesting_level", [1, 2, 3]) @@ -729,10 +727,8 @@ def test_list_scalar_device_construction_null(nesting_level): arrow_type = pa.infer_type(data) arrow_arr = pa.array([None], type=arrow_type) - col = cudf.Series(arrow_arr)._column - slr = 
get_element(col, 0) - - assert slr.value is cudf.NA + res = cudf.Series(arrow_arr)._column.element_indexing(0) + assert res is cudf.NA @pytest.mark.parametrize("input_obj", [[[1, NA, 3]], [[1, NA, 3], [4, 5, NA]]]) diff --git a/python/cudf/cudf/tests/test_pack.py b/python/cudf/cudf/tests/test_pack.py deleted file mode 100644 index b474bbe9bd8..00000000000 --- a/python/cudf/cudf/tests/test_pack.py +++ /dev/null @@ -1,317 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pickle -import sys - -import numpy as np -import pandas as pd - -from cudf import DataFrame, Index, Series -from cudf._lib.copying import pack, unpack -from cudf.testing import assert_eq - - -def test_sizeof_packed_dataframe(): - rng = np.random.default_rng(seed=0) - df = DataFrame() - nelem = 1000 - df["keys"] = hkeys = np.arange(nelem, dtype=np.float64) - df["vals"] = hvals = rng.random(nelem) - packed = pack(df) - - nbytes = hkeys.nbytes + hvals.nbytes - sizeof = sys.getsizeof(packed) - assert sizeof < nbytes - - serialized_nbytes = len( - pickle.dumps(packed, protocol=pickle.HIGHEST_PROTOCOL) - ) - - # assert at least sizeof bytes were serialized - assert serialized_nbytes >= sizeof - - -def check_packed_equality(df): - # basic - assert_packed_frame_equality(df) - # sliced - assert_packed_frame_equality(df[:-1]) - assert_packed_frame_equality(df[1:]) - assert_packed_frame_equality(df[2:-2]) - # sorted - sortvaldf = df.sort_values("vals") - assert isinstance(sortvaldf.index, Index) - assert_packed_frame_equality(sortvaldf) - - -def assert_packed_frame_equality(df): - pdf = df.to_pandas() - - packed = pack(df) - del df - unpacked = unpack(packed) - - assert_eq(unpacked, pdf) - - -def test_packed_dataframe_equality_numeric(): - rng = np.random.default_rng(seed=0) - - df = DataFrame() - nelem = 10 - df["keys"] = np.arange(nelem, dtype=np.float64) - df["vals"] = rng.random(nelem) - - check_packed_equality(df) - - -def test_packed_dataframe_equality_categorical(): - rng = np.random.default_rng(seed=0) - - df = DataFrame() - df["keys"] = pd.Categorical( - ["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"] - ) - df["vals"] = rng.random(len(df)) - - check_packed_equality(df) - - -def test_packed_dataframe_equality_list(): - rng = np.random.default_rng(seed=0) - - df = DataFrame() - df["keys"] = Series(list([i, i + 1, i + 2] for i in range(10))) - df["vals"] = rng.random(len(df)) - - check_packed_equality(df) - - -def test_packed_dataframe_equality_struct(): - rng = np.random.default_rng(seed=0) - - df = DataFrame() - df["keys"] = Series( - list({"0": i, "1": i + 1, "2": i + 2} for i in range(10)) - ) - df["vals"] = rng.random(len(df)) - - check_packed_equality(df) - - -def check_packed_unique_pointers(df): - # basic - assert_packed_frame_unique_pointers(df) - # sliced - assert_packed_frame_unique_pointers(df[:-1]) - assert_packed_frame_unique_pointers(df[1:]) - assert_packed_frame_unique_pointers(df[2:-2]) - # sorted - sortvaldf = df.sort_values("vals") - assert 
isinstance(sortvaldf.index, Index) - assert_packed_frame_unique_pointers(sortvaldf) - - -def assert_packed_frame_unique_pointers(df): - unpacked = unpack(pack(df)) - - for col in df: - if df._data[col].data: - assert df._data[col].data.get_ptr(mode="read") != unpacked._data[ - col - ].data.get_ptr(mode="read") - - -def test_packed_dataframe_unique_pointers_numeric(): - rng = np.random.default_rng(seed=0) - - df = DataFrame() - nelem = 10 - df["keys"] = np.arange(nelem, dtype=np.float64) - df["vals"] = rng.random(nelem) - - check_packed_unique_pointers(df) - - -def test_packed_dataframe_unique_pointers_categorical(): - rng = np.random.default_rng(seed=0) - - df = DataFrame() - df["keys"] = pd.Categorical( - ["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"] - ) - df["vals"] = rng.random(len(df)) - - check_packed_unique_pointers(df) - - -def test_packed_dataframe_unique_pointers_list(): - rng = np.random.default_rng(seed=0) - - df = DataFrame() - df["keys"] = Series(list([i, i + 1, i + 2] for i in range(10))) - df["vals"] = rng.random(len(df)) - - check_packed_unique_pointers(df) - - -def test_packed_dataframe_unique_pointers_struct(): - rng = np.random.default_rng(seed=0) - - df = DataFrame() - df["keys"] = Series( - list({"0": i, "1": i + 1, "2": i + 2} for i in range(10)) - ) - df["vals"] = rng.random(len(df)) - - check_packed_unique_pointers(df) - - -def check_packed_pickled_equality(df): - # basic - assert_packed_frame_picklable(df) - # sliced - assert_packed_frame_picklable(df[:-1]) - assert_packed_frame_picklable(df[1:]) - assert_packed_frame_picklable(df[2:-2]) - # sorted - sortvaldf = df.sort_values("vals") - assert isinstance(sortvaldf.index, Index) - assert_packed_frame_picklable(sortvaldf) - # out-of-band - buffers = [] - serialbytes = pickle.dumps( - pack(df), protocol=5, buffer_callback=buffers.append - ) - for b in buffers: - assert isinstance(b, pickle.PickleBuffer) - loaded = unpack(pickle.loads(serialbytes, buffers=buffers)) - assert_eq(loaded, df) - - -def assert_packed_frame_picklable(df): - serialbytes = pickle.dumps(pack(df)) - loaded = unpack(pickle.loads(serialbytes)) - assert_eq(loaded, df) - - -def test_pickle_packed_dataframe_numeric(): - rng = np.random.default_rng(seed=0) - - df = DataFrame() - nelem = 10 - df["keys"] = np.arange(nelem, dtype=np.float64) - df["vals"] = rng.random(nelem) - - check_packed_pickled_equality(df) - - -def test_pickle_packed_dataframe_categorical(): - rng = np.random.default_rng(seed=0) - - df = DataFrame() - df["keys"] = pd.Categorical( - ["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"] - ) - df["vals"] = rng.random(len(df)) - - check_packed_pickled_equality(df) - - -def test_pickle_packed_dataframe_list(): - rng = np.random.default_rng(seed=0) - - df = DataFrame() - df["keys"] = Series(list([i, i + 1, i + 2] for i in range(10))) - df["vals"] = rng.random(len(df)) - - check_packed_pickled_equality(df) - - -def test_pickle_packed_dataframe_struct(): - rng = np.random.default_rng(seed=0) - - df = DataFrame() - df["keys"] = Series( - list({"0": i, "1": i + 1, "2": i + 2} for i in range(10)) - ) - df["vals"] = rng.random(len(df)) - - check_packed_pickled_equality(df) - - -def check_packed_serialized_equality(df): - # basic - assert_packed_frame_serializable(df) - # sliced - assert_packed_frame_serializable(df[:-1]) - assert_packed_frame_serializable(df[1:]) - assert_packed_frame_serializable(df[2:-2]) - # sorted - sortvaldf = df.sort_values("vals") - assert isinstance(sortvaldf.index, Index) - assert_packed_frame_serializable(sortvaldf) 
- - -def assert_packed_frame_serializable(df): - packed = pack(df) - header, frames = packed.serialize() - loaded = unpack(packed.deserialize(header, frames)) - assert_eq(loaded, df) - - -def test_serialize_packed_dataframe_numeric(): - rng = np.random.default_rng(seed=0) - - df = DataFrame() - nelem = 10 - df["keys"] = np.arange(nelem, dtype=np.float64) - df["vals"] = rng.random(nelem) - - check_packed_serialized_equality(df) - - -def test_serialize_packed_dataframe_categorical(): - rng = np.random.default_rng(seed=0) - - df = DataFrame() - df["keys"] = pd.Categorical( - ["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"] - ) - df["vals"] = rng.random(len(df)) - - check_packed_serialized_equality(df) - - -def test_serialize_packed_dataframe_list(): - rng = np.random.default_rng(seed=0) - - df = DataFrame() - df["keys"] = Series(list([i, i + 1, i + 2] for i in range(10))) - df["vals"] = rng.random(len(df)) - - check_packed_serialized_equality(df) - - -def test_serialize_packed_dataframe_struct(): - rng = np.random.default_rng(seed=0) - - df = DataFrame() - df["keys"] = Series( - list({"0": i, "1": i + 1, "2": i + 2} for i in range(10)) - ) - df["vals"] = rng.random(len(df)) - - check_packed_serialized_equality(df) diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py index fcd98831686..c14fab4040b 100644 --- a/python/cudf/cudf/tests/test_scalar.py +++ b/python/cudf/cudf/tests/test_scalar.py @@ -10,10 +10,11 @@ import pytest from packaging import version +import pylibcudf as plc import rmm import cudf -from cudf._lib.copying import get_element +from cudf.core.buffer import acquire_spill_lock from cudf.testing._utils import ( ALL_TYPES, DATETIME_TYPES, @@ -143,8 +144,14 @@ def test_scalar_host_initialization(value): @pytest.mark.parametrize("value", SCALAR_VALUES) def test_scalar_device_initialization(value): column = cudf.Series([value], nan_as_null=False)._column - dev_slr = get_element(column, 0) - + with acquire_spill_lock(): + dev_slr = cudf._lib.scalar.DeviceScalar.from_pylibcudf( + plc.copying.get_element( + column.to_pylibcudf(mode="read"), + 0, + ), + dtype=column.dtype, + ) s = cudf.Scalar.from_device_scalar(dev_slr) assert s._is_device_value_current @@ -164,8 +171,14 @@ def test_scalar_device_initialization(value): def test_scalar_device_initialization_decimal(value, decimal_type): dtype = decimal_type._from_decimal(value) column = cudf.Series([str(value)]).astype(dtype)._column - dev_slr = get_element(column, 0) - + with acquire_spill_lock(): + dev_slr = cudf._lib.scalar.DeviceScalar.from_pylibcudf( + plc.copying.get_element( + column.to_pylibcudf(mode="read"), + 0, + ), + dtype=column.dtype, + ) s = cudf.Scalar.from_device_scalar(dev_slr) assert s._is_device_value_current diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index 962a229a839..fceaaf185e8 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -714,21 +714,35 @@ def read_csv( storage_options=None, **kwargs, ): - import dask_expr as dx - from fsspec.utils import stringify_path + try: + # TODO: Remove when cudf is pinned to dask>2024.12.0 + import dask_expr as dx + from dask_expr.io.csv import ReadCSV + from fsspec.utils import stringify_path + + if not isinstance(path, str): + path = stringify_path(path) + return dx.new_collection( + ReadCSV( + path, + dtype_backend=dtype_backend, + storage_options=storage_options, + kwargs=kwargs, + header=header, + dataframe_backend="cudf", + ) + ) + except 
ImportError: + # Requires dask>2024.12.0 + from dask_cudf.io.csv import read_csv - if not isinstance(path, str): - path = stringify_path(path) - return dx.new_collection( - dx.io.csv.ReadCSV( + return read_csv( path, - dtype_backend=dtype_backend, - storage_options=storage_options, - kwargs=kwargs, + *args, header=header, - dataframe_backend="cudf", + storage_options=storage_options, + **kwargs, ) - ) @staticmethod def read_json(*args, **kwargs): diff --git a/python/dask_cudf/dask_cudf/io/csv.py b/python/dask_cudf/dask_cudf/io/csv.py index b22b31a591f..29f98b14511 100644 --- a/python/dask_cudf/dask_cudf/io/csv.py +++ b/python/dask_cudf/dask_cudf/io/csv.py @@ -1,8 +1,193 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from dask_cudf import _deprecated_api +import os +from glob import glob +from warnings import warn -read_csv = _deprecated_api( - "dask_cudf.io.csv.read_csv", - new_api="dask_cudf.read_csv", -) +from fsspec.utils import infer_compression + +from dask import dataframe as dd +from dask.dataframe.io.csv import make_reader +from dask.utils import parse_bytes + +import cudf + + +def read_csv(path, blocksize="default", **kwargs): + """ + Read CSV files into a :class:`.DataFrame`. + + This API parallelizes the :func:`cudf:cudf.read_csv` function in + the following ways: + + It supports loading many files at once using globstrings: + + >>> import dask_cudf + >>> df = dask_cudf.read_csv("myfiles.*.csv") + + In some cases it can break up large files: + + >>> df = dask_cudf.read_csv("largefile.csv", blocksize="256 MiB") + + It can read CSV files from external resources (e.g. S3, HTTP, FTP) + + >>> df = dask_cudf.read_csv("s3://bucket/myfiles.*.csv") + >>> df = dask_cudf.read_csv("https://www.mycloud.com/sample.csv") + + Internally ``read_csv`` uses :func:`cudf:cudf.read_csv` and + supports many of the same keyword arguments with the same + performance guarantees. See the docstring for + :func:`cudf:cudf.read_csv` for more information on available + keyword arguments. + + Parameters + ---------- + path : str, path object, or file-like object + Either a path to a file (a str, :py:class:`pathlib.Path`, or + py._path.local.LocalPath), URL (including http, ftp, and S3 + locations), or any object with a read() method (such as + builtin :py:func:`open` file handler function or + :py:class:`~io.StringIO`). + blocksize : int or str, default "256 MiB" + The target task partition size. If ``None``, a single block + is used for each file. + **kwargs : dict + Passthrough key-word arguments that are sent to + :func:`cudf:cudf.read_csv`. + + Notes + ----- + If any of `skipfooter`/`skiprows`/`nrows` are passed, + `blocksize` will default to None. + + Examples + -------- + >>> import dask_cudf + >>> ddf = dask_cudf.read_csv("sample.csv", usecols=["a", "b"]) + >>> ddf.compute() + a b + 0 1 hi + 1 2 hello + 2 3 ai + + """ + # Set default `blocksize` + if blocksize == "default": + if ( + kwargs.get("skipfooter", 0) != 0 + or kwargs.get("skiprows", 0) != 0 + or kwargs.get("nrows", None) is not None + ): + # Cannot read in blocks if skipfooter, + # skiprows or nrows is passed. 
+ blocksize = None + else: + blocksize = "256 MiB" + + if "://" in str(path): + func = make_reader(cudf.read_csv, "read_csv", "CSV") + return func(path, blocksize=blocksize, **kwargs) + else: + return _internal_read_csv(path=path, blocksize=blocksize, **kwargs) + + +def _internal_read_csv(path, blocksize="256 MiB", **kwargs): + if isinstance(blocksize, str): + blocksize = parse_bytes(blocksize) + + if isinstance(path, list): + filenames = path + elif isinstance(path, str): + filenames = sorted(glob(path)) + elif hasattr(path, "__fspath__"): + filenames = sorted(glob(path.__fspath__())) + else: + raise TypeError(f"Path type not understood:{type(path)}") + + if not filenames: + msg = f"A file in: {filenames} does not exist." + raise FileNotFoundError(msg) + + compression = kwargs.get("compression", "infer") + + if compression == "infer": + # Infer compression from first path by default + compression = infer_compression(filenames[0]) + + if compression and blocksize: + # compressed CSVs reading must read the entire file + kwargs.pop("byte_range", None) + warn( + "Warning %s compression does not support breaking apart files\n" + "Please ensure that each individual file can fit in memory and\n" + "use the keyword ``blocksize=None to remove this message``\n" + "Setting ``blocksize=(size of file)``" % compression + ) + blocksize = None + + if blocksize is None: + return read_csv_without_blocksize(path, **kwargs) + + # Let dask.dataframe generate meta + dask_reader = make_reader(cudf.read_csv, "read_csv", "CSV") + kwargs1 = kwargs.copy() + usecols = kwargs1.pop("usecols", None) + dtype = kwargs1.pop("dtype", None) + meta = dask_reader(filenames[0], **kwargs1)._meta + names = meta.columns + if usecols or dtype: + # Regenerate meta with original kwargs if + # `usecols` or `dtype` was specified + meta = dask_reader(filenames[0], **kwargs)._meta + + i = 0 + path_list = [] + kwargs_list = [] + for fn in filenames: + size = os.path.getsize(fn) + for start in range(0, size, blocksize): + kwargs2 = kwargs.copy() + kwargs2["byte_range"] = ( + start, + blocksize, + ) # specify which chunk of the file we care about + if start != 0: + kwargs2["names"] = names # no header in the middle of the file + kwargs2["header"] = None + path_list.append(fn) + kwargs_list.append(kwargs2) + i += 1 + + return dd.from_map(_read_csv, path_list, kwargs_list, meta=meta) + + +def _read_csv(fn, kwargs): + return cudf.read_csv(fn, **kwargs) + + +def read_csv_without_blocksize(path, **kwargs): + """Read entire CSV with optional compression (gzip/zip) + + Parameters + ---------- + path : str + path to files (support for glob) + """ + if isinstance(path, list): + filenames = path + elif isinstance(path, str): + filenames = sorted(glob(path)) + elif hasattr(path, "__fspath__"): + filenames = sorted(glob(path.__fspath__())) + else: + raise TypeError(f"Path type not understood:{type(path)}") + + meta_kwargs = kwargs.copy() + if "skipfooter" in meta_kwargs: + meta_kwargs.pop("skipfooter") + if "nrows" in meta_kwargs: + meta_kwargs.pop("nrows") + # Read "head" of first file (first 5 rows). + # Convert to empty df for metadata. 
+    meta = cudf.read_csv(filenames[0], nrows=5, **meta_kwargs).iloc[:0]
+    return dd.from_map(cudf.read_csv, filenames, meta=meta, **kwargs)
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_csv.py b/python/dask_cudf/dask_cudf/io/tests/test_csv.py
index a0acb86f5a9..ddfd1c1adac 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_csv.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_csv.py
@@ -185,11 +185,6 @@ def test_read_csv_blocksize_none(tmp_path, compression, size):
     df2 = dask_cudf.read_csv(path, blocksize=None, dtype=typ)
     dd.assert_eq(df, df2)

-    # Test chunksize deprecation
-    with pytest.warns(FutureWarning, match="deprecated"):
-        df3 = dask_cudf.read_csv(path, chunksize=None, dtype=typ)
-        dd.assert_eq(df, df3)
-

 @pytest.mark.parametrize("dtype", [{"b": str, "c": int}, None])
 def test_csv_reader_usecols(tmp_path, dtype):
@@ -275,7 +270,3 @@ def test_deprecated_api_paths(tmp_path):
     with pytest.warns(match="dask_cudf.io.read_csv is now deprecated"):
         df2 = dask_cudf.io.read_csv(csv_path)
         dd.assert_eq(df, df2, check_divisions=False)
-
-    with pytest.warns(match="dask_cudf.io.csv.read_csv is now deprecated"):
-        df2 = dask_cudf.io.csv.read_csv(csv_path)
-        dd.assert_eq(df, df2, check_divisions=False)
diff --git a/python/pylibcudf/pylibcudf/io/avro.pxd b/python/pylibcudf/pylibcudf/io/avro.pxd
index 8696fcb3c15..a0fca95d459 100644
--- a/python/pylibcudf/pylibcudf/io/avro.pxd
+++ b/python/pylibcudf/pylibcudf/io/avro.pxd
@@ -1,12 +1,23 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 from pylibcudf.io.types cimport SourceInfo, TableWithMetadata
-from pylibcudf.libcudf.io.avro cimport avro_reader_options
+from pylibcudf.libcudf.io.avro cimport avro_reader_options, avro_reader_options_builder
 from pylibcudf.libcudf.types cimport size_type

-cpdef TableWithMetadata read_avro(
-    SourceInfo source_info,
-    list columns = *,
-    size_type skip_rows = *,
-    size_type num_rows = *
-)
+
+cdef class AvroReaderOptions:
+    cdef avro_reader_options c_obj
+    cdef SourceInfo source
+    cpdef void set_columns(self, list col_names)
+
+
+cdef class AvroReaderOptionsBuilder:
+    cdef avro_reader_options_builder c_obj
+    cdef SourceInfo source
+    cpdef AvroReaderOptionsBuilder columns(self, list col_names)
+    cpdef AvroReaderOptionsBuilder skip_rows(self, size_type skip_rows)
+    cpdef AvroReaderOptionsBuilder num_rows(self, size_type num_rows)
+    cpdef AvroReaderOptions build(self)
+
+cpdef TableWithMetadata read_avro(AvroReaderOptions options)
diff --git a/python/pylibcudf/pylibcudf/io/avro.pyi b/python/pylibcudf/pylibcudf/io/avro.pyi
index 49c2f083702..8cafc9a6573 100644
--- a/python/pylibcudf/pylibcudf/io/avro.pyi
+++ b/python/pylibcudf/pylibcudf/io/avro.pyi
@@ -1,11 +1,16 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 from pylibcudf.io.types import SourceInfo, TableWithMetadata

-__all__ = ["read_avro"]
-
-def read_avro(
-    source_info: SourceInfo,
-    columns: list[str] | None = None,
-    skip_rows: int = 0,
-    num_rows: int = -1,
-) -> TableWithMetadata: ...
+__all__ = ["AvroReaderOptions", "AvroReaderOptionsBuilder", "read_avro"]
+
+class AvroReaderOptions:
+    @staticmethod
+    def builder(source: SourceInfo) -> AvroReaderOptionsBuilder: ...
+
+class AvroReaderOptionsBuilder:
+    def columns(self, col_names: list[str]) -> AvroReaderOptionsBuilder: ...
+    def skip_rows(self, skip_rows: int) -> AvroReaderOptionsBuilder: ...
+    def num_rows(self, num_rows: int) -> AvroReaderOptionsBuilder: ...
+    def build(self) -> AvroReaderOptions: ...
+
+def read_avro(options: AvroReaderOptions) -> TableWithMetadata: ...
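For reviewers: a minimal sketch of how the new builder-style Avro read path composes, mirroring the updated test in test_avro.py below. The `avro_bytes` buffer and column names are illustrative placeholders, not part of this change:

    import io
    import pylibcudf as plc

    # Any SourceInfo-compatible source works; `avro_bytes` is a placeholder.
    buffer = io.BytesIO(avro_bytes)
    options = (
        plc.io.avro.AvroReaderOptions.builder(plc.io.types.SourceInfo([buffer]))
        .columns(["col1", "col2"])  # optional: read a subset of columns
        .skip_rows(0)
        .num_rows(-1)  # -1 reads all remaining rows
        .build()
    )
    tbl_w_meta = plc.io.avro.read_avro(options)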
diff --git a/python/pylibcudf/pylibcudf/io/avro.pyx b/python/pylibcudf/pylibcudf/io/avro.pyx
index 4271333511a..c378fca0415 100644
--- a/python/pylibcudf/pylibcudf/io/avro.pyx
+++ b/python/pylibcudf/pylibcudf/io/avro.pyx
@@ -10,52 +10,138 @@ from pylibcudf.libcudf.io.avro cimport (
 )
 from pylibcudf.libcudf.types cimport size_type

-__all__ = ["read_avro"]
+__all__ = ["read_avro", "AvroReaderOptions", "AvroReaderOptionsBuilder"]
+
+
+cdef class AvroReaderOptions:
+    """
+    The settings to use for ``read_avro``.
+
+    For details, see :cpp:class:`cudf::io::avro_reader_options`.
+    """
+    @staticmethod
+    def builder(SourceInfo source):
+        """
+        Create an AvroReaderOptionsBuilder object.
+
+        For details, see :cpp:func:`cudf::io::avro_reader_options::builder`
+
+        Parameters
+        ----------
+        source : SourceInfo
+            The source to read the Avro file from.
+
+        Returns
+        -------
+        AvroReaderOptionsBuilder
+            Builder to build AvroReaderOptions
+        """
+        cdef AvroReaderOptionsBuilder avro_builder = AvroReaderOptionsBuilder.__new__(
+            AvroReaderOptionsBuilder
+        )
+        avro_builder.c_obj = avro_reader_options.builder(source.c_obj)
+        avro_builder.source = source
+        return avro_builder
+
+    cpdef void set_columns(self, list col_names):
+        """
+        Set the names of the columns to be read.
+
+        Parameters
+        ----------
+        col_names : list[str]
+            List of column names
+
+        Returns
+        -------
+        None
+        """
+        cdef vector[string] vec
+        vec.reserve(len(col_names))
+        for name in col_names:
+            vec.push_back(str(name).encode())
+        self.c_obj.set_columns(vec)
+
+
+cdef class AvroReaderOptionsBuilder:
+    cpdef AvroReaderOptionsBuilder columns(self, list col_names):
+        """
+        Set the names of the columns to be read.
+
+        Parameters
+        ----------
+        col_names : list
+            List of column names
+
+        Returns
+        -------
+        AvroReaderOptionsBuilder
+        """
+        cdef vector[string] vec
+        vec.reserve(len(col_names))
+        for name in col_names:
+            vec.push_back(str(name).encode())
+        self.c_obj.columns(vec)
+        return self
+
+    cpdef AvroReaderOptionsBuilder skip_rows(self, size_type skip_rows):
+        """
+        Sets the number of rows to skip.
+
+        Parameters
+        ----------
+        skip_rows : size_type
+            Number of rows to skip from the start
+
+        Returns
+        -------
+        AvroReaderOptionsBuilder
+        """
+        self.c_obj.skip_rows(skip_rows)
+        return self
+
+    cpdef AvroReaderOptionsBuilder num_rows(self, size_type num_rows):
+        """
+        Sets the number of rows to read.
+
+        Parameters
+        ----------
+        num_rows : size_type
+            Number of rows to read after skipping
+
+        Returns
+        -------
+        AvroReaderOptionsBuilder
+        """
+        self.c_obj.num_rows(num_rows)
+        return self
+
+    cpdef AvroReaderOptions build(self):
+        """Create an AvroReaderOptions object"""
+        cdef AvroReaderOptions avro_options = AvroReaderOptions.__new__(
+            AvroReaderOptions
+        )
+        avro_options.c_obj = move(self.c_obj.build())
+        avro_options.source = self.source
+        return avro_options


 cpdef TableWithMetadata read_avro(
-    SourceInfo source_info,
-    list columns = None,
-    size_type skip_rows = 0,
-    size_type num_rows = -1
+    AvroReaderOptions options
 ):
     """
-    Reads an Avro dataset into a :py:class:`~.types.TableWithMetadata`.
+    Read from Avro format.
+
+    The source to read from and options are encapsulated
+    by the `options` object. For details, see :cpp:func:`read_avro`.

     Parameters
     ----------
-    source_info: SourceInfo
-        The SourceInfo object to read the avro dataset from.
-    columns: list, default None
-        Optional columns to read, if not provided, reads all columns in the file.
-    skip_rows: size_type, default 0
-        The number of rows to skip.
- num_rows: size_type, default -1 - The number of rows to read, after skipping rows. - If -1 is passed, all rows will be read. - - Returns - ------- - TableWithMetadata - The Table and its corresponding metadata (column names) that were read in. + options: AvroReaderOptions + Settings for controlling reading behavior """ - cdef vector[string] c_columns - if columns is not None and len(columns) > 0: - c_columns.reserve(len(columns)) - for col in columns: - c_columns.push_back(str(col).encode()) - - cdef avro_reader_options avro_opts = ( - avro_reader_options.builder(source_info.c_obj) - .columns(c_columns) - .skip_rows(skip_rows) - .num_rows(num_rows) - .build() - ) - with nogil: - c_result = move(cpp_read_avro(avro_opts)) + c_result = move(cpp_read_avro(options.c_obj)) return TableWithMetadata.from_libcudf(c_result) diff --git a/python/pylibcudf/pylibcudf/io/json.pxd b/python/pylibcudf/pylibcudf/io/json.pxd index f65c1034598..4894ca3bd6e 100644 --- a/python/pylibcudf/pylibcudf/io/json.pxd +++ b/python/pylibcudf/pylibcudf/io/json.pxd @@ -6,8 +6,13 @@ from pylibcudf.io.types cimport ( TableWithMetadata, compression_type, ) -from pylibcudf.libcudf.io.json cimport json_recovery_mode_t +from pylibcudf.libcudf.io.json cimport ( + json_recovery_mode_t, + json_writer_options, + json_writer_options_builder, +) from pylibcudf.libcudf.types cimport size_type +from pylibcudf.table cimport Table cpdef TableWithMetadata read_json( @@ -21,19 +26,28 @@ cpdef TableWithMetadata read_json( bool mixed_types_as_string = *, bool prune_columns = *, json_recovery_mode_t recovery_mode = *, + dict extra_parameters = *, ) +cdef class JsonWriterOptions: + cdef json_writer_options c_obj + cdef SinkInfo sink + cdef Table table + cpdef void set_rows_per_chunk(self, size_type val) + cpdef void set_true_value(self, str val) + cpdef void set_false_value(self, str val) -cpdef void write_json( - SinkInfo sink_info, - TableWithMetadata tbl, - str na_rep = *, - bool include_nulls = *, - bool lines = *, - size_type rows_per_chunk = *, - str true_value = *, - str false_value = * -) +cdef class JsonWriterOptionsBuilder: + cdef json_writer_options_builder c_obj + cdef SinkInfo sink + cdef Table table + cpdef JsonWriterOptionsBuilder metadata(self, TableWithMetadata tbl_w_meta) + cpdef JsonWriterOptionsBuilder na_rep(self, str val) + cpdef JsonWriterOptionsBuilder include_nulls(self, bool val) + cpdef JsonWriterOptionsBuilder lines(self, bool val) + cpdef JsonWriterOptions build(self) + +cpdef void write_json(JsonWriterOptions options) cpdef tuple chunked_read_json( SourceInfo source_info, diff --git a/python/pylibcudf/pylibcudf/io/json.pyi b/python/pylibcudf/pylibcudf/io/json.pyi index b2bc6a43700..e0489742cd0 100644 --- a/python/pylibcudf/pylibcudf/io/json.pyi +++ b/python/pylibcudf/pylibcudf/io/json.pyi @@ -2,6 +2,8 @@ from collections.abc import Mapping from typing import TypeAlias +from typing_extensions import Self + from pylibcudf.column import Column from pylibcudf.io.types import ( CompressionType, @@ -10,6 +12,7 @@ from pylibcudf.io.types import ( SourceInfo, TableWithMetadata, ) +from pylibcudf.table import Table from pylibcudf.types import DataType ChildNameToTypeMap: TypeAlias = Mapping[str, ChildNameToTypeMap] @@ -28,16 +31,22 @@ def read_json( prune_columns: bool = False, recovery_mode: JSONRecoveryMode = JSONRecoveryMode.FAIL, ) -> TableWithMetadata: ... 
-def write_json(
-    sink_info: SinkInfo,
-    table_w_meta: TableWithMetadata,
-    na_rep: str = "",
-    include_nulls: bool = False,
-    lines: bool = False,
-    rows_per_chunk: int = 2**32 - 1,
-    true_value: str = "true",
-    false_value: str = "false",
-) -> None: ...
+
+class JsonWriterOptions:
+    @staticmethod
+    def builder(sink: SinkInfo, table: Table) -> JsonWriterOptionsBuilder: ...
+    def set_rows_per_chunk(self, val: int) -> None: ...
+    def set_true_value(self, val: str) -> None: ...
+    def set_false_value(self, val: str) -> None: ...
+
+class JsonWriterOptionsBuilder:
+    def metadata(self, tbl_w_meta: TableWithMetadata) -> Self: ...
+    def na_rep(self, val: str) -> Self: ...
+    def include_nulls(self, val: bool) -> Self: ...
+    def lines(self, val: bool) -> Self: ...
+    def build(self) -> JsonWriterOptions: ...
+
+def write_json(options: JsonWriterOptions) -> None: ...

 def chunked_read_json(
     source_info: SourceInfo,
     dtypes: list[NameAndType] | None = None,
diff --git a/python/pylibcudf/pylibcudf/io/json.pyx b/python/pylibcudf/pylibcudf/io/json.pyx
index ad2989925c9..16078b31566 100644
--- a/python/pylibcudf/pylibcudf/io/json.pyx
+++ b/python/pylibcudf/pylibcudf/io/json.pyx
@@ -1,6 +1,5 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 from libcpp cimport bool
-from libcpp.limits cimport numeric_limits
 from libcpp.map cimport map
 from libcpp.string cimport string
 from libcpp.utility cimport move
@@ -17,13 +16,18 @@ from pylibcudf.libcudf.io.json cimport (
 )
 from pylibcudf.libcudf.io.types cimport (
     compression_type,
-    table_metadata,
     table_with_metadata,
 )
 from pylibcudf.libcudf.types cimport data_type, size_type
 from pylibcudf.types cimport DataType

-__all__ = ["chunked_read_json", "read_json", "write_json"]
+__all__ = [
+    "chunked_read_json",
+    "read_json",
+    "write_json",
+    "JsonWriterOptions",
+    "JsonWriterOptionsBuilder"
+]

 cdef map[string, schema_element] _generate_schema_map(list dtypes):
     cdef map[string, schema_element] schema_map
@@ -57,8 +61,10 @@ cdef json_reader_options _setup_json_reader_options(
     bool keep_quotes,
     bool mixed_types_as_string,
     bool prune_columns,
-    json_recovery_mode_t recovery_mode):
+    json_recovery_mode_t recovery_mode,
+    dict extra_parameters=None):

+    cdef vector[string] na_vec
     cdef vector[data_type] types_vec
     cdef json_reader_options opts = (
         json_reader_options.builder(source_info.c_obj)
@@ -81,6 +87,39 @@ cdef json_reader_options _setup_json_reader_options(
     opts.enable_keep_quotes(keep_quotes)
     opts.enable_mixed_types_as_string(mixed_types_as_string)
     opts.enable_prune_columns(prune_columns)
+
+    # These hidden options are subject to change without a deprecation cycle.
+    # They are used to test libcudf JSON reader features and are not used in cuDF.
+    if extra_parameters is not None:
+        for key, value in extra_parameters.items():
+            if key == 'delimiter':
+                opts.set_delimiter(ord(value))
+            elif key == 'dayfirst':
+                opts.enable_dayfirst(value)
+            elif key == 'experimental':
+                opts.enable_experimental(value)
+            elif key == 'normalize_single_quotes':
+                opts.enable_normalize_single_quotes(value)
+            elif key == 'normalize_whitespace':
+                opts.enable_normalize_whitespace(value)
+            elif key == 'strict_validation':
+                opts.set_strict_validation(value)
+            elif key == 'allow_unquoted_control_chars':
+                opts.allow_unquoted_control_chars(value)
+            elif key == 'allow_numeric_leading_zeros':
+                opts.allow_numeric_leading_zeros(value)
+            elif key == 'allow_nonnumeric_numbers':
+                opts.allow_nonnumeric_numbers(value)
+            elif key == 'na_values':
+                for na_val in value:
+                    if isinstance(na_val, str):
+                        na_vec.push_back(na_val.encode())
+                opts.set_na_values(na_vec)
+            else:
+                raise ValueError(
+                    "cudf engine doesn't support the "
+                    f"'{key}' keyword argument for read_json"
+                )
     return opts

@@ -196,6 +235,7 @@ cpdef TableWithMetadata read_json(
     bool mixed_types_as_string = False,
     bool prune_columns = False,
     json_recovery_mode_t recovery_mode = json_recovery_mode_t.FAIL,
+    dict extra_parameters = None,
 ):
     """Reads an JSON file into a :py:class:`~.types.TableWithMetadata`.

@@ -227,6 +267,8 @@ cpdef TableWithMetadata read_json(
     recovery_mode : JSONRecoveryMode, default JSONRecoveryMode.FAIL
         Whether to raise an error or set corresponding values to null
        when encountering an invalid JSON line.
+    extra_parameters : dict, default None
+        Additional hidden parameters to pass to the JSON reader.

     Returns
     -------
@@ -244,6 +286,7 @@ cpdef TableWithMetadata read_json(
         mixed_types_as_string=mixed_types_as_string,
         prune_columns=prune_columns,
         recovery_mode=recovery_mode,
+        extra_parameters=extra_parameters,
     )

     # Read JSON
@@ -255,56 +298,171 @@ cpdef TableWithMetadata read_json(
     return TableWithMetadata.from_libcudf(c_result)


-cpdef void write_json(
-    SinkInfo sink_info,
-    TableWithMetadata table_w_meta,
-    str na_rep = "",
-    bool include_nulls = False,
-    bool lines = False,
-    size_type rows_per_chunk = numeric_limits[size_type].max(),
-    str true_value = "true",
-    str false_value = "false"
-):
+cdef class JsonWriterOptions:
     """
-    Writes a :py:class:`~pylibcudf.table.Table` to JSON format.
+    The settings to use for ``write_json``.

-    Parameters
-    ----------
-    sink_info: SinkInfo
-        The SinkInfo object to write the JSON to.
-    table_w_meta: TableWithMetadata
-        The TableWithMetadata object containing the Table to write
-    na_rep: str, default ""
-        The string representation for null values.
-    include_nulls: bool, default False
+    For details, see :cpp:class:`cudf::io::json_writer_options`.
+    """
+    @staticmethod
+    def builder(SinkInfo sink, Table table):
+        """
+        Create a JsonWriterOptionsBuilder object.
+
+        Parameters
+        ----------
+        sink : SinkInfo
+            The sink used for writer output
+        table : Table
+            Table to be written to output
+
+        Returns
+        -------
+        JsonWriterOptionsBuilder
+            Builder to build JsonWriterOptions
+        """
+        cdef JsonWriterOptionsBuilder json_builder = (
+            JsonWriterOptionsBuilder.__new__(JsonWriterOptionsBuilder)
+        )
+        json_builder.c_obj = json_writer_options.builder(sink.c_obj, table.view())
+        json_builder.sink = sink
+        json_builder.table = table
+        return json_builder
+
+    cpdef void set_rows_per_chunk(self, size_type val):
+        """
+        Sets the maximum number of rows to write per chunk.
+
+        Parameters
+        ----------
+        val : size_type
+            Maximum number of rows to write per chunk
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.set_rows_per_chunk(val)
+
+    cpdef void set_true_value(self, str val):
+        """
+        Sets the string used for values != 0.
+
+        Parameters
+        ----------
+        val : str
+            String to represent values != 0
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.set_true_value(val.encode())
+
+    cpdef void set_false_value(self, str val):
+        """
+        Sets the string used for values == 0.
+
+        Parameters
+        ----------
+        val : str
+            String to represent values == 0
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.set_false_value(val.encode())
+
+
+cdef class JsonWriterOptionsBuilder:
+    cpdef JsonWriterOptionsBuilder metadata(self, TableWithMetadata tbl_w_meta):
+        """
+        Sets optional metadata (with column names).
+
+        Parameters
+        ----------
+        tbl_w_meta : TableWithMetadata
+            Associated metadata
+
+        Returns
+        -------
+        Self
+        """
+        self.c_obj.metadata(tbl_w_meta.metadata)
+        return self
+
+    cpdef JsonWriterOptionsBuilder na_rep(self, str val):
+        """
+        Sets the string used for null entries.
+
+        Parameters
+        ----------
+        val : str
+            String to represent null value
+
+        Returns
+        -------
+        Self
+        """
+        self.c_obj.na_rep(val.encode())
+        return self
+
+    cpdef JsonWriterOptionsBuilder include_nulls(self, bool val):
+        """
         Enables/Disables output of nulls as 'null'.
-    lines: bool, default False
-        If `True`, write output in the JSON lines format.
-    rows_per_chunk: size_type, defaults to length of the input table
-        The maximum number of rows to write at a time.
-    true_value: str, default "true"
-        The string representation for values != 0 in INT8 types.
-    false_value: str, default "false"
-        The string representation for values == 0 in INT8 types.
+
+        Parameters
+        ----------
+        val : bool
+            Boolean value to enable/disable
+
+        Returns
+        -------
+        Self
+        """
+        self.c_obj.include_nulls(val)
+        return self
+
+    cpdef JsonWriterOptionsBuilder lines(self, bool val):
+        """
+        Enables/Disables JSON lines for records format.
+
+        Parameters
+        ----------
+        val : bool
+            Boolean value to enable/disable
+
+        Returns
+        -------
+        Self
+        """
+        self.c_obj.lines(val)
+        return self
+
+    cpdef JsonWriterOptions build(self):
+        """Create a JsonWriterOptions object"""
+        cdef JsonWriterOptions json_options = JsonWriterOptions.__new__(
+            JsonWriterOptions
+        )
+        json_options.c_obj = move(self.c_obj.build())
+        json_options.sink = self.sink
+        json_options.table = self.table
+        return json_options
+
+
+cpdef void write_json(JsonWriterOptions options):
     """
-    cdef table_metadata tbl_meta = table_w_meta.metadata
-    cdef string na_rep_c = na_rep.encode()
-
-    cdef json_writer_options options = (
-        json_writer_options.builder(sink_info.c_obj, table_w_meta.tbl.view())
-        .metadata(tbl_meta)
-        .na_rep(na_rep_c)
-        .include_nulls(include_nulls)
-        .lines(lines)
-        .build()
-    )
+    Writes a set of columns to JSON format.
- if rows_per_chunk != numeric_limits[size_type].max(): - options.set_rows_per_chunk(rows_per_chunk) - if true_value != "true": - options.set_true_value(true_value.encode()) - if false_value != "false": - options.set_false_value(false_value.encode()) + Parameters + ---------- + options : JsonWriterOptions + Settings for controlling writing behavior + Returns + ------- + None + """ with nogil: - cpp_write_json(options) + cpp_write_json(options.c_obj) diff --git a/python/pylibcudf/pylibcudf/libcudf/io/json.pxd b/python/pylibcudf/pylibcudf/libcudf/io/json.pxd index a7ca6978621..c241c478f25 100644 --- a/python/pylibcudf/pylibcudf/libcudf/io/json.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/io/json.pxd @@ -5,6 +5,7 @@ from libc.stdint cimport int32_t, uint8_t from libcpp cimport bool from libcpp.map cimport map from libcpp.memory cimport shared_ptr, unique_ptr +from libcpp.optional cimport optional from libcpp.string cimport string from libcpp.vector cimport vector from pylibcudf.exception_handler cimport libcudf_exception_handler @@ -17,6 +18,7 @@ cdef extern from "cudf/io/json.hpp" \ cdef struct schema_element: data_type type map[string, schema_element] child_types + optional[vector[string]] column_order cpdef enum class json_recovery_mode_t(int32_t): FAIL @@ -30,30 +32,51 @@ cdef extern from "cudf/io/json.hpp" \ except +libcudf_exception_handler size_t get_byte_range_offset() except +libcudf_exception_handler size_t get_byte_range_size() except +libcudf_exception_handler + size_t get_byte_range_size_with_padding() except +libcudf_exception_handler + size_t get_byte_range_padding() except +libcudf_exception_handler + char get_delimiter() except +libcudf_exception_handler bool is_enabled_lines() except +libcudf_exception_handler bool is_enabled_mixed_types_as_string() except +libcudf_exception_handler bool is_enabled_prune_columns() except +libcudf_exception_handler - bool is_enabled_dayfirst() except +libcudf_exception_handler bool is_enabled_experimental() except +libcudf_exception_handler + bool is_enabled_dayfirst() except +libcudf_exception_handler + bool is_enabled_keep_quotes() except +libcudf_exception_handler + bool is_enabled_normalize_single_quotes() except +libcudf_exception_handler + bool is_enabled_normalize_whitespace() except +libcudf_exception_handler + json_recovery_mode_t recovery_mode() except +libcudf_exception_handler + bool is_strict_validation() except +libcudf_exception_handler + bool is_allowed_numeric_leading_zeros() except +libcudf_exception_handler + bool is_allowed_nonnumeric_numbers() except +libcudf_exception_handler + bool is_allowed_unquoted_control_chars() except +libcudf_exception_handler + vector[string] get_na_values() except +libcudf_exception_handler # setter - void set_dtypes( - vector[data_type] types - ) except +libcudf_exception_handler - void set_dtypes( - map[string, schema_element] types - ) except +libcudf_exception_handler - void set_compression( - cudf_io_types.compression_type compression - ) except +libcudf_exception_handler + void set_dtypes(vector[data_type] types) except +libcudf_exception_handler + void set_dtypes(map[string, data_type] types) except +libcudf_exception_handler + void set_dtypes(map[string, schema_element] types)\ + except +libcudf_exception_handler + void set_dtypes(schema_element types) except +libcudf_exception_handler + void set_compression(cudf_io_types.compression_type comp_type)\ + except +libcudf_exception_handler void set_byte_range_offset(size_t offset) except +libcudf_exception_handler void 
set_byte_range_size(size_t size) except +libcudf_exception_handler + void set_delimiter(char delimiter) except +libcudf_exception_handler void enable_lines(bool val) except +libcudf_exception_handler void enable_mixed_types_as_string(bool val) except +libcudf_exception_handler void enable_prune_columns(bool val) except +libcudf_exception_handler - void enable_dayfirst(bool val) except +libcudf_exception_handler void enable_experimental(bool val) except +libcudf_exception_handler + void enable_dayfirst(bool val) except +libcudf_exception_handler void enable_keep_quotes(bool val) except +libcudf_exception_handler + void enable_normalize_single_quotes(bool val) except +libcudf_exception_handler + + void enable_normalize_whitespace(bool val) except +libcudf_exception_handler + void set_recovery_mode(json_recovery_mode_t val)\ + except +libcudf_exception_handler + void set_strict_validation(bool val) except +libcudf_exception_handler + void allow_numeric_leading_zeros(bool val) except +libcudf_exception_handler + void allow_nonnumeric_numbers(bool val) except +libcudf_exception_handler + void allow_unquoted_control_chars(bool val) except +libcudf_exception_handler + void set_na_values(vector[string] vals) except +libcudf_exception_handler @staticmethod json_reader_options_builder builder( @@ -74,6 +97,9 @@ cdef extern from "cudf/io/json.hpp" \ json_reader_options_builder& dtypes( map[string, schema_element] types ) except +libcudf_exception_handler + json_reader_options_builder& dtypes( + schema_element types + ) except +libcudf_exception_handler json_reader_options_builder& compression( cudf_io_types.compression_type compression ) except +libcudf_exception_handler @@ -83,6 +109,9 @@ cdef extern from "cudf/io/json.hpp" \ json_reader_options_builder& byte_range_size( size_t size ) except +libcudf_exception_handler + json_reader_options_builder& delimiter( + char delimiter + ) except +libcudf_exception_handler json_reader_options_builder& lines( bool val ) except +libcudf_exception_handler @@ -92,16 +121,36 @@ cdef extern from "cudf/io/json.hpp" \ json_reader_options_builder& prune_columns( bool val ) except +libcudf_exception_handler + json_reader_options_builder& experimental( + bool val + ) except +libcudf_exception_handler json_reader_options_builder& dayfirst( bool val ) except +libcudf_exception_handler json_reader_options_builder& keep_quotes( bool val ) except +libcudf_exception_handler + json_reader_options_builder& normalize_single_quotes( + bool val + ) except +libcudf_exception_handler + json_reader_options_builder& normalize_whitespace( + bool val + ) except +libcudf_exception_handler json_reader_options_builder& recovery_mode( json_recovery_mode_t val ) except +libcudf_exception_handler + json_reader_options_builder& strict_validation(bool val)\ + except +libcudf_exception_handler + json_reader_options_builder& numeric_leading_zeros(bool val)\ + except +libcudf_exception_handler + json_reader_options_builder& nonnumeric_numbers(bool val)\ + except +libcudf_exception_handler + json_reader_options_builder& unquoted_control_chars(bool val)\ + except +libcudf_exception_handler + json_reader_options_builder& na_values(vector[string] vals)\ + except +libcudf_exception_handler + json_reader_options build() except +libcudf_exception_handler cdef cudf_io_types.table_with_metadata read_json( diff --git a/python/pylibcudf/pylibcudf/tests/io/test_avro.py b/python/pylibcudf/pylibcudf/tests/io/test_avro.py index 3d9d99ffa61..bda8921b62a 100644 --- 
a/python/pylibcudf/pylibcudf/tests/io/test_avro.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_avro.py @@ -98,10 +98,15 @@ def test_read_avro(avro_dtypes, avro_dtype_data, row_opts, columns, nullable): buffer.seek(0) res = plc.io.avro.read_avro( - plc.io.types.SourceInfo([buffer]), - columns=columns, - skip_rows=skip_rows, - num_rows=num_rows, + ( + plc.io.avro.AvroReaderOptions.builder( + plc.io.types.SourceInfo([buffer]) + ) + .columns(columns) + .skip_rows(skip_rows) + .num_rows(num_rows) + .build() + ) ) expected = pa.Table.from_arrays( diff --git a/python/pylibcudf/pylibcudf/tests/io/test_json.py b/python/pylibcudf/pylibcudf/tests/io/test_json.py index 453e5ce32a8..9b0c5a29fe8 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_json.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_json.py @@ -24,13 +24,19 @@ def test_write_json_basic(table_data, source_or_sink, lines, rows_per_chunk): plc_table_w_meta, pa_table = table_data sink = source_or_sink - plc.io.json.write_json( - plc.io.SinkInfo([sink]), - plc_table_w_meta, - lines=lines, - rows_per_chunk=rows_per_chunk, + options = ( + plc.io.json.JsonWriterOptions.builder( + plc.io.SinkInfo([sink]), plc_table_w_meta.tbl + ) + .metadata(plc_table_w_meta) + .lines(lines) + .build() ) + options.set_rows_per_chunk(rows_per_chunk) + + plc.io.json.write_json(options) + exp = pa_table.to_pandas() # Convert everything to string to make @@ -57,13 +63,18 @@ def test_write_json_nulls(na_rep, include_nulls): sink = io.StringIO() - plc.io.json.write_json( - plc.io.SinkInfo([sink]), - plc_tbl_w_meta, - na_rep=na_rep, - include_nulls=include_nulls, + options = ( + plc.io.json.JsonWriterOptions.builder( + plc.io.SinkInfo([sink]), plc_tbl_w_meta.tbl + ) + .metadata(plc_tbl_w_meta) + .na_rep(na_rep) + .include_nulls(include_nulls) + .build() ) + plc.io.json.write_json(options) + exp = pa_tbl.to_pandas() # Convert everything to string to make @@ -100,15 +111,21 @@ def test_write_json_bool_opts(true_value, false_value): sink = io.StringIO() - plc.io.json.write_json( - plc.io.SinkInfo([sink]), - plc_tbl_w_meta, - include_nulls=True, - na_rep="null", - true_value=true_value, - false_value=false_value, + options = ( + plc.io.json.JsonWriterOptions.builder( + plc.io.SinkInfo([sink]), plc_tbl_w_meta.tbl + ) + .metadata(plc_tbl_w_meta) + .na_rep("null") + .include_nulls(True) + .build() ) + options.set_true_value(true_value) + options.set_false_value(false_value) + + plc.io.json.write_json(options) + exp = pa_tbl.to_pandas() # Convert everything to string to make
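
For reviewers: outside the test suite, the new JSON writer API composes the same way; a minimal sketch based on the test changes above (the `plc_table`/`plc_table_w_meta` names and the chunk size are illustrative placeholders):

    import io
    import pylibcudf as plc

    sink = io.StringIO()
    options = (
        plc.io.json.JsonWriterOptions.builder(plc.io.SinkInfo([sink]), plc_table)
        .metadata(plc_table_w_meta)  # column names for the output
        .na_rep("null")
        .include_nulls(True)
        .lines(False)
        .build()
    )
    options.set_rows_per_chunk(8)  # optional post-build tweak
    plc.io.json.write_json(options)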