diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 49ca5ca0fb9..abe2fc8ed8b 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -328,16 +328,11 @@ jobs: run_script: "ci/cudf_pandas_scripts/pandas-tests/diff.sh" telemetry-summarize: - runs-on: ubuntu-latest + # This job must use a self-hosted runner to record telemetry traces. + runs-on: linux-amd64-cpu4 needs: pr-builder if: ${{ vars.TELEMETRY_ENABLED == 'true' && !cancelled() }} continue-on-error: true steps: - - name: Load stashed telemetry env vars - uses: rapidsai/shared-actions/telemetry-dispatch-load-base-env-vars@main - with: - load_service_name: true - name: Telemetry summarize - uses: rapidsai/shared-actions/telemetry-dispatch-write-summary@main - with: - cert_concat: "${{ secrets.OTEL_EXPORTER_OTLP_CA_CERTIFICATE }};${{ secrets.OTEL_EXPORTER_OTLP_CLIENT_CERTIFICATE }};${{ secrets.OTEL_EXPORTER_OTLP_CLIENT_KEY }}" + uses: rapidsai/shared-actions/telemetry-dispatch-summarize@main diff --git a/cpp/cmake/thirdparty/get_nanoarrow.cmake b/cpp/cmake/thirdparty/get_nanoarrow.cmake index c440643037b..b0c48e04710 100644 --- a/cpp/cmake/thirdparty/get_nanoarrow.cmake +++ b/cpp/cmake/thirdparty/get_nanoarrow.cmake @@ -14,11 +14,6 @@ # This function finds nanoarrow and sets any additional necessary environment variables. function(find_and_configure_nanoarrow) - include(${rapids-cmake-dir}/cpm/package_override.cmake) - - set(cudf_patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches") - rapids_cpm_package_override("${cudf_patch_dir}/nanoarrow_override.json") - if(NOT BUILD_SHARED_LIBS) set(_exclude_from_all EXCLUDE_FROM_ALL FALSE) else() @@ -31,6 +26,9 @@ function(find_and_configure_nanoarrow) nanoarrow 0.6.0.dev GLOBAL_TARGETS nanoarrow CPM_ARGS + GIT_REPOSITORY https://github.com/apache/arrow-nanoarrow.git + GIT_TAG 1e2664a70ec14907409cadcceb14d79b9670bcdb + GIT_SHALLOW FALSE OPTIONS "BUILD_SHARED_LIBS OFF" "NANOARROW_NAMESPACE cudf" ${_exclude_from_all} ) set_target_properties(nanoarrow PROPERTIES POSITION_INDEPENDENT_CODE ON) diff --git a/cpp/cmake/thirdparty/patches/nanoarrow_clang_tidy_compliance.diff b/cpp/cmake/thirdparty/patches/nanoarrow_clang_tidy_compliance.diff deleted file mode 100644 index e9a36fcb567..00000000000 --- a/cpp/cmake/thirdparty/patches/nanoarrow_clang_tidy_compliance.diff +++ /dev/null @@ -1,38 +0,0 @@ -diff --git a/src/nanoarrow/common/inline_buffer.h b/src/nanoarrow/common/inline_buffer.h -index caa6be4..70ec8a2 100644 ---- a/src/nanoarrow/common/inline_buffer.h -+++ b/src/nanoarrow/common/inline_buffer.h -@@ -347,7 +347,7 @@ static inline void _ArrowBitsUnpackInt32(const uint8_t word, int32_t* out) { - } - - static inline void _ArrowBitmapPackInt8(const int8_t* values, uint8_t* out) { -- *out = (uint8_t)(values[0] | ((values[1] + 0x1) & 0x2) | ((values[2] + 0x3) & 0x4) | -+ *out = (uint8_t)(values[0] | ((values[1] + 0x1) & 0x2) | ((values[2] + 0x3) & 0x4) | // NOLINT - ((values[3] + 0x7) & 0x8) | ((values[4] + 0xf) & 0x10) | - ((values[5] + 0x1f) & 0x20) | ((values[6] + 0x3f) & 0x40) | - ((values[7] + 0x7f) & 0x80)); -@@ -471,13 +471,13 @@ static inline void ArrowBitsSetTo(uint8_t* bits, int64_t start_offset, int64_t l - // set bits within a single byte - const uint8_t only_byte_mask = - i_end % 8 == 0 ? first_byte_mask : (uint8_t)(first_byte_mask | last_byte_mask); -- bits[bytes_begin] &= only_byte_mask; -+ bits[bytes_begin] &= only_byte_mask; // NOLINT - bits[bytes_begin] |= (uint8_t)(fill_byte & ~only_byte_mask); - return; - } - - // set/clear trailing bits of first byte -- bits[bytes_begin] &= first_byte_mask; -+ bits[bytes_begin] &= first_byte_mask; // NOLINT - bits[bytes_begin] |= (uint8_t)(fill_byte & ~first_byte_mask); - - if (bytes_end - bytes_begin > 2) { -@@ -637,7 +637,7 @@ static inline void ArrowBitmapAppendInt8Unsafe(struct ArrowBitmap* bitmap, - n_remaining -= n_full_bytes * 8; - if (n_remaining > 0) { - // Zero out the last byte -- *out_cursor = 0x00; -+ *out_cursor = 0x00; // NOLINT - for (int i = 0; i < n_remaining; i++) { - ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, values_cursor[i]); - } diff --git a/cpp/cmake/thirdparty/patches/nanoarrow_override.json b/cpp/cmake/thirdparty/patches/nanoarrow_override.json deleted file mode 100644 index d529787e7c8..00000000000 --- a/cpp/cmake/thirdparty/patches/nanoarrow_override.json +++ /dev/null @@ -1,18 +0,0 @@ - -{ - "packages" : { - "nanoarrow" : { - "version" : "0.6.0.dev", - "git_url" : "https://github.com/apache/arrow-nanoarrow.git", - "git_tag" : "1e2664a70ec14907409cadcceb14d79b9670bcdb", - "git_shallow" : false, - "patches" : [ - { - "file" : "${current_json_dir}/nanoarrow_clang_tidy_compliance.diff", - "issue" : "https://github.com/apache/arrow-nanoarrow/issues/537", - "fixed_in" : "" - } - ] - } - } -} diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh index c30c3d6f4bd..59011f7b138 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.cuh +++ b/cpp/include/cudf/detail/aggregation/aggregation.cuh @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -31,7 +32,6 @@ #include #include -#include namespace cudf { namespace detail { @@ -216,12 +216,12 @@ struct identity_initializer { * @throw cudf::logic_error if column type is not fixed-width * * @param table The table of columns to initialize. - * @param aggs A vector of aggregation operations corresponding to the table + * @param aggs A span of aggregation operations corresponding to the table * columns. The aggregations determine the identity value for each column. * @param stream CUDA stream used for device memory operations and kernel launches. */ void initialize_with_identity(mutable_table_view& table, - std::vector const& aggs, + host_span aggs, rmm::cuda_stream_view stream); } // namespace detail diff --git a/cpp/src/aggregation/aggregation.cu b/cpp/src/aggregation/aggregation.cu index d915c85bf85..3a6ff36c424 100644 --- a/cpp/src/aggregation/aggregation.cu +++ b/cpp/src/aggregation/aggregation.cu @@ -17,15 +17,14 @@ #include #include #include +#include #include -#include - namespace cudf { namespace detail { void initialize_with_identity(mutable_table_view& table, - std::vector const& aggs, + host_span aggs, rmm::cuda_stream_view stream) { // TODO: Initialize all the columns in a single kernel instead of invoking one diff --git a/cpp/src/groupby/hash/compute_aggregations.cuh b/cpp/src/groupby/hash/compute_aggregations.cuh index e8b29a0e7a8..9c9a4c97bff 100644 --- a/cpp/src/groupby/hash/compute_aggregations.cuh +++ b/cpp/src/groupby/hash/compute_aggregations.cuh @@ -60,7 +60,7 @@ rmm::device_uvector compute_aggregations( rmm::cuda_stream_view stream) { // flatten the aggs to a table that can be operated on by aggregate_row - auto [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests); + auto [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests, stream); auto const d_agg_kinds = cudf::detail::make_device_uvector_async( agg_kinds, stream, rmm::mr::get_current_device_resource()); diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.cu b/cpp/src/groupby/hash/compute_global_memory_aggs.cu index 6025686953e..d2830f7d905 100644 --- a/cpp/src/groupby/hash/compute_global_memory_aggs.cu +++ b/cpp/src/groupby/hash/compute_global_memory_aggs.cu @@ -24,7 +24,7 @@ template rmm::device_uvector compute_global_memory_aggs const& agg_kinds, + host_span agg_kinds, global_set_t& global_set, std::vector>& aggregations, cudf::detail::result_cache* sparse_results, diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.cuh b/cpp/src/groupby/hash/compute_global_memory_aggs.cuh index 00db149c6d9..671ee2ea31f 100644 --- a/cpp/src/groupby/hash/compute_global_memory_aggs.cuh +++ b/cpp/src/groupby/hash/compute_global_memory_aggs.cuh @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -44,7 +45,7 @@ rmm::device_uvector compute_global_memory_aggs( bitmask_type const* row_bitmask, cudf::table_view const& flattened_values, cudf::aggregation::Kind const* d_agg_kinds, - std::vector const& agg_kinds, + host_span agg_kinds, SetType& global_set, std::vector>& aggregations, cudf::detail::result_cache* sparse_results, diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.hpp b/cpp/src/groupby/hash/compute_global_memory_aggs.hpp index 0777b9ffd93..437823a3fea 100644 --- a/cpp/src/groupby/hash/compute_global_memory_aggs.hpp +++ b/cpp/src/groupby/hash/compute_global_memory_aggs.hpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -34,7 +35,7 @@ rmm::device_uvector compute_global_memory_aggs( bitmask_type const* row_bitmask, cudf::table_view const& flattened_values, cudf::aggregation::Kind const* d_agg_kinds, - std::vector const& agg_kinds, + host_span agg_kinds, SetType& global_set, std::vector>& aggregations, cudf::detail::result_cache* sparse_results, diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu b/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu index 209e2b7f20a..7cb3f8f190b 100644 --- a/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu +++ b/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu @@ -24,7 +24,7 @@ template rmm::device_uvector compute_global_memory_aggs const& agg_kinds, + host_span agg_kinds, nullable_global_set_t& global_set, std::vector>& aggregations, cudf::detail::result_cache* sparse_results, diff --git a/cpp/src/groupby/hash/create_sparse_results_table.cu b/cpp/src/groupby/hash/create_sparse_results_table.cu index bc32e306b3f..a835736235c 100644 --- a/cpp/src/groupby/hash/create_sparse_results_table.cu +++ b/cpp/src/groupby/hash/create_sparse_results_table.cu @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -48,7 +49,7 @@ void extract_populated_keys(SetType const& key_set, template cudf::table create_sparse_results_table(cudf::table_view const& flattened_values, cudf::aggregation::Kind const* d_agg_kinds, - std::vector agg_kinds, + host_span agg_kinds, bool direct_aggregations, GlobalSetType const& global_set, rmm::device_uvector& populated_keys, @@ -107,7 +108,7 @@ template void extract_populated_keys( template cudf::table create_sparse_results_table( cudf::table_view const& flattened_values, cudf::aggregation::Kind const* d_agg_kinds, - std::vector agg_kinds, + host_span agg_kinds, bool direct_aggregations, global_set_t const& global_set, rmm::device_uvector& populated_keys, @@ -116,7 +117,7 @@ template cudf::table create_sparse_results_table( template cudf::table create_sparse_results_table( cudf::table_view const& flattened_values, cudf::aggregation::Kind const* d_agg_kinds, - std::vector agg_kinds, + host_span agg_kinds, bool direct_aggregations, nullable_global_set_t const& global_set, rmm::device_uvector& populated_keys, diff --git a/cpp/src/groupby/hash/create_sparse_results_table.hpp b/cpp/src/groupby/hash/create_sparse_results_table.hpp index 8155ce852e0..4e2fa81bdb7 100644 --- a/cpp/src/groupby/hash/create_sparse_results_table.hpp +++ b/cpp/src/groupby/hash/create_sparse_results_table.hpp @@ -20,12 +20,11 @@ #include #include #include +#include #include #include -#include - namespace cudf::groupby::detail::hash { /** * @brief Computes and returns a device vector containing all populated keys in @@ -47,7 +46,7 @@ void extract_populated_keys(SetType const& key_set, template cudf::table create_sparse_results_table(cudf::table_view const& flattened_values, cudf::aggregation::Kind const* d_agg_kinds, - std::vector agg_kinds, + host_span agg_kinds, bool direct_aggregations, GlobalSetType const& global_set, rmm::device_uvector& populated_keys, diff --git a/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp b/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp index b2048a9fbb8..a533f7a6448 100644 --- a/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp +++ b/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -102,12 +103,15 @@ class groupby_simple_aggregations_collector final }; // flatten aggs to filter in single pass aggs -std::tuple, std::vector>> -flatten_single_pass_aggs(host_span requests) +std::tuple, + std::vector>> +flatten_single_pass_aggs(host_span requests, + rmm::cuda_stream_view stream) { std::vector columns; std::vector> aggs; - std::vector agg_kinds; + auto agg_kinds = cudf::detail::make_empty_host_vector(requests.size(), stream); for (auto const& request : requests) { auto const& agg_v = request.aggregations; diff --git a/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp b/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp index dfad51f27d4..e3c17ca972c 100644 --- a/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp +++ b/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp @@ -26,7 +26,10 @@ namespace cudf::groupby::detail::hash { // flatten aggs to filter in single pass aggs -std::tuple, std::vector>> -flatten_single_pass_aggs(host_span requests); +std::tuple, + std::vector>> +flatten_single_pass_aggs(host_span requests, + rmm::cuda_stream_view stream); } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu index 37a61c1a22c..b71e20938d6 100644 --- a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu +++ b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu @@ -170,7 +170,8 @@ void hash_compound_agg_finalizer::visit(cudf::detail::var_aggregation c cudf::detail::target_type(result_type, agg.kind), col.size(), mask_state::ALL_NULL, stream); auto var_result_view = mutable_column_device_view::create(var_result->mutable_view(), stream); mutable_table_view var_table_view{{var_result->mutable_view()}}; - cudf::detail::initialize_with_identity(var_table_view, {agg.kind}, stream); + cudf::detail::initialize_with_identity( + var_table_view, host_span(&agg.kind, 1), stream); thrust::for_each_n( rmm::exec_policy_nosync(stream), diff --git a/cpp/src/groupby/sort/group_scan_util.cuh b/cpp/src/groupby/sort/group_scan_util.cuh index 86835ea8a67..5082ad01327 100644 --- a/cpp/src/groupby/sort/group_scan_util.cuh +++ b/cpp/src/groupby/sort/group_scan_util.cuh @@ -107,7 +107,10 @@ struct group_scan_functor() if (values.is_empty()) { return result; } auto result_table = mutable_table_view({*result}); - cudf::detail::initialize_with_identity(result_table, {K}, stream); + // Need an address of the aggregation kind to pass to the span + auto const kind = K; + cudf::detail::initialize_with_identity( + result_table, host_span(&kind, 1), stream); auto result_view = mutable_column_device_view::create(result->mutable_view(), stream); auto values_view = column_device_view::create(values, stream); diff --git a/cpp/tests/streams/replace_test.cpp b/cpp/tests/streams/replace_test.cpp index 89f76237de6..e3fdc177b50 100644 --- a/cpp/tests/streams/replace_test.cpp +++ b/cpp/tests/streams/replace_test.cpp @@ -104,9 +104,9 @@ TEST_F(ReplaceTest, NormalizeNansAndZeros) TEST_F(ReplaceTest, NormalizeNansAndZerosMutable) { - auto nan = std::numeric_limits::quiet_NaN(); - auto input_column = cudf::test::make_type_param_vector({-0.0, 0.0, -nan, nan, nan}); - cudf::test::fixed_width_column_wrapper input(input_column.begin(), input_column.end()); - cudf::mutable_column_view mutable_view = cudf::column(input, cudf::test::get_default_stream()); - cudf::normalize_nans_and_zeros(mutable_view, cudf::test::get_default_stream()); + auto nan = std::numeric_limits::quiet_NaN(); + auto data = cudf::test::make_type_param_vector({-0.0, 0.0, -nan, nan, nan}); + auto input = cudf::test::fixed_width_column_wrapper(data.begin(), data.end()).release(); + auto view = input->mutable_view(); + cudf::normalize_nans_and_zeros(view, cudf::test::get_default_stream()); } diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index bfbfbfed333..410fd57691e 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -12,9 +12,7 @@ # the License. # ============================================================================= -set(cython_sources column.pyx groupby.pyx interop.pyx scalar.pyx strings_udf.pyx types.pyx - utils.pyx -) +set(cython_sources column.pyx groupby.pyx scalar.pyx strings_udf.pyx types.pyx utils.pyx) set(linked_libraries cudf::cudf) rapids_cython_create_modules( @@ -24,8 +22,3 @@ rapids_cython_create_modules( ) target_link_libraries(strings_udf PUBLIC cudf_strings_udf) -target_include_directories(interop PUBLIC "$") - -include(${rapids-cmake-dir}/export/find_package_root.cmake) -include(../../../../cpp/cmake/thirdparty/get_nanoarrow.cmake) -target_link_libraries(interop PUBLIC nanoarrow) diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index e18e05cc43e..6b5a7814e48 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -3,7 +3,6 @@ from . import ( groupby, - interop, strings_udf, ) diff --git a/python/cudf/cudf/_lib/interop.pyx b/python/cudf/cudf/_lib/interop.pyx deleted file mode 100644 index 1c9d3a01b80..00000000000 --- a/python/cudf/cudf/_lib/interop.pyx +++ /dev/null @@ -1,111 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -import pylibcudf - -from cudf._lib.utils cimport columns_from_pylibcudf_table - -from cudf.core.buffer import acquire_spill_lock -from cudf.core.dtypes import ListDtype, StructDtype - - -def from_dlpack(object dlpack_capsule): - """ - Converts a DLPack Tensor PyCapsule into a list of columns. - - DLPack Tensor PyCapsule is expected to have the name "dltensor". - """ - return columns_from_pylibcudf_table( - pylibcudf.interop.from_dlpack(dlpack_capsule) - ) - - -def to_dlpack(list source_columns): - """ - Converts a list of columns into a DLPack Tensor PyCapsule. - - DLPack Tensor PyCapsule will have the name "dltensor". - """ - return pylibcudf.interop.to_dlpack( - pylibcudf.Table( - [col.to_pylibcudf(mode="read") for col in source_columns] - ) - ) - - -def gather_metadata(object cols_dtypes): - """ - Generates a ColumnMetadata vector for each column. - - Parameters - ---------- - cols_dtypes : iterable - An iterable of ``(column_name, dtype)`` pairs. - """ - cpp_metadata = [] - if cols_dtypes is not None: - for idx, (col_name, col_dtype) in enumerate(cols_dtypes): - cpp_metadata.append(pylibcudf.interop.ColumnMetadata(col_name)) - if isinstance(col_dtype, (ListDtype, StructDtype)): - _set_col_children_metadata(col_dtype, cpp_metadata[idx]) - else: - raise TypeError( - "An iterable of (column_name, dtype) pairs is required to " - "construct column_metadata" - ) - return cpp_metadata - - -def _set_col_children_metadata(dtype, col_meta): - if isinstance(dtype, StructDtype): - for name, value in dtype.fields.items(): - element_metadata = pylibcudf.interop.ColumnMetadata(name) - _set_col_children_metadata(value, element_metadata) - col_meta.children_meta.append(element_metadata) - elif isinstance(dtype, ListDtype): - # Offsets - child 0 - col_meta.children_meta.append(pylibcudf.interop.ColumnMetadata()) - - # Element column - child 1 - element_metadata = pylibcudf.interop.ColumnMetadata() - _set_col_children_metadata(dtype.element_type, element_metadata) - col_meta.children_meta.append(element_metadata) - else: - col_meta.children_meta.append(pylibcudf.interop.ColumnMetadata()) - - -@acquire_spill_lock() -def to_arrow(list source_columns, object column_dtypes): - """Convert a list of columns from - cudf Frame to a PyArrow Table. - - Parameters - ---------- - source_columns : a list of columns to convert - column_dtypes : Iterable of ``(column_name, column_dtype)`` pairs - - Returns - ------- - pyarrow table - """ - cpp_metadata = gather_metadata(column_dtypes) - return pylibcudf.interop.to_arrow( - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in source_columns]), - cpp_metadata, - ) - - -@acquire_spill_lock() -def from_arrow(object input_table): - """Convert from PyArrow Table to a list of columns. - - Parameters - ---------- - input_table : PyArrow table - - Returns - ------- - A list of columns to construct Frame object - """ - return columns_from_pylibcudf_table( - pylibcudf.interop.from_arrow(input_table) - ) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 2515157253c..cccafaeba88 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -279,6 +279,7 @@ def dropna(self) -> Self: else: return self.copy() + @acquire_spill_lock() def to_arrow(self) -> pa.Array: """Convert to PyArrow Array @@ -295,9 +296,7 @@ def to_arrow(self) -> pa.Array: 4 ] """ - return libcudf.interop.to_arrow([self], [("None", self.dtype)])[ - "None" - ].chunk(0) + return plc.interop.to_arrow(self.to_pylibcudf(mode="read")).chunk(0) @classmethod def from_arrow(cls, array: pa.Array) -> ColumnBase: @@ -334,26 +333,33 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: if isinstance(array.type, pa.DictionaryType): indices_table = pa.table( - { - "None": pa.chunked_array( - [chunk.indices for chunk in data["None"].chunks], + [ + pa.chunked_array( + [chunk.indices for chunk in data.column(0).chunks], type=array.type.index_type, ) - } + ], + [None], ) dictionaries_table = pa.table( - { - "None": pa.chunked_array( - [chunk.dictionary for chunk in data["None"].chunks], + [ + pa.chunked_array( + [chunk.dictionary for chunk in data.column(0).chunks], type=array.type.value_type, ) - } + ], + [None], ) - - codes = libcudf.interop.from_arrow(indices_table)[0] - categories = libcudf.interop.from_arrow(dictionaries_table)[0] + with acquire_spill_lock(): + codes = cls.from_pylibcudf( + plc.interop.from_arrow(indices_table).columns()[0] + ) + categories = cls.from_pylibcudf( + plc.interop.from_arrow(dictionaries_table).columns()[0] + ) codes = cudf.core.column.categorical.as_unsigned_codes( - len(categories), codes + len(categories), + codes, # type: ignore[arg-type] ) return cudf.core.column.CategoricalColumn( data=None, @@ -364,10 +370,14 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: mask=codes.base_mask, children=(codes,), ) - - result = libcudf.interop.from_arrow(data)[0] - - return result._with_type_metadata(cudf_dtype_from_pa_type(array.type)) + else: + result = cls.from_pylibcudf( + plc.interop.from_arrow(data).columns()[0] + ) + # TODO: cudf_dtype_from_pa_type may be less necessary for some types + return result._with_type_metadata( + cudf_dtype_from_pa_type(array.type) + ) @acquire_spill_lock() def _get_mask_as_column(self) -> ColumnBase: diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 1a820da3c62..b6a4122ebb9 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -1016,7 +1016,7 @@ def to_pandas( self.dtype.tz, ambiguous="NaT", nonexistent="NaT" ) - def to_arrow(self): + def to_arrow(self) -> pa.Array: return pa.compute.assume_timezone( self._local_time.to_arrow(), str(self.dtype.tz) ) diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 9e6a73f1a9c..09941665ba2 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -269,8 +269,8 @@ def from_arrow(cls, data: pa.Array): mask=mask, ) - def to_arrow(self): - data_buf_32 = np.array(self.base_data.memoryview()).view("int32") + def to_arrow(self) -> pa.Array: + data_buf_32 = np.array(self.base_data.memoryview()).view("int32") # type: ignore[union-attr] data_buf_128 = np.empty(len(data_buf_32) * 4, dtype="int32") # use striding to set the first 32 bits of each 128-bit chunk: @@ -337,7 +337,7 @@ def from_arrow(cls, data: pa.Array): result.dtype.precision = data.type.precision return result - def to_arrow(self): + def to_arrow(self) -> pa.Array: return super().to_arrow().cast(self.dtype.to_arrow()) def _with_type_metadata( @@ -396,8 +396,8 @@ def from_arrow(cls, data: pa.Array): mask=mask, ) - def to_arrow(self): - data_buf_64 = np.array(self.base_data.memoryview()).view("int64") + def to_arrow(self) -> pa.Array: + data_buf_64 = np.array(self.base_data.memoryview()).view("int64") # type: ignore[union-attr] data_buf_128 = np.empty(len(data_buf_64) * 2, dtype="int64") # use striding to set the first 64 bits of each 128-bit chunk: diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index ba98e28f6a2..3d9440cdf21 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -150,7 +150,7 @@ def offsets(self) -> NumericalColumn: """ return cast(NumericalColumn, self.children[0]) - def to_arrow(self): + def to_arrow(self) -> pa.Array: offsets = self.offsets.to_arrow() elements = ( pa.nulls(len(self.elements)) @@ -160,7 +160,7 @@ def to_arrow(self): pa_type = pa.list_(elements.type) if self.nullable: - nbuf = pa.py_buffer(self.mask.memoryview()) + nbuf = pa.py_buffer(self.mask.memoryview()) # type: ignore[union-attr] buffers = (nbuf, offsets.buffers()[1]) else: buffers = offsets.buffers() diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index ba9b15667f1..9aadbf8f47a 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -946,16 +946,17 @@ def from_arrow(cls, data: pa.Table) -> Self: if len(dict_indices): dict_indices_table = pa.table(dict_indices) data = data.drop(dict_indices_table.column_names) - indices_columns = libcudf.interop.from_arrow(dict_indices_table) + plc_indices = plc.interop.from_arrow(dict_indices_table) # as dictionary size can vary, it can't be a single table cudf_dictionaries_columns = { name: ColumnBase.from_arrow(dict_dictionaries[name]) for name in dict_dictionaries.keys() } - for name, codes in zip( - dict_indices_table.column_names, indices_columns + for name, plc_codes in zip( + dict_indices_table.column_names, plc_indices.columns() ): + codes = libcudf.column.Column.from_pylibcudf(plc_codes) categories = cudf_dictionaries_columns[name] codes = as_unsigned_codes(len(categories), codes) cudf_category_frame[name] = CategoricalColumn( @@ -971,9 +972,9 @@ def from_arrow(cls, data: pa.Table) -> Self: # Handle non-dict arrays cudf_non_category_frame = { - name: col - for name, col in zip( - data.column_names, libcudf.interop.from_arrow(data) + name: libcudf.column.Column.from_pylibcudf(plc_col) + for name, plc_col in zip( + data.column_names, plc.interop.from_arrow(data).columns() ) } @@ -1032,7 +1033,7 @@ def from_arrow(cls, data: pa.Table) -> Self: return cls._from_data({name: result[name] for name in column_names}) @_performance_tracking - def to_arrow(self): + def to_arrow(self) -> pa.Table: """ Convert to arrow Table diff --git a/python/cudf/cudf/io/dlpack.py b/python/cudf/cudf/io/dlpack.py index fe8e446f9c0..3b3fd5f7c56 100644 --- a/python/cudf/cudf/io/dlpack.py +++ b/python/cudf/cudf/io/dlpack.py @@ -1,13 +1,14 @@ # Copyright (c) 2019-2024, NVIDIA CORPORATION. +from __future__ import annotations +import pylibcudf as plc import cudf -from cudf._lib import interop as libdlpack from cudf.core.column import ColumnBase from cudf.utils import ioutils -def from_dlpack(pycapsule_obj): +def from_dlpack(pycapsule_obj) -> cudf.Series | cudf.DataFrame: """Converts from a DLPack tensor to a cuDF object. DLPack is an open-source memory tensor structure: @@ -33,18 +34,21 @@ def from_dlpack(pycapsule_obj): cuDF from_dlpack() assumes column-major (Fortran order) input. If the input tensor is row-major, transpose it before passing it to this function. """ + plc_table = plc.interop.from_dlpack(pycapsule_obj) + data = dict( + enumerate( + (ColumnBase.from_pylibcudf(col) for col in plc_table.columns()) + ) + ) - columns = libdlpack.from_dlpack(pycapsule_obj) - data = dict(enumerate(columns)) - - if len(columns) == 1: + if len(data) == 1: return cudf.Series._from_data(data) else: return cudf.DataFrame._from_data(data) @ioutils.doc_to_dlpack() -def to_dlpack(cudf_obj): +def to_dlpack(cudf_obj: cudf.Series | cudf.DataFrame | cudf.BaseIndex): """Converts a cuDF object to a DLPack tensor. DLPack is an open-source memory tensor structure: @@ -80,13 +84,14 @@ def to_dlpack(cudf_obj): if any( not cudf.api.types._is_non_decimal_numeric_dtype(dtype) - for _, dtype in gdf._dtypes + for _, dtype in gdf._dtypes # type: ignore[union-attr] ): raise TypeError("non-numeric data not yet supported") dtype = cudf.utils.dtypes.find_common_type( - [dtype for _, dtype in gdf._dtypes] + [dtype for _, dtype in gdf._dtypes] # type: ignore[union-attr] ) gdf = gdf.astype(dtype) - - return libdlpack.to_dlpack([*gdf._columns]) + return plc.interop.to_dlpack( + plc.Table([col.to_pylibcudf(mode="read") for col in gdf._columns]) + ) diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index 4f0709ec985..e0c9e535e6f 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -287,21 +287,25 @@ def _plc_write_json( rows_per_chunk: int = 1024 * 64, # 64K rows ) -> None: try: - plc.io.json.write_json( - plc.io.SinkInfo([path_or_buf]), - plc.io.TableWithMetadata( - plc.Table( - [col.to_pylibcudf(mode="read") for col in table._columns] - ), - colnames, + tbl_w_meta = plc.io.TableWithMetadata( + plc.Table( + [col.to_pylibcudf(mode="read") for col in table._columns] ), - na_rep, - include_nulls, - lines, - rows_per_chunk, - true_value="true", - false_value="false", + colnames, ) + options = ( + plc.io.json.JsonWriterOptions.builder( + plc.io.SinkInfo([path_or_buf]), tbl_w_meta.tbl + ) + .metadata(tbl_w_meta) + .na_rep(na_rep) + .include_nulls(include_nulls) + .lines(lines) + .build() + ) + if rows_per_chunk != np.iinfo(np.int32).max: + options.set_rows_per_chunk(rows_per_chunk) + plc.io.json.write_json(options) except OverflowError as err: raise OverflowError( f"Writing JSON file with rows_per_chunk={rows_per_chunk} failed. " diff --git a/python/pylibcudf/pylibcudf/io/json.pxd b/python/pylibcudf/pylibcudf/io/json.pxd index d7726971351..4894ca3bd6e 100644 --- a/python/pylibcudf/pylibcudf/io/json.pxd +++ b/python/pylibcudf/pylibcudf/io/json.pxd @@ -6,8 +6,13 @@ from pylibcudf.io.types cimport ( TableWithMetadata, compression_type, ) -from pylibcudf.libcudf.io.json cimport json_recovery_mode_t +from pylibcudf.libcudf.io.json cimport ( + json_recovery_mode_t, + json_writer_options, + json_writer_options_builder, +) from pylibcudf.libcudf.types cimport size_type +from pylibcudf.table cimport Table cpdef TableWithMetadata read_json( @@ -24,17 +29,25 @@ cpdef TableWithMetadata read_json( dict extra_parameters = *, ) +cdef class JsonWriterOptions: + cdef json_writer_options c_obj + cdef SinkInfo sink + cdef Table table + cpdef void set_rows_per_chunk(self, size_type val) + cpdef void set_true_value(self, str val) + cpdef void set_false_value(self, str val) -cpdef void write_json( - SinkInfo sink_info, - TableWithMetadata tbl, - str na_rep = *, - bool include_nulls = *, - bool lines = *, - size_type rows_per_chunk = *, - str true_value = *, - str false_value = * -) +cdef class JsonWriterOptionsBuilder: + cdef json_writer_options_builder c_obj + cdef SinkInfo sink + cdef Table table + cpdef JsonWriterOptionsBuilder metadata(self, TableWithMetadata tbl_w_meta) + cpdef JsonWriterOptionsBuilder na_rep(self, str val) + cpdef JsonWriterOptionsBuilder include_nulls(self, bool val) + cpdef JsonWriterOptionsBuilder lines(self, bool val) + cpdef JsonWriterOptions build(self) + +cpdef void write_json(JsonWriterOptions options) cpdef tuple chunked_read_json( SourceInfo source_info, diff --git a/python/pylibcudf/pylibcudf/io/json.pyi b/python/pylibcudf/pylibcudf/io/json.pyi index b2bc6a43700..e0489742cd0 100644 --- a/python/pylibcudf/pylibcudf/io/json.pyi +++ b/python/pylibcudf/pylibcudf/io/json.pyi @@ -2,6 +2,8 @@ from collections.abc import Mapping from typing import TypeAlias +from typing_extensions import Self + from pylibcudf.column import Column from pylibcudf.io.types import ( CompressionType, @@ -10,6 +12,7 @@ from pylibcudf.io.types import ( SourceInfo, TableWithMetadata, ) +from pylibcudf.table import Table from pylibcudf.types import DataType ChildNameToTypeMap: TypeAlias = Mapping[str, ChildNameToTypeMap] @@ -28,16 +31,22 @@ def read_json( prune_columns: bool = False, recovery_mode: JSONRecoveryMode = JSONRecoveryMode.FAIL, ) -> TableWithMetadata: ... -def write_json( - sink_info: SinkInfo, - table_w_meta: TableWithMetadata, - na_rep: str = "", - include_nulls: bool = False, - lines: bool = False, - rows_per_chunk: int = 2**32 - 1, - true_value: str = "true", - false_value: str = "false", -) -> None: ... + +class JsonWriterOptions: + @staticmethod + def builder(sink: SinkInfo, table: Table) -> JsonWriterOptionsBuilder: ... + def set_rows_per_chunk(self, val: int) -> None: ... + def set_true_value(self, val: str) -> None: ... + def set_false_value(self, val: str) -> None: ... + +class JsonWriterOptionsBuilder: + def metadata(self, tbl_w_meta: TableWithMetadata) -> Self: ... + def na_rep(self, val: str) -> Self: ... + def include_nulls(self, val: bool) -> Self: ... + def lines(self, val: bool) -> Self: ... + def build(self) -> JsonWriterOptions: ... + +def write_json(options: JsonWriterOptions) -> None: ... def chunked_read_json( source_info: SourceInfo, dtypes: list[NameAndType] | None = None, diff --git a/python/pylibcudf/pylibcudf/io/json.pyx b/python/pylibcudf/pylibcudf/io/json.pyx index 32f737fbff4..16078b31566 100644 --- a/python/pylibcudf/pylibcudf/io/json.pyx +++ b/python/pylibcudf/pylibcudf/io/json.pyx @@ -1,6 +1,5 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from libcpp cimport bool -from libcpp.limits cimport numeric_limits from libcpp.map cimport map from libcpp.string cimport string from libcpp.utility cimport move @@ -17,13 +16,18 @@ from pylibcudf.libcudf.io.json cimport ( ) from pylibcudf.libcudf.io.types cimport ( compression_type, - table_metadata, table_with_metadata, ) from pylibcudf.libcudf.types cimport data_type, size_type from pylibcudf.types cimport DataType -__all__ = ["chunked_read_json", "read_json", "write_json"] +__all__ = [ + "chunked_read_json", + "read_json", + "write_json", + "JsonWriterOptions", + "JsonWriterOptionsBuilder" +] cdef map[string, schema_element] _generate_schema_map(list dtypes): cdef map[string, schema_element] schema_map @@ -294,56 +298,171 @@ cpdef TableWithMetadata read_json( return TableWithMetadata.from_libcudf(c_result) -cpdef void write_json( - SinkInfo sink_info, - TableWithMetadata table_w_meta, - str na_rep = "", - bool include_nulls = False, - bool lines = False, - size_type rows_per_chunk = numeric_limits[size_type].max(), - str true_value = "true", - str false_value = "false" -): +cdef class JsonWriterOptions: """ - Writes a :py:class:`~pylibcudf.table.Table` to JSON format. + The settings to use for ``write_json`` - Parameters - ---------- - sink_info: SinkInfo - The SinkInfo object to write the JSON to. - table_w_meta: TableWithMetadata - The TableWithMetadata object containing the Table to write - na_rep: str, default "" - The string representation for null values. - include_nulls: bool, default False + For details, see :cpp:class:`cudf::io::json_writer_options` + """ + @staticmethod + def builder(SinkInfo sink, Table table): + """ + Create a JsonWriterOptionsBuilder object + + Parameters + ---------- + sink : SinkInfo + The sink used for writer output + table : Table + Table to be written to output + + Returns + ------- + JsonWriterOptionsBuilder + Builder to build JsonWriterOptions + """ + cdef JsonWriterOptionsBuilder json_builder = ( + JsonWriterOptionsBuilder.__new__(JsonWriterOptionsBuilder) + ) + json_builder.c_obj = json_writer_options.builder(sink.c_obj, table.view()) + json_builder.sink = sink + json_builder.table = table + return json_builder + + cpdef void set_rows_per_chunk(self, size_type val): + """ + Sets string to used for null entries. + + Parameters + ---------- + val : size_type + String to represent null value + + Returns + ------- + None + """ + self.c_obj.set_rows_per_chunk(val) + + cpdef void set_true_value(self, str val): + """ + Sets string used for values != 0 + + Parameters + ---------- + val : str + String to represent values != 0 + + Returns + ------- + None + """ + self.c_obj.set_true_value(val.encode()) + + cpdef void set_false_value(self, str val): + """ + Sets string used for values == 0 + + Parameters + ---------- + val : str + String to represent values == 0 + + Returns + ------- + None + """ + self.c_obj.set_false_value(val.encode()) + + +cdef class JsonWriterOptionsBuilder: + cpdef JsonWriterOptionsBuilder metadata(self, TableWithMetadata tbl_w_meta): + """ + Sets optional metadata (with column names). + + Parameters + ---------- + tbl_w_meta : TableWithMetadata + Associated metadata + + Returns + ------- + Self + """ + self.c_obj.metadata(tbl_w_meta.metadata) + return self + + cpdef JsonWriterOptionsBuilder na_rep(self, str val): + """ + Sets string to used for null entries. + + Parameters + ---------- + val : str + String to represent null value + + Returns + ------- + Self + """ + self.c_obj.na_rep(val.encode()) + return self + + cpdef JsonWriterOptionsBuilder include_nulls(self, bool val): + """ Enables/Disables output of nulls as 'null'. - lines: bool, default False - If `True`, write output in the JSON lines format. - rows_per_chunk: size_type, defaults to length of the input table - The maximum number of rows to write at a time. - true_value: str, default "true" - The string representation for values != 0 in INT8 types. - false_value: str, default "false" - The string representation for values == 0 in INT8 types. + + Parameters + ---------- + val : bool + Boolean value to enable/disable + + Returns + ------- + Self + """ + self.c_obj.include_nulls(val) + return self + + cpdef JsonWriterOptionsBuilder lines(self, bool val): + """ + Enables/Disables JSON lines for records format. + + Parameters + ---------- + val : bool + Boolean value to enable/disable + + Returns + ------- + Self + """ + self.c_obj.lines(val) + return self + + cpdef JsonWriterOptions build(self): + """Create a JsonWriterOptions object""" + cdef JsonWriterOptions json_options = JsonWriterOptions.__new__( + JsonWriterOptions + ) + json_options.c_obj = move(self.c_obj.build()) + json_options.sink = self.sink + json_options.table = self.table + return json_options + + +cpdef void write_json(JsonWriterOptions options): """ - cdef table_metadata tbl_meta = table_w_meta.metadata - cdef string na_rep_c = na_rep.encode() - - cdef json_writer_options options = ( - json_writer_options.builder(sink_info.c_obj, table_w_meta.tbl.view()) - .metadata(tbl_meta) - .na_rep(na_rep_c) - .include_nulls(include_nulls) - .lines(lines) - .build() - ) + Writes a set of columns to JSON format. - if rows_per_chunk != numeric_limits[size_type].max(): - options.set_rows_per_chunk(rows_per_chunk) - if true_value != "true": - options.set_true_value(true_value.encode()) - if false_value != "false": - options.set_false_value(false_value.encode()) + Parameters + ---------- + options : JsonWriterOptions + Settings for controlling writing behavior + Returns + ------- + None + """ with nogil: - cpp_write_json(options) + cpp_write_json(options.c_obj) diff --git a/python/pylibcudf/pylibcudf/tests/io/test_json.py b/python/pylibcudf/pylibcudf/tests/io/test_json.py index 453e5ce32a8..9b0c5a29fe8 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_json.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_json.py @@ -24,13 +24,19 @@ def test_write_json_basic(table_data, source_or_sink, lines, rows_per_chunk): plc_table_w_meta, pa_table = table_data sink = source_or_sink - plc.io.json.write_json( - plc.io.SinkInfo([sink]), - plc_table_w_meta, - lines=lines, - rows_per_chunk=rows_per_chunk, + options = ( + plc.io.json.JsonWriterOptions.builder( + plc.io.SinkInfo([sink]), plc_table_w_meta.tbl + ) + .metadata(plc_table_w_meta) + .lines(lines) + .build() ) + options.set_rows_per_chunk(rows_per_chunk) + + plc.io.json.write_json(options) + exp = pa_table.to_pandas() # Convert everything to string to make @@ -57,13 +63,18 @@ def test_write_json_nulls(na_rep, include_nulls): sink = io.StringIO() - plc.io.json.write_json( - plc.io.SinkInfo([sink]), - plc_tbl_w_meta, - na_rep=na_rep, - include_nulls=include_nulls, + options = ( + plc.io.json.JsonWriterOptions.builder( + plc.io.SinkInfo([sink]), plc_tbl_w_meta.tbl + ) + .metadata(plc_tbl_w_meta) + .na_rep(na_rep) + .include_nulls(include_nulls) + .build() ) + plc.io.json.write_json(options) + exp = pa_tbl.to_pandas() # Convert everything to string to make @@ -100,15 +111,21 @@ def test_write_json_bool_opts(true_value, false_value): sink = io.StringIO() - plc.io.json.write_json( - plc.io.SinkInfo([sink]), - plc_tbl_w_meta, - include_nulls=True, - na_rep="null", - true_value=true_value, - false_value=false_value, + options = ( + plc.io.json.JsonWriterOptions.builder( + plc.io.SinkInfo([sink]), plc_tbl_w_meta.tbl + ) + .metadata(plc_tbl_w_meta) + .na_rep("null") + .include_nulls(True) + .build() ) + options.set_true_value(true_value) + options.set_false_value(false_value) + + plc.io.json.write_json(options) + exp = pa_tbl.to_pandas() # Convert everything to string to make