diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index fb7182f4133..65aebfb7f8c 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -62,7 +62,7 @@ jobs: arch: "amd64" branch: ${{ inputs.branch }} build_type: ${{ inputs.build_type || 'branch' }} - container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" + container_image: "rapidsai/ci-conda:latest" date: ${{ inputs.date }} node_type: "gpu-v100-latest-1" run_script: "ci/build_docs.sh" diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 9d79733703c..e955b8f1f80 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -186,7 +186,7 @@ jobs: build_type: pull-request node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_java.sh" static-configure: needs: checks @@ -207,7 +207,7 @@ jobs: build_type: pull-request node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_notebooks.sh" docs-build: needs: conda-python-build @@ -217,7 +217,7 @@ jobs: build_type: pull-request node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/build_docs.sh" wheel-build-libcudf: needs: checks diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 858352f515d..dc82c17022a 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -41,7 +41,7 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_cpp_memcheck.sh" static-configure: secrets: inherit @@ -94,7 +94,7 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit @@ -106,7 +106,7 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit diff --git a/ci/build_wheel_libcudf.sh b/ci/build_wheel_libcudf.sh index af49942c8cd..d80e4fef0d0 100755 --- a/ci/build_wheel_libcudf.sh +++ b/ci/build_wheel_libcudf.sh @@ -1,11 +1,13 @@ #!/bin/bash -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. 
set -euo pipefail package_name="libcudf" package_dir="python/libcudf" +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" + rapids-logger "Generating build requirements" rapids-dependency-file-generator \ @@ -28,8 +30,6 @@ export PIP_NO_BUILD_ISOLATION=0 export SKBUILD_CMAKE_ARGS="-DUSE_NVCOMP_RUNTIME_WHEEL=ON" ./ci/build_wheel.sh "${package_name}" "${package_dir}" -RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" - mkdir -p ${package_dir}/final_dist python -m auditwheel repair \ --exclude libnvcomp.so.4 \ diff --git a/ci/test_python_other.sh b/ci/test_python_other.sh index db86721755d..3c6dba72164 100755 --- a/ci/test_python_other.sh +++ b/ci/test_python_other.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. # Support invoking test_python_cudf.sh outside the script directory cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../ @@ -24,8 +24,8 @@ EXITCODE=0 trap "EXITCODE=1" ERR set +e -rapids-logger "pytest dask_cudf (dask-expr)" -DASK_DATAFRAME__QUERY_PLANNING=True ./ci/run_dask_cudf_pytests.sh \ +rapids-logger "pytest dask_cudf" +./ci/run_dask_cudf_pytests.sh \ --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf.xml" \ --numprocesses=8 \ --dist=worksteal \ @@ -34,13 +34,6 @@ DASK_DATAFRAME__QUERY_PLANNING=True ./ci/run_dask_cudf_pytests.sh \ --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/dask-cudf-coverage.xml" \ --cov-report=term -rapids-logger "pytest dask_cudf (legacy)" -DASK_DATAFRAME__QUERY_PLANNING=False ./ci/run_dask_cudf_pytests.sh \ - --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-legacy.xml" \ - --numprocesses=8 \ - --dist=worksteal \ - . - rapids-logger "pytest cudf_kafka" ./ci/run_cudf_kafka_pytests.sh \ --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf-kafka.xml" diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh index e15949f4bdb..44f430ce98d 100755 --- a/ci/test_wheel_dask_cudf.sh +++ b/ci/test_wheel_dask_cudf.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. set -eou pipefail @@ -30,21 +30,11 @@ RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/ mkdir -p "${RAPIDS_TESTS_DIR}" # Run tests in dask_cudf/tests and dask_cudf/io/tests -rapids-logger "pytest dask_cudf (dask-expr)" +rapids-logger "pytest dask_cudf" pushd python/dask_cudf/dask_cudf -DASK_DATAFRAME__QUERY_PLANNING=True python -m pytest \ +python -m pytest \ --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf.xml" \ --numprocesses=8 \ --dist=worksteal \ . popd - -# Run tests in dask_cudf/tests and dask_cudf/io/tests (legacy) -rapids-logger "pytest dask_cudf (legacy)" -pushd python/dask_cudf/dask_cudf -DASK_DATAFRAME__QUERY_PLANNING=False python -m pytest \ - --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-legacy.xml" \ - --numprocesses=8 \ - --dist=worksteal \ - . 
-popd diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index a4b3f4fe174..a8e5018b283 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -55,7 +55,7 @@ dependencies: - nbsphinx - ninja - notebook -- numba-cuda>=0.0.13,<0.0.18 +- numba-cuda>=0.2.0,<0.3.0 - numpy>=1.23,<3.0a0 - numpydoc - nvcc_linux-64=11.8 @@ -66,7 +66,7 @@ dependencies: - pandas - pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.11,<1.15 +- polars>=1.11,<1.18 - pre-commit - ptxcompiler - pyarrow>=14.0.0,<19.0.0a0 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 7173c955116..6dc99b14f5d 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -54,7 +54,7 @@ dependencies: - nbsphinx - ninja - notebook -- numba-cuda>=0.0.13,<0.0.18 +- numba-cuda>=0.2.0,<0.3.0 - numpy>=1.23,<3.0a0 - numpydoc - nvcomp==4.1.0.6 @@ -64,7 +64,7 @@ dependencies: - pandas - pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.11,<1.15 +- polars>=1.11,<1.18 - pre-commit - pyarrow>=14.0.0,<19.0.0a0 - pydata-sphinx-theme!=0.14.2 diff --git a/conda/recipes/cudf-polars/meta.yaml b/conda/recipes/cudf-polars/meta.yaml index b6c03dc1bc2..7a0005497df 100644 --- a/conda/recipes/cudf-polars/meta.yaml +++ b/conda/recipes/cudf-polars/meta.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. {% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} @@ -43,7 +43,7 @@ requirements: run: - python - pylibcudf ={{ version }} - - polars >=1.11,<1.15 + - polars >=1.11,<1.18 - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} test: diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 2c16deeed82..b34496cc256 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. {% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} @@ -80,7 +80,7 @@ requirements: - typing_extensions >=4.0.0 - pandas >=2.0,<2.2.4dev0 - cupy >=12.0.0 - - numba-cuda >=0.0.13,<0.0.18 + - numba-cuda >=0.2.0,<0.3.0 - numpy >=1.23,<3.0a0 - pyarrow>=14.0.0,<18.0.0a0 - libcudf ={{ version }} diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index cb814aa8c0f..9dabe4e8800 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -276,7 +276,7 @@ rapids_cpm_init() include(${rapids-cmake-dir}/cpm/rapids_logger.cmake) rapids_cpm_rapids_logger() -rapids_make_logger(cudf EXPORT_SET cudf-exports) +rapids_make_logger(cudf EXPORT_SET cudf-exports LOGGER_DEFAULT_LEVEL WARN) # find jitify include(cmake/thirdparty/get_jitify.cmake) @@ -461,6 +461,7 @@ add_library( src/hash/sha256_hash.cu src/hash/sha384_hash.cu src/hash/sha512_hash.cu + src/hash/xxhash_32.cu src/hash/xxhash_64.cu src/interop/dlpack.cpp src/interop/arrow_utilities.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 749e1b628ee..0ff712c1c77 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
+# Copyright (c) 2018-2025, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -425,6 +425,11 @@ ConfigureNVBench(DECIMAL_NVBENCH decimal/convert_floating.cpp)
 # ---------------------------------------------------------------------------------
 ConfigureNVBench(RESHAPE_NVBENCH reshape/interleave.cpp)
 
+# ##################################################################################################
+# * rolling benchmark
+# ---------------------------------------------------------------------------------
+ConfigureNVBench(ROLLING_NVBENCH rolling/grouped_rolling_sum.cpp rolling/rolling_sum.cpp)
+
 add_custom_target(
   run_benchmarks
   DEPENDS CUDF_BENCHMARKS
diff --git a/cpp/benchmarks/rolling/grouped_rolling_sum.cpp b/cpp/benchmarks/rolling/grouped_rolling_sum.cpp
new file mode 100644
index 00000000000..04afe5ac661
--- /dev/null
+++ b/cpp/benchmarks/rolling/grouped_rolling_sum.cpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+
+#include <cudf/aggregation.hpp>
+#include <cudf/rolling.hpp>
+#include <cudf/sorting.hpp>
+#include <cudf/utilities/default_stream.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+template <typename Type>
+void bench_row_grouped_rolling_sum(nvbench::state& state, nvbench::type_list<Type>)
+{
+  auto const num_rows       = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const cardinality    = static_cast<cudf::size_type>(state.get_int64("cardinality"));
+  auto const preceding_size = static_cast<cudf::size_type>(state.get_int64("preceding_size"));
+  auto const following_size = static_cast<cudf::size_type>(state.get_int64("following_size"));
+  auto const min_periods    = static_cast<cudf::size_type>(state.get_int64("min_periods"));
+
+  auto const keys = [&] {
+    data_profile const profile =
+      data_profile_builder()
+        .cardinality(cardinality)
+        .no_validity()
+        .distribution(cudf::type_to_id<int32_t>(), distribution_id::UNIFORM, 0, num_rows);
+    auto keys = create_random_column(cudf::type_to_id<int32_t>(), row_count{num_rows}, profile);
+    return cudf::sort(cudf::table_view{{keys->view()}});
+  }();
+  data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution(
+    cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, 100);
+  auto vals = create_random_column(cudf::type_to_id<Type>(), row_count{num_rows}, profile);
+
+  auto req = cudf::make_sum_aggregation<cudf::rolling_aggregation>();
+
+  auto const mem_stats_logger = cudf::memory_stats_logger();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    auto const result = cudf::grouped_rolling_window(
+      keys->view(), vals->view(), preceding_size, following_size, min_periods, *req);
+  });
+  auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
+  state.add_element_count(static_cast<double>(num_rows) / elapsed_time / 1'000'000., "Mrows/s");
+  state.add_buffer_size(
+    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
+}
+
+NVBENCH_BENCH_TYPES(bench_row_grouped_rolling_sum,
+                    NVBENCH_TYPE_AXES(nvbench::type_list<std::int32_t, double>))
+  .set_name("row_grouped_rolling_sum")
+  .add_int64_power_of_two_axis("num_rows", {14, 28})
+  .add_int64_axis("preceding_size", {1, 10})
+  .add_int64_axis("following_size", {2})
+  .add_int64_axis("min_periods", {1})
+  .add_int64_axis("cardinality", {10, 100, 1'000'000, 100'000'000});
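A minimal host-side sketch of the call this new benchmark times; the tiny literal columns and the example() wrapper are illustrative, not part of the benchmark:

#include <cudf/aggregation.hpp>
#include <cudf/rolling.hpp>
#include <cudf_test/column_wrapper.hpp>

void example()
{
  // Group keys must be pre-sorted, as grouped_rolling_window expects.
  auto keys = cudf::test::fixed_width_column_wrapper<int32_t>{{0, 0, 0, 1, 1}};
  auto vals = cudf::test::fixed_width_column_wrapper<int32_t>{{1, 2, 3, 4, 5}};
  auto agg  = cudf::make_sum_aggregation<cudf::rolling_aggregation>();
  // preceding = 1 covers only the current row, following = 2 adds up to two
  // rows after it; windows never cross a group boundary; min_periods = 1.
  auto result =
    cudf::grouped_rolling_window(cudf::table_view{{keys}}, vals, 1, 2, 1, *agg);
}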
diff --git a/cpp/benchmarks/rolling/rolling_sum.cpp b/cpp/benchmarks/rolling/rolling_sum.cpp
new file mode 100644
index 00000000000..af9ecd6a26f
--- /dev/null
+++ b/cpp/benchmarks/rolling/rolling_sum.cpp
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+
+#include <cudf/aggregation.hpp>
+#include <cudf/column/column_factories.hpp>
+#include <cudf/rolling.hpp>
+#include <cudf/types.hpp>
+#include <cudf/utilities/default_stream.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_buffer.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/iterator/counting_iterator.h>
+
+#include <algorithm>
+
+#include <nvbench/nvbench.cuh>
+
+template <typename Type>
+void bench_row_fixed_rolling_sum(nvbench::state& state, nvbench::type_list<Type>)
+{
+  auto const num_rows       = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const preceding_size = static_cast<cudf::size_type>(state.get_int64("preceding_size"));
+  auto const following_size = static_cast<cudf::size_type>(state.get_int64("following_size"));
+  auto const min_periods    = static_cast<cudf::size_type>(state.get_int64("min_periods"));
+
+  data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution(
+    cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, 100);
+  auto vals = create_random_column(cudf::type_to_id<Type>(), row_count{num_rows}, profile);
+
+  auto req = cudf::make_sum_aggregation<cudf::rolling_aggregation>();
+
+  auto const mem_stats_logger = cudf::memory_stats_logger();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    auto const result =
+      cudf::rolling_window(vals->view(), preceding_size, following_size, min_periods, *req);
+  });
+  auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
+  state.add_element_count(static_cast<double>(num_rows) / elapsed_time / 1'000'000., "Mrows/s");
+  state.add_buffer_size(
+    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
+}
+
+template <typename Type>
+void bench_row_variable_rolling_sum(nvbench::state& state, nvbench::type_list<Type>)
+{
+  auto const num_rows       = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const preceding_size = static_cast<cudf::size_type>(state.get_int64("preceding_size"));
+  auto const following_size = static_cast<cudf::size_type>(state.get_int64("following_size"));
+
+  auto vals = [&]() {
+    data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution(
+      cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, 100);
+    return create_random_column(cudf::type_to_id<Type>(), row_count{num_rows}, profile);
+  }();
+
+  auto preceding = [&]() {
+    auto data = std::vector<cudf::size_type>(num_rows);
+    auto it   = thrust::make_counting_iterator<cudf::size_type>(0);
+    std::transform(it, it + num_rows, data.begin(), [num_rows, preceding_size](auto i) {
+      return std::min(i + 1, std::max(preceding_size, i + 1 - num_rows));
+    });
+    auto buf = rmm::device_buffer(
+      data.data(), num_rows * sizeof(cudf::size_type), cudf::get_default_stream());
+    cudf::get_default_stream().synchronize();
+    return std::make_unique<cudf::column>(cudf::data_type(cudf::type_to_id<cudf::size_type>()),
+                                          num_rows,
+                                          std::move(buf),
+                                          rmm::device_buffer{},
+                                          0);
+  }();
+
+  auto following = [&]() {
+    auto data = std::vector<cudf::size_type>(num_rows);
+    auto it   = thrust::make_counting_iterator<cudf::size_type>(0);
+    std::transform(it, it + num_rows, data.begin(), [num_rows, following_size](auto i) {
+      return std::max(-i - 1, std::min(following_size, num_rows - i - 1));
+    });
+    auto buf = rmm::device_buffer(
+      data.data(), num_rows * sizeof(cudf::size_type), cudf::get_default_stream());
+    cudf::get_default_stream().synchronize();
+    return std::make_unique<cudf::column>(cudf::data_type(cudf::type_to_id<cudf::size_type>()),
+                                          num_rows,
+                                          std::move(buf),
+                                          rmm::device_buffer{},
+                                          0);
+  }();
+
+  auto req = cudf::make_sum_aggregation<cudf::rolling_aggregation>();
+
+  auto const mem_stats_logger = cudf::memory_stats_logger();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    auto const result =
+      cudf::rolling_window(vals->view(), preceding->view(), following->view(), 1, *req);
+  });
+  auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
+  state.add_element_count(static_cast<double>(num_rows) / elapsed_time / 1'000'000., "Mrows/s");
+  state.add_buffer_size(
+    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
+}
+
+NVBENCH_BENCH_TYPES(bench_row_fixed_rolling_sum,
+                    NVBENCH_TYPE_AXES(nvbench::type_list<std::int32_t, double>))
+  .set_name("row_fixed_rolling_sum")
+  .add_int64_power_of_two_axis("num_rows", {14, 22, 28})
+  .add_int64_axis("preceding_size", {1, 10, 100})
+  .add_int64_axis("following_size", {2})
+  .add_int64_axis("min_periods", {1, 20});
+
+NVBENCH_BENCH_TYPES(bench_row_variable_rolling_sum,
+                    NVBENCH_TYPE_AXES(nvbench::type_list<std::int32_t, double>))
+  .set_name("row_variable_rolling_sum")
+  .add_int64_power_of_two_axis("num_rows", {14, 22, 28})
+  .add_int64_axis("preceding_size", {10, 100})
+  .add_int64_axis("following_size", {2});
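For the variable-window case exercised above, a short sketch with hand-written window columns (values illustrative): row i aggregates rows [i - preceding[i] + 1, i + following[i]], with preceding counting the current row.

#include <cudf/aggregation.hpp>
#include <cudf/rolling.hpp>
#include <cudf_test/column_wrapper.hpp>

void example()
{
  auto vals      = cudf::test::fixed_width_column_wrapper<int32_t>{{1, 2, 3, 4}};
  // Per-row window extents; each window must stay within the column bounds.
  auto preceding = cudf::test::fixed_width_column_wrapper<cudf::size_type>{{1, 2, 2, 2}};
  auto following = cudf::test::fixed_width_column_wrapper<cudf::size_type>{{1, 1, 1, 0}};
  auto agg       = cudf::make_sum_aggregation<cudf::rolling_aggregation>();
  auto result    = cudf::rolling_window(vals, preceding, following, 1, *agg);
}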
diff --git a/cpp/include/cudf/hashing.hpp b/cpp/include/cudf/hashing.hpp
index 307a52cd242..88034b4f804 100644
--- a/cpp/include/cudf/hashing.hpp
+++ b/cpp/include/cudf/hashing.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -166,6 +166,26 @@ std::unique_ptr<column> sha512(
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
+/**
+ * @brief Computes the XXHash_32 hash value of each row in the given table
+ *
+ * This function computes the hash of each column using the `seed` for the first column
+ * and the resulting hash as a seed for the next column and so on.
+ * The result is a uint32 value for each row.
+ *
+ * @param input The table of columns to hash
+ * @param seed Optional seed value to use for the hash function
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ *
+ * @returns A column where each row is the hash of a row from the input
+ */
+std::unique_ptr<column> xxhash_32(
+  table_view const& input,
+  uint32_t seed                     = DEFAULT_HASH_SEED,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
+
 /**
  * @brief Computes the XXHash_64 hash value of each row in the given table
  *
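A short usage sketch of the new public API (input values illustrative); per the doc comment above, per-column hashes are chained through the seed and the result is one UINT32 per row:

#include <cudf/hashing.hpp>
#include <cudf_test/column_wrapper.hpp>

void example()
{
  auto col    = cudf::test::fixed_width_column_wrapper<int32_t>{{0, 42, 825}};
  auto hashes = cudf::hashing::xxhash_32(cudf::table_view({col}), 0u);
  // hashes is a UINT32 column with one hash per input row
}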
diff --git a/cpp/include/cudf/hashing/detail/hashing.hpp b/cpp/include/cudf/hashing/detail/hashing.hpp
index 7cb80081a95..f796ff4526e 100644
--- a/cpp/include/cudf/hashing/detail/hashing.hpp
+++ b/cpp/include/cudf/hashing/detail/hashing.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -61,6 +61,11 @@ std::unique_ptr<column> sha512(table_view const& input,
                                rmm::cuda_stream_view stream,
                                rmm::device_async_resource_ref mr);
 
+std::unique_ptr<column> xxhash_32(table_view const& input,
+                                  uint32_t seed,
+                                  rmm::cuda_stream_view,
+                                  rmm::device_async_resource_ref mr);
+
 std::unique_ptr<column> xxhash_64(table_view const& input,
                                   uint64_t seed,
                                   rmm::cuda_stream_view,
                                   rmm::device_async_resource_ref mr);
diff --git a/cpp/include/cudf/hashing/detail/xxhash_32.cuh b/cpp/include/cudf/hashing/detail/xxhash_32.cuh
new file mode 100644
index 00000000000..bb6e7f18fbc
--- /dev/null
+++ b/cpp/include/cudf/hashing/detail/xxhash_32.cuh
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/fixed_point/fixed_point.hpp>
+#include <cudf/hashing.hpp>
+#include <cudf/hashing/detail/hash_functions.cuh>
+#include <cudf/lists/list_view.hpp>
+#include <cudf/strings/string_view.cuh>
+#include <cudf/structs/struct_view.hpp>
+#include <cudf/types.hpp>
+
+#include <cuco/hash_functions.cuh>
+#include <cuda/std/cstddef>
+
+namespace cudf::hashing::detail {
+
+template <typename Key>
+struct XXHash_32 {
+  using result_type = std::uint32_t;
+
+  CUDF_HOST_DEVICE constexpr XXHash_32(uint32_t seed = cudf::DEFAULT_HASH_SEED) : _impl{seed} {}
+
+  __device__ constexpr result_type operator()(Key const& key) const { return this->_impl(key); }
+
+  __device__ constexpr result_type compute_bytes(cuda::std::byte const* bytes,
+                                                 std::uint64_t size) const
+  {
+    return this->_impl.compute_hash(bytes, size);
+  }
+
+ private:
+  template <typename T>
+  __device__ constexpr result_type compute(T const& key) const
+  {
+    return this->compute_bytes(reinterpret_cast<cuda::std::byte const*>(&key), sizeof(T));
+  }
+
+  cuco::xxhash_32<Key> _impl;
+};
+
+template <>
+XXHash_32<bool>::result_type __device__ inline XXHash_32<bool>::operator()(bool const& key) const
+{
+  return this->compute(static_cast<uint8_t>(key));
+}
+
+template <>
+XXHash_32<float>::result_type __device__ inline XXHash_32<float>::operator()(float const& key) const
+{
+  return this->compute(normalize_nans_and_zeros(key));
+}
+
+template <>
+XXHash_32<double>::result_type __device__ inline XXHash_32<double>::operator()(
+  double const& key) const
+{
+  return this->compute(normalize_nans_and_zeros(key));
+}
+
+template <>
+XXHash_32<cudf::string_view>::result_type
+  __device__ inline XXHash_32<cudf::string_view>::operator()(cudf::string_view const& key) const
+{
+  return this->compute_bytes(reinterpret_cast<cuda::std::byte const*>(key.data()),
+                             key.size_bytes());
+}
+
+template <>
+XXHash_32<numeric::decimal32>::result_type
+  __device__ inline XXHash_32<numeric::decimal32>::operator()(numeric::decimal32 const& key) const
+{
+  return this->compute(key.value());
+}
+
+template <>
+XXHash_32<numeric::decimal64>::result_type
+  __device__ inline XXHash_32<numeric::decimal64>::operator()(numeric::decimal64 const& key) const
+{
+  return this->compute(key.value());
+}
+
+template <>
+XXHash_32<numeric::decimal128>::result_type
+  __device__ inline XXHash_32<numeric::decimal128>::operator()(numeric::decimal128 const& key) const
+{
+  return this->compute(key.value());
+}
+
+template <>
+XXHash_32<cudf::list_view>::result_type __device__ inline XXHash_32<cudf::list_view>::operator()(
+  cudf::list_view const& key) const
+{
+  CUDF_UNREACHABLE("List column hashing is not supported");
+}
+
+template <>
+XXHash_32<cudf::struct_view>::result_type
+  __device__ inline XXHash_32<cudf::struct_view>::operator()(cudf::struct_view const& key) const
+{
+  CUDF_UNREACHABLE("Direct hashing of struct_view is not supported");
+}
+
+}  // namespace cudf::hashing::detail
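Since XXHash_32 lives in a detail header, it is an internal, device-side building block rather than public API. A minimal sketch of applying it directly with Thrust (names and values illustrative, and the header path assumes this diff is applied):

#include <cudf/hashing/detail/xxhash_32.cuh>

#include <thrust/device_vector.h>
#include <thrust/transform.h>

void example()
{
  thrust::device_vector<int32_t> keys(3, 42);
  thrust::device_vector<std::uint32_t> hashes(keys.size());
  // The functor is constructed on the host and invoked per element on device.
  thrust::transform(keys.begin(), keys.end(), hashes.begin(),
                    cudf::hashing::detail::XXHash_32<int32_t>{0});  // seed = 0
}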
diff --git a/cpp/scripts/sort_ninja_log.py b/cpp/scripts/sort_ninja_log.py
index 42f84e4d0c7..e111367d191 100755
--- a/cpp/scripts/sort_ninja_log.py
+++ b/cpp/scripts/sort_ninja_log.py
@@ -1,8 +1,9 @@
 #
-# Copyright (c) 2021-2024, NVIDIA CORPORATION.
+# Copyright (c) 2021-2025, NVIDIA CORPORATION.
 #
 import argparse
 import os
+import re
 import sys
 import xml.etree.ElementTree as ET
 from pathlib import Path
@@ -144,6 +145,16 @@ def format_file_size(input_size):
     return file_size_str
 
 
+def replace_placeholder_patterns(input_string: str) -> str:
+    pattern = r'(_h_env_placehold)[_placehold]+'
+    return re.sub(pattern, r'\1...', input_string)
+
+
+# adjust name for display
+def format_file_name(name: str) -> str:
+    return replace_placeholder_patterns(name)
+
+
 # Output chart results in HTML format
 # Builds a standalone html file with no javascript or styles
 def output_html(entries, sorted_list, cmp_entries, args):
@@ -223,7 +234,8 @@ def output_html(entries, sorted_list, cmp_entries, args):
     print("", end="")
 
     # use a slightly smaller, fixed-width font
@@ -265,7 +277,8 @@ def output_html(entries, sorted_list, cmp_entries, args):
         file_size_str = format_file_size(file_size)
 
         # output entry row
-        print("", name, "", sep="", end="")
+        display_name = format_file_name(name)
+        print("", display_name, "", sep="", end="")
         print("", build_time_str, "", sep="", end="")
         print("", file_size_str, "", sep="", end="")
         # output diff column
diff --git a/cpp/src/hash/xxhash_32.cu b/cpp/src/hash/xxhash_32.cu
new file mode 100644
index 00000000000..40503f7f911
--- /dev/null
+++ b/cpp/src/hash/xxhash_32.cu
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <cudf/column/column_factories.hpp>
+#include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/utilities/algorithm.cuh>
+#include <cudf/hashing/detail/hashing.hpp>
+#include <cudf/hashing/detail/xxhash_32.cuh>
+#include <cudf/table/table_device_view.cuh>
+#include <cudf/utilities/memory_resource.hpp>
+#include <cudf/utilities/span.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <cuda/std/limits>
+#include <thrust/tabulate.h>
+
+namespace cudf {
+namespace hashing {
+namespace detail {
+
+namespace {
+
+/**
+ * @brief Computes the hash value of a row in the given table.
+ *
+ * @tparam Nullate A cudf::nullate type describing whether to check for nulls.
+ */
+template <typename Nullate>
+class device_row_hasher {
+ public:
+  device_row_hasher(Nullate nulls, table_device_view const& t, hash_value_type seed)
+    : _check_nulls(nulls), _table(t), _seed(seed)
+  {
+  }
+
+  __device__ auto operator()(size_type row_index) const noexcept
+  {
+    return cudf::detail::accumulate(
+      _table.begin(),
+      _table.end(),
+      _seed,
+      [row_index, nulls = _check_nulls] __device__(auto hash, auto column) {
+        return cudf::type_dispatcher(
+          column.type(), element_hasher_adapter{}, column, row_index, nulls, hash);
+      });
+  }
+
+  /**
+   * @brief Computes the hash value of an element in the given column.
+   */
+  class element_hasher_adapter {
+   public:
+    template <typename T, CUDF_ENABLE_IF(column_device_view::has_element_accessor<T>())>
+    __device__ hash_value_type operator()(column_device_view const& col,
+                                          size_type const row_index,
+                                          Nullate const _check_nulls,
+                                          hash_value_type const _seed) const noexcept
+    {
+      if (_check_nulls && col.is_null(row_index)) {
+        return cuda::std::numeric_limits<hash_value_type>::max();
+      }
+      auto const hasher = XXHash_32<T>{_seed};
+      return hasher(col.element<T>(row_index));
+    }
+
+    template <typename T, CUDF_ENABLE_IF(not column_device_view::has_element_accessor<T>())>
+    __device__ hash_value_type operator()(column_device_view const&,
+                                          size_type const,
+                                          Nullate const,
+                                          hash_value_type const) const noexcept
+    {
+      CUDF_UNREACHABLE("Unsupported type for XXHash_32");
+    }
+  };
+
+  Nullate const _check_nulls;
+  table_device_view const _table;
+  hash_value_type const _seed;
+};
+
+}  // namespace
+
+std::unique_ptr<column> xxhash_32(table_view const& input,
+                                  uint32_t seed,
+                                  rmm::cuda_stream_view stream,
+                                  rmm::device_async_resource_ref mr)
+{
+  auto output = make_numeric_column(data_type(type_to_id<hash_value_type>()),
+                                    input.num_rows(),
+                                    mask_state::UNALLOCATED,
+                                    stream,
+                                    mr);
+
+  // Return early if there's nothing to hash
+  if (input.num_columns() == 0 || input.num_rows() == 0) { return output; }
+
+  bool const nullable   = has_nulls(input);
+  auto const input_view = table_device_view::create(input, stream);
+  auto output_view      = output->mutable_view();
+
+  // Compute the hash value for each row
+  thrust::tabulate(rmm::exec_policy(stream),
+                   output_view.begin<hash_value_type>(),
+                   output_view.end<hash_value_type>(),
+                   device_row_hasher(nullable, *input_view, seed));
+
+  return output;
+}
+
+}  // namespace detail
+
+std::unique_ptr<column> xxhash_32(table_view const& input,
+                                  uint32_t seed,
+                                  rmm::cuda_stream_view stream,
+                                  rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::xxhash_32(input, seed, stream, mr);
+}
+
+}  // namespace hashing
+}  // namespace cudf
diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu
index 82d8152ca1c..113342e9cbf 100644
--- a/cpp/src/io/json/read_json.cu
+++ b/cpp/src/io/json/read_json.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -30,6 +30,7 @@ #include #include +#include #include #include @@ -37,12 +38,25 @@ #include #include +#include +#include + #include namespace cudf::io::json::detail { namespace { +namespace pools { + +BS::thread_pool& tpool() +{ + static BS::thread_pool _tpool(std::thread::hardware_concurrency()); + return _tpool; +} + +} // namespace pools + class compressed_host_buffer_source final : public datasource { public: explicit compressed_host_buffer_source(std::unique_ptr const& src, @@ -51,8 +65,8 @@ class compressed_host_buffer_source final : public datasource { { auto ch_buffer = host_span(reinterpret_cast(_dbuf_ptr->data()), _dbuf_ptr->size()); - if (comptype == compression_type::GZIP || comptype == compression_type::ZIP || - comptype == compression_type::SNAPPY) { + if (_comptype == compression_type::GZIP || _comptype == compression_type::ZIP || + _comptype == compression_type::SNAPPY) { _decompressed_ch_buffer_size = cudf::io::detail::get_uncompressed_size(_comptype, ch_buffer); } else { _decompressed_buffer = cudf::io::detail::decompress(_comptype, ch_buffer); @@ -96,7 +110,22 @@ class compressed_host_buffer_source final : public datasource { return std::make_unique(_decompressed_buffer.data() + offset, count); } - [[nodiscard]] bool supports_device_read() const override { return false; } + std::future device_read_async(size_t offset, + size_t size, + uint8_t* dst, + rmm::cuda_stream_view stream) override + { + auto& thread_pool = pools::tpool(); + return thread_pool.submit_task([this, offset, size, dst, stream] { + auto hbuf = host_read(offset, size); + CUDF_CUDA_TRY( + cudaMemcpyAsync(dst, hbuf->data(), hbuf->size(), cudaMemcpyHostToDevice, stream.value())); + stream.synchronize(); + return hbuf->size(); + }); + } + + [[nodiscard]] bool supports_device_read() const override { return true; } [[nodiscard]] size_t size() const override { return _decompressed_ch_buffer_size; } @@ -431,6 +460,8 @@ device_span ingest_raw_input(device_span buffer, // line of file i+1 don't end up on the same JSON line, if file i does not already end with a line // delimiter. auto constexpr num_delimiter_chars = 1; + std::vector> thread_tasks; + auto stream_pool = cudf::detail::fork_streams(stream, pools::tpool().get_thread_count()); auto delimiter_map = cudf::detail::make_empty_host_vector(sources.size(), stream); std::vector prefsum_source_sizes(sources.size()); @@ -447,13 +478,17 @@ device_span ingest_raw_input(device_span buffer, auto const total_bytes_to_read = std::min(range_size, prefsum_source_sizes.back() - range_offset); range_offset -= start_source ? 
prefsum_source_sizes[start_source - 1] : 0; - for (std::size_t i = start_source; i < sources.size() && bytes_read < total_bytes_to_read; i++) { + for (std::size_t i = start_source, cur_stream = 0; + i < sources.size() && bytes_read < total_bytes_to_read; + i++) { if (sources[i]->is_empty()) continue; auto data_size = std::min(sources[i]->size() - range_offset, total_bytes_to_read - bytes_read); auto destination = reinterpret_cast(buffer.data()) + bytes_read + (num_delimiter_chars * delimiter_map.size()); - if (sources[i]->is_device_read_preferred(data_size)) { - bytes_read += sources[i]->device_read(range_offset, data_size, destination, stream); + if (sources[i]->supports_device_read()) { + thread_tasks.emplace_back(sources[i]->device_read_async( + range_offset, data_size, destination, stream_pool[cur_stream++ % stream_pool.size()])); + bytes_read += data_size; } else { h_buffers.emplace_back(sources[i]->host_read(range_offset, data_size)); auto const& h_buffer = h_buffers.back(); @@ -481,6 +516,15 @@ device_span ingest_raw_input(device_span buffer, buffer.data()); } stream.synchronize(); + + if (thread_tasks.size()) { + auto const bytes_read = std::accumulate( + thread_tasks.begin(), thread_tasks.end(), std::size_t{0}, [](std::size_t sum, auto& task) { + return sum + task.get(); + }); + CUDF_EXPECTS(bytes_read == total_bytes_to_read, "something's fishy"); + } + return buffer.first(bytes_read + (delimiter_map.size() * num_delimiter_chars)); } @@ -505,10 +549,17 @@ table_with_metadata read_json(host_span> sources, return read_json_impl(sources, reader_opts, stream, mr); std::vector> compressed_sources; - for (size_t i = 0; i < sources.size(); i++) { - compressed_sources.emplace_back( - std::make_unique(sources[i], reader_opts.get_compression())); + std::vector>> thread_tasks; + auto& thread_pool = pools::tpool(); + for (auto& src : sources) { + thread_tasks.emplace_back(thread_pool.submit_task([&reader_opts, &src] { + return std::make_unique(src, reader_opts.get_compression()); + })); } + std::transform(thread_tasks.begin(), + thread_tasks.end(), + std::back_inserter(compressed_sources), + [](auto& task) { return task.get(); }); // in read_json_impl, we need the compressed source size to actually be the // uncompressed source size for correct batching return read_json_impl(compressed_sources, reader_opts, stream, mr); diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu index 7facc6497ed..469f933f918 100644 --- a/cpp/src/io/orc/dict_enc.cu +++ b/cpp/src/io/orc/dict_enc.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,7 @@ #include #include +#include #include #include diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 1572b7246c0..1f84d1f81dc 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -132,6 +132,177 @@ struct orcdec_state_s { } vals; }; +/** + * @brief Manage caching of the first run of TIMESTAMP's DATA stream for a row group. 
+ * + * This class is used to address a special case, where the first run of the DATA stream spans two + * adjacent row groups and its length is greater than the maximum length allowed to be consumed. + * This limit is imposed by the decoder when processing the SECONDARY stream. This class shall be + * instantiated in the shared memory, and be used to cache the DATA stream with a decoded data type + * of `int64_t`. As an optimization, the actual cache is implemented in the cache_helper class as a + * local variable and does not reside in the shared memory. + */ +class run_cache_manager { + private: + enum class status : uint8_t { + DISABLED, ///< Run cache manager is disabled. No caching will be performed. If the special case + ///< happens, the run cache manager will be set to this status after the cache read + ///< is completed. This status also applies when the special case does not happen. + CAN_WRITE_TO_CACHE, ///< Run cache manager is ready for write. If the special case happens, the + ///< run cache manager will be set to this status. + CAN_READ_FROM_CACHE, ///< Run cache manager is ready for read. If the special case happens, the + ///< run cache manager will be set to this status after the cache write is + ///< completed. + }; + + public: + /** + * @brief Initialize the run cache manager. + * + * @param[in] s ORC decoder state. + */ + __device__ void initialize(orcdec_state_s* s) + { + _status = (s->top.data.index.run_pos[CI_DATA2] > 0 and s->chunk.type_kind == TIMESTAMP) + ? status::CAN_WRITE_TO_CACHE + : status::DISABLED; + _reusable_length = 0; + _run_length = 0; + } + + private: + status _status; ///< The status of the run cache manager. + uint32_t + _reusable_length; ///< The number of data to be cached and reused later. For example, if a run + ///< has a length of 512 but the maximum length allowed to be consumed is + ///< capped at 162, then 350 (512-162) data will be cached. + uint32_t _run_length; ///< The length of the run, 512 in the above example. + friend class cache_helper; +}; + +/** + * @brief Helper class to help run_cache_manager cache the first run of TIMESTAMP's DATA stream for + * a row group. + * + * The run_cache_manager is intended to be stored in the shared memory, whereas the actual cache is + * in the local storage (as an optimization). If a function is to use run_cache_manager, both the + * manager and the cache objects need to be passed. This class is introduced to simplify the + * function call, so that only a single cache_helper object needs to be passed. To that end, public + * methods originally belonging to run_cache_manager have been moved to this class. + */ +class cache_helper { + public: + /** + * @brief Constructor. + * + * @param[in] run_cache_manager_inst An instance of run_cache_manager. + */ + __device__ explicit cache_helper(run_cache_manager& run_cache_manager_inst) + : _manager(run_cache_manager_inst) + { + } + + /** + * @brief Set the reusable length object. + * + * @param[in] run_length The length of the first run (spanning two adjacent row groups) of the + * DATA stream. + * @param[in] max_length The maximum length allowed to be consumed. This limit is imposed + * by the decoder when processing the SECONDARY stream. + */ + __device__ void set_reusable_length(uint32_t run_length, uint32_t max_length) + { + if (_manager._status == run_cache_manager::status::CAN_WRITE_TO_CACHE) { + _manager._run_length = run_length; + _manager._reusable_length = + (_manager._run_length > max_length) ? 
(_manager._run_length - max_length) : 0; + } + } + + /** + * @brief Adjust the maximum length allowed to be consumed when the length of the first run is + * greater than it. + * + * @param[in] max_length The maximum length allowed to be consumed for the DATA stream. + * @return A new maximum length. + */ + [[nodiscard]] __device__ uint32_t adjust_max_length(uint32_t max_length) + { + auto new_max_length{max_length}; + if (_manager._status == run_cache_manager::status::CAN_READ_FROM_CACHE) { + new_max_length -= _manager._reusable_length; + } + return new_max_length; + } + + /** + * @brief Copy the excess data from the intermediate buffer for the DATA stream to the cache. + * + * @param[in] src Intermediate buffer for the DATA stream. + */ + __device__ void write_to_cache(int64_t* src) + { + if (_manager._status != run_cache_manager::status::CAN_WRITE_TO_CACHE) { return; } + + auto const tid = threadIdx.x; + + __syncthreads(); + + // All threads in the block always take a uniform code path for the following branches. + // _reusable_length ranges between [0, 512]. + if (_manager._reusable_length > 0) { + auto const length_to_skip = _manager._run_length - _manager._reusable_length; + if (tid < _manager._reusable_length) { + auto const src_idx = tid + length_to_skip; + _storage = src[src_idx]; + } + if (tid == 0) { _manager._status = run_cache_manager::status::CAN_READ_FROM_CACHE; } + } else { + if (tid == 0) { _manager._status = run_cache_manager::status::DISABLED; } + } + + __syncthreads(); + } + + /** + * @brief Copy the cached data to the intermediate buffer for the DATA stream. + * + * @param[in,out] dst Intermediate buffer for the DATA stream. + * @param[in,out] rle Run length decoder state object. + */ + __device__ void read_from_cache(int64_t* dst, orc_rlev2_state_s* rle) + { + if (_manager._status != run_cache_manager::status::CAN_READ_FROM_CACHE) { return; } + + auto const tid = threadIdx.x; + + // First, shift the data up + auto const dst_idx = tid + _manager._reusable_length; + auto const v = (dst_idx < rle->num_vals + _manager._reusable_length) ? dst[tid] : 0; + __syncthreads(); + + if (dst_idx < rle->num_vals + _manager._reusable_length) { dst[dst_idx] = v; } + __syncthreads(); + + // Second, insert the cached data + if (tid < _manager._reusable_length) { dst[tid] = _storage; } + __syncthreads(); + + if (tid == 0) { + // Disable the run cache manager, since cache write-and-read happens at most once per row + // group. + _manager._status = run_cache_manager::status::DISABLED; + rle->num_vals += _manager._reusable_length; + } + + __syncthreads(); + } + + private: + run_cache_manager& _manager; ///< An instance of run_cache_manager. + int64_t _storage; ///< Per-thread cache storage. +}; + /** * @brief Initializes byte stream, modifying length and start position to keep the read pointer * 8-byte aligned. @@ -631,6 +802,8 @@ static const __device__ __constant__ uint8_t ClosestFixedBitsMap[65] = { * @param[in] maxvals maximum number of values to decode * @param[in] t thread id * @param[in] has_buffered_values If true, means there are already buffered values + * @param[in] cache_helper_inst If non-null, the run cache manager will be used to manage + * caching of the first run of the DATA stream. 
* * @return number of values decoded */ @@ -640,9 +813,11 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs, T* vals, uint32_t maxvals, int t, - bool has_buffered_values = false) + bool has_buffered_values = false, + cache_helper* cache_helper_inst = nullptr) { if (t == 0) { + if (cache_helper_inst != nullptr) { maxvals = cache_helper_inst->adjust_max_length(maxvals); } uint32_t maxpos = min(bs->len, bs->pos + (bytestream_buffer_size - 8u)); uint32_t lastpos = bs->pos; auto numvals = 0; @@ -685,6 +860,9 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs, l += deltapos; } } + + if (cache_helper_inst != nullptr) { cache_helper_inst->set_reusable_length(n, maxvals); } + if ((numvals != 0) and (numvals + n > maxvals)) break; // case where there are buffered values and can't consume a whole chunk // from decoded values, so skip adding any more to buffer, work on buffered values and then @@ -866,6 +1044,17 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs, __syncwarp(); } __syncthreads(); + // Currently run_cache_manager is only designed to fix the TIMESTAMP's DATA stream bug where the + // data type is int64_t. + if constexpr (cuda::std::is_same_v) { + if (cache_helper_inst != nullptr) { + // Run cache is read from during the 2nd iteration of the top-level while loop in + // gpuDecodeOrcColumnData(). + cache_helper_inst->read_from_cache(vals, rle); + // Run cache is written to during the 1st iteration of the loop. + cache_helper_inst->write_to_cache(vals); + } + } return rle->num_vals; } @@ -1401,6 +1590,8 @@ CUDF_KERNEL void __launch_bounds__(block_size) // Struct doesn't have any data in itself, so skip bool const is_valid = s->chunk.type_kind != STRUCT; size_t const max_num_rows = s->chunk.column_num_rows; + __shared__ run_cache_manager run_cache_manager_inst; + cache_helper cache_helper_inst(run_cache_manager_inst); if (t == 0 and is_valid) { // If we have an index, seek to the initial run and update row positions if (num_rowgroups > 0) { @@ -1443,6 +1634,8 @@ CUDF_KERNEL void __launch_bounds__(block_size) bytestream_init(&s->bs, s->chunk.streams[CI_DATA], s->chunk.strm_len[CI_DATA]); bytestream_init(&s->bs2, s->chunk.streams[CI_DATA2], s->chunk.strm_len[CI_DATA2]); + + run_cache_manager_inst.initialize(s); } __syncthreads(); @@ -1602,7 +1795,13 @@ CUDF_KERNEL void __launch_bounds__(block_size) if (is_rlev1(s->chunk.encoding_kind)) { numvals = Integer_RLEv1(bs, &s->u.rlev1, s->vals.i64, numvals, t); } else { - numvals = Integer_RLEv2(bs, &s->u.rlev2, s->vals.i64, numvals, t); + numvals = Integer_RLEv2(bs, + &s->u.rlev2, + s->vals.i64, + numvals, + t, + false /**has_buffered_values */, + &cache_helper_inst); } if (s->chunk.type_kind == DECIMAL) { // If we're using an index, we may have to drop values from the initial run diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index b5f9b894c46..0d40a1f7b1b 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,6 +18,7 @@ #include #include +#include #include #include diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 9acbe026bb2..32bb3349666 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -961,9 +961,6 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) return; } - // if we have no work to do (eg, in a skip_rows/num_rows case) in this page. - if (s->num_rows == 0) { return; } - using value_decoder_type = std::conditional_t< split_decode_t, decode_fixed_width_split_values_func, diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index c48ff896e33..f9fcca6bb4f 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -97,38 +97,24 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num _stream); } - // Compute column string sizes (using page string offsets) for this subpass + // Compute column string sizes (using page string offsets) for this output table chunk col_string_sizes = calculate_page_string_offsets(); - // ensure cumulative column string sizes have been initialized - if (pass.cumulative_col_string_sizes.empty()) { - pass.cumulative_col_string_sizes.resize(_input_columns.size(), 0); - } - - // Add to the cumulative column string sizes of this pass - std::transform(pass.cumulative_col_string_sizes.begin(), - pass.cumulative_col_string_sizes.end(), - col_string_sizes.begin(), - pass.cumulative_col_string_sizes.begin(), - std::plus<>{}); - // Check for overflow in cumulative column string sizes of this pass so that the page string // offsets of overflowing (large) string columns are treated as 64-bit. auto const threshold = static_cast(strings::detail::get_offset64_threshold()); - auto const has_large_strings = std::any_of(pass.cumulative_col_string_sizes.cbegin(), - pass.cumulative_col_string_sizes.cend(), + auto const has_large_strings = std::any_of(col_string_sizes.cbegin(), + col_string_sizes.cend(), [=](std::size_t sz) { return sz > threshold; }); if (has_large_strings and not strings::detail::is_large_strings_enabled()) { CUDF_FAIL("String column exceeds the column size limit", std::overflow_error); } - // Mark any chunks for which the cumulative column string size has exceeded the - // large strings threshold - if (has_large_strings) { - for (auto& chunk : pass.chunks) { - auto const idx = chunk.src_col_index; - if (pass.cumulative_col_string_sizes[idx] > threshold) { chunk.is_large_string_col = true; } - } + // Mark/unmark column-chunk descriptors depending on the string sizes of corresponding output + // column chunks and the large strings threshold. 
+ for (auto& chunk : pass.chunks) { + auto const idx = chunk.src_col_index; + chunk.is_large_string_col = (col_string_sizes[idx] > threshold); } } @@ -210,11 +196,9 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num // only do string buffer for leaf if (idx == max_depth - 1 and out_buf.string_size() == 0 and col_string_sizes[pass.chunks[c].src_col_index] > 0) { - out_buf.create_string_data( - col_string_sizes[pass.chunks[c].src_col_index], - pass.cumulative_col_string_sizes[pass.chunks[c].src_col_index] > - static_cast(strings::detail::get_offset64_threshold()), - _stream); + out_buf.create_string_data(col_string_sizes[pass.chunks[c].src_col_index], + pass.chunks[c].is_large_string_col, + _stream); } if (has_strings) { str_data[idx] = out_buf.string_data(); } out_buf.user_data |= @@ -416,11 +400,11 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num final_offsets.emplace_back(offset); out_buf.user_data |= PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED; } else if (out_buf.type.id() == type_id::STRING) { - // need to cap off the string offsets column - auto const sz = static_cast(col_string_sizes[idx]); - if (sz <= strings::detail::get_offset64_threshold()) { + // only if it is not a large strings column + if (col_string_sizes[idx] <= + static_cast(strings::detail::get_offset64_threshold())) { out_buffers.emplace_back(static_cast(out_buf.data()) + out_buf.size); - final_offsets.emplace_back(sz); + final_offsets.emplace_back(static_cast(col_string_sizes[idx])); } } } diff --git a/cpp/src/io/parquet/reader_impl_chunking.hpp b/cpp/src/io/parquet/reader_impl_chunking.hpp index ca46f198bb8..4a773fbced1 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.hpp +++ b/cpp/src/io/parquet/reader_impl_chunking.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -130,9 +130,6 @@ struct pass_intermediate_data { rmm::device_buffer decomp_dict_data{0, cudf::get_default_stream()}; rmm::device_uvector str_dict_index{0, cudf::get_default_stream()}; - // cumulative strings column sizes. - std::vector cumulative_col_string_sizes{}; - int level_type_size{0}; // skip_rows / num_rows for this pass. diff --git a/cpp/src/join/join_common_utils.cuh b/cpp/src/join/join_common_utils.cuh index 4f75908fe72..37c5698f654 100644 --- a/cpp/src/join/join_common_utils.cuh +++ b/cpp/src/join/join_common_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include #include #include +#include #include #include diff --git a/cpp/src/join/mixed_join_kernel.cuh b/cpp/src/join/mixed_join_kernel.cuh index 368b1fba870..4565626edad 100644 --- a/cpp/src/join/mixed_join_kernel.cuh +++ b/cpp/src/join/mixed_join_kernel.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -37,8 +37,6 @@ namespace detail { namespace cg = cooperative_groups; -#pragma GCC diagnostic ignored "-Wattributes" - template CUDF_KERNEL void __launch_bounds__(block_size) mixed_join(table_device_view left_table, diff --git a/cpp/src/join/mixed_join_kernels_semi.cu b/cpp/src/join/mixed_join_kernels_semi.cu index a4ec97af235..4c063b6202e 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cu +++ b/cpp/src/join/mixed_join_kernels_semi.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,8 +30,6 @@ namespace detail { namespace cg = cooperative_groups; -#pragma GCC diagnostic ignored "-Wattributes" - template CUDF_KERNEL void __launch_bounds__(block_size) mixed_join_semi(table_device_view left_table, diff --git a/cpp/src/join/mixed_join_size_kernel.cuh b/cpp/src/join/mixed_join_size_kernel.cuh index 98170ed719a..869d05ce4d3 100644 --- a/cpp/src/join/mixed_join_size_kernel.cuh +++ b/cpp/src/join/mixed_join_size_kernel.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -34,8 +34,6 @@ namespace cudf { namespace detail { namespace cg = cooperative_groups; -#pragma GCC diagnostic ignored "-Wattributes" - template CUDF_KERNEL void __launch_bounds__(block_size) compute_mixed_join_output_size(table_device_view left_table, diff --git a/cpp/src/utilities/host_memory.cpp b/cpp/src/utilities/host_memory.cpp index 73c4567d3a4..94d27d976c3 100644 --- a/cpp/src/utilities/host_memory.cpp +++ b/cpp/src/utilities/host_memory.cpp @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "io/utilities/getenv_or.hpp" + #include #include #include @@ -277,7 +279,7 @@ bool config_default_pinned_memory_resource(pinned_mr_options const& opts) CUDF_EXPORT auto& kernel_pinned_copy_threshold() { // use cudaMemcpyAsync for all pinned copies - static std::atomic threshold = 0; + static std::atomic threshold = getenv_or("LIBCUDF_KERNEL_PINNED_COPY_THRESHOLD", 0); return threshold; } @@ -291,7 +293,7 @@ size_t get_kernel_pinned_copy_threshold() { return kernel_pinned_copy_threshold( CUDF_EXPORT auto& allocate_host_as_pinned_threshold() { // use pageable memory for all host allocations - static std::atomic threshold = 0; + static std::atomic threshold = getenv_or("LIBCUDF_ALLOCATE_HOST_AS_PINNED_THRESHOLD", 0); return threshold; } diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 46ae5273853..35877ac34b9 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -192,6 +192,7 @@ ConfigureTest( hashing/sha256_test.cpp hashing/sha384_test.cpp hashing/sha512_test.cpp + hashing/xxhash_32_test.cpp hashing/xxhash_64_test.cpp ) diff --git a/cpp/tests/hashing/xxhash_32_test.cpp b/cpp/tests/hashing/xxhash_32_test.cpp new file mode 100644 index 00000000000..9e3c66b0d0b --- /dev/null +++ b/cpp/tests/hashing/xxhash_32_test.cpp @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_utilities.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/iterator_utilities.hpp>
+
+#include <cudf/hashing.hpp>
+
+class XXHash_32_Test : public cudf::test::BaseFixture {};
+
+TEST_F(XXHash_32_Test, TestInteger)
+{
+  auto col1           = cudf::test::fixed_width_column_wrapper<int32_t>{{0, 42, 825}};
+  auto constexpr seed = 0u;
+  auto const output   = cudf::hashing::xxhash_32(cudf::table_view({col1}), seed);
+
+  // Expected results were generated with the reference implementation:
+  // https://github.com/Cyan4973/xxHash/blob/dev/xxhash.h
+  auto expected =
+    cudf::test::fixed_width_column_wrapper<uint32_t>({148298089u, 1161967057u, 1066694813u});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(output->view(), expected);
+}
+
+TEST_F(XXHash_32_Test, TestDouble)
+{
+  auto col1           = cudf::test::fixed_width_column_wrapper<double>{{-8., 25., 90.}};
+  auto constexpr seed = 42u;
+
+  auto const output = cudf::hashing::xxhash_32(cudf::table_view({col1}), seed);
+
+  // Expected results were generated with the reference implementation:
+  // https://github.com/Cyan4973/xxHash/blob/dev/xxhash.h
+  auto expected =
+    cudf::test::fixed_width_column_wrapper<uint32_t>({2276435783u, 3120212431u, 3454197470u});
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(output->view(), expected);
+}
+
+TEST_F(XXHash_32_Test, StringType)
+{
+  auto col1           = cudf::test::strings_column_wrapper({"I", "am", "AI"});
+  auto constexpr seed = 825u;
+
+  auto output = cudf::hashing::xxhash_32(cudf::table_view({col1}), seed);
+
+  // Expected results were generated with the reference implementation:
+  // https://github.com/Cyan4973/xxHash/blob/dev/xxhash.h
+  auto expected =
+    cudf::test::fixed_width_column_wrapper<uint32_t>({320624298u, 1612654309u, 1409499009u});
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(output->view(), expected);
+}
diff --git a/cpp/tests/utilities_tests/logger_tests.cpp b/cpp/tests/utilities_tests/logger_tests.cpp
index 58396115a54..b5d20325b75 100644
--- a/cpp/tests/utilities_tests/logger_tests.cpp
+++ b/cpp/tests/utilities_tests/logger_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -55,7 +55,7 @@ TEST_F(LoggerTest, DefaultLevel) cudf::default_logger().warn("warn"); cudf::default_logger().error("error"); cudf::default_logger().critical("critical"); - ASSERT_EQ(this->sink_content(), "info\nwarn\nerror\ncritical\n"); + ASSERT_EQ(this->sink_content(), "warn\nerror\ncritical\n"); } TEST_F(LoggerTest, CustomLevel) diff --git a/dependencies.yaml b/dependencies.yaml index b0f217a6770..4672a355c72 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -688,7 +688,7 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - cachetools - - &numba-cuda-dep numba-cuda>=0.0.13,<0.0.18 + - &numba-cuda-dep numba-cuda>=0.2.0,<0.3.0 - nvtx>=0.2.1 - packaging - rich @@ -747,7 +747,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - polars>=1.11,<1.15 + - polars>=1.11,<1.18 run_cudf_polars_experimental: common: - output_types: [conda, requirements, pyproject] @@ -810,11 +810,11 @@ dependencies: matrices: - matrix: {dependencies: "oldest"} packages: - - *numba-cuda-dep + - numba-cuda==0.2.0 - pandas==2.0.* - matrix: {dependencies: "latest"} packages: - - numba-cuda==0.0.15 + - *numba-cuda-dep - pandas==2.2.3 - matrix: packages: diff --git a/java/src/main/java/ai/rapids/cudf/ParquetChunkedReader.java b/java/src/main/java/ai/rapids/cudf/ParquetChunkedReader.java index 53af52eff07..5e544e92a77 100644 --- a/java/src/main/java/ai/rapids/cudf/ParquetChunkedReader.java +++ b/java/src/main/java/ai/rapids/cudf/ParquetChunkedReader.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -62,12 +62,13 @@ public ParquetChunkedReader(long chunkSizeByteLimit, ParquetOptions opts, File f * @param filePath Full path of the input Parquet file to read. 
*/ public ParquetChunkedReader(long chunkSizeByteLimit, long passReadLimit, ParquetOptions opts, File filePath) { - handle = create(chunkSizeByteLimit, passReadLimit, opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), - filePath.getAbsolutePath(), 0, 0, opts.timeUnit().typeId.getNativeId()); - + long[] handles = create(chunkSizeByteLimit, passReadLimit, opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), + filePath.getAbsolutePath(), null, opts.timeUnit().typeId.getNativeId()); + handle = handles[0]; if (handle == 0) { throw new IllegalStateException("Cannot create native chunked Parquet reader object."); } + multiHostBufferSourceHandle = handles[1]; } /** @@ -100,12 +101,41 @@ public ParquetChunkedReader(long chunkSizeByteLimit, ParquetOptions opts, HostMe public ParquetChunkedReader(long chunkSizeByteLimit, long passReadLimit, ParquetOptions opts, HostMemoryBuffer buffer, long offset, long len) { - handle = create(chunkSizeByteLimit,passReadLimit, opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), null, - buffer.getAddress() + offset, len, opts.timeUnit().typeId.getNativeId()); + long[] addrsSizes = new long[]{ buffer.getAddress() + offset, len }; + long[] handles = create(chunkSizeByteLimit,passReadLimit, opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), null, + addrsSizes, opts.timeUnit().typeId.getNativeId()); + handle = handles[0]; + if (handle == 0) { + throw new IllegalStateException("Cannot create native chunked Parquet reader object."); + } + multiHostBufferSourceHandle = handles[1]; + } + /** + * Construct the reader instance from a read limit and data in host memory buffers. + * + * @param chunkSizeByteLimit Limit on total number of bytes to be returned per read, + * or 0 if there is no limit. + * @param passReadLimit Limit on the amount of memory used for reading and decompressing data or + * 0 if there is no limit + * @param opts The options for Parquet reading. + * @param buffers Array of buffers containing the file data. The buffers are logically + * concatenated to construct the file being read. + */ + public ParquetChunkedReader(long chunkSizeByteLimit, long passReadLimit, + ParquetOptions opts, HostMemoryBuffer... buffers) { + long[] addrsSizes = new long[buffers.length * 2]; + for (int i = 0; i < buffers.length; i++) { + addrsSizes[i * 2] = buffers[i].getAddress(); + addrsSizes[(i * 2) + 1] = buffers[i].getLength(); + } + long[] handles = create(chunkSizeByteLimit,passReadLimit, opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), null, + addrsSizes, opts.timeUnit().typeId.getNativeId()); + handle = handles[0]; if (handle == 0) { throw new IllegalStateException("Cannot create native chunked Parquet reader object."); } + multiHostBufferSourceHandle = handles[1]; } /** @@ -181,6 +211,10 @@ public void close() { DataSourceHelper.destroyWrapperDataSource(dataSourceHandle); dataSourceHandle = 0; } + if (multiHostBufferSourceHandle != 0) { + destroyMultiHostBufferSource(multiHostBufferSourceHandle); + multiHostBufferSourceHandle = 0; + } } @@ -196,6 +230,8 @@ public void close() { private long dataSourceHandle = 0; + private long multiHostBufferSourceHandle = 0; + /** * Create a native chunked Parquet reader object on heap and return its memory address. * @@ -206,13 +242,12 @@ public void close() { * @param filterColumnNames Name of the columns to read, or an empty array if we want to read all. * @param binaryToString Whether to convert the corresponding column to String if it is binary. 
* @param filePath Full path of the file to read, or given as null if reading from a buffer. - * @param bufferAddrs The address of a buffer to read from, or 0 if we are not using that buffer. - * @param length The length of the buffer to read from. + * @param bufferAddrsSizes The address and size pairs of buffers to read from, or null if we are not using buffers. * @param timeUnit Return type of time unit for timestamps. */ - private static native long create(long chunkSizeByteLimit, long passReadLimit, - String[] filterColumnNames, boolean[] binaryToString, - String filePath, long bufferAddrs, long length, int timeUnit); + private static native long[] create(long chunkSizeByteLimit, long passReadLimit, + String[] filterColumnNames, boolean[] binaryToString, + String filePath, long[] bufferAddrsSizes, int timeUnit); private static native long createWithDataSource(long chunkedSizeByteLimit, String[] filterColumnNames, boolean[] binaryToString, int timeUnit, long dataSourceHandle); @@ -222,4 +257,6 @@ private static native long createWithDataSource(long chunkedSizeByteLimit, private static native long[] readChunk(long handle); private static native void close(long handle); + + private static native void destroyMultiHostBufferSource(long handle); } diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index b01ce31b1f3..298f2cff6f3 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -313,12 +313,11 @@ private static native long readAndInferJSON(long address, long length, * all of them * @param binaryToString whether to convert this column to String if binary * @param filePath the path of the file to read, or null if no path should be read. - * @param address the address of the buffer to read from or 0 if we should not. - * @param length the length of the buffer to read from. + * @param addrsAndSizes the address and size pairs for every buffer or null for no buffers. * @param timeUnit return type of TimeStamp in units */ private static native long[] readParquet(String[] filterColumnNames, boolean[] binaryToString, String filePath, - long address, long length, int timeUnit) throws CudfException; + long[] addrsAndSizes, int timeUnit) throws CudfException; private static native long[] readParquetFromDataSource(String[] filterColumnNames, boolean[] binaryToString, int timeUnit, @@ -1357,7 +1356,7 @@ public static Table readParquet(File path) { */ public static Table readParquet(ParquetOptions opts, File path) { return new Table(readParquet(opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), - path.getAbsolutePath(), 0, 0, opts.timeUnit().typeId.getNativeId())); + path.getAbsolutePath(), null, opts.timeUnit().typeId.getNativeId())); } /** @@ -1402,6 +1401,14 @@ public static Table readParquet(ParquetOptions opts, byte[] buffer, long offset, } } + /** + * Read parquet formatted data. + * @param opts various parquet parsing options. + * @param buffer raw parquet formatted bytes. + * @param offset the starting offset into buffer. + * @param len the number of bytes to parse. + * @return the data parsed as a table on the GPU. 
+ */ public static Table readParquet(ParquetOptions opts, byte[] buffer, long offset, long len) { return readParquet(opts, buffer, offset, len, DefaultHostMemoryAllocator.get()); } @@ -1422,10 +1429,35 @@ public static Table readParquet(ParquetOptions opts, HostMemoryBuffer buffer, assert len > 0; assert len <= buffer.getLength() - offset; assert offset >= 0 && offset < buffer.length; + long[] addrsSizes = new long[]{ buffer.getAddress() + offset, len }; + return new Table(readParquet(opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), + null, addrsSizes, opts.timeUnit().typeId.getNativeId())); + } + + /** + * Read parquet formatted data. + * @param opts various parquet parsing options. + * @param buffers Buffers containing the Parquet data. The buffers are logically concatenated + * in order to construct the file being read. + * @return the data parsed as a table on the GPU. + */ + public static Table readParquet(ParquetOptions opts, HostMemoryBuffer... buffers) { + assert buffers.length > 0; + long[] addrsSizes = new long[buffers.length * 2]; + for (int i = 0; i < buffers.length; i++) { + addrsSizes[i * 2] = buffers[i].getAddress(); + addrsSizes[(i * 2) + 1] = buffers[i].getLength(); + } return new Table(readParquet(opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), - null, buffer.getAddress() + offset, len, opts.timeUnit().typeId.getNativeId())); + null, addrsSizes, opts.timeUnit().typeId.getNativeId())); } + /** + * Read parquet formatted data. + * @param opts various parquet parsing options. + * @param ds custom datasource to provide the Parquet file data + * @return the data parsed as a table on the GPU. + */ public static Table readParquet(ParquetOptions opts, DataSource ds) { long dataSourceHandle = DataSourceHelper.createWrapperDataSource(ds); try { diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index 9ff43feeac6..bd1714aa476 100644 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -156,8 +156,9 @@ add_library( src/ScalarJni.cpp src/TableJni.cpp src/aggregation128_utils.cu - src/maps_column_view.cu src/check_nvcomp_output_sizes.cu + src/maps_column_view.cu + src/multi_host_buffer_source.cpp ) # Disable NVTX if necessary diff --git a/java/src/main/native/include/multi_host_buffer_source.hpp b/java/src/main/native/include/multi_host_buffer_source.hpp new file mode 100644 index 00000000000..2aedb2321e4 --- /dev/null +++ b/java/src/main/native/include/multi_host_buffer_source.hpp @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#pragma once
+
+#include "jni_utils.hpp"
+
+#include <cudf/io/datasource.hpp>
+
+#include <vector>
+
+namespace cudf {
+namespace jni {
+
+/**
+ * @brief A custom datasource providing data from an array of host memory buffers.
+ */
+class multi_host_buffer_source : public cudf::io::datasource {
+  std::vector<uint8_t const*> addrs_;
+  std::vector<size_t> offsets_;
+
+  size_t locate_offset_index(size_t offset);
+
+ public:
+  explicit multi_host_buffer_source(native_jlongArray const& addrs_sizes);
+  std::unique_ptr<buffer> host_read(size_t offset, size_t size) override;
+  size_t host_read(size_t offset, size_t size, uint8_t* dst) override;
+  bool supports_device_read() const override { return true; }
+  bool is_device_read_preferred(size_t size) const override { return true; }
+  std::unique_ptr<buffer> device_read(size_t offset,
+                                      size_t size,
+                                      rmm::cuda_stream_view stream) override;
+  size_t device_read(size_t offset,
+                     size_t size,
+                     uint8_t* dst,
+                     rmm::cuda_stream_view stream) override;
+  std::future<size_t> device_read_async(size_t offset,
+                                        size_t size,
+                                        uint8_t* dst,
+                                        rmm::cuda_stream_view stream) override;
+  size_t size() const override { return offsets_.back(); }
+};
+
+}  // namespace jni
+}  // namespace cudf
diff --git a/java/src/main/native/src/ChunkedReaderJni.cpp b/java/src/main/native/src/ChunkedReaderJni.cpp
index cf04a87262f..4967e0b2b04 100644
--- a/java/src/main/native/src/ChunkedReaderJni.cpp
+++ b/java/src/main/native/src/ChunkedReaderJni.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -16,6 +16,7 @@

 #include "cudf_jni_apis.hpp"
 #include "jni_utils.hpp"
+#include "multi_host_buffer_source.hpp"

 #include
 #include
@@ -36,7 +37,7 @@ extern "C" {

 // This function should take all the parameters that `Table.readParquet` takes,
 // plus one more parameter `long chunkSizeByteLimit`.
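For orientation before the JNI plumbing that follows: once the glue code has unpacked the Java long[] of {address, size} pairs, the new class above plugs into libcudf's I/O layer as an ordinary datasource. This is a condensed sketch of that path, not additional PR code; `env` and `addrs_sizes` stand in for the JNI arguments:

// Condensed sketch of the create()/readParquet paths below.
cudf::jni::native_jlongArray n_addrs_sizes(env, addrs_sizes);
auto buffers = std::make_unique<cudf::jni::multi_host_buffer_source>(n_addrs_sizes);

// source_info holds a non-owning pointer; the Java side keeps the returned
// handle alive until close() destroys the buffer source.
auto source = cudf::io::source_info(buffers.get());
auto opts   = cudf::io::parquet_reader_options::builder(source).build();
auto table  = cudf::io::read_parquet(opts).tbl;  // spans all input buffers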
-JNIEXPORT jlong JNICALL +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_create(JNIEnv* env, jclass, jlong chunk_read_limit, @@ -44,27 +45,26 @@ Java_ai_rapids_cudf_ParquetChunkedReader_create(JNIEnv* env, jobjectArray filter_col_names, jbooleanArray j_col_binary_read, jstring inp_file_path, - jlong buffer, - jlong buffer_length, + jlongArray addrs_sizes, jint unit) { - JNI_NULL_CHECK(env, j_col_binary_read, "Null col_binary_read", 0); + JNI_NULL_CHECK(env, j_col_binary_read, "Null col_binary_read", nullptr); bool read_buffer = true; - if (buffer == 0) { - JNI_NULL_CHECK(env, inp_file_path, "Input file or buffer must be supplied", 0); + if (addrs_sizes == nullptr) { + JNI_NULL_CHECK(env, inp_file_path, "Input file or buffer must be supplied", nullptr); read_buffer = false; } else if (inp_file_path != nullptr) { - JNI_THROW_NEW( - env, cudf::jni::ILLEGAL_ARG_CLASS, "Cannot pass in both a buffer and an inp_file_path", 0); - } else if (buffer_length <= 0) { - JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "An empty buffer is not supported", 0); + JNI_THROW_NEW(env, + cudf::jni::ILLEGAL_ARG_CLASS, + "Cannot pass in both buffers and an inp_file_path", + nullptr); } try { cudf::jni::auto_set_device(env); cudf::jni::native_jstring filename(env, inp_file_path); if (!read_buffer && filename.is_empty()) { - JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "inp_file_path cannot be empty", 0); + JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "inp_file_path cannot be empty", nullptr); } cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names); @@ -75,9 +75,15 @@ Java_ai_rapids_cudf_ParquetChunkedReader_create(JNIEnv* env, cudf::jni::native_jbooleanArray n_col_binary_read(env, j_col_binary_read); (void)n_col_binary_read; - auto const source = read_buffer ? 
cudf::io::source_info(reinterpret_cast<char const*>(buffer),
-                                              static_cast<std::size_t>(buffer_length))
-                      : cudf::io::source_info(filename.get());
+    cudf::jni::native_jlongArray n_addrs_sizes(env, addrs_sizes);
+    std::unique_ptr<cudf::jni::multi_host_buffer_source> multi_buffer_source;
+    cudf::io::source_info source;
+    if (read_buffer) {
+      multi_buffer_source.reset(new cudf::jni::multi_host_buffer_source(n_addrs_sizes));
+      source = cudf::io::source_info(multi_buffer_source.get());
+    } else {
+      source = cudf::io::source_info(filename.get());
+    }

     auto opts_builder = cudf::io::parquet_reader_options::builder(source);
     if (n_filter_col_names.size() > 0) {
@@ -86,13 +92,18 @@ Java_ai_rapids_cudf_ParquetChunkedReader_create(JNIEnv* env,
     auto const read_opts = opts_builder.convert_strings_to_categories(false)
                              .timestamp_type(cudf::data_type(static_cast<cudf::type_id>(unit)))
                              .build();
-
-    return reinterpret_cast<jlong>(
+    n_addrs_sizes.cancel();
+    n_col_binary_read.cancel();
+    auto reader_handle = reinterpret_cast<jlong>(
       new cudf::io::chunked_parquet_reader(static_cast<std::size_t>(chunk_read_limit),
                                            static_cast<std::size_t>(pass_read_limit),
                                            read_opts));
+    cudf::jni::native_jlongArray result(env, 2);
+    result[0] = reader_handle;
+    result[1] = cudf::jni::release_as_jlong(multi_buffer_source);
+    return result.get_jArray();
   }
-  CATCH_STD(env, 0);
+  CATCH_STD(env, nullptr);
 }

 JNIEXPORT jlong JNICALL
@@ -177,6 +188,17 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_close(JNIEnv* en
   CATCH_STD(env, );
 }

+JNIEXPORT void JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_destroyMultiHostBufferSource(
+  JNIEnv* env, jclass, jlong handle)
+{
+  JNI_NULL_CHECK(env, handle, "handle is null", );
+
+  try {
+    delete reinterpret_cast<cudf::jni::multi_host_buffer_source*>(handle);
+  }
+  CATCH_STD(env, );
+}
+
 //
 // Chunked ORC reader JNI
 //
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index ed35f35794d..a6c7ae9ba18 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -19,6 +19,7 @@
 #include "jni_compiled_expr.hpp"
 #include "jni_utils.hpp"
 #include "jni_writer_data_sink.hpp"
+#include "multi_host_buffer_source.hpp"

 #include
 #include
@@ -2071,20 +2072,17 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet(JNIEnv* env,
                                                                    jobjectArray filter_col_names,
                                                                    jbooleanArray j_col_binary_read,
                                                                    jstring inputfilepath,
-                                                                   jlong buffer,
-                                                                   jlong buffer_length,
+                                                                   jlongArray addrs_and_sizes,
                                                                    jint unit)
 {
   JNI_NULL_CHECK(env, j_col_binary_read, "null col_binary_read", 0);
   bool read_buffer = true;
-  if (buffer == 0) {
+  if (addrs_and_sizes == nullptr) {
     JNI_NULL_CHECK(env, inputfilepath, "input file or buffer must be supplied", NULL);
     read_buffer = false;
   } else if (inputfilepath != NULL) {
     JNI_THROW_NEW(
       env, cudf::jni::ILLEGAL_ARG_CLASS, "cannot pass in both a buffer and an inputfilepath", NULL);
-  } else if (buffer_length <= 0) {
-    JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "An empty buffer is not supported", NULL);
   }

   try {
@@ -2096,10 +2094,15 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet(JNIEnv* env,
     cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names);
     cudf::jni::native_jbooleanArray n_col_binary_read(env, j_col_binary_read);
-
-    auto source = read_buffer
-                    ? cudf::io::source_info(reinterpret_cast<char const*>(buffer),
-                                            static_cast<std::size_t>(buffer_length))
-                    : cudf::io::source_info(filename.get());
+    cudf::jni::native_jlongArray n_addrs_sizes(env, addrs_and_sizes);
+    std::unique_ptr<cudf::jni::multi_host_buffer_source> multi_buffer_source;
+    cudf::io::source_info source;
+    if (read_buffer) {
+      multi_buffer_source.reset(new cudf::jni::multi_host_buffer_source(n_addrs_sizes));
+      source = cudf::io::source_info(multi_buffer_source.get());
+    } else {
+      source = cudf::io::source_info(filename.get());
+    }

     auto builder = cudf::io::parquet_reader_options::builder(source);
     if (n_filter_col_names.size() > 0) {
@@ -2110,7 +2113,10 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet(JNIEnv* env,
       builder.convert_strings_to_categories(false)
         .timestamp_type(cudf::data_type(static_cast<cudf::type_id>(unit)))
         .build();
-    return convert_table_for_return(env, cudf::io::read_parquet(opts).tbl);
+    auto tbl = cudf::io::read_parquet(opts).tbl;
+    n_col_binary_read.cancel();
+    n_addrs_sizes.cancel();
+    return convert_table_for_return(env, tbl);
   }
   CATCH_STD(env, NULL);
 }
diff --git a/java/src/main/native/src/multi_host_buffer_source.cpp b/java/src/main/native/src/multi_host_buffer_source.cpp
new file mode 100644
index 00000000000..c577fc680ba
--- /dev/null
+++ b/java/src/main/native/src/multi_host_buffer_source.cpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "multi_host_buffer_source.hpp"
+
+#include <cudf/utilities/error.hpp>
+#include <algorithm>
+#include <cstring>
+#include <sstream>
+
+namespace cudf {
+namespace jni {
+
+multi_host_buffer_source::multi_host_buffer_source(native_jlongArray const& addrs_sizes)
+{
+  if (addrs_sizes.size() % 2 != 0) {
+    throw std::logic_error("addrs_sizes length not a multiple of 2");
+  }
+  auto count = addrs_sizes.size() / 2;
+  addrs_.reserve(count);
+  offsets_.reserve(count + 1);
+  size_t total_size = 0;
+  for (int i = 0; i < addrs_sizes.size(); i += 2) {
+    addrs_.push_back(reinterpret_cast<uint8_t const*>(addrs_sizes[i]));
+    offsets_.push_back(total_size);
+    total_size += addrs_sizes[i + 1];
+  }
+  offsets_.push_back(total_size);
+}
+
+size_t multi_host_buffer_source::locate_offset_index(size_t offset)
+{
+  if (offset < 0 || offset >= offsets_.back()) { throw std::runtime_error("bad offset"); }
+  auto start = offsets_.begin();
+  auto it    = std::upper_bound(start, offsets_.end(), offset);
+  return (it - start) - 1;
+}
+
+std::unique_ptr<cudf::io::datasource::buffer> multi_host_buffer_source::host_read(size_t offset,
+                                                                                  size_t size)
+{
+  if (size == 0) { return 0; }
+  if (offset < 0 || offset >= offsets_.back()) { throw std::runtime_error("bad offset"); }
+  auto const end_offset = offset + size;
+  if (end_offset > offsets_.back()) { throw std::runtime_error("read past end of file"); }
+  auto buffer_index = locate_offset_index(offset);
+  auto next_offset  = offsets_[buffer_index + 1];
+  if (end_offset <= next_offset) {
+    // read range hits only a single buffer, so return a zero-copy view of the data
+    auto src = addrs_[buffer_index] + offset - offsets_[buffer_index];
+    return std::make_unique<non_owning_buffer>(src, size);
+  }
+  auto buf        = std::vector<uint8_t>(size);
+  auto bytes_read = host_read(offset, size, buf.data());
+  if (bytes_read != size) {
+    std::stringstream ss;
+    ss << "Expected host read of " << size << " found " << bytes_read;
+    throw std::logic_error(ss.str());
+  }
+  return std::make_unique<owning_buffer<std::vector<uint8_t>>>(std::move(buf));
+}
+
+size_t multi_host_buffer_source::host_read(size_t offset, size_t size, uint8_t* dst)
+{
+  if (size == 0) { return 0; }
+  if (offset < 0 || offset >= offsets_.back()) { throw std::runtime_error("bad offset"); }
+  if (offset + size > offsets_.back()) { throw std::runtime_error("read past end of file"); }
+  auto buffer_index = locate_offset_index(offset);
+  auto bytes_left   = size;
+  while (bytes_left > 0) {
+    auto next_offset   = offsets_[buffer_index + 1];
+    auto buffer_left   = next_offset - offset;
+    auto buffer_offset = offset - offsets_[buffer_index];
+    auto src           = addrs_[buffer_index] + buffer_offset;
+    auto copy_size     = std::min(buffer_left, bytes_left);
+    std::memcpy(dst, src, copy_size);
+    offset += copy_size;
+    dst += copy_size;
+    bytes_left -= copy_size;
+    ++buffer_index;
+  }
+  return size;
+}
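The offsets_ vector built by the constructor is a prefix sum of the buffer sizes, and locate_offset_index binary-searches it with std::upper_bound. A standalone sketch of that math, with illustrative sizes that are not from the PR:

// Three buffers of 10, 20, and 5 bytes give offsets_ = {0, 10, 30, 35}.
#include <algorithm>
#include <cassert>
#include <vector>

int main()
{
  std::vector<size_t> offsets{0, 10, 30, 35};
  size_t offset = 12;  // logical file offset
  auto it       = std::upper_bound(offsets.begin(), offsets.end(), offset);
  size_t index  = (it - offsets.begin()) - 1;
  assert(index == 1);                    // offset 12 falls in buffer #1
  assert(offset - offsets[index] == 2);  // ...two bytes into that buffer
  // A 25-byte read from offset 12 then copies 18 bytes from buffer #1 and
  // 7 bytes from buffer #2, exactly as the host_read loop above does.
  return 0;
}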
+std::unique_ptr<cudf::io::datasource::buffer> multi_host_buffer_source::device_read(
+  size_t offset, size_t size, rmm::cuda_stream_view stream)
+{
+  rmm::device_buffer buf(size, stream);
+  auto dst        = static_cast<uint8_t*>(buf.data());
+  auto bytes_read = device_read(offset, size, dst, stream);
+  if (bytes_read != size) {
+    std::stringstream ss;
+    ss << "Expected device read of " << size << " found " << bytes_read;
+    throw std::logic_error(ss.str());
+  }
+  return std::make_unique<owning_buffer<rmm::device_buffer>>(std::move(buf));
+}
+
+size_t multi_host_buffer_source::device_read(size_t offset,
+                                             size_t size,
+                                             uint8_t* dst,
+                                             rmm::cuda_stream_view stream)
+{
+  if (size == 0) { return 0; }
+  if (offset < 0 || offset >= offsets_.back()) { throw std::runtime_error("bad offset"); }
+  if (offset + size > offsets_.back()) { throw std::runtime_error("read past end of file"); }
+
+  auto buffer_index = locate_offset_index(offset);
+  auto bytes_left   = size;
+  while (bytes_left > 0) {
+    auto next_offset   = offsets_[buffer_index + 1];
+    auto buffer_left   = next_offset - offset;
+    auto buffer_offset = offset - offsets_[buffer_index];
+    auto src           = addrs_[buffer_index] + buffer_offset;
+    auto copy_size     = std::min(buffer_left, bytes_left);
+    CUDF_CUDA_TRY(cudaMemcpyAsync(dst, src, copy_size, cudaMemcpyHostToDevice, stream.value()));
+    offset += copy_size;
+    dst += copy_size;
+    bytes_left -= copy_size;
+    ++buffer_index;
+  }
+  return size;
+}
+
+std::future<size_t> multi_host_buffer_source::device_read_async(size_t offset,
+                                                                size_t size,
+                                                                uint8_t* dst,
+                                                                rmm::cuda_stream_view stream)
+{
+  std::promise<size_t> p;
+  p.set_value(device_read(offset, size, dst, stream));
+  return p.get_future();
+}
+
+}  // namespace jni
+}  // namespace cudf
diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index c7fcb1756b6..7eb32892bad 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -1,6 +1,6 @@
 /*
 *
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -47,8 +47,11 @@
 import java.math.BigInteger;
 import java.math.RoundingMode;
 import java.nio.ByteBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.channels.SeekableByteChannel;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
+import java.nio.file.StandardOpenOption;
 import java.util.*;
 import java.util.function.Function;
 import java.util.stream.Collectors;
@@ -1714,6 +1717,42 @@ void testChunkedReadParquet() {
     }
   }

+  @Test
+  void testChunkedReadParquetHostBuffers() throws Exception {
+    long size = TEST_PARQUET_FILE_CHUNKED_READ.length();
+    java.nio.file.Path path = TEST_PARQUET_FILE_CHUNKED_READ.toPath();
+    try (HostMemoryBuffer buf1 = HostMemoryBuffer.allocate(size / 2);
+         HostMemoryBuffer buf2 = HostMemoryBuffer.allocate(size - buf1.getLength())) {
+      try (SeekableByteChannel channel = Files.newByteChannel(path, StandardOpenOption.READ)) {
+        ByteBuffer bb1 = buf1.asByteBuffer();
+        while (bb1.hasRemaining()) {
+          if (channel.read(bb1) == -1) {
+            throw new EOFException("error reading first buffer");
+          }
+        }
+        ByteBuffer bb2 = buf2.asByteBuffer();
+        while (bb2.hasRemaining()) {
+          if (channel.read(bb2) == -1) {
+            throw new EOFException("error reading second buffer");
+          }
+        }
+      }
+      ParquetOptions opts = ParquetOptions.DEFAULT;
+      try (ParquetChunkedReader reader = new ParquetChunkedReader(240000, 0, opts, buf1, buf2)) {
+        int numChunks = 0;
+        long totalRows = 0;
+        while (reader.hasNext()) {
+          ++numChunks;
+          try (Table chunk = reader.readChunk()) {
+            totalRows += chunk.getRowCount();
+          }
+        }
+        assertEquals(2, numChunks);
+        assertEquals(40000, totalRows);
+      }
+    }
+  }
+
   @Test
   void testChunkedReadParquetFromDataSource() throws IOException {
     try (MultiBufferDataSource source = sourceFrom(TEST_PARQUET_FILE_CHUNKED_READ);
diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt
index ff6fba1c3e8..ec44a6aa8c5 100644
--- a/python/cudf/cudf/_lib/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2022-2024, NVIDIA CORPORATION.
+# Copyright (c) 2022-2025, NVIDIA CORPORATION.
# # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -12,7 +12,7 @@ # the License. # ============================================================================= -set(cython_sources column.pyx scalar.pyx strings_udf.pyx types.pyx) +set(cython_sources column.pyx scalar.pyx strings_udf.pyx) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/cudf/cudf/_lib/column.pxd b/python/cudf/cudf/_lib/column.pxd index 8b1d16f0d85..026c12895e8 100644 --- a/python/cudf/cudf/_lib/column.pxd +++ b/python/cudf/cudf/_lib/column.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from typing import Literal @@ -13,6 +13,8 @@ from pylibcudf.libcudf.column.column_view cimport ( from pylibcudf.libcudf.types cimport size_type from rmm.librmm.device_buffer cimport device_buffer +cdef dtype_from_lists_column_view(column_view cv) +cdef dtype_from_column_view(column_view cv) cdef class Column: cdef public: diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index f7dcd89ea48..c59bbc0f40c 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from typing import Literal @@ -19,24 +19,21 @@ from cudf.core.buffer import ( as_buffer, cuda_array_interface_wrapper, ) -from cudf.utils.dtypes import _get_base_dtype +from cudf.utils.dtypes import ( + _get_base_dtype, + dtype_to_pylibcudf_type, + PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES, +) from cpython.buffer cimport PyObject_CheckBuffer -from libc.stdint cimport uintptr_t -from libcpp.memory cimport make_unique, unique_ptr +from libc.stdint cimport uintptr_t, int32_t +from libcpp.memory cimport make_shared, make_unique, shared_ptr, unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector from rmm.pylibrmm.device_buffer cimport DeviceBuffer -from cudf._lib.types cimport ( - dtype_from_column_view, - dtype_to_pylibcudf_type, -) - -from cudf._lib.types import dtype_from_pylibcudf_column - -from pylibcudf cimport DataType as plc_DataType +from pylibcudf cimport DataType as plc_DataType, Column as plc_Column cimport pylibcudf.libcudf.copying as cpp_copying cimport pylibcudf.libcudf.types as libcudf_types cimport pylibcudf.libcudf.unary as libcudf_unary @@ -45,6 +42,7 @@ from pylibcudf.libcudf.column.column_factories cimport ( make_numeric_column ) from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view from pylibcudf.libcudf.null_mask cimport null_count as cpp_null_count from pylibcudf.libcudf.scalar.scalar cimport scalar @@ -64,6 +62,80 @@ cdef get_element(column_view col_view, size_type index): ) +def dtype_from_pylibcudf_column(plc_Column col not None): + type_ = col.type() + tid = type_.id() + + if tid == pylibcudf.TypeId.LIST: + child = col.list_view().child() + return cudf.ListDtype(dtype_from_pylibcudf_column(child)) + elif tid == pylibcudf.TypeId.STRUCT: + fields = { + str(i): dtype_from_pylibcudf_column(col.child(i)) + for i in range(col.num_children()) + } + return cudf.StructDtype(fields) + elif tid == pylibcudf.TypeId.DECIMAL64: + return cudf.Decimal64Dtype( + precision=cudf.Decimal64Dtype.MAX_PRECISION, + scale=-type_.scale() + ) + elif tid == pylibcudf.TypeId.DECIMAL32: + return 
cudf.Decimal32Dtype( + precision=cudf.Decimal32Dtype.MAX_PRECISION, + scale=-type_.scale() + ) + elif tid == pylibcudf.TypeId.DECIMAL128: + return cudf.Decimal128Dtype( + precision=cudf.Decimal128Dtype.MAX_PRECISION, + scale=-type_.scale() + ) + else: + return PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[tid] + + +cdef dtype_from_lists_column_view(column_view cv): + # lists_column_view have no default constructor, so we heap + # allocate it to get around Cython's limitation of requiring + # default constructors for stack allocated objects + cdef shared_ptr[lists_column_view] lv = make_shared[lists_column_view](cv) + cdef column_view child = lv.get()[0].child() + + if child.type().id() == libcudf_types.type_id.LIST: + return cudf.ListDtype(dtype_from_lists_column_view(child)) + else: + return cudf.ListDtype(dtype_from_column_view(child)) + + +cdef dtype_from_column_view(column_view cv): + cdef libcudf_types.type_id tid = cv.type().id() + if tid == libcudf_types.type_id.LIST: + return dtype_from_lists_column_view(cv) + elif tid == libcudf_types.type_id.STRUCT: + fields = { + str(i): dtype_from_column_view(cv.child(i)) + for i in range(cv.num_children()) + } + return cudf.StructDtype(fields) + elif tid == libcudf_types.type_id.DECIMAL64: + return cudf.Decimal64Dtype( + precision=cudf.Decimal64Dtype.MAX_PRECISION, + scale=-cv.type().scale() + ) + elif tid == libcudf_types.type_id.DECIMAL32: + return cudf.Decimal32Dtype( + precision=cudf.Decimal32Dtype.MAX_PRECISION, + scale=-cv.type().scale() + ) + elif tid == libcudf_types.type_id.DECIMAL128: + return cudf.Decimal128Dtype( + precision=cudf.Decimal128Dtype.MAX_PRECISION, + scale=-cv.type().scale() + ) + else: + return PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[(tid)] + + cdef class Column: """ A Column stores columnar data in device memory. @@ -361,7 +433,7 @@ cdef class Column: col = self data_dtype = col.dtype - cdef plc_DataType dtype = dtype_to_pylibcudf_type(data_dtype) + cdef plc_DataType dtype = dtype_to_pylibcudf_type(data_dtype) cdef libcudf_types.size_type offset = self.offset cdef vector[mutable_column_view] children cdef void* data @@ -424,7 +496,7 @@ cdef class Column: col = self data_dtype = col.dtype - cdef plc_DataType dtype = dtype_to_pylibcudf_type(data_dtype) + cdef plc_DataType dtype = dtype_to_pylibcudf_type(data_dtype) cdef libcudf_types.size_type offset = self.offset cdef vector[column_view] children cdef void* data diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 40bd50acf16..65607c91302 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. import copy @@ -14,17 +14,16 @@ import pylibcudf as plc import cudf from cudf.core.dtypes import ListDtype, StructDtype -from cudf._lib.types import PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES -from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id from cudf.core.missing import NA, NaT +from cudf.utils.dtypes import PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES # We currently need this cimport because some of the implementations here # access the c_obj of the scalar, and because we need to be able to call # pylibcudf.Scalar.from_libcudf. Both of those are temporarily acceptable until # DeviceScalar is phased out entirely from cuDF Cython (at which point # cudf.Scalar will be directly backed by pylibcudf.Scalar). 
-from pylibcudf cimport Scalar as plc_Scalar, type_id as plc_TypeID -from pylibcudf.libcudf.scalar.scalar cimport list_scalar, scalar, struct_scalar +from pylibcudf cimport Scalar as plc_Scalar +from pylibcudf.libcudf.scalar.scalar cimport scalar def _replace_nested(obj, check, replacement): @@ -223,63 +222,22 @@ cdef class DeviceScalar: return s cdef void _set_dtype(self, dtype=None): - cdef plc_TypeID cdtype_id = self.c_value.type().id() + cdtype_id = self.c_value.type().id() if dtype is not None: self._dtype = dtype elif cdtype_id in { - plc_TypeID.DECIMAL32, - plc_TypeID.DECIMAL64, - plc_TypeID.DECIMAL128, + plc.TypeID.DECIMAL32, + plc.TypeID.DECIMAL64, + plc.TypeID.DECIMAL128, }: raise TypeError( "Must pass a dtype when constructing from a fixed-point scalar" ) - elif cdtype_id == plc_TypeID.STRUCT: - struct_table_view = (self.get_raw_ptr())[0].view() - self._dtype = StructDtype({ - str(i): dtype_from_column_view(struct_table_view.column(i)) - for i in range(struct_table_view.num_columns()) - }) - elif cdtype_id == plc_TypeID.LIST: - if ( - self.get_raw_ptr() - )[0].view().type().id() == plc_TypeID.LIST: - self._dtype = dtype_from_column_view( - (self.get_raw_ptr())[0].view() - ) - else: - self._dtype = ListDtype( - PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[ - ( - (self.get_raw_ptr())[0] - .view().type().id() - ) - ] - ) - else: - self._dtype = PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[ - (cdtype_id) - ] - - -def as_device_scalar(val, dtype=None): - if isinstance(val, (cudf.Scalar, DeviceScalar)): - if dtype == val.dtype or dtype is None: - if isinstance(val, DeviceScalar): - return val - else: - return val.device_value + elif cdtype_id == plc.TypeID.STRUCT: + self._dtype = StructDtype.from_arrow( + plc.interop.to_arrow(self.c_value).type + ) + elif cdtype_id == plc.TypeID.LIST: + self._dtype = ListDtype.from_arrow(plc.interop.to_arrow(self.c_value).type) else: - raise TypeError("Can't update dtype of existing GPU scalar") - else: - return cudf.Scalar(val, dtype=dtype).device_value - - -def _is_null_host_scalar(slr): - if cudf.utils.utils.is_na_like(slr): - return True - elif (isinstance(slr, (np.datetime64, np.timedelta64)) and np.isnat(slr)) or \ - slr is pd.NaT: - return True - else: - return False + self._dtype = PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[cdtype_id] diff --git a/python/cudf/cudf/_lib/types.pxd b/python/cudf/cudf/_lib/types.pxd deleted file mode 100644 index 18b1d26e4db..00000000000 --- a/python/cudf/cudf/_lib/types.pxd +++ /dev/null @@ -1,11 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libc.stdint cimport int32_t - -from pylibcudf.libcudf.column.column_view cimport column_view - -ctypedef int32_t underlying_type_t_type_id - -cdef dtype_from_column_view(column_view cv) - -cpdef dtype_to_pylibcudf_type(dtype) diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx deleted file mode 100644 index 777bd070b32..00000000000 --- a/python/cudf/cudf/_lib/types.pyx +++ /dev/null @@ -1,172 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -import numpy as np -import pandas as pd - -from libcpp.memory cimport make_shared, shared_ptr - -cimport pylibcudf.libcudf.types as libcudf_types -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view - -import pylibcudf as plc - -import cudf - - -SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES = { - np.dtype("int8"): plc.types.TypeId.INT8, - np.dtype("int16"): plc.types.TypeId.INT16, - np.dtype("int32"): plc.types.TypeId.INT32, - np.dtype("int64"): plc.types.TypeId.INT64, - np.dtype("uint8"): plc.types.TypeId.UINT8, - np.dtype("uint16"): plc.types.TypeId.UINT16, - np.dtype("uint32"): plc.types.TypeId.UINT32, - np.dtype("uint64"): plc.types.TypeId.UINT64, - np.dtype("float32"): plc.types.TypeId.FLOAT32, - np.dtype("float64"): plc.types.TypeId.FLOAT64, - np.dtype("datetime64[s]"): plc.types.TypeId.TIMESTAMP_SECONDS, - np.dtype("datetime64[ms]"): plc.types.TypeId.TIMESTAMP_MILLISECONDS, - np.dtype("datetime64[us]"): plc.types.TypeId.TIMESTAMP_MICROSECONDS, - np.dtype("datetime64[ns]"): plc.types.TypeId.TIMESTAMP_NANOSECONDS, - np.dtype("object"): plc.types.TypeId.STRING, - np.dtype("bool"): plc.types.TypeId.BOOL8, - np.dtype("timedelta64[s]"): plc.types.TypeId.DURATION_SECONDS, - np.dtype("timedelta64[ms]"): plc.types.TypeId.DURATION_MILLISECONDS, - np.dtype("timedelta64[us]"): plc.types.TypeId.DURATION_MICROSECONDS, - np.dtype("timedelta64[ns]"): plc.types.TypeId.DURATION_NANOSECONDS, -} -PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES = { - plc_type: np_type - for np_type, plc_type in SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES.items() -} -# There's no equivalent to EMPTY in cudf. We translate EMPTY -# columns from libcudf to ``int8`` columns of all nulls in Python. -# ``int8`` is chosen because it uses the least amount of memory. 
-PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.EMPTY] = np.dtype("int8") -PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.STRUCT] = np.dtype("object") -PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.LIST] = np.dtype("object") - - -size_type_dtype = PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.SIZE_TYPE_ID] - - -cdef dtype_from_lists_column_view(column_view cv): - # lists_column_view have no default constructor, so we heap - # allocate it to get around Cython's limitation of requiring - # default constructors for stack allocated objects - cdef shared_ptr[lists_column_view] lv = make_shared[lists_column_view](cv) - cdef column_view child = lv.get()[0].child() - - if child.type().id() == libcudf_types.type_id.LIST: - return cudf.ListDtype(dtype_from_lists_column_view(child)) - elif child.type().id() == libcudf_types.type_id.EMPTY: - return cudf.ListDtype("int8") - else: - return cudf.ListDtype( - dtype_from_column_view(child) - ) - -cdef dtype_from_structs_column_view(column_view cv): - fields = { - str(i): dtype_from_column_view(cv.child(i)) - for i in range(cv.num_children()) - } - return cudf.StructDtype(fields) - -cdef dtype_from_column_view(column_view cv): - cdef libcudf_types.type_id tid = cv.type().id() - if tid == libcudf_types.type_id.LIST: - return dtype_from_lists_column_view(cv) - elif tid == libcudf_types.type_id.STRUCT: - return dtype_from_structs_column_view(cv) - elif tid == libcudf_types.type_id.DECIMAL64: - return cudf.Decimal64Dtype( - precision=cudf.Decimal64Dtype.MAX_PRECISION, - scale=-cv.type().scale() - ) - elif tid == libcudf_types.type_id.DECIMAL32: - return cudf.Decimal32Dtype( - precision=cudf.Decimal32Dtype.MAX_PRECISION, - scale=-cv.type().scale() - ) - elif tid == libcudf_types.type_id.DECIMAL128: - return cudf.Decimal128Dtype( - precision=cudf.Decimal128Dtype.MAX_PRECISION, - scale=-cv.type().scale() - ) - else: - return PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[ - (tid) - ] - - -cpdef dtype_to_pylibcudf_type(dtype): - if isinstance(dtype, cudf.ListDtype): - return plc.DataType(plc.TypeId.LIST) - elif isinstance(dtype, cudf.StructDtype): - return plc.DataType(plc.TypeId.STRUCT) - elif isinstance(dtype, cudf.Decimal128Dtype): - tid = plc.TypeId.DECIMAL128 - return plc.DataType(tid, -dtype.scale) - elif isinstance(dtype, cudf.Decimal64Dtype): - tid = plc.TypeId.DECIMAL64 - return plc.DataType(tid, -dtype.scale) - elif isinstance(dtype, cudf.Decimal32Dtype): - tid = plc.TypeId.DECIMAL32 - return plc.DataType(tid, -dtype.scale) - # libcudf types don't support timezones so convert to the base type - elif isinstance(dtype, pd.DatetimeTZDtype): - dtype = np.dtype(f"` as a preprocessing step to `__repr__` methods. @@ -2047,7 +2050,7 @@ def _gather(self, gather_map, nullify=False, check_bounds=True): # TODO: For performance, the check and conversion of gather map should # be done by the caller. This check will be removed in future release. if gather_map.dtype.kind not in "iu": - gather_map = gather_map.astype(size_type_dtype) + gather_map = gather_map.astype(SIZE_TYPE_DTYPE) GatherMap(gather_map, len(self), nullify=not check_bounds or nullify) return self._from_columns_like_self( diff --git a/python/cudf/cudf/core/_internals/aggregation.py b/python/cudf/cudf/core/_internals/aggregation.py index 1d21d34b1bf..e6e6c3bcedf 100644 --- a/python/cudf/cudf/core/_internals/aggregation.py +++ b/python/cudf/cudf/core/_internals/aggregation.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
from __future__ import annotations from typing import TYPE_CHECKING, Literal @@ -8,9 +8,9 @@ import pylibcudf as plc import cudf -from cudf._lib.types import SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES from cudf.api.types import is_scalar from cudf.utils import cudautils +from cudf.utils.dtypes import SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES if TYPE_CHECKING: from collections.abc import Callable diff --git a/python/cudf/cudf/core/_internals/binaryop.py b/python/cudf/cudf/core/_internals/binaryop.py index 212150f505e..a9023f8fd59 100644 --- a/python/cudf/cudf/core/_internals/binaryop.py +++ b/python/cudf/cudf/core/_internals/binaryop.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations from typing import TYPE_CHECKING @@ -6,8 +6,8 @@ import pylibcudf as plc from cudf._lib.column import Column -from cudf._lib.types import dtype_to_pylibcudf_type from cudf.core.buffer import acquire_spill_lock +from cudf.utils.dtypes import dtype_to_pylibcudf_type if TYPE_CHECKING: from cudf._typing import Dtype diff --git a/python/cudf/cudf/core/_internals/unary.py b/python/cudf/cudf/core/_internals/unary.py index 3b8e3db60a7..c45c4a1b5cf 100644 --- a/python/cudf/cudf/core/_internals/unary.py +++ b/python/cudf/cudf/core/_internals/unary.py @@ -1,13 +1,13 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations from typing import TYPE_CHECKING import pylibcudf as plc -from cudf._lib.types import dtype_to_pylibcudf_type from cudf.api.types import is_decimal_dtype from cudf.core.buffer import acquire_spill_lock +from cudf.utils.dtypes import dtype_to_pylibcudf_type if TYPE_CHECKING: from cudf._typing import Dtype diff --git a/python/cudf/cudf/core/byte_pair_encoding.py b/python/cudf/cudf/core/byte_pair_encoding.py index b49f5154697..0fe47255368 100644 --- a/python/cudf/cudf/core/byte_pair_encoding.py +++ b/python/cudf/cudf/core/byte_pair_encoding.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -53,7 +53,6 @@ def __call__(self, text: cudf.Series, separator: str = " ") -> cudf.Series: 1 this is it dtype: object """ - sep = cudf.Scalar(separator, dtype="str") return cudf.Series._from_column( - text._column.byte_pair_encoding(self.merge_pairs, sep) + text._column.byte_pair_encoding(self.merge_pairs, separator) ) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index b10b8dfe207..b9d6c0e7f08 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. 
from __future__ import annotations @@ -12,12 +12,12 @@ from typing_extensions import Self import cudf -from cudf import _lib as libcudf from cudf.core._internals import unary from cudf.core.column import column from cudf.core.column.methods import ColumnMethods from cudf.core.dtypes import CategoricalDtype, IntervalDtype from cudf.utils.dtypes import ( + SIZE_TYPE_DTYPE, find_common_type, is_mixed_with_object_dtype, min_signed_type, @@ -621,7 +621,7 @@ def ordered(self) -> bool: def __setitem__(self, key, value): if cudf.api.types.is_scalar( value - ) and cudf._lib.scalar._is_null_host_scalar(value): + ) and cudf.utils.utils._is_null_host_scalar(value): to_add_categories = 0 else: if cudf.api.types.is_scalar(value): @@ -1140,7 +1140,7 @@ def _get_decategorized_column(self) -> ColumnBase: if self.null_count == len(self): # self.categories is empty; just return codes return self.codes - gather_map = self.codes.astype(libcudf.types.size_type_dtype).fillna(0) + gather_map = self.codes.astype(SIZE_TYPE_DTYPE).fillna(0) out = self.categories.take(gather_map) out = out.set_mask(self.mask) return out @@ -1192,10 +1192,10 @@ def _concat( codes = [o.codes for o in objs] newsize = sum(map(len, codes)) - if newsize > np.iinfo(libcudf.types.size_type_dtype).max: + if newsize > np.iinfo(SIZE_TYPE_DTYPE).max: raise MemoryError( f"Result of concat cannot have " - f"size > {libcudf.types.size_type_dtype}_MAX" + f"size > {SIZE_TYPE_DTYPE}_MAX" ) elif newsize == 0: codes_col = column.column_empty(0, head.codes.dtype) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 31efe267c96..30da8727366 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -25,8 +25,6 @@ import cudf from cudf import _lib as libcudf from cudf._lib.column import Column -from cudf._lib.scalar import as_device_scalar -from cudf._lib.types import dtype_to_pylibcudf_type, size_type_dtype from cudf.api.types import ( _is_non_decimal_numeric_dtype, _is_pandas_nullable_extension_dtype, @@ -61,9 +59,11 @@ from cudf.core.mixins import BinaryOperand, Reducible from cudf.errors import MixedTypeError from cudf.utils.dtypes import ( + SIZE_TYPE_DTYPE, _maybe_convert_to_default_type, cudf_dtype_from_pa_type, cudf_dtype_to_pa_type, + dtype_to_pylibcudf_type, find_common_type, get_time_unit, is_column_like, @@ -71,13 +71,14 @@ min_signed_type, min_unsigned_type, ) -from cudf.utils.utils import _array_ufunc, mask_dtype +from cudf.utils.utils import _array_ufunc, _is_null_host_scalar, mask_dtype if TYPE_CHECKING: import builtins from cudf._typing import ColumnLike, Dtype, ScalarLike from cudf.core.column.numerical import NumericalColumn + from cudf.core.column.strings import StringColumn if PANDAS_GE_210: NumpyExtensionArray = pd.arrays.NumpyExtensionArray @@ -93,6 +94,8 @@ class ColumnBase(Column, Serializable, BinaryOperand, Reducible): "min", } + _PANDAS_NA_REPR = str(pd.NA) + def data_array_view( self, *, mode: Literal["write", "read"] = "write" ) -> "cuda.devicearray.DeviceNDArray": @@ -177,6 +180,17 @@ def __repr__(self): f"dtype: {self.dtype}" ) + def _prep_pandas_compat_repr(self) -> StringColumn | Self: + """ + Preprocess Column to be compatible with pandas repr, namely handling nulls. 
+ + * null (datetime/timedelta) = str(pd.NaT) + * null (other types)= str(pd.NA) + """ + if self.has_nulls(): + return self.astype("str").fillna(self._PANDAS_NA_REPR) + return self + def to_pandas( self, *, @@ -240,8 +254,12 @@ def find_and_replace( def clip(self, lo: ScalarLike, hi: ScalarLike) -> Self: plc_column = plc.replace.clamp( self.to_pylibcudf(mode="read"), - cudf.Scalar(lo, self.dtype).device_value.c_value, - cudf.Scalar(hi, self.dtype).device_value.c_value, + plc.interop.from_arrow( + pa.scalar(lo, type=cudf_dtype_to_pa_type(self.dtype)) + ), + plc.interop.from_arrow( + pa.scalar(hi, type=cudf_dtype_to_pa_type(self.dtype)) + ), ) return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] @@ -777,9 +795,7 @@ def fillna( if not self.has_nulls(include_nan=True): return self.copy() elif method is None: - if is_scalar(fill_value) and libcudf.scalar._is_null_host_scalar( - fill_value - ): + if is_scalar(fill_value) and _is_null_host_scalar(fill_value): return self.copy() else: fill_value = self._validate_fillna_value(fill_value) @@ -859,7 +875,7 @@ def indices_of( value = as_column(value, dtype=self.dtype, length=1) mask = value.contains(self) return apply_boolean_mask( # type: ignore[return-value] - [as_column(range(0, len(self)), dtype=size_type_dtype)], mask + [as_column(range(0, len(self)), dtype=SIZE_TYPE_DTYPE)], mask )[0] def _find_first_and_last(self, value: ScalarLike) -> tuple[int, int]: @@ -939,7 +955,7 @@ def take( # TODO: For performance, the check and conversion of gather map should # be done by the caller. This check will be removed in future release. if indices.dtype.kind not in {"u", "i"}: - indices = indices.astype(libcudf.types.size_type_dtype) + indices = indices.astype(SIZE_TYPE_DTYPE) GatherMap(indices, len(self), nullify=not check_bounds or nullify) gathered = copying.gather([self], indices, nullify=nullify) # type: ignore[arg-type] return gathered[0]._with_type_metadata(self.dtype) # type: ignore[return-value] @@ -1018,7 +1034,7 @@ def _obtain_isin_result(self, rhs: ColumnBase) -> ColumnBase: # https://github.com/rapidsai/cudf/issues/14515 by # providing a mode in which cudf::contains does not mask # the result. 
- result = result.fillna(cudf.Scalar(rhs.null_count > 0)) + result = result.fillna(rhs.null_count > 0) return result def as_mask(self) -> Buffer: @@ -1728,9 +1744,7 @@ def column_empty( elif isinstance(dtype, ListDtype): data = None children = ( - as_column( - 0, length=row_count + 1, dtype=libcudf.types.size_type_dtype - ), + as_column(0, length=row_count + 1, dtype=SIZE_TYPE_DTYPE), column_empty(row_count, dtype=dtype.element_type), ) elif isinstance(dtype, CategoricalDtype): @@ -1739,21 +1753,16 @@ def column_empty( cudf.core.column.NumericalColumn( data=as_buffer( rmm.DeviceBuffer( - size=row_count - * cudf.dtype(libcudf.types.size_type_dtype).itemsize + size=row_count * cudf.dtype(SIZE_TYPE_DTYPE).itemsize ) ), size=None, - dtype=libcudf.types.size_type_dtype, + dtype=SIZE_TYPE_DTYPE, ), ) elif dtype.kind in "OU" and not isinstance(dtype, DecimalDtype): data = as_buffer(rmm.DeviceBuffer(size=0)) - children = ( - as_column( - 0, length=row_count + 1, dtype=libcudf.types.size_type_dtype - ), - ) + children = (as_column(0, length=row_count + 1, dtype=SIZE_TYPE_DTYPE),) else: data = as_buffer(rmm.DeviceBuffer(size=row_count * dtype.itemsize)) @@ -1984,12 +1993,12 @@ def as_column( column = Column.from_pylibcudf( plc.filling.sequence( len(arbitrary), - as_device_scalar( - arbitrary.start, dtype=np.dtype(np.int64) - ).c_value, - as_device_scalar( - arbitrary.step, dtype=np.dtype(np.int64) - ).c_value, + plc.interop.from_arrow( + pa.scalar(arbitrary.start, type=pa.int64()) + ), + plc.interop.from_arrow( + pa.scalar(arbitrary.step, type=pa.int64()) + ), ) ) if cudf.get_option("default_integer_bitwidth") and dtype is None: @@ -2537,10 +2546,9 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: ) newsize = sum(map(len, objs)) - if newsize > np.iinfo(libcudf.types.size_type_dtype).max: + if newsize > np.iinfo(SIZE_TYPE_DTYPE).max: raise MemoryError( - f"Result of concat cannot have " - f"size > {libcudf.types.size_type_dtype}_MAX" + f"Result of concat cannot have " f"size > {SIZE_TYPE_DTYPE}_MAX" ) elif newsize == 0: return column_empty(0, head.dtype) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index b6a4122ebb9..1bde7d27700 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -212,6 +212,8 @@ class DatetimeColumn(column.ColumnBase): "__rsub__", } + _PANDAS_NA_REPR = str(pd.NaT) + def __init__( self, data: Buffer, @@ -351,8 +353,8 @@ def is_year_end(self) -> ColumnBase: day_of_year = self.day_of_year leap_dates = self.is_leap_year - leap = day_of_year == cudf.Scalar(366) - non_leap = day_of_year == cudf.Scalar(365) + leap = day_of_year == 366 + non_leap = day_of_year == 365 return leap.copy_if_else(non_leap, leap_dates).fillna(False) @property diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 3d9440cdf21..04b4003c510 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
from __future__ import annotations @@ -14,7 +14,6 @@ import cudf import cudf.core.column.column as column -from cudf._lib.types import size_type_dtype from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar from cudf.core.buffer import acquire_spill_lock from cudf.core.column.column import ColumnBase, as_column @@ -22,12 +21,14 @@ from cudf.core.column.numerical import NumericalColumn from cudf.core.dtypes import ListDtype from cudf.core.missing import NA +from cudf.utils.dtypes import SIZE_TYPE_DTYPE if TYPE_CHECKING: from collections.abc import Sequence from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike from cudf.core.buffer import Buffer + from cudf.core.column.string import StringColumn class ListColumn(ColumnBase): @@ -67,6 +68,16 @@ def __init__( children=children, ) + def _prep_pandas_compat_repr(self) -> StringColumn | Self: + """ + Preprocess Column to be compatible with pandas repr, namely handling nulls. + + * null (datetime/timedelta) = str(pd.NaT) + * null (other types)= str(pd.NA) + """ + # TODO: handle if self.has_nulls(): case + return self + @cached_property def memory_usage(self): n = super().memory_usage @@ -236,7 +247,7 @@ def from_sequences( # Build Data, Mask & Offsets for data in arbitrary: - if cudf._lib.scalar._is_null_host_scalar(data): + if cudf.utils.utils._is_null_host_scalar(data): mask_col.append(False) offset_vals.append(offset) else: @@ -247,7 +258,7 @@ def from_sequences( offset_col = cast( NumericalColumn, - column.as_column(offset_vals, dtype=size_type_dtype), + column.as_column(offset_vals, dtype=SIZE_TYPE_DTYPE), ) # Build ListColumn @@ -274,7 +285,7 @@ def as_string_column(self) -> cudf.core.column.StringColumn: with acquire_spill_lock(): plc_column = plc.strings.convert.convert_lists.format_list_column( lc.to_pylibcudf(mode="read"), - cudf.Scalar("None").device_value.c_value, + plc.interop.from_arrow(pa.scalar("None")), separators.to_pylibcudf(mode="read"), ) return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] @@ -380,20 +391,20 @@ def extract_element_column(self, index: ColumnBase) -> ColumnBase: ) @acquire_spill_lock() - def contains_scalar(self, search_key: cudf.Scalar) -> ColumnBase: + def contains_scalar(self, search_key: pa.Scalar) -> ColumnBase: return type(self).from_pylibcudf( plc.lists.contains( self.to_pylibcudf(mode="read"), - search_key.device_value.c_value, + plc.interop.from_arrow(search_key), ) ) @acquire_spill_lock() - def index_of_scalar(self, search_key: cudf.Scalar) -> ColumnBase: + def index_of_scalar(self, search_key: pa.Scalar) -> ColumnBase: return type(self).from_pylibcudf( plc.lists.index_of( self.to_pylibcudf(mode="read"), - search_key.device_value.c_value, + plc.interop.from_arrow(search_key), plc.lists.DuplicateFindOption.FIND_FIRST, ) ) @@ -558,7 +569,7 @@ def contains(self, search_key: ScalarLike) -> ParentType: dtype: bool """ return self._return_or_inplace( - self._column.contains_scalar(cudf.Scalar(search_key)) + self._column.contains_scalar(pa.scalar(search_key)) ) def index(self, search_key: ScalarLike | ColumnLike) -> ParentType: @@ -607,7 +618,7 @@ def index(self, search_key: ScalarLike | ColumnLike) -> ParentType: """ if is_scalar(search_key): - result = self._column.index_of_scalar(cudf.Scalar(search_key)) + result = self._column.index_of_scalar(pa.scalar(search_key)) else: result = self._column.index_of_column(as_column(search_key)) return self._return_or_inplace(result) diff --git a/python/cudf/cudf/core/column/numerical.py 
b/python/cudf/cudf/core/column/numerical.py index 4405e153b0c..70103745926 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -7,6 +7,7 @@ import numpy as np import pandas as pd +import pyarrow as pa from numba.np import numpy_support from typing_extensions import Self @@ -151,7 +152,7 @@ def __setitem__(self, key: Any, value: Any): cudf.Scalar( value, dtype=self.dtype - if cudf._lib.scalar._is_null_host_scalar(value) + if cudf.utils.utils._is_null_host_scalar(value) else None, ) if is_scalar(value) @@ -382,12 +383,8 @@ def as_string_column(self) -> cudf.core.column.StringColumn: elif self.dtype.kind == "b": conv_func = functools.partial( plc.strings.convert.convert_booleans.from_booleans, - true_string=cudf.Scalar( - "True", dtype="str" - ).device_value.c_value, - false_string=cudf.Scalar( - "False", dtype="str" - ).device_value.c_value, + true_string=plc.interop.from_arrow(pa.scalar("True")), + false_string=plc.interop.from_arrow(pa.scalar("False")), ) elif self.dtype.kind in {"i", "u"}: conv_func = plc.strings.convert.convert_integers.from_integers @@ -789,7 +786,7 @@ def _normalize_find_and_replace_input( ) # Scalar case if len(col_to_normalize) == 1: - if cudf._lib.scalar._is_null_host_scalar(col_to_normalize[0]): + if cudf.utils.utils._is_null_host_scalar(col_to_normalize[0]): return normalized_column.astype(input_column_dtype) if np.isinf(col_to_normalize[0]): return normalized_column diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index fcdcb789f23..2bee85cb387 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. 
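numerical.py applies the same recipe to keyword arguments: the boolean-to-string converter now binds its true/false strings as pylibcudf scalars built from pyarrow. A self-contained sketch of the rebound converter as it appears in the hunk above:

import functools

import pyarrow as pa
import pylibcudf as plc

conv_func = functools.partial(
    plc.strings.convert.convert_booleans.from_booleans,
    true_string=plc.interop.from_arrow(pa.scalar("True")),
    false_string=plc.interop.from_arrow(pa.scalar("False")),
)
# Applying conv_func to a boolean pylibcudf column then yields a string
# column of "True"/"False" values.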
from __future__ import annotations @@ -19,16 +19,18 @@ import cudf.api.types import cudf.core.column.column as column import cudf.core.column.datetime as datetime -from cudf import _lib as libcudf from cudf._lib.column import Column -from cudf._lib.types import dtype_to_pylibcudf_type, size_type_dtype from cudf.api.types import is_integer, is_scalar, is_string_dtype from cudf.core._internals import binaryop from cudf.core.buffer import acquire_spill_lock from cudf.core.column.column import ColumnBase from cudf.core.column.methods import ColumnMethods from cudf.utils.docutils import copy_docstring -from cudf.utils.dtypes import can_convert_to_column +from cudf.utils.dtypes import ( + SIZE_TYPE_DTYPE, + can_convert_to_column, + dtype_to_pylibcudf_type, +) if TYPE_CHECKING: from collections.abc import Callable, Sequence @@ -302,8 +304,10 @@ def cat(self, others=None, sep=None, na_rep=None): with acquire_spill_lock(): plc_column = plc.strings.combine.join_strings( self._column.to_pylibcudf(mode="read"), - cudf.Scalar(sep).device_value.c_value, - cudf.Scalar(na_rep, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar(sep)), + plc.interop.from_arrow( + pa.scalar(na_rep, type=pa.string()) + ), ) data = Column.from_pylibcudf(plc_column) else: @@ -359,8 +363,10 @@ def cat(self, others=None, sep=None, na_rep=None): ) ] ), - cudf.Scalar(sep).device_value.c_value, - cudf.Scalar(na_rep, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar(sep)), + plc.interop.from_arrow( + pa.scalar(na_rep, type=pa.string()) + ), ) data = Column.from_pylibcudf(plc_column) @@ -522,11 +528,9 @@ def join( with acquire_spill_lock(): plc_column = plc.strings.combine.join_list_elements( strings_column.to_pylibcudf(mode="read"), - cudf.Scalar(sep).device_value.c_value, - cudf.Scalar(string_na_rep).device_value.c_value, - cudf._lib.scalar.DeviceScalar( - "", cudf.dtype("object") - ).c_value, + plc.interop.from_arrow(pa.scalar(sep)), + plc.interop.from_arrow(pa.scalar(string_na_rep)), + plc.interop.from_arrow(pa.scalar("")), plc.strings.combine.SeparatorOnNulls.YES, plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT, ) @@ -547,8 +551,8 @@ def join( plc_column = plc.strings.combine.join_list_elements( strings_column.to_pylibcudf(mode="read"), sep_column.to_pylibcudf(mode="read"), - cudf.Scalar(sep_na_rep).device_value.c_value, - cudf.Scalar(string_na_rep).device_value.c_value, + plc.interop.from_arrow(pa.scalar(sep_na_rep)), + plc.interop.from_arrow(pa.scalar(string_na_rep)), plc.strings.combine.SeparatorOnNulls.YES, plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT, ) @@ -800,14 +804,14 @@ def contains( else: if case is False: input_column = self.lower()._column # type: ignore[union-attr] - plc_pat = cudf.Scalar(pat.lower(), dtype="str") # type: ignore[union-attr] + pat_normed = pat.lower() # type: ignore[union-attr] else: input_column = self._column - plc_pat = cudf.Scalar(pat, dtype="str") + pat_normed = pat with acquire_spill_lock(): plc_result = plc.strings.find.contains( input_column.to_pylibcudf(mode="read"), - plc_pat.device_value.c_value, + plc.interop.from_arrow(pa.scalar(pat_normed)), ) result_col = Column.from_pylibcudf(plc_result) else: @@ -892,8 +896,8 @@ def like(self, pat: str, esc: str | None = None) -> SeriesOrIndex: with acquire_spill_lock(): plc_result = plc.strings.contains.like( self._column.to_pylibcudf(mode="read"), - cudf.Scalar(pat, "str").device_value.c_value, - cudf.Scalar(esc, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar(pat)), + 
plc.interop.from_arrow(pa.scalar(esc)), ) result = Column.from_pylibcudf(plc_result) @@ -1071,14 +1075,14 @@ def replace( plc.strings.regex_program.RegexProgram.create( pat, plc.strings.regex_flags.RegexFlags.DEFAULT ), - cudf.Scalar(repl, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar(repl)), n, ) else: plc_result = plc.strings.replace.replace( self._column.to_pylibcudf(mode="read"), - cudf.Scalar(pat).device_value.c_value, - cudf.Scalar(repl).device_value.c_value, + plc.interop.from_arrow(pa.scalar(pat)), + plc.interop.from_arrow(pa.scalar(repl)), n, ) result = Column.from_pylibcudf(plc_result) @@ -1194,13 +1198,13 @@ def slice( 2 cm dtype: object """ - param_dtype = np.dtype(np.int32) + param_dtype = pa.int32() with acquire_spill_lock(): plc_result = plc.strings.slice.slice_strings( self._column.to_pylibcudf(mode="read"), - cudf.Scalar(start, param_dtype).device_value.c_value, - cudf.Scalar(stop, param_dtype).device_value.c_value, - cudf.Scalar(step, param_dtype).device_value.c_value, + plc.interop.from_arrow(pa.scalar(start, param_dtype)), + plc.interop.from_arrow(pa.scalar(stop, param_dtype)), + plc.interop.from_arrow(pa.scalar(step, param_dtype)), ) result = Column.from_pylibcudf(plc_result) return self._return_or_inplace(result) @@ -2174,7 +2178,7 @@ def filter_alphanum( plc.strings.char_types.StringCharacterTypes.ALL_TYPES if keep else plc.strings.char_types.StringCharacterTypes.ALPHANUM, - cudf.Scalar(repl, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar(repl, type=pa.string())), plc.strings.char_types.StringCharacterTypes.ALPHANUM if keep else plc.strings.char_types.StringCharacterTypes.ALL_TYPES, @@ -2318,7 +2322,7 @@ def slice_replace( with acquire_spill_lock(): plc_result = plc.strings.replace.replace_slice( self._column.to_pylibcudf(mode="read"), - cudf.Scalar(repl, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar(repl, type=pa.string())), start, stop, ) @@ -2499,7 +2503,7 @@ def get_json_object( with acquire_spill_lock(): plc_result = plc.json.get_json_object( self._column.to_pylibcudf(mode="read"), - cudf.Scalar(json_path, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar(json_path)), options, ) result = Column.from_pylibcudf(plc_result) @@ -2657,7 +2661,12 @@ def split( if regex is True: data = self._column.split_re(pat, n) else: - data = self._column.split(cudf.Scalar(pat, "str"), n) + data = self._column.split( + plc.interop.from_arrow( + pa.scalar(pat, type=pa.string()) + ), + n, + ) if len(data) == 1 and data[0].null_count == len(self._column): result_table = {} else: @@ -2667,7 +2676,7 @@ def split( result_table = self._column.split_record_re(pat, n) else: result_table = self._column.split_record( - cudf.Scalar(pat, "str"), n + plc.interop.from_arrow(pa.scalar(pat, type=pa.string())), n ) return self._return_or_inplace(result_table, expand=expand) @@ -2829,7 +2838,12 @@ def rsplit( if regex is True: data = self._column.rsplit_re(pat, n) else: - data = self._column.rsplit(cudf.Scalar(pat, "str"), n) + data = self._column.rsplit( + plc.interop.from_arrow( + pa.scalar(pat, type=pa.string()) + ), + n, + ) if len(data) == 1 and data[0].null_count == len(self._column): result_table = {} else: @@ -2839,7 +2853,7 @@ def rsplit( result_table = self._column.rsplit_record_re(pat, n) else: result_table = self._column.rsplit_record( - cudf.Scalar(pat, "str"), n + plc.interop.from_arrow(pa.scalar(pat, type=pa.string())), n ) return self._return_or_inplace(result_table, expand=expand) @@ -2924,7 +2938,9 @@ def 
partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: sep = " " return self._return_or_inplace( - self._column.partition(cudf.Scalar(sep, "str")), + self._column.partition( + plc.interop.from_arrow(pa.scalar(sep, type=pa.string())) + ), expand=expand, ) @@ -2989,7 +3005,9 @@ def rpartition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: sep = " " return self._return_or_inplace( - self._column.rpartition(cudf.Scalar(sep, "str")), + self._column.rpartition( + plc.interop.from_arrow(pa.scalar(sep, type=pa.string())) + ), expand=expand, ) @@ -3303,7 +3321,7 @@ def _strip( plc_result = plc.strings.strip.strip( self._column.to_pylibcudf(mode="read"), side, - cudf.Scalar(to_strip, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar(to_strip, type=pa.string())), ) result = Column.from_pylibcudf(plc_result) return self._return_or_inplace(result) @@ -3920,7 +3938,7 @@ def _starts_ends_with( f"{type(pat).__name__}" ) elif is_scalar(pat): - plc_pat = cudf.Scalar(pat, "str").device_value.c_value + plc_pat = plc.interop.from_arrow(pa.scalar(pat, type=pa.string())) else: plc_pat = column.as_column(pat, dtype="str").to_pylibcudf( mode="read" @@ -4120,7 +4138,7 @@ def _find( with acquire_spill_lock(): plc_result = method( self._column.to_pylibcudf(mode="read"), - cudf.Scalar(sub, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar(sub, type=pa.string())), start, end, ) @@ -4603,7 +4621,7 @@ def filter_characters( plc.strings.translate.FilterType.KEEP if keep else plc.strings.translate.FilterType.REMOVE, - cudf.Scalar(repl, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar(repl, type=pa.string())), ) result = Column.from_pylibcudf(plc_result) return self._return_or_inplace(result) @@ -4710,10 +4728,10 @@ def tokenize(self, delimiter: str = " ") -> SeriesOrIndex: if isinstance(delim, Column): result = self._return_or_inplace( - self._column.tokenize_column(delim), + self._column.tokenize_column(delim), # type: ignore[arg-type] retain_index=False, ) - elif isinstance(delim, cudf.Scalar): + elif isinstance(delim, plc.Scalar): result = self._return_or_inplace( self._column.tokenize_scalar(delim), retain_index=False, @@ -4851,10 +4869,10 @@ def token_count(self, delimiter: str = " ") -> SeriesOrIndex: delim = _massage_string_arg(delimiter, "delimiter", allow_col=True) if isinstance(delim, Column): return self._return_or_inplace( - self._column.count_tokens_column(delim) + self._column.count_tokens_column(delim) # type: ignore[arg-type] ) - elif isinstance(delim, cudf.Scalar): + elif isinstance(delim, plc.Scalar): return self._return_or_inplace( self._column.count_tokens_scalar(delim) # type: ignore[arg-type] ) @@ -5112,7 +5130,7 @@ def replace_tokens( self._column.replace_tokens( targets_column, # type: ignore[arg-type] replacements_column, # type: ignore[arg-type] - cudf.Scalar(delimiter, dtype="str"), + plc.interop.from_arrow(pa.scalar(delimiter, type=pa.string())), ), ) @@ -5181,8 +5199,10 @@ def filter_tokens( return self._return_or_inplace( self._column.filter_tokens( min_token_length, - cudf.Scalar(replacement, dtype="str"), - cudf.Scalar(delimiter, dtype="str"), + plc.interop.from_arrow( + pa.scalar(replacement, type=pa.string()) + ), + plc.interop.from_arrow(pa.scalar(delimiter, type=pa.string())), ), ) @@ -5501,12 +5521,12 @@ def jaccard_index(self, input: cudf.Series, width: int) -> SeriesOrIndex: def _massage_string_arg( value, name, allow_col: bool = False -) -> StringColumn | cudf.Scalar: +) -> StringColumn | plc.Scalar: if 
isinstance(value, cudf.Scalar): return value if isinstance(value, str): - return cudf.Scalar(value, dtype="str") + return plc.interop.from_arrow(pa.scalar(value, type=pa.string())) allowed_types = ["Scalar"] @@ -5593,7 +5613,7 @@ def __init__( if len(children) == 0 and size != 0: # all nulls-column: offsets = column.as_column( - 0, length=size + 1, dtype=size_type_dtype + 0, length=size + 1, dtype=SIZE_TYPE_DTYPE ) children = (offsets,) @@ -5747,8 +5767,8 @@ def sum( with acquire_spill_lock(): plc_column = plc.strings.combine.join_strings( result_col.to_pylibcudf(mode="read"), - cudf.Scalar("").device_value.c_value, - cudf.Scalar(None, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar("")), + plc.interop.from_arrow(pa.scalar(None, type=pa.string())), ) return Column.from_pylibcudf(plc_column).element_indexing(0) else: @@ -5766,7 +5786,7 @@ def as_numerical_column(self, dtype: Dtype) -> NumericalColumn: self.to_pylibcudf(mode="read") ) result = Column.from_pylibcudf(plc_column) - return (result > cudf.Scalar(0, dtype="int8")).fillna(False) + return (result > np.int8(0)).fillna(False) elif out_dtype.kind in {"i", "u"}: if not self.is_integer().all(): raise ValueError( @@ -5870,7 +5890,7 @@ def as_decimal_column( ) -> cudf.core.column.DecimalBaseColumn: plc_column = plc.strings.convert.convert_fixed_point.to_fixed_point( self.to_pylibcudf(mode="read"), - libcudf.types.dtype_to_pylibcudf_type(dtype), + dtype_to_pylibcudf_type(dtype), ) result = Column.from_pylibcudf(plc_column) result.dtype.precision = dtype.precision # type: ignore[union-attr] @@ -6033,8 +6053,10 @@ def _binaryop( rhs.to_pylibcudf(mode="read"), ] ), - cudf.Scalar("").device_value.c_value, - cudf.Scalar(None, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar("")), + plc.interop.from_arrow( + pa.scalar(None, type=pa.string()) + ), ) return Column.from_pylibcudf(plc_column) elif op in { @@ -6120,11 +6142,11 @@ def jaccard_index(self, other: Self, width: int) -> NumericalColumn: return type(self).from_pylibcudf(result) # type: ignore[return-value] @acquire_spill_lock() - def generate_ngrams(self, ngrams: int, separator: cudf.Scalar) -> Self: + def generate_ngrams(self, ngrams: int, separator: plc.Scalar) -> Self: result = plc.nvtext.generate_ngrams.generate_ngrams( self.to_pylibcudf(mode="read"), ngrams, - separator.device_value.c_value, + separator, ) return type(self).from_pylibcudf(result) # type: ignore[return-value] @@ -6160,13 +6182,13 @@ def edit_distance_matrix(self) -> ListColumn: def byte_pair_encoding( self, merge_pairs: plc.nvtext.byte_pair_encode.BPEMergePairs, - separator: cudf.Scalar, + separator: str, ) -> Self: return type(self).from_pylibcudf( # type: ignore[return-value] plc.nvtext.byte_pair_encode.byte_pair_encoding( self.to_pylibcudf(mode="read"), merge_pairs, - separator.device_value.c_value, + plc.interop.from_arrow(pa.scalar(separator)), ) ) @@ -6174,15 +6196,15 @@ def byte_pair_encoding( def ngrams_tokenize( self, ngrams: int, - delimiter: cudf.Scalar, - separator: cudf.Scalar, + delimiter: plc.Scalar, + separator: plc.Scalar, ) -> Self: return type(self).from_pylibcudf( # type: ignore[return-value] plc.nvtext.ngrams_tokenize.ngrams_tokenize( self.to_pylibcudf(mode="read"), ngrams, - delimiter.device_value.c_value, - separator.device_value.c_value, + delimiter, + separator, ) ) @@ -6205,14 +6227,14 @@ def normalize_characters(self, do_lower: bool = True) -> Self: @acquire_spill_lock() def replace_tokens( - self, targets: Self, replacements: Self, delimiter: cudf.Scalar + self, 
targets: Self, replacements: Self, delimiter: plc.Scalar ) -> Self: return type(self).from_pylibcudf( # type: ignore[return-value] plc.nvtext.replace.replace_tokens( self.to_pylibcudf(mode="read"), targets.to_pylibcudf(mode="read"), replacements.to_pylibcudf(mode="read"), - delimiter.device_value.c_value, + delimiter, ) ) @@ -6220,15 +6242,15 @@ def replace_tokens( def filter_tokens( self, min_token_length: int, - replacement: cudf.Scalar, - delimiter: cudf.Scalar, + replacement: plc.Scalar, + delimiter: plc.Scalar, ) -> Self: return type(self).from_pylibcudf( # type: ignore[return-value] plc.nvtext.replace.filter_tokens( self.to_pylibcudf(mode="read"), min_token_length, - replacement.device_value.c_value, - delimiter.device_value.c_value, + replacement, + delimiter, ) ) @@ -6279,10 +6301,10 @@ def subword_tokenize( return tokens, masks, metadata @acquire_spill_lock() - def tokenize_scalar(self, delimiter: cudf.Scalar) -> Self: + def tokenize_scalar(self, delimiter: plc.Scalar) -> Self: return type(self).from_pylibcudf( # type: ignore[return-value] plc.nvtext.tokenize.tokenize_scalar( - self.to_pylibcudf(mode="read"), delimiter.device_value.c_value + self.to_pylibcudf(mode="read"), delimiter ) ) @@ -6296,10 +6318,10 @@ def tokenize_column(self, delimiters: Self) -> Self: ) @acquire_spill_lock() - def count_tokens_scalar(self, delimiter: cudf.Scalar) -> NumericalColumn: + def count_tokens_scalar(self, delimiter: plc.Scalar) -> NumericalColumn: return type(self).from_pylibcudf( # type: ignore[return-value] plc.nvtext.tokenize.count_tokens_scalar( - self.to_pylibcudf(mode="read"), delimiter.device_value.c_value + self.to_pylibcudf(mode="read"), delimiter ) ) @@ -6324,25 +6346,25 @@ def character_tokenize(self) -> Self: def tokenize_with_vocabulary( self, vocabulary: plc.nvtext.tokenize.TokenizeVocabulary, - delimiter: cudf.Scalar, + delimiter: str, default_id: int, ) -> Self: return type(self).from_pylibcudf( # type: ignore[return-value] plc.nvtext.tokenize.tokenize_with_vocabulary( self.to_pylibcudf(mode="read"), vocabulary, - delimiter.device_value.c_value, + plc.interop.from_arrow(pa.scalar(delimiter)), default_id, ) ) @acquire_spill_lock() - def detokenize(self, indices: ColumnBase, separator: cudf.Scalar) -> Self: + def detokenize(self, indices: ColumnBase, separator: plc.Scalar) -> Self: return type(self).from_pylibcudf( # type: ignore[return-value] plc.nvtext.tokenize.detokenize( self.to_pylibcudf(mode="read"), indices.to_pylibcudf(mode="read"), - separator.device_value.c_value, + separator, ) ) @@ -6491,23 +6513,23 @@ def rsplit_re(self, pattern: str, maxsplit: int) -> dict[int, Self]: @acquire_spill_lock() def _split_record( self, - delimiter: cudf.Scalar, + delimiter: plc.Scalar, maxsplit: int, method: Callable[[plc.Column, plc.Scalar, int], plc.Column], ) -> Self: plc_column = method( self.to_pylibcudf(mode="read"), - delimiter.device_value.c_value, + delimiter, maxsplit, ) return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] - def split_record(self, delimiter: cudf.Scalar, maxsplit: int) -> Self: + def split_record(self, delimiter: plc.Scalar, maxsplit: int) -> Self: return self._split_record( delimiter, maxsplit, plc.strings.split.split.split_record ) - def rsplit_record(self, delimiter: cudf.Scalar, maxsplit: int) -> Self: + def rsplit_record(self, delimiter: plc.Scalar, maxsplit: int) -> Self: return self._split_record( delimiter, maxsplit, plc.strings.split.split.rsplit_record ) @@ -6515,13 +6537,13 @@ def rsplit_record(self, delimiter: cudf.Scalar, 
maxsplit: int) -> Self: @acquire_spill_lock() def _split( self, - delimiter: cudf.Scalar, + delimiter: plc.Scalar, maxsplit: int, method: Callable[[plc.Column, plc.Scalar, int], plc.Column], ) -> dict[int, Self]: plc_table = method( self.to_pylibcudf(mode="read"), - delimiter.device_value.c_value, + delimiter, maxsplit, ) return dict( @@ -6531,21 +6553,21 @@ ) ) - def split(self, delimiter: cudf.Scalar, maxsplit: int) -> dict[int, Self]: + def split(self, delimiter: plc.Scalar, maxsplit: int) -> dict[int, Self]: return self._split(delimiter, maxsplit, plc.strings.split.split.split) - def rsplit(self, delimiter: cudf.Scalar, maxsplit: int) -> dict[int, Self]: + def rsplit(self, delimiter: plc.Scalar, maxsplit: int) -> dict[int, Self]: return self._split(delimiter, maxsplit, plc.strings.split.split.rsplit) @acquire_spill_lock() def _partition( self, - delimiter: cudf.Scalar, + delimiter: plc.Scalar, method: Callable[[plc.Column, plc.Scalar], plc.Column], ) -> dict[int, Self]: plc_table = method( self.to_pylibcudf(mode="read"), - delimiter.device_value.c_value, + delimiter, ) return dict( enumerate( @@ -6554,12 +6576,12 @@ ) ) - def partition(self, delimiter: cudf.Scalar) -> dict[int, Self]: + def partition(self, delimiter: plc.Scalar) -> dict[int, Self]: return self._partition( delimiter, plc.strings.split.partition.partition ) - def rpartition(self, delimiter: cudf.Scalar) -> dict[int, Self]: + def rpartition(self, delimiter: plc.Scalar) -> dict[int, Self]: return self._partition( delimiter, plc.strings.split.partition.rpartition ) diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index ba765b50729..052a68cec98 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations from functools import cached_property @@ -18,6 +18,7 @@ from cudf._typing import Dtype from cudf.core.buffer import Buffer + from cudf.core.column.string import StringColumn class StructColumn(ColumnBase): @@ -51,6 +52,16 @@ def __init__( children=children, ) + def _prep_pandas_compat_repr(self) -> StringColumn | Self: + """ + Preprocess Column to be compatible with pandas repr, namely handling nulls. + + * null (datetime/timedelta) = str(pd.NaT) + * null (other types) = str(pd.NA) + """ + # TODO: handle the self.has_nulls() case + return self + @staticmethod def _validate_dtype_instance(dtype: StructDtype) -> StructDtype: # IntervalDtype is a subclass of StructDtype, so compare types exactly diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 749ab8e837a..302178ea277 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -81,6 +81,8 @@ class TimeDeltaColumn(ColumnBase): "__rfloordiv__", } + _PANDAS_NA_REPR = str(pd.NaT) + def __init__( self, data: Buffer, diff --git a/python/cudf/cudf/core/copy_types.py b/python/cudf/cudf/core/copy_types.py index 4b6ad59c8e1..aaaf6c7ee4f 100644 --- a/python/cudf/cudf/core/copy_types.py +++ b/python/cudf/cudf/core/copy_types.py @@ -1,11 +1,11 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION.
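copy_types.py is the first of several files below that swap cudf._lib.types.size_type_dtype for cudf.utils.dtypes.SIZE_TYPE_DTYPE. The constant names libcudf's signed 32-bit size_type; a sketch of what the new symbol denotes (the definition shown is an assumption inferred from how the hunks use it, not copied from the PR):

import numpy as np

# libcudf counts rows and offsets with a signed 32-bit size_type, so the
# Python-level constant is simply the matching NumPy dtype (assumed):
SIZE_TYPE_DTYPE = np.dtype("int32")

# This is why concat_columns and _one_hot_encode_column can bound result
# sizes with np.iinfo(SIZE_TYPE_DTYPE).max == 2**31 - 1.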
from dataclasses import dataclass from typing import TYPE_CHECKING, Any, cast from typing_extensions import Self import cudf -from cudf._lib.types import size_type_dtype +from cudf.utils.dtypes import SIZE_TYPE_DTYPE if TYPE_CHECKING: from cudf.core.column import NumericalColumn @@ -63,7 +63,7 @@ def __init__(self, column: Any, nrows: int, *, nullify: bool): # Alternately we can have an Optional[Column] and handle None # specially in _gather. self.column = cast( - "NumericalColumn", self.column.astype(size_type_dtype) + "NumericalColumn", self.column.astype(SIZE_TYPE_DTYPE) ) else: if self.column.dtype.kind not in {"i", "u"}: diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 3334b57ce1b..5cea35ac0d6 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -92,7 +92,11 @@ min_signed_type, ) from cudf.utils.performance_tracking import _performance_tracking -from cudf.utils.utils import GetAttrGetItemMixin, _external_only_api +from cudf.utils.utils import ( + GetAttrGetItemMixin, + _external_only_api, + _is_null_host_scalar, +) if TYPE_CHECKING: from cudf._typing import ColumnLike, Dtype, NotImplementedType @@ -1890,7 +1894,7 @@ def astype( dtype = {cc: dtype for cc in self._column_names} return super().astype(dtype, copy, errors) - def _clean_renderable_dataframe(self, output): + def _clean_renderable_dataframe(self, output: Self) -> str: """ This method takes in partial/preprocessed dataframe and returns correct representation of it with correct @@ -1925,41 +1929,7 @@ def _clean_renderable_dataframe(self, output): ) return "\n".join(lines) - def _clean_nulls_from_dataframe(self, df): - """ - This function converts all ``null`` values to ```` for - representation as a string in `__repr__`. - - Since we utilize Pandas `__repr__` at all places in our code - for formatting purposes, we convert columns to `str` dtype for - filling with `` values. - """ - for col in df._data: - if isinstance( - df._data[col].dtype, (cudf.StructDtype, cudf.ListDtype) - ): - # TODO we need to handle this - pass - elif df._data[col].has_nulls(): - fill_value = ( - str(cudf.NaT) - if isinstance( - df._data[col], - ( - cudf.core.column.DatetimeColumn, - cudf.core.column.TimeDeltaColumn, - ), - ) - else str(cudf.NA) - ) - - df[col] = df._data[col].astype("str").fillna(fill_value) - else: - df[col] = df._data[col] - - return df - - def _get_renderable_dataframe(self): + def _get_renderable_dataframe(self) -> Self: """ Takes rows and columns from pandas settings or estimation from size. pulls quadrants based off of some known parameters then style for @@ -1967,9 +1937,9 @@ def _get_renderable_dataframe(self): for printing with the dataframe. 
""" max_rows = pd.options.display.max_rows - nrows = np.max([len(self) if max_rows is None else max_rows, 1]) - if pd.options.display.max_rows == 0: - nrows = len(self) + if max_rows in {0, None}: + max_rows = len(self) + nrows = max(max_rows, 1) ncols = ( pd.options.display.max_columns if pd.options.display.max_columns @@ -1977,7 +1947,7 @@ def _get_renderable_dataframe(self): ) if len(self) <= nrows and self._num_columns <= ncols: - output = self.copy(deep=False) + output = self elif self.empty and len(self.index) > 0: max_seq_items = pd.options.display.max_seq_items # In case of Empty DataFrame with index, Pandas prints @@ -2037,10 +2007,7 @@ def _get_renderable_dataframe(self): lower = cudf.concat([lower_left, lower_right], axis=1) output = cudf.concat([upper, lower]) - output = self._clean_nulls_from_dataframe(output) - output.index = output.index._clean_nulls_from_index() - - return output + return output._pandas_repr_compatible() @_performance_tracking def __repr__(self): @@ -3371,7 +3338,7 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True): if isinstance(value, (np.ndarray, cupy.ndarray)): dtype = value.dtype value = value.item() - if libcudf.scalar._is_null_host_scalar(value): + if _is_null_host_scalar(value): dtype = "str" value = as_column( value, @@ -6262,10 +6229,8 @@ def isin(self, values): # TODO: propagate nulls through isin # https://github.com/rapidsai/cudf/issues/7556 - fill_value = cudf.Scalar(False) - def make_false_column_like_self(): - return column.as_column(fill_value, length=len(self), dtype="bool") + return column.as_column(False, length=len(self), dtype="bool") # Preprocess different input types into a mapping from column names to # a list of values to check. diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 8ed233ba737..ce7fb968069 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations import decimal @@ -57,7 +57,8 @@ def dtype(arbitrary): if np_dtype.kind in set("OU"): return np.dtype("object") elif ( - np_dtype not in cudf._lib.types.SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES + np_dtype + not in cudf.utils.dtypes.SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES ): raise TypeError(f"Unsupported type {np_dtype}") return np_dtype diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 8f45c6f0115..abf9f7b3686 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
from __future__ import annotations @@ -820,6 +820,13 @@ def fillna( inplace=inplace, ) + def _pandas_repr_compatible(self) -> Self: + """Return Self but with columns prepared for a pandas-like repr.""" + columns = (col._prep_pandas_compat_repr() for col in self._columns) + return self._from_data_like_self( + self._data._from_columns_like_self(columns, verify=False) + ) + @_performance_tracking def _drop_column( self, name: abc.Hashable, errors: Literal["ignore", "raise"] = "raise" diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 6ae524d6346..7bc4b08fc49 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -14,13 +14,13 @@ import cupy as cp import numpy as np import pandas as pd +import pyarrow as pa import pylibcudf as plc import cudf import cudf.core._internals from cudf import _lib as libcudf -from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default from cudf.api.types import ( is_list_like, @@ -45,6 +45,7 @@ from cudf.core.mixins import Reducible, Scannable from cudf.core.multiindex import MultiIndex from cudf.core.udf.groupby_utils import _can_be_jitted, jit_groupby_apply +from cudf.utils.dtypes import SIZE_TYPE_DTYPE, cudf_dtype_to_pa_type from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import GetAttrGetItemMixin @@ -586,7 +587,7 @@ def indices(self) -> dict[ScalarLike, cp.ndarray]: offsets, group_keys, (indices,) = self._groups( [ cudf.core.column.as_column( - range(len(self.obj)), dtype=size_type_dtype + range(len(self.obj)), dtype=SIZE_TYPE_DTYPE ) ] ) @@ -852,7 +853,9 @@ def _shift( plc.table.Table([col.to_pylibcudf(mode="read") for col in values]), [periods] * len(values), [ - cudf.Scalar(val, dtype=col.dtype).device_value.c_value + plc.interop.from_arrow( + pa.scalar(val, type=cudf_dtype_to_pa_type(col.dtype)) + ) for val, col in zip(fill_values, values) ], ) @@ -1181,7 +1184,7 @@ def _head_tail(self, n, *, take_head: bool, preserve_order: bool): # aggregation scheme in libcudf. This is probably "fast # enough" for most reasonable input sizes. _, offsets, _, group_values = self._grouped() - group_offsets = np.asarray(offsets, dtype=size_type_dtype) + group_offsets = np.asarray(offsets, dtype=SIZE_TYPE_DTYPE) size_per_group = np.diff(group_offsets) # "Out of bounds" n for the group size either means no entries # (negative) or all the entries (positive) @@ -1195,7 +1198,7 @@ def _head_tail(self, n, *, take_head: bool, preserve_order: bool): group_offsets = group_offsets[:-1] else: group_offsets = group_offsets[1:] - size_per_group - to_take = np.arange(size_per_group.sum(), dtype=size_type_dtype) + to_take = np.arange(size_per_group.sum(), dtype=SIZE_TYPE_DTYPE) fixup = np.empty_like(size_per_group) fixup[0] = 0 np.cumsum(size_per_group[:-1], out=fixup[1:]) @@ -1496,11 +1499,11 @@ def sample( # into a numpy array directly, rather than a list. # TODO: this uses the sort-based groupby, could one use hash-based? 
_, offsets, _, group_values = self._grouped() - group_offsets = np.asarray(offsets, dtype=size_type_dtype) + group_offsets = np.asarray(offsets, dtype=SIZE_TYPE_DTYPE) size_per_group = np.diff(group_offsets) if n is not None: samples_per_group = np.broadcast_to( - size_type_dtype.type(n), size_per_group.shape + SIZE_TYPE_DTYPE.type(n), size_per_group.shape ) if not replace and (minsize := size_per_group.min()) < n: raise ValueError( @@ -1513,7 +1516,7 @@ def sample( # which is round-to-nearest, ties to sgn(x) * inf). samples_per_group = np.round( size_per_group * frac, decimals=0 - ).astype(size_type_dtype) + ).astype(SIZE_TYPE_DTYPE) if replace: # We would prefer to use cupy here, but their rng.integers # interface doesn't take array-based low and high @@ -1521,7 +1524,7 @@ def sample( low = 0 high = np.repeat(size_per_group, samples_per_group) rng = np.random.default_rng(seed=random_state) - indices = rng.integers(low, high, dtype=size_type_dtype) + indices = rng.integers(low, high, dtype=SIZE_TYPE_DTYPE) indices += np.repeat(group_offsets[:-1], samples_per_group) else: # Approach: do a segmented argsort of the index array and take @@ -1529,7 +1532,7 @@ def sample( # We will shuffle the group indices and then pick them out # from the grouped dataframe index. nrows = len(group_values) - indices = cp.arange(nrows, dtype=size_type_dtype) + indices = cp.arange(nrows, dtype=SIZE_TYPE_DTYPE) if len(size_per_group) < 500: # Empirically shuffling with cupy is faster at this scale rs = cp.random.get_random_state() @@ -1553,7 +1556,7 @@ def sample( indices = ColumnBase.from_pylibcudf(plc_table.columns()[0]) indices = cp.asarray(indices.data_array_view(mode="read")) # Which indices are we going to want? - want = np.arange(samples_per_group.sum(), dtype=size_type_dtype) + want = np.arange(samples_per_group.sum(), dtype=SIZE_TYPE_DTYPE) scan = np.empty_like(samples_per_group) scan[0] = 0 np.cumsum(samples_per_group[:-1], out=scan[1:]) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 85be8d21d27..0d1bf552982 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -19,7 +19,6 @@ import cudf from cudf import _lib as libcudf -from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default from cudf.api.types import ( _is_non_decimal_numeric_dtype, @@ -53,6 +52,7 @@ from cudf.core.single_column_frame import SingleColumnFrame from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( + SIZE_TYPE_DTYPE, _maybe_convert_to_default_type, find_common_type, is_mixed_with_object_dtype, @@ -339,7 +339,7 @@ def _values(self) -> ColumnBase: else: return column.column_empty(0, dtype=self.dtype) - def _clean_nulls_from_index(self) -> Self: + def _pandas_repr_compatible(self) -> Self: return self def _is_numeric(self) -> bool: @@ -1002,7 +1002,7 @@ def _indices_of(self, value) -> cudf.core.column.NumericalColumn: i = [self._range.index(value)] except ValueError: i = [] - return as_column(i, dtype=size_type_dtype) + return as_column(i, dtype=SIZE_TYPE_DTYPE) def isin(self, values, level=None): if level is not None and level > 0: @@ -1127,15 +1127,9 @@ def _from_data(cls, data: MutableMapping, name: Any = no_default) -> Self: out.name = name return out - @classmethod @_performance_tracking - def _from_data_like_self( - cls, data: MutableMapping, name: Any = no_default - ) -> Self: - out = _index_from_data(data, name) - if name is not no_default: - out.name = name - return out + def _from_data_like_self(self, data: 
MutableMapping) -> Self: + return _index_from_data(data, self.name) @classmethod @_performance_tracking @@ -1354,7 +1348,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): result = as_column( -1, length=len(needle), - dtype=libcudf.types.size_type_dtype, + dtype=SIZE_TYPE_DTYPE, ) if not len(self): @@ -1494,7 +1488,7 @@ def __repr__(self) -> str: if isinstance(self._values, StringColumn): output = repr(self.to_pandas(nullable=True)) else: - output = repr(self._clean_nulls_from_index().to_pandas()) + output = repr(self._pandas_repr_compatible().to_pandas()) # We should remove all the single quotes # from the output due to the type-cast to # object dtype happening above. @@ -1650,20 +1644,6 @@ def __contains__(self, item) -> bool: hash(item) return item in self._column - def _clean_nulls_from_index(self) -> Index: - if self._values.has_nulls(): - fill_value = ( - str(cudf.NaT) - if isinstance(self, (DatetimeIndex, TimedeltaIndex)) - else str(cudf.NA) - ) - return Index._from_column( - self._column.astype("str").fillna(fill_value), - name=self.name, - ) - - return self - def any(self) -> bool: return self._column.any() @@ -2347,8 +2327,7 @@ def microsecond(self) -> Index: # Need to manually promote column to int32 because # pandas-matching binop behaviour requires that this # __mul__ returns an int16 column. - self._column.millisecond.astype("int32") - * cudf.Scalar(1000, dtype="int32") + self._column.millisecond.astype("int32") * np.int32(1000) ) + self._column.microsecond, name=self.name, @@ -3615,7 +3594,7 @@ def _is_interval(self) -> bool: def _is_boolean(self) -> bool: return False - def _clean_nulls_from_index(self) -> Self: + def _pandas_repr_compatible(self) -> Self: return self @property diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 6854cb02aa5..4c6f8a9c152 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. """Base class for Frame types that have an index.""" from __future__ import annotations @@ -60,6 +60,7 @@ from cudf.utils import docutils, ioutils from cudf.utils._numba import _CUDFNumbaConfig from cudf.utils.docutils import copy_docstring +from cudf.utils.dtypes import SIZE_TYPE_DTYPE from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import _warn_no_dask_cudf @@ -2836,16 +2837,22 @@ def hash_values( Parameters ---------- - method : {'murmur3', 'md5', 'xxhash64'}, default 'murmur3' + method : {'murmur3', 'xxhash32', 'xxhash64', 'md5', 'sha1', 'sha224', 'sha256', 'sha384', 'sha512'}, default 'murmur3' Hash function to use: * murmur3: MurmurHash3 hash function - * md5: MD5 hash function + * xxhash32: xxHash32 hash function * xxhash64: xxHash64 hash function + * md5: MD5 hash function + * sha1: SHA-1 hash function + * sha224: SHA-224 hash function + * sha256: SHA-256 hash function + * sha384: SHA-384 hash function + * sha512: SHA-512 hash function seed : int, optional Seed value to use for the hash function. This parameter is only - supported for 'murmur3' and 'xxhash64'. + supported for 'murmur3', 'xxhash32', and 'xxhash64'. 
Returns @@ -2900,7 +2907,7 @@ def hash_values( 2 fe061786ea286a515b772d91b0dfcd70 dtype: object """ - seed_hash_methods = {"murmur3", "xxhash64"} + seed_hash_methods = {"murmur3", "xxhash32", "xxhash64"} if seed is None: seed = 0 elif method not in seed_hash_methods: @@ -2914,6 +2921,8 @@ def hash_values( ) if method == "murmur3": plc_column = plc.hashing.murmurhash3_x86_32(plc_table, seed) + elif method == "xxhash32": + plc_column = plc.hashing.xxhash_32(plc_table, seed) elif method == "xxhash64": plc_column = plc.hashing.xxhash_64(plc_table, seed) elif method == "md5": @@ -3026,7 +3035,7 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self: NumericalColumn, as_column( range(start, stop, stride), - dtype=libcudf.types.size_type_dtype, + dtype=SIZE_TYPE_DTYPE, ), ), len(self), @@ -3247,7 +3256,7 @@ def duplicated( ) distinct = libcudf.column.Column.from_pylibcudf(plc_column) result = copying.scatter( - [cudf.Scalar(False, dtype=bool)], + [cudf.Scalar(False)], distinct, [as_column(True, length=len(self), dtype=bool)], bounds_check=False, @@ -4402,6 +4411,12 @@ def _apply_boolean_mask(self, boolean_mask: BooleanMask, keep_index=True): index_names=self.index.names if keep_index else None, ) + def _pandas_repr_compatible(self) -> Self: + """Return Self but with columns prepared for a pandas-like repr.""" + result = super()._pandas_repr_compatible() + result.index = self.index._pandas_repr_compatible() + return result + def take(self, indices, axis=0): """Return a new frame containing the rows specified by *indices*. diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 6e965ceca66..ce7edc8fdbe 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations from typing import Any @@ -7,7 +7,6 @@ import cudf from cudf import _lib as libcudf -from cudf._lib.types import size_type_dtype from cudf.core._internals import sorting from cudf.core.buffer import acquire_spill_lock from cudf.core.copy_types import GatherMap @@ -17,6 +16,7 @@ _IndexIndexer, _match_join_keys, ) +from cudf.utils.dtypes import SIZE_TYPE_DTYPE class Merge: @@ -243,7 +243,7 @@ def _gather_maps(self, left_cols, right_cols): # tables, we gather from iota on both right and left, and then # sort the gather maps with those two columns as key. key_order = [ - cudf.core.column.as_column(range(n), dtype=size_type_dtype).take( + cudf.core.column.as_column(range(n), dtype=SIZE_TYPE_DTYPE).take( map_, nullify=null, check_bounds=False ) for map_, n, null in zip(maps, lengths, nullify) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 1e613e49ffc..64ec099cb39 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. 
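A short usage sketch of the widened hash_values surface documented above (dtype expectations come from the updated tests later in this diff):

import cudf

df = cudf.DataFrame({"a": [1, 2, 3]})
h32 = df.hash_values(method="xxhash32", seed=42)  # uint32 row hashes
h64 = df.hash_values(method="xxhash64", seed=42)  # uint64 row hashes
digests = df.hash_values(method="sha256")         # hex-digest strings
# Only murmur3, xxhash32, and xxhash64 consume a seed; per the tests,
# passing one with md5/sha* methods warns and the seed is ignored.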
from __future__ import annotations @@ -17,7 +17,6 @@ import cudf import cudf._lib as libcudf -from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default from cudf.api.types import is_integer, is_list_like, is_object_dtype, is_scalar from cudf.core import column @@ -34,7 +33,7 @@ ensure_index, ) from cudf.core.join._join_helpers import _match_join_keys -from cudf.utils.dtypes import is_column_like +from cudf.utils.dtypes import SIZE_TYPE_DTYPE, is_column_like from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import NotIterable, _external_only_api, _is_same_name @@ -199,7 +198,7 @@ def __init__( ) if lo == -1: # Now we can gather and insert null automatically - code[code == -1] = np.iinfo(size_type_dtype).min + code[code == -1] = np.iinfo(SIZE_TYPE_DTYPE).min result_col = level._column.take(code, nullify=True) source_data[i] = result_col._with_type_metadata(level.dtype) @@ -361,6 +360,13 @@ def _from_data( name=name, ) + @_performance_tracking + def _from_data_like_self(self, data: MutableMapping) -> Self: + mi = type(self)._from_data(data, name=self.name) + if mi.nlevels == self.nlevels: + mi.names = self.names + return mi + @classmethod def _simple_new( cls, @@ -1571,11 +1577,11 @@ def droplevel(self, level=-1) -> Self | cudf.Index: def to_pandas( self, *, nullable: bool = False, arrow_type: bool = False ) -> pd.MultiIndex: - # cudf uses np.iinfo(size_type_dtype).min as missing code + # cudf uses np.iinfo(SIZE_TYPE_DTYPE).min as missing code # pandas uses -1 as missing code pd_codes = ( code.find_and_replace( - column.as_column(np.iinfo(size_type_dtype).min, length=1), + column.as_column(np.iinfo(SIZE_TYPE_DTYPE).min, length=1), column.as_column(-1, length=1), ) for code in self._codes @@ -1753,16 +1759,6 @@ def nunique(self, dropna: bool = True) -> int: mi = self.dropna(how="all") if dropna else self return len(mi.unique()) - def _clean_nulls_from_index(self) -> Self: - """ - Convert all na values(if any) in MultiIndex object - to `` as a preprocessing step to `__repr__` methods. - """ - index_df = self.to_frame(index=False, name=list(range(self.nlevels))) - return MultiIndex.from_frame( - index_df._clean_nulls_from_dataframe(index_df), names=self.names - ) - @_performance_tracking def memory_usage(self, deep: bool = False) -> int: usage = sum(col.memory_usage for col in self._columns) @@ -1906,7 +1902,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): result = column.as_column( -1, length=len(target), - dtype=libcudf.types.size_type_dtype, + dtype=SIZE_TYPE_DTYPE, ) if not len(self): return _return_get_indexer_result(result.values) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 0abd42d4d4e..eedd777aafe 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. 
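The multiindex.py hunks above keep the missing-code sentinel logic intact under the new constant name, and the sentinel translation performed by to_pandas() is small enough to sketch directly (SIZE_TYPE_DTYPE assumed to be int32, as in the sketch earlier):

import numpy as np

SIZE_TYPE_DTYPE = np.dtype("int32")            # assumed definition
cudf_missing = np.iinfo(SIZE_TYPE_DTYPE).min   # -2147483648
pandas_missing = -1
# to_pandas() find-and-replaces cudf_missing with pandas_missing in every
# codes column before handing the codes to pd.MultiIndex.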
from __future__ import annotations import itertools @@ -12,13 +12,12 @@ import cudf from cudf._lib.column import Column -from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default from cudf.api.types import is_scalar from cudf.core._compat import PANDAS_LT_300 from cudf.core.column import ColumnBase, as_column, column_empty from cudf.core.column_accessor import ColumnAccessor -from cudf.utils.dtypes import min_unsigned_type +from cudf.utils.dtypes import SIZE_TYPE_DTYPE, min_unsigned_type if TYPE_CHECKING: from cudf._typing import Dtype @@ -1333,10 +1332,10 @@ def _one_hot_encode_column( else: column = column._get_decategorized_column() # type: ignore[attr-defined] - if column.size * categories.size >= np.iinfo(size_type_dtype).max: + if column.size * categories.size >= np.iinfo(SIZE_TYPE_DTYPE).max: raise ValueError( "Size limitation exceeded: column.size * category.size < " - f"np.iinfo({size_type_dtype}).max. Consider reducing " + f"np.iinfo({SIZE_TYPE_DTYPE}).max. Consider reducing " "size of category" ) result_labels = ( diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index 80dd0921f9c..7d246960cc9 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -178,13 +178,13 @@ def dtype(self): def is_valid(self): if not self._is_host_value_current: self._device_value_to_host() - return not cudf._lib.scalar._is_null_host_scalar(self._host_value) + return not cudf.utils.utils._is_null_host_scalar(self._host_value) def _device_value_to_host(self): self._host_value = self._device_value._to_host_scalar() def _preprocess_host_value(self, value, dtype): - valid = not cudf._lib.scalar._is_null_host_scalar(value) + valid = not cudf.utils.utils._is_null_host_scalar(value) if isinstance(value, list): if dtype is not None: diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 49c2c8cf387..805f9f9a9f9 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1449,35 +1449,16 @@ def __repr__(self): warnings.simplefilter("ignore", FutureWarning) preprocess = cudf.concat([top, bottom]) else: - preprocess = self.copy() - preprocess.index = preprocess.index._clean_nulls_from_index() - if ( - preprocess.nullable - and not isinstance( - preprocess.dtype, - ( - cudf.CategoricalDtype, - cudf.ListDtype, - cudf.StructDtype, - cudf.core.dtypes.DecimalDtype, - ), - ) - ) or preprocess.dtype.kind == "m": - fill_value = ( - str(cudf.NaT) - if preprocess.dtype.kind in "mM" - else str(cudf.NA) - ) - output = repr( - preprocess.astype("str").fillna(fill_value).to_pandas() - ) - elif isinstance(preprocess.dtype, cudf.CategoricalDtype): + preprocess = self + if isinstance(preprocess.dtype, cudf.CategoricalDtype): min_rows = ( height if pd.get_option("display.min_rows") == 0 else pd.get_option("display.min_rows") ) show_dimensions = pd.get_option("display.show_dimensions") + preprocess = preprocess.copy(deep=False) + preprocess.index = preprocess.index._pandas_repr_compatible() if preprocess.dtype.categories.dtype.kind == "f": pd_series = ( preprocess.astype("str") @@ -1502,7 +1483,7 @@ def __repr__(self): na_rep=str(cudf.NA), ) else: - output = repr(preprocess.to_pandas()) + output = repr(preprocess._pandas_repr_compatible().to_pandas()) lines = output.split("\n") if isinstance(preprocess.dtype, cudf.CategoricalDtype): @@ -4125,8 +4106,8 @@ def microsecond(self) -> Series: # Need to manually promote column to int32 because # pandas-matching binop behaviour requires that this 
# __mul__ returns an int16 column. - extra = self.series._column.millisecond.astype("int32") * cudf.Scalar( - 1000, dtype="int32" + extra = self.series._column.millisecond.astype("int32") * np.int32( + 1000 ) return self._return_result_like_self(micro + extra) diff --git a/python/cudf/cudf/core/tokenize_vocabulary.py b/python/cudf/cudf/core/tokenize_vocabulary.py index fb8b9b3131c..58dabc85491 100644 --- a/python/cudf/cudf/core/tokenize_vocabulary.py +++ b/python/cudf/cudf/core/tokenize_vocabulary.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -42,9 +42,8 @@ def tokenize( """ if delimiter is None: delimiter = "" - delim = cudf.Scalar(delimiter, dtype="str") result = text._column.tokenize_with_vocabulary( - self.vocabulary, delim, default_id + self.vocabulary, delimiter, default_id ) return cudf.Series._from_column(result) diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index 2f8a6d9e5e7..e2c332f34f5 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -1,10 +1,11 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION +# Copyright (c) 2020-2025, NVIDIA CORPORATION from __future__ import annotations import warnings from typing import TYPE_CHECKING import numba +import numpy as np import pandas as pd from pandas.api.indexers import BaseIndexer @@ -273,12 +274,8 @@ def _apply_agg_column(self, source_column, agg_name): end = as_column(end, dtype="int32") idx = as_column(range(len(start))) - preceding_window = (idx - start + cudf.Scalar(1, "int32")).astype( - "int32" - ) - following_window = (end - idx - cudf.Scalar(1, "int32")).astype( - "int32" - ) + preceding_window = (idx - start + np.int32(1)).astype("int32") + following_window = (end - idx - np.int32(1)).astype("int32") window = None else: preceding_window = as_column(self.window) diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index 6d617cbf38e..7e8468c8e8a 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. from __future__ import annotations import errno @@ -16,11 +16,13 @@ import cudf from cudf._lib.column import Column -from cudf._lib.types import dtype_to_pylibcudf_type from cudf.api.types import is_hashable, is_scalar from cudf.core.buffer import acquire_spill_lock from cudf.utils import ioutils -from cudf.utils.dtypes import _maybe_convert_to_default_type +from cudf.utils.dtypes import ( + _maybe_convert_to_default_type, + dtype_to_pylibcudf_type, +) from cudf.utils.performance_tracking import _performance_tracking _CSV_HEX_TYPE_MAP = { diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index ff326e09315..16c7d189dfd 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. 
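Several hunks above (Index.microsecond, Series.dt.microsecond, the rolling-window bounds) replace cudf.Scalar(x, dtype="int32") operands with plain np.int32(x). The point is that a NumPy scalar keeps the int32 result dtype without materialising a device scalar; a sketch of the behaviour being relied on:

import numpy as np

import cudf

col = cudf.Series([1, 2, 3], dtype="int32")
out = col * np.int32(1000)
# The NumPy scalar operand does not widen the column dtype, matching the
# old cudf.Scalar(1000, dtype="int32") arithmetic.
assert out.dtype == np.dtype("int32")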
from __future__ import annotations import os @@ -14,10 +14,12 @@ import cudf from cudf._lib.column import Column -from cudf._lib.types import dtype_to_pylibcudf_type from cudf.core.buffer import acquire_spill_lock from cudf.utils import ioutils -from cudf.utils.dtypes import _maybe_convert_to_default_type +from cudf.utils.dtypes import ( + _maybe_convert_to_default_type, + dtype_to_pylibcudf_type, +) if TYPE_CHECKING: from cudf.core.column import ColumnBase diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index f3124552fd1..0ac2950a22b 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. from __future__ import annotations import itertools @@ -11,11 +11,11 @@ import cudf from cudf._lib.column import Column -from cudf._lib.types import dtype_to_pylibcudf_type from cudf.api.types import is_list_like from cudf.core.buffer import acquire_spill_lock from cudf.core.index import _index_from_data from cudf.utils import ioutils +from cudf.utils.dtypes import dtype_to_pylibcudf_type try: import ujson as json # type: ignore[import-untyped] diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.snappy.RLEv2.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.snappy.RLEv2.orc new file mode 100644 index 00000000000..a0ea4fbbfc2 Binary files /dev/null and b/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.snappy.RLEv2.orc differ diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.uncompressed.RLEv2.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.uncompressed.RLEv2.orc new file mode 100644 index 00000000000..8a7969cdbbb Binary files /dev/null and b/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.uncompressed.RLEv2.orc differ diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 11a9b398b50..f3cf8e36a5b 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. import array as arr import contextlib @@ -1440,6 +1440,7 @@ def test_assign_callable(mapping): "sha256", "sha384", "sha512", + "xxhash32", "xxhash64", ], ) @@ -1447,6 +1448,7 @@ def test_assign_callable(mapping): def test_dataframe_hash_values(nrows, method, seed): warning_expected = seed is not None and method not in { "murmur3", + "xxhash32", "xxhash64", } potential_warning = ( @@ -1472,6 +1474,7 @@ def test_dataframe_hash_values(nrows, method, seed): "sha256": object, "sha384": object, "sha512": object, + "xxhash32": np.uint32, "xxhash64": np.uint64, } assert out.dtype == expected_dtypes[method] @@ -1486,7 +1489,7 @@ def test_dataframe_hash_values(nrows, method, seed): assert_eq(gdf["a"].hash_values(method=method, seed=seed), out_one) -@pytest.mark.parametrize("method", ["murmur3", "xxhash64"]) +@pytest.mark.parametrize("method", ["murmur3", "xxhash32", "xxhash64"]) def test_dataframe_hash_values_seed(method): gdf = cudf.DataFrame() data = np.arange(10) @@ -1500,6 +1503,34 @@ def test_dataframe_hash_values_seed(method): assert_neq(out_one, out_two) +def test_dataframe_hash_values_xxhash32(): + # xxhash32 has no built-in implementation in Python and we don't want to + # add a testing dependency, so we use regression tests against known good + # values. 
+ gdf = cudf.DataFrame({"a": [0.0, 1.0, 2.0, np.inf, np.nan]}) + gdf["b"] = -gdf["a"] + out_a = gdf["a"].hash_values(method="xxhash32", seed=0) + expected_a = cudf.Series( + [3736311059, 2307980487, 2906647130, 746578903, 4294967295], + dtype=np.uint32, + ) + assert_eq(out_a, expected_a) + + out_b = gdf["b"].hash_values(method="xxhash32", seed=42) + expected_b = cudf.Series( + [1076387279, 2261349915, 531498073, 650869264, 4294967295], + dtype=np.uint32, + ) + assert_eq(out_b, expected_b) + + out_df = gdf.hash_values(method="xxhash32", seed=0) + expected_df = cudf.Series( + [1223721700, 2885793241, 1920811472, 1146715602, 4294967295], + dtype=np.uint32, + ) + assert_eq(out_df, expected_df) + + def test_dataframe_hash_values_xxhash64(): # xxhash64 has no built-in implementation in Python and we don't want to # add a testing dependency, so we use regression tests against known good diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index da0aa5be6f5..b1f81edfc54 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. import functools import operator @@ -14,6 +14,7 @@ from cudf.core.column.column import column_empty from cudf.testing import assert_eq from cudf.testing._utils import DATETIME_TYPES, NUMERIC_TYPES, TIMEDELTA_TYPES +from cudf.utils.dtypes import cudf_dtype_to_pa_type @pytest.mark.parametrize( @@ -423,7 +424,9 @@ def test_get_ind_sequence(): def test_contains_scalar(data, scalar, expect): sr = cudf.Series(data) expect = cudf.Series(expect) - got = sr.list.contains(cudf.Scalar(scalar, sr.dtype.element_type)) + got = sr.list.contains( + pa.scalar(scalar, type=cudf_dtype_to_pa_type(sr.dtype.element_type)) + ) assert_eq(expect, got) @@ -455,7 +458,9 @@ def test_contains_scalar(data, scalar, expect): def test_contains_null_search_key(data, expect): sr = cudf.Series(data) expect = cudf.Series(expect, dtype="bool") - got = sr.list.contains(cudf.Scalar(cudf.NA, sr.dtype.element_type)) + got = sr.list.contains( + pa.scalar(None, type=cudf_dtype_to_pa_type(sr.dtype.element_type)) + ) assert_eq(expect, got) @@ -518,12 +523,12 @@ def test_contains_invalid(data, scalar): ), ( [["d", None, "e"], [None, "f"], []], - cudf.Scalar(cudf.NA, "O"), + pa.scalar(None, type=pa.string()), [None, None, None], ), ( [None, [10, 9, 8], [5, 8, None]], - cudf.Scalar(cudf.NA, "int64"), + pa.scalar(None, type=pa.int64()), [None, None, None], ), ], @@ -532,7 +537,11 @@ def test_index(data, search_key, expect): sr = cudf.Series(data) expect = cudf.Series(expect, dtype="int32") if is_scalar(search_key): - got = sr.list.index(cudf.Scalar(search_key, sr.dtype.element_type)) + got = sr.list.index( + pa.scalar( + search_key, type=cudf_dtype_to_pa_type(sr.dtype.element_type) + ) + ) else: got = sr.list.index( cudf.Series(search_key, dtype=sr.dtype.element_type) diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index c4b4ef60184..fe143e66407 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. 
import datetime import decimal @@ -1970,3 +1970,25 @@ def test_row_group_alignment(datadir): got = cudf.read_orc(buffer) assert_eq(expected, got) + + +@pytest.mark.parametrize( + "inputfile", + [ + "TestOrcFile.timestamp.desynced.uncompressed.RLEv2.orc", + "TestOrcFile.timestamp.desynced.snappy.RLEv2.orc", + ], +) +def test_orc_reader_desynced_timestamp(datadir, inputfile): + # Test a special case where the DATA stream (second) in a TIMESTAMP column + # is progressed faster than the SECONDARY stream (nanosecond) at the start of a row + # group. In this case, the "run cache manager" in the decoder kernel is used to + # orchestrate the dual-stream processing. + # For more information, see https://github.com/rapidsai/cudf/issues/17155. + + path = datadir / inputfile + + expect = pd.read_orc(path) + got = cudf.read_orc(path) + + assert_frame_equal(cudf.from_pandas(expect), got) diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index bf0c97adb00..2cb742727cc 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. import textwrap @@ -618,9 +618,9 @@ def test_timedelta_series_s_us_repr(data, dtype): cudf.Series([1000000, 200000, 3000000], dtype="timedelta64[ns]"), textwrap.dedent( """ - 0 0 days 00:00:00.001000000 - 1 0 days 00:00:00.000200000 - 2 0 days 00:00:00.003000000 + 0 0 days 00:00:00.001000 + 1 0 days 00:00:00.000200 + 2 0 days 00:00:00.003000 dtype: timedelta64[ns] """ ), @@ -710,12 +710,12 @@ def test_timedelta_series_s_us_repr(data, dtype): ), textwrap.dedent( """ - 0 0 days 00:00:00.012 - 1 0 days 00:00:00.012 - 2 0 days 00:00:00.022 - 3 0 days 00:00:00.343 - 4 0 days 01:12:33.534 - 5 0 days 00:07:15.342 + 0 0 days 00:00:00.012000 + 1 0 days 00:00:00.012000 + 2 0 days 00:00:00.022000 + 3 0 days 00:00:00.343000 + 4 0 days 01:12:33.534000 + 5 0 days 00:07:15.342000 dtype: timedelta64[ms] """ ), @@ -745,13 +745,13 @@ def test_timedelta_series_s_us_repr(data, dtype): ), textwrap.dedent( """ - 0 0 days 00:00:00.001 - 1 0 days 00:00:01.132 - 2 0 days 06:27:03.231 - 3 0 days 00:00:00.233 - 4 0 days 00:00:00 - 5 0 days 00:00:00.332 - 6 0 days 00:00:00.323 + 0 0 days 00:00:00.001000 + 1 0 days 00:00:01.132000 + 2 0 days 06:27:03.231000 + 3 0 days 00:00:00.233000 + 4 0 days 00:00:00 + 5 0 days 00:00:00.332000 + 6 0 days 00:00:00.323000 dtype: timedelta64[ms] """ ), @@ -771,13 +771,13 @@ def test_timedelta_series_s_us_repr(data, dtype): ), textwrap.dedent( """ - 0 157937 days 02:23:52.432 - 1 1 days 13:25:36.784 - 2 2 days 20:09:05.345 - 3 2 days 14:03:52.411 - 4 11573 days 23:39:03.241 - 5 42 days 01:35:48.734 - 6 0 days 00:00:23.234 + 0 157937 days 02:23:52.432000 + 1 1 days 13:25:36.784000 + 2 2 days 20:09:05.345000 + 3 2 days 14:03:52.411000 + 4 11573 days 23:39:03.241000 + 5 42 days 01:35:48.734000 + 6 0 days 00:00:23.234000 dtype: timedelta64[ms] """ ), @@ -824,13 +824,13 @@ def test_timedelta_series_s_us_repr(data, dtype): ), textwrap.dedent( """ - 0 157937 days 02:23:52.432 - 1 1 days 13:25:36.784 - 2 2 days 20:09:05.345 - 3 2 days 14:03:52.411 - 4 11573 days 23:39:03.241 - 5 42 days 01:35:48.734 - 6 0 days 00:00:23.234 + 0 157937 days 02:23:52.432000 + 1 1 days 13:25:36.784000 + 2 2 days 20:09:05.345000 + 3 2 days 14:03:52.411000 + 4 11573 days 23:39:03.241000 + 5 42 days 01:35:48.734000 + 6 0 days 00:00:23.234000 Name: abc, dtype: timedelta64[ms] """ ), diff --git a/python/cudf/cudf/utils/_numba.py 
b/python/cudf/cudf/utils/_numba.py index d9dde58d998..574170d28c6 100644 --- a/python/cudf/cudf/utils/_numba.py +++ b/python/cudf/cudf/utils/_numba.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. import glob import os @@ -130,9 +130,7 @@ def _setup_numba(): if driver_version < (12, 0): patch_numba_linker_cuda_11() else: - from pynvjitlink.patch import patch_numba_linker - - patch_numba_linker() + numba_config.CUDA_ENABLE_PYNVJITLINK = True class _CUDFNumbaConfig: diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index ca8f9cac2d0..9e932acb5fa 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations import datetime @@ -11,6 +11,8 @@ import pyarrow as pa from pandas.core.dtypes.common import infer_dtype_from_object +import pylibcudf as plc + import cudf if TYPE_CHECKING: @@ -151,7 +153,7 @@ def cudf_dtype_from_pydata_dtype(dtype): return cudf.core.dtypes.Decimal64Dtype elif cudf.api.types.is_decimal128_dtype(dtype): return cudf.core.dtypes.Decimal128Dtype - elif dtype in cudf._lib.types.SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES: + elif dtype in SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES: return dtype.type return infer_dtype_from_object(dtype) @@ -198,7 +200,7 @@ def to_cudf_compatible_scalar(val, dtype=None): If `val` is None, returns None. """ - if cudf._lib.scalar._is_null_host_scalar(val) or isinstance( + if cudf.utils.utils._is_null_host_scalar(val) or isinstance( val, cudf.Scalar ): return val @@ -604,6 +606,66 @@ def _get_base_dtype(dtype: pd.DatetimeTZDtype) -> np.dtype: return dtype.base +def dtype_to_pylibcudf_type(dtype) -> plc.DataType: + if isinstance(dtype, cudf.ListDtype): + return plc.DataType(plc.TypeId.LIST) + elif isinstance(dtype, cudf.StructDtype): + return plc.DataType(plc.TypeId.STRUCT) + elif isinstance(dtype, cudf.Decimal128Dtype): + tid = plc.TypeId.DECIMAL128 + return plc.DataType(tid, -dtype.scale) + elif isinstance(dtype, cudf.Decimal64Dtype): + tid = plc.TypeId.DECIMAL64 + return plc.DataType(tid, -dtype.scale) + elif isinstance(dtype, cudf.Decimal32Dtype): + tid = plc.TypeId.DECIMAL32 + return plc.DataType(tid, -dtype.scale) + # libcudf types don't support timezones so convert to the base type + elif isinstance(dtype, pd.DatetimeTZDtype): + dtype = _get_base_dtype(dtype) + else: + dtype = np.dtype(dtype) + return plc.DataType(SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES[dtype]) + + +SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES = { + np.dtype("int8"): plc.types.TypeId.INT8, + np.dtype("int16"): plc.types.TypeId.INT16, + np.dtype("int32"): plc.types.TypeId.INT32, + np.dtype("int64"): plc.types.TypeId.INT64, + np.dtype("uint8"): plc.types.TypeId.UINT8, + np.dtype("uint16"): plc.types.TypeId.UINT16, + np.dtype("uint32"): plc.types.TypeId.UINT32, + np.dtype("uint64"): plc.types.TypeId.UINT64, + np.dtype("float32"): plc.types.TypeId.FLOAT32, + np.dtype("float64"): plc.types.TypeId.FLOAT64, + np.dtype("datetime64[s]"): plc.types.TypeId.TIMESTAMP_SECONDS, + np.dtype("datetime64[ms]"): plc.types.TypeId.TIMESTAMP_MILLISECONDS, + np.dtype("datetime64[us]"): plc.types.TypeId.TIMESTAMP_MICROSECONDS, + np.dtype("datetime64[ns]"): plc.types.TypeId.TIMESTAMP_NANOSECONDS, + np.dtype("object"): plc.types.TypeId.STRING, + np.dtype("bool"): plc.types.TypeId.BOOL8, + np.dtype("timedelta64[s]"): plc.types.TypeId.DURATION_SECONDS, + 
np.dtype("timedelta64[ms]"): plc.types.TypeId.DURATION_MILLISECONDS, + np.dtype("timedelta64[us]"): plc.types.TypeId.DURATION_MICROSECONDS, + np.dtype("timedelta64[ns]"): plc.types.TypeId.DURATION_NANOSECONDS, +} +PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES = { + plc_type: np_type + for np_type, plc_type in SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES.items() +} +# There's no equivalent to EMPTY in cudf. We translate EMPTY +# columns from libcudf to ``int8`` columns of all nulls in Python. +# ``int8`` is chosen because it uses the least amount of memory. +PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.EMPTY] = np.dtype("int8") +PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.STRUCT] = np.dtype( + "object" +) +PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.LIST] = np.dtype("object") + + +SIZE_TYPE_DTYPE = PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.SIZE_TYPE_ID] + # Type dispatch loops similar to what are found in `np.add.types` # In NumPy, whether or not an op can be performed between two # operands is determined by checking to see if NumPy has a c/c++ diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index c83c1cbe895..0adaaa60654 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -341,6 +341,15 @@ def is_na_like(obj): return obj is None or obj is cudf.NA or obj is cudf.NaT +def _is_null_host_scalar(slr) -> bool: + # slr is NA like or NaT like + return ( + is_na_like(slr) + or (isinstance(slr, (np.datetime64, np.timedelta64)) and np.isnat(slr)) + or slr is pd.NaT + ) + + def _warn_no_dask_cudf(fn): @functools.wraps(fn) def wrapper(self): diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 2fdf6b34b8f..c6a5887f85d 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. [build-system] build-backend = "rapids_build_backend.build" @@ -24,7 +24,7 @@ dependencies = [ "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", "libcudf==25.2.*,>=0.0.0a0", - "numba-cuda>=0.0.13,<0.0.18", + "numba-cuda>=0.2.0,<0.3.0", "numpy>=1.23,<3.0a0", "nvtx>=0.2.1", "packaging", diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 1c1d4860eec..fd56329a48e 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: Apache-2.0 """ DSL nodes for the LogicalPlan of polars. 
@@ -34,9 +34,11 @@ from cudf_polars.utils.versions import POLARS_VERSION_GT_112 if TYPE_CHECKING: - from collections.abc import Callable, Hashable, MutableMapping, Sequence + from collections.abc import Callable, Hashable, Iterable, MutableMapping, Sequence from typing import Literal + from polars.polars import _expr_nodes as pl_expr + from cudf_polars.typing import Schema @@ -1019,7 +1021,27 @@ class ConditionalJoin(IR): __slots__ = ("ast_predicate", "options", "predicate") _non_child = ("schema", "predicate", "options") predicate: expr.Expr - options: tuple + """Expression predicate to join on""" + options: tuple[ + tuple[ + str, + pl_expr.Operator | Iterable[pl_expr.Operator], + ], + bool, + tuple[int, int] | None, + str, + bool, + Literal["none", "left", "right", "left_right", "right_left"], + ] + """ + tuple of options: + - predicates: tuple of ir join type (eg. ie_join) and (In)Equality conditions + - join_nulls: do nulls compare equal? + - slice: optional slice to perform after joining. + - suffix: string suffix for right columns if names match + - coalesce: should key columns be coalesced (only makes sense for outer joins) + - maintain_order: which DataFrame row order to preserve, if any + """ def __init__( self, schema: Schema, predicate: expr.Expr, options: tuple, left: IR, right: IR @@ -1029,15 +1051,16 @@ def __init__( self.options = options self.children = (left, right) self.ast_predicate = to_ast(predicate) - _, join_nulls, zlice, suffix, coalesce = self.options + _, join_nulls, zlice, suffix, coalesce, maintain_order = self.options # Preconditions from polars assert not join_nulls assert not coalesce + assert maintain_order == "none" if self.ast_predicate is None: raise NotImplementedError( f"Conditional join with predicate {predicate}" ) # pragma: no cover; polars never delivers expressions we can't handle - self._non_child_args = (self.ast_predicate, zlice, suffix) + self._non_child_args = (self.ast_predicate, zlice, suffix, maintain_order) @classmethod def do_evaluate( @@ -1045,6 +1068,7 @@ def do_evaluate( predicate: plc.expressions.Expression, zlice: tuple[int, int] | None, suffix: str, + maintain_order: Literal["none", "left", "right", "left_right", "right_left"], left: DataFrame, right: DataFrame, ) -> DataFrame: @@ -1088,6 +1112,7 @@ class Join(IR): tuple[int, int] | None, str, bool, + Literal["none", "left", "right", "left_right", "right_left"], ] """ tuple of options: @@ -1096,6 +1121,7 @@ class Join(IR): - slice: optional slice to perform after joining. 
- suffix: string suffix for right columns if names match - coalesce: should key columns be coalesced (only makes sense for outer joins) + - maintain_order: which DataFrame row order to preserve, if any """ def __init__( @@ -1113,6 +1139,9 @@ def __init__( self.options = options self.children = (left, right) self._non_child_args = (self.left_on, self.right_on, self.options) + # TODO: Implement maintain_order + if options[5] != "none": + raise NotImplementedError("maintain_order not implemented yet") if any( isinstance(e.value, expr.Literal) for e in itertools.chain(self.left_on, self.right_on) @@ -1222,12 +1251,13 @@ def do_evaluate( tuple[int, int] | None, str, bool, + Literal["none", "left", "right", "left_right", "right_left"], ], left: DataFrame, right: DataFrame, ) -> DataFrame: """Evaluate and return a dataframe.""" - how, join_nulls, zlice, suffix, coalesce = options + how, join_nulls, zlice, suffix, coalesce, _ = options if how == "cross": # Separate implementation, since cross_join returns the # result, not the gather maps diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 37cf36dc4dd..2138ac0c700 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: Apache-2.0 """Translate polars IR representation to ours.""" @@ -84,7 +84,7 @@ def translate_ir(self, *, n: int | None = None) -> ir.IR: # IR is versioned with major.minor, minor is bumped for backwards # compatible changes (e.g. adding new nodes), major is bumped for # incompatible changes (e.g. renaming nodes). - if (version := self.visitor.version()) >= (4, 0): + if (version := self.visitor.version()) >= (4, 3): e = NotImplementedError( f"No support for polars IR {version=}" ) # pragma: no cover; no such version for now. diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index 7a759eea2e9..c16df320ceb 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. 
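# Shape of the six-element options tuple that Join.do_evaluate unpacks above
# (values are illustrative, not taken from a real polars plan):
how, join_nulls, zlice, suffix, coalesce, maintain_order = (
    "inner",   # join type
    False,     # join_nulls: do nulls compare equal?
    None,      # slice: optional post-join slice
    "_right",  # suffix for right columns on name collisions
    False,     # coalesce key columns (outer joins only)
    "none",    # maintain_order: no row-order guarantee requested
)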
# SPDX-License-Identifier: Apache-2.0 """Plugin for running polars test suite setting GPU engine as default.""" @@ -123,6 +123,11 @@ def pytest_configure(config: pytest.Config) -> None: "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_parquet-write_parquet]": "Need to add include_file_path to IR", "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_csv-write_csv]": "Need to add include_file_path to IR", "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_ndjson-write_ndjson]": "Need to add include_file_path to IR", + "tests/unit/io/test_write.py::test_write_async[read_parquet-write_parquet]": "Need to add include_file_path to IR", + "tests/unit/io/test_write.py::test_write_async[-write_csv]": "Need to add include_file_path to IR", + "tests/unit/io/test_write.py::test_write_async[read_parquet-]": "Need to add include_file_path to IR", + "tests/unit/io/test_write.py::test_write_async[-0]": "Need to add include_file_path to IR", + "tests/unit/io/test_write.py::test_write_async[-2]": "Need to add include_file_path to IR", "tests/unit/lazyframe/test_engine_selection.py::test_engine_import_error_raises[gpu]": "Expect this to pass because cudf-polars is installed", "tests/unit/lazyframe/test_engine_selection.py::test_engine_import_error_raises[engine1]": "Expect this to pass because cudf-polars is installed", "tests/unit/lazyframe/test_lazyframe.py::test_round[dtype1-123.55-1-123.6]": "Rounding midpoints is handled incorrectly", @@ -140,6 +145,22 @@ def pytest_configure(config: pytest.Config) -> None: "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func1-none]": "cudf-polars doesn't nullify division by zero", "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func2-none]": "cudf-polars doesn't nullify division by zero", "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func3-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_array.py::test_array_arithmetic_values[exec_op_with_expr-broadcast_left-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_array.py::test_array_arithmetic_values[exec_op_with_expr-broadcast_right-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_array.py::test_array_arithmetic_values[exec_op_with_expr-broadcast_both-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_array.py::test_array_arithmetic_values[exec_op_with_expr-broadcast_none-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_array.py::test_array_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_left-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_array.py::test_array_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_right-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_array.py::test_array_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_both-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_array.py::test_array_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_none-none]": "cudf-polars doesn't nullify division by zero", + 
"tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr-broadcast_left-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr-broadcast_right-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr-broadcast_both-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr-broadcast_none-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_left-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_right-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_both-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_none-none]": "cudf-polars doesn't nullify division by zero", "tests/unit/operations/test_abs.py::test_abs_duration": "Need to raise for unsupported uops on timelike values", "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input7-expected7-Float32-Float32]": "Mismatching dtypes, needs cudf#15852", "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input10-expected10-Date-output_dtype10]": "Unsupported groupby-agg for a particular dtype", @@ -174,6 +195,19 @@ def pytest_configure(config: pytest.Config) -> None: } +TESTS_TO_SKIP: Mapping[str, str] = { + # On Ubuntu 20.04, the tzdata package contains a bunch of symlinks + # for obsolete timezone names. However, the chrono_tz package that + # polars uses doesn't read /usr/share/zoneinfo, instead packaging + # the current zoneinfo database from IANA. Consequently, when this + # hypothesis-generated test runs and generates timezones from the + # available zoneinfo-reported timezones, we can get an error from + # polars that the requested timezone is unknown. + # Since this is random, just skip it, rather than xfailing. + "tests/unit/lazyframe/test_serde.py::test_lf_serde_roundtrip_binary": "chrono_tz doesn't have all tzdata symlink names", +} + + def pytest_collection_modifyitems( session: pytest.Session, config: pytest.Config, items: list[pytest.Item] ) -> None: @@ -182,5 +216,7 @@ def pytest_collection_modifyitems( # Don't xfail tests if running without fallback return for item in items: - if item.nodeid in EXPECTED_FAILURES: + if item.nodeid in TESTS_TO_SKIP: + item.add_marker(pytest.mark.skip(reason=TESTS_TO_SKIP[item.nodeid])) + elif item.nodeid in EXPECTED_FAILURES: item.add_marker(pytest.mark.xfail(reason=EXPECTED_FAILURES[item.nodeid])) diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index 5904942aea2..9fb9bbf391e 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
[build-system] build-backend = "rapids_build_backend.build" @@ -19,7 +19,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ - "polars>=1.11,<1.15", + "polars>=1.11,<1.18", "pylibcudf==25.2.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py index 2fcbbf21f1c..f1f47bfb9f1 100644 --- a/python/cudf_polars/tests/test_join.py +++ b/python/cudf_polars/tests/test_join.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations @@ -53,6 +53,15 @@ def right(): ) +@pytest.mark.parametrize( + "maintain_order", ["left", "left_right", "right_left", "right"] +) +def test_join_maintain_order_param_unsupported(left, right, maintain_order): + q = left.join(right, on=pl.col("a"), how="inner", maintain_order=maintain_order) + + assert_ir_translation_raises(q, NotImplementedError) + + @pytest.mark.parametrize( "join_expr", [ diff --git a/python/dask_cudf/dask_cudf/__init__.py b/python/dask_cudf/dask_cudf/__init__.py index 20eb2404b77..863102103ed 100644 --- a/python/dask_cudf/dask_cudf/__init__.py +++ b/python/dask_cudf/dask_cudf/__init__.py @@ -1,7 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -import warnings -from importlib import import_module +# Copyright (c) 2018-2025, NVIDIA CORPORATION. import dask.dataframe as dd from dask import config @@ -9,11 +6,16 @@ import cudf -from . import backends # noqa: F401 +from . import backends, io # noqa: F401 +from ._expr.expr import _patch_dask_expr from ._version import __git_commit__, __version__ # noqa: F401 -from .core import DataFrame, Index, Series, concat, from_cudf +from .core import DataFrame, Index, Series, _deprecated_api, concat, from_cudf -QUERY_PLANNING_ON = dd.DASK_EXPR_ENABLED +if not (QUERY_PLANNING_ON := dd._dask_expr_enabled()): + raise ValueError( + "The legacy DataFrame API is not supported in dask_cudf>24.12. " + "Please enable query-planning, or downgrade to dask_cudf<=24.12" + ) def read_csv(*args, **kwargs): @@ -36,46 +38,18 @@ def read_parquet(*args, **kwargs): return dd.read_parquet(*args, **kwargs) -def _deprecated_api(old_api, new_api=None, rec=None): - def inner_func(*args, **kwargs): - if new_api: - # Use alternative - msg = f"{old_api} is now deprecated. " - msg += rec or f"Please use {new_api} instead." - warnings.warn(msg, FutureWarning) - new_attr = new_api.split(".") - module = import_module(".".join(new_attr[:-1])) - return getattr(module, new_attr[-1])(*args, **kwargs) - - # No alternative - raise an error - raise NotImplementedError( - f"{old_api} is no longer supported. " + (rec or "") - ) - - return inner_func - - -if QUERY_PLANNING_ON: - from . import io - from ._expr.expr import _patch_dask_expr - - groupby_agg = _deprecated_api("dask_cudf.groupby_agg") - read_text = DataFrame.read_text - _patch_dask_expr() - -else: - from . 
import io # noqa: F401 - from ._legacy.groupby import groupby_agg # noqa: F401 - from ._legacy.io import read_text # noqa: F401 - - +groupby_agg = _deprecated_api("dask_cudf.groupby_agg") +read_text = DataFrame.read_text to_orc = _deprecated_api( "dask_cudf.to_orc", - new_api="dask_cudf._legacy.io.to_orc", + new_api="dask_cudf.io.to_orc", rec="Please use DataFrame.to_orc instead.", ) +_patch_dask_expr() + + __all__ = [ "DataFrame", "Index", diff --git a/python/dask_cudf/dask_cudf/_expr/collection.py b/python/dask_cudf/dask_cudf/_expr/collection.py index 5192e6b8171..e8c9a970b7b 100644 --- a/python/dask_cudf/dask_cudf/_expr/collection.py +++ b/python/dask_cudf/dask_cudf/_expr/collection.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. import warnings from functools import cached_property @@ -15,19 +15,11 @@ from dask import config from dask.dataframe.core import is_dataframe_like +from dask.dataframe.dispatch import get_parallel_type from dask.typing import no_default import cudf -_LEGACY_WORKAROUND = ( - "To enable the 'legacy' dask-cudf API, set the " - "global 'dataframe.query-planning' config to " - "`False` before dask is imported. This can also " - "be done by setting an environment variable: " - "`DASK_DATAFRAME__QUERY_PLANNING=False` " -) - - ## ## Custom collection classes ## @@ -103,9 +95,8 @@ def set_index( divisions = None warnings.warn( "Ignoring divisions='quantile'. This option is now " - "deprecated. Please use the legacy API and raise an " - "issue on github if this feature is necessary." - f"\n{_LEGACY_WORKAROUND}", + "deprecated. Please raise an issue on github if this " + "feature is necessary.", FutureWarning, ) @@ -135,9 +126,7 @@ def groupby( if kwargs.pop("as_index") is not True: raise NotImplementedError( - f"{msg} Please reset the index after aggregating, or " - "use the legacy API if `as_index=False` is required.\n" - f"{_LEGACY_WORKAROUND}" + f"{msg} Please reset the index after aggregating." ) else: warnings.warn(msg, FutureWarning) @@ -153,15 +142,15 @@ def groupby( ) def to_orc(self, *args, **kwargs): - from dask_cudf._legacy.io import to_orc + from dask_cudf.io.orc import to_orc as to_orc_impl - return to_orc(self, *args, **kwargs) + return to_orc_impl(self, *args, **kwargs) @staticmethod def read_text(*args, **kwargs): - from dask_cudf._legacy.io.text import read_text as legacy_read_text + from dask_cudf.io.text import read_text as read_text_impl - return legacy_read_text(*args, **kwargs) + return read_text_impl(*args, **kwargs) def clip(self, lower=None, upper=None, axis=1): if axis not in (None, 1): @@ -197,6 +186,13 @@ class Index(DXIndex, CudfFrameBase): pass # Same as pandas (for now) +# dask.dataframe dispatch +get_parallel_type.register(cudf.DataFrame, lambda _: DataFrame) +get_parallel_type.register(cudf.Series, lambda _: Series) +get_parallel_type.register(cudf.BaseIndex, lambda _: Index) + + +# dask_expr dispatch (might go away?) get_collection_type.register(cudf.DataFrame, lambda _: DataFrame) get_collection_type.register(cudf.Series, lambda _: Series) get_collection_type.register(cudf.BaseIndex, lambda _: Index) diff --git a/python/dask_cudf/dask_cudf/_expr/expr.py b/python/dask_cudf/dask_cudf/_expr/expr.py index 8b91e53604c..03d1da0d258 100644 --- a/python/dask_cudf/dask_cudf/_expr/expr.py +++ b/python/dask_cudf/dask_cudf/_expr/expr.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
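# Effect of the import-time guard added to dask_cudf/__init__.py above
# (sketch; assumes a dask version that still honors the legacy config flag):
#
#     $ DASK_DATAFRAME__QUERY_PLANNING=False python -c "import dask_cudf"
#     ValueError: The legacy DataFrame API is not supported in
#     dask_cudf>24.12. Please enable query-planning, or downgrade to
#     dask_cudf<=24.12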
import functools import dask_expr._shuffle as _shuffle_module @@ -7,13 +7,13 @@ from dask_expr._expr import Elemwise, Expr, RenameAxis, VarColumns from dask_expr._reductions import Reduction, Var -from dask.dataframe.core import ( - is_dataframe_like, +from dask.dataframe.dispatch import ( + is_categorical_dtype, make_meta, meta_nonempty, ) -from dask.dataframe.dispatch import is_categorical_dtype from dask.typing import no_default +from dask.utils import is_dataframe_like import cudf diff --git a/python/dask_cudf/dask_cudf/_expr/groupby.py b/python/dask_cudf/dask_cudf/_expr/groupby.py index 0242fac6e72..a5cdd43169b 100644 --- a/python/dask_cudf/dask_cudf/_expr/groupby.py +++ b/python/dask_cudf/dask_cudf/_expr/groupby.py @@ -1,6 +1,7 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2025, NVIDIA CORPORATION. import functools +import numpy as np import pandas as pd from dask_expr._collection import new_collection from dask_expr._groupby import ( @@ -16,11 +17,262 @@ from dask.dataframe.groupby import Aggregation from cudf.core.groupby.groupby import _deprecate_collect +from cudf.utils.performance_tracking import _dask_cudf_performance_tracking ## ## Fused groupby aggregations ## +OPTIMIZED_AGGS = ( + "count", + "mean", + "std", + "var", + "sum", + "min", + "max", + list, + "first", + "last", +) + + +def _make_name(col_name, sep="_"): + """Combine the elements of `col_name` into a single string, or return + `col_name` unchanged if it is already a string. + """ + if isinstance(col_name, str): + return col_name + return sep.join(name for name in col_name if name != "") + + +@_dask_cudf_performance_tracking +def _groupby_partition_agg(df, gb_cols, aggs, columns, dropna, sort, sep): + """Initial partition-level aggregation task. + + This is the first operation to be executed on each input + partition in `groupby_agg`. Higher-order aggregations ("mean", + "std" and "var") are decomposed here into the primitive + statistics they are built from: "count" and "sum", plus a + sum-of-squares column when "std" or "var" is requested; all + other supported aggregations are applied directly. The partial + results are combined across partitions in `_tree_node_agg` and + finalized in `_finalize_gb_agg`. + """ + + # Modify dict for initial (partition-wise) aggregations + _agg_dict = {} + for col, agg_list in aggs.items(): + _agg_dict[col] = set() + for agg in agg_list: + if agg in ("mean", "std", "var"): + _agg_dict[col].add("count") + _agg_dict[col].add("sum") + else: + _agg_dict[col].add(agg) + _agg_dict[col] = list(_agg_dict[col]) + if set(agg_list).intersection({"std", "var"}): + pow2_name = _make_name((col, "pow2"), sep=sep) + df[pow2_name] = df[col].astype("float64").pow(2) + _agg_dict[pow2_name] = ["sum"] + + gb = df.groupby(gb_cols, dropna=dropna, as_index=False, sort=sort).agg( + _agg_dict + ) + output_columns = [_make_name(name, sep=sep) for name in gb.columns] + gb.columns = output_columns + # Return with deterministic column ordering + return gb[sorted(output_columns)] + + +@_dask_cudf_performance_tracking +def _tree_node_agg(df, gb_cols, dropna, sort, sep): + """Node in groupby-aggregation reduction tree. + + The input DataFrame (`df`) corresponds to the + concatenated output of one or more `_groupby_partition_agg` + tasks. In this function, "sum", "min" and/or "max" groupby + aggregations will be used to combine the statistics for + duplicate keys.
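For example (an illustration of the step above): a partition-level "mean" for a column ``x`` arrives here as ``x<sep>count`` and ``x<sep>sum`` columns; both are re-aggregated with "sum", and ``_finalize_gb_agg`` later divides the two to recover the mean.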
+ """ + + agg_dict = {} + for col in df.columns: + if col in gb_cols: + continue + agg = col.split(sep)[-1] + if agg in ("count", "sum"): + agg_dict[col] = ["sum"] + elif agg == "list": + agg_dict[col] = [list] + elif agg in OPTIMIZED_AGGS: + agg_dict[col] = [agg] + else: + raise ValueError(f"Unexpected aggregation: {agg}") + + gb = df.groupby(gb_cols, dropna=dropna, as_index=False, sort=sort).agg( + agg_dict + ) + + # Don't include the last aggregation in the column names + output_columns = [ + _make_name(name[:-1] if isinstance(name, tuple) else name, sep=sep) + for name in gb.columns + ] + gb.columns = output_columns + # Return with deterministic column ordering + return gb[sorted(output_columns)] + + +@_dask_cudf_performance_tracking +def _var_agg(df, col, count_name, sum_name, pow2_sum_name, ddof=1): + """Calculate variance (given count, sum, and sum-squared columns).""" + + # Select count, sum, and sum-squared + n = df[count_name] + x = df[sum_name] + x2 = df[pow2_sum_name] + + # Use sum-squared approach to get variance + var = x2 - x**2 / n + div = n - ddof + div[div < 1] = 1 # Avoid division by 0 + var /= div + + # Set appropriate NaN elements + # (since we avoided 0-division) + var[(n - ddof) == 0] = np.nan + + return var + + +@_dask_cudf_performance_tracking +def _finalize_gb_agg( + gb_in, + gb_cols, + aggs, + columns, + final_columns, + as_index, + dropna, + sort, + sep, + str_cols_out, + aggs_renames, +): + """Final aggregation task. + + This is the final operation on each output partitions + of the `groupby_agg` algorithm. This function must + take care of higher-order aggregations, like "mean", + "std" and "var". We also need to deal with the column + index, the row index, and final sorting behavior. + """ + + gb = _tree_node_agg(gb_in, gb_cols, dropna, sort, sep) + + # Deal with higher-order aggregations + for col in columns: + agg_list = aggs.get(col, []) + agg_set = set(agg_list) + if agg_set.intersection({"mean", "std", "var"}): + count_name = _make_name((col, "count"), sep=sep) + sum_name = _make_name((col, "sum"), sep=sep) + if agg_set.intersection({"std", "var"}): + pow2_sum_name = _make_name((col, "pow2", "sum"), sep=sep) + var = _var_agg(gb, col, count_name, sum_name, pow2_sum_name) + if "var" in agg_list: + name_var = _make_name((col, "var"), sep=sep) + gb[name_var] = var + if "std" in agg_list: + name_std = _make_name((col, "std"), sep=sep) + gb[name_std] = np.sqrt(var) + gb.drop(columns=[pow2_sum_name], inplace=True) + if "mean" in agg_list: + mean_name = _make_name((col, "mean"), sep=sep) + gb[mean_name] = gb[sum_name] / gb[count_name] + if "sum" not in agg_list: + gb.drop(columns=[sum_name], inplace=True) + if "count" not in agg_list: + gb.drop(columns=[count_name], inplace=True) + if list in agg_list: + collect_name = _make_name((col, "list"), sep=sep) + gb[collect_name] = gb[collect_name].list.concat() + + # Ensure sorted keys if `sort=True` + if sort: + gb = gb.sort_values(gb_cols) + + # Set index if necessary + if as_index: + gb.set_index(gb_cols, inplace=True) + + # Unflatten column names + col_array = [] + agg_array = [] + for col in gb.columns: + if col in gb_cols: + col_array.append(col) + agg_array.append("") + else: + name, agg = col.split(sep) + col_array.append(name) + agg_array.append(aggs_renames.get((name, agg), agg)) + if str_cols_out: + gb.columns = col_array + else: + gb.columns = pd.MultiIndex.from_arrays([col_array, agg_array]) + + return gb[final_columns] + + +@_dask_cudf_performance_tracking +def _redirect_aggs(arg): + """Redirect 
aggregations to their corresponding name in cuDF""" + redirects = { + sum: "sum", + max: "max", + min: "min", + "collect": list, + "list": list, + } + if isinstance(arg, dict): + new_arg = dict() + for col in arg: + if isinstance(arg[col], list): + new_arg[col] = [redirects.get(agg, agg) for agg in arg[col]] + elif isinstance(arg[col], dict): + new_arg[col] = { + k: redirects.get(v, v) for k, v in arg[col].items() + } + else: + new_arg[col] = redirects.get(arg[col], arg[col]) + return new_arg + if isinstance(arg, list): + return [redirects.get(agg, agg) for agg in arg] + return redirects.get(arg, arg) + + +@_dask_cudf_performance_tracking +def _aggs_optimized(arg, supported: set): + """Check that aggregations in `arg` are a subset of `supported`""" + if isinstance(arg, (list, dict)): + if isinstance(arg, dict): + _global_set: set[str] = set() + for col in arg: + if isinstance(arg[col], list): + _global_set = _global_set.union(set(arg[col])) + elif isinstance(arg[col], dict): + _global_set = _global_set.union(set(arg[col].values())) + else: + _global_set.add(arg[col]) + else: + _global_set = set(arg) + + return bool(_global_set.issubset(supported)) + elif isinstance(arg, (str, type)): + return arg in supported + return False + def _get_spec_info(gb): if isinstance(gb.arg, (dict, list)): @@ -105,20 +357,14 @@ def shuffle_by_index(self): @classmethod def chunk(cls, df, *by, **kwargs): - from dask_cudf._legacy.groupby import _groupby_partition_agg - return _groupby_partition_agg(df, **kwargs) @classmethod def combine(cls, inputs, **kwargs): - from dask_cudf._legacy.groupby import _tree_node_agg - return _tree_node_agg(_concat(inputs), **kwargs) @classmethod def aggregate(cls, inputs, **kwargs): - from dask_cudf._legacy.groupby import _finalize_gb_agg - return _finalize_gb_agg(_concat(inputs), **kwargs) @property @@ -193,12 +439,6 @@ def _maybe_get_custom_expr( shuffle_method=None, **kwargs, ): - from dask_cudf._legacy.groupby import ( - OPTIMIZED_AGGS, - _aggs_optimized, - _redirect_aggs, - ) - if kwargs: # Unsupported key-word arguments return None diff --git a/python/dask_cudf/dask_cudf/_legacy/core.py b/python/dask_cudf/dask_cudf/_legacy/core.py deleted file mode 100644 index d6beb775a5e..00000000000 --- a/python/dask_cudf/dask_cudf/_legacy/core.py +++ /dev/null @@ -1,711 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
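# Behavior sketch for the two helpers added to _expr/groupby.py above,
# traced directly from the code (illustrative):
#
#     _redirect_aggs(sum)                         # -> "sum"
#     _redirect_aggs({"x": ["collect", "mean"]})  # -> {"x": [list, "mean"]}
#     _aggs_optimized({"x": ["mean", "max"]}, set(OPTIMIZED_AGGS))  # -> True
#     _aggs_optimized("median", set(OPTIMIZED_AGGS))                # -> False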
- -import math -import warnings - -import numpy as np -import pandas as pd -from tlz import partition_all - -from dask import dataframe as dd -from dask.base import normalize_token, tokenize -from dask.dataframe.core import ( - Scalar, - handle_out, - make_meta as dask_make_meta, - map_partitions, -) -from dask.dataframe.utils import raise_on_meta_error -from dask.highlevelgraph import HighLevelGraph -from dask.utils import M, OperatorMethodMixin, apply, derived_from, funcname - -import cudf -from cudf import _lib as libcudf -from cudf.utils.performance_tracking import _dask_cudf_performance_tracking - -from dask_cudf._expr.accessors import ListMethods, StructMethods -from dask_cudf._legacy import sorting -from dask_cudf._legacy.sorting import ( - _deprecate_shuffle_kwarg, - _get_shuffle_method, -) - - -class _Frame(dd.core._Frame, OperatorMethodMixin): - """Superclass for DataFrame and Series - - Parameters - ---------- - dsk : dict - The dask graph to compute this DataFrame - name : str - The key prefix that specifies which keys in the dask comprise this - particular DataFrame / Series - meta : cudf.DataFrame, cudf.Series, or cudf.Index - An empty cudf object with names, dtypes, and indices matching the - expected output. - divisions : tuple of index values - Values along which we partition our blocks on the index - """ - - def _is_partition_type(self, meta): - return isinstance(meta, self._partition_type) - - def __repr__(self): - s = "" - return s % (type(self).__name__, len(self.dask), self.npartitions) - - -normalize_token.register(_Frame, lambda a: a._name) - - -class DataFrame(_Frame, dd.core.DataFrame): - """ - A distributed Dask DataFrame where the backing dataframe is a - :class:`cuDF DataFrame `. - - Typically you would not construct this object directly, but rather - use one of Dask-cuDF's IO routines. - - Most operations on :doc:`Dask DataFrames ` are - supported, with many of the same caveats. 
- - """ - - _partition_type = cudf.DataFrame - - @_dask_cudf_performance_tracking - def _assign_column(self, k, v): - def assigner(df, k, v): - out = df.copy() - out[k] = v - return out - - meta = assigner(self._meta, k, dask_make_meta(v)) - return self.map_partitions(assigner, k, v, meta=meta) - - @_dask_cudf_performance_tracking - def apply_rows(self, func, incols, outcols, kwargs=None, cache_key=None): - import uuid - - if kwargs is None: - kwargs = {} - - if cache_key is None: - cache_key = uuid.uuid4() - - def do_apply_rows(df, func, incols, outcols, kwargs): - return df.apply_rows( - func, incols, outcols, kwargs, cache_key=cache_key - ) - - meta = do_apply_rows(self._meta, func, incols, outcols, kwargs) - return self.map_partitions( - do_apply_rows, func, incols, outcols, kwargs, meta=meta - ) - - @_deprecate_shuffle_kwarg - @_dask_cudf_performance_tracking - def merge(self, other, shuffle_method=None, **kwargs): - on = kwargs.pop("on", None) - if isinstance(on, tuple): - on = list(on) - return super().merge( - other, - on=on, - shuffle_method=_get_shuffle_method(shuffle_method), - **kwargs, - ) - - @_deprecate_shuffle_kwarg - @_dask_cudf_performance_tracking - def join(self, other, shuffle_method=None, **kwargs): - # CuDF doesn't support "right" join yet - how = kwargs.pop("how", "left") - if how == "right": - return other.join(other=self, how="left", **kwargs) - - on = kwargs.pop("on", None) - if isinstance(on, tuple): - on = list(on) - return super().join( - other, - how=how, - on=on, - shuffle_method=_get_shuffle_method(shuffle_method), - **kwargs, - ) - - @_deprecate_shuffle_kwarg - @_dask_cudf_performance_tracking - def set_index( - self, - other, - sorted=False, - divisions=None, - shuffle_method=None, - **kwargs, - ): - pre_sorted = sorted - del sorted - - if divisions == "quantile": - warnings.warn( - "Using divisions='quantile' is now deprecated. 
" - "Please raise an issue on github if you believe " - "this feature is necessary.", - FutureWarning, - ) - - if ( - divisions == "quantile" - or isinstance(divisions, (cudf.DataFrame, cudf.Series)) - or ( - isinstance(other, str) - and cudf.api.types.is_string_dtype(self[other].dtype) - ) - ): - # Let upstream-dask handle "pre-sorted" case - if pre_sorted: - return dd.shuffle.set_sorted_index( - self, other, divisions=divisions, **kwargs - ) - - by = other - if not isinstance(other, list): - by = [by] - if len(by) > 1: - raise ValueError("Dask does not support MultiIndex (yet).") - if divisions == "quantile": - divisions = None - - # Use dask_cudf's sort_values - df = self.sort_values( - by, - max_branch=kwargs.get("max_branch", None), - divisions=divisions, - set_divisions=True, - ignore_index=True, - shuffle_method=shuffle_method, - ) - - # Ignore divisions if its a dataframe - if isinstance(divisions, cudf.DataFrame): - divisions = None - - # Set index and repartition - df2 = df.map_partitions( - sorting.set_index_post, - index_name=other, - drop=kwargs.get("drop", True), - column_dtype=df.columns.dtype, - ) - npartitions = kwargs.get("npartitions", self.npartitions) - partition_size = kwargs.get("partition_size", None) - if partition_size: - return df2.repartition(partition_size=partition_size) - if not divisions and df2.npartitions != npartitions: - return df2.repartition(npartitions=npartitions) - if divisions and df2.npartitions != len(divisions) - 1: - return df2.repartition(divisions=divisions) - return df2 - - return super().set_index( - other, - sorted=pre_sorted, - shuffle_method=_get_shuffle_method(shuffle_method), - divisions=divisions, - **kwargs, - ) - - @_deprecate_shuffle_kwarg - @_dask_cudf_performance_tracking - def sort_values( - self, - by, - ignore_index=False, - max_branch=None, - divisions=None, - set_divisions=False, - ascending=True, - na_position="last", - sort_function=None, - sort_function_kwargs=None, - shuffle_method=None, - **kwargs, - ): - if kwargs: - raise ValueError( - f"Unsupported input arguments passed : {list(kwargs.keys())}" - ) - - df = sorting.sort_values( - self, - by, - max_branch=max_branch, - divisions=divisions, - set_divisions=set_divisions, - ignore_index=ignore_index, - ascending=ascending, - na_position=na_position, - shuffle_method=shuffle_method, - sort_function=sort_function, - sort_function_kwargs=sort_function_kwargs, - ) - - if ignore_index: - return df.reset_index(drop=True) - return df - - @_dask_cudf_performance_tracking - def to_parquet(self, path, *args, **kwargs): - """Calls dask.dataframe.io.to_parquet with CudfEngine backend""" - from dask_cudf._legacy.io import to_parquet - - return to_parquet(self, path, *args, **kwargs) - - @_dask_cudf_performance_tracking - def to_orc(self, path, **kwargs): - """Calls dask_cudf._legacy.io.to_orc""" - from dask_cudf._legacy.io import to_orc - - return to_orc(self, path, **kwargs) - - @derived_from(pd.DataFrame) - @_dask_cudf_performance_tracking - def var( - self, - axis=None, - skipna=True, - ddof=1, - split_every=False, - dtype=None, - out=None, - naive=False, - numeric_only=False, - ): - axis = self._validate_axis(axis) - meta = self._meta_nonempty.var( - axis=axis, skipna=skipna, numeric_only=numeric_only - ) - if axis == 1: - result = map_partitions( - M.var, - self, - meta=meta, - token=self._token_prefix + "var", - axis=axis, - skipna=skipna, - ddof=ddof, - numeric_only=numeric_only, - ) - return handle_out(out, result) - elif naive: - return _naive_var(self, meta, skipna, ddof, 
split_every, out) - else: - return _parallel_var(self, meta, skipna, split_every, out) - - @_deprecate_shuffle_kwarg - @_dask_cudf_performance_tracking - def shuffle(self, *args, shuffle_method=None, **kwargs): - """Wraps dask.dataframe DataFrame.shuffle method""" - return super().shuffle( - *args, shuffle_method=_get_shuffle_method(shuffle_method), **kwargs - ) - - @_dask_cudf_performance_tracking - def groupby(self, by=None, **kwargs): - from .groupby import CudfDataFrameGroupBy - - return CudfDataFrameGroupBy(self, by=by, **kwargs) - - -@_dask_cudf_performance_tracking -def sum_of_squares(x): - x = x.astype("f8")._column - outcol = libcudf.reduce.reduce("sum_of_squares", x) - return cudf.Series._from_column(outcol) - - -@_dask_cudf_performance_tracking -def var_aggregate(x2, x, n, ddof): - try: - with warnings.catch_warnings(record=True): - warnings.simplefilter("always") - result = (x2 / n) - (x / n) ** 2 - if ddof != 0: - result = result * n / (n - ddof) - return result - except ZeroDivisionError: - return np.float64(np.nan) - - -@_dask_cudf_performance_tracking -def nlargest_agg(x, **kwargs): - return cudf.concat(x).nlargest(**kwargs) - - -@_dask_cudf_performance_tracking -def nsmallest_agg(x, **kwargs): - return cudf.concat(x).nsmallest(**kwargs) - - -class Series(_Frame, dd.core.Series): - _partition_type = cudf.Series - - @_dask_cudf_performance_tracking - def count(self, split_every=False): - return reduction( - [self], - chunk=M.count, - aggregate=np.sum, - split_every=split_every, - meta="i8", - ) - - @_dask_cudf_performance_tracking - def mean(self, split_every=False): - sum = self.sum(split_every=split_every) - n = self.count(split_every=split_every) - return sum / n - - @derived_from(pd.DataFrame) - @_dask_cudf_performance_tracking - def var( - self, - axis=None, - skipna=True, - ddof=1, - split_every=False, - dtype=None, - out=None, - naive=False, - ): - axis = self._validate_axis(axis) - meta = self._meta_nonempty.var(axis=axis, skipna=skipna) - if axis == 1: - result = map_partitions( - M.var, - self, - meta=meta, - token=self._token_prefix + "var", - axis=axis, - skipna=skipna, - ddof=ddof, - ) - return handle_out(out, result) - elif naive: - return _naive_var(self, meta, skipna, ddof, split_every, out) - else: - return _parallel_var(self, meta, skipna, split_every, out) - - @_dask_cudf_performance_tracking - def groupby(self, *args, **kwargs): - from .groupby import CudfSeriesGroupBy - - return CudfSeriesGroupBy(self, *args, **kwargs) - - @property # type: ignore - @_dask_cudf_performance_tracking - def list(self): - return ListMethods(self) - - @property # type: ignore - @_dask_cudf_performance_tracking - def struct(self): - return StructMethods(self) - - -class Index(Series, dd.core.Index): - _partition_type = cudf.Index # type: ignore - - -@_dask_cudf_performance_tracking -def _naive_var(ddf, meta, skipna, ddof, split_every, out): - num = ddf._get_numeric_data() - x = 1.0 * num.sum(skipna=skipna, split_every=split_every) - x2 = 1.0 * (num**2).sum(skipna=skipna, split_every=split_every) - n = num.count(split_every=split_every) - name = ddf._token_prefix + "var" - result = map_partitions( - var_aggregate, x2, x, n, token=name, meta=meta, ddof=ddof - ) - if isinstance(ddf, DataFrame): - result.divisions = (min(ddf.columns), max(ddf.columns)) - return handle_out(out, result) - - -@_dask_cudf_performance_tracking -def _parallel_var(ddf, meta, skipna, split_every, out): - def _local_var(x, skipna): - if skipna: - n = x.count() - avg = x.mean(skipna=skipna) - else: - # Not 
skipping nulls, so might as well - # avoid the full `count` operation - n = len(x) - avg = x.sum(skipna=skipna) / n - m2 = ((x - avg) ** 2).sum(skipna=skipna) - return n, avg, m2 - - def _aggregate_var(parts): - n, avg, m2 = parts[0] - for i in range(1, len(parts)): - n_a, avg_a, m2_a = n, avg, m2 - n_b, avg_b, m2_b = parts[i] - n = n_a + n_b - avg = (n_a * avg_a + n_b * avg_b) / n - delta = avg_b - avg_a - m2 = m2_a + m2_b + delta**2 * n_a * n_b / n - return n, avg, m2 - - def _finalize_var(vals): - n, _, m2 = vals - return m2 / (n - 1) - - # Build graph - nparts = ddf.npartitions - if not split_every: - split_every = nparts - name = "var-" + tokenize(skipna, split_every, out) - local_name = "local-" + name - num = ddf._get_numeric_data() - dsk = { - (local_name, n, 0): (_local_var, (num._name, n), skipna) - for n in range(nparts) - } - - # Use reduction tree - widths = [nparts] - while nparts > 1: - nparts = math.ceil(nparts / split_every) - widths.append(nparts) - height = len(widths) - for depth in range(1, height): - for group in range(widths[depth]): - p_max = widths[depth - 1] - lstart = split_every * group - lstop = min(lstart + split_every, p_max) - node_list = [ - (local_name, p, depth - 1) for p in range(lstart, lstop) - ] - dsk[(local_name, group, depth)] = (_aggregate_var, node_list) - if height == 1: - group = depth = 0 - dsk[(name, 0)] = (_finalize_var, (local_name, group, depth)) - - graph = HighLevelGraph.from_collections(name, dsk, dependencies=[num, ddf]) - result = dd.core.new_dd_object(graph, name, meta, (None, None)) - if isinstance(ddf, DataFrame): - result.divisions = (min(ddf.columns), max(ddf.columns)) - return handle_out(out, result) - - -@_dask_cudf_performance_tracking -def _extract_meta(x): - """ - Extract internal cache data (``_meta``) from dask_cudf objects - """ - if isinstance(x, (Scalar, _Frame)): - return x._meta - elif isinstance(x, list): - return [_extract_meta(_x) for _x in x] - elif isinstance(x, tuple): - return tuple(_extract_meta(_x) for _x in x) - elif isinstance(x, dict): - return {k: _extract_meta(v) for k, v in x.items()} - return x - - -@_dask_cudf_performance_tracking -def _emulate(func, *args, **kwargs): - """ - Apply a function using args / kwargs. If arguments contain dd.DataFrame / - dd.Series, using internal cache (``_meta``) for calculation - """ - with raise_on_meta_error(funcname(func)): - return func(*_extract_meta(args), **_extract_meta(kwargs)) - - -@_dask_cudf_performance_tracking -def align_partitions(args): - """Align partitions between dask_cudf objects. - - Note that if all divisions are unknown, but have equal npartitions, then - they will be passed through unchanged. - """ - dfs = [df for df in args if isinstance(df, _Frame)] - if not dfs: - return args - - divisions = dfs[0].divisions - if not all(df.divisions == divisions for df in dfs): - raise NotImplementedError("Aligning mismatched partitions") - return args - - -@_dask_cudf_performance_tracking -def reduction( - args, - chunk=None, - aggregate=None, - combine=None, - meta=None, - token=None, - chunk_kwargs=None, - aggregate_kwargs=None, - combine_kwargs=None, - split_every=None, - **kwargs, -): - """Generic tree reduction operation. - - Parameters - ---------- - args : - Positional arguments for the `chunk` function. All `dask.dataframe` - objects should be partitioned and indexed equivalently. 
- chunk : function [block-per-arg] -> block - Function to operate on each block of data - aggregate : function list-of-blocks -> block - Function to operate on the list of results of chunk - combine : function list-of-blocks -> block, optional - Function to operate on intermediate lists of results of chunk - in a tree-reduction. If not provided, defaults to aggregate. - $META - token : str, optional - The name to use for the output keys. - chunk_kwargs : dict, optional - Keywords for the chunk function only. - aggregate_kwargs : dict, optional - Keywords for the aggregate function only. - combine_kwargs : dict, optional - Keywords for the combine function only. - split_every : int, optional - Group partitions into groups of this size while performing a - tree-reduction. If set to False, no tree-reduction will be used, - and all intermediates will be concatenated and passed to ``aggregate``. - Default is 8. - kwargs : - All remaining keywords will be passed to ``chunk``, ``aggregate``, and - ``combine``. - """ - if chunk_kwargs is None: - chunk_kwargs = dict() - if aggregate_kwargs is None: - aggregate_kwargs = dict() - chunk_kwargs.update(kwargs) - aggregate_kwargs.update(kwargs) - - if combine is None: - if combine_kwargs: - raise ValueError("`combine_kwargs` provided with no `combine`") - combine = aggregate - combine_kwargs = aggregate_kwargs - else: - if combine_kwargs is None: - combine_kwargs = dict() - combine_kwargs.update(kwargs) - - if not isinstance(args, (tuple, list)): - args = [args] - - npartitions = {arg.npartitions for arg in args if isinstance(arg, _Frame)} - if len(npartitions) > 1: - raise ValueError("All arguments must have same number of partitions") - npartitions = npartitions.pop() - - if split_every is None: - split_every = 8 - elif split_every is False: - split_every = npartitions - elif split_every < 2 or not isinstance(split_every, int): - raise ValueError("split_every must be an integer >= 2") - - token_key = tokenize( - token or (chunk, aggregate), - meta, - args, - chunk_kwargs, - aggregate_kwargs, - combine_kwargs, - split_every, - ) - - # Chunk - a = f"{token or funcname(chunk)}-chunk-{token_key}" - if len(args) == 1 and isinstance(args[0], _Frame) and not chunk_kwargs: - dsk = { - (a, 0, i): (chunk, key) - for i, key in enumerate(args[0].__dask_keys__()) - } - else: - dsk = { - (a, 0, i): ( - apply, - chunk, - [(x._name, i) if isinstance(x, _Frame) else x for x in args], - chunk_kwargs, - ) - for i in range(args[0].npartitions) - } - - # Combine - b = f"{token or funcname(combine)}-combine-{token_key}" - k = npartitions - depth = 0 - while k > split_every: - for part_i, inds in enumerate(partition_all(split_every, range(k))): - conc = (list, [(a, depth, i) for i in inds]) - dsk[(b, depth + 1, part_i)] = ( - (apply, combine, [conc], combine_kwargs) - if combine_kwargs - else (combine, conc) - ) - k = part_i + 1 - a = b - depth += 1 - - # Aggregate - b = f"{token or funcname(aggregate)}-agg-{token_key}" - conc = (list, [(a, depth, i) for i in range(k)]) - if aggregate_kwargs: - dsk[(b, 0)] = (apply, aggregate, [conc], aggregate_kwargs) - else: - dsk[(b, 0)] = (aggregate, conc) - - if meta is None: - meta_chunk = _emulate(apply, chunk, args, chunk_kwargs) - meta = _emulate(apply, aggregate, [[meta_chunk]], aggregate_kwargs) - meta = dask_make_meta(meta) - - graph = HighLevelGraph.from_collections(b, dsk, dependencies=args) - return dd.core.new_dd_object(graph, b, meta, (None, None)) - - -for name in ( - "add", - "sub", - "mul", - "truediv", - "floordiv", - 
"mod", - "pow", - "radd", - "rsub", - "rmul", - "rtruediv", - "rfloordiv", - "rmod", - "rpow", -): - meth = getattr(cudf.DataFrame, name) - DataFrame._bind_operator_method(name, meth, original=cudf.Series) - - meth = getattr(cudf.Series, name) - Series._bind_operator_method(name, meth, original=cudf.Series) - -for name in ("lt", "gt", "le", "ge", "ne", "eq"): - meth = getattr(cudf.Series, name) - Series._bind_comparison_method(name, meth, original=cudf.Series) diff --git a/python/dask_cudf/dask_cudf/_legacy/groupby.py b/python/dask_cudf/dask_cudf/_legacy/groupby.py deleted file mode 100644 index 7e01e91476d..00000000000 --- a/python/dask_cudf/dask_cudf/_legacy/groupby.py +++ /dev/null @@ -1,909 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. -from __future__ import annotations - -from functools import wraps - -import numpy as np -import pandas as pd - -from dask.dataframe.core import ( - DataFrame as DaskDataFrame, - aca, - split_out_on_cols, -) -from dask.dataframe.groupby import DataFrameGroupBy, SeriesGroupBy -from dask.utils import funcname - -import cudf -from cudf.core.groupby.groupby import _deprecate_collect -from cudf.utils.performance_tracking import _dask_cudf_performance_tracking - -from dask_cudf._legacy.sorting import _deprecate_shuffle_kwarg - -# aggregations that are dask-cudf optimized -OPTIMIZED_AGGS = ( - "count", - "mean", - "std", - "var", - "sum", - "min", - "max", - list, - "first", - "last", -) - - -def _check_groupby_optimized(func): - """ - Decorator for dask-cudf's groupby methods that returns the dask-cudf - optimized method if the groupby object is supported, otherwise - reverting to the upstream Dask method - """ - - @wraps(func) - def wrapper(*args, **kwargs): - gb = args[0] - if _groupby_optimized(gb): - return func(*args, **kwargs) - # note that we use upstream Dask's default kwargs for this call if - # none are specified; this shouldn't be an issue as those defaults are - # consistent with dask-cudf - return getattr(super(type(gb), gb), func.__name__)(*args[1:], **kwargs) - - return wrapper - - -class CudfDataFrameGroupBy(DataFrameGroupBy): - @_dask_cudf_performance_tracking - def __init__(self, *args, sort=None, **kwargs): - self.sep = kwargs.pop("sep", "___") - self.as_index = kwargs.pop("as_index", True) - super().__init__(*args, sort=sort, **kwargs) - - @_dask_cudf_performance_tracking - def __getitem__(self, key): - if isinstance(key, list): - g = CudfDataFrameGroupBy( - self.obj, - by=self.by, - slice=key, - sort=self.sort, - **self.dropna, - ) - else: - g = CudfSeriesGroupBy( - self.obj, - by=self.by, - slice=key, - sort=self.sort, - **self.dropna, - ) - - g._meta = g._meta[key] - return g - - @_dask_cudf_performance_tracking - def _make_groupby_method_aggs(self, agg_name): - """Create aggs dictionary for aggregation methods""" - - if isinstance(self.by, list): - return {c: agg_name for c in self.obj.columns if c not in self.by} - return {c: agg_name for c in self.obj.columns if c != self.by} - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def count(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("count"), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def mean(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("mean"), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def std(self, split_every=None, 
split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("std"), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def var(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("var"), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def sum(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("sum"), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def min(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("min"), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def max(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("max"), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def collect(self, split_every=None, split_out=1): - _deprecate_collect() - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs(list), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def first(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("first"), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def last(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("last"), - split_every, - split_out, - ) - - @_deprecate_shuffle_kwarg - @_dask_cudf_performance_tracking - def aggregate( - self, arg, split_every=None, split_out=1, shuffle_method=None - ): - if arg == "size": - return self.size() - - arg = _redirect_aggs(arg) - - if _groupby_optimized(self) and _aggs_optimized(arg, OPTIMIZED_AGGS): - if isinstance(self._meta.grouping.keys, cudf.MultiIndex): - keys = self._meta.grouping.keys.names - else: - keys = self._meta.grouping.keys.name - - return groupby_agg( - self.obj, - keys, - arg, - split_every=split_every, - split_out=split_out, - sep=self.sep, - sort=self.sort, - as_index=self.as_index, - shuffle_method=shuffle_method, - **self.dropna, - ) - - return super().aggregate( - arg, - split_every=split_every, - split_out=split_out, - shuffle_method=shuffle_method, - ) - - -class CudfSeriesGroupBy(SeriesGroupBy): - @_dask_cudf_performance_tracking - def __init__(self, *args, sort=None, **kwargs): - self.sep = kwargs.pop("sep", "___") - self.as_index = kwargs.pop("as_index", True) - super().__init__(*args, sort=sort, **kwargs) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def count(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "count"}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def mean(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "mean"}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def std(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "std"}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def var(self, 
split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "var"}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def sum(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "sum"}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def min(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "min"}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def max(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "max"}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def collect(self, split_every=None, split_out=1): - _deprecate_collect() - return _make_groupby_agg_call( - self, - {self._slice: list}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def first(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "first"}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def last(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "last"}, - split_every, - split_out, - )[self._slice] - - @_deprecate_shuffle_kwarg - @_dask_cudf_performance_tracking - def aggregate( - self, arg, split_every=None, split_out=1, shuffle_method=None - ): - if arg == "size": - return self.size() - - arg = _redirect_aggs(arg) - - if not isinstance(arg, dict): - arg = {self._slice: arg} - - if _groupby_optimized(self) and _aggs_optimized(arg, OPTIMIZED_AGGS): - return _make_groupby_agg_call( - self, arg, split_every, split_out, shuffle_method - )[self._slice] - - return super().aggregate( - arg, - split_every=split_every, - split_out=split_out, - shuffle_method=shuffle_method, - ) - - -def _shuffle_aggregate( - ddf, - gb_cols, - chunk, - chunk_kwargs, - aggregate, - aggregate_kwargs, - split_every, - split_out, - token=None, - sort=None, - shuffle_method=None, -): - # Shuffle-based groupby aggregation - # NOTE: This function is the dask_cudf version of - # dask.dataframe.groupby._shuffle_aggregate - - # Step 1 - Chunkwise groupby operation - chunk_name = f"{token or funcname(chunk)}-chunk" - chunked = ddf.map_partitions( - chunk, - meta=chunk(ddf._meta, **chunk_kwargs), - token=chunk_name, - **chunk_kwargs, - ) - - # Step 2 - Perform global sort or shuffle - shuffle_npartitions = max( - chunked.npartitions // split_every, - split_out, - ) - if sort and split_out > 1: - # Sort-based code path - result = ( - chunked.repartition(npartitions=shuffle_npartitions) - .sort_values( - gb_cols, - ignore_index=True, - shuffle_method=shuffle_method, - ) - .map_partitions( - aggregate, - meta=aggregate(chunked._meta, **aggregate_kwargs), - **aggregate_kwargs, - ) - ) - else: - # Hash-based code path - result = chunked.shuffle( - gb_cols, - npartitions=shuffle_npartitions, - ignore_index=True, - shuffle_method=shuffle_method, - ).map_partitions( - aggregate, - meta=aggregate(chunked._meta, **aggregate_kwargs), - **aggregate_kwargs, - ) - - # Step 3 - Repartition and return - if split_out < result.npartitions: - return result.repartition(npartitions=split_out) - return result - - -@_dask_cudf_performance_tracking -def 
groupby_agg( - ddf, - gb_cols, - aggs_in, - split_every=None, - split_out=None, - dropna=True, - sep="___", - sort=False, - as_index=True, - shuffle_method=None, -): - """Optimized groupby aggregation for Dask-CuDF. - - Parameters - ---------- - ddf : DataFrame - DataFrame object to perform grouping on. - gb_cols : str or list[str] - Column names to group by. - aggs_in : str, list, or dict - Aggregations to perform. - split_every : int (optional) - How to group intermediate aggregates. - dropna : bool - Drop grouping key values corresponding to NA values. - as_index : bool - Currently ignored. - sort : bool - Sort the group keys, better performance is obtained when - not sorting. - shuffle_method : str (optional) - Control how shuffling of the DataFrame is performed. - sep : str - Internal usage. - - - Notes - ----- - This "optimized" approach is more performant than the algorithm in - implemented in :meth:`DataFrame.apply` because it allows the cuDF - backend to perform multiple aggregations at once. - - This aggregation algorithm only supports the following options - - * "list" - * "count" - * "first" - * "last" - * "max" - * "mean" - * "min" - * "std" - * "sum" - * "var" - - - See Also - -------- - DataFrame.groupby : generic groupby of a DataFrame - dask.dataframe.apply_concat_apply : for more description of the - split_every argument. - - """ - # Assert that aggregations are supported - aggs = _redirect_aggs(aggs_in) - if not _aggs_optimized(aggs, OPTIMIZED_AGGS): - raise ValueError( - f"Supported aggs include {OPTIMIZED_AGGS} for groupby_agg API. " - f"Aggregations must be specified with dict or list syntax." - ) - - # If split_every is False, we use an all-to-one reduction - if split_every is False: - split_every = max(ddf.npartitions, 2) - - # Deal with default split_out and split_every params - split_every = split_every or 8 - split_out = split_out or 1 - - # Standardize `gb_cols`, `columns`, and `aggs` - if isinstance(gb_cols, str): - gb_cols = [gb_cols] - columns = [c for c in ddf.columns if c not in gb_cols] - if not isinstance(aggs, dict): - aggs = {col: aggs for col in columns} - - # Assert if our output will have a MultiIndex; this will be the case if - # any value in the `aggs` dict is not a string (i.e. multiple/named - # aggregations per column) - str_cols_out = True - aggs_renames = {} - for col in aggs: - if isinstance(aggs[col], str) or callable(aggs[col]): - aggs[col] = [aggs[col]] - elif isinstance(aggs[col], dict): - str_cols_out = False - col_aggs = [] - for k, v in aggs[col].items(): - aggs_renames[col, v] = k - col_aggs.append(v) - aggs[col] = col_aggs - else: - str_cols_out = False - if col in gb_cols: - columns.append(col) - - # Construct meta - _aggs = aggs.copy() - if str_cols_out: - # Metadata should use `str` for dict values if that is - # what the user originally specified (column names will - # be str, rather than tuples). 
- for col in aggs: - _aggs[col] = _aggs[col][0] - _meta = ddf._meta.groupby(gb_cols, as_index=as_index).agg(_aggs) - if aggs_renames: - col_array = [] - agg_array = [] - for col, agg in _meta.columns: - col_array.append(col) - agg_array.append(aggs_renames.get((col, agg), agg)) - _meta.columns = pd.MultiIndex.from_arrays([col_array, agg_array]) - - chunk = _groupby_partition_agg - chunk_kwargs = { - "gb_cols": gb_cols, - "aggs": aggs, - "columns": columns, - "dropna": dropna, - "sort": sort, - "sep": sep, - } - - combine = _tree_node_agg - combine_kwargs = { - "gb_cols": gb_cols, - "dropna": dropna, - "sort": sort, - "sep": sep, - } - - aggregate = _finalize_gb_agg - aggregate_kwargs = { - "gb_cols": gb_cols, - "aggs": aggs, - "columns": columns, - "final_columns": _meta.columns, - "as_index": as_index, - "dropna": dropna, - "sort": sort, - "sep": sep, - "str_cols_out": str_cols_out, - "aggs_renames": aggs_renames, - } - - # Use shuffle_method=True for split_out>1 - if sort and split_out > 1 and shuffle_method is None: - shuffle_method = "tasks" - - # Check if we are using the shuffle-based algorithm - if shuffle_method: - # Shuffle-based aggregation - return _shuffle_aggregate( - ddf, - gb_cols, - chunk, - chunk_kwargs, - aggregate, - aggregate_kwargs, - split_every, - split_out, - token="cudf-aggregate", - sort=sort, - shuffle_method=shuffle_method - if isinstance(shuffle_method, str) - else None, - ) - - # Deal with sort/shuffle defaults - if split_out > 1 and sort: - raise ValueError( - "dask-cudf's groupby algorithm does not yet support " - "`sort=True` when `split_out>1`, unless a shuffle-based " - "algorithm is used. Please use `split_out=1`, group " - "with `sort=False`, or set `shuffle_method=True`." - ) - - # Determine required columns to enable column projection - required_columns = list( - set(gb_cols).union(aggs.keys()).intersection(ddf.columns) - ) - - return aca( - [ddf[required_columns]], - chunk=chunk, - chunk_kwargs=chunk_kwargs, - combine=combine, - combine_kwargs=combine_kwargs, - aggregate=aggregate, - aggregate_kwargs=aggregate_kwargs, - token="cudf-aggregate", - split_every=split_every, - split_out=split_out, - split_out_setup=split_out_on_cols, - split_out_setup_kwargs={"cols": gb_cols}, - sort=sort, - ignore_index=True, - ) - - -@_dask_cudf_performance_tracking -def _make_groupby_agg_call( - gb, aggs, split_every, split_out, shuffle_method=None -): - """Helper method to consolidate the common `groupby_agg` call for all - aggregations in one place - """ - - return groupby_agg( - gb.obj, - gb.by, - aggs, - split_every=split_every, - split_out=split_out, - sep=gb.sep, - sort=gb.sort, - as_index=gb.as_index, - shuffle_method=shuffle_method, - **gb.dropna, - ) - - -@_dask_cudf_performance_tracking -def _redirect_aggs(arg): - """Redirect aggregations to their corresponding name in cuDF""" - redirects = { - sum: "sum", - max: "max", - min: "min", - "collect": list, - "list": list, - } - if isinstance(arg, dict): - new_arg = dict() - for col in arg: - if isinstance(arg[col], list): - new_arg[col] = [redirects.get(agg, agg) for agg in arg[col]] - elif isinstance(arg[col], dict): - new_arg[col] = { - k: redirects.get(v, v) for k, v in arg[col].items() - } - else: - new_arg[col] = redirects.get(arg[col], arg[col]) - return new_arg - if isinstance(arg, list): - return [redirects.get(agg, agg) for agg in arg] - return redirects.get(arg, arg) - - -@_dask_cudf_performance_tracking -def _aggs_optimized(arg, supported: set): - """Check that aggregations in `arg` are a subset of 
`supported`""" - if isinstance(arg, (list, dict)): - if isinstance(arg, dict): - _global_set: set[str] = set() - for col in arg: - if isinstance(arg[col], list): - _global_set = _global_set.union(set(arg[col])) - elif isinstance(arg[col], dict): - _global_set = _global_set.union(set(arg[col].values())) - else: - _global_set.add(arg[col]) - else: - _global_set = set(arg) - - return bool(_global_set.issubset(supported)) - elif isinstance(arg, (str, type)): - return arg in supported - return False - - -@_dask_cudf_performance_tracking -def _groupby_optimized(gb): - """Check that groupby input can use dask-cudf optimized codepath""" - return isinstance(gb.obj, DaskDataFrame) and ( - isinstance(gb.by, str) - or (isinstance(gb.by, list) and all(isinstance(x, str) for x in gb.by)) - ) - - -def _make_name(col_name, sep="_"): - """Combine elements of `col_name` into a single string, or no-op if - `col_name` is already a string - """ - if isinstance(col_name, str): - return col_name - return sep.join(name for name in col_name if name != "") - - -@_dask_cudf_performance_tracking -def _groupby_partition_agg(df, gb_cols, aggs, columns, dropna, sort, sep): - """Initial partition-level aggregation task. - - This is the first operation to be executed on each input - partition in `groupby_agg`. Depending on `aggs`, four possible - groupby aggregations ("count", "sum", "min", and "max") are - performed. The result is then partitioned (by hashing `gb_cols`) - into a number of distinct dictionary elements. The number of - elements in the output dictionary (`split_out`) corresponds to - the number of partitions in the final output of `groupby_agg`. - """ - - # Modify dict for initial (partition-wise) aggregations - _agg_dict = {} - for col, agg_list in aggs.items(): - _agg_dict[col] = set() - for agg in agg_list: - if agg in ("mean", "std", "var"): - _agg_dict[col].add("count") - _agg_dict[col].add("sum") - else: - _agg_dict[col].add(agg) - _agg_dict[col] = list(_agg_dict[col]) - if set(agg_list).intersection({"std", "var"}): - pow2_name = _make_name((col, "pow2"), sep=sep) - df[pow2_name] = df[col].astype("float64").pow(2) - _agg_dict[pow2_name] = ["sum"] - - gb = df.groupby(gb_cols, dropna=dropna, as_index=False, sort=sort).agg( - _agg_dict - ) - output_columns = [_make_name(name, sep=sep) for name in gb.columns] - gb.columns = output_columns - # Return with deterministic column ordering - return gb[sorted(output_columns)] - - -@_dask_cudf_performance_tracking -def _tree_node_agg(df, gb_cols, dropna, sort, sep): - """Node in groupby-aggregation reduction tree. - - The input DataFrame (`df`) corresponds to the - concatenated output of one or more `_groupby_partition_agg` - tasks. In this function, "sum", "min" and/or "max" groupby - aggregations will be used to combine the statistics for - duplicate keys. 
- """ - - agg_dict = {} - for col in df.columns: - if col in gb_cols: - continue - agg = col.split(sep)[-1] - if agg in ("count", "sum"): - agg_dict[col] = ["sum"] - elif agg == "list": - agg_dict[col] = [list] - elif agg in OPTIMIZED_AGGS: - agg_dict[col] = [agg] - else: - raise ValueError(f"Unexpected aggregation: {agg}") - - gb = df.groupby(gb_cols, dropna=dropna, as_index=False, sort=sort).agg( - agg_dict - ) - - # Don't include the last aggregation in the column names - output_columns = [ - _make_name(name[:-1] if isinstance(name, tuple) else name, sep=sep) - for name in gb.columns - ] - gb.columns = output_columns - # Return with deterministic column ordering - return gb[sorted(output_columns)] - - -@_dask_cudf_performance_tracking -def _var_agg(df, col, count_name, sum_name, pow2_sum_name, ddof=1): - """Calculate variance (given count, sum, and sum-squared columns).""" - - # Select count, sum, and sum-squared - n = df[count_name] - x = df[sum_name] - x2 = df[pow2_sum_name] - - # Use sum-squared approach to get variance - var = x2 - x**2 / n - div = n - ddof - div[div < 1] = 1 # Avoid division by 0 - var /= div - - # Set appropriate NaN elements - # (since we avoided 0-division) - var[(n - ddof) == 0] = np.nan - - return var - - -@_dask_cudf_performance_tracking -def _finalize_gb_agg( - gb_in, - gb_cols, - aggs, - columns, - final_columns, - as_index, - dropna, - sort, - sep, - str_cols_out, - aggs_renames, -): - """Final aggregation task. - - This is the final operation on each output partitions - of the `groupby_agg` algorithm. This function must - take care of higher-order aggregations, like "mean", - "std" and "var". We also need to deal with the column - index, the row index, and final sorting behavior. - """ - - gb = _tree_node_agg(gb_in, gb_cols, dropna, sort, sep) - - # Deal with higher-order aggregations - for col in columns: - agg_list = aggs.get(col, []) - agg_set = set(agg_list) - if agg_set.intersection({"mean", "std", "var"}): - count_name = _make_name((col, "count"), sep=sep) - sum_name = _make_name((col, "sum"), sep=sep) - if agg_set.intersection({"std", "var"}): - pow2_sum_name = _make_name((col, "pow2", "sum"), sep=sep) - var = _var_agg(gb, col, count_name, sum_name, pow2_sum_name) - if "var" in agg_list: - name_var = _make_name((col, "var"), sep=sep) - gb[name_var] = var - if "std" in agg_list: - name_std = _make_name((col, "std"), sep=sep) - gb[name_std] = np.sqrt(var) - gb.drop(columns=[pow2_sum_name], inplace=True) - if "mean" in agg_list: - mean_name = _make_name((col, "mean"), sep=sep) - gb[mean_name] = gb[sum_name] / gb[count_name] - if "sum" not in agg_list: - gb.drop(columns=[sum_name], inplace=True) - if "count" not in agg_list: - gb.drop(columns=[count_name], inplace=True) - if list in agg_list: - collect_name = _make_name((col, "list"), sep=sep) - gb[collect_name] = gb[collect_name].list.concat() - - # Ensure sorted keys if `sort=True` - if sort: - gb = gb.sort_values(gb_cols) - - # Set index if necessary - if as_index: - gb.set_index(gb_cols, inplace=True) - - # Unflatten column names - col_array = [] - agg_array = [] - for col in gb.columns: - if col in gb_cols: - col_array.append(col) - agg_array.append("") - else: - name, agg = col.split(sep) - col_array.append(name) - agg_array.append(aggs_renames.get((name, agg), agg)) - if str_cols_out: - gb.columns = col_array - else: - gb.columns = pd.MultiIndex.from_arrays([col_array, agg_array]) - - return gb[final_columns] diff --git a/python/dask_cudf/dask_cudf/_legacy/io/__init__.py 
b/python/dask_cudf/dask_cudf/_legacy/io/__init__.py index 0421bd755f4..c544c32523f 100644 --- a/python/dask_cudf/dask_cudf/_legacy/io/__init__.py +++ b/python/dask_cudf/dask_cudf/_legacy/io/__init__.py @@ -1,11 +1 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -from .csv import read_csv # noqa: F401 -from .json import read_json # noqa: F401 -from .orc import read_orc, to_orc # noqa: F401 -from .text import read_text # noqa: F401 - -try: - from .parquet import read_parquet, to_parquet # noqa: F401 -except ImportError: - pass +# Copyright (c) 2018-2025, NVIDIA CORPORATION. diff --git a/python/dask_cudf/dask_cudf/_legacy/io/csv.py b/python/dask_cudf/dask_cudf/_legacy/io/csv.py deleted file mode 100644 index fa5400344f9..00000000000 --- a/python/dask_cudf/dask_cudf/_legacy/io/csv.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. - -import os -from glob import glob -from warnings import warn - -from fsspec.utils import infer_compression - -from dask import dataframe as dd -from dask.base import tokenize -from dask.dataframe.io.csv import make_reader -from dask.utils import apply, parse_bytes - -import cudf - - -def read_csv(path, blocksize="default", **kwargs): - """ - Read CSV files into a :class:`.DataFrame`. - - This API parallelizes the :func:`cudf:cudf.read_csv` function in - the following ways: - - It supports loading many files at once using globstrings: - - >>> import dask_cudf - >>> df = dask_cudf.read_csv("myfiles.*.csv") - - In some cases it can break up large files: - - >>> df = dask_cudf.read_csv("largefile.csv", blocksize="256 MiB") - - It can read CSV files from external resources (e.g. S3, HTTP, FTP) - - >>> df = dask_cudf.read_csv("s3://bucket/myfiles.*.csv") - >>> df = dask_cudf.read_csv("https://www.mycloud.com/sample.csv") - - Internally ``read_csv`` uses :func:`cudf:cudf.read_csv` and - supports many of the same keyword arguments with the same - performance guarantees. See the docstring for - :func:`cudf:cudf.read_csv` for more information on available - keyword arguments. - - Parameters - ---------- - path : str, path object, or file-like object - Either a path to a file (a str, :py:class:`pathlib.Path`, or - py._path.local.LocalPath), URL (including http, ftp, and S3 - locations), or any object with a read() method (such as - builtin :py:func:`open` file handler function or - :py:class:`~io.StringIO`). - blocksize : int or str, default "256 MiB" - The target task partition size. If ``None``, a single block - is used for each file. - **kwargs : dict - Passthrough key-word arguments that are sent to - :func:`cudf:cudf.read_csv`. - - Notes - ----- - If any of `skipfooter`/`skiprows`/`nrows` are passed, - `blocksize` will default to None. - - Examples - -------- - >>> import dask_cudf - >>> ddf = dask_cudf.read_csv("sample.csv", usecols=["a", "b"]) - >>> ddf.compute() - a b - 0 1 hi - 1 2 hello - 2 3 ai - - """ - - # Handle `chunksize` deprecation - if "chunksize" in kwargs: - chunksize = kwargs.pop("chunksize", "default") - warn( - "`chunksize` is deprecated and will be removed in the future. " - "Please use `blocksize` instead.", - FutureWarning, - ) - if blocksize == "default": - blocksize = chunksize - - # Set default `blocksize` - if blocksize == "default": - if ( - kwargs.get("skipfooter", 0) != 0 - or kwargs.get("skiprows", 0) != 0 - or kwargs.get("nrows", None) is not None - ): - # Cannot read in blocks if skipfooter, - # skiprows or nrows is passed. 
- blocksize = None - else: - blocksize = "256 MiB" - - if "://" in str(path): - func = make_reader(cudf.read_csv, "read_csv", "CSV") - return func(path, blocksize=blocksize, **kwargs) - else: - return _internal_read_csv(path=path, blocksize=blocksize, **kwargs) - - -def _internal_read_csv(path, blocksize="256 MiB", **kwargs): - if isinstance(blocksize, str): - blocksize = parse_bytes(blocksize) - - if isinstance(path, list): - filenames = path - elif isinstance(path, str): - filenames = sorted(glob(path)) - elif hasattr(path, "__fspath__"): - filenames = sorted(glob(path.__fspath__())) - else: - raise TypeError(f"Path type not understood:{type(path)}") - - if not filenames: - msg = f"A file in: {filenames} does not exist." - raise FileNotFoundError(msg) - - name = "read-csv-" + tokenize( - path, tokenize, **kwargs - ) # TODO: get last modified time - - compression = kwargs.get("compression", "infer") - - if compression == "infer": - # Infer compression from first path by default - compression = infer_compression(filenames[0]) - - if compression and blocksize: - # compressed CSVs reading must read the entire file - kwargs.pop("byte_range", None) - warn( - "Warning %s compression does not support breaking apart files\n" - "Please ensure that each individual file can fit in memory and\n" - "use the keyword ``blocksize=None to remove this message``\n" - "Setting ``blocksize=(size of file)``" % compression - ) - blocksize = None - - if blocksize is None: - return read_csv_without_blocksize(path, **kwargs) - - # Let dask.dataframe generate meta - dask_reader = make_reader(cudf.read_csv, "read_csv", "CSV") - kwargs1 = kwargs.copy() - usecols = kwargs1.pop("usecols", None) - dtype = kwargs1.pop("dtype", None) - meta = dask_reader(filenames[0], **kwargs1)._meta - names = meta.columns - if usecols or dtype: - # Regenerate meta with original kwargs if - # `usecols` or `dtype` was specified - meta = dask_reader(filenames[0], **kwargs)._meta - - dsk = {} - i = 0 - dtypes = meta.dtypes.values - - for fn in filenames: - size = os.path.getsize(fn) - for start in range(0, size, blocksize): - kwargs2 = kwargs.copy() - kwargs2["byte_range"] = ( - start, - blocksize, - ) # specify which chunk of the file we care about - if start != 0: - kwargs2["names"] = names # no header in the middle of the file - kwargs2["header"] = None - dsk[(name, i)] = (apply, _read_csv, [fn, dtypes], kwargs2) - - i += 1 - - divisions = [None] * (len(dsk) + 1) - return dd.core.new_dd_object(dsk, name, meta, divisions) - - -def _read_csv(fn, dtypes=None, **kwargs): - return cudf.read_csv(fn, **kwargs) - - -def read_csv_without_blocksize(path, **kwargs): - """Read entire CSV with optional compression (gzip/zip) - - Parameters - ---------- - path : str - path to files (support for glob) - """ - if isinstance(path, list): - filenames = path - elif isinstance(path, str): - filenames = sorted(glob(path)) - elif hasattr(path, "__fspath__"): - filenames = sorted(glob(path.__fspath__())) - else: - raise TypeError(f"Path type not understood:{type(path)}") - - name = "read-csv-" + tokenize(path, **kwargs) - - meta_kwargs = kwargs.copy() - if "skipfooter" in meta_kwargs: - meta_kwargs.pop("skipfooter") - if "nrows" in meta_kwargs: - meta_kwargs.pop("nrows") - # Read "head" of first file (first 5 rows). - # Convert to empty df for metadata. 
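# Sketch of the byte-range planning used by `_internal_read_csv` above: each
# file is cut into blocksize-sized (start, length) windows, and every window
# after the first re-supplies column names because only the head of the file
# carries a header row. `plan_csv_tasks` is an illustrative helper, not an
# API from the patch; each task would be fed to cudf.read_csv(**kwargs).
import os

def plan_csv_tasks(filenames, blocksize, names=None):
    tasks = []
    for fn in filenames:
        size = os.path.getsize(fn)
        for start in range(0, size, blocksize):
            kwargs = {"byte_range": (start, blocksize)}
            if start != 0:
                # no header in the middle of the file
                kwargs.update(names=names, header=None)
            tasks.append((fn, kwargs))
    return tasks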
- meta = cudf.read_csv(filenames[0], nrows=5, **meta_kwargs).iloc[:0] - - graph = { - (name, i): (apply, cudf.read_csv, [fn], kwargs) - for i, fn in enumerate(filenames) - } - - divisions = [None] * (len(filenames) + 1) - - return dd.core.new_dd_object(graph, name, meta, divisions) diff --git a/python/dask_cudf/dask_cudf/_legacy/io/json.py b/python/dask_cudf/dask_cudf/_legacy/io/json.py deleted file mode 100644 index 98c5ceedb76..00000000000 --- a/python/dask_cudf/dask_cudf/_legacy/io/json.py +++ /dev/null @@ -1,209 +0,0 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. - -from functools import partial - -import numpy as np -from fsspec.core import get_compression, get_fs_token_paths - -import dask -from dask.utils import parse_bytes - -import cudf -from cudf.core.column import as_column -from cudf.utils.ioutils import _is_local_filesystem - -from dask_cudf.backends import _default_backend - - -def _read_json_partition( - paths, - fs=None, - include_path_column=False, - path_converter=None, - **kwargs, -): - # Transfer all data up front for remote storage - sources = ( - paths - if fs is None - else fs.cat_ranges( - paths, - [0] * len(paths), - fs.sizes(paths), - ) - ) - - if include_path_column: - # Add "path" column. - # Must iterate over sources sequentially - if not isinstance(include_path_column, str): - include_path_column = "path" - converted_paths = ( - paths - if path_converter is None - else [path_converter(path) for path in paths] - ) - dfs = [] - for i, source in enumerate(sources): - df = cudf.read_json(source, **kwargs) - df[include_path_column] = as_column( - converted_paths[i], length=len(df) - ) - dfs.append(df) - return cudf.concat(dfs) - else: - # Pass sources directly to cudf - return cudf.read_json(sources, **kwargs) - - -def read_json( - url_path, - engine="auto", - blocksize=None, - orient="records", - lines=None, - compression="infer", - aggregate_files=True, - **kwargs, -): - """Read JSON data into a :class:`.DataFrame`. - - This function wraps :func:`dask.dataframe.read_json`, and passes - ``engine=partial(cudf.read_json, engine="auto")`` by default. - - Parameters - ---------- - url_path : str, list of str - Location to read from. If a string, can include a glob character to - find a set of file names. - Supports protocol specifications such as ``"s3://"``. - engine : str or Callable, default "auto" - - If str, this value will be used as the ``engine`` argument - when :func:`cudf.read_json` is used to create each partition. - If a :obj:`~collections.abc.Callable`, this value will be used as the - underlying function used to create each partition from JSON - data. The default value is "auto", so that - ``engine=partial(cudf.read_json, engine="auto")`` will be - passed to :func:`dask.dataframe.read_json` by default. - aggregate_files : bool or int - Whether to map multiple files to each output partition. If True, - the `blocksize` argument will be used to determine the number of - files in each partition. If any one file is larger than `blocksize`, - the `aggregate_files` argument will be ignored. If an integer value - is specified, the `blocksize` argument will be ignored, and that - number of files will be mapped to each partition. Default is True. - **kwargs : - Key-word arguments to pass through to :func:`dask.dataframe.read_json`. 
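# The engine plumbing described above, in isolation: a string value is
# wrapped into a partial of cudf.read_json, while a callable is passed
# through untouched. `resolve_engine` is an illustrative name only.
from functools import partial

import cudf

def resolve_engine(engine="auto"):
    if isinstance(engine, str):
        return partial(cudf.read_json, engine=engine)
    return engine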
- - Returns - ------- - :class:`.DataFrame` - - Examples - -------- - Load single file - - >>> from dask_cudf import read_json - >>> read_json('myfile.json') # doctest: +SKIP - - Load large line-delimited JSON files using partitions of approx - 256MB size - - >>> read_json('data/file*.csv', blocksize=2**28) # doctest: +SKIP - - Load nested JSON data - - >>> read_json('myfile.json') # doctest: +SKIP - - See Also - -------- - dask.dataframe.read_json - - """ - - if lines is None: - lines = orient == "records" - if orient != "records" and lines: - raise ValueError( - 'Line-delimited JSON is only available with orient="records".' - ) - if blocksize and (orient != "records" or not lines): - raise ValueError( - "JSON file chunking only allowed for JSON-lines" - "input (orient='records', lines=True)." - ) - - inputs = [] - if aggregate_files and blocksize or int(aggregate_files) > 1: - # Attempt custom read if we are mapping multiple files - # to each output partition. Otherwise, upstream logic - # is sufficient. - - storage_options = kwargs.get("storage_options", {}) - fs, _, paths = get_fs_token_paths( - url_path, mode="rb", storage_options=storage_options - ) - if isinstance(aggregate_files, int) and aggregate_files > 1: - # Map a static file count to each partition - inputs = [ - paths[offset : offset + aggregate_files] - for offset in range(0, len(paths), aggregate_files) - ] - elif aggregate_files is True and blocksize: - # Map files dynamically (using blocksize) - file_sizes = fs.sizes(paths) # NOTE: This can be slow - blocksize = parse_bytes(blocksize) - if all([file_size <= blocksize for file_size in file_sizes]): - counts = np.unique( - np.floor(np.cumsum(file_sizes) / blocksize), - return_counts=True, - )[1] - offsets = np.concatenate([[0], counts.cumsum()]) - inputs = [ - paths[offsets[i] : offsets[i + 1]] - for i in range(len(offsets) - 1) - ] - - if inputs: - # Inputs were successfully populated. - # Use custom _read_json_partition function - # to generate each partition. - - compression = get_compression( - url_path[0] if isinstance(url_path, list) else url_path, - compression, - ) - _kwargs = dict( - orient=orient, - lines=lines, - compression=compression, - include_path_column=kwargs.get("include_path_column", False), - path_converter=kwargs.get("path_converter"), - ) - if not _is_local_filesystem(fs): - _kwargs["fs"] = fs - # TODO: Generate meta more efficiently - meta = _read_json_partition(inputs[0][:1], **_kwargs) - return dask.dataframe.from_map( - _read_json_partition, - inputs, - meta=meta, - **_kwargs, - ) - - # Fall back to dask.dataframe.read_json - return _default_backend( - dask.dataframe.read_json, - url_path, - engine=( - partial(cudf.read_json, engine=engine) - if isinstance(engine, str) - else engine - ), - blocksize=blocksize, - orient=orient, - lines=lines, - compression=compression, - **kwargs, - ) diff --git a/python/dask_cudf/dask_cudf/_legacy/io/orc.py b/python/dask_cudf/dask_cudf/_legacy/io/orc.py deleted file mode 100644 index fcf684fd6c8..00000000000 --- a/python/dask_cudf/dask_cudf/_legacy/io/orc.py +++ /dev/null @@ -1,195 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
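# The legacy `read_json` deleted above maps several small files to each
# output partition by bucketing cumulative file sizes into blocksize
# windows. A standalone sketch of that bucketing, assuming every file is
# already <= blocksize (the original verifies this before taking the path):
import numpy as np

def bucket_paths(paths, file_sizes, blocksize):
    counts = np.unique(
        np.floor(np.cumsum(file_sizes) / blocksize), return_counts=True
    )[1]
    offsets = np.concatenate([[0], counts.cumsum()])
    return [paths[offsets[i] : offsets[i + 1]] for i in range(len(offsets) - 1)]

# e.g. sizes [100, 100, 100] with blocksize=250 -> [['p0', 'p1'], ['p2']]
print(bucket_paths(["p0", "p1", "p2"], [100, 100, 100], 250))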
- -from io import BufferedWriter, IOBase - -from fsspec.core import get_fs_token_paths -from fsspec.utils import stringify_path -from pyarrow import orc as orc - -from dask import dataframe as dd -from dask.dataframe.io.utils import _get_pyarrow_dtypes - -import cudf - - -def _read_orc_stripe(source, fs, columns=None, kwargs=None): - """Pull out specific columns from specific stripe""" - path, stripe = source - if kwargs is None: - kwargs = {} - with fs.open(path, "rb") as f: - df_stripe = cudf.read_orc( - f, stripes=[stripe], columns=columns, **kwargs - ) - return df_stripe - - -def read_orc(path, columns=None, filters=None, storage_options=None, **kwargs): - """Read ORC files into a :class:`.DataFrame`. - - Note that this function is mostly borrowed from upstream Dask. - - Parameters - ---------- - path : str or list[str] - Location of file(s), which can be a full URL with protocol specifier, - and may include glob character if a single string. - columns : None or list[str] - Columns to load. If None, loads all. - filters : None or list of tuple or list of lists of tuples - If not None, specifies a filter predicate used to filter out - row groups using statistics stored for each row group as - Parquet metadata. Row groups that do not match the given - filter predicate are not read. The predicate is expressed in - `disjunctive normal form (DNF) - `__ - like ``[[('x', '=', 0), ...], ...]``. DNF allows arbitrary - boolean logical combinations of single column predicates. The - innermost tuples each describe a single column predicate. The - list of inner predicates is interpreted as a conjunction - (AND), forming a more selective and multiple column predicate. - Finally, the outermost list combines these filters as a - disjunction (OR). Predicates may also be passed as a list of - tuples. This form is interpreted as a single conjunction. To - express OR in predicates, one must use the (preferred) - notation of list of lists of tuples. - storage_options : None or dict - Further parameters to pass to the bytes backend. 
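# Concrete shape of the DNF `filters` argument described above (values are
# illustrative): the outer list is an OR of inner AND-lists, and each tuple
# is a single-column predicate.
filters = [
    [("x", "=", 0), ("y", ">", 10)],  # x == 0 AND y > 10
    [("z", "<", 5)],                  # OR z < 5
]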
- - See Also - -------- - dask.dataframe.read_orc - - Returns - ------- - dask_cudf.DataFrame - - """ - - storage_options = storage_options or {} - fs, _, paths = get_fs_token_paths( - path, mode="rb", storage_options=storage_options - ) - schema = None - nstripes_per_file = [] - for path in paths: - with fs.open(path, "rb") as f: - o = orc.ORCFile(f) - if schema is None: - schema = o.schema - elif schema != o.schema: - raise ValueError( - "Incompatible schemas while parsing ORC files" - ) - nstripes_per_file.append(o.nstripes) - schema = _get_pyarrow_dtypes(schema, categories=None) - if columns is not None: - ex = set(columns) - set(schema) - if ex: - raise ValueError( - f"Requested columns ({ex}) not in schema ({set(schema)})" - ) - else: - columns = list(schema) - - with fs.open(paths[0], "rb") as f: - meta = cudf.read_orc( - f, - stripes=[0] if nstripes_per_file[0] else None, - columns=columns, - **kwargs, - ) - - sources = [] - for path, n in zip(paths, nstripes_per_file): - for stripe in ( - range(n) - if filters is None - else cudf.io.orc._filter_stripes(filters, path) - ): - sources.append((path, stripe)) - - return dd.from_map( - _read_orc_stripe, - sources, - args=[fs], - columns=columns, - kwargs=kwargs, - meta=meta, - ) - - -def write_orc_partition(df, path, fs, filename, compression="snappy"): - full_path = fs.sep.join([path, filename]) - with fs.open(full_path, mode="wb") as out_file: - if not isinstance(out_file, IOBase): - out_file = BufferedWriter(out_file) - cudf.io.to_orc(df, out_file, compression=compression) - return full_path - - -def to_orc( - df, - path, - write_index=True, - storage_options=None, - compression="snappy", - compute=True, - **kwargs, -): - """ - Write a :class:`.DataFrame` to ORC file(s) (one file per partition). - - Parameters - ---------- - df : DataFrame - path : str or pathlib.Path - Destination directory for data. Prepend with protocol like ``s3://`` - or ``hdfs://`` for remote data. - write_index : boolean, optional - Whether or not to write the index. Defaults to True. - storage_options : None or dict - Further parameters to pass to the bytes backend. - compression : string or dict, optional - compute : bool, optional - If True (default) then the result is computed immediately. If - False then a :class:`~dask.delayed.Delayed` object is returned - for future computation. 
- - """ - - from dask import compute as dask_compute, delayed - - # TODO: Use upstream dask implementation once available - # (see: Dask Issue#5596) - - if hasattr(path, "name"): - path = stringify_path(path) - fs, _, _ = get_fs_token_paths( - path, mode="wb", storage_options=storage_options - ) - # Trim any protocol information from the path before forwarding - path = fs._strip_protocol(path) - - if write_index: - df = df.reset_index() - else: - # Not writing index - might as well drop it - df = df.reset_index(drop=True) - - fs.mkdirs(path, exist_ok=True) - - # Use i_offset and df.npartitions to define file-name list - filenames = ["part.%i.orc" % i for i in range(df.npartitions)] - - # write parts - dwrite = delayed(write_orc_partition) - parts = [ - dwrite(d, path, fs, filename, compression=compression) - for d, filename in zip(df.to_delayed(), filenames) - ] - - if compute: - return dask_compute(*parts) - - return delayed(list)(parts) diff --git a/python/dask_cudf/dask_cudf/_legacy/io/parquet.py b/python/dask_cudf/dask_cudf/_legacy/io/parquet.py index c0638e4a1c3..c0792663c7e 100644 --- a/python/dask_cudf/dask_cudf/_legacy/io/parquet.py +++ b/python/dask_cudf/dask_cudf/_legacy/io/parquet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. import itertools import warnings from functools import partial @@ -8,7 +8,7 @@ import pandas as pd from pyarrow import dataset as pa_ds, parquet as pq -from dask import dataframe as dd +import dask.dataframe as dd from dask.dataframe.io.parquet.arrow import ArrowDatasetEngine try: @@ -448,65 +448,7 @@ def set_object_dtypes_from_pa_schema(df, schema): df._data[col_name] = col.astype(typ) -def read_parquet(path, columns=None, **kwargs): - """ - Read parquet files into a :class:`.DataFrame`. - - Calls :func:`dask.dataframe.read_parquet` with ``engine=CudfEngine`` - to coordinate the execution of :func:`cudf.read_parquet`, and to - ultimately create a :class:`.DataFrame` collection. - - See the :func:`dask.dataframe.read_parquet` documentation for - all available options. - - Examples - -------- - >>> from dask_cudf import read_parquet - >>> df = read_parquet("/path/to/dataset/") # doctest: +SKIP - - When dealing with one or more large parquet files having an - in-memory footprint >15% device memory, the ``split_row_groups`` - argument should be used to map Parquet **row-groups** to DataFrame - partitions (instead of **files** to partitions). For example, the - following code will map each row-group to a distinct partition: - - >>> df = read_parquet(..., split_row_groups=True) # doctest: +SKIP - - To map **multiple** row-groups to each partition, an integer can be - passed to ``split_row_groups`` to specify the **maximum** number of - row-groups allowed in each output partition: - - >>> df = read_parquet(..., split_row_groups=10) # doctest: +SKIP - - See Also - -------- - cudf.read_parquet - dask.dataframe.read_parquet - """ - if isinstance(columns, str): - columns = [columns] - - # Set "check_file_size" option to determine whether we - # should check the parquet-file size. This check is meant - # to "protect" users from `split_row_groups` default changes - check_file_size = kwargs.pop("check_file_size", 500_000_000) - if ( - check_file_size - and ("split_row_groups" not in kwargs) - and ("chunksize" not in kwargs) - ): - # User is not specifying `split_row_groups` or `chunksize`, - # so we should warn them if/when a file is ~>0.5GB on disk. 
- # They can set `split_row_groups` explicitly to silence/skip - # this check - if "read" not in kwargs: - kwargs["read"] = {} - kwargs["read"]["check_file_size"] = check_file_size - - return dd.read_parquet(path, columns=columns, engine=CudfEngine, **kwargs) - - -to_parquet = partial(dd.to_parquet, engine=CudfEngine) +to_parquet = dd.to_parquet if create_metadata_file_dd is None: create_metadata_file = create_metadata_file_dd diff --git a/python/dask_cudf/dask_cudf/_legacy/io/text.py b/python/dask_cudf/dask_cudf/_legacy/io/text.py deleted file mode 100644 index 3757c85c80c..00000000000 --- a/python/dask_cudf/dask_cudf/_legacy/io/text.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. - -import os -from glob import glob - -import dask.dataframe as dd -from dask.utils import parse_bytes - -import cudf - - -def _read_text(source, **kwargs): - # Wrapper for cudf.read_text operation - fn, byte_range = source - return cudf.read_text(fn, byte_range=byte_range, **kwargs) - - -def read_text(path, chunksize="256 MiB", byte_range=None, **kwargs): - if isinstance(chunksize, str): - chunksize = parse_bytes(chunksize) - - if isinstance(path, list): - filenames = path - elif isinstance(path, str): - filenames = sorted(glob(path)) - elif hasattr(path, "__fspath__"): - filenames = sorted(glob(path.__fspath__())) - else: - raise TypeError(f"Path type not understood:{type(path)}") - - if not filenames: - msg = f"A file in: {filenames} does not exist." - raise FileNotFoundError(msg) - - if chunksize and byte_range: - raise ValueError("Cannot specify both chunksize and byte_range.") - - if chunksize: - sources = [] - for fn in filenames: - size = os.path.getsize(fn) - for start in range(0, size, chunksize): - byte_range = ( - start, - chunksize, - ) # specify which chunk of the file we care about - sources.append((fn, byte_range)) - else: - sources = [(fn, byte_range) for fn in filenames] - - return dd.from_map( - _read_text, - sources, - meta=cudf.Series([], dtype="O"), - **kwargs, - ) diff --git a/python/dask_cudf/dask_cudf/_legacy/sorting.py b/python/dask_cudf/dask_cudf/_legacy/sorting.py deleted file mode 100644 index a2ba4d1878e..00000000000 --- a/python/dask_cudf/dask_cudf/_legacy/sorting.py +++ /dev/null @@ -1,361 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -import warnings -from collections.abc import Iterator -from functools import wraps - -import cupy -import numpy as np -import tlz as toolz - -from dask import config -from dask.base import tokenize -from dask.dataframe import methods -from dask.dataframe.core import DataFrame, Index, Series -from dask.dataframe.shuffle import rearrange_by_column -from dask.highlevelgraph import HighLevelGraph -from dask.utils import M - -import cudf -from cudf.api.types import _is_categorical_dtype -from cudf.utils.performance_tracking import _dask_cudf_performance_tracking - -_SHUFFLE_SUPPORT = ("tasks", "p2p") # "disk" not supported - - -def _deprecate_shuffle_kwarg(func): - @wraps(func) - def wrapper(*args, **kwargs): - old_arg_value = kwargs.pop("shuffle", None) - - if old_arg_value is not None: - new_arg_value = old_arg_value - msg = ( - "the 'shuffle' keyword is deprecated, " - "use 'shuffle_method' instead." - ) - - warnings.warn(msg, FutureWarning) - if kwargs.get("shuffle_method") is not None: - msg = ( - "Can only specify 'shuffle' " - "or 'shuffle_method', not both." 
- ) - raise TypeError(msg) - kwargs["shuffle_method"] = new_arg_value - return func(*args, **kwargs) - - return wrapper - - -@_dask_cudf_performance_tracking -def set_index_post(df, index_name, drop, column_dtype): - df2 = df.set_index(index_name, drop=drop) - df2.columns = df2.columns.astype(column_dtype) - return df2 - - -@_dask_cudf_performance_tracking -def _set_partitions_pre(s, divisions, ascending=True, na_position="last"): - if ascending: - partitions = divisions.searchsorted(s, side="right") - 1 - else: - partitions = ( - len(divisions) - divisions.searchsorted(s, side="right") - 1 - ) - partitions[(partitions < 0) | (partitions >= len(divisions) - 1)] = ( - 0 if ascending else (len(divisions) - 2) - ) - partitions[s._columns[0].isnull().values] = ( - len(divisions) - 2 if na_position == "last" else 0 - ) - return partitions - - -@_dask_cudf_performance_tracking -def _quantile(a, q): - n = len(a) - if not len(a): - return None, n - return ( - a.quantile(q=q.tolist(), interpolation="nearest", method="table"), - n, - ) - - -@_dask_cudf_performance_tracking -def merge_quantiles(finalq, qs, vals): - """Combine several quantile calculations of different data. - [NOTE: Same logic as dask.array merge_percentiles] - """ - if isinstance(finalq, Iterator): - finalq = list(finalq) - finalq = np.array(finalq) - qs = list(map(list, qs)) - vals = list(vals) - vals, Ns = zip(*vals) - Ns = list(Ns) - - L = list(zip(*[(q, val, N) for q, val, N in zip(qs, vals, Ns) if N])) - if not L: - raise ValueError("No non-trivial arrays found") - qs, vals, Ns = L - - if len(vals) != len(qs) or len(Ns) != len(qs): - raise ValueError("qs, vals, and Ns parameters must be the same length") - - # transform qs and Ns into number of observations between quantiles - counts = [] - for q, N in zip(qs, Ns): - count = np.empty(len(q)) - count[1:] = np.diff(q) - count[0] = q[0] - count *= N - counts.append(count) - - def _append_counts(val, count): - val["_counts"] = count - return val - - # Sort by calculated quantile values, then number of observations. - combined_vals_counts = cudf.core.reshape._merge_sorted( - [*map(_append_counts, vals, counts)] - ) - combined_counts = cupy.asnumpy(combined_vals_counts["_counts"].values) - combined_vals = combined_vals_counts.drop(columns=["_counts"]) - - # quantile-like, but scaled by total number of observations - combined_q = np.cumsum(combined_counts) - - # rescale finalq quantiles to match combined_q - desired_q = finalq * sum(Ns) - - # TODO: Support other interpolation methods - # For now - Always use "nearest" for interpolation - left = np.searchsorted(combined_q, desired_q, side="left") - right = np.searchsorted(combined_q, desired_q, side="right") - 1 - np.minimum(left, len(combined_vals) - 1, left) # don't exceed max index - lower = np.minimum(left, right) - upper = np.maximum(left, right) - lower_residual = np.abs(combined_q[lower] - desired_q) - upper_residual = np.abs(combined_q[upper] - desired_q) - mask = lower_residual > upper_residual - index = lower # alias; we no longer need lower - index[mask] = upper[mask] - rv = combined_vals.iloc[index] - return rv.reset_index(drop=True) - - -@_dask_cudf_performance_tracking -def _approximate_quantile(df, q): - """Approximate quantiles of DataFrame or Series. 
- [NOTE: Same logic as dask.dataframe Series quantile] - """ - # current implementation needs q to be sorted so - # sort if array-like, otherwise leave it alone - q_ndarray = np.array(q) - if q_ndarray.ndim > 0: - q_ndarray.sort(kind="mergesort") - q = q_ndarray - - # Lets assume we are dealing with a DataFrame throughout - if isinstance(df, (Series, Index)): - df = df.to_frame() - assert isinstance(df, DataFrame) - final_type = df._meta._constructor - - # Create metadata - meta = df._meta_nonempty.quantile(q=q, method="table") - - # Define final action (create df with quantiles as index) - def finalize_tsk(tsk): - return (final_type, tsk) - - return_type = df.__class__ - - # pandas/cudf uses quantile in [0, 1] - # numpy / cupy uses [0, 100] - qs = np.asarray(q) - token = tokenize(df, qs) - - if len(qs) == 0: - name = "quantiles-" + token - empty_index = cudf.Index([], dtype=float) - return Series( - { - (name, 0): final_type( - {col: [] for col in df.columns}, - name=df.name, - index=empty_index, - ) - }, - name, - df._meta, - [None, None], - ) - else: - new_divisions = [np.min(q), np.max(q)] - - name = "quantiles-1-" + token - val_dsk = { - (name, i): (_quantile, key, qs) - for i, key in enumerate(df.__dask_keys__()) - } - - name2 = "quantiles-2-" + token - merge_dsk = { - (name2, 0): finalize_tsk( - (merge_quantiles, qs, [qs] * df.npartitions, sorted(val_dsk)) - ) - } - dsk = toolz.merge(val_dsk, merge_dsk) - graph = HighLevelGraph.from_collections(name2, dsk, dependencies=[df]) - df = return_type(graph, name2, meta, new_divisions) - - def set_quantile_index(df): - df.index = q - return df - - df = df.map_partitions(set_quantile_index, meta=meta) - return df - - -@_dask_cudf_performance_tracking -def quantile_divisions(df, by, npartitions): - qn = np.linspace(0.0, 1.0, npartitions + 1).tolist() - divisions = _approximate_quantile(df[by], qn).compute() - columns = divisions.columns - - # TODO: Make sure divisions are correct for all dtypes.. 
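# Sketch of how `_set_partitions_pre` above maps rows onto the divisions
# produced by `quantile_divisions`: a right-sided searchsorted against the
# boundary values yields a partition id per row. Toy numpy version; the real
# code additionally handles descending order, out-of-range ids and NA
# placement with explicit masking.
import numpy as np

def assign_partitions(values, divisions):
    parts = np.searchsorted(divisions, values, side="right") - 1
    return np.clip(parts, 0, len(divisions) - 2)

# divisions [0, 10, 20] define two partitions: [0, 10) and [10, 20]
print(assign_partitions(np.array([3, 10, 25]), np.array([0, 10, 20])))  # [0 1 1]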
- if ( - len(columns) == 1 - and df[columns[0]].dtype != "object" - and not _is_categorical_dtype(df[columns[0]].dtype) - ): - dtype = df[columns[0]].dtype - divisions = divisions[columns[0]].astype("int64") - divisions.iloc[-1] += 1 - divisions = sorted( - divisions.drop_duplicates().astype(dtype).to_arrow().tolist(), - key=lambda x: (x is None, x), - ) - else: - for col in columns: - dtype = df[col].dtype - if dtype != "object": - divisions[col] = divisions[col].astype("int64") - divisions[col].iloc[-1] += 1 - divisions[col] = divisions[col].astype(dtype) - else: - if last := divisions[col].iloc[-1]: - val = chr(ord(last[0]) + 1) - else: - val = "this string intentionally left empty" # any but "" - divisions[col].iloc[-1] = val - divisions = divisions.drop_duplicates().sort_index() - return divisions - - -@_deprecate_shuffle_kwarg -@_dask_cudf_performance_tracking -def sort_values( - df, - by, - max_branch=None, - divisions=None, - set_divisions=False, - ignore_index=False, - ascending=True, - na_position="last", - shuffle_method=None, - sort_function=None, - sort_function_kwargs=None, -): - """Sort by the given list/tuple of column names.""" - - if not isinstance(ascending, bool): - raise ValueError("ascending must be either True or False") - if na_position not in ("first", "last"): - raise ValueError("na_position must be either 'first' or 'last'") - - npartitions = df.npartitions - if isinstance(by, tuple): - by = list(by) - elif not isinstance(by, list): - by = [by] - - # parse custom sort function / kwargs if provided - sort_kwargs = { - "by": by, - "ascending": ascending, - "na_position": na_position, - } - if sort_function is None: - sort_function = M.sort_values - if sort_function_kwargs is not None: - sort_kwargs.update(sort_function_kwargs) - - # handle single partition case - if npartitions == 1: - return df.map_partitions(sort_function, **sort_kwargs) - - # Step 1 - Calculate new divisions (if necessary) - if divisions is None: - divisions = quantile_divisions(df, by, npartitions) - - # Step 2 - Perform repartitioning shuffle - meta = df._meta._constructor_sliced([0]) - if not isinstance(divisions, (cudf.Series, cudf.DataFrame)): - dtype = df[by[0]].dtype - divisions = df._meta._constructor_sliced(divisions, dtype=dtype) - - partitions = df[by].map_partitions( - _set_partitions_pre, - divisions=divisions, - ascending=ascending, - na_position=na_position, - meta=meta, - ) - - df2 = df.assign(_partitions=partitions) - df3 = rearrange_by_column( - df2, - "_partitions", - max_branch=max_branch, - npartitions=len(divisions) - 1, - shuffle_method=_get_shuffle_method(shuffle_method), - ignore_index=ignore_index, - ).drop(columns=["_partitions"]) - df3.divisions = (None,) * (df3.npartitions + 1) - - # Step 3 - Return final sorted df - df4 = df3.map_partitions(sort_function, **sort_kwargs) - if not isinstance(divisions, cudf.DataFrame) and set_divisions: - # Can't have multi-column divisions elsewhere in dask (yet) - df4.divisions = tuple(methods.tolist(divisions)) - - return df4 - - -def get_default_shuffle_method(): - # Note that `dask.utils.get_default_shuffle_method` - # will return "p2p" by default when a distributed - # client is present. 
Dask-cudf supports "p2p", but - # will not use it by default (yet) - default = config.get("dataframe.shuffle.method", "tasks") - if default not in _SHUFFLE_SUPPORT: - default = "tasks" - return default - - -def _get_shuffle_method(shuffle_method): - # Utility to set the shuffle_method-kwarg default - # and to validate user-specified options - shuffle_method = shuffle_method or get_default_shuffle_method() - if shuffle_method not in _SHUFFLE_SUPPORT: - raise ValueError( - "Dask-cudf only supports the following shuffle " - f"methods: {_SHUFFLE_SUPPORT}. Got shuffle_method={shuffle_method}" - ) - - return shuffle_method diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index fceaaf185e8..f33733d9583 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. import warnings from collections.abc import Iterator @@ -11,14 +11,12 @@ from packaging.version import Version from pandas.api.types import is_scalar -import dask.dataframe as dd from dask import config from dask.array.dispatch import percentile_lookup from dask.dataframe.backends import ( DataFrameBackendEntrypoint, PandasBackendEntrypoint, ) -from dask.dataframe.core import get_parallel_type, meta_nonempty from dask.dataframe.dispatch import ( categorical_dtype_dispatch, concat_dispatch, @@ -28,6 +26,8 @@ hash_object_dispatch, is_categorical_dtype_dispatch, make_meta_dispatch, + meta_nonempty, + partd_encode_dispatch, pyarrow_schema_dispatch, to_pyarrow_table_dispatch, tolist_dispatch, @@ -46,13 +46,6 @@ from cudf.api.types import is_string_dtype from cudf.utils.performance_tracking import _dask_cudf_performance_tracking -from ._legacy.core import DataFrame, Index, Series - -get_parallel_type.register(cudf.DataFrame, lambda _: DataFrame) -get_parallel_type.register(cudf.Series, lambda _: Series) -get_parallel_type.register(cudf.BaseIndex, lambda _: Index) - - # Required for Arrow filesystem support in read_parquet PYARROW_GE_15 = Version(pa.__version__) >= Version("15.0.0") @@ -318,7 +311,7 @@ def tolist_cudf(obj): @is_categorical_dtype_dispatch.register( - (cudf.Series, cudf.BaseIndex, cudf.CategoricalDtype, Series) + (cudf.Series, cudf.BaseIndex, cudf.CategoricalDtype) # , Series) ) @_dask_cudf_performance_tracking def is_categorical_dtype_cudf(obj): @@ -464,28 +457,21 @@ def sizeof_cudf_series_index(obj): return obj.memory_usage() -# TODO: Remove try/except when cudf is pinned to dask>=2023.10.0 -try: - from dask.dataframe.dispatch import partd_encode_dispatch - - @partd_encode_dispatch.register(cudf.DataFrame) - def _simple_cudf_encode(_): - # Basic pickle-based encoding for a partd k-v store - import pickle +@partd_encode_dispatch.register(cudf.DataFrame) +def _simple_cudf_encode(_): + # Basic pickle-based encoding for a partd k-v store + import pickle - import partd + import partd - def join(dfs): - if not dfs: - return cudf.DataFrame() - else: - return cudf.concat(dfs) - - dumps = partial(pickle.dumps, protocol=pickle.HIGHEST_PROTOCOL) - return partial(partd.Encode, dumps, pickle.loads, join) + def join(dfs): + if not dfs: + return cudf.DataFrame() + else: + return cudf.concat(dfs) -except ImportError: - pass + dumps = partial(pickle.dumps, protocol=pickle.HIGHEST_PROTOCOL) + return partial(partd.Encode, dumps, pickle.loads, join) def _default_backend(func, *args, **kwargs): @@ -557,105 +543,22 @@ def to_cudf_dispatch_from_cudf(data, 
**kwargs): return data -# Define "cudf" backend engine to be registered with Dask -class CudfBackendEntrypoint(DataFrameBackendEntrypoint): - """Backend-entrypoint class for Dask-DataFrame +# Define the "cudf" backend for "legacy" Dask DataFrame +class LegacyCudfBackendEntrypoint(DataFrameBackendEntrypoint): + """Backend-entrypoint class for legacy Dask-DataFrame This class is registered under the name "cudf" for the - ``dask.dataframe.backends`` entrypoint in ``setup.cfg``. - Dask-DataFrame will use the methods defined in this class - in place of ``dask.dataframe.`` when the - "dataframe.backend" configuration is set to "cudf": - - Examples - -------- - >>> import dask - >>> import dask.dataframe as dd - >>> with dask.config.set({"dataframe.backend": "cudf"}): - ... ddf = dd.from_dict({"a": range(10)}) - >>> type(ddf) - + ``dask.dataframe.backends`` entrypoint in ``pyproject.toml``. + This "legacy" backend is only used for CSV support. """ - @classmethod - def to_backend_dispatch(cls): - return to_cudf_dispatch - - @classmethod - def to_backend(cls, data: dd.core._Frame, **kwargs): - if isinstance(data._meta, (cudf.DataFrame, cudf.Series, cudf.Index)): - # Already a cudf-backed collection - _unsupported_kwargs("cudf", "cudf", kwargs) - return data - return data.map_partitions(cls.to_backend_dispatch(), **kwargs) - - @staticmethod - def from_dict( - data, - npartitions, - orient="columns", - dtype=None, - columns=None, - constructor=cudf.DataFrame, - ): - return _default_backend( - dd.from_dict, - data, - npartitions=npartitions, - orient=orient, - dtype=dtype, - columns=columns, - constructor=constructor, - ) - - @staticmethod - def read_parquet(*args, engine=None, **kwargs): - from dask_cudf._legacy.io.parquet import CudfEngine - - _raise_unsupported_parquet_kwargs(**kwargs) - return _default_backend( - dd.read_parquet, - *args, - engine=CudfEngine, - **kwargs, - ) - - @staticmethod - def read_json(*args, **kwargs): - from dask_cudf._legacy.io.json import read_json - - return read_json(*args, **kwargs) - @staticmethod - def read_orc(*args, **kwargs): - from dask_cudf._legacy.io import read_orc - - return read_orc(*args, **kwargs) - - @staticmethod - def read_csv(*args, **kwargs): - from dask_cudf._legacy.io import read_csv - - return read_csv(*args, **kwargs) - - @staticmethod - def read_hdf(*args, **kwargs): - # HDF5 reader not yet implemented in cudf - warnings.warn( - "read_hdf is not yet implemented in cudf/dask_cudf. " - "Moving to cudf from pandas. Expect poor performance!" - ) - return _default_backend(dd.read_hdf, *args, **kwargs).to_backend( - "cudf" - ) - - -# Define "cudf" backend entrypoint for dask-expr -class CudfDXBackendEntrypoint(DataFrameBackendEntrypoint): +# Define the "cudf" backend for expr-based Dask DataFrame +class CudfBackendEntrypoint(DataFrameBackendEntrypoint): """Backend-entrypoint class for Dask-Expressions This class is registered under the name "cudf" for the - ``dask-expr.dataframe.backends`` entrypoint in ``setup.cfg``. + ``dask_expr.dataframe.backends`` entrypoint in ``pyproject.toml``. 
Dask-DataFrame will use the methods defined in this class in place of ``dask_expr.`` when the "dataframe.backend" configuration is set to "cudf": @@ -746,12 +649,12 @@ def read_csv( @staticmethod def read_json(*args, **kwargs): - from dask_cudf._legacy.io.json import read_json as read_json_impl + from dask_cudf.io.json import read_json as read_json_impl return read_json_impl(*args, **kwargs) @staticmethod def read_orc(*args, **kwargs): - from dask_cudf._legacy.io.orc import read_orc as legacy_read_orc + from dask_cudf.io.orc import read_orc as legacy_read_orc return legacy_read_orc(*args, **kwargs) diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index 5fd217209ec..32461104ef9 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -1,56 +1,41 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. import textwrap +import warnings +from importlib import import_module import dask.dataframe as dd -from dask.tokenize import tokenize import cudf from cudf.utils.performance_tracking import _dask_cudf_performance_tracking # This module provides backward compatibility for legacy import patterns. -if dd.DASK_EXPR_ENABLED: - from dask_cudf._expr.collection import ( - DataFrame, - Index, - Series, - ) -else: - from dask_cudf._legacy.core import DataFrame, Index, Series # noqa: F401 - +from dask_cudf._expr.collection import ( + DataFrame, # noqa: F401 + Index, # noqa: F401 + Series, # noqa: F401 +) concat = dd.concat @_dask_cudf_performance_tracking def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None): - from dask_cudf import QUERY_PLANNING_ON - if isinstance(getattr(data, "index", None), cudf.MultiIndex): raise NotImplementedError( "dask_cudf does not support MultiIndex Dataframes." ) - # Dask-expr doesn't support the `name` argument - name = {} - if not QUERY_PLANNING_ON: - name = { - "name": name - or ("from_cudf-" + tokenize(data, npartitions or chunksize)) - } - return dd.from_pandas( data, npartitions=npartitions, chunksize=chunksize, sort=sort, - **name, ) -from_cudf.__doc__ = ( - textwrap.dedent( - """ +from_cudf.__doc__ = textwrap.dedent( + """ Create a :class:`.DataFrame` from a :class:`cudf.DataFrame`. This function is a thin wrapper around @@ -58,9 +43,23 @@ def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None): arguments (described below) excepting that it operates on cuDF rather than pandas objects.\n """ - ) - # TODO: `dd.from_pandas.__doc__` is empty when - # `DASK_DATAFRAME__QUERY_PLANNING=True` - # since dask-expr does not provide a docstring for from_pandas. - + textwrap.dedent(dd.from_pandas.__doc__ or "") -) +) + textwrap.dedent(dd.from_pandas.__doc__) + + +def _deprecated_api(old_api, new_api=None, rec=None): + def inner_func(*args, **kwargs): + if new_api: + # Use alternative + msg = f"{old_api} is now deprecated. " + msg += rec or f"Please use {new_api} instead." + warnings.warn(msg, FutureWarning) + new_attr = new_api.split(".") + module = import_module(".".join(new_attr[:-1])) + return getattr(module, new_attr[-1])(*args, **kwargs) + + # No alternative - raise an error + raise NotImplementedError( + f"{old_api} is no longer supported. 
" + (rec or "") + ) + + return inner_func diff --git a/python/dask_cudf/dask_cudf/io/__init__.py b/python/dask_cudf/dask_cudf/io/__init__.py index 9bca33e414a..a5175c9bbe7 100644 --- a/python/dask_cudf/dask_cudf/io/__init__.py +++ b/python/dask_cudf/dask_cudf/io/__init__.py @@ -1,6 +1,6 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. -from dask_cudf import QUERY_PLANNING_ON, _deprecated_api +from dask_cudf.core import _deprecated_api from . import csv, json, orc, parquet, text # noqa: F401 @@ -15,20 +15,13 @@ ) to_orc = _deprecated_api( "dask_cudf.io.to_orc", - new_api="dask_cudf._legacy.io.to_orc", + new_api="dask_cudf.io.orc.to_orc", rec="Please use the DataFrame.to_orc method instead.", ) read_text = _deprecated_api( "dask_cudf.io.read_text", new_api="dask_cudf.read_text" ) -if QUERY_PLANNING_ON: - read_parquet = parquet.read_parquet -else: - read_parquet = _deprecated_api( - "The legacy dask_cudf.io.read_parquet API", - new_api="dask_cudf.read_parquet", - rec="", - ) +read_parquet = parquet.read_parquet to_parquet = _deprecated_api( "dask_cudf.io.to_parquet", new_api="dask_cudf._legacy.io.parquet.to_parquet", diff --git a/python/dask_cudf/dask_cudf/io/csv.py b/python/dask_cudf/dask_cudf/io/csv.py index 29f98b14511..e36ee04d827 100644 --- a/python/dask_cudf/dask_cudf/io/csv.py +++ b/python/dask_cudf/dask_cudf/io/csv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. import os from glob import glob @@ -25,11 +25,11 @@ def read_csv(path, blocksize="default", **kwargs): >>> import dask_cudf >>> df = dask_cudf.read_csv("myfiles.*.csv") - In some cases it can break up large files: + It can break up large files if blocksize is specified: >>> df = dask_cudf.read_csv("largefile.csv", blocksize="256 MiB") - It can read CSV files from external resources (e.g. S3, HTTP, FTP) + It can read CSV files from external resources (e.g. S3, HTTP, FTP): >>> df = dask_cudf.read_csv("s3://bucket/myfiles.*.csv") >>> df = dask_cudf.read_csv("https://www.mycloud.com/sample.csv") @@ -44,15 +44,15 @@ def read_csv(path, blocksize="default", **kwargs): ---------- path : str, path object, or file-like object Either a path to a file (a str, :py:class:`pathlib.Path`, or - py._path.local.LocalPath), URL (including http, ftp, and S3 - locations), or any object with a read() method (such as + ``py._path.local.LocalPath``), URL (including HTTP, FTP, and S3 + locations), or any object with a ``read()`` method (such as builtin :py:func:`open` file handler function or :py:class:`~io.StringIO`). blocksize : int or str, default "256 MiB" The target task partition size. If ``None``, a single block is used for each file. **kwargs : dict - Passthrough key-word arguments that are sent to + Passthrough keyword arguments that are sent to :func:`cudf:cudf.read_csv`. Notes diff --git a/python/dask_cudf/dask_cudf/io/json.py b/python/dask_cudf/dask_cudf/io/json.py index 8f85ea54c0a..3022ebb2a5b 100644 --- a/python/dask_cudf/dask_cudf/io/json.py +++ b/python/dask_cudf/dask_cudf/io/json.py @@ -1,8 +1,209 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. 
-from dask_cudf import _deprecated_api +from functools import partial -read_json = _deprecated_api( - "dask_cudf.io.json.read_json", - new_api="dask_cudf.read_json", -) +import numpy as np +from fsspec.core import get_compression, get_fs_token_paths + +import dask +from dask.utils import parse_bytes + +import cudf +from cudf.core.column import as_column +from cudf.utils.ioutils import _is_local_filesystem + +from dask_cudf.backends import _default_backend + + +def _read_json_partition( + paths, + fs=None, + include_path_column=False, + path_converter=None, + **kwargs, +): + # Transfer all data up front for remote storage + sources = ( + paths + if fs is None + else fs.cat_ranges( + paths, + [0] * len(paths), + fs.sizes(paths), + ) + ) + + if include_path_column: + # Add "path" column. + # Must iterate over sources sequentially + if not isinstance(include_path_column, str): + include_path_column = "path" + converted_paths = ( + paths + if path_converter is None + else [path_converter(path) for path in paths] + ) + dfs = [] + for i, source in enumerate(sources): + df = cudf.read_json(source, **kwargs) + df[include_path_column] = as_column( + converted_paths[i], length=len(df) + ) + dfs.append(df) + return cudf.concat(dfs) + else: + # Pass sources directly to cudf + return cudf.read_json(sources, **kwargs) + + +def read_json( + url_path, + engine="auto", + blocksize=None, + orient="records", + lines=None, + compression="infer", + aggregate_files=True, + **kwargs, +): + """Read JSON data into a :class:`.DataFrame`. + + This function wraps :func:`dask.dataframe.read_json`, and passes + ``engine=partial(cudf.read_json, engine="auto")`` by default. + + Parameters + ---------- + url_path : str, list of str + Location to read from. If a string, can include a glob character to + find a set of file names. + Supports protocol specifications such as ``"s3://"``. + engine : str or Callable, default "auto" + + If str, this value will be used as the ``engine`` argument + when :func:`cudf.read_json` is used to create each partition. + If a :obj:`~collections.abc.Callable`, this value will be used as the + underlying function used to create each partition from JSON + data. The default value is "auto", so that + ``engine=partial(cudf.read_json, engine="auto")`` will be + passed to :func:`dask.dataframe.read_json` by default. + aggregate_files : bool or int + Whether to map multiple files to each output partition. If True, + the `blocksize` argument will be used to determine the number of + files in each partition. If any one file is larger than `blocksize`, + the `aggregate_files` argument will be ignored. If an integer value + is specified, the `blocksize` argument will be ignored, and that + number of files will be mapped to each partition. Default is True. + **kwargs : + Keyword arguments to pass through to :func:`dask.dataframe.read_json`. + + Returns + ------- + :class:`.DataFrame` + + Examples + -------- + Load single file + + >>> from dask_cudf import read_json + >>> read_json('myfile.json') # doctest: +SKIP + + Load large line-delimited JSON files using partitions of approximately + 256 MB in size + + >>> read_json('data/file*.json', blocksize=2**28) # doctest: +SKIP + + Load nested JSON data + + >>> read_json('myfile.json') # doctest: +SKIP + + See Also + -------- + dask.dataframe.read_json + + """ + + if lines is None: + lines = orient == "records" + if orient != "records" and lines: + raise ValueError( + 'Line-delimited JSON is only available with orient="records".'
+ ) + if blocksize and (orient != "records" or not lines): + raise ValueError( + "JSON file chunking only allowed for JSON-lines " + "input (orient='records', lines=True)." + ) + + inputs = [] + if (aggregate_files and blocksize) or int(aggregate_files) > 1: + # Attempt custom read if we are mapping multiple files + # to each output partition. Otherwise, upstream logic + # is sufficient. + + storage_options = kwargs.get("storage_options", {}) + fs, _, paths = get_fs_token_paths( + url_path, mode="rb", storage_options=storage_options + ) + if isinstance(aggregate_files, int) and aggregate_files > 1: + # Map a static file count to each partition + inputs = [ + paths[offset : offset + aggregate_files] + for offset in range(0, len(paths), aggregate_files) + ] + elif aggregate_files is True and blocksize: + # Map files dynamically (using blocksize) + file_sizes = fs.sizes(paths) # NOTE: This can be slow + blocksize = parse_bytes(blocksize) + if all([file_size <= blocksize for file_size in file_sizes]): + counts = np.unique( + np.floor(np.cumsum(file_sizes) / blocksize), + return_counts=True, + )[1] + offsets = np.concatenate([[0], counts.cumsum()]) + inputs = [ + paths[offsets[i] : offsets[i + 1]] + for i in range(len(offsets) - 1) + ] + + if inputs: + # Inputs were successfully populated. + # Use custom _read_json_partition function + # to generate each partition. + + compression = get_compression( + url_path[0] if isinstance(url_path, list) else url_path, + compression, + ) + _kwargs = dict( + orient=orient, + lines=lines, + compression=compression, + include_path_column=kwargs.get("include_path_column", False), + path_converter=kwargs.get("path_converter"), + ) + if not _is_local_filesystem(fs): + _kwargs["fs"] = fs + # TODO: Generate meta more efficiently + meta = _read_json_partition(inputs[0][:1], **_kwargs) + return dask.dataframe.from_map( + _read_json_partition, + inputs, + meta=meta, + **_kwargs, + ) + + # Fall back to dask.dataframe.read_json + return _default_backend( + dask.dataframe.read_json, + url_path, + engine=( + partial(cudf.read_json, engine=engine) + if isinstance(engine, str) + else engine + ), + blocksize=blocksize, + orient=orient, + lines=lines, + compression=compression, + **kwargs, + ) diff --git a/python/dask_cudf/dask_cudf/io/orc.py b/python/dask_cudf/dask_cudf/io/orc.py index 5219cdacc31..5de28751912 100644 --- a/python/dask_cudf/dask_cudf/io/orc.py +++ b/python/dask_cudf/dask_cudf/io/orc.py @@ -1,13 +1,195 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. - -from dask_cudf import _deprecated_api - -read_orc = _deprecated_api( - "dask_cudf.io.orc.read_orc", - new_api="dask_cudf.read_orc", -) -to_orc = _deprecated_api( - "dask_cudf.io.orc.to_orc", - new_api="dask_cudf._legacy.io.orc.to_orc", - rec="Please use the DataFrame.to_orc method instead.", -) +# Copyright (c) 2020-2025, NVIDIA CORPORATION.
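A short usage sketch for the read_json implementation above (paths hypothetical). An integer aggregate_files maps a fixed number of files to each output partition; the default aggregate_files=True combined with a blocksize packs whole files into partitions of roughly that size, falling back to upstream logic if any single file exceeds blocksize:

    import dask_cudf

    # Two line-delimited JSON files per output partition (static mapping).
    ddf = dask_cudf.read_json("data/records-*.jsonl", lines=True, aggregate_files=2)

    # Pack whole files into ~256 MiB partitions (dynamic mapping by file size).
    ddf = dask_cudf.read_json(
        "data/records-*.jsonl",
        lines=True,
        blocksize="256 MiB",
        aggregate_files=True,
    )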
+ +from io import BufferedWriter, IOBase + +from fsspec.core import get_fs_token_paths +from fsspec.utils import stringify_path +from pyarrow import orc + +from dask import dataframe as dd +from dask.dataframe.io.utils import _get_pyarrow_dtypes + +import cudf + + +def _read_orc_stripe(source, fs, columns=None, kwargs=None): + """Pull out specific columns from a specific stripe""" + path, stripe = source + if kwargs is None: + kwargs = {} + with fs.open(path, "rb") as f: + df_stripe = cudf.read_orc( + f, stripes=[stripe], columns=columns, **kwargs + ) + return df_stripe + + +def read_orc(path, columns=None, filters=None, storage_options=None, **kwargs): + """Read ORC files into a :class:`.DataFrame`. + + Note that this function is mostly borrowed from upstream Dask. + + Parameters + ---------- + path : str or list[str] + Location of file(s), which can be a full URL with protocol specifier, + and may include glob character if a single string. + columns : None or list[str] + Columns to load. If None, loads all. + filters : None or list of tuple or list of lists of tuples + If not None, specifies a filter predicate used to filter out + stripes using statistics stored for each stripe in the + ORC metadata. Stripes that do not match the given + filter predicate are not read. The predicate is expressed in + `disjunctive normal form (DNF) + <https://en.wikipedia.org/wiki/Disjunctive_normal_form>`__ + like ``[[('x', '=', 0), ...], ...]``. DNF allows arbitrary + boolean logical combinations of single column predicates. The + innermost tuples each describe a single column predicate. The + list of inner predicates is interpreted as a conjunction + (AND), forming a more selective, multiple-column predicate. + Finally, the outermost list combines these filters as a + disjunction (OR). Predicates may also be passed as a list of + tuples. This form is interpreted as a single conjunction. To + express OR in predicates, one must use the (preferred) + notation of list of lists of tuples. + storage_options : None or dict + Further parameters to pass to the bytes backend.
+ + See Also + -------- + dask.dataframe.read_orc + + Returns + ------- + dask_cudf.DataFrame + + """ + + storage_options = storage_options or {} + fs, _, paths = get_fs_token_paths( + path, mode="rb", storage_options=storage_options + ) + schema = None + nstripes_per_file = [] + for path in paths: + with fs.open(path, "rb") as f: + o = orc.ORCFile(f) + if schema is None: + schema = o.schema + elif schema != o.schema: + raise ValueError( + "Incompatible schemas while parsing ORC files" + ) + nstripes_per_file.append(o.nstripes) + schema = _get_pyarrow_dtypes(schema, categories=None) + if columns is not None: + ex = set(columns) - set(schema) + if ex: + raise ValueError( + f"Requested columns ({ex}) not in schema ({set(schema)})" + ) + else: + columns = list(schema) + + with fs.open(paths[0], "rb") as f: + meta = cudf.read_orc( + f, + stripes=[0] if nstripes_per_file[0] else None, + columns=columns, + **kwargs, + ) + + sources = [] + for path, n in zip(paths, nstripes_per_file): + for stripe in ( + range(n) + if filters is None + else cudf.io.orc._filter_stripes(filters, path) + ): + sources.append((path, stripe)) + + return dd.from_map( + _read_orc_stripe, + sources, + args=[fs], + columns=columns, + kwargs=kwargs, + meta=meta, + ) + + +def write_orc_partition(df, path, fs, filename, compression="snappy"): + full_path = fs.sep.join([path, filename]) + with fs.open(full_path, mode="wb") as out_file: + if not isinstance(out_file, IOBase): + out_file = BufferedWriter(out_file) + cudf.io.to_orc(df, out_file, compression=compression) + return full_path + + +def to_orc( + df, + path, + write_index=True, + storage_options=None, + compression="snappy", + compute=True, + **kwargs, +): + """ + Write a :class:`.DataFrame` to ORC file(s) (one file per partition). + + Parameters + ---------- + df : DataFrame + path : str or pathlib.Path + Destination directory for data. Prepend with protocol like ``s3://`` + or ``hdfs://`` for remote data. + write_index : boolean, optional + Whether or not to write the index. Defaults to True. + storage_options : None or dict + Further parameters to pass to the bytes backend. + compression : string or dict, optional + compute : bool, optional + If True (default) then the result is computed immediately. If + False then a :class:`~dask.delayed.Delayed` object is returned + for future computation. 
+ + """ + + from dask import compute as dask_compute, delayed + + # TODO: Use upstream dask implementation once available + # (see: Dask Issue#5596) + + if hasattr(path, "name"): + path = stringify_path(path) + fs, _, _ = get_fs_token_paths( + path, mode="wb", storage_options=storage_options + ) + # Trim any protocol information from the path before forwarding + path = fs._strip_protocol(path) + + if write_index: + df = df.reset_index() + else: + # Not writing index - might as well drop it + df = df.reset_index(drop=True) + + fs.mkdirs(path, exist_ok=True) + + # Use df.npartitions to define the file-name list + filenames = ["part.%i.orc" % i for i in range(df.npartitions)] + + # write parts + dwrite = delayed(write_orc_partition) + parts = [ + dwrite(d, path, fs, filename, compression=compression) + for d, filename in zip(df.to_delayed(), filenames) + ] + + if compute: + return dask_compute(*parts) + + return delayed(list)(parts) diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index ba6209c4820..a953dce787d 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -37,10 +37,9 @@ def TaskList(*x): import cudf -from dask_cudf import QUERY_PLANNING_ON, _deprecated_api - # Dask-expr imports CudfEngine from this module from dask_cudf._legacy.io.parquet import CudfEngine +from dask_cudf.core import _deprecated_api if TYPE_CHECKING: from collections.abc import MutableMapping @@ -832,15 +831,8 @@ def read_parquet_expr( ) -if QUERY_PLANNING_ON: - read_parquet = read_parquet_expr - read_parquet.__doc__ = read_parquet_expr.__doc__ -else: - read_parquet = _deprecated_api( - "The legacy dask_cudf.io.parquet.read_parquet API", - new_api="dask_cudf.read_parquet", - rec="", - ) +read_parquet = read_parquet_expr +read_parquet.__doc__ = read_parquet_expr.__doc__ to_parquet = _deprecated_api( "dask_cudf.io.parquet.to_parquet", new_api="dask_cudf._legacy.io.parquet.to_parquet", diff --git a/python/dask_cudf/dask_cudf/io/tests/test_json.py b/python/dask_cudf/dask_cudf/io/tests/test_json.py index f5509cf91c3..48eca13e16f 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_json.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_json.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. import math import os @@ -11,10 +11,6 @@ from dask.utils import tmpfile import dask_cudf -from dask_cudf.tests.utils import skip_dask_expr - -# No dask-expr support for dask<2024.4.0 -pytestmark = skip_dask_expr(lt_version="2024.4.0") def test_read_json_backend_dispatch(tmp_path): @@ -137,7 +133,3 @@ def test_deprecated_api_paths(tmp_path): with pytest.warns(match="dask_cudf.io.read_json is now deprecated"): df2 = dask_cudf.io.read_json(path) dd.assert_eq(df, df2, check_divisions=False) - - with pytest.warns(match="dask_cudf.io.json.read_json is now deprecated"): - df2 = dask_cudf.io.json.read_json(path) - dd.assert_eq(df, df2, check_divisions=False) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_orc.py b/python/dask_cudf/dask_cudf/io/tests/test_orc.py index b6064d851ca..4aac463420b 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_orc.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_orc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION.
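A round-trip sketch for the ORC reader and writer above (output directory hypothetical). to_orc writes one part.N.orc file per partition, while read_orc maps one task per ORC stripe and supports column projection and stripe-level filters:

    import cudf
    import numpy as np

    import dask_cudf

    ddf = dask_cudf.from_cudf(
        cudf.DataFrame({"a": np.arange(1000), "b": np.arange(1000) % 7}),
        npartitions=4,
    )
    ddf.to_orc("orc_data", write_index=False)  # orc_data/part.0.orc ... part.3.orc

    back = dask_cudf.read_orc("orc_data/*.orc", columns=["a"])
    assert back.npartitions >= 4  # at least one stripe (task) per file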
import glob import os @@ -12,10 +12,6 @@ import cudf import dask_cudf -from dask_cudf.tests.utils import skip_dask_expr - -# No dask-expr support for dask<2024.4.0 -pytestmark = skip_dask_expr(lt_version="2024.4.0") cur_dir = os.path.dirname(__file__) sample_orc = os.path.join(cur_dir, "data/orc/sample.orc") @@ -159,7 +155,3 @@ def test_deprecated_api_paths(tmpdir): with pytest.warns(match="dask_cudf.io.read_orc is now deprecated"): df2 = dask_cudf.io.read_orc(paths) dd.assert_eq(df, df2, check_divisions=False) - - with pytest.warns(match="dask_cudf.io.orc.read_orc is now deprecated"): - df2 = dask_cudf.io.orc.read_orc(paths) - dd.assert_eq(df, df2, check_divisions=False) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index 6efe6c4f388..9f7031f4d2a 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. import glob import math @@ -16,11 +16,6 @@ import dask_cudf from dask_cudf._legacy.io.parquet import create_metadata_file -from dask_cudf.tests.utils import ( - require_dask_expr, - skip_dask_expr, - xfail_dask_expr, -) # Check if create_metadata_file is supported by # the current dask.dataframe version @@ -450,7 +445,6 @@ def test_create_metadata_file(tmpdir, partition_on): dd.assert_eq(ddf1, ddf2) -@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0") @need_create_meta def test_create_metadata_file_inconsistent_schema(tmpdir): # NOTE: This test demonstrates that the CudfEngine @@ -531,19 +525,6 @@ def test_cudf_list_struct_write(tmpdir): dd.assert_eq(df, new_ddf) -@skip_dask_expr("Not necessary in dask-expr") -def test_check_file_size(tmpdir): - # Test simple file-size check to help warn users - # of upstream change to `split_row_groups` default - fn = str(tmpdir.join("test.parquet")) - cudf.DataFrame({"a": np.arange(1000)}).to_parquet(fn) - with pytest.warns(match="large parquet file"): - # Need to use `dask_cudf._legacy.io` path - # TODO: Remove outdated `check_file_size` functionality - dask_cudf._legacy.io.read_parquet(fn, check_file_size=1).compute() - - -@xfail_dask_expr("HivePartitioning cannot be hashed", lt_version="2024.3.0") def test_null_partition(tmpdir): import pyarrow as pa from pyarrow.dataset import HivePartitioning @@ -626,7 +607,6 @@ def test_timezone_column(tmpdir): dd.assert_eq(got, expect) -@require_dask_expr() @pytest.mark.skipif( not dask_cudf.backends.PYARROW_GE_15, reason="Requires pyarrow 15", @@ -677,17 +657,8 @@ def test_deprecated_api_paths(tmpdir): with pytest.warns(match="dask_cudf.io.to_parquet is now deprecated"): dask_cudf.io.to_parquet(df, tmpdir) - if dask_cudf.QUERY_PLANNING_ON: - df2 = dask_cudf.io.read_parquet(tmpdir) - dd.assert_eq(df, df2, check_divisions=False) - - df2 = dask_cudf.io.parquet.read_parquet(tmpdir) - dd.assert_eq(df, df2, check_divisions=False) - else: - with pytest.warns(match="legacy dask_cudf.io.read_parquet"): - df2 = dask_cudf.io.read_parquet(tmpdir) - dd.assert_eq(df, df2, check_divisions=False) + df2 = dask_cudf.io.read_parquet(tmpdir) + dd.assert_eq(df, df2, check_divisions=False) - with pytest.warns(match="legacy dask_cudf.io.parquet.read_parquet"): - df2 = dask_cudf.io.parquet.read_parquet(tmpdir) - dd.assert_eq(df, df2, check_divisions=False) + df2 = dask_cudf.io.parquet.read_parquet(tmpdir) + dd.assert_eq(df, df2, check_divisions=False) diff --git 
a/python/dask_cudf/dask_cudf/io/tests/test_s3.py b/python/dask_cudf/dask_cudf/io/tests/test_s3.py index 90907f6fb99..7c53b89a883 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_s3.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_s3.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. import os import socket @@ -14,7 +14,6 @@ import cudf import dask_cudf -from dask_cudf.tests.utils import QUERY_PLANNING_ON moto = pytest.importorskip("moto", minversion="3.1.6") boto3 = pytest.importorskip("boto3") @@ -136,7 +135,7 @@ def test_read_parquet_open_file_options_raises(): pytest.param( "arrow", marks=pytest.mark.skipif( - not QUERY_PLANNING_ON or not dask_cudf.backends.PYARROW_GE_15, + not dask_cudf.backends.PYARROW_GE_15, reason="Not supported", ), ), diff --git a/python/dask_cudf/dask_cudf/io/tests/test_text.py b/python/dask_cudf/dask_cudf/io/tests/test_text.py index e35b6411a9d..f4d59334e03 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_text.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_text.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. import os @@ -9,10 +9,6 @@ import cudf import dask_cudf -from dask_cudf.tests.utils import skip_dask_expr - -# No dask-expr support for dask<2024.4.0 -pytestmark = skip_dask_expr(lt_version="2024.4.0") cur_dir = os.path.dirname(__file__) text_file = os.path.join(cur_dir, "data/text/sample.pgn") @@ -42,7 +38,3 @@ def test_deprecated_api_paths(): with pytest.warns(match="dask_cudf.io.read_text is now deprecated"): df2 = dask_cudf.io.read_text(text_file, delimiter=".") dd.assert_eq(df, df2, check_divisions=False) - - with pytest.warns(match="dask_cudf.io.text.read_text is now deprecated"): - df2 = dask_cudf.io.text.read_text(text_file, delimiter=".") - dd.assert_eq(df, df2, check_divisions=False) diff --git a/python/dask_cudf/dask_cudf/io/text.py b/python/dask_cudf/dask_cudf/io/text.py index 1caf4e81d8e..eb1d007cc16 100644 --- a/python/dask_cudf/dask_cudf/io/text.py +++ b/python/dask_cudf/dask_cudf/io/text.py @@ -1,8 +1,56 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. -from dask_cudf import _deprecated_api +import os +from glob import glob -read_text = _deprecated_api( - "dask_cudf.io.text.read_text", - new_api="dask_cudf.read_text", -) +import dask.dataframe as dd +from dask.utils import parse_bytes + +import cudf + + +def _read_text(source, **kwargs): + # Wrapper for cudf.read_text operation + fn, byte_range = source + return cudf.read_text(fn, byte_range=byte_range, **kwargs) + + +def read_text(path, chunksize="256 MiB", byte_range=None, **kwargs): + if isinstance(chunksize, str): + chunksize = parse_bytes(chunksize) + + if isinstance(path, list): + filenames = path + elif isinstance(path, str): + filenames = sorted(glob(path)) + elif hasattr(path, "__fspath__"): + filenames = sorted(glob(path.__fspath__())) + else: + raise TypeError(f"Path type not understood: {type(path)}") + + if not filenames: + msg = f"No files found matching path: {path}"
+ raise FileNotFoundError(msg) + + if chunksize and byte_range: + raise ValueError("Cannot specify both chunksize and byte_range.") + + if chunksize: + sources = [] + for fn in filenames: + size = os.path.getsize(fn) + for start in range(0, size, chunksize): + byte_range = ( + start, + chunksize, + ) # specify which chunk of the file we care about + sources.append((fn, byte_range)) + else: + sources = [(fn, byte_range) for fn in filenames] + + return dd.from_map( + _read_text, + sources, + meta=cudf.Series([], dtype="O"), + **kwargs, + ) diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index 3fbb2aacd2c..c6b01a648eb 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. import numpy as np import pandas as pd @@ -13,7 +13,6 @@ from cudf.testing._utils import does_not_raise import dask_cudf -from dask_cudf.tests.utils import xfail_dask_expr ############################################################################# # Datetime Accessor # @@ -112,7 +111,6 @@ def test_categorical_accessor_initialization2(data): dsr.cat -@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0") @pytest.mark.parametrize("data", [data_cat_1()]) def test_categorical_basic(data): cat = data.copy() diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index 7101fb7e00a..31957a106ff 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. 
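A usage sketch for the read_text implementation above (paths hypothetical). Each file is split into chunksize-sized byte ranges, one partition per range, and extra keyword arguments such as delimiter are forwarded to cudf.read_text:

    import dask_cudf

    # One partition per 256 MiB chunk of each matching file.
    ser = dask_cudf.read_text("games/*.pgn", chunksize="256 MiB", delimiter=".")

    # Or read one explicit (offset, size) byte range of a single file;
    # chunksize must be disabled before byte_range can be used.
    head = dask_cudf.read_text(
        "games/sample.pgn", chunksize=None, byte_range=(0, 2**20), delimiter="."
    )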
import random @@ -9,18 +9,12 @@ import dask from dask import dataframe as dd -from dask.dataframe.core import make_meta as dask_make_meta, meta_nonempty +from dask.dataframe.dispatch import make_meta as dask_make_meta, meta_nonempty from dask.utils import M import cudf import dask_cudf -from dask_cudf.tests.utils import ( - QUERY_PLANNING_ON, - require_dask_expr, - skip_dask_expr, - xfail_dask_expr, -) rng = np.random.default_rng(seed=0) @@ -299,37 +293,6 @@ def test_set_index_sorted(): gddf1.set_index("val", sorted=True) -@pytest.mark.parametrize("nelem", [10, 200, 1333]) -@pytest.mark.parametrize("index", [None, "myindex"]) -def test_rearrange_by_divisions(nelem, index): - with dask.config.set(scheduler="single-threaded"): - rng = np.random.default_rng(seed=0) - df = pd.DataFrame( - { - "x": rng.integers(0, 20, size=nelem), - "y": rng.normal(size=nelem), - "z": rng.choice(["dog", "cat", "bird"], nelem), - } - ) - df["z"] = df["z"].astype("category") - - ddf1 = dd.from_pandas(df, npartitions=4) - gdf1 = dask_cudf.from_cudf( - cudf.DataFrame.from_pandas(df), npartitions=4 - ) - ddf1.index.name = index - gdf1.index.name = index - divisions = (0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20) - - expect = dd.shuffle.rearrange_by_divisions( - ddf1, "x", divisions=divisions, shuffle_method="tasks" - ) - result = dd.shuffle.rearrange_by_divisions( - gdf1, "x", divisions=divisions, shuffle_method="tasks" - ) - dd.assert_eq(expect, result) - - def test_assign(): rng = np.random.default_rng(seed=0) df = pd.DataFrame( @@ -393,44 +356,6 @@ def test_setitem_scalar_datetime(): np.testing.assert_array_equal(got["z"], df["z"]) -@skip_dask_expr("Not relevant for dask-expr") -@pytest.mark.parametrize( - "func", - [ - lambda: pd.DataFrame( - {"A": rng.random(10), "B": rng.random(10)}, - index=list("abcdefghij"), - ), - lambda: pd.DataFrame( - { - "A": rng.random(10), - "B": list("a" * 10), - "C": pd.Series( - [str(20090101 + i) for i in range(10)], - dtype="datetime64[ns]", - ), - }, - index=list("abcdefghij"), - ), - lambda: pd.Series(list("abcdefghijklmnop")), - lambda: pd.Series( - rng.random(10), - index=pd.Index( - [str(20090101 + i) for i in range(10)], dtype="datetime64[ns]" - ), - ), - ], -) -def test_repr(func): - pdf = func() - gdf = cudf.from_pandas(pdf) - gddf = dd.from_pandas(gdf, npartitions=3, sort=False) - - assert repr(gddf) - if hasattr(pdf, "_repr_html_"): - assert gddf._repr_html_() - - @pytest.mark.skip(reason="datetime indexes not fully supported in cudf") @pytest.mark.parametrize("start", ["1d", "5d", "1w", "12h"]) @pytest.mark.parametrize("stop", ["1d", "3d", "8h"]) @@ -657,20 +582,20 @@ def test_hash_object_dispatch(index): ) # DataFrame - result = dd.core.hash_object_dispatch(obj, index=index) + result = dd.dispatch.hash_object_dispatch(obj, index=index) expected = dask_cudf.backends.hash_object_cudf(obj, index=index) assert isinstance(result, cudf.Series) dd.assert_eq(result, expected) # Series - result = dd.core.hash_object_dispatch(obj["x"], index=index) + result = dd.dispatch.hash_object_dispatch(obj["x"], index=index) expected = dask_cudf.backends.hash_object_cudf(obj["x"], index=index) assert isinstance(result, cudf.Series) dd.assert_eq(result, expected) # DataFrame with MultiIndex obj_multi = obj.set_index(["x", "z"], drop=True) - result = dd.core.hash_object_dispatch(obj_multi, index=index) + result = dd.dispatch.hash_object_dispatch(obj_multi, index=index) expected = dask_cudf.backends.hash_object_cudf(obj_multi, index=index) assert isinstance(result, cudf.Series) 
dd.assert_eq(result, expected) @@ -784,7 +709,6 @@ def test_dataframe_set_index(): assert_eq(ddf.compute(), pddf.compute()) -@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0") def test_series_describe(): random.seed(0) sr = cudf.datasets.randomdata(20)["x"] @@ -800,7 +724,6 @@ def test_series_describe(): ) -@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0") def test_dataframe_describe(): random.seed(0) df = cudf.datasets.randomdata(20) @@ -814,7 +737,6 @@ def test_dataframe_describe(): ) -@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0") def test_zero_std_describe(): num = 84886781 df = cudf.DataFrame( @@ -864,7 +786,7 @@ def test_merging_categorical_columns(): ddf_1 = dask_cudf.from_cudf(df_1, npartitions=2) - ddf_1 = dd.categorical.categorize(ddf_1, columns=["cat_col"]) + ddf_1 = ddf_1.categorize(columns=["cat_col"]) df_2 = cudf.DataFrame( {"id_2": [111, 112, 113], "cat_col": ["g", "h", "f"]} @@ -872,7 +794,7 @@ def test_merging_categorical_columns(): ddf_2 = dask_cudf.from_cudf(df_2, npartitions=2) - ddf_2 = dd.categorical.categorize(ddf_2, columns=["cat_col"]) + ddf_2 = ddf_2.categorize(columns=["cat_col"]) expected = cudf.DataFrame( { @@ -932,14 +854,9 @@ def func(x): result = ds.map_partitions(func, meta=s.values) - if QUERY_PLANNING_ON: - # Check Array and round-tripped DataFrame - dask.array.assert_eq(result, func(s)) - dd.assert_eq(result.to_dask_dataframe(), s, check_index=False) - else: - # Legacy version still carries numpy metadata - # See: https://github.com/dask/dask/issues/11017 - dask.array.assert_eq(result.compute(), func(s)) + # Check Array and round-tripped DataFrame + dask.array.assert_eq(result, func(s)) + dd.assert_eq(result.to_dask_dataframe(), s, check_index=False) def test_implicit_array_conversion_cupy_sparse(): @@ -981,7 +898,6 @@ def test_series_isin_error(): ddf.isin([1, 5, "a"]).compute() -@require_dask_expr() def test_to_backend_simplify(): # Check that column projection is not blocked by to_backend with dask.config.set({"dataframe.backend": "pandas"}): diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index 9bd3b506db0..11ca0c6a783 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. 
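The test updates above track two upstream Dask moves: hash_object_dispatch now lives in dask.dataframe.dispatch rather than dask.dataframe.core, and categorization is invoked as a collection method instead of dd.categorical.categorize. A minimal sketch of the new-style calls (data illustrative):

    import cudf
    import dask.dataframe as dd

    import dask_cudf  # noqa: F401 (registers the cudf dispatch implementations)

    gdf = cudf.DataFrame({"x": [1, 2, 3], "y": ["a", "b", "b"]})

    # Dispatches to dask_cudf.backends.hash_object_cudf for cudf objects.
    hashes = dd.dispatch.hash_object_dispatch(gdf, index=False)

    # Categorize via the collection method rather than dd.categorical.categorize.
    ddf = dask_cudf.from_cudf(gdf, npartitions=2).categorize(columns=["y"])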
import numpy as np import pandas as pd @@ -13,12 +13,7 @@ from cudf.testing._utils import expect_warning_if import dask_cudf -from dask_cudf._legacy.groupby import OPTIMIZED_AGGS, _aggs_optimized -from dask_cudf.tests.utils import ( - QUERY_PLANNING_ON, - require_dask_expr, - xfail_dask_expr, -) +from dask_cudf._expr.groupby import OPTIMIZED_AGGS, _aggs_optimized def assert_cudf_groupby_layers(ddf): @@ -78,18 +73,12 @@ def test_groupby_basic(series, aggregation, pdf): expect = getattr(gdf_grouped, aggregation)() actual = getattr(ddf_grouped, aggregation)() - if not QUERY_PLANNING_ON: - assert_cudf_groupby_layers(actual) - dd.assert_eq(expect, actual, check_dtype=check_dtype) if not series: expect = gdf_grouped.agg({"x": aggregation}) actual = ddf_grouped.agg({"x": aggregation}) - if not QUERY_PLANNING_ON: - assert_cudf_groupby_layers(actual) - dd.assert_eq(expect, actual, check_dtype=check_dtype) @@ -134,13 +123,6 @@ def test_groupby_agg(func, aggregation, pdf): check_dtype = aggregation != "count" - if not QUERY_PLANNING_ON: - assert_cudf_groupby_layers(actual) - - # groupby.agg should add an explicit getitem layer - # to improve/enable column projection - assert hlg_layer(actual.dask, "getitem") - dd.assert_eq(expect, actual, check_names=False, check_dtype=check_dtype) @@ -556,20 +538,13 @@ def test_groupby_categorical_key(): True, pytest.param( False, - marks=xfail_dask_expr("as_index not supported in dask-expr"), - ), - ], -) -@pytest.mark.parametrize( - "fused", - [ - True, - pytest.param( - False, - marks=require_dask_expr("Not supported by legacy API"), + marks=pytest.mark.xfail( + reason="as_index not supported in dask-expr" + ), ), ], ) +@pytest.mark.parametrize("fused", [True, False]) @pytest.mark.parametrize("split_out", ["use_dask_default", 1, 2]) @pytest.mark.parametrize("split_every", [False, 4]) @pytest.mark.parametrize("npartitions", [1, 10]) @@ -590,19 +565,16 @@ def test_groupby_agg_params( "c": ["mean", "std", "var"], } - fused_kwarg = {"fused": fused} if QUERY_PLANNING_ON else {} + fused_kwarg = {"fused": fused} split_kwargs = {"split_every": split_every, "split_out": split_out} if split_out == "use_dask_default": split_kwargs.pop("split_out") # Avoid using as_index when query-planning is enabled - if QUERY_PLANNING_ON: - with pytest.warns(FutureWarning, match="argument is now deprecated"): - # Should warn when `as_index` is used - ddf.groupby(["name", "a"], sort=False, as_index=as_index) - maybe_as_index = {"as_index": as_index} if as_index is False else {} - else: - maybe_as_index = {"as_index": as_index} + with pytest.warns(FutureWarning, match="argument is now deprecated"): + # Should warn when `as_index` is used + ddf.groupby(["name", "a"], sort=False, as_index=as_index) + maybe_as_index = {"as_index": as_index} if as_index is False else {} # Check `sort=True` behavior if split_out == 1: @@ -671,7 +643,6 @@ def test_groupby_agg_params( dd.assert_eq(gf, pf) -@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0") @pytest.mark.parametrize( "aggregations", [(sum, "sum"), (max, "max"), (min, "min")] ) @@ -711,7 +682,6 @@ def test_is_supported(arg, supported): assert _aggs_optimized(arg, OPTIMIZED_AGGS) is supported -@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0") def test_groupby_unique_lists(): df = pd.DataFrame({"a": [0, 0, 0, 1, 1, 1], "b": [10, 10, 10, 7, 8, 9]}) gdf = cudf.from_pandas(df) @@ -758,7 +728,7 @@ def test_groupby_first_last(data, agg): ) -@xfail_dask_expr("Co-alignment check fails in dask-expr") 
+@pytest.mark.xfail(reason="Co-alignment check fails in dask-expr") def test_groupby_with_list_of_series(): df = cudf.DataFrame({"a": [1, 2, 3, 4, 5]}) gdf = dask_cudf.from_cudf(df, npartitions=2) @@ -773,7 +743,6 @@ def test_groupby_with_list_of_series(): ) -@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0") @pytest.mark.parametrize( "func", [ @@ -833,7 +802,7 @@ def test_groupby_all_columns(func): expect = func(ddf) actual = func(gddf) - dd.assert_eq(expect, actual, check_names=not QUERY_PLANNING_ON) + dd.assert_eq(expect, actual, check_names=False) def test_groupby_shuffle(): @@ -870,15 +839,3 @@ def test_groupby_shuffle(): # NOTE: `shuffle_method=True` should be default got = gddf.groupby("a", sort=False).agg(spec, split_out=2) dd.assert_eq(expect, got.compute().sort_index()) - - if not QUERY_PLANNING_ON: - # Sorted aggregation fails with split_out>1 when shuffle is False - # (sort=True, split_out=2, shuffle_method=False) - with pytest.raises(ValueError): - gddf.groupby("a", sort=True).agg( - spec, shuffle_method=False, split_out=2 - ) - - # Check shuffle kwarg deprecation - with pytest.warns(match="'shuffle' keyword is deprecated"): - gddf.groupby("a", sort=True).agg(spec, shuffle=False) diff --git a/python/dask_cudf/dask_cudf/tests/test_onehot.py b/python/dask_cudf/dask_cudf/tests/test_onehot.py index 0b7c7855e07..2d05345bc4a 100644 --- a/python/dask_cudf/dask_cudf/tests/test_onehot.py +++ b/python/dask_cudf/dask_cudf/tests/test_onehot.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. import pandas as pd import pytest @@ -8,12 +8,6 @@ import cudf import dask_cudf -from dask_cudf.tests.utils import xfail_dask_expr - -# No dask-expr support -pytestmark = xfail_dask_expr( - "Newer dask version needed", lt_version="2024.5.0" -) def test_get_dummies_cat(): diff --git a/python/dask_cudf/dask_cudf/tests/test_sort.py b/python/dask_cudf/dask_cudf/tests/test_sort.py index 02c815427f3..68d6e72660e 100644 --- a/python/dask_cudf/dask_cudf/tests/test_sort.py +++ b/python/dask_cudf/dask_cudf/tests/test_sort.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. import cupy as cp import numpy as np @@ -10,7 +10,6 @@ import cudf import dask_cudf -from dask_cudf.tests.utils import xfail_dask_expr @pytest.mark.parametrize("ascending", [True, False]) @@ -67,7 +66,6 @@ def test_sort_repartition(): dd.assert_eq(len(new_ddf), len(ddf)) -@xfail_dask_expr("missing null support", lt_version="2024.5.1") @pytest.mark.parametrize("na_position", ["first", "last"]) @pytest.mark.parametrize("ascending", [True, False]) @pytest.mark.parametrize("by", ["a", "b", ["a", "b"]]) diff --git a/python/dask_cudf/dask_cudf/tests/utils.py b/python/dask_cudf/dask_cudf/tests/utils.py index b44b3f939e7..ef6765f39d1 100644 --- a/python/dask_cudf/dask_cudf/tests/utils.py +++ b/python/dask_cudf/dask_cudf/tests/utils.py @@ -1,22 +1,12 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. 
import numpy as np import pandas as pd -import pytest -from packaging.version import Version -import dask import dask.dataframe as dd import cudf -from dask_cudf import QUERY_PLANNING_ON - -if QUERY_PLANNING_ON: - DASK_VERSION = Version(dask.__version__) -else: - DASK_VERSION = None - def _make_random_frame(nelem, npartitions=2, include_na=False): rng = np.random.default_rng(seed=0) @@ -30,26 +20,3 @@ def _make_random_frame(nelem, npartitions=2, include_na=False): gdf = cudf.DataFrame.from_pandas(df) dgf = dd.from_pandas(gdf, npartitions=npartitions) return df, dgf - - -_default_reason = "Not compatible with dask-expr" - - -def skip_dask_expr(reason=_default_reason, lt_version=None): - if lt_version is not None: - skip = QUERY_PLANNING_ON and DASK_VERSION < Version(lt_version) - else: - skip = QUERY_PLANNING_ON - return pytest.mark.skipif(skip, reason=reason) - - -def xfail_dask_expr(reason=_default_reason, lt_version=None): - if lt_version is not None: - xfail = QUERY_PLANNING_ON and DASK_VERSION < Version(lt_version) - else: - xfail = QUERY_PLANNING_ON - return pytest.mark.xfail(xfail, reason=reason) - - -def require_dask_expr(reason="requires dask-expr"): - return pytest.mark.skipif(not QUERY_PLANNING_ON, reason=reason) diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index a8cb696d7f6..5b8b98c2b55 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. [build-system] build-backend = "rapids_build_backend.build" @@ -39,15 +39,15 @@ classifiers = [ ] [project.entry-points."dask.dataframe.backends"] -cudf = "dask_cudf.backends:CudfBackendEntrypoint" +cudf = "dask_cudf.backends:LegacyCudfBackendEntrypoint" [project.entry-points."dask_expr.dataframe.backends"] -cudf = "dask_cudf.backends:CudfDXBackendEntrypoint" +cudf = "dask_cudf.backends:CudfBackendEntrypoint" [project.optional-dependencies] test = [ "dask-cuda==25.2.*,>=0.0.0a0", - "numba-cuda>=0.0.13,<0.0.18", + "numba-cuda>=0.2.0,<0.3.0", "pytest-cov", "pytest-xdist", "pytest<8", @@ -102,8 +102,5 @@ filterwarnings = [ # https://github.com/dask/partd/blob/main/partd/pandas.py#L198 "ignore:Passing a BlockManager to DataFrame is deprecated and will raise in a future version. Use public APIs instead.:DeprecationWarning", "ignore:String support for `aggregate_files` is experimental. Behavior may change in the future.:FutureWarning:dask", - # Dask now loudly throws warnings: https://github.com/dask/dask/pull/11437 - # When the legacy implementation is removed we can remove this warning and stop running pytests with `DASK_DATAFRAME__QUERY_PLANNING=False` - "ignore:The legacy Dask DataFrame implementation is deprecated and will be removed in a future version.*:FutureWarning", ] xfail_strict = true diff --git a/python/pylibcudf/pylibcudf/hashing.pxd b/python/pylibcudf/pylibcudf/hashing.pxd index 2d070ddda69..fbd478f963f 100644 --- a/python/pylibcudf/pylibcudf/hashing.pxd +++ b/python/pylibcudf/pylibcudf/hashing.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
from libc.stdint cimport uint32_t, uint64_t @@ -16,6 +16,10 @@ cpdef Table murmurhash3_x64_128( uint64_t seed=* ) +cpdef Column xxhash_32( + Table input, + uint32_t seed=* +) cpdef Column xxhash_64( Table input, diff --git a/python/pylibcudf/pylibcudf/hashing.pyi b/python/pylibcudf/pylibcudf/hashing.pyi index a849f5d0729..d535d842a18 100644 --- a/python/pylibcudf/pylibcudf/hashing.pyi +++ b/python/pylibcudf/pylibcudf/hashing.pyi @@ -9,6 +9,7 @@ LIBCUDF_DEFAULT_HASH_SEED: Final[int] def murmurhash3_x86_32(input: Table, seed: int = ...) -> Column: ... def murmurhash3_x64_128(input: Table, seed: int = ...) -> Table: ... +def xxhash_32(input: Table, seed: int = ...) -> Column: ... def xxhash_64(input: Table, seed: int = ...) -> Column: ... def md5(input: Table) -> Column: ... def sha1(input: Table) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/hashing.pyx b/python/pylibcudf/pylibcudf/hashing.pyx index 548cffc0ce8..1f093b20c6b 100644 --- a/python/pylibcudf/pylibcudf/hashing.pyx +++ b/python/pylibcudf/pylibcudf/hashing.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libc.stdint cimport uint32_t, uint64_t from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -13,6 +13,7 @@ from pylibcudf.libcudf.hash cimport ( sha256 as cpp_sha256, sha384 as cpp_sha384, sha512 as cpp_sha512, + xxhash_32 as cpp_xxhash_32, xxhash_64 as cpp_xxhash_64, ) from pylibcudf.libcudf.table.table cimport table @@ -30,6 +31,7 @@ __all__ = [ "sha256", "sha384", "sha512", + "xxhash_32", "xxhash_64", ] @@ -95,6 +97,37 @@ cpdef Table murmurhash3_x64_128( return Table.from_libcudf(move(c_result)) +cpdef Column xxhash_32( + Table input, + uint32_t seed=DEFAULT_HASH_SEED +): + """Computes the xxHash 32-bit hash value of each row in the given table. + + For details, see :cpp:func:`xxhash_32`. + + Parameters + ---------- + input : Table + The table of columns to hash + seed : uint32_t + Optional seed value to use for the hash function + + Returns + ------- + pylibcudf.Column + A column where each row is the hash of a row from the input + """ + + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_xxhash_32( + input.view(), + seed + ) + + return Column.from_libcudf(move(c_result)) + + cpdef Column xxhash_64( Table input, uint64_t seed=DEFAULT_HASH_SEED diff --git a/python/pylibcudf/pylibcudf/libcudf/hash.pxd b/python/pylibcudf/pylibcudf/libcudf/hash.pxd index 4e8a01b41a5..46fdf62cd6b 100644 --- a/python/pylibcudf/pylibcudf/libcudf/hash.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/hash.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from libc.stdint cimport uint32_t, uint64_t from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector @@ -44,6 +44,11 @@ cdef extern from "cudf/hashing.hpp" namespace "cudf::hashing" nogil: const table_view& input ) except +libcudf_exception_handler + cdef unique_ptr[column] xxhash_32( + const table_view& input, + const uint32_t seed + ) except +libcudf_exception_handler + cdef unique_ptr[column] xxhash_64( const table_view& input, const uint64_t seed diff --git a/python/pylibcudf/pylibcudf/tests/test_hashing.py b/python/pylibcudf/pylibcudf/tests/test_hashing.py index 83fb50fa4ef..7096dbe14ff 100644 --- a/python/pylibcudf/pylibcudf/tests/test_hashing.py +++ b/python/pylibcudf/pylibcudf/tests/test_hashing.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
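A quick usage sketch for the new xxhash_32 binding (data illustrative; assumes the pylibcudf.interop.from_arrow conversion helper). It hashes each row of a table into a uint32 column, here using libcudf's default seed:

    import pyarrow as pa

    import pylibcudf as plc

    tbl = plc.interop.from_arrow(pa.table({"s": ["foo", "bar", None]}))
    hashed = plc.hashing.xxhash_32(tbl, plc.hashing.LIBCUDF_DEFAULT_HASH_SEED)
    # 'hashed' is a pylibcudf.Column with one uint32 hash per input row.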
import hashlib import struct @@ -34,7 +34,9 @@ def hash_single_uint32(val, seed=0): def hash_combine_32(lhs, rhs): - return np.uint32(lhs ^ (rhs + 0x9E3779B9 + (lhs << 6) + (lhs >> 2))) + return np.uint32( + int((lhs ^ (rhs + 0x9E3779B9 + (lhs << 6) + (lhs >> 2)))) % 2**32 + ) def uint_hash_combine_32(lhs, rhs): @@ -80,22 +82,6 @@ def list_struct_table(): return data -def python_hash_value(x, method): - if method == "murmurhash3_x86_32": - return libcudf_mmh3_x86_32(x) - elif method == "murmurhash3_x64_128": - hasher = mmh3.mmh3_x64_128(seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED) - hasher.update(x) - # libcudf returns a tuple of two 64-bit integers - return hasher.utupledigest() - elif method == "xxhash_64": - return xxhash.xxh64( - x, seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED - ).intdigest() - else: - return getattr(hashlib, method)(x).hexdigest() - - @pytest.mark.parametrize( "method", ["sha1", "sha224", "sha256", "sha384", "sha512", "md5"] ) @@ -115,6 +101,23 @@ def py_hasher(val): assert_column_eq(got, expect) +def test_hash_column_xxhash32(pa_scalar_input_column, plc_scalar_input_tbl): + def py_hasher(val): + return xxhash.xxh32( + scalar_to_binary(val), seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED + ).intdigest() + + expect = pa.array( + [py_hasher(val) for val in pa_scalar_input_column.to_pylist()], + type=pa.uint32(), + ) + got = plc.hashing.xxhash_32( + plc_scalar_input_tbl, plc.hashing.LIBCUDF_DEFAULT_HASH_SEED + ) + + assert_column_eq(got, expect) + + def test_hash_column_xxhash64(pa_scalar_input_column, plc_scalar_input_tbl): def py_hasher(val): return xxhash.xxh64( @@ -125,7 +128,9 @@ def py_hasher(val): [py_hasher(val) for val in pa_scalar_input_column.to_pylist()], type=pa.uint64(), ) - got = plc.hashing.xxhash_64(plc_scalar_input_tbl, 0) + got = plc.hashing.xxhash_64( + plc_scalar_input_tbl, plc.hashing.LIBCUDF_DEFAULT_HASH_SEED + ) assert_column_eq(got, expect)
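Closing note on the hash_combine_32 tweak above: routing through a Python int and reducing modulo 2**32 keeps the boost-style combine overflow-safe, presumably because NumPy 2 raises OverflowError when an out-of-range Python integer is converted to np.uint32. A standalone sketch of the wrap-around:

    import numpy as np

    def hash_combine_32(lhs: int, rhs: int) -> np.uint32:
        # Boost-style hash combine, wrapped to 32 bits before the uint32 cast.
        combined = lhs ^ (rhs + 0x9E3779B9 + (lhs << 6) + (lhs >> 2))
        return np.uint32(int(combined) % 2**32)

    # The raw combine of two large 32-bit values exceeds 2**32 ...
    raw = 0xFFFFFFFF ^ (0xFFFFFFFF + 0x9E3779B9 + (0xFFFFFFFF << 6) + (0xFFFFFFFF >> 2))
    assert raw > 2**32
    # ... but the wrapped version still fits in a uint32.
    assert int(hash_combine_32(0xFFFFFFFF, 0xFFFFFFFF)) < 2**32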