diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 9943b02a521..0ea4d5c54dc 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -67,7 +67,27 @@ jobs:
       node_type: "gpu-v100-latest-1"
       run_script: "ci/build_docs.sh"
       sha: ${{ inputs.sha }}
+  wheel-build-libcudf:
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
+    with:
+      build_type: ${{ inputs.build_type || 'branch' }}
+      branch: ${{ inputs.branch }}
+      sha: ${{ inputs.sha }}
+      date: ${{ inputs.date }}
+      script: ci/build_wheel_libcudf.sh
+  wheel-publish-libcudf:
+    needs: wheel-build-libcudf
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10
+    with:
+      build_type: ${{ inputs.build_type || 'branch' }}
+      branch: ${{ inputs.branch }}
+      sha: ${{ inputs.sha }}
+      date: ${{ inputs.date }}
+      package-name: libcudf
   wheel-build-pylibcudf:
+    needs: [wheel-publish-libcudf]
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
     with:
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 74bdc666c68..2e2a8b6b9bc 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -23,6 +23,7 @@ jobs:
       - static-configure
       - conda-notebook-tests
       - docs-build
+      - wheel-build-libcudf
      - wheel-build-pylibcudf
      - wheel-build-cudf
      - wheel-tests-cudf
@@ -121,10 +122,18 @@ jobs:
       arch: "amd64"
       container_image: "rapidsai/ci-conda:latest"
       run_script: "ci/build_docs.sh"
-  wheel-build-pylibcudf:
+  wheel-build-libcudf:
     needs: checks
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
+    with:
+      matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber)))
+      build_type: pull-request
+      script: "ci/build_wheel_libcudf.sh"
+  wheel-build-pylibcudf:
+    needs: [checks, wheel-build-libcudf]
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
     with:
       build_type: pull-request
       script: "ci/build_wheel_pylibcudf.sh"
diff --git a/ci/build_wheel_cudf.sh b/ci/build_wheel_cudf.sh
index 7c0fb1efebe..cf33703f544 100755
--- a/ci/build_wheel_cudf.sh
+++ b/ci/build_wheel_cudf.sh
@@ -5,16 +5,28 @@ set -euo pipefail
 
 package_dir="python/cudf"
 
-export SKBUILD_CMAKE_ARGS="-DUSE_LIBARROW_FROM_PYARROW=ON"
-
-# Download the pylibcudf built in the previous step
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
-RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 /tmp/pylibcudf_dist
-echo "pylibcudf-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo /tmp/pylibcudf_dist/pylibcudf_*.whl)" > /tmp/constraints.txt
+# Download the libcudf and pylibcudf wheels built earlier in this CI run, so
+# that 'cudf' wheel builds always use the 'libcudf' and 'pylibcudf' built here.
+#
+# Exporting PIP_CONSTRAINT is necessary to ensure the constraints are also
+# applied when pip creates the isolated build environment.
+RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp /tmp/libcudf_dist
+RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python /tmp/pylibcudf_dist
+echo "libcudf-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo /tmp/libcudf_dist/libcudf_*.whl)" > /tmp/constraints.txt
+echo "pylibcudf-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo /tmp/pylibcudf_dist/pylibcudf_*.whl)" >> /tmp/constraints.txt
 export PIP_CONSTRAINT="/tmp/constraints.txt"
+
 ./ci/build_wheel.sh ${package_dir}
 
-python -m auditwheel repair -w ${package_dir}/final_dist ${package_dir}/dist/*
+python -m auditwheel repair \
+    --exclude libcudf.so \
+    --exclude libarrow.so.1601 \
+    --exclude libnvcomp.so \
+    --exclude libnvcomp_bitcomp.so \
+    --exclude libnvcomp_gdeflate.so \
+    -w ${package_dir}/final_dist \
+    ${package_dir}/dist/*
 
-RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_dir}/final_dist
+RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python ${package_dir}/final_dist
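Why the constraint goes through the `PIP_CONSTRAINT` environment variable rather than a `--constraint` flag: `./ci/build_wheel.sh` builds in an isolated environment, and build-system requirements are resolved while pip creates that environment, before any per-command option could take effect. A minimal sketch of the mechanism (the `cu12` suffix and paths are illustrative, not part of this change):

```bash
# Point a constraints file at a locally built wheel via a direct file:// URL.
echo "pylibcudf-cu12 @ file://$(echo /tmp/pylibcudf_dist/pylibcudf_*.whl)" > /tmp/constraints.txt
export PIP_CONSTRAINT=/tmp/constraints.txt

# pip also reads PIP_CONSTRAINT when it populates the isolated build
# environment, so a build requirement on 'pylibcudf-cu12' resolves to the
# local wheel instead of an index download.
pip wheel python/cudf --no-deps -w dist/
```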
+RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp /tmp/libcudf_dist +RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python /tmp/pylibcudf_dist +echo "libcudf-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo /tmp/libcudf_dist/libcudf_*.whl)" > /tmp/constraints.txt +echo "pylibcudf-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo /tmp/pylibcudf_dist/pylibcudf_*.whl)" >> /tmp/constraints.txt export PIP_CONSTRAINT="/tmp/constraints.txt" + ./ci/build_wheel.sh ${package_dir} -python -m auditwheel repair -w ${package_dir}/final_dist ${package_dir}/dist/* +python -m auditwheel repair \ + --exclude libcudf.so \ + --exclude libarrow.so.1601 \ + --exclude libnvcomp.so \ + --exclude libnvcomp_bitcomp.so \ + --exclude libnvcomp_gdeflate.so \ + -w ${package_dir}/final_dist \ + ${package_dir}/dist/* -RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_dir}/final_dist +RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python ${package_dir}/final_dist diff --git a/ci/build_wheel_libcudf.sh b/ci/build_wheel_libcudf.sh new file mode 100755 index 00000000000..9694c3f6144 --- /dev/null +++ b/ci/build_wheel_libcudf.sh @@ -0,0 +1,15 @@ +#!/bin/bash +# Copyright (c) 2023-2024, NVIDIA CORPORATION. + +set -euo pipefail + +package_dir="python/libcudf" + +./ci/build_wheel.sh ${package_dir} + +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" + +mkdir -p ${package_dir}/final_dist +python -m auditwheel repair --exclude libarrow.so.1601 -w ${package_dir}/final_dist ${package_dir}/dist/* + +RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp ${package_dir}/final_dist diff --git a/ci/build_wheel_pylibcudf.sh b/ci/build_wheel_pylibcudf.sh index b25d118ff81..7181a49d397 100755 --- a/ci/build_wheel_pylibcudf.sh +++ b/ci/build_wheel_pylibcudf.sh @@ -5,12 +5,26 @@ set -euo pipefail package_dir="python/pylibcudf" -export SKBUILD_CMAKE_ARGS="-DUSE_LIBARROW_FROM_PYARROW=ON" +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -./ci/build_wheel.sh ${package_dir} +# Downloads libcudf wheel from this current build, +# then ensures 'pylibcudf' wheel builds always use the 'libcudf' just built in the same CI run. +# +# Using env variable PIP_CONSTRAINT is necessary to ensure the constraints +# are used when creating the isolated build environment. 
+RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp /tmp/libcudf_dist +echo "libcudf-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo /tmp/libcudf_dist/libcudf_*.whl)" > /tmp/constraints.txt +export PIP_CONSTRAINT="/tmp/constraints.txt" -python -m auditwheel repair -w ${package_dir}/final_dist ${package_dir}/dist/* +./ci/build_wheel.sh ${package_dir} +python -m auditwheel repair \ + --exclude libcudf.so \ + --exclude libarrow.so.1601 \ + --exclude libnvcomp.so \ + --exclude libnvcomp_bitcomp.so \ + --exclude libnvcomp_gdeflate.so \ + -w ${package_dir}/final_dist \ + ${package_dir}/dist/* -RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_dir}/final_dist diff --git a/ci/cudf_pandas_scripts/fetch_pandas_versions.py b/ci/cudf_pandas_scripts/fetch_pandas_versions.py new file mode 100644 index 00000000000..b6913f947e8 --- /dev/null +++ b/ci/cudf_pandas_scripts/fetch_pandas_versions.py @@ -0,0 +1,24 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import requests +from packaging.version import Version +from packaging.specifiers import SpecifierSet +import argparse + +def get_pandas_versions(pandas_range): + url = "https://pypi.org/pypi/pandas/json" + response = requests.get(url) + data = response.json() + versions = [Version(v) for v in data['releases']] + specifier = SpecifierSet(pandas_range.lstrip("pandas")) + matching_versions = [v for v in versions if v in specifier] + matching_minors = sorted(set(".".join((str(v.major), str(v.minor))) for v in matching_versions), key=Version) + return matching_minors + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Filter pandas versions by prefix.") + parser.add_argument("pandas_range", type=str, help="The version prefix to filter by.") + args = parser.parse_args() + + versions = get_pandas_versions(args.pandas_range) + print(','.join(versions)) diff --git a/ci/cudf_pandas_scripts/pandas-tests/run.sh b/ci/cudf_pandas_scripts/pandas-tests/run.sh index f3a497d3bf0..8487f663311 100755 --- a/ci/cudf_pandas_scripts/pandas-tests/run.sh +++ b/ci/cudf_pandas_scripts/pandas-tests/run.sh @@ -12,9 +12,10 @@ rapids-logger "PR number: ${RAPIDS_REF_NAME:-"unknown"}" RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -# Download the cudf and pylibcudf built in the previous step -RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist -RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist +# Download the cudf, libcudf, and pylibcudf built in the previous step +RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist +RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./dist +RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist echo "" > ./constraints.txt if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then @@ -31,6 +32,7 @@ python -m pip install \ -v \ --constraint ./constraints.txt \ "$(echo ./dist/cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test,pandas-tests]" \ + "$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \ "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"} diff --git a/ci/cudf_pandas_scripts/run_tests.sh b/ci/cudf_pandas_scripts/run_tests.sh index 04dc1785e6e..ca4ab77925f 100755 --- 
diff --git a/ci/cudf_pandas_scripts/pandas-tests/run.sh b/ci/cudf_pandas_scripts/pandas-tests/run.sh
index f3a497d3bf0..8487f663311 100755
--- a/ci/cudf_pandas_scripts/pandas-tests/run.sh
+++ b/ci/cudf_pandas_scripts/pandas-tests/run.sh
@@ -12,9 +12,10 @@ rapids-logger "PR number: ${RAPIDS_REF_NAME:-"unknown"}"
 
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
 
-# Download the cudf and pylibcudf built in the previous step
-RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
-RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
+# Download the cudf, libcudf, and pylibcudf wheels built in the previous step
+RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist
+RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./dist
+RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist
 
 echo "" > ./constraints.txt
 if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then
@@ -31,6 +32,7 @@ python -m pip install \
    -v \
    --constraint ./constraints.txt \
    "$(echo ./dist/cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test,pandas-tests]" \
+    "$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \
    "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)"
 
 RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"}
diff --git a/ci/cudf_pandas_scripts/run_tests.sh b/ci/cudf_pandas_scripts/run_tests.sh
index 04dc1785e6e..ca4ab77925f 100755
--- a/ci/cudf_pandas_scripts/run_tests.sh
+++ b/ci/cudf_pandas_scripts/run_tests.sh
@@ -9,13 +9,20 @@ RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"}
 RAPIDS_COVERAGE_DIR=${RAPIDS_COVERAGE_DIR:-"${PWD}/coverage-results"}
 mkdir -p "${RAPIDS_TESTS_DIR}" "${RAPIDS_COVERAGE_DIR}"
 
+DEPENDENCIES_PATH="dependencies.yaml"
+package_name="pandas"
+
+# Extract the pandas version constraint from dependencies.yaml
+pandas_version_constraint=$(grep -oP "pandas>=\d+\.\d+,\<\d+\.\d+\.\d+dev\d+" $DEPENDENCIES_PATH)
+
 # Function to display script usage
 function display_usage {
-    echo "Usage: $0 [--no-cudf]"
+    echo "Usage: $0 [--no-cudf] [pandas-version]"
 }
 
 # Default value for the --no-cudf option
 no_cudf=false
+PANDAS_VERSION=""
 
 # Parse command-line arguments
 while [[ $# -gt 0 ]]; do
@@ -25,9 +32,14 @@ while [[ $# -gt 0 ]]; do
            shift
            ;;
        *)
-            echo "Error: Unknown option $1"
-            display_usage
-            exit 1
+            if [[ -z "$PANDAS_VERSION" ]]; then
+                PANDAS_VERSION=$1
+                shift
+            else
+                echo "Error: Unknown option $1"
+                display_usage
+                exit 1
+            fi
            ;;
    esac
 done
@@ -37,9 +49,10 @@ if [ "$no_cudf" = true ]; then
 else
    RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
 
-    # Download the cudf and pylibcudf built in the previous step
-    RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
-    RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
+    # Download the cudf, libcudf, and pylibcudf wheels built in the previous step
+    RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist
+    RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./dist
+    RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist
 
    echo "" > ./constraints.txt
    if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then
@@ -55,6 +68,7 @@ else
        -v \
        --constraint ./constraints.txt \
        "$(echo ./dist/cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test,cudf-pandas-tests]" \
+        "$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \
        "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)"
 fi
 
@@ -64,3 +78,19 @@ python -m pytest -p cudf.pandas \
    --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cudf-pandas-coverage.xml" \
    --cov-report=term \
    ./python/cudf/cudf_pandas_tests/
+
+output=$(python ci/cudf_pandas_scripts/fetch_pandas_versions.py $pandas_version_constraint)
+
+# Convert the comma-separated list into an array
+IFS=',' read -r -a versions <<< "$output"
+
+for version in "${versions[@]}"; do
+    echo "Installing pandas version: ${version}"
+    python -m pip install "pandas==${version}"
+    python -m pytest -p cudf.pandas \
+        --cov-config=./python/cudf/.coveragerc \
+        --cov=cudf \
+        --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cudf-pandas-coverage.xml" \
+        --cov-report=term \
+        ./python/cudf/cudf_pandas_tests/
+done
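The `ci/release/update-version.sh` hunk that follows is a regex-dialect fix: without `-E`, `sed` uses POSIX basic regular expressions, in which an unescaped `+` matches a literal plus sign, so the old pattern never matched strings like `branch-24.08`. A quick demonstration with GNU sed:

```bash
# BRE: unescaped '+' is a literal character, so nothing matches or changes.
echo 'branch-24.08' | sed 's/branch-[0-9]+\.[0-9]+/branch-24.10/g'    # branch-24.08
# BRE with \+ quantifiers, as in the fixed line: the substitution applies.
echo 'branch-24.08' | sed 's/branch-[0-9]\+\.[0-9]\+/branch-24.10/g'  # branch-24.10
```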
+sed_runner "s/branch-[0-9]\+\.[0-9]\+/branch-${NEXT_SHORT_TAG}/g" ci/test_wheel_cudf_polars.sh # Java files NEXT_FULL_JAVA_TAG="${NEXT_SHORT_TAG}.${PATCH_PEP440}-SNAPSHOT" diff --git a/ci/test_wheel_cudf.sh b/ci/test_wheel_cudf.sh index 340814e1f7e..c1003880d4b 100755 --- a/ci/test_wheel_cudf.sh +++ b/ci/test_wheel_cudf.sh @@ -5,9 +5,10 @@ set -eou pipefail RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -# Download the cudf and pylibcudf built in the previous step -RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist -RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist +# Download the cudf, libcudf, and pylibcudf built in the previous step +RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist +RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./dist +RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist rapids-logger "Install cudf, pylibcudf, and test requirements" @@ -26,6 +27,7 @@ python -m pip install \ -v \ --constraint ./constraints.txt \ "$(echo ./dist/cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \ + "$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \ "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"} diff --git a/ci/test_wheel_cudf_polars.sh b/ci/test_wheel_cudf_polars.sh index 841794e08d6..9844090258a 100755 --- a/ci/test_wheel_cudf_polars.sh +++ b/ci/test_wheel_cudf_polars.sh @@ -18,10 +18,11 @@ else fi RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist +RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 python ./dist -# Download pylibcudf built in the previous step -RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist +# Download libcudf and pylibcudf built in the previous step +RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./dist +RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist rapids-logger "Installing cudf_polars and its dependencies" # Constraint to minimum dependency versions if job is set up as "oldest" @@ -39,6 +40,7 @@ python -m pip install \ -v \ --constraint ./constraints.txt \ "$(echo ./dist/cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \ + "$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \ "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" rapids-logger "Run cudf_polars tests" diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh index d3b3fcb63bd..0d39807d56c 100755 --- a/ci/test_wheel_dask_cudf.sh +++ b/ci/test_wheel_dask_cudf.sh @@ -4,11 +4,12 @@ set -eou pipefail RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist +RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 python ./dist -# Download the cudf and pylibcudf built in the previous step -RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist 
-RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist +# Download the cudf, libcudf, and pylibcudf built in the previous step +RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist +RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./dist +RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist rapids-logger "Install dask_cudf, cudf, pylibcudf, and test requirements" # Constraint to minimum dependency versions if job is set up as "oldest" @@ -27,6 +28,7 @@ python -m pip install \ --constraint ./constraints.txt \ "$(echo ./dist/cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \ "$(echo ./dist/dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \ + "$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \ "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"} diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index ff00c484501..6b8bb26825b 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -363,17 +363,14 @@ add_library( src/hash/sha512_hash.cu src/hash/xxhash_64.cu src/interop/dlpack.cpp - src/interop/from_arrow.cu src/interop/arrow_utilities.cpp src/interop/decimal_conversion_utilities.cu - src/interop/to_arrow.cu src/interop/to_arrow_device.cu src/interop/to_arrow_host.cu src/interop/from_arrow_device.cu src/interop/from_arrow_host.cu src/interop/from_arrow_stream.cu src/interop/to_arrow_schema.cpp - src/interop/detail/arrow_allocator.cpp src/io/avro/avro.cpp src/io/avro/avro_gpu.cu src/io/avro/reader_impl.cu diff --git a/cpp/cmake/thirdparty/get_gtest.cmake b/cpp/cmake/thirdparty/get_gtest.cmake index ec8cbd8c568..10e6b026d9a 100644 --- a/cpp/cmake/thirdparty/get_gtest.cmake +++ b/cpp/cmake/thirdparty/get_gtest.cmake @@ -16,18 +16,9 @@ function(find_and_configure_gtest) include(${rapids-cmake-dir}/cpm/gtest.cmake) - # Mark all the non explicit googletest symbols as hidden. This ensures that libcudftestutil can be - # used by consumers with a different shared gtest. - set(gtest_hide_internal_symbols ON) - # Find or install GoogleTest rapids_cpm_gtest(BUILD_STATIC) - # Mark all the explicit googletest symbols as hidden. This ensures that libcudftestutil can be - # used by consumers with a different shared gtest. - if(TARGET gtest) - target_compile_definitions(gtest PUBLIC "$") - endif() endfunction() find_and_configure_gtest() diff --git a/cpp/include/cudf/detail/interop.hpp b/cpp/include/cudf/detail/interop.hpp index 0b9319ba663..0d8f078c9d1 100644 --- a/cpp/include/cudf/detail/interop.hpp +++ b/cpp/include/cudf/detail/interop.hpp @@ -16,29 +16,13 @@ #pragma once -// We disable warning 611 because the `arrow::TableBatchReader` only partially -// override the `ReadNext` method of `arrow::RecordBatchReader::ReadNext` -// triggering warning 611-D from nvcc. -#ifdef __CUDACC__ -#pragma nv_diag_suppress 611 -#pragma nv_diag_suppress 2810 -#endif -#include - -#include -#ifdef __CUDACC__ -#pragma nv_diag_default 611 -#pragma nv_diag_default 2810 -#endif - #include #include #include #include #include - -#include +#include namespace CUDF_EXPORT cudf { namespace detail { @@ -61,89 +45,6 @@ DLManagedTensor* to_dlpack(table_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); -// Creating arrow as per given type_id and buffer arguments -template -std::shared_ptr to_arrow_array(cudf::type_id id, Ts&&... 
args) -{ - switch (id) { - case type_id::BOOL8: return std::make_shared(std::forward(args)...); - case type_id::INT8: return std::make_shared(std::forward(args)...); - case type_id::INT16: return std::make_shared(std::forward(args)...); - case type_id::INT32: return std::make_shared(std::forward(args)...); - case type_id::INT64: return std::make_shared(std::forward(args)...); - case type_id::UINT8: return std::make_shared(std::forward(args)...); - case type_id::UINT16: return std::make_shared(std::forward(args)...); - case type_id::UINT32: return std::make_shared(std::forward(args)...); - case type_id::UINT64: return std::make_shared(std::forward(args)...); - case type_id::FLOAT32: return std::make_shared(std::forward(args)...); - case type_id::FLOAT64: return std::make_shared(std::forward(args)...); - case type_id::TIMESTAMP_DAYS: - return std::make_shared(std::make_shared(), - std::forward(args)...); - case type_id::TIMESTAMP_SECONDS: - return std::make_shared(arrow::timestamp(arrow::TimeUnit::SECOND), - std::forward(args)...); - case type_id::TIMESTAMP_MILLISECONDS: - return std::make_shared(arrow::timestamp(arrow::TimeUnit::MILLI), - std::forward(args)...); - case type_id::TIMESTAMP_MICROSECONDS: - return std::make_shared(arrow::timestamp(arrow::TimeUnit::MICRO), - std::forward(args)...); - case type_id::TIMESTAMP_NANOSECONDS: - return std::make_shared(arrow::timestamp(arrow::TimeUnit::NANO), - std::forward(args)...); - case type_id::DURATION_SECONDS: - return std::make_shared(arrow::duration(arrow::TimeUnit::SECOND), - std::forward(args)...); - case type_id::DURATION_MILLISECONDS: - return std::make_shared(arrow::duration(arrow::TimeUnit::MILLI), - std::forward(args)...); - case type_id::DURATION_MICROSECONDS: - return std::make_shared(arrow::duration(arrow::TimeUnit::MICRO), - std::forward(args)...); - case type_id::DURATION_NANOSECONDS: - return std::make_shared(arrow::duration(arrow::TimeUnit::NANO), - std::forward(args)...); - default: CUDF_FAIL("Unsupported type_id conversion to arrow"); - } -} - -// Converting arrow type to cudf type -data_type arrow_to_cudf_type(arrow::DataType const& arrow_type); - -/** - * @copydoc cudf::to_arrow(table_view input, std::vector const& metadata, - * rmm::cuda_stream_view stream, arrow::MemoryPool* ar_mr) - */ -std::shared_ptr to_arrow(table_view input, - std::vector const& metadata, - rmm::cuda_stream_view stream, - arrow::MemoryPool* ar_mr); - -/** - * @copydoc cudf::to_arrow(cudf::scalar const& input, column_metadata const& metadata, - * rmm::cuda_stream_view stream, arrow::MemoryPool* ar_mr) - */ -std::shared_ptr to_arrow(cudf::scalar const& input, - column_metadata const& metadata, - rmm::cuda_stream_view stream, - arrow::MemoryPool* ar_mr); -/** - * @copydoc cudf::from_arrow(arrow::Table const& input_table, rmm::cuda_stream_view stream, - * rmm::device_async_resource_ref mr) - */ -std::unique_ptr from_arrow(arrow::Table const& input_table, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); - -/** - * @copydoc cudf::from_arrow(arrow::Scalar const& input, rmm::cuda_stream_view stream, - * rmm::device_async_resource_ref mr) - */ -std::unique_ptr from_arrow(arrow::Scalar const& input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); - /** * @brief Return a maximum precision for a given type. 
* diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp index 9a8f87b4a46..0f52b0f7b31 100644 --- a/cpp/include/cudf/interop.hpp +++ b/cpp/include/cudf/interop.hpp @@ -16,21 +16,6 @@ #pragma once -// We disable warning 611 because the `arrow::TableBatchReader` only partially -// override the `ReadNext` method of `arrow::RecordBatchReader::ReadNext` -// triggering warning 611-D from nvcc. -#ifdef __CUDACC__ -#pragma nv_diag_suppress 611 -#pragma nv_diag_suppress 2810 -#endif -#include - -#include -#ifdef __CUDACC__ -#pragma nv_diag_default 611 -#pragma nv_diag_default 2810 -#endif - #include #include #include @@ -131,59 +116,6 @@ struct column_metadata { column_metadata() = default; }; -/** - * @brief Create `arrow::Table` from cudf table `input` - * - * Converts the `cudf::table_view` to `arrow::Table` with the provided - * metadata `column_names`. - * - * @deprecated Since 24.08. Use cudf::to_arrow_host instead. - * - * @throws cudf::logic_error if `column_names` size doesn't match with number of columns. - * - * @param input table_view that needs to be converted to arrow Table - * @param metadata Contains hierarchy of names of columns and children - * @param stream CUDA stream used for device memory operations and kernel launches - * @param ar_mr arrow memory pool to allocate memory for arrow Table - * @return arrow Table generated from `input` - * - * @note For decimals, since the precision is not stored for them in libcudf, - * it will be converted to an Arrow decimal128 that has the widest-precision the cudf decimal type - * supports. For example, numeric::decimal32 will be converted to Arrow decimal128 of the precision - * 9 which is the maximum precision for 32-bit types. Similarly, numeric::decimal128 will be - * converted to Arrow decimal128 of the precision 38. - */ -[[deprecated("Use cudf::to_arrow_host")]] std::shared_ptr to_arrow( - table_view input, - std::vector const& metadata = {}, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - arrow::MemoryPool* ar_mr = arrow::default_memory_pool()); - -/** - * @brief Create `arrow::Scalar` from cudf scalar `input` - * - * Converts the `cudf::scalar` to `arrow::Scalar`. - * - * @deprecated Since 24.08. - * - * @param input scalar that needs to be converted to arrow Scalar - * @param metadata Contains hierarchy of names of columns and children - * @param stream CUDA stream used for device memory operations and kernel launches - * @param ar_mr arrow memory pool to allocate memory for arrow Scalar - * @return arrow Scalar generated from `input` - * - * @note For decimals, since the precision is not stored for them in libcudf, - * it will be converted to an Arrow decimal128 that has the widest-precision the cudf decimal type - * supports. For example, numeric::decimal32 will be converted to Arrow decimal128 of the precision - * 9 which is the maximum precision for 32-bit types. Similarly, numeric::decimal128 will be - * converted to Arrow decimal128 of the precision 38. 
- */ -[[deprecated("Use cudf::to_arrow_host")]] std::shared_ptr to_arrow( - cudf::scalar const& input, - column_metadata const& metadata = {}, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - arrow::MemoryPool* ar_mr = arrow::default_memory_pool()); - /** * @brief typedef for a unique_ptr to an ArrowSchema with custom deleter * @@ -386,39 +318,6 @@ unique_device_array_t to_arrow_host( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); -/** - * @brief Create `cudf::table` from given arrow Table input - * - * @deprecated Since 24.08. Use cudf::from_arrow_host instead. - * - * @param input arrow:Table that needs to be converted to `cudf::table` - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate `cudf::table` - * @return cudf table generated from given arrow Table - */ -[[deprecated("Use cudf::from_arrow_host")]] std::unique_ptr
from_arrow( - arrow::Table const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Create `cudf::scalar` from given arrow Scalar input - * - * @deprecated Since 24.08. Use arrow's `MakeArrayFromScalar` on the - * input, followed by `ExportArray` to obtain something that can be - * consumed by `from_arrow_host`. Then use `cudf::get_element` to - * extract a device scalar from the column. - * - * @param input `arrow::Scalar` that needs to be converted to `cudf::scalar` - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate `cudf::scalar` - * @return cudf scalar generated from given arrow Scalar - */ -[[deprecated("See docstring for migration strategies")]] std::unique_ptr from_arrow( - arrow::Scalar const& input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); - /** * @brief Create `cudf::table` from given ArrowArray and ArrowSchema input * diff --git a/cpp/src/interop/detail/arrow_allocator.cpp b/cpp/src/interop/detail/arrow_allocator.cpp deleted file mode 100644 index 2a19a5360fe..00000000000 --- a/cpp/src/interop/detail/arrow_allocator.cpp +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include - -#include - -namespace cudf { -namespace detail { - -/* - Enable Transparent Huge Pages (THP) for large (>4MB) allocations. - `buf` is returned untouched. - Enabling THP can improve performance of device-host memory transfers - significantly, see . -*/ -template -T enable_hugepage(T&& buf) -{ - if (buf->size() < (1u << 22u)) { // Smaller than 4 MB - return std::move(buf); - } - -#ifdef MADV_HUGEPAGE - auto const pagesize = sysconf(_SC_PAGESIZE); - void* addr = const_cast(buf->data()); - if (addr == nullptr) { return std::move(buf); } - auto length{static_cast(buf->size())}; - if (std::align(pagesize, pagesize, addr, length)) { - // Intentionally not checking for errors that may be returned by older kernel versions; - // optimistically tries enabling huge pages. - madvise(addr, length, MADV_HUGEPAGE); - } -#endif - return std::move(buf); -} - -std::unique_ptr allocate_arrow_buffer(int64_t const size, arrow::MemoryPool* ar_mr) -{ - /* - nvcc 11.0 generates Internal Compiler Error during codegen when arrow::AllocateBuffer - and `ValueOrDie` are used inside a CUDA compilation unit. 
- - To work around this issue we compile an allocation shim in C++ and use - that from our cuda sources - */ - arrow::Result> result = arrow::AllocateBuffer(size, ar_mr); - CUDF_EXPECTS(result.ok(), "Failed to allocate Arrow buffer"); - return enable_hugepage(std::move(result).ValueOrDie()); -} - -std::shared_ptr allocate_arrow_bitmap(int64_t const size, arrow::MemoryPool* ar_mr) -{ - /* - nvcc 11.0 generates Internal Compiler Error during codegen when arrow::AllocateBuffer - and `ValueOrDie` are used inside a CUDA compilation unit. - - To work around this issue we compile an allocation shim in C++ and use - that from our cuda sources - */ - arrow::Result> result = arrow::AllocateBitmap(size, ar_mr); - CUDF_EXPECTS(result.ok(), "Failed to allocate Arrow bitmap"); - return enable_hugepage(std::move(result).ValueOrDie()); -} - -} // namespace detail -} // namespace cudf diff --git a/cpp/src/interop/detail/arrow_allocator.hpp b/cpp/src/interop/detail/arrow_allocator.hpp deleted file mode 100644 index 75c1baa0dca..00000000000 --- a/cpp/src/interop/detail/arrow_allocator.hpp +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -namespace cudf { -namespace detail { - -// unique_ptr because that is what AllocateBuffer returns -std::unique_ptr allocate_arrow_buffer(int64_t const size, arrow::MemoryPool* ar_mr); - -// shared_ptr because that is what AllocateBitmap returns -std::shared_ptr allocate_arrow_bitmap(int64_t const size, arrow::MemoryPool* ar_mr); - -} // namespace detail -} // namespace cudf diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu deleted file mode 100644 index 579820cbae3..00000000000 --- a/cpp/src/interop/from_arrow.cu +++ /dev/null @@ -1,524 +0,0 @@ -/* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include - -namespace cudf { - -namespace detail { -data_type arrow_to_cudf_type(arrow::DataType const& arrow_type) -{ - switch (arrow_type.id()) { - case arrow::Type::NA: return data_type(type_id::EMPTY); - case arrow::Type::BOOL: return data_type(type_id::BOOL8); - case arrow::Type::INT8: return data_type(type_id::INT8); - case arrow::Type::INT16: return data_type(type_id::INT16); - case arrow::Type::INT32: return data_type(type_id::INT32); - case arrow::Type::INT64: return data_type(type_id::INT64); - case arrow::Type::UINT8: return data_type(type_id::UINT8); - case arrow::Type::UINT16: return data_type(type_id::UINT16); - case arrow::Type::UINT32: return data_type(type_id::UINT32); - case arrow::Type::UINT64: return data_type(type_id::UINT64); - case arrow::Type::FLOAT: return data_type(type_id::FLOAT32); - case arrow::Type::DOUBLE: return data_type(type_id::FLOAT64); - case arrow::Type::DATE32: return data_type(type_id::TIMESTAMP_DAYS); - case arrow::Type::TIMESTAMP: { - auto type = static_cast(&arrow_type); - switch (type->unit()) { - case arrow::TimeUnit::type::SECOND: return data_type(type_id::TIMESTAMP_SECONDS); - case arrow::TimeUnit::type::MILLI: return data_type(type_id::TIMESTAMP_MILLISECONDS); - case arrow::TimeUnit::type::MICRO: return data_type(type_id::TIMESTAMP_MICROSECONDS); - case arrow::TimeUnit::type::NANO: return data_type(type_id::TIMESTAMP_NANOSECONDS); - default: CUDF_FAIL("Unsupported timestamp unit in arrow"); - } - } - case arrow::Type::DURATION: { - auto type = static_cast(&arrow_type); - switch (type->unit()) { - case arrow::TimeUnit::type::SECOND: return data_type(type_id::DURATION_SECONDS); - case arrow::TimeUnit::type::MILLI: return data_type(type_id::DURATION_MILLISECONDS); - case arrow::TimeUnit::type::MICRO: return data_type(type_id::DURATION_MICROSECONDS); - case arrow::TimeUnit::type::NANO: return data_type(type_id::DURATION_NANOSECONDS); - default: CUDF_FAIL("Unsupported duration unit in arrow"); - } - } - case arrow::Type::STRING: return data_type(type_id::STRING); - case arrow::Type::LARGE_STRING: return data_type(type_id::STRING); - case arrow::Type::DICTIONARY: return data_type(type_id::DICTIONARY32); - case arrow::Type::LIST: return data_type(type_id::LIST); - case arrow::Type::DECIMAL: { - auto const type = static_cast(&arrow_type); - return data_type{type_id::DECIMAL128, -type->scale()}; - } - case arrow::Type::STRUCT: return data_type(type_id::STRUCT); - default: CUDF_FAIL("Unsupported type_id conversion to cudf"); - } -} - -namespace { -/** - * @brief Functor to return column for a corresponding arrow array. column - * is formed from buffer underneath the arrow array along with any offset and - * change in length that array has. - */ -struct dispatch_to_cudf_column { - /** - * @brief Returns mask from an array without any offsets. 
- */ - std::unique_ptr get_mask_buffer(arrow::Array const& array, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) - { - if (array.null_bitmap_data() == nullptr) { - return std::make_unique(0, stream, mr); - } - auto const null_bitmap_size = array.null_bitmap()->size(); - auto const allocation_size = - bitmask_allocation_size_bytes(static_cast(null_bitmap_size * CHAR_BIT)); - auto mask = std::make_unique(allocation_size, stream, mr); - auto mask_buffer = array.null_bitmap(); - CUDF_CUDA_TRY(cudaMemcpyAsync(mask->data(), - reinterpret_cast(mask_buffer->address()), - null_bitmap_size, - cudaMemcpyDefault, - stream.value())); - // Zero-initialize trailing padding bytes - auto const num_trailing_bytes = allocation_size - null_bitmap_size; - if (num_trailing_bytes > 0) { - auto trailing_bytes = static_cast(mask->data()) + null_bitmap_size; - CUDF_CUDA_TRY(cudaMemsetAsync(trailing_bytes, 0, num_trailing_bytes, stream.value())); - } - return mask; - } - - template ())> - std::unique_ptr operator()( - arrow::Array const&, data_type, bool, rmm::cuda_stream_view, rmm::device_async_resource_ref) - { - CUDF_FAIL("Unsupported type in from_arrow."); - } - - template ())> - std::unique_ptr operator()(arrow::Array const& array, - data_type type, - bool skip_mask, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) - { - auto data_buffer = array.data()->buffers[1]; - size_type const num_rows = array.length(); - auto const has_nulls = skip_mask ? false : array.null_bitmap_data() != nullptr; - auto col = make_fixed_width_column(type, num_rows, mask_state::UNALLOCATED, stream, mr); - auto mutable_column_view = col->mutable_view(); - CUDF_CUDA_TRY(cudaMemcpyAsync( - mutable_column_view.data(), - reinterpret_cast(data_buffer->address()) + array.offset() * sizeof(T), - sizeof(T) * num_rows, - cudaMemcpyDefault, - stream.value())); - if (has_nulls) { - auto tmp_mask = get_mask_buffer(array, stream, mr); - - // If array is sliced, we have to copy whole mask and then take copy. - auto out_mask = (num_rows == static_cast(data_buffer->size() / sizeof(T))) - ? 
std::move(*tmp_mask) - : cudf::detail::copy_bitmask(static_cast(tmp_mask->data()), - array.offset(), - array.offset() + num_rows, - stream, - mr); - - col->set_null_mask(std::move(out_mask), array.null_count()); - } - - return col; - } -}; - -std::unique_ptr get_empty_type_column(size_type size) -{ - // this abomination is required by cuDF Python, which needs to handle - // [PyArrow null arrays](https://arrow.apache.org/docs/python/generated/pyarrow.NullArray.html) - // of finite length - return std::make_unique( - data_type(type_id::EMPTY), size, rmm::device_buffer{}, rmm::device_buffer{}, size); -} - -/** - * @brief Returns cudf column formed from given arrow array - * This has been introduced to take care of compiler error "error: explicit specialization of - * function must precede its first use" - */ -std::unique_ptr get_column(arrow::Array const& array, - data_type type, - bool skip_mask, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); - -template <> -std::unique_ptr dispatch_to_cudf_column::operator()( - arrow::Array const& array, - data_type type, - bool skip_mask, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - using DeviceType = __int128_t; - - auto data_buffer = array.data()->buffers[1]; - auto const num_rows = static_cast(array.length()); - auto col = make_fixed_width_column(type, num_rows, mask_state::UNALLOCATED, stream, mr); - auto mutable_column_view = col->mutable_view(); - - CUDF_CUDA_TRY(cudaMemcpyAsync( - mutable_column_view.data(), - reinterpret_cast(data_buffer->address()) + array.offset() * sizeof(DeviceType), - sizeof(DeviceType) * num_rows, - cudaMemcpyDefault, - stream.value())); - - auto null_mask = [&] { - if (not skip_mask and array.null_bitmap_data()) { - auto temp_mask = get_mask_buffer(array, stream, mr); - // If array is sliced, we have to copy whole mask and then take copy. - return (num_rows == static_cast(data_buffer->size() / sizeof(DeviceType))) - ? std::move(*temp_mask.release()) - : cudf::detail::copy_bitmask(static_cast(temp_mask->data()), - array.offset(), - array.offset() + num_rows, - stream, - mr); - } - return rmm::device_buffer{}; - }(); - - col->set_null_mask(std::move(null_mask), array.null_count()); - return col; -} - -template <> -std::unique_ptr dispatch_to_cudf_column::operator()(arrow::Array const& array, - data_type, - bool skip_mask, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - auto data_buffer = array.data()->buffers[1]; - // mask-to-bools expects the mask to be bitmask_type aligned/padded - auto data = rmm::device_buffer( - cudf::bitmask_allocation_size_bytes(data_buffer->size() * CHAR_BIT), stream, mr); - CUDF_CUDA_TRY(cudaMemcpyAsync(data.data(), - reinterpret_cast(data_buffer->address()), - data_buffer->size(), - cudaMemcpyDefault, - stream.value())); - auto out_col = mask_to_bools(static_cast(data.data()), - array.offset(), - array.offset() + array.length(), - stream, - mr); - - auto const has_nulls = skip_mask ? 
false : array.null_bitmap_data() != nullptr; - if (has_nulls) { - auto out_mask = - detail::copy_bitmask(static_cast(get_mask_buffer(array, stream, mr)->data()), - array.offset(), - array.offset() + array.length(), - stream, - mr); - - out_col->set_null_mask(std::move(out_mask), array.null_count()); - } - - return out_col; -} - -template <> -std::unique_ptr dispatch_to_cudf_column::operator()( - arrow::Array const& array, - data_type, - bool, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - if (array.length() == 0) { return make_empty_column(type_id::STRING); } - - std::unique_ptr offsets_column; - std::unique_ptr char_array; - - if (array.type_id() == arrow::Type::LARGE_STRING) { - auto str_array = static_cast(&array); - auto offset_array = std::make_unique( - str_array->value_offsets()->size() / sizeof(int64_t), str_array->value_offsets(), nullptr); - offsets_column = dispatch_to_cudf_column{}.operator()( - *offset_array, data_type(type_id::INT64), true, stream, mr); - char_array = std::make_unique( - str_array->value_data()->size(), str_array->value_data(), nullptr); - } else if (array.type_id() == arrow::Type::STRING) { - auto str_array = static_cast(&array); - auto offset_array = std::make_unique( - str_array->value_offsets()->size() / sizeof(int32_t), str_array->value_offsets(), nullptr); - offsets_column = dispatch_to_cudf_column{}.operator()( - *offset_array, data_type(type_id::INT32), true, stream, mr); - char_array = std::make_unique( - str_array->value_data()->size(), str_array->value_data(), nullptr); - } else { - throw std::runtime_error("Unsupported array type"); - } - - rmm::device_buffer chars(char_array->length(), stream, mr); - auto data_buffer = char_array->data()->buffers[1]; - CUDF_CUDA_TRY(cudaMemcpyAsync(chars.data(), - reinterpret_cast(data_buffer->address()), - chars.size(), - cudaMemcpyDefault, - stream.value())); - - auto const num_rows = offsets_column->size() - 1; - auto out_col = make_strings_column(num_rows, - std::move(offsets_column), - std::move(chars), - array.null_count(), - std::move(*get_mask_buffer(array, stream, mr))); - - return num_rows == array.length() - ? 
std::move(out_col) - : std::make_unique( - cudf::detail::slice(out_col->view(), - static_cast(array.offset()), - static_cast(array.offset() + array.length()), - stream), - stream, - mr); -} - -template <> -std::unique_ptr dispatch_to_cudf_column::operator()( - arrow::Array const& array, - data_type, - bool, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - auto dict_array = static_cast(&array); - auto dict_type = arrow_to_cudf_type(*(dict_array->dictionary()->type())); - auto keys_column = get_column(*(dict_array->dictionary()), dict_type, true, stream, mr); - auto ind_type = arrow_to_cudf_type(*(dict_array->indices()->type())); - - auto indices_column = get_column(*(dict_array->indices()), ind_type, false, stream, mr); - // If index type is not of type uint32_t, then cast it to uint32_t - auto const dict_indices_type = data_type{type_id::UINT32}; - if (indices_column->type().id() != dict_indices_type.id()) - indices_column = cudf::detail::cast(indices_column->view(), dict_indices_type, stream, mr); - - // Child columns shouldn't have masks and we need the mask in main column - auto column_contents = indices_column->release(); - indices_column = std::make_unique(dict_indices_type, - static_cast(array.length()), - std::move(*(column_contents.data)), - rmm::device_buffer{}, - 0); - - return make_dictionary_column(std::move(keys_column), - std::move(indices_column), - std::move(*(column_contents.null_mask)), - array.null_count()); -} - -template <> -std::unique_ptr dispatch_to_cudf_column::operator()( - arrow::Array const& array, - data_type, - bool, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - auto struct_array = static_cast(&array); - std::vector> child_columns; - // Offsets have already been applied to child - arrow::ArrayVector array_children = struct_array->fields(); - std::transform(array_children.cbegin(), - array_children.cend(), - std::back_inserter(child_columns), - [&mr, &stream](auto const& child_array) { - auto type = arrow_to_cudf_type(*(child_array->type())); - return get_column(*child_array, type, false, stream, mr); - }); - - auto out_mask = std::move(*(get_mask_buffer(array, stream, mr))); - if (struct_array->null_bitmap_data() != nullptr) { - out_mask = detail::copy_bitmask(static_cast(out_mask.data()), - array.offset(), - array.offset() + array.length(), - stream, - mr); - } - - return make_structs_column( - array.length(), move(child_columns), array.null_count(), std::move(out_mask), stream, mr); -} - -template <> -std::unique_ptr dispatch_to_cudf_column::operator()( - arrow::Array const& array, - data_type, - bool, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - auto list_array = static_cast(&array); - auto offset_array = std::make_unique( - list_array->value_offsets()->size() / sizeof(int32_t), list_array->value_offsets(), nullptr); - auto offsets_column = dispatch_to_cudf_column{}.operator()( - *offset_array, data_type(type_id::INT32), true, stream, mr); - - auto child_type = arrow_to_cudf_type(*(list_array->values()->type())); - auto child_column = get_column(*(list_array->values()), child_type, false, stream, mr); - - auto const num_rows = offsets_column->size() - 1; - auto out_col = make_lists_column(num_rows, - std::move(offsets_column), - std::move(child_column), - array.null_count(), - std::move(*get_mask_buffer(array, stream, mr)), - stream, - mr); - - return num_rows == array.length() - ? 
std::move(out_col) - : std::make_unique( - cudf::detail::slice(out_col->view(), - static_cast(array.offset()), - static_cast(array.offset() + array.length()), - stream), - stream, - mr); -} - -std::unique_ptr get_column(arrow::Array const& array, - data_type type, - bool skip_mask, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - return type.id() != type_id::EMPTY - ? type_dispatcher(type, dispatch_to_cudf_column{}, array, type, skip_mask, stream, mr) - : get_empty_type_column(array.length()); -} - -} // namespace - -std::unique_ptr
from_arrow(arrow::Table const& input_table, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - if (input_table.num_columns() == 0) { return std::make_unique
(); } - std::vector> columns; - auto chunked_arrays = input_table.columns(); - std::transform(chunked_arrays.begin(), - chunked_arrays.end(), - std::back_inserter(columns), - [&mr, &stream](auto const& chunked_array) { - std::vector> concat_columns; - auto cudf_type = arrow_to_cudf_type(*(chunked_array->type())); - auto array_chunks = chunked_array->chunks(); - if (cudf_type.id() == type_id::EMPTY) { - return get_empty_type_column(chunked_array->length()); - } - std::transform(array_chunks.begin(), - array_chunks.end(), - std::back_inserter(concat_columns), - [&cudf_type, &mr, &stream](auto const& array_chunk) { - return get_column(*array_chunk, cudf_type, false, stream, mr); - }); - if (concat_columns.empty()) { - return std::make_unique( - cudf_type, 0, rmm::device_buffer{}, rmm::device_buffer{}, 0); - } else if (concat_columns.size() == 1) { - return std::move(concat_columns[0]); - } - - std::vector column_views; - std::transform(concat_columns.begin(), - concat_columns.end(), - std::back_inserter(column_views), - [](auto const& col) { return col->view(); }); - return cudf::detail::concatenate(column_views, stream, mr); - }); - - return std::make_unique
(std::move(columns)); -} - -std::unique_ptr from_arrow(arrow::Scalar const& input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - auto maybe_array = arrow::MakeArrayFromScalar(input, 1); - if (!maybe_array.ok()) { CUDF_FAIL("Failed to create array"); } - auto array = *maybe_array; - - auto field = arrow::field("", input.type); - - auto table = arrow::Table::Make(arrow::schema({field}), {array}); - - auto cudf_table = detail::from_arrow(*table, stream, mr); - - auto cv = cudf_table->view().column(0); - return get_element(cv, 0, stream); -} - -} // namespace detail - -std::unique_ptr
from_arrow(arrow::Table const& input_table, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_FUNC_RANGE(); - - return detail::from_arrow(input_table, stream, mr); -} - -std::unique_ptr from_arrow(arrow::Scalar const& input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_FUNC_RANGE(); - - return detail::from_arrow(input, stream, mr); -} -} // namespace cudf diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu deleted file mode 100644 index a867d4adfa1..00000000000 --- a/cpp/src/interop/to_arrow.cu +++ /dev/null @@ -1,490 +0,0 @@ -/* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "arrow_utilities.hpp" -#include "decimal_conversion_utilities.cuh" -#include "detail/arrow_allocator.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include - -namespace cudf { -namespace detail { -namespace { - -/** - * @brief Create arrow data buffer from given cudf column - */ -template -std::shared_ptr fetch_data_buffer(device_span input, - arrow::MemoryPool* ar_mr, - rmm::cuda_stream_view stream) -{ - int64_t const data_size_in_bytes = sizeof(T) * input.size(); - - auto data_buffer = allocate_arrow_buffer(data_size_in_bytes, ar_mr); - - CUDF_CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(), - input.data(), - data_size_in_bytes, - cudaMemcpyDefault, - stream.value())); - - return std::move(data_buffer); -} - -/** - * @brief Create arrow buffer of mask from given cudf column - */ -std::shared_ptr fetch_mask_buffer(column_view input_view, - arrow::MemoryPool* ar_mr, - rmm::cuda_stream_view stream) -{ - int64_t const mask_size_in_bytes = cudf::bitmask_allocation_size_bytes(input_view.size()); - - if (input_view.has_nulls()) { - auto mask_buffer = allocate_arrow_bitmap(static_cast(input_view.size()), ar_mr); - CUDF_CUDA_TRY(cudaMemcpyAsync( - mask_buffer->mutable_data(), - (input_view.offset() > 0) - ? 
cudf::detail::copy_bitmask(input_view, stream, rmm::mr::get_current_device_resource()) - .data() - : input_view.null_mask(), - mask_size_in_bytes, - cudaMemcpyDefault, - stream.value())); - - // Resets all padded bits to 0 - mask_buffer->ZeroPadding(); - - return mask_buffer; - } - - return nullptr; -} - -/** - * @brief Functor to convert cudf column to arrow array - */ -struct dispatch_to_arrow { - /** - * @brief Creates vector Arrays from given cudf column children - */ - std::vector> fetch_child_array( - column_view input_view, - std::vector const& metadata, - arrow::MemoryPool* ar_mr, - rmm::cuda_stream_view stream) - { - std::vector> child_arrays; - std::transform( - input_view.child_begin(), - input_view.child_end(), - metadata.begin(), - std::back_inserter(child_arrays), - [&ar_mr, &stream](auto const& child, auto const& meta) { - return type_dispatcher( - child.type(), dispatch_to_arrow{}, child, child.type().id(), meta, ar_mr, stream); - }); - return child_arrays; - } - - template ())> - std::shared_ptr operator()( - column_view, cudf::type_id, column_metadata const&, arrow::MemoryPool*, rmm::cuda_stream_view) - { - CUDF_FAIL("Unsupported type for to_arrow."); - } - - template ())> - std::shared_ptr operator()(column_view input_view, - cudf::type_id id, - column_metadata const&, - arrow::MemoryPool* ar_mr, - rmm::cuda_stream_view stream) - { - return to_arrow_array( - id, - static_cast(input_view.size()), - fetch_data_buffer( - device_span(input_view.data(), input_view.size()), ar_mr, stream), - fetch_mask_buffer(input_view, ar_mr, stream), - static_cast(input_view.null_count())); - } -}; - -// Convert decimal types from libcudf to arrow where those types are not -// directly supported by Arrow. These types must be fit into 128 bits, the -// smallest decimal resolution supported by Arrow. -template -std::shared_ptr unsupported_decimals_to_arrow(column_view input, - int32_t precision, - arrow::MemoryPool* ar_mr, - rmm::cuda_stream_view stream) -{ - auto buf = detail::convert_decimals_to_decimal128( - input, stream, rmm::mr::get_current_device_resource()); - - // Synchronize stream here to ensure the decimal128 buffer is ready. 
- stream.synchronize(); - - auto const buf_size_in_bytes = buf->size(); - auto data_buffer = allocate_arrow_buffer(buf_size_in_bytes, ar_mr); - - CUDF_CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(), - buf->data(), - buf_size_in_bytes, - cudaMemcpyDefault, - stream.value())); - - auto type = arrow::decimal(precision, -input.type().scale()); - auto mask = fetch_mask_buffer(input, ar_mr, stream); - auto buffers = std::vector>{mask, std::move(data_buffer)}; - auto data = std::make_shared(type, input.size(), buffers); - - return std::make_shared(data); -} - -template <> -std::shared_ptr dispatch_to_arrow::operator()( - column_view input, - cudf::type_id, - column_metadata const&, - arrow::MemoryPool* ar_mr, - rmm::cuda_stream_view stream) -{ - using DeviceType = int32_t; - return unsupported_decimals_to_arrow( - input, cudf::detail::max_precision(), ar_mr, stream); -} - -template <> -std::shared_ptr dispatch_to_arrow::operator()( - column_view input, - cudf::type_id, - column_metadata const&, - arrow::MemoryPool* ar_mr, - rmm::cuda_stream_view stream) -{ - using DeviceType = int64_t; - return unsupported_decimals_to_arrow( - input, cudf::detail::max_precision(), ar_mr, stream); -} - -template <> -std::shared_ptr dispatch_to_arrow::operator()( - column_view input, - cudf::type_id, - column_metadata const&, - arrow::MemoryPool* ar_mr, - rmm::cuda_stream_view stream) -{ - using DeviceType = __int128_t; - auto const max_precision = cudf::detail::max_precision(); - - rmm::device_uvector buf(input.size(), stream); - - thrust::copy(rmm::exec_policy(stream), // - input.begin(), - input.end(), - buf.begin()); - - auto const buf_size_in_bytes = buf.size() * sizeof(DeviceType); - auto data_buffer = allocate_arrow_buffer(buf_size_in_bytes, ar_mr); - - CUDF_CUDA_TRY(cudaMemcpyAsync( - data_buffer->mutable_data(), buf.data(), buf_size_in_bytes, cudaMemcpyDefault, stream.value())); - - auto type = arrow::decimal(max_precision, -input.type().scale()); - auto mask = fetch_mask_buffer(input, ar_mr, stream); - auto buffers = std::vector>{mask, std::move(data_buffer)}; - auto data = std::make_shared(type, input.size(), buffers); - - return std::make_shared(data); -} - -template <> -std::shared_ptr dispatch_to_arrow::operator()(column_view input, - cudf::type_id id, - column_metadata const&, - arrow::MemoryPool* ar_mr, - rmm::cuda_stream_view stream) -{ - auto bitmask = detail::bools_to_mask(input, stream, rmm::mr::get_current_device_resource()); - - auto data_buffer = allocate_arrow_buffer(static_cast(bitmask.first->size()), ar_mr); - - CUDF_CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(), - bitmask.first->data(), - bitmask.first->size(), - cudaMemcpyDefault, - stream.value())); - return to_arrow_array(id, - static_cast(input.size()), - std::move(data_buffer), - fetch_mask_buffer(input, ar_mr, stream), - static_cast(input.null_count())); -} - -template <> -std::shared_ptr dispatch_to_arrow::operator()( - column_view input, - cudf::type_id, - column_metadata const&, - arrow::MemoryPool* ar_mr, - rmm::cuda_stream_view stream) -{ - std::unique_ptr tmp_column = - ((input.offset() != 0) or - ((input.num_children() == 1) and (input.child(0).size() - 1 != input.size()))) - ? std::make_unique(input, stream) - : nullptr; - - column_view input_view = (tmp_column != nullptr) ? 
tmp_column->view() : input; - auto child_arrays = fetch_child_array(input_view, {{}, {}}, ar_mr, stream); - if (child_arrays.empty()) { - // Empty string will have only one value in offset of 4 bytes - auto tmp_offset_buffer = allocate_arrow_buffer(sizeof(int32_t), ar_mr); - auto tmp_data_buffer = allocate_arrow_buffer(0, ar_mr); - memset(tmp_offset_buffer->mutable_data(), 0, sizeof(int32_t)); - - return std::make_shared( - 0, std::move(tmp_offset_buffer), std::move(tmp_data_buffer)); - } - auto offset_buffer = child_arrays[strings_column_view::offsets_column_index]->data()->buffers[1]; - auto const sview = strings_column_view{input_view}; - auto data_buffer = fetch_data_buffer( - device_span{sview.chars_begin(stream), - static_cast(sview.chars_size(stream))}, - ar_mr, - stream); - if (sview.offsets().type().id() == cudf::type_id::INT64) { - return std::make_shared(static_cast(input_view.size()), - offset_buffer, - data_buffer, - fetch_mask_buffer(input_view, ar_mr, stream), - static_cast(input_view.null_count())); - } else { - return std::make_shared(static_cast(input_view.size()), - offset_buffer, - data_buffer, - fetch_mask_buffer(input_view, ar_mr, stream), - static_cast(input_view.null_count())); - } -} - -template <> -std::shared_ptr dispatch_to_arrow::operator()( - column_view input, - cudf::type_id, - column_metadata const& metadata, - arrow::MemoryPool* ar_mr, - rmm::cuda_stream_view stream) -{ - CUDF_EXPECTS(metadata.children_meta.size() == static_cast(input.num_children()), - "Number of field names and number of children doesn't match\n"); - std::unique_ptr tmp_column = nullptr; - - if (input.offset() != 0) { tmp_column = std::make_unique(input, stream); } - - column_view input_view = (tmp_column != nullptr) ? tmp_column->view() : input; - auto child_arrays = fetch_child_array(input_view, metadata.children_meta, ar_mr, stream); - auto mask = fetch_mask_buffer(input_view, ar_mr, stream); - - std::vector> fields; - std::transform(child_arrays.cbegin(), - child_arrays.cend(), - metadata.children_meta.cbegin(), - std::back_inserter(fields), - [](auto const array, auto const meta) { - return std::make_shared( - meta.name, array->type(), array->null_count() > 0); - }); - auto dtype = std::make_shared(fields); - - return std::make_shared(dtype, - static_cast(input_view.size()), - child_arrays, - mask, - static_cast(input_view.null_count())); -} - -template <> -std::shared_ptr dispatch_to_arrow::operator()( - column_view input, - cudf::type_id, - column_metadata const& metadata, - arrow::MemoryPool* ar_mr, - rmm::cuda_stream_view stream) -{ - CUDF_EXPECTS(metadata.children_meta.empty() || - metadata.children_meta.size() == static_cast(input.num_children()), - "Number of field names and number of children do not match\n"); - std::unique_ptr tmp_column = nullptr; - if ((input.offset() != 0) or - ((input.num_children() == 2) and (input.child(0).size() - 1 != input.size()))) { - tmp_column = std::make_unique(input, stream); - } - - column_view input_view = (tmp_column != nullptr) ? tmp_column->view() : input; - auto children_meta = - metadata.children_meta.empty() ? std::vector{{}, {}} : metadata.children_meta; - auto child_arrays = fetch_child_array(input_view, children_meta, ar_mr, stream); - if (child_arrays.empty() || child_arrays[0]->data()->length == 0) { - auto element_type = child_arrays.empty() ? 
arrow::null() : child_arrays[1]->type(); - auto result = arrow::MakeEmptyArray(arrow::list(element_type), ar_mr); - CUDF_EXPECTS(result.ok(), "Failed to construct empty arrow list array\n"); - return result.ValueUnsafe(); - } - - auto offset_buffer = child_arrays[0]->data()->buffers[1]; - auto data = child_arrays[1]; - return std::make_shared(arrow::list(data->type()), - static_cast(input_view.size()), - offset_buffer, - data, - fetch_mask_buffer(input_view, ar_mr, stream), - static_cast(input_view.null_count())); -} - -template <> -std::shared_ptr dispatch_to_arrow::operator()( - column_view input, - cudf::type_id, - column_metadata const& metadata, - arrow::MemoryPool* ar_mr, - rmm::cuda_stream_view stream) -{ - // Arrow dictionary requires indices to be signed integer - std::unique_ptr dict_indices = - detail::cast(cudf::dictionary_column_view(input).get_indices_annotated(), - cudf::data_type{type_id::INT32}, - stream, - rmm::mr::get_current_device_resource()); - auto indices = dispatch_to_arrow{}.operator()( - dict_indices->view(), dict_indices->type().id(), {}, ar_mr, stream); - auto dict_keys = cudf::dictionary_column_view(input).keys(); - auto dictionary = - type_dispatcher(dict_keys.type(), - dispatch_to_arrow{}, - dict_keys, - dict_keys.type().id(), - metadata.children_meta.empty() ? column_metadata{} : metadata.children_meta[0], - ar_mr, - stream); - - return std::make_shared( - arrow::dictionary(indices->type(), dictionary->type()), indices, dictionary); -} -} // namespace - -std::shared_ptr to_arrow(table_view input, - std::vector const& metadata, - rmm::cuda_stream_view stream, - arrow::MemoryPool* ar_mr) -{ - CUDF_EXPECTS((metadata.size() == static_cast(input.num_columns())), - "columns' metadata should be equal to number of columns in table"); - - std::vector> arrays; - std::vector> fields; - - std::transform( - input.begin(), - input.end(), - metadata.begin(), - std::back_inserter(arrays), - [&](auto const& c, auto const& meta) { - return c.type().id() != type_id::EMPTY - ? type_dispatcher( - c.type(), detail::dispatch_to_arrow{}, c, c.type().id(), meta, ar_mr, stream) - : std::make_shared(c.size()); - }); - - std::transform( - arrays.begin(), - arrays.end(), - metadata.begin(), - std::back_inserter(fields), - [](auto const& array, auto const& meta) { return arrow::field(meta.name, array->type()); }); - - auto result = arrow::Table::Make(arrow::schema(fields), arrays); - - // synchronize the stream because after the return the data may be accessed from the host before - // the above `cudaMemcpyAsync` calls have completed their copies (especially if pinned host - // memory is used). 
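This synchronize is load-bearing: `cudaMemcpyAsync` into pageable host memory often exhibits synchronous behavior anyway, but when the destination is pinned host memory the copy truly overlaps host execution, so without the explicit `stream.synchronize()` below a caller could read Arrow buffers whose bytes have not yet arrived.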
- stream.synchronize(); - - return result; -} - -std::shared_ptr to_arrow(cudf::scalar const& input, - column_metadata const& metadata, - rmm::cuda_stream_view stream, - arrow::MemoryPool* ar_mr) -{ - auto const column = cudf::make_column_from_scalar(input, 1, stream); - cudf::table_view const tv{{column->view()}}; - auto const arrow_table = detail::to_arrow(tv, {metadata}, stream, ar_mr); - auto const ac = arrow_table->column(0); - auto const maybe_scalar = ac->GetScalar(0); - if (!maybe_scalar.ok()) { CUDF_FAIL("Failed to produce a scalar"); } - return maybe_scalar.ValueOrDie(); -} -} // namespace detail - -std::shared_ptr to_arrow(table_view input, - std::vector const& metadata, - rmm::cuda_stream_view stream, - arrow::MemoryPool* ar_mr) -{ - CUDF_FUNC_RANGE(); - return detail::to_arrow(input, metadata, stream, ar_mr); -} - -std::shared_ptr to_arrow(cudf::scalar const& input, - column_metadata const& metadata, - rmm::cuda_stream_view stream, - arrow::MemoryPool* ar_mr) -{ - CUDF_FUNC_RANGE(); - return detail::to_arrow(input, metadata, stream, ar_mr); -} -} // namespace cudf diff --git a/cpp/src/rolling/jit/operation.hpp b/cpp/src/rolling/jit/operation.hpp index f8a52c03d4e..3be739ec5bf 100644 --- a/cpp/src/rolling/jit/operation.hpp +++ b/cpp/src/rolling/jit/operation.hpp @@ -14,12 +14,12 @@ * limitations under the License. */ +#pragma once + #include "rolling/jit/operation-udf.hpp" #include -#pragma once - struct rolling_udf_ptx { template static OutType operate(InType const* in_col, cudf::size_type start, cudf::size_type count) diff --git a/cpp/tests/interop/arrow_utils.hpp b/cpp/tests/interop/arrow_utils.hpp index 08eada632a5..70a9fe64d70 100644 --- a/cpp/tests/interop/arrow_utils.hpp +++ b/cpp/tests/interop/arrow_utils.hpp @@ -14,6 +14,8 @@ * limitations under the License. */ +#pragma once + #include #include #include @@ -30,11 +32,65 @@ #include #include +#include #include -#include - -#pragma once +// Creating arrow as per given type_id and buffer arguments +template +std::shared_ptr to_arrow_array(cudf::type_id id, Ts&&... 
args)
+{
+  switch (id) {
+    case cudf::type_id::BOOL8:
+      return std::make_shared<arrow::BooleanArray>(std::forward<Ts>(args)...);
+    case cudf::type_id::INT8: return std::make_shared<arrow::Int8Array>(std::forward<Ts>(args)...);
+    case cudf::type_id::INT16:
+      return std::make_shared<arrow::Int16Array>(std::forward<Ts>(args)...);
+    case cudf::type_id::INT32:
+      return std::make_shared<arrow::Int32Array>(std::forward<Ts>(args)...);
+    case cudf::type_id::INT64:
+      return std::make_shared<arrow::Int64Array>(std::forward<Ts>(args)...);
+    case cudf::type_id::UINT8:
+      return std::make_shared<arrow::UInt8Array>(std::forward<Ts>(args)...);
+    case cudf::type_id::UINT16:
+      return std::make_shared<arrow::UInt16Array>(std::forward<Ts>(args)...);
+    case cudf::type_id::UINT32:
+      return std::make_shared<arrow::UInt32Array>(std::forward<Ts>(args)...);
+    case cudf::type_id::UINT64:
+      return std::make_shared<arrow::UInt64Array>(std::forward<Ts>(args)...);
+    case cudf::type_id::FLOAT32:
+      return std::make_shared<arrow::FloatArray>(std::forward<Ts>(args)...);
+    case cudf::type_id::FLOAT64:
+      return std::make_shared<arrow::DoubleArray>(std::forward<Ts>(args)...);
+    case cudf::type_id::TIMESTAMP_DAYS:
+      return std::make_shared<arrow::Date32Array>(std::make_shared<arrow::Date32Type>(),
+                                                  std::forward<Ts>(args)...);
+    case cudf::type_id::TIMESTAMP_SECONDS:
+      return std::make_shared<arrow::TimestampArray>(arrow::timestamp(arrow::TimeUnit::SECOND),
+                                                     std::forward<Ts>(args)...);
+    case cudf::type_id::TIMESTAMP_MILLISECONDS:
+      return std::make_shared<arrow::TimestampArray>(arrow::timestamp(arrow::TimeUnit::MILLI),
+                                                     std::forward<Ts>(args)...);
+    case cudf::type_id::TIMESTAMP_MICROSECONDS:
+      return std::make_shared<arrow::TimestampArray>(arrow::timestamp(arrow::TimeUnit::MICRO),
+                                                     std::forward<Ts>(args)...);
+    case cudf::type_id::TIMESTAMP_NANOSECONDS:
+      return std::make_shared<arrow::TimestampArray>(arrow::timestamp(arrow::TimeUnit::NANO),
+                                                     std::forward<Ts>(args)...);
+    case cudf::type_id::DURATION_SECONDS:
+      return std::make_shared<arrow::DurationArray>(arrow::duration(arrow::TimeUnit::SECOND),
+                                                    std::forward<Ts>(args)...);
+    case cudf::type_id::DURATION_MILLISECONDS:
+      return std::make_shared<arrow::DurationArray>(arrow::duration(arrow::TimeUnit::MILLI),
+                                                    std::forward<Ts>(args)...);
+    case cudf::type_id::DURATION_MICROSECONDS:
+      return std::make_shared<arrow::DurationArray>(arrow::duration(arrow::TimeUnit::MICRO),
+                                                    std::forward<Ts>(args)...);
+    case cudf::type_id::DURATION_NANOSECONDS:
+      return std::make_shared<arrow::DurationArray>(arrow::duration(arrow::TimeUnit::NANO),
+                                                    std::forward<Ts>(args)...);
+    default: CUDF_FAIL("Unsupported type_id conversion to arrow");
+  }
+}

 template <typename T>
 std::enable_if_t<cudf::is_fixed_width<T>() and !std::is_same_v<T, bool>,
                  std::shared_ptr<arrow::Array>>
@@ -50,7 +106,7 @@ get_arrow_array(std::vector<T> const& data, std::vector<uint8_t> const& mask = {})
   std::shared_ptr<arrow::Buffer> mask_buffer =
     mask.empty() ?
nullptr : arrow::internal::BytesToBits(mask).ValueOrDie(); - return cudf::detail::to_arrow_array(cudf::type_to_id(), data.size(), data_buffer, mask_buffer); + return to_arrow_array(cudf::type_to_id(), data.size(), data_buffer, mask_buffer); } template diff --git a/dependencies.yaml b/dependencies.yaml index b7c5c865d9b..7fb29572921 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -13,6 +13,7 @@ files: - cuda - cuda_version - depends_on_cupy + - depends_on_librmm - depends_on_rmm - develop - docs @@ -95,6 +96,8 @@ files: - build_base - build_python_common - depends_on_pylibcudf + - depends_on_libcudf + - depends_on_librmm - depends_on_rmm py_run_cudf: output: pyproject @@ -106,6 +109,7 @@ files: - run_cudf - pyarrow_run - depends_on_cupy + - depends_on_libcudf - depends_on_pylibcudf - depends_on_rmm py_test_cudf: @@ -117,6 +121,31 @@ files: includes: - test_python_common - test_python_cudf + py_build_libcudf: + output: pyproject + pyproject_dir: python/libcudf + extras: + table: build-system + includes: + - rapids_build_skbuild + py_rapids_build_libcudf: + output: pyproject + pyproject_dir: python/libcudf + extras: + table: tool.rapids-build-backend + key: requires + includes: + - build_base + - build_cpp + - build_python_libcudf + - depends_on_librmm + py_run_libcudf: + output: pyproject + pyproject_dir: python/libcudf + extras: + table: project + includes: + - pyarrow_run py_build_pylibcudf: output: pyproject pyproject_dir: python/pylibcudf @@ -133,6 +162,8 @@ files: includes: - build_base - build_python_common + - depends_on_libcudf + - depends_on_librmm - depends_on_rmm py_run_pylibcudf: output: pyproject @@ -140,6 +171,7 @@ files: extras: table: project includes: + - depends_on_libcudf - depends_on_rmm - pyarrow_run - run_pylibcudf @@ -359,13 +391,18 @@ dependencies: - cython>=3.0.3 # Hard pin the patch version used during the build. This must be kept # in sync with the version pinned in get_arrow.cmake. - - pyarrow==16.1.0.* + - &pyarrow_build pyarrow==16.1.0.* - output_types: pyproject packages: # Hard pin the patch version used during the build. # Sync with conda build constraint & wheel run constraint. # TODO: Change to `2.0.*` for NumPy 2 - numpy==1.23.* + build_python_libcudf: + common: + - output_types: [conda, requirements, pyproject] + packages: + - *pyarrow_build libarrow_build: common: - output_types: conda @@ -781,6 +818,31 @@ dependencies: packages: - dask-cuda==24.10.*,>=0.0.0a0 - *numba + depends_on_libcudf: + common: + - output_types: conda + packages: + - &libcudf_unsuffixed libcudf==24.10.*,>=0.0.0a0 + - output_types: requirements + packages: + # pip recognizes the index as a global option for the requirements.txt file + # This index is needed for libcudf-cu{11,12}. 
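Mechanism note: the `&libcudf_unsuffixed` anchor defined above is reused by the `{matrix: null, packages: [*libcudf_unsuffixed]}` fallback just below, so dependency lists generated without a CUDA suffix (conda environments, pure-source builds) depend on plain `libcudf`, while the cu11/cu12 wheel matrices resolve to the suffixed package names.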
+ - --extra-index-url=https://pypi.nvidia.com + - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple + specific: + - output_types: [requirements, pyproject] + matrices: + - matrix: + cuda: "12.*" + cuda_suffixed: "true" + packages: + - libcudf-cu12==24.10.*,>=0.0.0a0 + - matrix: + cuda: "11.*" + cuda_suffixed: "true" + packages: + - libcudf-cu11==24.10.*,>=0.0.0a0 + - {matrix: null, packages: [*libcudf_unsuffixed]} depends_on_pylibcudf: common: - output_types: conda @@ -871,6 +933,33 @@ dependencies: packages: &cupy_packages_cu11 - cupy-cuda11x>=12.0.0 - {matrix: null, packages: *cupy_packages_cu11} + depends_on_librmm: + common: + - output_types: conda + packages: + - &librmm_unsuffixed librmm==24.10.*,>=0.0.0a0 + - output_types: requirements + packages: + # pip recognizes the index as a global option for the requirements.txt file + # This index is needed for librmm-cu{11,12}. + - --extra-index-url=https://pypi.nvidia.com + - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple + specific: + - output_types: [requirements, pyproject] + matrices: + - matrix: + cuda: "12.*" + cuda_suffixed: "true" + packages: + - librmm-cu12==24.10.*,>=0.0.0a0 + - matrix: + cuda: "11.*" + cuda_suffixed: "true" + packages: + - librmm-cu11==24.10.*,>=0.0.0a0 + - matrix: + packages: + - *librmm_unsuffixed depends_on_rmm: common: - output_types: conda diff --git a/java/src/main/native/src/ColumnVectorJni.cpp b/java/src/main/native/src/ColumnVectorJni.cpp index cdc5aa41abe..9b718b2ed83 100644 --- a/java/src/main/native/src/ColumnVectorJni.cpp +++ b/java/src/main/native/src/ColumnVectorJni.cpp @@ -38,12 +38,70 @@ #include #include +#include #include using cudf::jni::ptr_as_jlong; using cudf::jni::release_as_jlong; +// Creating arrow as per given type_id and buffer arguments +template +std::shared_ptr to_arrow_array(cudf::type_id id, Ts&&... 
args) +{ + switch (id) { + case cudf::type_id::BOOL8: + return std::make_shared(std::forward(args)...); + case cudf::type_id::INT8: return std::make_shared(std::forward(args)...); + case cudf::type_id::INT16: + return std::make_shared(std::forward(args)...); + case cudf::type_id::INT32: + return std::make_shared(std::forward(args)...); + case cudf::type_id::INT64: + return std::make_shared(std::forward(args)...); + case cudf::type_id::UINT8: + return std::make_shared(std::forward(args)...); + case cudf::type_id::UINT16: + return std::make_shared(std::forward(args)...); + case cudf::type_id::UINT32: + return std::make_shared(std::forward(args)...); + case cudf::type_id::UINT64: + return std::make_shared(std::forward(args)...); + case cudf::type_id::FLOAT32: + return std::make_shared(std::forward(args)...); + case cudf::type_id::FLOAT64: + return std::make_shared(std::forward(args)...); + case cudf::type_id::TIMESTAMP_DAYS: + return std::make_shared(std::make_shared(), + std::forward(args)...); + case cudf::type_id::TIMESTAMP_SECONDS: + return std::make_shared(arrow::timestamp(arrow::TimeUnit::SECOND), + std::forward(args)...); + case cudf::type_id::TIMESTAMP_MILLISECONDS: + return std::make_shared(arrow::timestamp(arrow::TimeUnit::MILLI), + std::forward(args)...); + case cudf::type_id::TIMESTAMP_MICROSECONDS: + return std::make_shared(arrow::timestamp(arrow::TimeUnit::MICRO), + std::forward(args)...); + case cudf::type_id::TIMESTAMP_NANOSECONDS: + return std::make_shared(arrow::timestamp(arrow::TimeUnit::NANO), + std::forward(args)...); + case cudf::type_id::DURATION_SECONDS: + return std::make_shared(arrow::duration(arrow::TimeUnit::SECOND), + std::forward(args)...); + case cudf::type_id::DURATION_MILLISECONDS: + return std::make_shared(arrow::duration(arrow::TimeUnit::MILLI), + std::forward(args)...); + case cudf::type_id::DURATION_MICROSECONDS: + return std::make_shared(arrow::duration(arrow::TimeUnit::MICRO), + std::forward(args)...); + case cudf::type_id::DURATION_NANOSECONDS: + return std::make_shared(arrow::duration(arrow::TimeUnit::NANO), + std::forward(args)...); + default: CUDF_FAIL("Unsupported type_id conversion to arrow"); + } +} + extern "C" { JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_sequence( @@ -141,15 +199,27 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_fromArrow(JNIEnv* env, break; default: // this handles the primitive types - arrow_array = cudf::detail::to_arrow_array( - n_type, j_col_length, data_buffer, null_buffer, j_null_count); + arrow_array = to_arrow_array(n_type, j_col_length, data_buffer, null_buffer, j_null_count); } auto name_and_type = arrow::field("col", arrow_array->type()); std::vector> fields = {name_and_type}; std::shared_ptr schema = std::make_shared(fields); auto arrow_table = arrow::Table::Make(schema, std::vector>{arrow_array}); - auto retCols = cudf::from_arrow(*(arrow_table))->release(); + + ArrowSchema sch; + if (!arrow::ExportSchema(*arrow_table->schema(), &sch).ok()) { + JNI_THROW_NEW(env, "java/lang/RuntimeException", "Unable to produce an ArrowSchema", 0) + } + auto batch = arrow_table->CombineChunksToBatch().ValueOrDie(); + ArrowArray arr; + if (!arrow::ExportRecordBatch(*batch, &arr).ok()) { + JNI_THROW_NEW(env, "java/lang/RuntimeException", "Unable to produce an ArrowArray", 0) + } + auto retCols = cudf::from_arrow(&sch, &arr)->release(); + arr.release(&arr); + sch.release(&sch); + if (retCols.size() != 1) { JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Must result in one column", 0); } diff --git 
a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index ecc551f1143..c749c8c84bf 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -54,6 +54,8 @@ #include +#include +#include #include #include @@ -1069,6 +1071,15 @@ void append_flattened_child_names(cudf::io::column_name_info const& info, } } +// Recursively make schema and its children nullable +void set_nullable(ArrowSchema* schema) +{ + schema->flags |= ARROW_FLAG_NULLABLE; + for (int i = 0; i < schema->n_children; ++i) { + set_nullable(schema->children[i]); + } +} + } // namespace } // namespace jni @@ -2635,7 +2646,13 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_convertCudfToArrowTable(JNIEnv // The pointer to the shared_ptr<> is returned as a jlong. using result_t = std::shared_ptr; - auto result = cudf::to_arrow(*tview, state->get_column_metadata(*tview)); + auto got_arrow_schema = cudf::to_arrow_schema(*tview, state->get_column_metadata(*tview)); + cudf::jni::set_nullable(got_arrow_schema.get()); + auto got_arrow_array = cudf::to_arrow_host(*tview); + auto batch = + arrow::ImportRecordBatch(&got_arrow_array->array, got_arrow_schema.get()).ValueOrDie(); + auto result = arrow::Table::FromRecordBatches({batch}).ValueOrDie(); + return ptr_as_jlong(new result_t{result}); } CATCH_STD(env, 0) @@ -2746,7 +2763,21 @@ Java_ai_rapids_cudf_Table_convertArrowTableToCudf(JNIEnv* env, jclass, jlong arr try { cudf::jni::auto_set_device(env); - return convert_table_for_return(env, cudf::from_arrow(*(handle->get()))); + + ArrowSchema sch; + if (!arrow::ExportSchema(*handle->get()->schema(), &sch).ok()) { + JNI_THROW_NEW(env, "java/lang/RuntimeException", "Unable to produce an ArrowSchema", 0) + } + auto batch = handle->get()->CombineChunksToBatch().ValueOrDie(); + ArrowArray arr; + if (!arrow::ExportRecordBatch(*batch, &arr).ok()) { + JNI_THROW_NEW(env, "java/lang/RuntimeException", "Unable to produce an ArrowArray", 0) + } + auto ret = cudf::from_arrow(&sch, &arr); + arr.release(&arr); + sch.release(&sch); + + return convert_table_for_return(env, ret); } CATCH_STD(env, 0) } diff --git a/python/cudf/CMakeLists.txt b/python/cudf/CMakeLists.txt index e11d62b3bd5..72f20b30052 100644 --- a/python/cudf/CMakeLists.txt +++ b/python/cudf/CMakeLists.txt @@ -24,72 +24,15 @@ project( LANGUAGES CXX CUDA ) -option(FIND_CUDF_CPP "Search for existing CUDF C++ installations before defaulting to local files" - OFF -) -option(USE_LIBARROW_FROM_PYARROW "Only use the libarrow contained in pyarrow" OFF) -mark_as_advanced(USE_LIBARROW_FROM_PYARROW) - -# Find Python early so that later commands can use it -find_package(Python 3.9 REQUIRED COMPONENTS Interpreter) - -# If the user requested it we attempt to find CUDF. -if(FIND_CUDF_CPP) - include(rapids-cpm) - include(rapids-export) - include(rapids-find) - rapids_cpm_init() +find_package(cudf "${RAPIDS_VERSION}" REQUIRED) - if(USE_LIBARROW_FROM_PYARROW) - # We need to find arrow before libcudf since libcudf requires it but doesn't bundle arrow - # libraries. These variables have no effect because we are always searching for arrow via - # pyarrow, but they must be set as they are required arguments to the function in - # get_arrow.cmake. 
- set(CUDF_USE_ARROW_STATIC OFF) - set(CUDF_ENABLE_ARROW_S3 OFF) - set(CUDF_ENABLE_ARROW_ORC OFF) - set(CUDF_ENABLE_ARROW_PYTHON OFF) - set(CUDF_ENABLE_ARROW_PARQUET OFF) - include(../../cpp/cmake/thirdparty/get_arrow.cmake) - endif() - - find_package(cudf "${RAPIDS_VERSION}" REQUIRED) - - # an installed version of libcudf doesn't provide the dlpack headers so we need to download dlpack - # for the interop.pyx - include(../../cpp/cmake/thirdparty/get_dlpack.cmake) -else() - set(cudf_FOUND OFF) -endif() +# an installed version of libcudf doesn't provide the dlpack headers so we need to download dlpack +# for the interop.pyx +include(rapids-cpm) +rapids_cpm_init() +include(../../cpp/cmake/thirdparty/get_dlpack.cmake) include(rapids-cython-core) - -if(NOT cudf_FOUND) - set(BUILD_TESTS OFF) - set(BUILD_BENCHMARKS OFF) - set(CUDF_BUILD_TESTUTIL OFF) - set(CUDF_BUILD_STREAMS_TEST_UTIL OFF) - set(CUDA_STATIC_RUNTIME ON) - - add_subdirectory(../../cpp cudf-cpp EXCLUDE_FROM_ALL) - - # libcudf targets are excluded by default above via EXCLUDE_FROM_ALL to remove extraneous - # components like headers from libcudacxx, but we do need the libraries. However, we want to - # control where they are installed to. Since there are multiple subpackages of cudf._lib that - # require access to libcudf, we place the library and all its dependent artifacts in the cudf - # directory as a single source of truth and modify the other rpaths appropriately. - set(cython_lib_dir cudf) - include(../pylibcudf/cmake/Modules/WheelHelpers.cmake) - # TODO: This install is currently overzealous. We should only install the libraries that are - # downloaded by CPM during the build, not libraries that were found on the system. However, in - # practice right this would only be a problem is if libcudf was not found but some of the - # dependencies were, and we have no real use cases where that happens. - install_aliased_imported_targets( - TARGETS cudf arrow_shared nvcomp::nvcomp nvcomp::nvcomp_gdeflate nvcomp::nvcomp_bitcomp - DESTINATION ${cython_lib_dir} - ) -endif() - rapids_cython_init() include(../pylibcudf/cmake/Modules/LinkPyarrowHeaders.cmake) diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index ccc45413de4..d7da42a1708 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -1,5 +1,15 @@ # Copyright (c) 2018-2024, NVIDIA CORPORATION. +# If libcudf was installed as a wheel, we must request it to load the library symbols. +# Otherwise, we assume that the library was installed in a system path that ld can find. +try: + import libcudf +except ModuleNotFoundError: + pass +else: + libcudf.load_library() + del libcudf + # _setup_numba _must be called before numba.cuda is imported, because # it sets the numba config variable responsible for enabling # Minor Version Compatibility. Setting it after importing numba.cuda has no effect. 
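`load_library()` returns the `ctypes.CDLL` handle, or `None` when the wheel shipped no library and `ctypes` found no system copy either. A quick way to see which libcudf was actually loaded is a sketch like the following; `_name` is a ctypes implementation detail, used here purely for inspection:

import libcudf

handle = libcudf.load_library()
if handle is not None:
    # A bare "libcudf.so" means the system loader resolved it from its
    # default search path; an absolute path points into the wheel.
    print(handle._name)
else:
    # The wheel build was a no-op and ctypes found no libcudf.so; the C++
    # library must be resolvable by ld when the extension modules import.
    print("no libcudf.so preloaded")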
diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index d6182673308..5ea378fc0e5 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -63,6 +63,7 @@ rapids_cython_create_modules( ) target_link_libraries(strings_udf PUBLIC cudf_strings_udf) +target_include_directories(interop PUBLIC "$") set(targets_using_arrow_headers avro csv orc json parquet) link_to_pyarrow_headers("${targets_using_arrow_headers}") diff --git a/python/cudf/cudf/_lib/column.pyi b/python/cudf/cudf/_lib/column.pyi index bcab009c102..bb38488eefb 100644 --- a/python/cudf/cudf/_lib/column.pyi +++ b/python/cudf/cudf/_lib/column.pyi @@ -54,7 +54,7 @@ class Column: @property def mask_ptr(self) -> int: ... def set_base_mask(self, value: Buffer | None) -> None: ... - def set_mask(self, value: Buffer | None) -> Self: ... + def set_mask(self, value: ColumnBase | Buffer | None) -> Self: ... @property def null_count(self) -> int: ... @property diff --git a/python/cudf/cudf/core/_internals/timezones.py b/python/cudf/cudf/core/_internals/timezones.py index 29cb9d7bd12..fd89904e766 100644 --- a/python/cudf/cudf/core/_internals/timezones.py +++ b/python/cudf/cudf/core/_internals/timezones.py @@ -120,7 +120,7 @@ def _read_tzfile_as_columns( # this happens for UTC-like zones min_date = np.int64(np.iinfo("int64").min + 1).astype("M8[s]") - return (as_column([min_date]), as_column([np.timedelta64(0, "s")])) + return (as_column([min_date]), as_column([np.timedelta64(0, "s")])) # type: ignore[return-value] return tuple(transition_times_and_offsets) # type: ignore[return-value] diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 1fdaf9f8c07..a7e98e5218f 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -984,9 +984,9 @@ def find_and_replace( ) replacement_col = catmap._data["index"].astype(replaced.codes.dtype) - replaced = column.as_column(replaced.codes) + replaced_codes = column.as_column(replaced.codes) output = libcudf.replace.replace( - replaced, to_replace_col, replacement_col + replaced_codes, to_replace_col, replacement_col ) result = column.build_categorical_column( @@ -1064,7 +1064,7 @@ def _validate_fillna_value( raise TypeError( "Cannot set a categorical with non-categorical data" ) - fill_value = fill_value._set_categories( + fill_value = cast(CategoricalColumn, fill_value)._set_categories( self.categories, ) return fill_value.codes.astype(self.codes.dtype) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 27278120abb..60b4126ddd4 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -553,7 +553,7 @@ def __setitem__(self, key: Any, value: Any): """ # Normalize value to scalar/column - value_normalized = ( + value_normalized: cudf.Scalar | ColumnBase = ( cudf.Scalar(value, dtype=self.dtype) if is_scalar(value) else as_column(value, dtype=self.dtype) @@ -609,9 +609,12 @@ def _scatter_by_slice( ) # step != 1, create a scatter map with arange - scatter_map = as_column( - rng, - dtype=cudf.dtype(np.int32), + scatter_map = cast( + cudf.core.column.NumericalColumn, + as_column( + rng, + dtype=cudf.dtype(np.int32), + ), ) return self._scatter_by_column(scatter_map, value) @@ -1111,11 +1114,16 @@ def argsort( if (ascending and self.is_monotonic_increasing) or ( not ascending and self.is_monotonic_decreasing ): - return as_column(range(len(self))) + 
return cast( + cudf.core.column.NumericalColumn, as_column(range(len(self))) + ) elif (ascending and self.is_monotonic_decreasing) or ( not ascending and self.is_monotonic_increasing ): - return as_column(range(len(self) - 1, -1, -1)) + return cast( + cudf.core.column.NumericalColumn, + as_column(range(len(self) - 1, -1, -1)), + ) else: return libcudf.sort.order_by( [self], [ascending], na_position, stable=True @@ -1752,7 +1760,7 @@ def as_column( nan_as_null: bool | None = None, dtype: Dtype | None = None, length: int | None = None, -): +) -> ColumnBase: """Create a Column from an arbitrary object Parameters diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 302f04a0e71..c6a39199e3b 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -256,7 +256,10 @@ def from_sequences( offset += len(data) offset_vals.append(offset) - offset_col = column.as_column(offset_vals, dtype=size_type_dtype) + offset_col = cast( + NumericalColumn, + column.as_column(offset_vals, dtype=size_type_dtype), + ) # Build ListColumn res = cls( @@ -338,7 +341,7 @@ def __init__(self, parent: ParentType): def get( self, - index: int, + index: int | ColumnLike, default: ScalarLike | ColumnLike | None = None, ) -> ParentType: """ diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index a37355dfcda..90bec049831 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -142,7 +142,7 @@ def __setitem__(self, key: Any, value: Any): """ # Normalize value to scalar/column - device_value = ( + device_value: cudf.Scalar | ColumnBase = ( cudf.Scalar( value, dtype=self.dtype @@ -552,7 +552,7 @@ def _validate_fillna_value( ) -> cudf.Scalar | ColumnBase: """Align fill_value for .fillna based on column type.""" if is_scalar(fill_value): - cudf_obj = cudf.Scalar(fill_value) + cudf_obj: cudf.Scalar | ColumnBase = cudf.Scalar(fill_value) if not as_column(cudf_obj).can_cast_safely(self.dtype): raise TypeError( f"Cannot safely cast non-equivalent " diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 6f7508822d4..16e6908f308 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -776,11 +776,13 @@ def contains( # TODO: we silently ignore the `regex=` flag here if case is False: input_column = libstrings.to_lower(self._column) - pat = libstrings.to_lower(column.as_column(pat, dtype="str")) + col_pat = libstrings.to_lower( + column.as_column(pat, dtype="str") + ) else: input_column = self._column - pat = column.as_column(pat, dtype="str") - result_col = libstrings.contains_multiple(input_column, pat) + col_pat = column.as_column(pat, dtype="str") + result_col = libstrings.contains_multiple(input_column, col_pat) return self._return_or_inplace(result_col) def like(self, pat: str, esc: str | None = None) -> SeriesOrIndex: diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 7aa3e5f8163..34076fa0060 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -6,7 +6,7 @@ import sys from collections import abc from functools import cached_property, reduce -from typing import TYPE_CHECKING, Any, Callable, Mapping +from typing import TYPE_CHECKING, Any, Callable, Mapping, cast import numpy as np import pandas as pd @@ -35,7 +35,7 @@ class _NestedGetItemDict(dict): """ @classmethod - 
def from_zip(cls, data): + def from_zip(cls, data: abc.Iterator): """Create from zip, specialized factory for nesting.""" obj = cls() for key, value in data: @@ -91,12 +91,12 @@ class ColumnAccessor(abc.MutableMapping): column length and data.values() are all Columns """ - _data: dict[Any, ColumnBase] - _level_names: tuple[Any, ...] + _data: dict[abc.Hashable, ColumnBase] + _level_names: tuple[abc.Hashable, ...] def __init__( self, - data: abc.MutableMapping[Any, ColumnBase] | Self, + data: abc.MutableMapping[abc.Hashable, ColumnBase] | Self, multiindex: bool = False, level_names=None, rangeindex: bool = False, @@ -141,16 +141,16 @@ def __init__( f"data must be a ColumnAccessor or MutableMapping, not {type(data).__name__}" ) - def __iter__(self): + def __iter__(self) -> abc.Iterator: return iter(self._data) - def __getitem__(self, key: Any) -> ColumnBase: + def __getitem__(self, key: abc.Hashable) -> ColumnBase: return self._data[key] - def __setitem__(self, key: Any, value: ColumnBase) -> None: + def __setitem__(self, key: abc.Hashable, value: ColumnBase) -> None: self.set_by_label(key, value) - def __delitem__(self, key: Any) -> None: + def __delitem__(self, key: abc.Hashable) -> None: old_ncols = len(self._data) del self._data[key] new_ncols = len(self._data) @@ -186,7 +186,7 @@ def _from_columns_like_self( Whether to verify column length and type. """ if sys.version_info.major >= 3 and sys.version_info.minor >= 10: - data = zip(self.names, columns, strict=True) + data = zip(self.names, columns, strict=True) # type: ignore[call-overload] else: columns = list(columns) if len(columns) != len(self.names): @@ -205,7 +205,7 @@ def _from_columns_like_self( ) @property - def level_names(self) -> tuple[Any, ...]: + def level_names(self) -> tuple[abc.Hashable, ...]: if self._level_names is None or len(self._level_names) == 0: return tuple((None,) * max(1, self.nlevels)) else: @@ -221,7 +221,7 @@ def nlevels(self) -> int: return len(next(iter(self.keys()))) @property - def name(self) -> Any: + def name(self) -> abc.Hashable: return self.level_names[-1] @cached_property @@ -232,7 +232,7 @@ def nrows(self) -> int: return len(next(iter(self.values()))) @cached_property - def names(self) -> tuple[Any, ...]: + def names(self) -> tuple[abc.Hashable, ...]: return tuple(self.keys()) @cached_property @@ -291,7 +291,7 @@ def to_pandas_index(self) -> pd.Index: ) elif cudf.api.types.infer_dtype(self.names) == "integer": if len(self.names) == 1: - start = self.names[0] + start = cast(int, self.names[0]) return pd.RangeIndex( start=start, stop=start + 1, step=1, name=self.name ) @@ -299,7 +299,9 @@ def to_pandas_index(self) -> pd.Index: if len(uniques) == 1 and uniques[0] != 0: diff = uniques[0] new_range = range( - self.names[0], self.names[-1] + diff, diff + cast(int, self.names[0]), + cast(int, self.names[-1]) + diff, + diff, ) return pd.RangeIndex(new_range, name=self.name) result = pd.Index( @@ -310,7 +312,9 @@ def to_pandas_index(self) -> pd.Index: ) return result - def insert(self, name: Any, value: ColumnBase, loc: int = -1) -> None: + def insert( + self, name: abc.Hashable, value: ColumnBase, loc: int = -1 + ) -> None: """ Insert column into the ColumnAccessor at the specified location. @@ -457,7 +461,7 @@ def select_by_index(self, index: Any) -> Self: verify=False, ) - def swaplevel(self, i=-2, j=-1) -> Self: + def swaplevel(self, i: abc.Hashable = -2, j: abc.Hashable = -1) -> Self: """ Swap level i with level j. Calling this method does not change the ordering of the values. 
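The docstring's contract is the usual pandas one, which this accessor mirrors: swapping levels permutes key components and level names but leaves column order alone. A small pandas illustration of the same semantics:

import pandas as pd

mi = pd.MultiIndex.from_tuples([("a", 1), ("b", 2)], names=["outer", "inner"])
swapped = mi.swaplevel(0, 1)
print(list(swapped))  # [(1, 'a'), (2, 'b')]: same order, components swapped
print(swapped.names)  # ['inner', 'outer']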
@@ -486,7 +490,7 @@ def swaplevel(self, i=-2, j=-1) -> Self: # swap old keys for i and j for n, row in enumerate(self.names): - new_keys[n][i], new_keys[n][j] = row[j], row[i] + new_keys[n][i], new_keys[n][j] = row[j], row[i] # type: ignore[call-overload, index] new_dict.update({row: tuple(new_keys[n])}) # TODO: Change to deep=False when copy-on-write is default @@ -494,10 +498,10 @@ def swaplevel(self, i=-2, j=-1) -> Self: # swap level_names for i and j new_names = list(self.level_names) - new_names[i], new_names[j] = new_names[j], new_names[i] + new_names[i], new_names[j] = new_names[j], new_names[i] # type: ignore[call-overload] return type(self)( - new_data, + new_data, # type: ignore[arg-type] multiindex=self.multiindex, level_names=new_names, rangeindex=self.rangeindex, @@ -505,7 +509,7 @@ def swaplevel(self, i=-2, j=-1) -> Self: verify=False, ) - def set_by_label(self, key: Any, value: ColumnBase) -> None: + def set_by_label(self, key: abc.Hashable, value: ColumnBase) -> None: """ Add (or modify) column by name. @@ -555,7 +559,7 @@ def _select_by_label_list_like(self, key: tuple) -> Self: verify=False, ) - def _select_by_label_grouped(self, key: Any) -> Self: + def _select_by_label_grouped(self, key: abc.Hashable) -> Self: result = self._grouped_data[key] if isinstance(result, column.ColumnBase): # self._grouped_data[key] = self._data[key] so skip validation @@ -606,8 +610,12 @@ def _select_by_label_slice(self, key: slice) -> Self: ) def _select_by_label_with_wildcard(self, key: tuple) -> Self: - key = self._pad_key(key, slice(None)) - data = {k: self._data[k] for k in self.names if _keys_equal(k, key)} + pad_key = self._pad_key(key, slice(None)) + data = { + k: self._data[k] + for k in self.names + if _keys_equal(k, pad_key) # type: ignore[arg-type] + } return type(self)( data, multiindex=self.multiindex, @@ -616,7 +624,9 @@ def _select_by_label_with_wildcard(self, key: tuple) -> Self: verify=False, ) - def _pad_key(self, key: Any, pad_value="") -> Any: + def _pad_key( + self, key: abc.Hashable, pad_value: str | slice = "" + ) -> abc.Hashable: """ Pad the provided key to a length equal to the number of levels. @@ -628,7 +638,9 @@ def _pad_key(self, key: Any, pad_value="") -> Any: return key + (pad_value,) * (self.nlevels - len(key)) def rename_levels( - self, mapper: Mapping[Any, Any] | Callable, level: int | None = None + self, + mapper: Mapping[abc.Hashable, abc.Hashable] | Callable, + level: int | None = None, ) -> Self: """ Rename the specified levels of the given ColumnAccessor @@ -701,14 +713,14 @@ def rename_column(x): verify=False, ) - def droplevel(self, level) -> None: + def droplevel(self, level: int) -> None: # drop the nth level if level < 0: level += self.nlevels old_ncols = len(self._data) self._data = { - _remove_key_level(key, level): value + _remove_key_level(key, level): value # type: ignore[arg-type] for key, value in self._data.items() } new_ncols = len(self._data) @@ -722,7 +734,7 @@ def droplevel(self, level) -> None: self._clear_cache(old_ncols, new_ncols) -def _keys_equal(target: Any, key: Any) -> bool: +def _keys_equal(target: abc.Hashable, key: abc.Iterable) -> bool: """ Compare `key` to `target`. @@ -740,7 +752,7 @@ def _keys_equal(target: Any, key: Any) -> bool: return True -def _remove_key_level(key: Any, level: int) -> Any: +def _remove_key_level(key: tuple, level: int) -> abc.Hashable: """ Remove a level from key. If detupleize is True, and if only a single level remains, convert the tuple to a scalar. 
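For reference, the padding rule that `_pad_key` applies (its body is shown in the hunk above) is simple enough to run standalone; this sketch just re-applies the same expression outside the class:

key, nlevels, pad_value = ("a",), 3, slice(None)
padded = key + (pad_value,) * (nlevels - len(key))
print(padded)  # ('a', slice(None, None, None), slice(None, None, None))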
@@ -751,7 +763,9 @@ def _remove_key_level(key: Any, level: int) -> Any: return result -def _get_level(x, nlevels, level_names): +def _get_level( + x: abc.Hashable, nlevels: int, level_names: tuple[abc.Hashable, ...] +) -> abc.Hashable: """Get the level index from a level number or name. If given an integer, this function will handle wraparound for diff --git a/python/cudf/cudf/core/copy_types.py b/python/cudf/cudf/core/copy_types.py index 6afbc0bbc65..16d8964f083 100644 --- a/python/cudf/cudf/core/copy_types.py +++ b/python/cudf/cudf/core/copy_types.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. from dataclasses import dataclass from typing import TYPE_CHECKING, Any, cast @@ -44,15 +44,17 @@ class GatherMap: If the map is not in bounds. """ - #: The gather map - column: "NumericalColumn" #: The number of rows the gather map has been validated for nrows: int #: Was the validation for nullify=True? nullify: bool def __init__(self, column: Any, nrows: int, *, nullify: bool): - self.column = cudf.core.column.as_column(column) + #: The gather map + self.column = cast( + cudf.core.column.NumericalColumn, + cudf.core.column.as_column(column), + ) self.nrows = nrows self.nullify = nullify if len(self.column) == 0: @@ -135,11 +137,12 @@ class BooleanMask: If the mask has the wrong number of rows """ - #: The boolean mask - column: "NumericalColumn" - def __init__(self, column: Any, nrows: int): - self.column = cudf.core.column.as_column(column) + #: The boolean mask + self.column = cast( + cudf.core.column.NumericalColumn, + cudf.core.column.as_column(column), + ) if self.column.dtype.kind != "b": raise TypeError("Boolean mask must have bool dtype") if len(column) != nrows: diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 43693ec20b1..14b63c2b0d7 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5830,7 +5830,7 @@ def from_records( df = cls._from_data( ColumnAccessor( - data=ca_data, + data=ca_data, # type: ignore[arg-type] multiindex=isinstance( columns, (pd.MultiIndex, cudf.MultiIndex) ), diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index e46e24dd0d8..60253b9ae5d 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -40,7 +40,7 @@ from cudf.core._base_index import BaseIndex from cudf.core._compat import PANDAS_LT_300 from cudf.core.buffer import acquire_spill_lock -from cudf.core.column import ColumnBase, as_column +from cudf.core.column import ColumnBase, NumericalColumn, as_column from cudf.core.column_accessor import ColumnAccessor from cudf.core.copy_types import BooleanMask, GatherMap from cudf.core.dtypes import ListDtype @@ -3008,9 +3008,12 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self: if stride != 1: return self._gather( GatherMap.from_column_unchecked( - as_column( - range(start, stop, stride), - dtype=libcudf.types.size_type_dtype, + cast( + NumericalColumn, + as_column( + range(start, stop, stride), + dtype=libcudf.types.size_type_dtype, + ), ), len(self), nullify=False, @@ -4761,10 +4764,13 @@ def _sample_axis_0( ): try: gather_map = GatherMap.from_column_unchecked( - cudf.core.column.as_column( - random_state.choice( - len(self), size=n, replace=replace, p=weights - ) + cast( + NumericalColumn, + cudf.core.column.as_column( + random_state.choice( + len(self), size=n, replace=replace, p=weights + ) + ), ), len(self), 
nullify=False, @@ -6599,7 +6605,7 @@ def _drop_rows_by_labels( level = 0 levels_index = obj.index.get_level_values(level) - if errors == "raise" and not labels.isin(levels_index).all(): + if errors == "raise" and not labels.isin(levels_index).all(): # type: ignore[union-attr] raise KeyError("One or more values not found in axis") if isinstance(level, int): @@ -6649,7 +6655,7 @@ def _drop_rows_by_labels( ) else: - if errors == "raise" and not labels.isin(obj.index).all(): + if errors == "raise" and not labels.isin(obj.index).all(): # type: ignore[union-attr] raise KeyError("One or more values not found in axis") if isinstance(labels, ColumnBase): diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index fac51a9e471..560f257c115 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -527,6 +527,7 @@ def read_parquet( engine="cudf", columns=None, storage_options=None, + filesystem=None, filters=None, row_groups=None, use_pandas_metadata=True, @@ -567,7 +568,9 @@ def read_parquet( # Start by trying construct a filesystem object, so we # can apply filters on remote file-systems fs, paths = ioutils._get_filesystem_and_paths( - path_or_data=filepath_or_buffer, storage_options=storage_options + path_or_data=filepath_or_buffer, + storage_options=storage_options, + filesystem=filesystem, ) # Normalize and validate filters diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py index 6579fd23634..3b23a53091e 100644 --- a/python/cudf/cudf/tests/test_s3.py +++ b/python/cudf/cudf/tests/test_s3.py @@ -269,6 +269,28 @@ def test_read_parquet_ext( assert_eq(expect, got1) +def test_read_parquet_filesystem(s3_base, s3so, pdf): + fname = "data.0.parquet" + # NOTE: Need a unique bucket name when a glob pattern + # is used, otherwise fsspec seems to cache the bucket + # contents, and later tests using the same bucket name + # will fail. + bucket = "test_read_parquet_filesystem" + buffer = BytesIO() + pdf.to_parquet(path=buffer) + buffer.seek(0) + fs = get_fs_token_paths("s3://", mode="rb", storage_options=s3so)[0] + with s3_context( + s3_base=s3_base, + bucket=bucket, + files={fname: buffer}, + ): + # Check that a glob pattern works + path = f"s3://{bucket}/{'data.*.parquet'}" + got = cudf.read_parquet(path, filesystem=fs) + assert_eq(pdf, got) + + def test_read_parquet_multi_file(s3_base, s3so, pdf): fname_1 = "test_parquet_reader_multi_file_1.parquet" buffer_1 = BytesIO() diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 4ac9b63985f..18106e7475b 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -12,7 +12,7 @@ import fsspec.implementations.local import numpy as np import pandas as pd -from fsspec.core import get_fs_token_paths +from fsspec.core import expand_paths_if_needed, get_fs_token_paths from cudf.core._compat import PANDAS_LT_300 from cudf.utils.docutils import docfmt_partial @@ -139,6 +139,9 @@ For other URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more details. +filesystem : fsspec.AbstractFileSystem, default None + Filesystem object to use when reading the parquet data. This argument + should not be used at the same time as `storage_options`. filters : list of tuple, list of lists of tuples, default None If not None, specifies a filter predicate used to filter out row groups using statistics stored for each row group as Parquet metadata. 
Row groups @@ -1536,11 +1539,18 @@ def is_directory(path_or_data, storage_options=None): return False -def _get_filesystem_and_paths(path_or_data, storage_options): +def _get_filesystem_and_paths( + path_or_data, + storage_options, + *, + filesystem=None, +): # Returns a filesystem object and the filesystem-normalized # paths. If `path_or_data` does not correspond to a path or # list of paths (or if the protocol is not supported), the # return will be `None` for the fs and `[]` for the paths. + # If a filesystem object is already available, it can be + # passed with the `filesystem` argument. fs = None return_paths = path_or_data @@ -1557,16 +1567,36 @@ def _get_filesystem_and_paths(path_or_data, storage_options): else: path_or_data = [path_or_data] - try: - fs, _, fs_paths = get_fs_token_paths( - path_or_data, mode="rb", storage_options=storage_options - ) - return_paths = fs_paths - except ValueError as e: - if str(e).startswith("Protocol not known"): - return None, [] - else: - raise e + if filesystem is None: + try: + fs, _, fs_paths = get_fs_token_paths( + path_or_data, mode="rb", storage_options=storage_options + ) + return_paths = fs_paths + except ValueError as e: + if str(e).startswith("Protocol not known"): + return None, [] + else: + raise e + else: + if not isinstance(filesystem, fsspec.AbstractFileSystem): + raise ValueError( + f"Expected fsspec.AbstractFileSystem. Got {filesystem}" + ) + + if storage_options: + raise ValueError( + f"Cannot specify storage_options when an explicit " + f"filesystem object is specified. Got: {storage_options}" + ) + + fs = filesystem + return_paths = [ + fs._strip_protocol(u) + for u in expand_paths_if_needed( + path_or_data, "rb", 1, fs, None + ) + ] return fs, return_paths diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 9db52164eca..cb9fa30afab 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -23,6 +23,7 @@ dependencies = [ "cuda-python>=11.7.1,<12.0a0", "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", + "libcudf==24.10.*,>=0.0.0a0", "numba>=0.57", "numpy>=1.23,<2.0a0", "nvtx>=0.2.1", @@ -126,6 +127,8 @@ matrix-entry = "cuda_suffixed=true" requires = [ "cmake>=3.26.4,!=3.30.0", "cython>=3.0.3", + "libcudf==24.10.*,>=0.0.0a0", + "librmm==24.10.*,>=0.0.0a0", "ninja", "numpy==1.23.*", "pyarrow==16.1.0.*", diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index 16b2c8959e2..5bd3eb5fa7f 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -134,6 +134,8 @@ def _get_non_empty_data( return cudf.core.column.as_column( np.arange(start=0, stop=2, dtype=s.dtype) ) + elif isinstance(s.dtype, cudf.core.dtypes.DecimalDtype): + return cudf.core.column.as_column(range(2), dtype=s.dtype) else: raise TypeError( f"Don't know how to handle column of type {type(s).__name__}" diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index 8f52fce7818..c025280c240 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -23,11 +23,7 @@ from cudf.io import write_to_dataset from cudf.io.parquet import _apply_post_filters, _normalize_filters from cudf.utils.dtypes import cudf_dtype_from_pa_type -from cudf.utils.ioutils import ( - _ROW_GROUP_SIZE_BYTES_DEFAULT, - _fsspec_data_transfer, - _is_local_filesystem, -) +from cudf.utils.ioutils import _ROW_GROUP_SIZE_BYTES_DEFAULT class CudfEngine(ArrowDatasetEngine): @@ -93,40 +89,35 @@ def _read_paths( 
dataset_kwargs = dataset_kwargs or {} dataset_kwargs["partitioning"] = partitioning or "hive" - # Non-local filesystem handling - paths_or_fobs = paths - if not _is_local_filesystem(fs): - paths_or_fobs = [ - _fsspec_data_transfer(fpath, fs=fs) for fpath in paths - ] - # Use cudf to read in data try: df = cudf.read_parquet( - paths_or_fobs, + paths, engine="cudf", columns=columns, row_groups=row_groups if row_groups else None, dataset_kwargs=dataset_kwargs, categorical_partitions=False, + filesystem=fs, **kwargs, ) except RuntimeError as err: # TODO: Remove try/except after null-schema issue is resolved # (See: https://github.com/rapidsai/cudf/issues/12702) - if len(paths_or_fobs) > 1: + if len(paths) > 1: df = cudf.concat( [ cudf.read_parquet( - pof, + path, engine="cudf", columns=columns, row_groups=row_groups[i] if row_groups else None, dataset_kwargs=dataset_kwargs, categorical_partitions=False, + filesystem=fs, **kwargs, ) - for i, pof in enumerate(paths_or_fobs) + for i, path in enumerate(paths) ] ) else: diff --git a/python/dask_cudf/dask_cudf/tests/test_join.py b/python/dask_cudf/dask_cudf/tests/test_join.py index ed291ef31a7..3e078c47cdd 100644 --- a/python/dask_cudf/dask_cudf/tests/test_join.py +++ b/python/dask_cudf/dask_cudf/tests/test_join.py @@ -386,3 +386,14 @@ def test_issue_12773(): expected.to_pandas(), check_index=False, ) + + +@pytest.mark.parametrize( + "typ", [cudf.Decimal32Dtype, cudf.Decimal64Dtype, cudf.Decimal128Dtype] +) +def test_merge_on_decimal(typ): + df = cudf.DataFrame({"a": [1], "b": [2]}, dtype=typ(1)) + ddf = dask_cudf.from_cudf(df, npartitions=1) + result = ddf.merge(ddf, left_on="a", right_on="a") + expected = df.merge(df, left_on="a", right_on="a") + dd.assert_eq(result, expected) diff --git a/python/libcudf/CMakeLists.txt b/python/libcudf/CMakeLists.txt new file mode 100644 index 00000000000..09c7ed2e217 --- /dev/null +++ b/python/libcudf/CMakeLists.txt @@ -0,0 +1,58 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR) + +include(../../rapids_config.cmake) + +project( + libcudf-python + VERSION "${RAPIDS_VERSION}" + LANGUAGES CXX +) + +# Check if cudf is already available. If so, it is the user's responsibility to ensure that the +# CMake package is also available at build time of the Python cudf package. 
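If the `find_package` call below succeeds, the rest of this file is intentionally skipped: the wheel then ships no `libcudf.so`, and `libcudf.load_library()` (added later in this diff) either preloads the system copy via ctypes or returns `None`, leaving symbol resolution to the regular loader.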
+find_package(cudf "${RAPIDS_VERSION}") + +if(cudf_FOUND) + return() +endif() + +unset(cudf_FOUND) + +# For wheels, this should always be true +set(USE_LIBARROW_FROM_PYARROW ON) + +# Find Python early so that later commands can use it +find_package(Python 3.10 REQUIRED COMPONENTS Interpreter) + +set(BUILD_TESTS OFF) +set(BUILD_BENCHMARKS OFF) +set(CUDF_BUILD_TESTUTIL OFF) +set(CUDF_BUILD_STREAMS_TEST_UTIL OFF) +set(CUDA_STATIC_RUNTIME ON) + +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) + +include(../pylibcudf/cmake/Modules/LinkPyarrowHeaders.cmake) + +add_subdirectory(../../cpp cudf-cpp) + +# Ensure other libraries needed by libcudf.so get installed alongside it. +include(cmake/Modules/WheelHelpers.cmake) +install_aliased_imported_targets( + TARGETS cudf arrow_shared nvcomp::nvcomp nvcomp::nvcomp_gdeflate nvcomp::nvcomp_bitcomp + DESTINATION ${CMAKE_LIBRARY_OUTPUT_DIRECTORY} +) diff --git a/python/libcudf/LICENSE b/python/libcudf/LICENSE new file mode 120000 index 00000000000..30cff7403da --- /dev/null +++ b/python/libcudf/LICENSE @@ -0,0 +1 @@ +../../LICENSE \ No newline at end of file diff --git a/python/libcudf/README.md b/python/libcudf/README.md new file mode 120000 index 00000000000..fe840054137 --- /dev/null +++ b/python/libcudf/README.md @@ -0,0 +1 @@ +../../README.md \ No newline at end of file diff --git a/python/pylibcudf/cmake/Modules/WheelHelpers.cmake b/python/libcudf/cmake/Modules/WheelHelpers.cmake similarity index 100% rename from python/pylibcudf/cmake/Modules/WheelHelpers.cmake rename to python/libcudf/cmake/Modules/WheelHelpers.cmake diff --git a/python/libcudf/libcudf/VERSION b/python/libcudf/libcudf/VERSION new file mode 120000 index 00000000000..d62dc733efd --- /dev/null +++ b/python/libcudf/libcudf/VERSION @@ -0,0 +1 @@ +../../../VERSION \ No newline at end of file diff --git a/python/libcudf/libcudf/__init__.py b/python/libcudf/libcudf/__init__.py new file mode 100644 index 00000000000..10c476cbe89 --- /dev/null +++ b/python/libcudf/libcudf/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from libcudf._version import __git_commit__, __version__ +from libcudf.load import load_library diff --git a/python/libcudf/libcudf/_version.py b/python/libcudf/libcudf/_version.py new file mode 100644 index 00000000000..7dd732b4905 --- /dev/null +++ b/python/libcudf/libcudf/_version.py @@ -0,0 +1,33 @@ +# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import importlib.resources + +__version__ = ( + importlib.resources.files(__package__) + .joinpath("VERSION") + .read_text() + .strip() +) +try: + __git_commit__ = ( + importlib.resources.files(__package__) + .joinpath("GIT_COMMIT") + .read_text() + .strip() + ) +except FileNotFoundError: + __git_commit__ = "" + +__all__ = ["__git_commit__", "__version__"] diff --git a/python/libcudf/libcudf/load.py b/python/libcudf/libcudf/load.py new file mode 100644 index 00000000000..f6ba0d51bdb --- /dev/null +++ b/python/libcudf/libcudf/load.py @@ -0,0 +1,51 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import ctypes +import os + + +def load_library(): + # This is loading the libarrow shared library in situations where it comes from the + # pyarrow package (i.e. when installed as a wheel). + import pyarrow # noqa: F401 + + # Dynamically load libcudf.so. Prefer a system library if one is present to + # avoid clobbering symbols that other packages might expect, but if no + # other library is present use the one in the wheel. + libcudf_lib = None + try: + libcudf_lib = ctypes.CDLL("libcudf.so", ctypes.RTLD_GLOBAL) + except OSError: + # If neither of these directories contain the library, we assume we are in an + # environment where the C++ library is already installed somewhere else and the + # CMake build of the libcudf Python package was a no-op. + # + # Note that this approach won't work for real editable installs of the libcudf package. + # scikit-build-core has limited support for importlib.resources so there isn't a clean + # way to support that case yet. + for lib_dir in ("lib", "lib64"): + if os.path.isfile( + lib := os.path.join( + os.path.dirname(__file__), lib_dir, "libcudf.so" + ) + ): + libcudf_lib = ctypes.CDLL(lib, ctypes.RTLD_GLOBAL) + break + + # The caller almost never needs to do anything with this library, but no + # harm in offering the option since this object at least provides a handle + # to inspect where libcudf was loaded from. + return libcudf_lib diff --git a/python/libcudf/pyproject.toml b/python/libcudf/pyproject.toml new file mode 100644 index 00000000000..fd01f7f6e2f --- /dev/null +++ b/python/libcudf/pyproject.toml @@ -0,0 +1,75 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+[build-system]
+build-backend = "rapids_build_backend.build"
+requires = [
+    "rapids-build-backend>=0.3.0,<0.4.0.dev0",
+    "scikit-build-core[pyproject]>=0.10.0",
+] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
+
+[project]
+name = "libcudf"
+dynamic = ["version"]
+description = "cuDF - GPU Dataframe (C++)"
+readme = { file = "README.md", content-type = "text/markdown" }
+authors = [
+    { name = "NVIDIA Corporation" },
+]
+license = { text = "Apache 2.0" }
+requires-python = ">=3.10"
+classifiers = [
+    "Intended Audience :: Developers",
+    "Topic :: Database",
+    "Topic :: Scientific/Engineering",
+    "License :: OSI Approved :: Apache Software License",
+    "Programming Language :: C++",
+    "Environment :: GPU :: NVIDIA CUDA",
+]
+dependencies = [
+    "pyarrow>=16.1.0,<16.2.0a0",
+] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
+
+[project.urls]
+Homepage = "https://github.com/rapidsai/cudf"
+
+[project.entry-points."cmake.prefix"]
+libcudf = "libcudf"
+
+[tool.scikit-build]
+build-dir = "build/{wheel_tag}"
+cmake.build-type = "Release"
+cmake.version = "CMakeLists.txt"
+minimum-version = "build-system.requires"
+ninja.make-fallback = true
+sdist.reproducible = true
+wheel.packages = ["libcudf"]
+wheel.install-dir = "libcudf"
+wheel.py-api = "py3"
+
+[tool.scikit-build.metadata.version]
+provider = "scikit_build_core.metadata.regex"
+input = "libcudf/VERSION"
+regex = "(?P<value>.*)"
+
+[tool.rapids-build-backend]
+build-backend = "scikit_build_core.build"
+dependencies-file = "../../dependencies.yaml"
+matrix-entry = "cuda_suffixed=true"
+requires = [
+    "cmake>=3.26.4,!=3.30.0",
+    "librmm==24.10.*,>=0.0.0a0",
+    "ninja",
+    "pyarrow==16.1.0.*",
+] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
diff --git a/python/pylibcudf/CMakeLists.txt b/python/pylibcudf/CMakeLists.txt
index 424d8372280..340ad120377 100644
--- a/python/pylibcudf/CMakeLists.txt
+++ b/python/pylibcudf/CMakeLists.txt
@@ -24,72 +24,16 @@ project(
   LANGUAGES CXX CUDA
 )
-option(FIND_CUDF_CPP "Search for existing CUDF C++ installations before defaulting to local files"
-       OFF
-)
-option(USE_LIBARROW_FROM_PYARROW "Only use the libarrow contained in pyarrow" OFF)
-mark_as_advanced(USE_LIBARROW_FROM_PYARROW)
-
-# Find Python early so that later commands can use it
-find_package(Python 3.9 REQUIRED COMPONENTS Interpreter)
-
-# If the user requested it we attempt to find CUDF.
-if(FIND_CUDF_CPP)
-  include(rapids-cpm)
-  include(rapids-export)
-  include(rapids-find)
-  rapids_cpm_init()
-
-  if(USE_LIBARROW_FROM_PYARROW)
-    # We need to find arrow before libcudf since libcudf requires it but doesn't bundle arrow
-    # libraries. These variables have no effect because we are always searching for arrow via
-    # pyarrow, but they must be set as they are required arguments to the function in
-    # get_arrow.cmake.
-    set(CUDF_USE_ARROW_STATIC OFF)
-    set(CUDF_ENABLE_ARROW_S3 OFF)
-    set(CUDF_ENABLE_ARROW_ORC OFF)
-    set(CUDF_ENABLE_ARROW_PYTHON OFF)
-    set(CUDF_ENABLE_ARROW_PARQUET OFF)
-    include(../../cpp/cmake/thirdparty/get_arrow.cmake)
-  endif()
-
-  find_package(cudf "${RAPIDS_VERSION}" REQUIRED)
+find_package(cudf "${RAPIDS_VERSION}" REQUIRED)

-  # an installed version of libcudf doesn't provide the dlpack headers so we need to download dlpack
-  # for the interop.pyx
-  include(../../cpp/cmake/thirdparty/get_dlpack.cmake)
-else()
-  set(cudf_FOUND OFF)
-endif()
+# An installed version of libcudf doesn't provide the dlpack headers, so we need to download
+# dlpack for interop.pyx.
+include(rapids-cpm)
+rapids_cpm_init()
+include(../../cpp/cmake/thirdparty/get_dlpack.cmake)

 include(rapids-cython-core)

-if(NOT cudf_FOUND)
-  set(BUILD_TESTS OFF)
-  set(BUILD_BENCHMARKS OFF)
-  set(CUDF_BUILD_TESTUTIL OFF)
-  set(CUDF_BUILD_STREAMS_TEST_UTIL OFF)
-  set(CUDA_STATIC_RUNTIME ON)
-
-  add_subdirectory(../../cpp cudf-cpp EXCLUDE_FROM_ALL)
-
-  # libcudf targets are excluded by default above via EXCLUDE_FROM_ALL to remove extraneous
-  # components like headers from libcudacxx, but we do need the libraries. However, we want to
-  # control where they are installed to. Since there are multiple subpackages of pylibcudf that
-  # require access to libcudf, we place the library and all its dependent artifacts in the cudf
-  # directory as a single source of truth and modify the other rpaths appropriately.
-  set(cython_lib_dir pylibcudf)
-  include(cmake/Modules/WheelHelpers.cmake)
-  # TODO: This install is currently overzealous. We should only install the libraries that are
-  # downloaded by CPM during the build, not libraries that were found on the system. However, in
-  # practice right now this would only be a problem if libcudf was not found but some of the
-  # dependencies were, and we have no real use cases where that happens.
-  install_aliased_imported_targets(
-    TARGETS cudf arrow_shared nvcomp::nvcomp nvcomp::nvcomp_gdeflate nvcomp::nvcomp_bitcomp
-    DESTINATION ${cython_lib_dir}
-  )
-endif()
-
 rapids_cython_init()

 include(cmake/Modules/LinkPyarrowHeaders.cmake)
diff --git a/python/pylibcudf/pylibcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/CMakeLists.txt
index ab21bfe97ab..f81a32e07f9 100644
--- a/python/pylibcudf/pylibcudf/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/CMakeLists.txt
@@ -53,6 +53,8 @@ rapids_cython_create_modules(
   LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_ ASSOCIATED_TARGETS cudf
 )
+# Expose the CPM-fetched dlpack headers to the interop module.
+target_include_directories(pylibcudf_interop PUBLIC "$<BUILD_INTERFACE:${DLPACK_INCLUDE_DIR}>")
+
 include(${rapids-cmake-dir}/export/find_package_root.cmake)
 include(../../../cpp/cmake/thirdparty/get_nanoarrow.cmake)
 target_link_libraries(pylibcudf_interop PUBLIC nanoarrow)
diff --git a/python/pylibcudf/pylibcudf/__init__.py b/python/pylibcudf/pylibcudf/__init__.py
index 677fdaf80d0..e784c6c6dd5 100644
--- a/python/pylibcudf/pylibcudf/__init__.py
+++ b/python/pylibcudf/pylibcudf/__init__.py
@@ -1,5 +1,15 @@
 # Copyright (c) 2023-2024, NVIDIA CORPORATION.

+# If libcudf was installed as a wheel, we must explicitly ask it to load the library symbols.
+# Otherwise, we assume the library was installed in a system path that the loader can find.
+try:
+    import libcudf
+except ModuleNotFoundError:
+    pass
+else:
+    libcudf.load_library()
+    del libcudf
+
 from . import (
     aggregation,
     binaryop,
diff --git a/python/pylibcudf/pylibcudf/io/json.pxd b/python/pylibcudf/pylibcudf/io/json.pxd
index ab9b5b99ce2..f65c1034598 100644
--- a/python/pylibcudf/pylibcudf/io/json.pxd
+++ b/python/pylibcudf/pylibcudf/io/json.pxd
@@ -15,8 +15,8 @@ cpdef TableWithMetadata read_json(
     list dtypes = *,
     compression_type compression = *,
     bool lines = *,
-    size_type byte_range_offset = *,
-    size_type byte_range_size = *,
+    size_t byte_range_offset = *,
+    size_t byte_range_size = *,
     bool keep_quotes = *,
     bool mixed_types_as_string = *,
     bool prune_columns = *,
diff --git a/python/pylibcudf/pylibcudf/io/json.pyx b/python/pylibcudf/pylibcudf/io/json.pyx
index ce086f4a489..29e49083bc6 100644
--- a/python/pylibcudf/pylibcudf/io/json.pyx
+++ b/python/pylibcudf/pylibcudf/io/json.pyx
@@ -51,8 +51,8 @@ cdef json_reader_options _setup_json_reader_options(
     list dtypes,
     compression_type compression,
     bool lines,
-    size_type byte_range_offset,
-    size_type byte_range_size,
+    size_t byte_range_offset,
+    size_t byte_range_size,
     bool keep_quotes,
     bool mixed_types_as_string,
     bool prune_columns,
@@ -189,8 +189,8 @@ cpdef TableWithMetadata read_json(
     list dtypes = None,
     compression_type compression = compression_type.AUTO,
     bool lines = False,
-    size_type byte_range_offset = 0,
-    size_type byte_range_size = 0,
+    size_t byte_range_offset = 0,
+    size_t byte_range_size = 0,
     bool keep_quotes = False,
     bool mixed_types_as_string = False,
     bool prune_columns = False,
@@ -212,9 +212,9 @@ cpdef TableWithMetadata read_json(
         (column_child_name, column_child_type, list of grandchild dtypes).
     compression: CompressionType, default CompressionType.AUTO
         The compression format of the JSON source.
-    byte_range_offset : size_type, default 0
+    byte_range_offset : size_t, default 0
         Number of bytes to skip from source start.
-    byte_range_size : size_type, default 0
+    byte_range_size : size_t, default 0
         Number of bytes to read. By default, will read all bytes.
     keep_quotes : bool, default False
         Whether the reader should keep quotes of string values.
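Why these hunks switch from size_type to size_t: cudf::size_type is a 32-bit int, so byte
ranges pointing more than 2 GiB into a JSON source would overflow it. A minimal usage sketch,
assuming the wheels above are installed (the path and offsets are illustrative):

    import pylibcudf as plc

    # Read a 1 MiB window starting 3 GiB into a JSON-lines file; the offset no
    # longer has to fit in a 32-bit size_type.
    source = plc.io.SourceInfo(["/tmp/big.jsonl"])
    result = plc.io.json.read_json(
        source,
        lines=True,
        byte_range_offset=3 * 1024**3,
        byte_range_size=1024**2,
    )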
diff --git a/python/pylibcudf/pylibcudf/libcudf/io/json.pxd b/python/pylibcudf/pylibcudf/libcudf/io/json.pxd index 7514e6c5258..1c74f8ca3ac 100644 --- a/python/pylibcudf/pylibcudf/libcudf/io/json.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/io/json.pxd @@ -27,8 +27,8 @@ cdef extern from "cudf/io/json.hpp" \ cudf_io_types.source_info get_source() except + vector[string] get_dtypes() except + cudf_io_types.compression_type get_compression() except + - size_type get_byte_range_offset() except + - size_type get_byte_range_size() except + + size_t get_byte_range_offset() except + + size_t get_byte_range_size() except + bool is_enabled_lines() except + bool is_enabled_mixed_types_as_string() except + bool is_enabled_prune_columns() except + @@ -41,8 +41,8 @@ cdef extern from "cudf/io/json.hpp" \ void set_compression( cudf_io_types.compression_type compression ) except + - void set_byte_range_offset(size_type offset) except + - void set_byte_range_size(size_type size) except + + void set_byte_range_offset(size_t offset) except + + void set_byte_range_size(size_t size) except + void enable_lines(bool val) except + void enable_mixed_types_as_string(bool val) except + void enable_prune_columns(bool val) except + @@ -73,10 +73,10 @@ cdef extern from "cudf/io/json.hpp" \ cudf_io_types.compression_type compression ) except + json_reader_options_builder& byte_range_offset( - size_type offset + size_t offset ) except + json_reader_options_builder& byte_range_size( - size_type size + size_t size ) except + json_reader_options_builder& lines( bool val diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml index b037508d03f..63d76e9fd4e 100644 --- a/python/pylibcudf/pyproject.toml +++ b/python/pylibcudf/pyproject.toml @@ -19,6 +19,7 @@ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ "cuda-python>=11.7.1,<12.0a0", + "libcudf==24.10.*,>=0.0.0a0", "nvtx>=0.2.1", "packaging", "pyarrow>=16.1.0,<16.2.0a0", @@ -101,6 +102,8 @@ matrix-entry = "cuda_suffixed=true" requires = [ "cmake>=3.26.4,!=3.30.0", "cython>=3.0.3", + "libcudf==24.10.*,>=0.0.0a0", + "librmm==24.10.*,>=0.0.0a0", "ninja", "numpy==1.23.*", "pyarrow==16.1.0.*",