diff --git a/.github/labeler.yml b/.github/labeler.yml index 90cdda4d3ca..8506d38a048 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -12,7 +12,7 @@ cudf.polars: - 'python/cudf_polars/**' pylibcudf: - - 'python/cudf/pylibcudf/**' + - 'python/pylibcudf/**' libcudf: - 'cpp/**' diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index b515dbff9f3..af1538ad0c1 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -30,6 +30,7 @@ jobs: - wheel-tests-cudf - wheel-build-cudf-polars - wheel-tests-cudf-polars + - cudf-polars-polars-tests - wheel-build-dask-cudf - wheel-tests-dask-cudf - devcontainer @@ -244,6 +245,17 @@ jobs: # This always runs, but only fails if this PR touches code in # pylibcudf or cudf_polars script: "ci/test_wheel_cudf_polars.sh" + cudf-polars-polars-tests: + needs: wheel-build-cudf-polars + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 + with: + # This selects "ARCH=amd64 + the latest supported Python + CUDA". + matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) + build_type: pull-request + # This always runs, but only fails if this PR touches code in + # pylibcudf or cudf_polars + script: "ci/test_cudf_polars_polars_tests.sh" wheel-build-dask-cudf: needs: wheel-build-cudf secrets: inherit diff --git a/ci/run_cudf_polars_polars_tests.sh b/ci/run_cudf_polars_polars_tests.sh new file mode 100755 index 00000000000..52a827af94c --- /dev/null +++ b/ci/run_cudf_polars_polars_tests.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -euo pipefail + +# Support invoking run_cudf_polars_polars_tests.sh outside the script directory. +# Assumption: polars has been cloned into the root of the repo. +cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../polars/ + +DESELECTED_TESTS=( + "tests/unit/test_polars_import.py::test_polars_import" # relies on a polars built in place + "tests/unit/streaming/test_streaming_sort.py::test_streaming_sort[True]" # relies on polars built in debug mode + "tests/unit/test_cpu_check.py::test_check_cpu_flags_skipped_no_flags" # Mock library error + "tests/docs/test_user_guide.py" # No dot binary in CI image +) + +DESELECTED_TESTS=$(printf -- " --deselect %s" "${DESELECTED_TESTS[@]}") +python -m pytest \ + --import-mode=importlib \ + --cache-clear \ + -m "" \ + -p cudf_polars.testing.plugin \ + -v \ + --tb=short \ + ${DESELECTED_TESTS} \ + "$@" \ + py-polars/tests diff --git a/ci/test_cudf_polars_polars_tests.sh b/ci/test_cudf_polars_polars_tests.sh new file mode 100755 index 00000000000..6c728a9537f --- /dev/null +++ b/ci/test_cudf_polars_polars_tests.sh @@ -0,0 +1,69 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -eou pipefail + +# We will only fail these tests if the PR touches code in pylibcudf +# or cudf_polars itself. +# Note: the three-dot syntax diffs HEAD against the merge-base of +# upstream and HEAD. So this is asking, "does _this branch_ touch +# files in cudf_polars/pylibcudf", rather than "are there changes +# between upstream and this branch which touch cudf_polars/pylibcudf". +# TODO: is the target branch exposed anywhere in an environment variable?
+if [ -n "$(git diff --name-only origin/branch-24.10...HEAD -- python/cudf_polars/ python/pylibcudf/)" ]; then + HAS_CHANGES=1 + rapids-logger "PR has changes in cudf_polars/pylibcudf; test failures will be treated as CI failures" +else + HAS_CHANGES=0 + rapids-logger "PR does not have changes in cudf_polars/pylibcudf; test failures will NOT be treated as CI failures" +fi + +rapids-logger "Download wheels" + +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" +RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist + +# Download the pylibcudf built in the previous step +RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibcudf-dep + +rapids-logger "Install pylibcudf" +python -m pip install ./local-pylibcudf-dep/pylibcudf*.whl + +rapids-logger "Install cudf_polars" +python -m pip install "$(echo ./dist/cudf_polars*.whl)" + +# TAG=$(python -c 'import polars; print(f"py-{polars.__version__}")') +TAG="py-1.7.0" +rapids-logger "Clone polars to ${TAG}" +git clone https://github.com/pola-rs/polars.git --branch ${TAG} --depth 1 + +# Install requirements for running polars tests +rapids-logger "Install polars test requirements" +python -m pip install -r polars/py-polars/requirements-dev.txt -r polars/py-polars/requirements-ci.txt + +function set_exitcode() +{ + EXITCODE=$? +} +EXITCODE=0 +trap set_exitcode ERR +set +e + +rapids-logger "Run polars tests" +./ci/run_cudf_polars_polars_tests.sh + +trap ERR +set -e + +if [ ${EXITCODE} != 0 ]; then + rapids-logger "Running polars test suite FAILED: exitcode ${EXITCODE}" +else + rapids-logger "Running polars test suite PASSED" +fi + +if [ ${HAS_CHANGES} == 1 ]; then + exit ${EXITCODE} +else + exit 0 +fi diff --git a/ci/test_wheel_cudf_polars.sh b/ci/test_wheel_cudf_polars.sh index 9844090258a..b4509bba02e 100755 --- a/ci/test_wheel_cudf_polars.sh +++ b/ci/test_wheel_cudf_polars.sh @@ -13,10 +13,14 @@ set -eou pipefail if [ -n "$(git diff --name-only origin/branch-24.10...HEAD -- python/cudf_polars/ python/pylibcudf/)" ]; then HAS_CHANGES=1 + rapids-logger "PR has changes in cudf_polars/pylibcudf; test failures will be treated as CI failures" else HAS_CHANGES=0 + rapids-logger "PR does not have changes in cudf_polars/pylibcudf; test failures will NOT be treated as CI failures" fi +rapids-logger "Download wheels" + RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 python ./dist @@ -43,6 +47,9 @@ python -m pip install \ "$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \ "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" +rapids-logger "Temporarily pin polars to 1.7.0" +python -m pip install polars==1.7.0 + rapids-logger "Run cudf_polars tests" function set_exitcode() diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 7bc01e64441..26c086046a8 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -378,6 +378,7 @@ add_library( src/io/csv/reader_impl.cu src/io/csv/writer_impl.cu src/io/functions.cpp + src/io/json/host_tree_algorithms.cu src/io/json/json_column.cu src/io/json/json_normalization.cu src/io/json/json_tree.cu diff --git a/cpp/benchmarks/common/ndsh_data_generator/table_helpers.cpp b/cpp/benchmarks/common/ndsh_data_generator/table_helpers.cpp index d4368906702..54d177df401 100644 --- a/cpp/benchmarks/common/ndsh_data_generator/table_helpers.cpp +++
b/cpp/benchmarks/common/ndsh_data_generator/table_helpers.cpp @@ -85,7 +85,7 @@ std::unique_ptr perform_left_join(cudf::table_view const& left_inpu auto const left_selected = left_input.select(left_on); auto const right_selected = right_input.select(right_on); auto const [left_join_indices, right_join_indices] = - cudf::left_join(left_selected, right_selected, cudf::null_equality::EQUAL, mr); + cudf::left_join(left_selected, right_selected, cudf::null_equality::EQUAL, stream, mr); auto const left_indices_span = cudf::device_span{*left_join_indices}; auto const right_indices_span = cudf::device_span{*right_join_indices}; diff --git a/cpp/benchmarks/ndsh/utilities.cpp b/cpp/benchmarks/ndsh/utilities.cpp index 2d514764fc2..62116ddf661 100644 --- a/cpp/benchmarks/ndsh/utilities.cpp +++ b/cpp/benchmarks/ndsh/utilities.cpp @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -146,11 +147,15 @@ std::unique_ptr join_and_gather(cudf::table_view const& left_input, cudf::null_equality compare_nulls) { CUDF_FUNC_RANGE(); - constexpr auto oob_policy = cudf::out_of_bounds_policy::DONT_CHECK; - auto const left_selected = left_input.select(left_on); - auto const right_selected = right_input.select(right_on); - auto const [left_join_indices, right_join_indices] = cudf::inner_join( - left_selected, right_selected, compare_nulls, cudf::get_current_device_resource_ref()); + constexpr auto oob_policy = cudf::out_of_bounds_policy::DONT_CHECK; + auto const left_selected = left_input.select(left_on); + auto const right_selected = right_input.select(right_on); + auto const [left_join_indices, right_join_indices] = + cudf::inner_join(left_selected, + right_selected, + compare_nulls, + cudf::get_default_stream(), + cudf::get_current_device_resource_ref()); auto const left_indices_span = cudf::device_span{*left_join_indices}; auto const right_indices_span = cudf::device_span{*right_join_indices}; diff --git a/cpp/examples/parquet_io/parquet_io.cpp b/cpp/examples/parquet_io/parquet_io.cpp index 442731694fa..9cda22d0695 100644 --- a/cpp/examples/parquet_io/parquet_io.cpp +++ b/cpp/examples/parquet_io/parquet_io.cpp @@ -18,6 +18,8 @@ #include "../utilities/timer.hpp" +#include + /** * @file parquet_io.cpp * @brief Demonstrates usage of the libcudf APIs to read and write @@ -159,8 +161,11 @@ int main(int argc, char const** argv) // Left anti-join the original and transcoded tables // identical tables should not throw an exception and // return an empty indices vector - auto const indices = cudf::left_anti_join( - input->view(), transcoded_input->view(), cudf::null_equality::EQUAL, resource.get()); + auto const indices = cudf::left_anti_join(input->view(), + transcoded_input->view(), + cudf::null_equality::EQUAL, + cudf::get_default_stream(), + resource.get()); // No exception thrown, check indices auto const valid = indices->size() == 0; diff --git a/cpp/include/cudf/datetime.hpp b/cpp/include/cudf/datetime.hpp index c7523c80b2b..7359a0d5fde 100644 --- a/cpp/include/cudf/datetime.hpp +++ b/cpp/include/cudf/datetime.hpp @@ -17,9 +17,12 @@ #pragma once #include +#include #include #include +#include + #include /** @@ -40,6 +43,7 @@ namespace datetime { * cudf::column. 
* * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t years @@ -47,6 +51,7 @@ namespace datetime { */ std::unique_ptr extract_year( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -54,6 +59,7 @@ std::unique_ptr extract_year( * cudf::column. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t months @@ -61,6 +67,7 @@ std::unique_ptr extract_year( */ std::unique_ptr extract_month( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -68,6 +75,7 @@ std::unique_ptr extract_month( * cudf::column. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t days @@ -75,6 +83,7 @@ std::unique_ptr extract_month( */ std::unique_ptr extract_day( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -82,6 +91,7 @@ std::unique_ptr extract_day( * cudf::column. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t days @@ -89,6 +99,7 @@ std::unique_ptr extract_day( */ std::unique_ptr extract_weekday( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -96,6 +107,7 @@ std::unique_ptr extract_weekday( * cudf::column. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t hours @@ -103,6 +115,7 @@ std::unique_ptr extract_weekday( */ std::unique_ptr extract_hour( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -110,6 +123,7 @@ std::unique_ptr extract_hour( * cudf::column. 
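A minimal usage sketch for the stream-accepting extract overloads declared above (the function and variable names below are illustrative, not part of this change):

#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/datetime.hpp>

#include <rmm/cuda_stream_view.hpp>

#include <memory>

// Extract the year component on a caller-provided stream; the memory resource
// argument keeps its default (the current device resource).
std::unique_ptr<cudf::column> extract_years_on_stream(cudf::column_view const& timestamps,
                                                      rmm::cuda_stream_view stream)
{
  return cudf::datetime::extract_year(timestamps, stream);
}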
* * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t minutes @@ -117,6 +131,7 @@ std::unique_ptr extract_hour( */ std::unique_ptr extract_minute( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -124,6 +139,7 @@ std::unique_ptr extract_minute( * cudf::column. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t seconds @@ -131,6 +147,7 @@ std::unique_ptr extract_minute( */ std::unique_ptr extract_second( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -141,6 +158,7 @@ std::unique_ptr extract_second( * For example, the millisecond fraction of 1.234567890 seconds is 234. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t milliseconds @@ -148,6 +166,7 @@ std::unique_ptr extract_second( */ std::unique_ptr extract_millisecond_fraction( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -158,6 +177,7 @@ std::unique_ptr extract_millisecond_fraction( * For example, the microsecond fraction of 1.234567890 seconds is 567. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t microseconds @@ -165,6 +185,7 @@ std::unique_ptr extract_millisecond_fraction( */ std::unique_ptr extract_microsecond_fraction( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -175,6 +196,7 @@ std::unique_ptr extract_microsecond_fraction( * For example, the nanosecond fraction of 1.234567890 seconds is 890. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t nanoseconds @@ -182,6 +204,7 @@ std::unique_ptr extract_microsecond_fraction( */ std::unique_ptr extract_nanosecond_fraction( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @} */ // end of group @@ -196,6 +219,7 @@ std::unique_ptr extract_nanosecond_fraction( * cudf::column. 
* * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column containing last day of the month as TIMESTAMP_DAYS @@ -203,6 +227,7 @@ std::unique_ptr extract_nanosecond_fraction( */ std::unique_ptr last_day_of_month( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -210,6 +235,7 @@ std::unique_ptr last_day_of_month( * returns an int16_t cudf::column. The value is between [1, {365-366}] * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of datatype INT16 containing the day number since the start of the year @@ -217,6 +243,7 @@ std::unique_ptr last_day_of_month( */ std::unique_ptr day_of_year( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -245,6 +272,7 @@ std::unique_ptr day_of_year( * * @param timestamps cudf::column_view of timestamp type * @param months cudf::column_view of integer type containing the number of months to add + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of timestamp type containing the computed timestamps @@ -252,6 +280,7 @@ std::unique_ptr day_of_year( std::unique_ptr add_calendrical_months( cudf::column_view const& timestamps, cudf::column_view const& months, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -280,6 +309,7 @@ std::unique_ptr add_calendrical_months( * * @param timestamps cudf::column_view of timestamp type * @param months cudf::scalar of integer type containing the number of months to add + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @return cudf::column of timestamp type containing the computed timestamps @@ -287,6 +317,7 @@ std::unique_ptr add_calendrical_months( std::unique_ptr add_calendrical_months( cudf::column_view const& timestamps, cudf::scalar const& months, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -297,6 +328,7 @@ std::unique_ptr add_calendrical_months( * `output[i] is null` if `column[i]` is null * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of datatype BOOL8 truth value of the corresponding date @@ -304,6 +336,7 @@ std::unique_ptr add_calendrical_months( */ std::unique_ptr is_leap_year( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -315,11 +348,13 @@ 
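Likewise, a sketch for the calendrical-arithmetic overloads above (placeholder views, not part of this change):

#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/datetime.hpp>

#include <rmm/cuda_stream_view.hpp>

#include <memory>

// Shift each timestamp by the corresponding per-row month offset, on an explicit stream.
std::unique_ptr<cudf::column> shift_by_months(cudf::column_view const& timestamps,
                                              cudf::column_view const& months,
                                              rmm::cuda_stream_view stream)
{
  return cudf::datetime::add_calendrical_months(timestamps, months, stream);
}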
std::unique_ptr is_leap_year( * @throw cudf::logic_error if input column datatype is not a TIMESTAMP * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * @return cudf::column of datatype INT16 of days in month of the corresponding date */ std::unique_ptr days_in_month( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -331,11 +366,13 @@ std::unique_ptr days_in_month( * @throw cudf::logic_error if input column datatype is not a TIMESTAMP * * @param column The input column containing datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * @return A column of INT16 type indicating which quarter the date is in */ std::unique_ptr extract_quarter( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -357,6 +394,7 @@ enum class rounding_frequency : int32_t { * * @param column cudf::column_view of the input datetime values * @param freq rounding_frequency indicating the frequency to round up to + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @throw cudf::logic_error if input column datatype is not TIMESTAMP. @@ -365,6 +403,7 @@ enum class rounding_frequency : int32_t { std::unique_ptr ceil_datetimes( cudf::column_view const& column, rounding_frequency freq, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -372,6 +411,7 @@ std::unique_ptr ceil_datetimes( * * @param column cudf::column_view of the input datetime values * @param freq rounding_frequency indicating the frequency to round down to + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @throw cudf::logic_error if input column datatype is not TIMESTAMP. @@ -380,6 +420,7 @@ std::unique_ptr ceil_datetimes( std::unique_ptr floor_datetimes( cudf::column_view const& column, rounding_frequency freq, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -387,6 +428,7 @@ std::unique_ptr floor_datetimes( * * @param column cudf::column_view of the input datetime values * @param freq rounding_frequency indicating the frequency to round to + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @throw cudf::logic_error if input column datatype is not TIMESTAMP. 
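And a sketch for the rounding overloads (assumes the existing rounding_frequency::DAY enumerator; the column is a placeholder):

#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/datetime.hpp>

#include <rmm/cuda_stream_view.hpp>

#include <memory>

// Round timestamps down to day precision on a caller-provided stream.
std::unique_ptr<cudf::column> floor_to_day(cudf::column_view const& timestamps,
                                           rmm::cuda_stream_view stream)
{
  return cudf::datetime::floor_datetimes(
    timestamps, cudf::datetime::rounding_frequency::DAY, stream);
}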
@@ -395,6 +437,7 @@ std::unique_ptr floor_datetimes( std::unique_ptr round_datetimes( cudf::column_view const& column, rounding_frequency freq, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @} */ // end of group diff --git a/cpp/include/cudf/detail/datetime.hpp b/cpp/include/cudf/detail/datetime.hpp index 31782cbaf8a..9db7e48498f 100644 --- a/cpp/include/cudf/detail/datetime.hpp +++ b/cpp/include/cudf/detail/datetime.hpp @@ -26,111 +26,108 @@ namespace CUDF_EXPORT cudf { namespace datetime { namespace detail { /** - * @copydoc cudf::extract_year(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::extract_year(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_year(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_month(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::extract_month(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_month(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_day(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::extract_day(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_day(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_weekday(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::extract_weekday(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_weekday(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_hour(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::extract_hour(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_hour(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_minute(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::extract_minute(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_minute(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_second(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::extract_second(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ std::unique_ptr extract_second(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_millisecond_fraction(cudf::column_view const&, + * @copydoc cudf::extract_millisecond_fraction(cudf::column_view const&, rmm::cuda_stream_view, * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_millisecond_fraction(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_microsecond_fraction(cudf::column_view const&, + * @copydoc cudf::extract_microsecond_fraction(cudf::column_view const&, rmm::cuda_stream_view, * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_microsecond_fraction(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_nanosecond_fraction(cudf::column_view const&, + * @copydoc cudf::extract_nanosecond_fraction(cudf::column_view const&, rmm::cuda_stream_view, * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_nanosecond_fraction(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::last_day_of_month(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::last_day_of_month(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr last_day_of_month(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::day_of_year(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::day_of_year(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr day_of_year(cudf::column_view const& column, rmm::cuda_stream_view stream, @@ -138,9 +135,8 @@ std::unique_ptr day_of_year(cudf::column_view const& column, /** * @copydoc cudf::add_calendrical_months(cudf::column_view const&, cudf::column_view const&, - * rmm::device_async_resource_ref) + * rmm::cuda_stream_view, rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr add_calendrical_months(cudf::column_view const& timestamps, cudf::column_view const& months, @@ -149,9 +145,8 @@ std::unique_ptr add_calendrical_months(cudf::column_view const& ti /** * @copydoc cudf::add_calendrical_months(cudf::column_view const&, cudf::scalar const&, - * rmm::device_async_resource_ref) + * rmm::cuda_stream_view, rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ std::unique_ptr add_calendrical_months(cudf::column_view const& timestamps, cudf::scalar const& months, @@ -159,9 +154,9 @@ std::unique_ptr add_calendrical_months(cudf::column_view const& ti rmm::device_async_resource_ref mr); /** - * @copydoc cudf::is_leap_year(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::is_leap_year(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr is_leap_year(cudf::column_view const& column, rmm::cuda_stream_view stream, diff --git a/cpp/include/cudf/detail/timezone.hpp b/cpp/include/cudf/detail/timezone.hpp index 5738f9ec8e9..f51d1ba42b2 100644 --- a/cpp/include/cudf/detail/timezone.hpp +++ b/cpp/include/cudf/detail/timezone.hpp @@ -16,6 +16,7 @@ #pragma once #include +#include #include #include @@ -26,14 +27,13 @@ namespace detail { /** * @copydoc cudf::make_timezone_transition_table(std::optional, std::string_view, - * rmm::device_async_resource_ref) + * rmm::cuda_stream_view, rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr make_timezone_transition_table( std::optional tzif_dir, std::string_view timezone_name, - rmm::cuda_stream_view stream, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); } // namespace detail diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index cc8912cb022..a590eb27511 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -97,6 +97,7 @@ class distinct_hash_join; * @param[in] right_keys The right table * @param[in] compare_nulls controls whether null join-key values * should match or not. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -108,6 +109,7 @@ std::pair>, inner_join(cudf::table_view const& left_keys, cudf::table_view const& right_keys, null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -137,6 +139,7 @@ inner_join(cudf::table_view const& left_keys, * @param[in] right_keys The right table * @param[in] compare_nulls controls whether null join-key values * should match or not. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -148,6 +151,7 @@ std::pair>, left_join(cudf::table_view const& left_keys, cudf::table_view const& right_keys, null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -176,6 +180,7 @@ left_join(cudf::table_view const& left_keys, * @param[in] right_keys The right table * @param[in] compare_nulls controls whether null join-key values * should match or not. 
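A sketch of a call site for the stream-aware hash joins above, mirroring the benchmark updates earlier in this diff (the table views are placeholders):

#include <cudf/join.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/types.hpp>

#include <rmm/cuda_stream_view.hpp>

// Compute inner-join gather maps on an explicit stream; the result is the usual
// pair of device vectors of matching left/right row indices.
auto inner_join_on_stream(cudf::table_view const& left_keys,
                          cudf::table_view const& right_keys,
                          rmm::cuda_stream_view stream)
{
  return cudf::inner_join(left_keys, right_keys, cudf::null_equality::EQUAL, stream);
}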
+ * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -187,6 +192,7 @@ std::pair>, full_join(cudf::table_view const& left_keys, cudf::table_view const& right_keys, null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -205,6 +211,7 @@ full_join(cudf::table_view const& left_keys, * @param left_keys The left table * @param right_keys The right table * @param compare_nulls Controls whether null join-key values should match or not + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A vector `left_indices` that can be used to construct @@ -215,6 +222,7 @@ std::unique_ptr> left_semi_join( cudf::table_view const& left_keys, cudf::table_view const& right_keys, null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -236,6 +244,7 @@ std::unique_ptr> left_semi_join( * @param[in] right_keys The right table * @param[in] compare_nulls controls whether null join-key values * should match or not. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A column `left_indices` that can be used to construct @@ -246,6 +255,7 @@ std::unique_ptr> left_anti_join( cudf::table_view const& left_keys, cudf::table_view const& right_keys, null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -266,6 +276,7 @@ std::unique_ptr> left_anti_join( * * @param left The left table * @param right The right table + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory * * @return Result of cross joining `left` and `right` tables @@ -273,6 +284,7 @@ std::unique_ptr> left_anti_join( std::unique_ptr cross_join( cudf::table_view const& left, cudf::table_view const& right, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -567,6 +579,7 @@ class distinct_hash_join { * @param right The right table * @param binary_predicate The condition on which to join * @param output_size Optional value which allows users to specify the exact output size + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -578,6 +591,7 @@ conditional_inner_join(table_view const& left, table_view const& right, ast::expression const& binary_predicate, std::optional output_size = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** 
@@ -612,6 +626,7 @@ conditional_inner_join(table_view const& left, * @param right The right table * @param binary_predicate The condition on which to join * @param output_size Optional value which allows users to specify the exact output size + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -623,6 +638,7 @@ conditional_left_join(table_view const& left, table_view const& right, ast::expression const& binary_predicate, std::optional output_size = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -655,6 +671,7 @@ conditional_left_join(table_view const& left, * @param left The left table * @param right The right table * @param binary_predicate The condition on which to join + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -665,6 +682,7 @@ std::pair>, conditional_full_join(table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -693,6 +711,7 @@ conditional_full_join(table_view const& left, * @param right The right table * @param binary_predicate The condition on which to join * @param output_size Optional value which allows users to specify the exact output size + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A vector `left_indices` that can be used to construct the result of @@ -704,6 +723,7 @@ std::unique_ptr> conditional_left_semi_join( table_view const& right, ast::expression const& binary_predicate, std::optional output_size = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -732,6 +752,7 @@ std::unique_ptr> conditional_left_semi_join( * @param right The right table * @param binary_predicate The condition on which to join * @param output_size Optional value which allows users to specify the exact output size + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A vector `left_indices` that can be used to construct the result of @@ -743,6 +764,7 @@ std::unique_ptr> conditional_left_anti_join( table_view const& right, ast::expression const& binary_predicate, std::optional output_size = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -786,6 +808,7 @@ std::unique_ptr> conditional_left_anti_join( * @param output_size_data An optional pair of values indicating the exact output size and the * number of matches for each row in the larger of the two input tables, left or right (may be * precomputed using the corresponding mixed_inner_join_size API). 
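For the expression-based joins, a sketch of threading the new stream argument through alongside an AST predicate (the column index and the equality condition are illustrative only):

#include <cudf/ast/expressions.hpp>
#include <cudf/join.hpp>
#include <cudf/table/table_view.hpp>

#include <rmm/cuda_stream_view.hpp>

#include <optional>

// Conditional inner join on "left.col0 == right.col0", run on an explicit stream.
auto conditional_inner_join_on_stream(cudf::table_view const& left,
                                      cudf::table_view const& right,
                                      rmm::cuda_stream_view stream)
{
  auto left_ref  = cudf::ast::column_reference(0, cudf::ast::table_reference::LEFT);
  auto right_ref = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT);
  auto predicate = cudf::ast::operation(cudf::ast::ast_operator::EQUAL, left_ref, right_ref);
  return cudf::conditional_inner_join(left, right, predicate, std::nullopt, stream);
}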
+ * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -801,6 +824,7 @@ mixed_inner_join( ast::expression const& binary_predicate, null_equality compare_nulls = null_equality::EQUAL, std::optional>> output_size_data = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -846,6 +870,7 @@ mixed_inner_join( * @param output_size_data An optional pair of values indicating the exact output size and the * number of matches for each row in the larger of the two input tables, left or right (may be * precomputed using the corresponding mixed_left_join_size API). + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -861,6 +886,7 @@ mixed_left_join( ast::expression const& binary_predicate, null_equality compare_nulls = null_equality::EQUAL, std::optional>> output_size_data = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -906,6 +932,7 @@ mixed_left_join( * @param output_size_data An optional pair of values indicating the exact output size and the * number of matches for each row in the larger of the two input tables, left or right (may be * precomputed using the corresponding mixed_full_join_size API). + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -921,6 +948,7 @@ mixed_full_join( ast::expression const& binary_predicate, null_equality compare_nulls = null_equality::EQUAL, std::optional>> output_size_data = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -956,6 +984,7 @@ mixed_full_join( * @param right_conditional The right table used for the conditional join * @param binary_predicate The condition on which to join * @param compare_nulls Whether or not null values join to each other or not + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -968,6 +997,7 @@ std::unique_ptr> mixed_left_semi_join( table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -1004,6 +1034,7 @@ std::unique_ptr> mixed_left_semi_join( * @param right_conditional The right table used for the conditional join * @param binary_predicate The condition on which to join * @param compare_nulls Whether or not null values join to each other or not + * @param stream CUDA stream used for device memory operations and kernel launches * 
@param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -1016,6 +1047,7 @@ std::unique_ptr> mixed_left_anti_join( table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -1041,6 +1073,7 @@ std::unique_ptr> mixed_left_anti_join( * @param right_conditional The right table used for the conditional join * @param binary_predicate The condition on which to join * @param compare_nulls Whether or not null values join to each other or not + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair containing the size that would result from performing the @@ -1056,6 +1089,7 @@ std::pair>> mixed_in table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -1081,6 +1115,7 @@ std::pair>> mixed_in * @param right_conditional The right table used for the conditional join * @param binary_predicate The condition on which to join * @param compare_nulls Whether or not null values join to each other or not + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair containing the size that would result from performing the @@ -1096,6 +1131,7 @@ std::pair>> mixed_le table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -1111,6 +1147,7 @@ std::pair>> mixed_le * @param left The left table * @param right The right table * @param binary_predicate The condition on which to join + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return The size that would result from performing the requested join @@ -1119,6 +1156,7 @@ std::size_t conditional_inner_join_size( table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -1134,6 +1172,7 @@ std::size_t conditional_inner_join_size( * @param left The left table * @param right The right table * @param binary_predicate The condition on which to join + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return The size that would result from performing the requested join @@ -1142,6 +1181,7 @@ std::size_t conditional_left_join_size( table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream = cudf::get_default_stream(), 
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -1157,6 +1197,7 @@ std::size_t conditional_left_join_size( * @param left The left table * @param right The right table * @param binary_predicate The condition on which to join + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return The size that would result from performing the requested join @@ -1165,6 +1206,7 @@ std::size_t conditional_left_semi_join_size( table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -1180,6 +1222,7 @@ std::size_t conditional_left_semi_join_size( * @param left The left table * @param right The right table * @param binary_predicate The condition on which to join + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return The size that would result from performing the requested join @@ -1188,6 +1231,7 @@ std::size_t conditional_left_anti_join_size( table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @} */ // end of group } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/timezone.hpp b/cpp/include/cudf/timezone.hpp index aa903770e26..f6de1056c24 100644 --- a/cpp/include/cudf/timezone.hpp +++ b/cpp/include/cudf/timezone.hpp @@ -15,9 +15,12 @@ */ #pragma once +#include #include #include +#include + #include #include #include @@ -43,6 +46,7 @@ static constexpr uint32_t solar_cycle_entry_count = 2 * solar_cycle_years; * * @param tzif_dir The directory where the TZif files are located * @param timezone_name standard timezone name (for example, "America/Los_Angeles") + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory. * * @return The transition table for the given timezone @@ -50,6 +54,7 @@ static constexpr uint32_t solar_cycle_entry_count = 2 * solar_cycle_years; std::unique_ptr
make_timezone_transition_table( std::optional tzif_dir, std::string_view timezone_name, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); } // namespace CUDF_EXPORT cudf diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index fd9a6b8f5fe..ddb0dbcd96d 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -580,142 +580,167 @@ std::unique_ptr extract_quarter(column_view const& column, std::unique_ptr ceil_datetimes(column_view const& column, rounding_frequency freq, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::round_general( - detail::rounding_function::CEIL, freq, column, cudf::get_default_stream(), mr); + return detail::round_general(detail::rounding_function::CEIL, freq, column, stream, mr); } std::unique_ptr floor_datetimes(column_view const& column, rounding_frequency freq, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::round_general( - detail::rounding_function::FLOOR, freq, column, cudf::get_default_stream(), mr); + return detail::round_general(detail::rounding_function::FLOOR, freq, column, stream, mr); } std::unique_ptr round_datetimes(column_view const& column, rounding_frequency freq, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::round_general( - detail::rounding_function::ROUND, freq, column, cudf::get_default_stream(), mr); + return detail::round_general(detail::rounding_function::ROUND, freq, column, stream, mr); } -std::unique_ptr extract_year(column_view const& column, rmm::device_async_resource_ref mr) +std::unique_ptr extract_year(column_view const& column, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::extract_year(column, cudf::get_default_stream(), mr); + return detail::extract_year(column, stream, mr); } -std::unique_ptr extract_month(column_view const& column, rmm::device_async_resource_ref mr) +std::unique_ptr extract_month(column_view const& column, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::extract_month(column, cudf::get_default_stream(), mr); + return detail::extract_month(column, stream, mr); } -std::unique_ptr extract_day(column_view const& column, rmm::device_async_resource_ref mr) +std::unique_ptr extract_day(column_view const& column, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::extract_day(column, cudf::get_default_stream(), mr); + return detail::extract_day(column, stream, mr); } std::unique_ptr extract_weekday(column_view const& column, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::extract_weekday(column, cudf::get_default_stream(), mr); + return detail::extract_weekday(column, stream, mr); } -std::unique_ptr extract_hour(column_view const& column, rmm::device_async_resource_ref mr) +std::unique_ptr extract_hour(column_view const& column, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::extract_hour(column, cudf::get_default_stream(), mr); + return detail::extract_hour(column, stream, mr); } -std::unique_ptr extract_minute(column_view const& column, rmm::device_async_resource_ref mr) +std::unique_ptr extract_minute(column_view const& 
                                     column,
+                                    rmm::cuda_stream_view stream,
+                                    rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::extract_minute(column, cudf::get_default_stream(), mr);
+  return detail::extract_minute(column, stream, mr);
 }
 
-std::unique_ptr<column> extract_second(column_view const& column, rmm::device_async_resource_ref mr)
+std::unique_ptr<column> extract_second(column_view const& column,
+                                       rmm::cuda_stream_view stream,
+                                       rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::extract_second(column, cudf::get_default_stream(), mr);
+  return detail::extract_second(column, stream, mr);
 }
 
 std::unique_ptr<column> extract_millisecond_fraction(column_view const& column,
+                                                     rmm::cuda_stream_view stream,
                                                      rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::extract_millisecond_fraction(column, cudf::get_default_stream(), mr);
+  return detail::extract_millisecond_fraction(column, stream, mr);
 }
 
 std::unique_ptr<column> extract_microsecond_fraction(column_view const& column,
+                                                     rmm::cuda_stream_view stream,
                                                      rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::extract_microsecond_fraction(column, cudf::get_default_stream(), mr);
+  return detail::extract_microsecond_fraction(column, stream, mr);
 }
 
 std::unique_ptr<column> extract_nanosecond_fraction(column_view const& column,
+                                                    rmm::cuda_stream_view stream,
                                                     rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::extract_nanosecond_fraction(column, cudf::get_default_stream(), mr);
+  return detail::extract_nanosecond_fraction(column, stream, mr);
 }
 
 std::unique_ptr<column> last_day_of_month(column_view const& column,
+                                          rmm::cuda_stream_view stream,
                                           rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::last_day_of_month(column, cudf::get_default_stream(), mr);
+  return detail::last_day_of_month(column, stream, mr);
 }
 
-std::unique_ptr<column> day_of_year(column_view const& column, rmm::device_async_resource_ref mr)
+std::unique_ptr<column> day_of_year(column_view const& column,
+                                    rmm::cuda_stream_view stream,
+                                    rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::day_of_year(column, cudf::get_default_stream(), mr);
+  return detail::day_of_year(column, stream, mr);
 }
 
 std::unique_ptr<cudf::column> add_calendrical_months(cudf::column_view const& timestamp_column,
                                                      cudf::column_view const& months_column,
+                                                     rmm::cuda_stream_view stream,
                                                      rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::add_calendrical_months(
-    timestamp_column, months_column, cudf::get_default_stream(), mr);
+  return detail::add_calendrical_months(timestamp_column, months_column, stream, mr);
 }
 
 std::unique_ptr<cudf::column> add_calendrical_months(cudf::column_view const& timestamp_column,
                                                      cudf::scalar const& months,
+                                                     rmm::cuda_stream_view stream,
                                                      rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::add_calendrical_months(timestamp_column, months, cudf::get_default_stream(), mr);
+  return detail::add_calendrical_months(timestamp_column, months, stream, mr);
 }
 
-std::unique_ptr<column> is_leap_year(column_view const& column, rmm::device_async_resource_ref mr)
+std::unique_ptr<column> is_leap_year(column_view const& column,
+                                     rmm::cuda_stream_view stream,
+                                     rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::is_leap_year(column, cudf::get_default_stream(), mr);
+  return detail::is_leap_year(column, stream, mr);
 }
 
-std::unique_ptr<column> days_in_month(column_view const& column, rmm::device_async_resource_ref mr)
+std::unique_ptr<column> days_in_month(column_view const& column,
+                                      rmm::cuda_stream_view stream,
+                                      rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::days_in_month(column, cudf::get_default_stream(), mr);
+  return detail::days_in_month(column, stream, mr);
 }
 
 std::unique_ptr<column> extract_quarter(column_view const& column,
+                                        rmm::cuda_stream_view stream,
                                         rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::extract_quarter(column, cudf::get_default_stream(), mr);
+  return detail::extract_quarter(column, stream, mr);
 }
 
 }  // namespace datetime
diff --git a/cpp/src/datetime/timezone.cpp b/cpp/src/datetime/timezone.cpp
index 6498a5e6c55..cf239297255 100644
--- a/cpp/src/datetime/timezone.cpp
+++ b/cpp/src/datetime/timezone.cpp
@@ -380,11 +380,11 @@ static int64_t get_transition_time(dst_transition_s const& trans, int year)
 std::unique_ptr<table>
make_timezone_transition_table(std::optional tzif_dir, std::string_view timezone_name, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::make_timezone_transition_table( - tzif_dir, timezone_name, cudf::get_default_stream(), mr); + return detail::make_timezone_transition_table(tzif_dir, timezone_name, stream, mr); } namespace detail { diff --git a/cpp/src/io/json/host_tree_algorithms.cu b/cpp/src/io/json/host_tree_algorithms.cu new file mode 100644 index 00000000000..70d61132b42 --- /dev/null +++ b/cpp/src/io/json/host_tree_algorithms.cu @@ -0,0 +1,808 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "io/utilities/parsing_utils.cuh" +#include "io/utilities/string_parsing.hpp" +#include "nested_json.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cudf::io::json::detail { + +/** + * @brief Get the column indices for the values column for array of arrays rows + * + * @param row_array_children_level The level of the row array's children + * @param d_tree The tree metadata + * @param col_ids The column ids + * @param num_columns The number of columns + * @param stream The stream to use + * @return The value columns' indices + */ +rmm::device_uvector get_values_column_indices(TreeDepthT const row_array_children_level, + tree_meta_t const& d_tree, + device_span col_ids, + size_type const num_columns, + rmm::cuda_stream_view stream) +{ + CUDF_FUNC_RANGE(); + auto [level2_nodes, level2_indices] = get_array_children_indices( + row_array_children_level, d_tree.node_levels, d_tree.parent_node_ids, stream); + auto col_id_location = thrust::make_permutation_iterator(col_ids.begin(), level2_nodes.begin()); + rmm::device_uvector values_column_indices(num_columns, stream); + thrust::scatter(rmm::exec_policy(stream), + level2_indices.begin(), + level2_indices.end(), + col_id_location, + values_column_indices.begin()); + return values_column_indices; +} + +/** + * @brief Copies strings specified by pair of begin, end offsets to host vector of strings. 
+ * + * @param input String device buffer + * @param node_range_begin Begin offset of the strings + * @param node_range_end End offset of the strings + * @param stream CUDA stream + * @return Vector of strings + */ +std::vector copy_strings_to_host_sync( + device_span input, + device_span node_range_begin, + device_span node_range_end, + rmm::cuda_stream_view stream) +{ + CUDF_FUNC_RANGE(); + auto const num_strings = node_range_begin.size(); + rmm::device_uvector string_offsets(num_strings, stream); + rmm::device_uvector string_lengths(num_strings, stream); + auto d_offset_pairs = thrust::make_zip_iterator(node_range_begin.begin(), node_range_end.begin()); + thrust::transform(rmm::exec_policy(stream), + d_offset_pairs, + d_offset_pairs + num_strings, + thrust::make_zip_iterator(string_offsets.begin(), string_lengths.begin()), + [] __device__(auto const& offsets) { + // Note: first character for non-field columns + return thrust::make_tuple( + static_cast(thrust::get<0>(offsets)), + static_cast(thrust::get<1>(offsets) - thrust::get<0>(offsets))); + }); + + cudf::io::parse_options_view options_view{}; + options_view.quotechar = '\0'; // no quotes + options_view.keepquotes = true; + auto d_offset_length_it = + thrust::make_zip_iterator(string_offsets.begin(), string_lengths.begin()); + auto d_column_names = parse_data(input.data(), + d_offset_length_it, + num_strings, + data_type{type_id::STRING}, + rmm::device_buffer{}, + 0, + options_view, + stream, + cudf::get_current_device_resource_ref()); + auto to_host = [stream](auto const& col) { + if (col.is_empty()) return std::vector{}; + auto const scv = cudf::strings_column_view(col); + auto const h_chars = cudf::detail::make_host_vector_async( + cudf::device_span(scv.chars_begin(stream), scv.chars_size(stream)), stream); + auto const h_offsets = cudf::detail::make_host_vector_async( + cudf::device_span(scv.offsets().data() + scv.offset(), + scv.size() + 1), + stream); + stream.synchronize(); + + // build std::string vector from chars and offsets + std::vector host_data; + host_data.reserve(col.size()); + std::transform( + std::begin(h_offsets), + std::end(h_offsets) - 1, + std::begin(h_offsets) + 1, + std::back_inserter(host_data), + [&](auto start, auto end) { return std::string(h_chars.data() + start, end - start); }); + return host_data; + }; + return to_host(d_column_names->view()); +} + +/** + * @brief Checks if all strings in each string column in the tree are nulls. + * For non-string columns, it's set as true. If any of rows in a string column is false, it's set as + * false. + * + * @param input Input JSON string device data + * @param d_column_tree column tree representation of JSON string + * @param tree Node tree representation of the JSON string + * @param col_ids Column ids of the nodes in the tree + * @param options Parsing options specifying the parsing behaviour + * @param stream CUDA stream used for device memory operations and kernel launches + * @return Array of bytes where each byte indicate if it is all nulls string column. 
+ */ +rmm::device_uvector is_all_nulls_each_column(device_span input, + tree_meta_t const& d_column_tree, + tree_meta_t const& tree, + device_span col_ids, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream) +{ + auto const num_nodes = col_ids.size(); + auto const num_cols = d_column_tree.node_categories.size(); + rmm::device_uvector is_all_nulls(num_cols, stream); + thrust::fill(rmm::exec_policy(stream), is_all_nulls.begin(), is_all_nulls.end(), true); + + auto parse_opt = parsing_options(options, stream); + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::counting_iterator(0), + num_nodes, + [options = parse_opt.view(), + data = input.data(), + column_categories = d_column_tree.node_categories.begin(), + col_ids = col_ids.begin(), + range_begin = tree.node_range_begin.begin(), + range_end = tree.node_range_end.begin(), + is_all_nulls = is_all_nulls.begin()] __device__(size_type i) { + auto const node_category = column_categories[col_ids[i]]; + if (node_category == NC_STR or node_category == NC_VAL) { + auto const is_null_literal = serialized_trie_contains( + options.trie_na, + {data + range_begin[i], static_cast(range_end[i] - range_begin[i])}); + if (!is_null_literal) is_all_nulls[col_ids[i]] = false; + } + }); + return is_all_nulls; +} + +NodeIndexT get_row_array_parent_col_id(device_span col_ids, + bool is_enabled_lines, + rmm::cuda_stream_view stream) +{ + NodeIndexT value = parent_node_sentinel; + if (!col_ids.empty()) { + auto const list_node_index = is_enabled_lines ? 0 : 1; + CUDF_CUDA_TRY(cudaMemcpyAsync(&value, + col_ids.data() + list_node_index, + sizeof(NodeIndexT), + cudaMemcpyDefault, + stream.value())); + stream.synchronize(); + } + return value; +} +/** + * @brief Holds member data pointers of `d_json_column` + * + */ +struct json_column_data { + using row_offset_t = json_column::row_offset_t; + row_offset_t* string_offsets; + row_offset_t* string_lengths; + row_offset_t* child_offsets; + bitmask_type* validity; +}; + +std::pair, + std::unordered_map>> +build_tree(device_json_column& root, + std::vector const& is_str_column_all_nulls, + tree_meta_t& d_column_tree, + device_span d_unique_col_ids, + device_span d_max_row_offsets, + std::vector const& column_names, + NodeIndexT row_array_parent_col_id, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); +void scatter_offsets( + tree_meta_t& tree, + device_span col_ids, + device_span row_offsets, + device_span node_ids, + device_span sorted_col_ids, // Reuse this for parent_col_ids + tree_meta_t& d_column_tree, + host_span ignore_vals, + std::unordered_map>& columns, + rmm::cuda_stream_view stream); + +/** + * @brief Constructs `d_json_column` from node tree representation + * Newly constructed columns are insert into `root`'s children. + * `root` must be a list type. 
+ * + * @param input Input JSON string device data + * @param tree Node tree representation of the JSON string + * @param col_ids Column ids of the nodes in the tree + * @param row_offsets Row offsets of the nodes in the tree + * @param root Root node of the `d_json_column` tree + * @param is_array_of_arrays Whether the tree is an array of arrays + * @param options Parsing options specifying the parsing behaviour + * options affecting behaviour are + * is_enabled_lines: Whether the input is a line-delimited JSON + * is_enabled_mixed_types_as_string: Whether to enable reading mixed types as string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the device memory + * of child_offets and validity members of `d_json_column` + */ +void make_device_json_column(device_span input, + tree_meta_t& tree, + device_span col_ids, + device_span row_offsets, + device_json_column& root, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + + bool const is_enabled_lines = options.is_enabled_lines(); + bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string(); + auto const num_nodes = col_ids.size(); + rmm::device_uvector sorted_col_ids(col_ids.size(), stream); // make a copy + thrust::copy(rmm::exec_policy(stream), col_ids.begin(), col_ids.end(), sorted_col_ids.begin()); + + // sort by {col_id} on {node_ids} stable + rmm::device_uvector node_ids(col_ids.size(), stream); + thrust::sequence(rmm::exec_policy(stream), node_ids.begin(), node_ids.end()); + thrust::stable_sort_by_key( + rmm::exec_policy(stream), sorted_col_ids.begin(), sorted_col_ids.end(), node_ids.begin()); + + NodeIndexT const row_array_parent_col_id = + get_row_array_parent_col_id(col_ids, is_enabled_lines, stream); + + // 1. gather column information. + auto [d_column_tree, d_unique_col_ids, d_max_row_offsets] = + reduce_to_column_tree(tree, + col_ids, + sorted_col_ids, + node_ids, + row_offsets, + is_array_of_arrays, + row_array_parent_col_id, + stream); + auto num_columns = d_unique_col_ids.size(); + std::vector column_names = copy_strings_to_host_sync( + input, d_column_tree.node_range_begin, d_column_tree.node_range_end, stream); + // array of arrays column names + if (is_array_of_arrays) { + auto const unique_col_ids = cudf::detail::make_host_vector_async(d_unique_col_ids, stream); + auto const column_parent_ids = + cudf::detail::make_host_vector_async(d_column_tree.parent_node_ids, stream); + TreeDepthT const row_array_children_level = is_enabled_lines ? 1 : 2; + auto values_column_indices = + get_values_column_indices(row_array_children_level, tree, col_ids, num_columns, stream); + auto h_values_column_indices = + cudf::detail::make_host_vector_sync(values_column_indices, stream); + std::transform(unique_col_ids.begin(), + unique_col_ids.end(), + column_names.begin(), + column_names.begin(), + [&h_values_column_indices, &column_parent_ids, row_array_parent_col_id]( + auto col_id, auto name) mutable { + return column_parent_ids[col_id] == row_array_parent_col_id + ? 
std::to_string(h_values_column_indices[col_id]) + : name; + }); + } + + auto const is_str_column_all_nulls = [&, &column_tree = d_column_tree]() { + if (is_enabled_mixed_types_as_string) { + return cudf::detail::make_std_vector_sync( + is_all_nulls_each_column(input, column_tree, tree, col_ids, options, stream), stream); + } + return std::vector(); + }(); + auto [ignore_vals, columns] = build_tree(root, + is_str_column_all_nulls, + d_column_tree, + d_unique_col_ids, + d_max_row_offsets, + column_names, + row_array_parent_col_id, + is_array_of_arrays, + options, + stream, + mr); + + scatter_offsets(tree, + col_ids, + row_offsets, + node_ids, + sorted_col_ids, + d_column_tree, + ignore_vals, + columns, + stream); +} + +std::pair, + std::unordered_map>> +build_tree(device_json_column& root, + std::vector const& is_str_column_all_nulls, + tree_meta_t& d_column_tree, + device_span d_unique_col_ids, + device_span d_max_row_offsets, + std::vector const& column_names, + NodeIndexT row_array_parent_col_id, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string(); + auto unique_col_ids = cudf::detail::make_host_vector_async(d_unique_col_ids, stream); + auto column_categories = + cudf::detail::make_host_vector_async(d_column_tree.node_categories, stream); + auto const column_parent_ids = + cudf::detail::make_host_vector_async(d_column_tree.parent_node_ids, stream); + auto column_range_beg = + cudf::detail::make_host_vector_async(d_column_tree.node_range_begin, stream); + auto const max_row_offsets = cudf::detail::make_host_vector_async(d_max_row_offsets, stream); + auto num_columns = d_unique_col_ids.size(); + + auto to_json_col_type = [](auto category) { + switch (category) { + case NC_STRUCT: return json_col_t::StructColumn; + case NC_LIST: return json_col_t::ListColumn; + case NC_STR: [[fallthrough]]; + case NC_VAL: return json_col_t::StringColumn; + default: return json_col_t::Unknown; + } + }; + auto init_to_zero = [stream](auto& v) { + thrust::uninitialized_fill(rmm::exec_policy_nosync(stream), v.begin(), v.end(), 0); + }; + + auto initialize_json_columns = [&](auto i, auto& col, auto column_category) { + if (column_category == NC_ERR || column_category == NC_FN) { + return; + } else if (column_category == NC_VAL || column_category == NC_STR) { + col.string_offsets.resize(max_row_offsets[i] + 1, stream); + col.string_lengths.resize(max_row_offsets[i] + 1, stream); + init_to_zero(col.string_offsets); + init_to_zero(col.string_lengths); + } else if (column_category == NC_LIST) { + col.child_offsets.resize(max_row_offsets[i] + 2, stream); + init_to_zero(col.child_offsets); + } + col.num_rows = max_row_offsets[i] + 1; + col.validity = + cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr); + col.type = to_json_col_type(column_category); + }; + + auto reinitialize_as_string = [&](auto i, auto& col) { + col.string_offsets.resize(max_row_offsets[i] + 1, stream); + col.string_lengths.resize(max_row_offsets[i] + 1, stream); + init_to_zero(col.string_offsets); + init_to_zero(col.string_lengths); + col.num_rows = max_row_offsets[i] + 1; + col.validity = + cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr); + col.type = json_col_t::StringColumn; + // destroy references of all child columns after this step, by calling remove_child_columns + }; + + path_from_tree 
tree_path{column_categories, + column_parent_ids, + column_names, + is_array_of_arrays, + row_array_parent_col_id}; + + // 2. generate nested columns tree and its device_memory + // reorder unique_col_ids w.r.t. column_range_begin for order of column to be in field order. + auto h_range_col_id_it = + thrust::make_zip_iterator(column_range_beg.begin(), unique_col_ids.begin()); + std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { + return thrust::get<0>(a) < thrust::get<0>(b); + }); + + // use hash map because we may skip field name's col_ids + std::unordered_map> columns; + // map{parent_col_id, child_col_name}> = child_col_id, used for null value column tracking + std::map, NodeIndexT> mapped_columns; + // find column_ids which are values, but should be ignored in validity + auto ignore_vals = cudf::detail::make_host_vector(num_columns, stream); + std::vector is_mixed_type_column(num_columns, 0); + std::vector is_pruned(num_columns, 0); + // for columns that are not mixed type but have been forced as string + std::vector forced_as_string_column(num_columns); + columns.try_emplace(parent_node_sentinel, std::ref(root)); + + std::function remove_child_columns = + [&](NodeIndexT this_col_id, device_json_column& col) { + for (auto col_name : col.column_order) { + auto child_id = mapped_columns[{this_col_id, col_name}]; + is_mixed_type_column[child_id] = 1; + remove_child_columns(child_id, col.child_columns.at(col_name)); + mapped_columns.erase({this_col_id, col_name}); + columns.erase(child_id); + } + col.child_columns.clear(); // their references are deleted above. + col.column_order.clear(); + }; + + auto name_and_parent_index = [&is_array_of_arrays, + &row_array_parent_col_id, + &column_parent_ids, + &column_categories, + &column_names](auto this_col_id) { + std::string name = ""; + auto parent_col_id = column_parent_ids[this_col_id]; + if (parent_col_id == parent_node_sentinel || column_categories[parent_col_id] == NC_LIST) { + if (is_array_of_arrays && parent_col_id == row_array_parent_col_id) { + name = column_names[this_col_id]; + } else { + name = list_child_name; + } + } else if (column_categories[parent_col_id] == NC_FN) { + auto field_name_col_id = parent_col_id; + parent_col_id = column_parent_ids[parent_col_id]; + name = column_names[field_name_col_id]; + } else { + CUDF_FAIL("Unexpected parent column category"); + } + return std::pair{name, parent_col_id}; + }; + + // Prune columns that are not required to be parsed. + if (options.is_enabled_prune_columns()) { + for (auto const this_col_id : unique_col_ids) { + if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) { + continue; + } + // Struct, List, String, Value + auto [name, parent_col_id] = name_and_parent_index(this_col_id); + // get path of this column, and get its dtype if present in options + auto const nt = tree_path.get_path(this_col_id); + std::optional const user_dtype = get_path_data_type(nt, options); + if (!user_dtype.has_value() and parent_col_id != parent_node_sentinel) { + is_pruned[this_col_id] = 1; + continue; + } else { + // make sure all its parents are not pruned. + while (parent_col_id != parent_node_sentinel and is_pruned[parent_col_id] == 1) { + is_pruned[parent_col_id] = 0; + parent_col_id = column_parent_ids[parent_col_id]; + } + } + } + } + + // Build the column tree, also, handles mixed types. 
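// Illustration only (not part of this change): the loop that follows resolves the case
// where one column name maps to both a nested (list/struct) node and a value/string node.
// With mixed_types_as_string enabled, the pair is treated as a mixed-type column only if
// at least one of the value/string sides is not entirely null; an all-null side is ignored.
// A minimal host-side sketch of that decision rule, using hypothetical names instead of the
// column-tree metadata used by the real code:
enum class node_kind { value_or_string, list_or_struct };

bool treat_as_mixed_type(node_kind new_kind, bool new_is_all_null,
                         node_kind old_kind, bool old_is_all_null)
{
  // An all-null value/string side does not force the merged column to mixed type.
  if (new_kind == node_kind::value_or_string && new_is_all_null) { return false; }
  if (old_kind == node_kind::value_or_string && old_is_all_null) { return false; }
  return true;
}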
+ for (auto const this_col_id : unique_col_ids) { + if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) { + continue; + } + // Struct, List, String, Value + auto [name, parent_col_id] = name_and_parent_index(this_col_id); + + // if parent is mixed type column or this column is pruned or if parent + // has been forced as string, ignore this column. + if (parent_col_id != parent_node_sentinel && + (is_mixed_type_column[parent_col_id] || is_pruned[this_col_id]) || + forced_as_string_column[parent_col_id]) { + ignore_vals[this_col_id] = 1; + if (is_mixed_type_column[parent_col_id]) { is_mixed_type_column[this_col_id] = 1; } + if (forced_as_string_column[parent_col_id]) { forced_as_string_column[this_col_id] = true; } + continue; + } + + // If the child is already found, + // replace if this column is a nested column and the existing was a value column + // ignore this column if this column is a value column and the existing was a nested column + auto it = columns.find(parent_col_id); + CUDF_EXPECTS(it != columns.end(), "Parent column not found"); + auto& parent_col = it->second.get(); + bool replaced = false; + if (mapped_columns.count({parent_col_id, name}) > 0) { + auto const old_col_id = mapped_columns[{parent_col_id, name}]; + // If mixed type as string is enabled, make both of them strings and merge them. + // All child columns will be ignored when parsing. + if (is_enabled_mixed_types_as_string) { + bool const is_mixed_type = [&]() { + // If new or old is STR and they are all not null, make it mixed type, else ignore. + if (column_categories[this_col_id] == NC_VAL || + column_categories[this_col_id] == NC_STR) { + if (is_str_column_all_nulls[this_col_id]) return false; + } + if (column_categories[old_col_id] == NC_VAL || column_categories[old_col_id] == NC_STR) { + if (is_str_column_all_nulls[old_col_id]) return false; + } + return true; + }(); + if (is_mixed_type) { + is_mixed_type_column[this_col_id] = 1; + is_mixed_type_column[old_col_id] = 1; + // if old col type (not cat) is list or struct, replace with string. + auto& col = columns.at(old_col_id).get(); + if (col.type == json_col_t::ListColumn or col.type == json_col_t::StructColumn) { + reinitialize_as_string(old_col_id, col); + remove_child_columns(old_col_id, col); + // all its children (which are already inserted) are ignored later. 
+ } + col.forced_as_string_column = true; + columns.try_emplace(this_col_id, columns.at(old_col_id)); + continue; + } + } + + if (column_categories[this_col_id] == NC_VAL || column_categories[this_col_id] == NC_STR) { + ignore_vals[this_col_id] = 1; + continue; + } + if (column_categories[old_col_id] == NC_VAL || column_categories[old_col_id] == NC_STR) { + // remap + ignore_vals[old_col_id] = 1; + mapped_columns.erase({parent_col_id, name}); + columns.erase(old_col_id); + parent_col.child_columns.erase(name); + replaced = true; // to skip duplicate name in column_order + } else { + // If this is a nested column but we're trying to insert either (a) a list node into a + // struct column or (b) a struct node into a list column, we fail + CUDF_EXPECTS(not((column_categories[old_col_id] == NC_LIST and + column_categories[this_col_id] == NC_STRUCT) or + (column_categories[old_col_id] == NC_STRUCT and + column_categories[this_col_id] == NC_LIST)), + "A mix of lists and structs within the same column is not supported"); + } + } + + auto this_column_category = column_categories[this_col_id]; + // get path of this column, check if it is a struct/list forced as string, and enforce it + auto const nt = tree_path.get_path(this_col_id); + std::optional const user_dtype = get_path_data_type(nt, options); + if ((column_categories[this_col_id] == NC_STRUCT or + column_categories[this_col_id] == NC_LIST) and + user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) { + this_column_category = NC_STR; + } + + CUDF_EXPECTS(parent_col.child_columns.count(name) == 0, "duplicate column name: " + name); + // move into parent + device_json_column col(stream, mr); + initialize_json_columns(this_col_id, col, this_column_category); + if ((column_categories[this_col_id] == NC_STRUCT or + column_categories[this_col_id] == NC_LIST) and + user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) { + col.forced_as_string_column = true; + forced_as_string_column[this_col_id] = true; + } + + auto inserted = parent_col.child_columns.try_emplace(name, std::move(col)).second; + CUDF_EXPECTS(inserted, "child column insertion failed, duplicate column name in the parent"); + if (not replaced) parent_col.column_order.push_back(name); + columns.try_emplace(this_col_id, std::ref(parent_col.child_columns.at(name))); + mapped_columns.try_emplace(std::make_pair(parent_col_id, name), this_col_id); + } + + if (is_enabled_mixed_types_as_string) { + // ignore all children of mixed type columns + for (auto const this_col_id : unique_col_ids) { + auto parent_col_id = column_parent_ids[this_col_id]; + if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 1) { + is_mixed_type_column[this_col_id] = 1; + ignore_vals[this_col_id] = 1; + columns.erase(this_col_id); + } + // Convert only mixed type columns as string (so to copy), but not its children + if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 0 and + is_mixed_type_column[this_col_id] == 1) + column_categories[this_col_id] = NC_STR; + } + cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(), + column_categories.data(), + column_categories.size() * sizeof(column_categories[0]), + cudf::detail::host_memory_kind::PAGEABLE, + stream); + } + + // ignore all children of columns forced as string + for (auto const this_col_id : unique_col_ids) { + auto parent_col_id = column_parent_ids[this_col_id]; + if (parent_col_id != parent_node_sentinel and forced_as_string_column[parent_col_id]) { 
+ forced_as_string_column[this_col_id] = true; + ignore_vals[this_col_id] = 1; + } + // Convert only mixed type columns as string (so to copy), but not its children + if (parent_col_id != parent_node_sentinel and not forced_as_string_column[parent_col_id] and + forced_as_string_column[this_col_id]) + column_categories[this_col_id] = NC_STR; + } + cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(), + column_categories.data(), + column_categories.size() * sizeof(column_categories[0]), + cudf::detail::host_memory_kind::PAGEABLE, + stream); + + // restore unique_col_ids order + std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { + return thrust::get<1>(a) < thrust::get<1>(b); + }); + return {ignore_vals, columns}; +} + +void scatter_offsets( + tree_meta_t& tree, + device_span col_ids, + device_span row_offsets, + device_span node_ids, + device_span sorted_col_ids, // Reuse this for parent_col_ids + tree_meta_t& d_column_tree, + host_span ignore_vals, + std::unordered_map>& columns, + rmm::cuda_stream_view stream) +{ + auto const num_nodes = col_ids.size(); + auto const num_columns = d_column_tree.node_categories.size(); + // move columns data to device. + auto columns_data = cudf::detail::make_host_vector(num_columns, stream); + for (auto& [col_id, col_ref] : columns) { + if (col_id == parent_node_sentinel) continue; + auto& col = col_ref.get(); + columns_data[col_id] = json_column_data{col.string_offsets.data(), + col.string_lengths.data(), + col.child_offsets.data(), + static_cast(col.validity.data())}; + } + + auto d_ignore_vals = cudf::detail::make_device_uvector_async( + ignore_vals, stream, cudf::get_current_device_resource_ref()); + auto d_columns_data = cudf::detail::make_device_uvector_async( + columns_data, stream, cudf::get_current_device_resource_ref()); + + // 3. scatter string offsets to respective columns, set validity bits + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::counting_iterator(0), + num_nodes, + [column_categories = d_column_tree.node_categories.begin(), + col_ids = col_ids.begin(), + row_offsets = row_offsets.begin(), + range_begin = tree.node_range_begin.begin(), + range_end = tree.node_range_end.begin(), + d_ignore_vals = d_ignore_vals.begin(), + d_columns_data = d_columns_data.begin()] __device__(size_type i) { + if (d_ignore_vals[col_ids[i]]) return; + auto const node_category = column_categories[col_ids[i]]; + switch (node_category) { + case NC_STRUCT: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; + case NC_LIST: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; + case NC_STR: [[fallthrough]]; + case NC_VAL: + if (d_ignore_vals[col_ids[i]]) break; + set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); + d_columns_data[col_ids[i]].string_offsets[row_offsets[i]] = range_begin[i]; + d_columns_data[col_ids[i]].string_lengths[row_offsets[i]] = range_end[i] - range_begin[i]; + break; + default: break; + } + }); + + // 4. scatter List offset + // copy_if only node's whose parent is list, (node_id, parent_col_id) + // stable_sort by parent_col_id of {node_id}. + // For all unique parent_node_id of (i==0, i-1!=i), write start offset. + // (i==last, i+1!=i), write end offset. 
+ // unique_copy_by_key {parent_node_id} {row_offset} to + // col[parent_col_id].child_offsets[row_offset[parent_node_id]] + + auto& parent_col_ids = sorted_col_ids; // reuse sorted_col_ids + auto parent_col_id = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type( + [col_ids = col_ids.begin(), + parent_node_ids = tree.parent_node_ids.begin()] __device__(size_type node_id) { + return parent_node_ids[node_id] == parent_node_sentinel ? parent_node_sentinel + : col_ids[parent_node_ids[node_id]]; + })); + auto const list_children_end = thrust::copy_if( + rmm::exec_policy(stream), + thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id), + thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id) + + num_nodes, + thrust::make_counting_iterator(0), + thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()), + [d_ignore_vals = d_ignore_vals.begin(), + parent_node_ids = tree.parent_node_ids.begin(), + column_categories = d_column_tree.node_categories.begin(), + col_ids = col_ids.begin()] __device__(size_type node_id) { + auto parent_node_id = parent_node_ids[node_id]; + return parent_node_id != parent_node_sentinel and + column_categories[col_ids[parent_node_id]] == NC_LIST and + (!d_ignore_vals[col_ids[parent_node_id]]); + }); + + auto const num_list_children = + list_children_end - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()); + thrust::stable_sort_by_key(rmm::exec_policy(stream), + parent_col_ids.begin(), + parent_col_ids.begin() + num_list_children, + node_ids.begin()); + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + num_list_children, + [node_ids = node_ids.begin(), + parent_node_ids = tree.parent_node_ids.begin(), + parent_col_ids = parent_col_ids.begin(), + row_offsets = row_offsets.begin(), + d_columns_data = d_columns_data.begin(), + num_list_children] __device__(size_type i) { + auto const node_id = node_ids[i]; + auto const parent_node_id = parent_node_ids[node_id]; + // scatter to list_offset + if (i == 0 or parent_node_ids[node_ids[i - 1]] != parent_node_id) { + d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id]] = + row_offsets[node_id]; + } + // last value of list child_offset is its size. + if (i == num_list_children - 1 or parent_node_ids[node_ids[i + 1]] != parent_node_id) { + d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id] + 1] = + row_offsets[node_id] + 1; + } + }); + + // 5. scan on offsets. 
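// Illustration only (not part of this change): step 5 fixes up the offsets written by the
// scatter in step 4. Rows that received no offset still hold their initial 0, so an
// inclusive scan with a maximum operator carries the last written offset forward and the
// offsets become monotone, e.g. {0,2,0,0,5,0} -> {0,2,2,2,5,5}. A small host-side sketch
// in standard C++ (the loop below does the equivalent on device with
// thrust::inclusive_scan and thrust::maximum); the function name is illustrative:
#include <algorithm>
#include <numeric>
#include <vector>

std::vector<int> fill_offset_gaps(std::vector<int> scattered)
{
  // In-place inclusive max-scan: each element becomes the running maximum so far.
  std::inclusive_scan(scattered.begin(), scattered.end(), scattered.begin(),
                      [](int a, int b) { return std::max(a, b); });
  return scattered;  // {0,2,0,0,5,0} -> {0,2,2,2,5,5}
}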
+ for (auto& [id, col_ref] : columns) { + auto& col = col_ref.get(); + if (col.type == json_col_t::StringColumn) { + thrust::inclusive_scan(rmm::exec_policy_nosync(stream), + col.string_offsets.begin(), + col.string_offsets.end(), + col.string_offsets.begin(), + thrust::maximum{}); + } else if (col.type == json_col_t::ListColumn) { + thrust::inclusive_scan(rmm::exec_policy_nosync(stream), + col.child_offsets.begin(), + col.child_offsets.end(), + col.child_offsets.begin(), + thrust::maximum{}); + } + } + stream.synchronize(); +} + +} // namespace cudf::io::json::detail diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 756047d383a..b08fd139113 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -36,23 +35,16 @@ #include #include -#include #include #include #include -#include #include #include #include #include -#include -#include #include #include -#include -#include - namespace cudf::io::json::detail { // DEBUG prints @@ -297,678 +289,6 @@ reduce_to_column_tree(tree_meta_t& tree, std::move(max_row_offsets)}; } -/** - * @brief Get the column indices for the values column for array of arrays rows - * - * @param row_array_children_level The level of the row array's children - * @param d_tree The tree metadata - * @param col_ids The column ids - * @param num_columns The number of columns - * @param stream The stream to use - * @return The value columns' indices - */ -rmm::device_uvector get_values_column_indices(TreeDepthT const row_array_children_level, - tree_meta_t const& d_tree, - device_span col_ids, - size_type const num_columns, - rmm::cuda_stream_view stream) -{ - CUDF_FUNC_RANGE(); - auto [level2_nodes, level2_indices] = get_array_children_indices( - row_array_children_level, d_tree.node_levels, d_tree.parent_node_ids, stream); - auto col_id_location = thrust::make_permutation_iterator(col_ids.begin(), level2_nodes.begin()); - rmm::device_uvector values_column_indices(num_columns, stream); - thrust::scatter(rmm::exec_policy(stream), - level2_indices.begin(), - level2_indices.end(), - col_id_location, - values_column_indices.begin()); - return values_column_indices; -} - -/** - * @brief Copies strings specified by pair of begin, end offsets to host vector of strings. 
- * - * @param input String device buffer - * @param node_range_begin Begin offset of the strings - * @param node_range_end End offset of the strings - * @param stream CUDA stream - * @return Vector of strings - */ -std::vector copy_strings_to_host_sync( - device_span input, - device_span node_range_begin, - device_span node_range_end, - rmm::cuda_stream_view stream) -{ - CUDF_FUNC_RANGE(); - auto const num_strings = node_range_begin.size(); - rmm::device_uvector string_offsets(num_strings, stream); - rmm::device_uvector string_lengths(num_strings, stream); - auto d_offset_pairs = thrust::make_zip_iterator(node_range_begin.begin(), node_range_end.begin()); - thrust::transform(rmm::exec_policy(stream), - d_offset_pairs, - d_offset_pairs + num_strings, - thrust::make_zip_iterator(string_offsets.begin(), string_lengths.begin()), - [] __device__(auto const& offsets) { - // Note: first character for non-field columns - return thrust::make_tuple( - static_cast(thrust::get<0>(offsets)), - static_cast(thrust::get<1>(offsets) - thrust::get<0>(offsets))); - }); - - cudf::io::parse_options_view options_view{}; - options_view.quotechar = '\0'; // no quotes - options_view.keepquotes = true; - auto d_offset_length_it = - thrust::make_zip_iterator(string_offsets.begin(), string_lengths.begin()); - auto d_column_names = parse_data(input.data(), - d_offset_length_it, - num_strings, - data_type{type_id::STRING}, - rmm::device_buffer{}, - 0, - options_view, - stream, - cudf::get_current_device_resource_ref()); - auto to_host = [stream](auto const& col) { - if (col.is_empty()) return std::vector{}; - auto const scv = cudf::strings_column_view(col); - auto const h_chars = cudf::detail::make_host_vector_async( - cudf::device_span(scv.chars_begin(stream), scv.chars_size(stream)), stream); - auto const h_offsets = cudf::detail::make_host_vector_async( - cudf::device_span(scv.offsets().data() + scv.offset(), - scv.size() + 1), - stream); - stream.synchronize(); - - // build std::string vector from chars and offsets - std::vector host_data; - host_data.reserve(col.size()); - std::transform( - std::begin(h_offsets), - std::end(h_offsets) - 1, - std::begin(h_offsets) + 1, - std::back_inserter(host_data), - [&](auto start, auto end) { return std::string(h_chars.data() + start, end - start); }); - return host_data; - }; - return to_host(d_column_names->view()); -} - -/** - * @brief Checks if all strings in each string column in the tree are nulls. - * For non-string columns, it's set as true. If any of rows in a string column is false, it's set as - * false. - * - * @param input Input JSON string device data - * @param d_column_tree column tree representation of JSON string - * @param tree Node tree representation of the JSON string - * @param col_ids Column ids of the nodes in the tree - * @param options Parsing options specifying the parsing behaviour - * @param stream CUDA stream used for device memory operations and kernel launches - * @return Array of bytes where each byte indicate if it is all nulls string column. 
- */ -rmm::device_uvector is_all_nulls_each_column(device_span input, - tree_meta_t const& d_column_tree, - tree_meta_t const& tree, - device_span col_ids, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream) -{ - auto const num_nodes = col_ids.size(); - auto const num_cols = d_column_tree.node_categories.size(); - rmm::device_uvector is_all_nulls(num_cols, stream); - thrust::fill(rmm::exec_policy(stream), is_all_nulls.begin(), is_all_nulls.end(), true); - - auto parse_opt = parsing_options(options, stream); - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::counting_iterator(0), - num_nodes, - [options = parse_opt.view(), - data = input.data(), - column_categories = d_column_tree.node_categories.begin(), - col_ids = col_ids.begin(), - range_begin = tree.node_range_begin.begin(), - range_end = tree.node_range_end.begin(), - is_all_nulls = is_all_nulls.begin()] __device__(size_type i) { - auto const node_category = column_categories[col_ids[i]]; - if (node_category == NC_STR or node_category == NC_VAL) { - auto const is_null_literal = serialized_trie_contains( - options.trie_na, - {data + range_begin[i], static_cast(range_end[i] - range_begin[i])}); - if (!is_null_literal) is_all_nulls[col_ids[i]] = false; - } - }); - return is_all_nulls; -} - -/** - * @brief Holds member data pointers of `d_json_column` - * - */ -struct json_column_data { - using row_offset_t = json_column::row_offset_t; - row_offset_t* string_offsets; - row_offset_t* string_lengths; - row_offset_t* child_offsets; - bitmask_type* validity; -}; - -/** - * @brief Constructs `d_json_column` from node tree representation - * Newly constructed columns are insert into `root`'s children. - * `root` must be a list type. - * - * @param input Input JSON string device data - * @param tree Node tree representation of the JSON string - * @param col_ids Column ids of the nodes in the tree - * @param row_offsets Row offsets of the nodes in the tree - * @param root Root node of the `d_json_column` tree - * @param is_array_of_arrays Whether the tree is an array of arrays - * @param options Parsing options specifying the parsing behaviour - * options affecting behaviour are - * is_enabled_lines: Whether the input is a line-delimited JSON - * is_enabled_mixed_types_as_string: Whether to enable reading mixed types as string - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the device memory - * of child_offets and validity members of `d_json_column` - */ -void make_device_json_column(device_span input, - tree_meta_t& tree, - device_span col_ids, - device_span row_offsets, - device_json_column& root, - bool is_array_of_arrays, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_FUNC_RANGE(); - - bool const is_enabled_lines = options.is_enabled_lines(); - bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string(); - auto const num_nodes = col_ids.size(); - rmm::device_uvector sorted_col_ids(col_ids.size(), stream); // make a copy - thrust::copy(rmm::exec_policy(stream), col_ids.begin(), col_ids.end(), sorted_col_ids.begin()); - - // sort by {col_id} on {node_ids} stable - rmm::device_uvector node_ids(col_ids.size(), stream); - thrust::sequence(rmm::exec_policy(stream), node_ids.begin(), node_ids.end()); - thrust::stable_sort_by_key( - rmm::exec_policy(stream), sorted_col_ids.begin(), sorted_col_ids.end(), 
node_ids.begin()); - - NodeIndexT const row_array_parent_col_id = [&]() { - NodeIndexT value = parent_node_sentinel; - if (!col_ids.empty()) { - auto const list_node_index = is_enabled_lines ? 0 : 1; - CUDF_CUDA_TRY(cudaMemcpyAsync(&value, - col_ids.data() + list_node_index, - sizeof(NodeIndexT), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); - } - return value; - }(); - - // 1. gather column information. - auto [d_column_tree, d_unique_col_ids, d_max_row_offsets] = - reduce_to_column_tree(tree, - col_ids, - sorted_col_ids, - node_ids, - row_offsets, - is_array_of_arrays, - row_array_parent_col_id, - stream); - auto num_columns = d_unique_col_ids.size(); - auto unique_col_ids = cudf::detail::make_host_vector_async(d_unique_col_ids, stream); - auto column_categories = - cudf::detail::make_host_vector_async(d_column_tree.node_categories, stream); - auto const column_parent_ids = - cudf::detail::make_host_vector_async(d_column_tree.parent_node_ids, stream); - auto column_range_beg = - cudf::detail::make_host_vector_async(d_column_tree.node_range_begin, stream); - auto const max_row_offsets = cudf::detail::make_host_vector_async(d_max_row_offsets, stream); - std::vector column_names = copy_strings_to_host_sync( - input, d_column_tree.node_range_begin, d_column_tree.node_range_end, stream); - // array of arrays column names - if (is_array_of_arrays) { - TreeDepthT const row_array_children_level = is_enabled_lines ? 1 : 2; - auto values_column_indices = - get_values_column_indices(row_array_children_level, tree, col_ids, num_columns, stream); - auto h_values_column_indices = - cudf::detail::make_host_vector_sync(values_column_indices, stream); - std::transform(unique_col_ids.begin(), - unique_col_ids.end(), - column_names.begin(), - column_names.begin(), - [&h_values_column_indices, &column_parent_ids, row_array_parent_col_id]( - auto col_id, auto name) mutable { - return column_parent_ids[col_id] == row_array_parent_col_id - ? 
std::to_string(h_values_column_indices[col_id]) - : name; - }); - } - - auto to_json_col_type = [](auto category) { - switch (category) { - case NC_STRUCT: return json_col_t::StructColumn; - case NC_LIST: return json_col_t::ListColumn; - case NC_STR: [[fallthrough]]; - case NC_VAL: return json_col_t::StringColumn; - default: return json_col_t::Unknown; - } - }; - auto init_to_zero = [stream](auto& v) { - thrust::uninitialized_fill(rmm::exec_policy_nosync(stream), v.begin(), v.end(), 0); - }; - - auto initialize_json_columns = [&](auto i, auto& col, auto column_category) { - if (column_category == NC_ERR || column_category == NC_FN) { - return; - } else if (column_category == NC_VAL || column_category == NC_STR) { - col.string_offsets.resize(max_row_offsets[i] + 1, stream); - col.string_lengths.resize(max_row_offsets[i] + 1, stream); - init_to_zero(col.string_offsets); - init_to_zero(col.string_lengths); - } else if (column_category == NC_LIST) { - col.child_offsets.resize(max_row_offsets[i] + 2, stream); - init_to_zero(col.child_offsets); - } - col.num_rows = max_row_offsets[i] + 1; - col.validity = - cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr); - col.type = to_json_col_type(column_category); - }; - - auto reinitialize_as_string = [&](auto i, auto& col) { - col.string_offsets.resize(max_row_offsets[i] + 1, stream); - col.string_lengths.resize(max_row_offsets[i] + 1, stream); - init_to_zero(col.string_offsets); - init_to_zero(col.string_lengths); - col.num_rows = max_row_offsets[i] + 1; - col.validity = - cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr); - col.type = json_col_t::StringColumn; - // destroy references of all child columns after this step, by calling remove_child_columns - }; - - path_from_tree tree_path{column_categories, - column_parent_ids, - column_names, - is_array_of_arrays, - row_array_parent_col_id}; - - // 2. generate nested columns tree and its device_memory - // reorder unique_col_ids w.r.t. column_range_begin for order of column to be in field order. 
- auto h_range_col_id_it = - thrust::make_zip_iterator(column_range_beg.begin(), unique_col_ids.begin()); - std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { - return thrust::get<0>(a) < thrust::get<0>(b); - }); - - auto const is_str_column_all_nulls = [&, &column_tree = d_column_tree]() { - if (is_enabled_mixed_types_as_string) { - return cudf::detail::make_host_vector_sync( - is_all_nulls_each_column(input, column_tree, tree, col_ids, options, stream), stream); - } - return cudf::detail::make_empty_host_vector(0, stream); - }(); - - // use hash map because we may skip field name's col_ids - std::unordered_map> columns; - // map{parent_col_id, child_col_name}> = child_col_id, used for null value column tracking - std::map, NodeIndexT> mapped_columns; - // find column_ids which are values, but should be ignored in validity - auto ignore_vals = cudf::detail::make_host_vector(num_columns, stream); - std::vector is_mixed_type_column(num_columns, 0); - std::vector is_pruned(num_columns, 0); - // for columns that are not mixed type but have been forced as string - std::vector forced_as_string_column(num_columns); - columns.try_emplace(parent_node_sentinel, std::ref(root)); - - std::function remove_child_columns = - [&](NodeIndexT this_col_id, device_json_column& col) { - for (auto col_name : col.column_order) { - auto child_id = mapped_columns[{this_col_id, col_name}]; - is_mixed_type_column[child_id] = 1; - remove_child_columns(child_id, col.child_columns.at(col_name)); - mapped_columns.erase({this_col_id, col_name}); - columns.erase(child_id); - } - col.child_columns.clear(); // their references are deleted above. - col.column_order.clear(); - }; - - auto name_and_parent_index = [&is_array_of_arrays, - &row_array_parent_col_id, - &column_parent_ids, - &column_categories, - &column_names](auto this_col_id) { - std::string name = ""; - auto parent_col_id = column_parent_ids[this_col_id]; - if (parent_col_id == parent_node_sentinel || column_categories[parent_col_id] == NC_LIST) { - if (is_array_of_arrays && parent_col_id == row_array_parent_col_id) { - name = column_names[this_col_id]; - } else { - name = list_child_name; - } - } else if (column_categories[parent_col_id] == NC_FN) { - auto field_name_col_id = parent_col_id; - parent_col_id = column_parent_ids[parent_col_id]; - name = column_names[field_name_col_id]; - } else { - CUDF_FAIL("Unexpected parent column category"); - } - return std::pair{name, parent_col_id}; - }; - - // Prune columns that are not required to be parsed. - if (options.is_enabled_prune_columns()) { - for (auto const this_col_id : unique_col_ids) { - if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) { - continue; - } - // Struct, List, String, Value - auto [name, parent_col_id] = name_and_parent_index(this_col_id); - // get path of this column, and get its dtype if present in options - auto const nt = tree_path.get_path(this_col_id); - std::optional const user_dtype = get_path_data_type(nt, options); - if (!user_dtype.has_value() and parent_col_id != parent_node_sentinel) { - is_pruned[this_col_id] = 1; - continue; - } else { - // make sure all its parents are not pruned. - while (parent_col_id != parent_node_sentinel and is_pruned[parent_col_id] == 1) { - is_pruned[parent_col_id] = 0; - parent_col_id = column_parent_ids[parent_col_id]; - } - } - } - } - - // Build the column tree, also, handles mixed types. 
- for (auto const this_col_id : unique_col_ids) { - if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) { - continue; - } - // Struct, List, String, Value - auto [name, parent_col_id] = name_and_parent_index(this_col_id); - - // if parent is mixed type column or this column is pruned or if parent - // has been forced as string, ignore this column. - if (parent_col_id != parent_node_sentinel && - (is_mixed_type_column[parent_col_id] || is_pruned[this_col_id]) || - forced_as_string_column[parent_col_id]) { - ignore_vals[this_col_id] = 1; - if (is_mixed_type_column[parent_col_id]) { is_mixed_type_column[this_col_id] = 1; } - if (forced_as_string_column[parent_col_id]) { forced_as_string_column[this_col_id] = true; } - continue; - } - - // If the child is already found, - // replace if this column is a nested column and the existing was a value column - // ignore this column if this column is a value column and the existing was a nested column - auto it = columns.find(parent_col_id); - CUDF_EXPECTS(it != columns.end(), "Parent column not found"); - auto& parent_col = it->second.get(); - bool replaced = false; - if (mapped_columns.count({parent_col_id, name}) > 0) { - auto const old_col_id = mapped_columns[{parent_col_id, name}]; - // If mixed type as string is enabled, make both of them strings and merge them. - // All child columns will be ignored when parsing. - if (is_enabled_mixed_types_as_string) { - bool const is_mixed_type = [&]() { - // If new or old is STR and they are all not null, make it mixed type, else ignore. - if (column_categories[this_col_id] == NC_VAL || - column_categories[this_col_id] == NC_STR) { - if (is_str_column_all_nulls[this_col_id]) return false; - } - if (column_categories[old_col_id] == NC_VAL || column_categories[old_col_id] == NC_STR) { - if (is_str_column_all_nulls[old_col_id]) return false; - } - return true; - }(); - if (is_mixed_type) { - is_mixed_type_column[this_col_id] = 1; - is_mixed_type_column[old_col_id] = 1; - // if old col type (not cat) is list or struct, replace with string. - auto& col = columns.at(old_col_id).get(); - if (col.type == json_col_t::ListColumn or col.type == json_col_t::StructColumn) { - reinitialize_as_string(old_col_id, col); - remove_child_columns(old_col_id, col); - // all its children (which are already inserted) are ignored later. 
- } - col.forced_as_string_column = true; - columns.try_emplace(this_col_id, columns.at(old_col_id)); - continue; - } - } - - if (column_categories[this_col_id] == NC_VAL || column_categories[this_col_id] == NC_STR) { - ignore_vals[this_col_id] = 1; - continue; - } - if (column_categories[old_col_id] == NC_VAL || column_categories[old_col_id] == NC_STR) { - // remap - ignore_vals[old_col_id] = 1; - mapped_columns.erase({parent_col_id, name}); - columns.erase(old_col_id); - parent_col.child_columns.erase(name); - replaced = true; // to skip duplicate name in column_order - } else { - // If this is a nested column but we're trying to insert either (a) a list node into a - // struct column or (b) a struct node into a list column, we fail - CUDF_EXPECTS(not((column_categories[old_col_id] == NC_LIST and - column_categories[this_col_id] == NC_STRUCT) or - (column_categories[old_col_id] == NC_STRUCT and - column_categories[this_col_id] == NC_LIST)), - "A mix of lists and structs within the same column is not supported"); - } - } - - auto this_column_category = column_categories[this_col_id]; - // get path of this column, check if it is a struct/list forced as string, and enforce it - auto const nt = tree_path.get_path(this_col_id); - std::optional const user_dtype = get_path_data_type(nt, options); - if ((column_categories[this_col_id] == NC_STRUCT or - column_categories[this_col_id] == NC_LIST) and - user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) { - this_column_category = NC_STR; - } - - CUDF_EXPECTS(parent_col.child_columns.count(name) == 0, "duplicate column name: " + name); - // move into parent - device_json_column col(stream, mr); - initialize_json_columns(this_col_id, col, this_column_category); - if ((column_categories[this_col_id] == NC_STRUCT or - column_categories[this_col_id] == NC_LIST) and - user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) { - col.forced_as_string_column = true; - forced_as_string_column[this_col_id] = true; - } - - auto inserted = parent_col.child_columns.try_emplace(name, std::move(col)).second; - CUDF_EXPECTS(inserted, "child column insertion failed, duplicate column name in the parent"); - if (not replaced) parent_col.column_order.push_back(name); - columns.try_emplace(this_col_id, std::ref(parent_col.child_columns.at(name))); - mapped_columns.try_emplace(std::make_pair(parent_col_id, name), this_col_id); - } - - if (is_enabled_mixed_types_as_string) { - // ignore all children of mixed type columns - for (auto const this_col_id : unique_col_ids) { - auto parent_col_id = column_parent_ids[this_col_id]; - if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 1) { - is_mixed_type_column[this_col_id] = 1; - ignore_vals[this_col_id] = 1; - columns.erase(this_col_id); - } - // Convert only mixed type columns as string (so to copy), but not its children - if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 0 and - is_mixed_type_column[this_col_id] == 1) - column_categories[this_col_id] = NC_STR; - } - cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(), - column_categories.data(), - column_categories.size() * sizeof(column_categories[0]), - cudf::detail::host_memory_kind::PAGEABLE, - stream); - } - - // ignore all children of columns forced as string - for (auto const this_col_id : unique_col_ids) { - auto parent_col_id = column_parent_ids[this_col_id]; - if (parent_col_id != parent_node_sentinel and forced_as_string_column[parent_col_id]) { 
- forced_as_string_column[this_col_id] = true; - ignore_vals[this_col_id] = 1; - } - // Convert only mixed type columns as string (so to copy), but not its children - if (parent_col_id != parent_node_sentinel and not forced_as_string_column[parent_col_id] and - forced_as_string_column[this_col_id]) - column_categories[this_col_id] = NC_STR; - } - cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(), - column_categories.data(), - column_categories.size() * sizeof(column_categories[0]), - cudf::detail::host_memory_kind::PAGEABLE, - stream); - - // restore unique_col_ids order - std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { - return thrust::get<1>(a) < thrust::get<1>(b); - }); - // move columns data to device. - auto columns_data = cudf::detail::make_host_vector(num_columns, stream); - for (auto& [col_id, col_ref] : columns) { - if (col_id == parent_node_sentinel) continue; - auto& col = col_ref.get(); - columns_data[col_id] = json_column_data{col.string_offsets.data(), - col.string_lengths.data(), - col.child_offsets.data(), - static_cast(col.validity.data())}; - } - - auto d_ignore_vals = cudf::detail::make_device_uvector_async( - ignore_vals, stream, cudf::get_current_device_resource_ref()); - auto d_columns_data = cudf::detail::make_device_uvector_async( - columns_data, stream, cudf::get_current_device_resource_ref()); - - // 3. scatter string offsets to respective columns, set validity bits - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::counting_iterator(0), - num_nodes, - [column_categories = d_column_tree.node_categories.begin(), - col_ids = col_ids.begin(), - row_offsets = row_offsets.begin(), - range_begin = tree.node_range_begin.begin(), - range_end = tree.node_range_end.begin(), - d_ignore_vals = d_ignore_vals.begin(), - d_columns_data = d_columns_data.begin()] __device__(size_type i) { - if (d_ignore_vals[col_ids[i]]) return; - auto const node_category = column_categories[col_ids[i]]; - switch (node_category) { - case NC_STRUCT: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; - case NC_LIST: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; - case NC_STR: [[fallthrough]]; - case NC_VAL: - if (d_ignore_vals[col_ids[i]]) break; - set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); - d_columns_data[col_ids[i]].string_offsets[row_offsets[i]] = range_begin[i]; - d_columns_data[col_ids[i]].string_lengths[row_offsets[i]] = range_end[i] - range_begin[i]; - break; - default: break; - } - }); - - // 4. scatter List offset - // copy_if only node's whose parent is list, (node_id, parent_col_id) - // stable_sort by parent_col_id of {node_id}. - // For all unique parent_node_id of (i==0, i-1!=i), write start offset. - // (i==last, i+1!=i), write end offset. - // unique_copy_by_key {parent_node_id} {row_offset} to - // col[parent_col_id].child_offsets[row_offset[parent_node_id]] - - auto& parent_col_ids = sorted_col_ids; // reuse sorted_col_ids - auto parent_col_id = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - cuda::proclaim_return_type( - [col_ids = col_ids.begin(), - parent_node_ids = tree.parent_node_ids.begin()] __device__(size_type node_id) { - return parent_node_ids[node_id] == parent_node_sentinel ? 
parent_node_sentinel - : col_ids[parent_node_ids[node_id]]; - })); - auto const list_children_end = thrust::copy_if( - rmm::exec_policy(stream), - thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id), - thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id) + - num_nodes, - thrust::make_counting_iterator(0), - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()), - [d_ignore_vals = d_ignore_vals.begin(), - parent_node_ids = tree.parent_node_ids.begin(), - column_categories = d_column_tree.node_categories.begin(), - col_ids = col_ids.begin()] __device__(size_type node_id) { - auto parent_node_id = parent_node_ids[node_id]; - return parent_node_id != parent_node_sentinel and - column_categories[col_ids[parent_node_id]] == NC_LIST and - (!d_ignore_vals[col_ids[parent_node_id]]); - }); - - auto const num_list_children = - list_children_end - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()); - thrust::stable_sort_by_key(rmm::exec_policy(stream), - parent_col_ids.begin(), - parent_col_ids.begin() + num_list_children, - node_ids.begin()); - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - num_list_children, - [node_ids = node_ids.begin(), - parent_node_ids = tree.parent_node_ids.begin(), - parent_col_ids = parent_col_ids.begin(), - row_offsets = row_offsets.begin(), - d_columns_data = d_columns_data.begin(), - num_list_children] __device__(size_type i) { - auto const node_id = node_ids[i]; - auto const parent_node_id = parent_node_ids[node_id]; - // scatter to list_offset - if (i == 0 or parent_node_ids[node_ids[i - 1]] != parent_node_id) { - d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id]] = - row_offsets[node_id]; - } - // last value of list child_offset is its size. - if (i == num_list_children - 1 or parent_node_ids[node_ids[i + 1]] != parent_node_id) { - d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id] + 1] = - row_offsets[node_id] + 1; - } - }); - - // 5. scan on offsets. - for (auto& [id, col_ref] : columns) { - auto& col = col_ref.get(); - if (col.type == json_col_t::StringColumn) { - thrust::inclusive_scan(rmm::exec_policy_nosync(stream), - col.string_offsets.begin(), - col.string_offsets.end(), - col.string_offsets.begin(), - thrust::maximum{}); - } else if (col.type == json_col_t::ListColumn) { - thrust::inclusive_scan(rmm::exec_policy_nosync(stream), - col.child_offsets.begin(), - col.child_offsets.end(), - col.child_offsets.begin(), - thrust::maximum{}); - } - } - stream.synchronize(); -} - std::pair, std::vector> device_json_column_to_cudf_column( device_json_column& json_col, device_span d_input, diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 75639a0438f..83f71e657a7 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -299,22 +299,58 @@ get_array_children_indices(TreeDepthT row_array_children_level, device_span node_levels, device_span parent_node_ids, rmm::cuda_stream_view stream); + /** - * @brief Reduce node tree into column tree by aggregating each property of column. + * @brief Reduces node tree representation to column tree representation. 
 * - * @param tree json node tree to reduce (modified in-place, but restored to original state) - * @param col_ids column ids of each node (modified in-place, but restored to original state) - * @param row_offsets row offsets of each node (modified in-place, but restored to original state) - * @param stream The CUDA stream to which kernels are dispatched - * @return A tuple containing the column tree, identifier for each column and the maximum row index - * in each column + * @param tree Node tree representation of JSON string + * @param original_col_ids Column ids of nodes + * @param sorted_col_ids Sorted column ids of nodes + * @param ordered_node_ids Node ids of nodes sorted by column ids + * @param row_offsets Row offsets of nodes + * @param is_array_of_arrays Whether the tree is an array of arrays + * @param row_array_parent_col_id Column id of row array, if is_array_of_arrays is true + * @param stream CUDA stream used for device memory operations and kernel launches + * @return A tuple of column tree representation of JSON string, column ids of columns, and + * max row offsets of columns */ std::tuple, rmm::device_uvector> reduce_to_column_tree(tree_meta_t& tree, - device_span col_ids, + device_span original_col_ids, + device_span sorted_col_ids, + device_span ordered_node_ids, device_span row_offsets, + bool is_array_of_arrays, + NodeIndexT const row_array_parent_col_id, rmm::cuda_stream_view stream); - +/** + * @brief Constructs `d_json_column` from node tree representation + * Newly constructed columns are inserted into `root`'s children. + * `root` must be a list type. + * + * @param input Input JSON string device data + * @param tree Node tree representation of the JSON string + * @param col_ids Column ids of the nodes in the tree + * @param row_offsets Row offsets of the nodes in the tree + * @param root Root node of the `d_json_column` tree + * @param is_array_of_arrays Whether the tree is an array of arrays + * @param options Parsing options specifying the parsing behaviour + * options affecting behaviour are + * is_enabled_lines: Whether the input is a line-delimited JSON + * is_enabled_mixed_types_as_string: Whether to enable reading mixed types as string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the device memory + * of child_offsets and validity members of `d_json_column` + */ +void make_device_json_column(device_span input, + tree_meta_t& tree, + device_span col_ids, + device_span row_offsets, + device_json_column& root, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); /** * @brief Retrieves the parse_options to be used for type inference and type casting * diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu index 748691fb7d1..2ec23e0dc6d 100644 --- a/cpp/src/join/conditional_join.cu +++ b/cpp/src/join/conditional_join.cu @@ -27,7 +27,6 @@ #include #include #include -#include #include #include @@ -377,16 +376,12 @@ conditional_inner_join(table_view const& left, table_view const& right, ast::expression const& binary_predicate, std::optional output_size, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::conditional_join(left, - right, - binary_predicate, - detail::join_kind::INNER_JOIN, - output_size, - cudf::get_default_stream(), - mr); + return detail::conditional_join( + left, right, binary_predicate,
detail::join_kind::INNER_JOIN, output_size, stream, mr); } std::pair>, @@ -395,16 +390,12 @@ conditional_left_join(table_view const& left, table_view const& right, ast::expression const& binary_predicate, std::optional output_size, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::conditional_join(left, - right, - binary_predicate, - detail::join_kind::LEFT_JOIN, - output_size, - cudf::get_default_stream(), - mr); + return detail::conditional_join( + left, right, binary_predicate, detail::join_kind::LEFT_JOIN, output_size, stream, mr); } std::pair>, @@ -412,16 +403,12 @@ std::pair>, conditional_full_join(table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::conditional_join(left, - right, - binary_predicate, - detail::join_kind::FULL_JOIN, - {}, - cudf::get_default_stream(), - mr); + return detail::conditional_join( + left, right, binary_predicate, detail::join_kind::FULL_JOIN, {}, stream, mr); } std::unique_ptr> conditional_left_semi_join( @@ -429,16 +416,12 @@ std::unique_ptr> conditional_left_semi_join( table_view const& right, ast::expression const& binary_predicate, std::optional output_size, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::conditional_join_anti_semi(left, - right, - binary_predicate, - detail::join_kind::LEFT_SEMI_JOIN, - output_size, - cudf::get_default_stream(), - mr); + return detail::conditional_join_anti_semi( + left, right, binary_predicate, detail::join_kind::LEFT_SEMI_JOIN, output_size, stream, mr); } std::unique_ptr> conditional_left_anti_join( @@ -446,64 +429,56 @@ std::unique_ptr> conditional_left_anti_join( table_view const& right, ast::expression const& binary_predicate, std::optional output_size, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::conditional_join_anti_semi(left, - right, - binary_predicate, - detail::join_kind::LEFT_ANTI_JOIN, - output_size, - cudf::get_default_stream(), - mr); + return detail::conditional_join_anti_semi( + left, right, binary_predicate, detail::join_kind::LEFT_ANTI_JOIN, output_size, stream, mr); } std::size_t conditional_inner_join_size(table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::compute_conditional_join_output_size( - left, right, binary_predicate, detail::join_kind::INNER_JOIN, cudf::get_default_stream(), mr); + left, right, binary_predicate, detail::join_kind::INNER_JOIN, stream, mr); } std::size_t conditional_left_join_size(table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::compute_conditional_join_output_size( - left, right, binary_predicate, detail::join_kind::LEFT_JOIN, cudf::get_default_stream(), mr); + left, right, binary_predicate, detail::join_kind::LEFT_JOIN, stream, mr); } std::size_t conditional_left_semi_join_size(table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::compute_conditional_join_output_size(left, - right, - binary_predicate, - detail::join_kind::LEFT_SEMI_JOIN, - 
cudf::get_default_stream(), - mr); + return detail::compute_conditional_join_output_size( + left, right, binary_predicate, detail::join_kind::LEFT_SEMI_JOIN, stream, mr); } std::size_t conditional_left_anti_join_size(table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::compute_conditional_join_output_size(left, - right, - binary_predicate, - detail::join_kind::LEFT_ANTI_JOIN, - cudf::get_default_stream(), - mr); + return detail::compute_conditional_join_output_size( + left, right, binary_predicate, detail::join_kind::LEFT_ANTI_JOIN, stream, mr); } } // namespace cudf diff --git a/cpp/src/join/conditional_join.hpp b/cpp/src/join/conditional_join.hpp index 4f6a9484e8c..303442e79ef 100644 --- a/cpp/src/join/conditional_join.hpp +++ b/cpp/src/join/conditional_join.hpp @@ -19,7 +19,6 @@ #include #include -#include #include #include diff --git a/cpp/src/join/cross_join.cu b/cpp/src/join/cross_join.cu index eeb49736bac..15594fb60e3 100644 --- a/cpp/src/join/cross_join.cu +++ b/cpp/src/join/cross_join.cu @@ -25,7 +25,6 @@ #include #include #include -#include #include #include @@ -75,10 +74,11 @@ std::unique_ptr cross_join(cudf::table_view const& left, std::unique_ptr cross_join(cudf::table_view const& left, cudf::table_view const& right, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::cross_join(left, right, cudf::get_default_stream(), mr); + return detail::cross_join(left, right, stream, mr); } } // namespace cudf diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu index 0abff27667b..7b13c260364 100644 --- a/cpp/src/join/join.cu +++ b/cpp/src/join/join.cu @@ -20,7 +20,6 @@ #include #include #include -#include #include #include @@ -120,10 +119,11 @@ std::pair>, inner_join(table_view const& left, table_view const& right, null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::inner_join(left, right, compare_nulls, cudf::get_default_stream(), mr); + return detail::inner_join(left, right, compare_nulls, stream, mr); } std::pair>, @@ -131,10 +131,11 @@ std::pair>, left_join(table_view const& left, table_view const& right, null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::left_join(left, right, compare_nulls, cudf::get_default_stream(), mr); + return detail::left_join(left, right, compare_nulls, stream, mr); } std::pair>, @@ -142,10 +143,11 @@ std::pair>, full_join(table_view const& left, table_view const& right, null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::full_join(left, right, compare_nulls, cudf::get_default_stream(), mr); + return detail::full_join(left, right, compare_nulls, stream, mr); } } // namespace cudf diff --git a/cpp/src/join/mixed_join.cu b/cpp/src/join/mixed_join.cu index 8ff78dd47f4..820b81ee309 100644 --- a/cpp/src/join/mixed_join.cu +++ b/cpp/src/join/mixed_join.cu @@ -28,7 +28,6 @@ #include #include #include -#include #include #include @@ -484,6 +483,7 @@ mixed_inner_join( ast::expression const& binary_predicate, null_equality compare_nulls, std::optional>> const output_size_data, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -495,7 +495,7 @@ mixed_inner_join( compare_nulls, 
detail::join_kind::INNER_JOIN, output_size_data, - cudf::get_default_stream(), + stream, mr); } @@ -506,6 +506,7 @@ std::pair>> mixed_in table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -516,7 +517,7 @@ std::pair>> mixed_in binary_predicate, compare_nulls, detail::join_kind::INNER_JOIN, - cudf::get_default_stream(), + stream, mr); } @@ -530,6 +531,7 @@ mixed_left_join( ast::expression const& binary_predicate, null_equality compare_nulls, std::optional>> const output_size_data, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -541,7 +543,7 @@ mixed_left_join( compare_nulls, detail::join_kind::LEFT_JOIN, output_size_data, - cudf::get_default_stream(), + stream, mr); } @@ -552,6 +554,7 @@ std::pair>> mixed_le table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -562,7 +565,7 @@ std::pair>> mixed_le binary_predicate, compare_nulls, detail::join_kind::LEFT_JOIN, - cudf::get_default_stream(), + stream, mr); } @@ -576,6 +579,7 @@ mixed_full_join( ast::expression const& binary_predicate, null_equality compare_nulls, std::optional>> const output_size_data, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -587,7 +591,7 @@ mixed_full_join( compare_nulls, detail::join_kind::FULL_JOIN, output_size_data, - cudf::get_default_stream(), + stream, mr); } diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu index cfb785e242c..aa4fa281159 100644 --- a/cpp/src/join/mixed_join_semi.cu +++ b/cpp/src/join/mixed_join_semi.cu @@ -29,7 +29,6 @@ #include #include #include -#include #include #include @@ -267,6 +266,7 @@ std::unique_ptr> mixed_left_semi_join( table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -277,7 +277,7 @@ std::unique_ptr> mixed_left_semi_join( binary_predicate, compare_nulls, detail::join_kind::LEFT_SEMI_JOIN, - cudf::get_default_stream(), + stream, mr); } @@ -288,6 +288,7 @@ std::unique_ptr> mixed_left_anti_join( table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -298,7 +299,7 @@ std::unique_ptr> mixed_left_anti_join( binary_predicate, compare_nulls, detail::join_kind::LEFT_ANTI_JOIN, - cudf::get_default_stream(), + stream, mr); } diff --git a/cpp/src/join/semi_join.cu b/cpp/src/join/semi_join.cu index f69ded73e8d..d2ab2122c75 100644 --- a/cpp/src/join/semi_join.cu +++ b/cpp/src/join/semi_join.cu @@ -23,7 +23,6 @@ #include #include #include -#include #include #include @@ -98,22 +97,24 @@ std::unique_ptr> left_semi_join( cudf::table_view const& left, cudf::table_view const& right, null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::left_semi_anti_join( - detail::join_kind::LEFT_SEMI_JOIN, left, right, compare_nulls, cudf::get_default_stream(), mr); + detail::join_kind::LEFT_SEMI_JOIN, left, right, compare_nulls, stream, mr); } std::unique_ptr> left_anti_join( cudf::table_view const& left, cudf::table_view const& right, null_equality 
compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::left_semi_anti_join( - detail::join_kind::LEFT_ANTI_JOIN, left, right, compare_nulls, cudf::get_default_stream(), mr); + detail::join_kind::LEFT_ANTI_JOIN, left, right, compare_nulls, stream, mr); } } // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 1bedb344a01..288fa84a73d 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -687,10 +687,12 @@ ConfigureTest(STREAM_BINARYOP_TEST streams/binaryop_test.cpp STREAM_MODE testing ConfigureTest(STREAM_CONCATENATE_TEST streams/concatenate_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_COPYING_TEST streams/copying_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_CSVIO_TEST streams/io/csv_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_DATETIME_TEST streams/datetime_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_DICTIONARY_TEST streams/dictionary_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_GROUPBY_TEST streams/groupby_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_HASHING_TEST streams/hash_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_JOIN_TEST streams/join_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_JSONIO_TEST streams/io/json_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_LABELING_BINS_TEST streams/labeling_bins_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_LISTS_TEST streams/lists_test.cpp STREAM_MODE testing) diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index ab387a5c7f5..3431e941359 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -39,6 +39,8 @@ #include #include +#include + #include template @@ -60,6 +62,7 @@ template >, cudf::table_view const& left_keys, cudf::table_view const& right_keys, cudf::null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr), cudf::out_of_bounds_policy oob_policy = cudf::out_of_bounds_policy::DONT_CHECK> std::unique_ptr join_and_gather( @@ -68,12 +71,13 @@ std::unique_ptr join_and_gather( std::vector const& left_on, std::vector const& right_on, cudf::null_equality compare_nulls, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) { auto left_selected = left_input.select(left_on); auto right_selected = right_input.select(right_on); auto const [left_join_indices, right_join_indices] = - join_impl(left_selected, right_selected, compare_nulls, mr); + join_impl(left_selected, right_selected, compare_nulls, stream, mr); auto left_indices_span = cudf::device_span{*left_join_indices}; auto right_indices_span = cudf::device_span{*right_join_indices}; @@ -2027,7 +2031,11 @@ struct JoinTestLists : public cudf::test::BaseFixture { auto const probe_tv = cudf::table_view{{probe}}; auto const [left_result_map, right_result_map] = - join_func(build_tv, probe_tv, nulls_equal, cudf::get_current_device_resource_ref()); + join_func(build_tv, + probe_tv, + nulls_equal, + cudf::get_default_stream(), + cudf::get_current_device_resource_ref()); auto const left_result_table = sort_and_gather(build_tv, column_view_from_device_uvector(*left_result_map), oob_policy); diff --git a/cpp/tests/join/semi_anti_join_tests.cpp b/cpp/tests/join/semi_anti_join_tests.cpp index 3e279260b99..554d5754e39 100644 --- a/cpp/tests/join/semi_anti_join_tests.cpp +++ 
b/cpp/tests/join/semi_anti_join_tests.cpp @@ -28,8 +28,11 @@ #include #include #include +#include #include +#include + #include template @@ -51,6 +54,7 @@ template > (*join_impl)( cudf::table_view const& left_keys, cudf::table_view const& right_keys, cudf::null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)> std::unique_ptr join_and_gather( cudf::table_view const& left_input, @@ -58,11 +62,12 @@ std::unique_ptr join_and_gather( std::vector const& left_on, std::vector const& right_on, cudf::null_equality compare_nulls, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) { auto left_selected = left_input.select(left_on); auto right_selected = right_input.select(right_on); - auto const join_indices = join_impl(left_selected, right_selected, compare_nulls, mr); + auto const join_indices = join_impl(left_selected, right_selected, compare_nulls, stream, mr); auto left_indices_span = cudf::device_span{*join_indices}; auto left_indices_col = cudf::column_view{left_indices_span}; diff --git a/cpp/tests/streams/datetime_test.cpp b/cpp/tests/streams/datetime_test.cpp new file mode 100644 index 00000000000..82629156fa6 --- /dev/null +++ b/cpp/tests/streams/datetime_test.cpp @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include + +#include + +class DatetimeTest : public cudf::test::BaseFixture { + public: + cudf::test::fixed_width_column_wrapper timestamps{ + -23324234, // 1969-12-31 23:59:59.976675766 GMT + 23432424, // 1970-01-01 00:00:00.023432424 GMT + 987234623 // 1970-01-01 00:00:00.987234623 GMT + }; + cudf::test::fixed_width_column_wrapper months{{1, -1, 3}}; +}; + +TEST_F(DatetimeTest, ExtractYear) +{ + cudf::datetime::extract_year(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractMonth) +{ + cudf::datetime::extract_month(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractDay) +{ + cudf::datetime::extract_day(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractWeekday) +{ + cudf::datetime::extract_weekday(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractHour) +{ + cudf::datetime::extract_hour(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractMinute) +{ + cudf::datetime::extract_minute(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractSecond) +{ + cudf::datetime::extract_second(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractMillisecondFraction) +{ + cudf::datetime::extract_millisecond_fraction(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractMicrosecondFraction) +{ + cudf::datetime::extract_microsecond_fraction(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractNanosecondFraction) +{ + cudf::datetime::extract_nanosecond_fraction(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, LastDayOfMonth) +{ + cudf::datetime::last_day_of_month(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, DayOfYear) +{ + cudf::datetime::day_of_year(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, AddCalendricalMonths) +{ + cudf::datetime::add_calendrical_months(timestamps, months, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, AddCalendricalMonthsScalar) +{ + auto scalar = cudf::make_fixed_width_scalar(1, cudf::test::get_default_stream()); + + cudf::datetime::add_calendrical_months(timestamps, *scalar, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, IsLeapYear) +{ + cudf::datetime::is_leap_year(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, DaysInMonth) +{ + cudf::datetime::days_in_month(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractQuarter) +{ + cudf::datetime::extract_quarter(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, CeilDatetimes) +{ + cudf::datetime::ceil_datetimes( + timestamps, cudf::datetime::rounding_frequency::HOUR, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, FloorDatetimes) +{ + cudf::datetime::floor_datetimes( + timestamps, cudf::datetime::rounding_frequency::HOUR, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, RoundDatetimes) +{ + cudf::datetime::round_datetimes( + timestamps, cudf::datetime::rounding_frequency::HOUR, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/join_test.cpp b/cpp/tests/streams/join_test.cpp new file mode 100644 index 00000000000..2811bb676fa --- /dev/null +++ b/cpp/tests/streams/join_test.cpp @@ -0,0 +1,219 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +class JoinTest : public cudf::test::BaseFixture { + static inline cudf::table make_table() + { + cudf::test::fixed_width_column_wrapper col0{{3, 1, 2, 0, 3}}; + cudf::test::strings_column_wrapper col1{{"s0", "s1", "s2", "s4", "s1"}}; + cudf::test::fixed_width_column_wrapper col2{{0, 1, 2, 4, 1}}; + + std::vector> columns; + columns.push_back(col0.release()); + columns.push_back(col1.release()); + columns.push_back(col2.release()); + + return cudf::table{std::move(columns)}; + } + + public: + cudf::table table0{make_table()}; + cudf::table table1{make_table()}; + cudf::table conditional0{make_table()}; + cudf::table conditional1{make_table()}; + cudf::ast::column_reference col_ref_left_0{0}; + cudf::ast::column_reference col_ref_right_0{0, cudf::ast::table_reference::RIGHT}; + cudf::ast::operation left_zero_eq_right_zero{ + cudf::ast::ast_operator::EQUAL, col_ref_left_0, col_ref_right_0}; +}; + +TEST_F(JoinTest, InnerJoin) +{ + cudf::inner_join(table0, table1, cudf::null_equality::EQUAL, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, LeftJoin) +{ + cudf::left_join(table0, table1, cudf::null_equality::EQUAL, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, FullJoin) +{ + cudf::full_join(table0, table1, cudf::null_equality::EQUAL, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, LeftSemiJoin) +{ + cudf::left_semi_join( + table0, table1, cudf::null_equality::EQUAL, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, LeftAntiJoin) +{ + cudf::left_anti_join( + table0, table1, cudf::null_equality::EQUAL, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, CrossJoin) { cudf::cross_join(table0, table1, cudf::test::get_default_stream()); } + +TEST_F(JoinTest, ConditionalInnerJoin) +{ + cudf::conditional_inner_join( + table0, table1, left_zero_eq_right_zero, std::nullopt, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, ConditionalLeftJoin) +{ + cudf::conditional_left_join( + table0, table1, left_zero_eq_right_zero, std::nullopt, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, ConditionalFullJoin) +{ + cudf::conditional_full_join( + table0, table1, left_zero_eq_right_zero, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, ConditionalLeftSemiJoin) +{ + cudf::conditional_left_semi_join( + table0, table1, left_zero_eq_right_zero, std::nullopt, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, ConditionalLeftAntiJoin) +{ + cudf::conditional_left_anti_join( + table0, table1, left_zero_eq_right_zero, std::nullopt, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, MixedInnerJoin) +{ + cudf::mixed_inner_join(table0, + table1, + conditional0, + conditional1, + left_zero_eq_right_zero, + cudf::null_equality::EQUAL, + std::nullopt, + cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, MixedLeftJoin) +{ + cudf::mixed_left_join(table0, + 
table1, + conditional0, + conditional1, + left_zero_eq_right_zero, + cudf::null_equality::EQUAL, + std::nullopt, + cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, MixedFullJoin) +{ + cudf::mixed_full_join(table0, + table1, + conditional0, + conditional1, + left_zero_eq_right_zero, + cudf::null_equality::EQUAL, + std::nullopt, + cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, MixedLeftSemiJoin) +{ + cudf::mixed_left_semi_join(table0, + table1, + conditional0, + conditional1, + left_zero_eq_right_zero, + cudf::null_equality::EQUAL, + cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, MixedLeftAntiJoin) +{ + cudf::mixed_left_anti_join(table0, + table1, + conditional0, + conditional1, + left_zero_eq_right_zero, + cudf::null_equality::EQUAL, + cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, MixedInnerJoinSize) +{ + cudf::mixed_inner_join_size(table0, + table1, + conditional0, + conditional1, + left_zero_eq_right_zero, + cudf::null_equality::EQUAL, + cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, MixedLeftJoinSize) +{ + cudf::mixed_left_join_size(table0, + table1, + conditional0, + conditional1, + left_zero_eq_right_zero, + cudf::null_equality::EQUAL, + cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, ConditionalInnerJoinSize) +{ + cudf::conditional_inner_join_size( + table0, table1, left_zero_eq_right_zero, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, ConditionalLeftJoinSize) +{ + cudf::conditional_left_join_size( + table0, table1, left_zero_eq_right_zero, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, ConditionalLeftSemiJoinSize) +{ + cudf::conditional_left_semi_join_size( + table0, table1, left_zero_eq_right_zero, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, ConditionalLeftAntiJoinSize) +{ + cudf::conditional_left_anti_join_size( + table0, table1, left_zero_eq_right_zero, cudf::test::get_default_stream()); +} diff --git a/dependencies.yaml b/dependencies.yaml index 7a13043cc5f..2f2d7ba679e 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -650,7 +650,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - polars>=1.0,<1.3 + - polars>=1.6 run_dask_cudf: common: - output_types: [conda, requirements, pyproject] diff --git a/docs/cudf/source/_static/Polars_GPU_speedup_80GB.png b/docs/cudf/source/_static/Polars_GPU_speedup_80GB.png new file mode 100644 index 00000000000..e472cf66612 Binary files /dev/null and b/docs/cudf/source/_static/Polars_GPU_speedup_80GB.png differ diff --git a/docs/cudf/source/_static/compute_heavy_queries_polars.png b/docs/cudf/source/_static/compute_heavy_queries_polars.png new file mode 100644 index 00000000000..6854ed5a436 Binary files /dev/null and b/docs/cudf/source/_static/compute_heavy_queries_polars.png differ diff --git a/docs/cudf/source/_static/pds_benchmark_polars.png b/docs/cudf/source/_static/pds_benchmark_polars.png new file mode 100644 index 00000000000..d0b48ab2901 Binary files /dev/null and b/docs/cudf/source/_static/pds_benchmark_polars.png differ diff --git a/docs/cudf/source/cudf_polars/index.rst b/docs/cudf/source/cudf_polars/index.rst new file mode 100644 index 00000000000..cc7aabd124f --- /dev/null +++ b/docs/cudf/source/cudf_polars/index.rst @@ -0,0 +1,41 @@ +cuDF-based GPU backend for Polars [Open Beta] +============================================= + +cuDF supports an in-memory, GPU-accelerated execution engine for Python users of the Polars Lazy API. 
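+For example, a lazy query can opt in to the GPU engine when it is collected
+(a minimal sketch; the file and column names below are placeholders)::
+
+    import polars as pl
+
+    q = (
+        pl.scan_parquet("transactions.parquet")
+        .group_by("account")
+        .agg(pl.col("amount").sum())
+    )
+    result = q.collect(engine="gpu")  # unsupported plans fall back to the CPU engine
+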
+The engine supports most of the core expressions and data types as well as a growing set of more advanced dataframe manipulations +and data file formats. When using the GPU engine, Polars will convert expressions into an optimized query plan and determine +whether the plan is supported on the GPU. If it is not, the execution will transparently fall back to the standard Polars engine +and run on the CPU. + +Benchmark +--------- +We reproduced the `Polars Decision Support (PDS) `__ benchmark to compare Polars GPU engine with the default CPU settings across several dataset sizes. Here are the results: + +.. figure:: ../_static/pds_benchmark_polars.png + :width: 600px + + + +You can see up to 13x speedup using the GPU backend on the compute-heavy PDS queries involving complex aggregation and join operations. Below are the speedups for the top performing queries: + + +.. figure:: ../_static/compute_heavy_queries_polars.png + :width: 1000px + +:emphasis:`PDS-H benchmark | GPU: NVIDIA H100 PCIe | CPU: Intel Xeon W9-3495X (Sapphire Rapids) | Storage: Local NVMe` + +You can reproduce the results by visiting the `Polars Decision Support (PDS) GitHub repository `__. + +Learn More +---------- + +The GPU backend for Polars is now available in Open Beta and the engine is undergoing rapid development. To learn more, visit the `GPU Support page `__ on the Polars website. + +Launch on Google Colab +---------------------- + +.. figure:: ../_static/colab.png + :width: 200px + :target: https://colab.research.google.com/github/rapidsai-community/showcase/blob/main/accelerated_data_processing_examples/polars_gpu_engine_demo.ipynb + + Take the cuDF backend for Polars for a test-drive in a free GPU-enabled notebook environment using your Google account by `launching on Colab `__. diff --git a/docs/cudf/source/index.rst b/docs/cudf/source/index.rst index 3b8dfa5fe01..1b86cafeb48 100644 --- a/docs/cudf/source/index.rst +++ b/docs/cudf/source/index.rst @@ -29,5 +29,6 @@ other operations. user_guide/index cudf_pandas/index + cudf_polars/index libcudf_docs/index developer_guide/index diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst index 2518afc80a7..003e7c0c35e 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst @@ -14,3 +14,4 @@ strings repeat replace slice + strip diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/strip.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/strip.rst new file mode 100644 index 00000000000..a79774b8e67 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/strip.rst @@ -0,0 +1,6 @@ +===== +strip +===== + +.. automodule:: pylibcudf.strings.strip + :members: diff --git a/docs/dask_cudf/source/best_practices.rst b/docs/dask_cudf/source/best_practices.rst new file mode 100644 index 00000000000..142124163af --- /dev/null +++ b/docs/dask_cudf/source/best_practices.rst @@ -0,0 +1,320 @@ +.. _best-practices: + +Dask cuDF Best Practices +======================== + +This page outlines several important guidelines for using `Dask cuDF +`__ effectively. + +.. note:: + Since Dask cuDF is a backend extension for + `Dask DataFrame `__, + the guidelines discussed in the `Dask DataFrames Best Practices + `__ + documentation also apply to Dask cuDF (excluding any pandas-specific + details). 
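+
+As a rough sketch (not a tuned configuration; the path and sizes below are
+placeholders), several of the guidelines described on this page are usually
+combined into a single setup: a Dask-CUDA cluster with cuDF spilling and an
+RMM pool, plus the ``"cudf"`` dataframe backend::
+
+    import dask
+    import dask.dataframe as dd
+    from dask.distributed import Client
+    from dask_cuda import LocalCUDACluster
+
+    # Use cuDF-backed DataFrame collections by default
+    dask.config.set({"dataframe.backend": "cudf"})
+
+    # One worker per visible GPU, with cuDF spilling and an RMM pool enabled
+    cluster = LocalCUDACluster(enable_cudf_spill=True, rmm_pool_size=0.9)
+    client = Client(cluster)
+
+    # Partition size is controlled at creation time (see "Reading Data" below)
+    df = dd.read_parquet("dataset/", blocksize="1GiB")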
+ + +Deployment and Configuration +---------------------------- + +Use Dask-CUDA +~~~~~~~~~~~~~ + +To execute a Dask workflow on multiple GPUs, a Dask cluster must +be deployed with `Dask-CUDA `__ +and `Dask.distributed `__. + +When running on a single machine, the `LocalCUDACluster `__ +convenience function is strongly recommended. No matter how many GPUs are +available on the machine (even one!), using `Dask-CUDA has many advantages +`__ +over default (threaded) execution. Just to list a few: + +* Dask-CUDA makes it easy to pin workers to specific devices. +* Dask-CUDA makes it easy to configure memory-spilling options. +* The distributed scheduler collects useful diagnostic information that can be viewed on a dashboard in real time. + +Please see `Dask-CUDA's API `__ +and `Best Practices `__ +documentation for detailed information. Typical ``LocalCUDACluster`` usage +is also illustrated within the multi-GPU section of `Dask cuDF's +`__ documentation. + +.. note:: + When running on cloud infrastructure or HPC systems, it is usually best to + leverage system-specific deployment libraries like `Dask Operator + `__ and `Dask-Jobqueue + `__. + + Please see `the RAPIDS deployment documentation `__ + for further details and examples. + + +Use diagnostic tools +~~~~~~~~~~~~~~~~~~~~ + +The Dask ecosystem includes several diagnostic tools that you should absolutely use. +These tools include an intuitive `browser dashboard +`__ as well as a dedicated +`API for collecting performance profiles +`__. + +No matter the workflow, using the dashboard is strongly recommended. +It provides a visual representation of the worker resources and compute +progress. It also shows basic GPU memory and utilization metrics (under +the ``GPU`` tab). To visualize more detailed GPU metrics in JupyterLab, +use `NVDashboard `__. + + +Enable cuDF spilling +~~~~~~~~~~~~~~~~~~~~ + +When using Dask cuDF for classic ETL workloads, it is usually best +to enable `native spilling support in cuDF +`__. +When using :func:`LocalCUDACluster`, this is easily accomplished by +setting ``enable_cudf_spill=True``. + +When a Dask cuDF workflow includes conversion between DataFrame and Array +representations, native cuDF spilling may be insufficient. For these cases, +`JIT-unspill `__ +is likely to produce better protection from out-of-memory (OOM) errors. +Please see `Dask-CUDA's spilling documentation +`__ for further details +and guidance. + +Use RMM +~~~~~~~ + +Memory allocations in cuDF are significantly faster and more efficient when +the `RAPIDS Memory Manager (RMM) `__ +library is configured appropriately on worker processes. In most cases, the best way to manage +memory is by initializing an RMM pool on each worker before executing a +workflow. When using :func:`LocalCUDACluster`, this is easily accomplished +by setting ``rmm_pool_size`` to a large fraction (e.g. ``0.9``). + +See the `Dask-CUDA memory-management documentation +`__ +for more details. + +Use the Dask DataFrame API +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Although Dask cuDF provides a public ``dask_cudf`` Python module, we +strongly recommend that you use the CPU/GPU portable ``dask.dataframe`` +API instead. Simply `use the Dask configuration system +`__ +to set the ``"dataframe.backend"`` option to ``"cudf"``, and the +``dask_cudf`` module will be imported and used implicitly. + +Be sure to use the :func:`to_backend` method if you need to convert +between the different DataFrame backends.
For example:: + + df = df.to_backend("pandas") # This gives us a pandas-backed collection + +.. note:: + Although :func:`to_backend` makes it easy to move data between pandas + and cuDF, repetitive CPU-GPU data movement can degrade performance + significantly. For optimal results, keep your data on the GPU as much + as possible. + +Avoid eager execution +~~~~~~~~~~~~~~~~~~~~~ + +Although Dask DataFrame collections are lazy by default, there are several +notable methods that will result in the immediate execution of the +underlying task graph: + +:func:`compute`: Calling ``ddf.compute()`` will materialize the result of +``ddf`` and return a single cuDF object. This is done by executing the entire +task graph associated with ``ddf`` and concatenating its partitions in +local memory on the client process. + +.. note:: + Never call :func:`compute` on a large collection that cannot fit comfortably + in the memory of a single GPU! + +:func:`persist`: Like :func:`compute`, calling ``ddf.persist()`` will +execute the entire task graph associated with ``ddf``. The most important +difference is that the computed partitions will remain in distributed +worker memory instead of being concatenated together on the client process. +Another difference is that :func:`persist` will return immediately when +executing on a distributed cluster. If you need a blocking synchronization +point in your workflow, simply use the :func:`wait` function:: + + ddf = ddf.persist() + wait(ddf) + +.. note:: + Avoid calling :func:`persist` on a large collection that cannot fit comfortably + in global worker memory. If the total sum of the partition sizes is larger + than the sum of all GPU memory, calling persist will result in significant + spilling from device memory. If the individual partition sizes are large, this + is likely to produce an OOM error. + +:func:`len` / :func:`head` / :func:`tail`: Although these operations are used +often within pandas/cuDF code to quickly inspect data, it is best to avoid +them in Dask DataFrame. In most cases, these operations will execute some or all +of the underlying task graph to materialize the collection. + +:func:`sort_values` / :func:`set_index` : These operations both require Dask to +eagerly collect quantile information about the column(s) being targeted by the +global sort operation. See `Avoid Sorting`__ for notes on sorting considerations. + +.. note:: + When using :func:`set_index`, be sure to pass in ``sort=False`` whenever the + global collection does not **need** to be sorted by the new index. + +Avoid Sorting +~~~~~~~~~~~~~ + +`The design of Dask DataFrame `__ +makes it advantageous to work with data that is already sorted along its index at +creation time. For most other cases, it is best to avoid sorting unless the logic +of the workflow makes global ordering absolutely necessary. + +If the purpose of a :func:`sort_values` operation is to ensure that all unique +values in ``by`` will be moved to the same output partition, then `shuffle +`__ +is often the better option. + + +Reading Data +------------ + +Tune the partition size +~~~~~~~~~~~~~~~~~~~~~~~ + +The ideal partition size is usually between 1/32 and 1/8 the memory +capacity of a single GPU. Increasing the partition size will typically +reduce the number of tasks in your workflow and improve the GPU utilization +for each task. However, if the partitions are too large, the risk of OOM +errors can become significant. + +.. 
note:: + As a general rule of thumb, start with 1/32-1/16 for shuffle-intensive workflows + (e.g. large-scale sorting and joining), and 1/16-1/8 otherwise. For pathologically + skewed data distributions, it may be necessary to target 1/64 or smaller. + This rule of thumb comes from anecdotal optimization and OOM-debugging + experience. Since every workflow is different, choosing the best partition + size is both an art and a science. + +The easiest way to tune the partition size is when the DataFrame collection +is first created by a function like :func:`read_parquet`, :func:`read_csv`, +or :func:`from_map`. For example, both :func:`read_parquet` and :func:`read_csv` +expose a ``blocksize`` argument for adjusting the maximum partition size. + +If the partition size cannot be tuned effectively at creation time, the +`repartition `__ +method can be used as a last resort. + + +Use Parquet +~~~~~~~~~~~ + +`Parquet `__ is the recommended +file format for Dask cuDF. It provides efficient columnar storage and enables +Dask to perform valuable query optimizations like column projection and +predicate pushdown. + +The most important arguments to :func:`read_parquet` are ``blocksize`` and +``aggregate_files``: + +``blocksize``: Use this argument to specify the maximum partition size. +The default is `"256 MiB"`, but larger values are usually more performant +on GPUs with more than 8 GiB of memory. Dask will use the ``blocksize`` +value to map a discrete number of Parquet row-groups (or files) to each +output partition. This mapping will only account for the uncompressed +storage size of each row group, which is usually smaller than the +corresponding ``cudf.DataFrame``. + +``aggregate_files``: Use this argument to specify whether Dask should +map multiple files to the same DataFrame partition. The default is +``False``, but ``aggregate_files=True`` is usually more performant when +the dataset contains many files that are smaller than half of ``blocksize``. + +If you know that your files correspond to a reasonable partition size +before splitting or aggregation, set ``blocksize=None`` to disallow +file splitting. In the absence of column-projection pushdown, this will +result in a simple 1-to-1 mapping between files and output partitions. + +.. note:: + If your workflow requires a strict 1-to-1 mapping between files and + partitions, use :func:`from_map` to manually construct your partitions + with ``cudf.read_parquet``. When :func:`dd.read_parquet` is used, + query-planning optimizations may automatically aggregate distinct files + into the same partition (even when ``aggregate_files=False``). + +.. note:: + Metadata collection can be extremely slow when reading from remote + storage (e.g. S3 and GCS). When reading many remote files that all + correspond to a reasonable partition size, use ``blocksize=None`` + to avoid unnecessary metadata collection. + + +Use :func:`from_map` +~~~~~~~~~~~~~~~~~~~~ + +To implement custom DataFrame-creation logic that is not covered by +existing APIs (like :func:`read_parquet`), use :func:`dask.dataframe.from_map` +whenever possible. The :func:`from_map` API has several advantages +over :func:`from_delayed`: + +* It allows proper lazy execution of your custom logic +* It enables column projection (as long as the mapped function supports a ``columns`` keyword argument) + +See the `from_map API documentation `__ +for more details. + +.. note:: + Whenever possible, be sure to specify the ``meta`` argument to + :func:`from_map`.
If this argument is excluded, Dask will need to + materialize the first partition eagerly. If a large RMM pool is in + use on the first visible device, this eager execution on the client + may lead to an OOM error. + + +Sorting, Joining, and Grouping +------------------------------ + +Sorting, joining, and grouping operations all have the potential to +require the global shuffling of data between distinct partitions. +When the initial data fits comfortably in global GPU memory, these +"all-to-all" operations are typically bound by worker-to-worker +communication. When the data is larger than global GPU memory, the +bottleneck is typically device-to-host memory spilling. + +Although every workflow is different, the following guidelines +are often recommended: + +* `Use a distributed cluster with Dask-CUDA workers `_ +* `Use native cuDF spilling whenever possible `_ +* Avoid shuffling whenever possible + * Use ``split_out=1`` for low-cardinality groupby aggregations + * Use ``broadcast=True`` for joins when at least one collection comprises a small number of partitions (e.g. ``<=5``) +* `Use UCX `__ if communication is a bottleneck. + +.. note:: + UCX enables Dask-CUDA workers to communicate using high-performance + transport technologies like `NVLink `__ + and Infiniband. Without UCX, inter-process communication will rely + on TCP sockets. + + +User-defined functions +---------------------- + +Most real-world Dask DataFrame workflows use `map_partitions +`__ +to map user-defined functions across every partition of the underlying data. +This API is a fantastic way to apply custom operations in an intuitive and +scalable way. With that said, the :func:`map_partitions` method will produce +an opaque DataFrame expression that blocks the query-planning `optimizer +`__ from performing +useful optimizations (like projection and filter pushdown). + +Since column-projection pushdown is often the most effective optimization, +it is important to select the necessary columns both before and after calling +:func:`map_partitions`. You can also add explicit filter operations to further +mitigate the loss of filter pushdown. diff --git a/docs/dask_cudf/source/index.rst b/docs/dask_cudf/source/index.rst index 7fe6cbd45fa..23ca7e49753 100644 --- a/docs/dask_cudf/source/index.rst +++ b/docs/dask_cudf/source/index.rst @@ -15,7 +15,7 @@ as the ``"cudf"`` dataframe backend for .. note:: Neither Dask cuDF nor Dask DataFrame provide support for multi-GPU or multi-node execution on their own. You must also deploy a - `dask.distributed ` cluster + `dask.distributed `__ cluster to leverage multiple GPUs. We strongly recommend using `Dask-CUDA `__ to simplify the setup of the cluster, taking advantage of all features of the GPU @@ -29,6 +29,10 @@ minutes to Dask by `10 minutes to cuDF and Dask cuDF `__. +After reviewing the sections below, please see the +:ref:`Best Practices ` page for further guidance on +using Dask cuDF effectively. + Using Dask cuDF --------------- @@ -36,7 +40,7 @@ Using Dask cuDF The Dask DataFrame API (Recommended) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Simply use the `Dask configuration ` system to +Simply use the `Dask configuration `__ system to set the ``"dataframe.backend"`` option to ``"cudf"``. From Python, this can be achieved like so:: @@ -50,14 +54,14 @@ environment before running your code.
Once this is done, the public Dask DataFrame API will leverage ``cudf`` automatically when a new DataFrame collection is created from an on-disk format using any of the following ``dask.dataframe`` -functions:: +functions: -* :func:`dask.dataframe.read_parquet` -* :func:`dask.dataframe.read_json` -* :func:`dask.dataframe.read_csv` -* :func:`dask.dataframe.read_orc` -* :func:`dask.dataframe.read_hdf` -* :func:`dask.dataframe.from_dict` +* :func:`read_parquet` +* :func:`read_json` +* :func:`read_csv` +* :func:`read_orc` +* :func:`read_hdf` +* :func:`from_dict` For example:: @@ -112,8 +116,8 @@ performance benefit over the CPU/GPU-portable ``dask.dataframe`` API. Also, using some parts of the explicit API are incompatible with automatic query planning (see the next section). -The explicit Dask cuDF API -~~~~~~~~~~~~~~~~~~~~~~~~~~ +Query Planning +~~~~~~~~~~~~~~ Dask cuDF now provides automatic query planning by default (RAPIDS 24.06+). As long as the ``"dataframe.query-planning"`` configuration is set to diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index e27c595edda..99e4c21df8a 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -599,7 +599,6 @@ cdef class Column: children=tuple(children) ) - # TODO: Actually support exposed data pointers. @staticmethod def from_pylibcudf( col, bint data_ptr_exposed=False @@ -616,7 +615,7 @@ cdef class Column: col : pylibcudf.Column The object to copy. data_ptr_exposed : bool - This parameter is not yet supported + Whether the data buffer is exposed. Returns ------- @@ -639,16 +638,18 @@ cdef class Column: dtype = dtype_from_pylibcudf_column(col) return cudf.core.column.build_column( - data=as_buffer(col.data().obj) if col.data() is not None else None, + data=as_buffer( + col.data().obj, exposed=data_ptr_exposed + ) if col.data() is not None else None, dtype=dtype, size=col.size(), mask=as_buffer( - col.null_mask().obj + col.null_mask().obj, exposed=data_ptr_exposed ) if col.null_mask() is not None else None, offset=col.offset(), null_count=col.null_count(), children=tuple([ - Column.from_pylibcudf(child) + Column.from_pylibcudf(child, data_ptr_exposed=data_ptr_exposed) for child in col.children() ]) ) diff --git a/python/cudf/cudf/_lib/datetime.pyx b/python/cudf/cudf/_lib/datetime.pyx index 483250dd36f..bc5e085ec39 100644 --- a/python/cudf/cudf/_lib/datetime.pyx +++ b/python/cudf/cudf/_lib/datetime.pyx @@ -17,6 +17,8 @@ from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column from cudf._lib.scalar cimport DeviceScalar +import pylibcudf as plc + @acquire_spill_lock() def add_months(Column col, Column months): @@ -38,43 +40,9 @@ def add_months(Column col, Column months): @acquire_spill_lock() def extract_datetime_component(Column col, object field): - - cdef unique_ptr[column] c_result - cdef column_view col_view = col.view() - - with nogil: - if field == "year": - c_result = move(libcudf_datetime.extract_year(col_view)) - elif field == "month": - c_result = move(libcudf_datetime.extract_month(col_view)) - elif field == "day": - c_result = move(libcudf_datetime.extract_day(col_view)) - elif field == "weekday": - c_result = move(libcudf_datetime.extract_weekday(col_view)) - elif field == "hour": - c_result = move(libcudf_datetime.extract_hour(col_view)) - elif field == "minute": - c_result = move(libcudf_datetime.extract_minute(col_view)) - elif field == "second": - c_result = move(libcudf_datetime.extract_second(col_view)) - elif field == "millisecond": - 
c_result = move( - libcudf_datetime.extract_millisecond_fraction(col_view) - ) - elif field == "microsecond": - c_result = move( - libcudf_datetime.extract_microsecond_fraction(col_view) - ) - elif field == "nanosecond": - c_result = move( - libcudf_datetime.extract_nanosecond_fraction(col_view) - ) - elif field == "day_of_year": - c_result = move(libcudf_datetime.day_of_year(col_view)) - else: - raise ValueError(f"Invalid datetime field: '{field}'") - - result = Column.from_unique_ptr(move(c_result)) + result = Column.from_pylibcudf( + plc.datetime.extract_datetime_component(col.to_pylibcudf(mode="read"), field) + ) if field == "weekday": # Pandas counts Monday-Sunday as 0-6 diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx index 8d463829a19..60a6795a402 100644 --- a/python/cudf/cudf/_lib/string_casting.pyx +++ b/python/cudf/cudf/_lib/string_casting.pyx @@ -20,13 +20,7 @@ from pylibcudf.libcudf.strings.convert.convert_booleans cimport ( to_booleans as cpp_to_booleans, ) from pylibcudf.libcudf.strings.convert.convert_datetime cimport ( - from_timestamps as cpp_from_timestamps, is_timestamp as cpp_is_timestamp, - to_timestamps as cpp_to_timestamps, -) -from pylibcudf.libcudf.strings.convert.convert_durations cimport ( - from_durations as cpp_from_durations, - to_durations as cpp_to_durations, ) from pylibcudf.libcudf.strings.convert.convert_floats cimport ( from_floats as cpp_from_floats, @@ -48,8 +42,12 @@ from pylibcudf.libcudf.types cimport data_type, type_id from cudf._lib.types cimport underlying_type_t_type_id +import pylibcudf as plc + import cudf +from cudf._lib.types cimport dtype_to_pylibcudf_type + def floating_to_string(Column input_col): cdef column_view input_column_view = input_col.view() @@ -522,19 +520,14 @@ def int2timestamp( A Column with date-time represented in string format """ - cdef column_view input_column_view = input_col.view() cdef string c_timestamp_format = format.encode("UTF-8") - cdef column_view input_strings_names = names.view() - - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_from_timestamps( - input_column_view, - c_timestamp_format, - input_strings_names)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + plc.strings.convert.convert_datetime.from_timestamps( + input_col.to_pylibcudf(mode="read"), + c_timestamp_format, + names.to_pylibcudf(mode="read") + ) + ) def timestamp2int(Column input_col, dtype, format): @@ -551,23 +544,15 @@ def timestamp2int(Column input_col, dtype, format): A Column with string represented in date-time format """ - cdef column_view input_column_view = input_col.view() - cdef type_id tid = ( - ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[dtype] + dtype = dtype_to_pylibcudf_type(dtype) + cdef string c_timestamp_format = format.encode('UTF-8') + return Column.from_pylibcudf( + plc.strings.convert.convert_datetime.to_timestamps( + input_col.to_pylibcudf(mode="read"), + dtype, + c_timestamp_format ) ) - cdef data_type out_type = data_type(tid) - cdef string c_timestamp_format = format.encode('UTF-8') - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_to_timestamps( - input_column_view, - out_type, - c_timestamp_format)) - - return Column.from_unique_ptr(move(c_result)) def istimestamp(Column input_col, str format): @@ -613,23 +598,15 @@ def timedelta2int(Column input_col, dtype, format): A Column with string represented in TimeDelta format """ - cdef column_view input_column_view = input_col.view() - cdef type_id 
tid = ( - ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[dtype] + dtype = dtype_to_pylibcudf_type(dtype) + cdef string c_timestamp_format = format.encode('UTF-8') + return Column.from_pylibcudf( + plc.strings.convert.convert_durations.to_durations( + input_col.to_pylibcudf(mode="read"), + dtype, + c_timestamp_format ) ) - cdef data_type out_type = data_type(tid) - cdef string c_duration_format = format.encode('UTF-8') - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_to_durations( - input_column_view, - out_type, - c_duration_format)) - - return Column.from_unique_ptr(move(c_result)) def int2timedelta(Column input_col, str format): @@ -647,16 +624,13 @@ def int2timedelta(Column input_col, str format): """ - cdef column_view input_column_view = input_col.view() cdef string c_duration_format = format.encode('UTF-8') - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_from_durations( - input_column_view, - c_duration_format)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + plc.strings.convert.convert_durations.from_durations( + input_col.to_pylibcudf(mode="read"), + c_duration_format + ) + ) def int2ip(Column input_col): diff --git a/python/cudf/cudf/_lib/strings/strip.pyx b/python/cudf/cudf/_lib/strings/strip.pyx index acf52cb7b9f..38ecb21a94c 100644 --- a/python/cudf/cudf/_lib/strings/strip.pyx +++ b/python/cudf/cudf/_lib/strings/strip.pyx @@ -13,6 +13,7 @@ from pylibcudf.libcudf.strings.strip cimport strip as cpp_strip from cudf._lib.column cimport Column from cudf._lib.scalar cimport DeviceScalar +import pylibcudf as plc @acquire_spill_lock() @@ -25,23 +26,14 @@ def strip(Column source_strings, """ cdef DeviceScalar repl = py_repl.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef const string_scalar* scalar_str = ( - repl.get_raw_ptr() + return Column.from_pylibcudf( + plc.strings.strip.strip( + source_strings.to_pylibcudf(mode="read"), + plc.strings.SideType.BOTH, + repl.c_value + ) ) - with nogil: - c_result = move(cpp_strip( - source_view, - side_type.BOTH, - scalar_str[0] - )) - - return Column.from_unique_ptr(move(c_result)) - @acquire_spill_lock() def lstrip(Column source_strings, diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx index baa08a545ec..40d0c9eac3a 100644 --- a/python/cudf/cudf/_lib/transform.pyx +++ b/python/cudf/cudf/_lib/transform.pyx @@ -3,41 +3,26 @@ from numba.np import numpy_support import cudf -from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES from cudf.core._internals.expressions import parse_expression from cudf.core.buffer import acquire_spill_lock, as_buffer from cudf.utils import cudautils from cython.operator cimport dereference -from libc.stdint cimport uintptr_t from libcpp.memory cimport unique_ptr -from libcpp.pair cimport pair -from libcpp.string cimport string from libcpp.utility cimport move cimport pylibcudf.libcudf.transform as libcudf_transform from pylibcudf cimport transform as plc_transform from pylibcudf.expressions cimport Expression from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.expressions cimport expression -from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view -from pylibcudf.libcudf.types cimport ( - bitmask_type, - data_type, - size_type, - type_id, -) -from rmm._lib.device_buffer cimport DeviceBuffer, 
device_buffer +from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column -from cudf._lib.types cimport underlying_type_t_type_id -from cudf._lib.utils cimport ( - columns_from_unique_ptr, - data_from_table_view, - table_view_from_columns, -) +from cudf._lib.utils cimport table_view_from_columns + +import pylibcudf as plc @acquire_spill_lock() @@ -46,17 +31,8 @@ def bools_to_mask(Column col): Given an int8 (boolean) column, compress the data from booleans to bits and return a Buffer """ - cdef column_view col_view = col.view() - cdef pair[unique_ptr[device_buffer], size_type] cpp_out - cdef unique_ptr[device_buffer] up_db - - with nogil: - cpp_out = move(libcudf_transform.bools_to_mask(col_view)) - up_db = move(cpp_out.first) - - rmm_db = DeviceBuffer.c_from_unique_ptr(move(up_db)) - buf = as_buffer(rmm_db) - return buf + mask, _ = plc_transform.bools_to_mask(col.to_pylibcudf(mode="read")) + return as_buffer(mask) @acquire_spill_lock() @@ -68,22 +44,15 @@ def mask_to_bools(object mask_buffer, size_type begin_bit, size_type end_bit): if not isinstance(mask_buffer, cudf.core.buffer.Buffer): raise TypeError("mask_buffer is not an instance of " "cudf.core.buffer.Buffer") - cdef bitmask_type* bit_mask = ( - mask_buffer.get_ptr(mode="read") + plc_column = plc_transform.mask_to_bools( + mask_buffer.get_ptr(mode="read"), begin_bit, end_bit ) - - cdef unique_ptr[column] result - with nogil: - result = move( - libcudf_transform.mask_to_bools(bit_mask, begin_bit, end_bit) - ) - - return Column.from_unique_ptr(move(result)) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() def nans_to_nulls(Column input): - (mask, _) = plc_transform.nans_to_nulls( + mask, _ = plc_transform.nans_to_nulls( input.to_pylibcudf(mode="read") ) return as_buffer(mask) @@ -91,80 +60,45 @@ def nans_to_nulls(Column input): @acquire_spill_lock() def transform(Column input, op): - cdef column_view c_input = input.view() - cdef string c_str - cdef type_id c_tid - cdef data_type c_dtype - nb_type = numpy_support.from_dtype(input.dtype) nb_signature = (nb_type,) compiled_op = cudautils.compile_udf(op, nb_signature) - c_str = compiled_op[0].encode('UTF-8') np_dtype = cudf.dtype(compiled_op[1]) - try: - c_tid = ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[ - np_dtype - ] - ) - c_dtype = data_type(c_tid) - - except KeyError: - raise TypeError( - "Result of window function has unsupported dtype {}" - .format(np_dtype) - ) - - with nogil: - c_output = move(libcudf_transform.transform( - c_input, - c_str, - c_dtype, - True - )) - - return Column.from_unique_ptr(move(c_output)) + plc_column = plc_transform.transform( + input.to_pylibcudf(mode="read"), + compiled_op[0], + plc.column._datatype_from_dtype_desc(np_dtype.str[1:]), + True + ) + return Column.from_pylibcudf(plc_column) def table_encode(list source_columns): - cdef table_view c_input = table_view_from_columns(source_columns) - cdef pair[unique_ptr[table], unique_ptr[column]] c_result - - with nogil: - c_result = move(libcudf_transform.encode(c_input)) + plc_table, plc_column = plc_transform.encode( + plc.Table([col.to_pylibcudf(mode="read") for col in source_columns]) + ) return ( - columns_from_unique_ptr(move(c_result.first)), - Column.from_unique_ptr(move(c_result.second)) + [Column.from_pylibcudf(col) for col in plc_table.columns()], + Column.from_pylibcudf(plc_column) ) def one_hot_encode(Column input_column, Column categories): - cdef column_view c_view_input = input_column.view() - cdef column_view c_view_categories = categories.view() - cdef 
pair[unique_ptr[column], table_view] c_result - - with nogil: - c_result = move( - libcudf_transform.one_hot_encode(c_view_input, c_view_categories) - ) - - # Notice, the data pointer of `owner` has been exposed - # through `c_result.second` at this point. - owner = Column.from_unique_ptr( - move(c_result.first), data_ptr_exposed=True - ) - - pylist_categories = categories.to_arrow().to_pylist() - encodings, _ = data_from_table_view( - move(c_result.second), - owner=owner, - column_names=[ - x if x is not None else '' for x in pylist_categories - ] + plc_table = plc_transform.one_hot_encode( + input_column.to_pylibcudf(mode="read"), + categories.to_pylibcudf(mode="read"), ) - return encodings + result_columns = [ + Column.from_pylibcudf(col, data_ptr_exposed=True) + for col in plc_table.columns() + ] + result_labels = [ + x if x is not None else '' + for x in categories.to_arrow().to_pylist() + ] + return dict(zip(result_labels, result_columns)) @acquire_spill_lock() diff --git a/python/cudf_polars/cudf_polars/__init__.py b/python/cudf_polars/cudf_polars/__init__.py index 41d06f8631b..c1317e8f467 100644 --- a/python/cudf_polars/cudf_polars/__init__.py +++ b/python/cudf_polars/cudf_polars/__init__.py @@ -10,10 +10,14 @@ from __future__ import annotations +# Check we have a supported polars version +import cudf_polars.utils.versions as v from cudf_polars._version import __git_commit__, __version__ from cudf_polars.callback import execute_with_cudf from cudf_polars.dsl.translate import translate_ir +del v + __all__: list[str] = [ "execute_with_cudf", "translate_ir", diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py index f31193aa938..76816ee0a61 100644 --- a/python/cudf_polars/cudf_polars/callback.py +++ b/python/cudf_polars/cudf_polars/callback.py @@ -5,19 +5,26 @@ from __future__ import annotations +import contextlib import os import warnings -from functools import partial +from functools import cache, partial from typing import TYPE_CHECKING import nvtx -from polars.exceptions import PerformanceWarning +from polars.exceptions import ComputeError, PerformanceWarning + +import rmm +from rmm._cuda import gpu from cudf_polars.dsl.translate import translate_ir if TYPE_CHECKING: + from collections.abc import Generator + import polars as pl + from polars import GPUEngine from cudf_polars.dsl.ir import IR from cudf_polars.typing import NodeTraverser @@ -25,23 +32,126 @@ __all__: list[str] = ["execute_with_cudf"] +@cache +def default_memory_resource(device: int) -> rmm.mr.DeviceMemoryResource: + """ + Return the default memory resource for cudf-polars. + + Parameters + ---------- + device + Disambiguating device id when selecting the device. Must be + the active device when this function is called. + + Returns + ------- + rmm.mr.DeviceMemoryResource + The default memory resource that cudf-polars uses. Currently + an async pool resource. + """ + try: + return rmm.mr.CudaAsyncMemoryResource() + except RuntimeError as e: # pragma: no cover + msg, *_ = e.args + if ( + msg.startswith("RMM failure") + and msg.find("not supported with this CUDA driver/runtime version") > -1 + ): + raise ComputeError( + "GPU engine requested, but incorrect cudf-polars package installed. 
" + "If your system has a CUDA 11 driver, please uninstall `cudf-polars-cu12` " + "and install `cudf-polars-cu11`" + ) from None + else: + raise + + +@contextlib.contextmanager +def set_memory_resource( + mr: rmm.mr.DeviceMemoryResource | None, +) -> Generator[rmm.mr.DeviceMemoryResource, None, None]: + """ + Set the current memory resource for an execution block. + + Parameters + ---------- + mr + Memory resource to use. If `None`, calls :func:`default_memory_resource` + to obtain an mr on the currently active device. + + Returns + ------- + Memory resource used. + + Notes + ----- + At exit, the memory resource is restored to whatever was current + at entry. If a memory resource is provided, it must be valid to + use with the currently active device. + """ + if mr is None: + device: int = gpu.getDevice() + mr = default_memory_resource(device) + previous = rmm.mr.get_current_device_resource() + rmm.mr.set_current_device_resource(mr) + try: + yield mr + finally: + rmm.mr.set_current_device_resource(previous) + + +@contextlib.contextmanager +def set_device(device: int | None) -> Generator[int, None, None]: + """ + Set the device the query is executed on. + + Parameters + ---------- + device + Device to use. If `None`, uses the current device. + + Returns + ------- + Device active for the execution of the block. + + Notes + ----- + At exit, the device is restored to whatever was current at entry. + """ + previous: int = gpu.getDevice() + if device is not None: + gpu.setDevice(device) + try: + yield previous + finally: + gpu.setDevice(previous) + + def _callback( ir: IR, with_columns: list[str] | None, pyarrow_predicate: str | None, n_rows: int | None, + *, + device: int | None, + memory_resource: int | None, ) -> pl.DataFrame: assert with_columns is None assert pyarrow_predicate is None assert n_rows is None - with nvtx.annotate(message="ExecuteIR", domain="cudf_polars"): + with ( + nvtx.annotate(message="ExecuteIR", domain="cudf_polars"), + # Device must be set before memory resource is obtained. + set_device(device), + set_memory_resource(memory_resource), + ): return ir.evaluate(cache={}).to_polars() def execute_with_cudf( nt: NodeTraverser, *, - raise_on_fail: bool = False, + config: GPUEngine, exception: type[Exception] | tuple[type[Exception], ...] = Exception, ) -> None: """ @@ -52,9 +162,8 @@ def execute_with_cudf( nt NodeTraverser - raise_on_fail - Should conversion raise an exception rather than continuing - without setting a callback. + config + GPUEngine configuration object exception Optional exception, or tuple of exceptions, to catch during @@ -62,9 +171,23 @@ def execute_with_cudf( The NodeTraverser is mutated if the libcudf executor can handle the plan. 
""" + device = config.device + memory_resource = config.memory_resource + raise_on_fail = config.config.get("raise_on_fail", False) + if unsupported := (config.config.keys() - {"raise_on_fail"}): + raise ValueError( + f"Engine configuration contains unsupported settings {unsupported}" + ) try: with nvtx.annotate(message="ConvertIR", domain="cudf_polars"): - nt.set_udf(partial(_callback, translate_ir(nt))) + nt.set_udf( + partial( + _callback, + translate_ir(nt), + device=device, + memory_resource=memory_resource, + ) + ) except exception as e: if bool(int(os.environ.get("POLARS_VERBOSE", 0))): warnings.warn( diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py index dd3b771e305..3fe3e5557cb 100644 --- a/python/cudf_polars/cudf_polars/containers/column.py +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -84,6 +84,34 @@ def sorted_like(self, like: Column, /) -> Self: is_sorted=like.is_sorted, order=like.order, null_order=like.null_order ) + # TODO: Return Column once #16272 is fixed. + def astype(self, dtype: plc.DataType) -> plc.Column: + """ + Return the backing column as the requested dtype. + + Parameters + ---------- + dtype + Datatype to cast to. + + Returns + ------- + Column of requested type. + + Raises + ------ + RuntimeError + If the cast is unsupported. + + Notes + ----- + This only produces a copy if the requested dtype doesn't match + the current one. + """ + if self.obj.type() != dtype: + return plc.unary.cast(self.obj, dtype) + return self.obj + def copy_metadata(self, from_: pl.Series, /) -> Self: """ Copy metadata from a host series onto self. diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index a5c99e2bc11..f3e3862d0cc 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -7,7 +7,7 @@ import itertools from functools import cached_property -from typing import TYPE_CHECKING, cast +from typing import TYPE_CHECKING import pyarrow as pa import pylibcudf as plc @@ -45,11 +45,19 @@ def copy(self) -> Self: def to_polars(self) -> pl.DataFrame: """Convert to a polars DataFrame.""" + # If the arrow table has empty names, from_arrow produces + # column_$i. But here we know there is only one such column + # (by construction) and it should have an empty name. + # https://github.com/pola-rs/polars/issues/11632 + # To guarantee we produce correct names, we therefore + # serialise with names we control and rename with that map. 
+ name_map = {f"column_{i}": c.name for i, c in enumerate(self.columns)} table: pa.Table = plc.interop.to_arrow( self.table, - [plc.interop.ColumnMetadata(name=c.name) for c in self.columns], + [plc.interop.ColumnMetadata(name=name) for name in name_map], ) - return cast(pl.DataFrame, pl.from_arrow(table)).with_columns( + df: pl.DataFrame = pl.from_arrow(table) + return df.rename(name_map).with_columns( *( pl.col(c.name).set_sorted( descending=c.order == plc.types.Order.DESCENDING diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index e1b4d30b76b..c401e5a2f17 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -21,8 +21,10 @@ from typing import TYPE_CHECKING, Any, ClassVar, NamedTuple import pyarrow as pa +import pyarrow.compute as pc import pylibcudf as plc +from polars.exceptions import InvalidOperationError from polars.polars import _expr_nodes as pl_expr from cudf_polars.containers import Column, NamedColumn @@ -477,12 +479,6 @@ def __init__( self.options = options self.name = name self.children = children - if ( - self.name in (pl_expr.BooleanFunction.Any, pl_expr.BooleanFunction.All) - and not self.options[0] - ): - # With ignore_nulls == False, polars uses Kleene logic - raise NotImplementedError(f"Kleene logic for {self.name}") if self.name == pl_expr.BooleanFunction.IsIn and not all( c.dtype == self.children[0].dtype for c in self.children ): @@ -577,20 +573,31 @@ def do_evaluate( child.evaluate(df, context=context, mapping=mapping) for child in self.children ] - if self.name == pl_expr.BooleanFunction.Any: + # Kleene logic for Any (OR) and All (AND) if ignore_nulls is + # False + if self.name in (pl_expr.BooleanFunction.Any, pl_expr.BooleanFunction.All): + (ignore_nulls,) = self.options (column,) = columns - return Column( - plc.Column.from_scalar( - plc.reduce.reduce(column.obj, plc.aggregation.any(), self.dtype), 1 - ) - ) - elif self.name == pl_expr.BooleanFunction.All: - (column,) = columns - return Column( - plc.Column.from_scalar( - plc.reduce.reduce(column.obj, plc.aggregation.all(), self.dtype), 1 - ) - ) + is_any = self.name == pl_expr.BooleanFunction.Any + agg = plc.aggregation.any() if is_any else plc.aggregation.all() + result = plc.reduce.reduce(column.obj, agg, self.dtype) + if not ignore_nulls and column.obj.null_count() > 0: + # Truth tables + # Any All + # | F U T | F U T + # --+------ --+------ + # F | F U T F | F F F + # U | U U T U | F U U + # T | T T T T | F U T + # + # If the input null count was non-zero, we must + # post-process the result to insert the correct value. 
+ h_result = plc.interop.to_arrow(result).as_py() + if is_any and not h_result or not is_any and h_result: + # Any All + # False || Null => Null True && Null => Null + return Column(plc.Column.all_null_like(column.obj, 1)) + return Column(plc.Column.from_scalar(result, 1)) if self.name == pl_expr.BooleanFunction.IsNull: (column,) = columns return Column(plc.unary.is_null(column.obj)) @@ -598,13 +605,19 @@ def do_evaluate( (column,) = columns return Column(plc.unary.is_valid(column.obj)) elif self.name == pl_expr.BooleanFunction.IsNan: - # TODO: copy over null mask since is_nan(null) => null in polars (column,) = columns - return Column(plc.unary.is_nan(column.obj)) + return Column( + plc.unary.is_nan(column.obj).with_mask( + column.obj.null_mask(), column.obj.null_count() + ) + ) elif self.name == pl_expr.BooleanFunction.IsNotNan: - # TODO: copy over null mask since is_not_nan(null) => null in polars (column,) = columns - return Column(plc.unary.is_not_nan(column.obj)) + return Column( + plc.unary.is_not_nan(column.obj).with_mask( + column.obj.null_mask(), column.obj.null_count() + ) + ) elif self.name == pl_expr.BooleanFunction.IsFirstDistinct: (column,) = columns return self._distinct( @@ -654,26 +667,22 @@ def do_evaluate( ), ) elif self.name == pl_expr.BooleanFunction.AllHorizontal: - if any(c.obj.null_count() > 0 for c in columns): - raise NotImplementedError("Kleene logic for all_horizontal") return Column( reduce( partial( plc.binaryop.binary_operation, - op=plc.binaryop.BinaryOperator.BITWISE_AND, + op=plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, output_type=self.dtype, ), (c.obj for c in columns), ) ) elif self.name == pl_expr.BooleanFunction.AnyHorizontal: - if any(c.obj.null_count() > 0 for c in columns): - raise NotImplementedError("Kleene logic for any_horizontal") return Column( reduce( partial( plc.binaryop.binary_operation, - op=plc.binaryop.BinaryOperator.BITWISE_OR, + op=plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, output_type=self.dtype, ), (c.obj for c in columns), @@ -694,7 +703,7 @@ def do_evaluate( class StringFunction(Expr): - __slots__ = ("name", "options", "children") + __slots__ = ("name", "options", "children", "_regex_program") _non_child = ("dtype", "name", "options") children: tuple[Expr, ...] @@ -713,12 +722,18 @@ def __init__( def _validate_input(self): if self.name not in ( - pl_expr.StringFunction.Lowercase, - pl_expr.StringFunction.Uppercase, - pl_expr.StringFunction.EndsWith, - pl_expr.StringFunction.StartsWith, pl_expr.StringFunction.Contains, + pl_expr.StringFunction.EndsWith, + pl_expr.StringFunction.Lowercase, + pl_expr.StringFunction.Replace, + pl_expr.StringFunction.ReplaceMany, pl_expr.StringFunction.Slice, + pl_expr.StringFunction.Strptime, + pl_expr.StringFunction.StartsWith, + pl_expr.StringFunction.StripChars, + pl_expr.StringFunction.StripCharsStart, + pl_expr.StringFunction.StripCharsEnd, + pl_expr.StringFunction.Uppercase, ): raise NotImplementedError(f"String function {self.name}") if self.name == pl_expr.StringFunction.Contains: @@ -732,11 +747,65 @@ def _validate_input(self): raise NotImplementedError( "Regex contains only supports a scalar pattern" ) + pattern = self.children[1].value.as_py() + try: + self._regex_program = plc.strings.regex_program.RegexProgram.create( + pattern, + flags=plc.strings.regex_flags.RegexFlags.DEFAULT, + ) + except RuntimeError as e: + raise NotImplementedError( + f"Unsupported regex {pattern} for GPU engine." 
+ ) from e + elif self.name == pl_expr.StringFunction.Replace: + _, literal = self.options + if not literal: + raise NotImplementedError("literal=False is not supported for replace") + if not all(isinstance(expr, Literal) for expr in self.children[1:]): + raise NotImplementedError("replace only supports scalar target") + target = self.children[1] + if target.value == pa.scalar("", type=pa.string()): + raise NotImplementedError( + "libcudf replace does not support empty strings" + ) + elif self.name == pl_expr.StringFunction.ReplaceMany: + (ascii_case_insensitive,) = self.options + if ascii_case_insensitive: + raise NotImplementedError( + "ascii_case_insensitive not implemented for replace_many" + ) + if not all( + isinstance(expr, (LiteralColumn, Literal)) for expr in self.children[1:] + ): + raise NotImplementedError("replace_many only supports literal inputs") + target = self.children[1] + if pc.any(pc.equal(target.value, "")).as_py(): + raise NotImplementedError( + "libcudf replace_many is implemented differently from polars " + "for empty strings" + ) elif self.name == pl_expr.StringFunction.Slice: if not all(isinstance(child, Literal) for child in self.children[1:]): raise NotImplementedError( "Slice only supports literal start and stop values" ) + elif self.name == pl_expr.StringFunction.Strptime: + format, _, exact, cache = self.options + if cache: + raise NotImplementedError("Strptime cache is a CPU feature") + if format is None: + raise NotImplementedError("Strptime format is required") + if not exact: + raise NotImplementedError("Strptime does not support exact=False") + elif self.name in { + pl_expr.StringFunction.StripChars, + pl_expr.StringFunction.StripCharsStart, + pl_expr.StringFunction.StripCharsEnd, + }: + if not isinstance(self.children[1], Literal): + raise NotImplementedError( + "strip operations only support scalar patterns" + ) def do_evaluate( self, @@ -759,12 +828,10 @@ def do_evaluate( else pat.obj ) return Column(plc.strings.find.contains(column.obj, pattern)) - assert isinstance(arg, Literal) - prog = plc.strings.regex_program.RegexProgram.create( - arg.value.as_py(), - flags=plc.strings.regex_flags.RegexFlags.DEFAULT, - ) - return Column(plc.strings.contains.contains_re(column.obj, prog)) + else: + return Column( + plc.strings.contains.contains_re(column.obj, self._regex_program) + ) elif self.name == pl_expr.StringFunction.Slice: child, expr_offset, expr_length = self.children assert isinstance(expr_offset, Literal) @@ -795,6 +862,22 @@ def do_evaluate( plc.interop.from_arrow(pa.scalar(stop, type=pa.int32())), ) ) + elif self.name in { + pl_expr.StringFunction.StripChars, + pl_expr.StringFunction.StripCharsStart, + pl_expr.StringFunction.StripCharsEnd, + }: + column, chars = ( + c.evaluate(df, context=context, mapping=mapping) for c in self.children + ) + if self.name == pl_expr.StringFunction.StripCharsStart: + side = plc.strings.SideType.LEFT + elif self.name == pl_expr.StringFunction.StripCharsEnd: + side = plc.strings.SideType.RIGHT + else: + side = plc.strings.SideType.BOTH + return Column(plc.strings.strip.strip(column.obj, side, chars.obj_scalar)) + columns = [ child.evaluate(df, context=context, mapping=mapping) for child in self.children @@ -825,6 +908,51 @@ def do_evaluate( else prefix.obj, ) ) + elif self.name == pl_expr.StringFunction.Strptime: + # TODO: ignores ambiguous + format, strict, exact, cache = self.options + col = self.children[0].evaluate(df, context=context, mapping=mapping) + + is_timestamps = 
plc.strings.convert.convert_datetime.is_timestamp( + col.obj, format.encode() + ) + + if strict: + if not plc.interop.to_arrow( + plc.reduce.reduce( + is_timestamps, + plc.aggregation.all(), + plc.DataType(plc.TypeId.BOOL8), + ) + ).as_py(): + raise InvalidOperationError("conversion from `str` failed.") + else: + not_timestamps = plc.unary.unary_operation( + is_timestamps, plc.unary.UnaryOperator.NOT + ) + + null = plc.interop.from_arrow(pa.scalar(None, type=pa.string())) + res = plc.copying.boolean_mask_scatter( + [null], plc.Table([col.obj]), not_timestamps + ) + return Column( + plc.strings.convert.convert_datetime.to_timestamps( + res.columns()[0], self.dtype, format.encode() + ) + ) + elif self.name == pl_expr.StringFunction.Replace: + column, target, repl = columns + n, _ = self.options + return Column( + plc.strings.replace.replace( + column.obj, target.obj_scalar, repl.obj_scalar, maxrepl=n + ) + ) + elif self.name == pl_expr.StringFunction.ReplaceMany: + column, target, repl = columns + return Column( + plc.strings.replace.replace_multiple(column.obj, target.obj, repl.obj) + ) raise NotImplementedError( f"StringFunction {self.name}" ) # pragma: no cover; handled by init raising @@ -832,6 +960,18 @@ def do_evaluate( class TemporalFunction(Expr): __slots__ = ("name", "options", "children") + _COMPONENT_MAP: ClassVar[dict[pl_expr.TemporalFunction, str]] = { + pl_expr.TemporalFunction.Year: "year", + pl_expr.TemporalFunction.Month: "month", + pl_expr.TemporalFunction.Day: "day", + pl_expr.TemporalFunction.WeekDay: "weekday", + pl_expr.TemporalFunction.Hour: "hour", + pl_expr.TemporalFunction.Minute: "minute", + pl_expr.TemporalFunction.Second: "second", + pl_expr.TemporalFunction.Millisecond: "millisecond", + pl_expr.TemporalFunction.Microsecond: "microsecond", + pl_expr.TemporalFunction.Nanosecond: "nanosecond", + } _non_child = ("dtype", "name", "options") children: tuple[Expr, ...] 
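# --- Illustrative aside (not part of this diff): the polars-level behaviour
# that the Strptime branch above reproduces. strict=True raises on unparseable
# input (the reduce-over-is_timestamp check), strict=False nulls the bad rows
# (the boolean_mask_scatter path). Hedged sketch using the CPU engine for
# reference semantics; the example strings are arbitrary.
import polars as pl

s = pl.Series(["2024-01-01", "not a date"])
lenient = s.str.strptime(pl.Datetime("us"), format="%Y-%m-%d", strict=False)
# lenient[1] is null; with strict=True the same call raises InvalidOperationError.
# --- end aside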
@@ -846,8 +986,8 @@ def __init__( self.options = options self.name = name self.children = children - if self.name != pl_expr.TemporalFunction.Year: - raise NotImplementedError(f"String function {self.name}") + if self.name not in self._COMPONENT_MAP: + raise NotImplementedError(f"Temporal function {self.name}") def do_evaluate( self, @@ -861,12 +1001,59 @@ def do_evaluate( child.evaluate(df, context=context, mapping=mapping) for child in self.children ] - if self.name == pl_expr.TemporalFunction.Year: - (column,) = columns - return Column(plc.datetime.extract_year(column.obj)) - raise NotImplementedError( - f"TemporalFunction {self.name}" - ) # pragma: no cover; init trips first + (column,) = columns + if self.name == pl_expr.TemporalFunction.Microsecond: + millis = plc.datetime.extract_datetime_component(column.obj, "millisecond") + micros = plc.datetime.extract_datetime_component(column.obj, "microsecond") + millis_as_micros = plc.binaryop.binary_operation( + millis, + plc.interop.from_arrow(pa.scalar(1_000, type=pa.int32())), + plc.binaryop.BinaryOperator.MUL, + plc.DataType(plc.TypeId.INT32), + ) + total_micros = plc.binaryop.binary_operation( + micros, + millis_as_micros, + plc.binaryop.BinaryOperator.ADD, + plc.types.DataType(plc.types.TypeId.INT32), + ) + return Column(total_micros) + elif self.name == pl_expr.TemporalFunction.Nanosecond: + millis = plc.datetime.extract_datetime_component(column.obj, "millisecond") + micros = plc.datetime.extract_datetime_component(column.obj, "microsecond") + nanos = plc.datetime.extract_datetime_component(column.obj, "nanosecond") + millis_as_nanos = plc.binaryop.binary_operation( + millis, + plc.interop.from_arrow(pa.scalar(1_000_000, type=pa.int32())), + plc.binaryop.BinaryOperator.MUL, + plc.types.DataType(plc.types.TypeId.INT32), + ) + micros_as_nanos = plc.binaryop.binary_operation( + micros, + plc.interop.from_arrow(pa.scalar(1_000, type=pa.int32())), + plc.binaryop.BinaryOperator.MUL, + plc.types.DataType(plc.types.TypeId.INT32), + ) + total_nanos = plc.binaryop.binary_operation( + nanos, + millis_as_nanos, + plc.binaryop.BinaryOperator.ADD, + plc.types.DataType(plc.types.TypeId.INT32), + ) + total_nanos = plc.binaryop.binary_operation( + total_nanos, + micros_as_nanos, + plc.binaryop.BinaryOperator.ADD, + plc.types.DataType(plc.types.TypeId.INT32), + ) + return Column(total_nanos) + + return Column( + plc.datetime.extract_datetime_component( + column.obj, + self._COMPONENT_MAP[self.name], + ) + ) class UnaryFunction(Expr): @@ -874,6 +1061,51 @@ class UnaryFunction(Expr): _non_child = ("dtype", "name", "options") children: tuple[Expr, ...] 
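# --- Illustrative aside (not part of this diff): the arithmetic behind the
# Microsecond/Nanosecond composition above. polars' dt.microsecond() returns
# the whole sub-second value in microseconds, while libcudf exposes separate
# millisecond/microsecond/nanosecond fractions, so the fractions are recombined:
#   total_micros = millis * 1_000 + micros
#   total_nanos  = millis * 1_000_000 + micros * 1_000 + nanos
# e.g. for 12:00:00.123456789 -> millis=123, micros=456, nanos=789.
millis, micros, nanos = 123, 456, 789
assert millis * 1_000 + micros == 123_456
assert millis * 1_000_000 + micros * 1_000 + nanos == 123_456_789
# --- end aside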
+ # Note: log, and pow are handled via translation to binops + _OP_MAPPING: ClassVar[dict[str, plc.unary.UnaryOperator]] = { + "sin": plc.unary.UnaryOperator.SIN, + "cos": plc.unary.UnaryOperator.COS, + "tan": plc.unary.UnaryOperator.TAN, + "arcsin": plc.unary.UnaryOperator.ARCSIN, + "arccos": plc.unary.UnaryOperator.ARCCOS, + "arctan": plc.unary.UnaryOperator.ARCTAN, + "sinh": plc.unary.UnaryOperator.SINH, + "cosh": plc.unary.UnaryOperator.COSH, + "tanh": plc.unary.UnaryOperator.TANH, + "arcsinh": plc.unary.UnaryOperator.ARCSINH, + "arccosh": plc.unary.UnaryOperator.ARCCOSH, + "arctanh": plc.unary.UnaryOperator.ARCTANH, + "exp": plc.unary.UnaryOperator.EXP, + "sqrt": plc.unary.UnaryOperator.SQRT, + "cbrt": plc.unary.UnaryOperator.CBRT, + "ceil": plc.unary.UnaryOperator.CEIL, + "floor": plc.unary.UnaryOperator.FLOOR, + "abs": plc.unary.UnaryOperator.ABS, + "bit_invert": plc.unary.UnaryOperator.BIT_INVERT, + "not": plc.unary.UnaryOperator.NOT, + } + _supported_misc_fns = frozenset( + { + "drop_nulls", + "fill_null", + "mask_nans", + "round", + "set_sorted", + "unique", + } + ) + _supported_cum_aggs = frozenset( + { + "cum_min", + "cum_max", + "cum_prod", + "cum_sum", + } + ) + _supported_fns = frozenset().union( + _supported_misc_fns, _supported_cum_aggs, _OP_MAPPING.keys() + ) + def __init__( self, dtype: plc.DataType, name: str, options: tuple[Any, ...], *children: Expr ) -> None: @@ -881,15 +1113,15 @@ def __init__( self.name = name self.options = options self.children = children - if self.name not in ( - "mask_nans", - "round", - "setsorted", - "unique", - "dropnull", - "fill_null", - ): + + if self.name not in UnaryFunction._supported_fns: raise NotImplementedError(f"Unary function {name=}") + if self.name in UnaryFunction._supported_cum_aggs: + (reverse,) = self.options + if reverse: + raise NotImplementedError( + "reverse=True is not supported for cumulative aggregations" + ) def do_evaluate( self, @@ -947,7 +1179,7 @@ def do_evaluate( if maintain_order: return Column(column).sorted_like(values) return Column(column) - elif self.name == "setsorted": + elif self.name == "set_sorted": (column,) = ( child.evaluate(df, context=context, mapping=mapping) for child in self.children @@ -974,7 +1206,7 @@ def do_evaluate( order=order, null_order=null_order, ) - elif self.name == "dropnull": + elif self.name == "drop_nulls": (column,) = ( child.evaluate(df, context=context, mapping=mapping) for child in self.children @@ -994,13 +1226,65 @@ def do_evaluate( ) arg = evaluated.obj_scalar if evaluated.is_scalar else evaluated.obj return Column(plc.replace.replace_nulls(column.obj, arg)) - + elif self.name in self._OP_MAPPING: + column = self.children[0].evaluate(df, context=context, mapping=mapping) + if column.obj.type().id() != self.dtype.id(): + arg = plc.unary.cast(column.obj, self.dtype) + else: + arg = column.obj + return Column(plc.unary.unary_operation(arg, self._OP_MAPPING[self.name])) + elif self.name in UnaryFunction._supported_cum_aggs: + column = self.children[0].evaluate(df, context=context, mapping=mapping) + plc_col = column.obj + col_type = column.obj.type() + # cum_sum casts + # Int8, UInt8, Int16, UInt16 -> Int64 for overflow prevention + # Bool -> UInt32 + # cum_prod casts integer dtypes < int64 and bool to int64 + # See: + # https://github.com/pola-rs/polars/blob/main/crates/polars-ops/src/series/ops/cum_agg.rs + if ( + self.name == "cum_sum" + and col_type.id() + in { + plc.types.TypeId.INT8, + plc.types.TypeId.UINT8, + plc.types.TypeId.INT16, + plc.types.TypeId.UINT16, + } + 
) or ( + self.name == "cum_prod" + and plc.traits.is_integral(col_type) + and plc.types.size_of(col_type) <= 4 + ): + plc_col = plc.unary.cast( + plc_col, plc.types.DataType(plc.types.TypeId.INT64) + ) + elif ( + self.name == "cum_sum" + and column.obj.type().id() == plc.types.TypeId.BOOL8 + ): + plc_col = plc.unary.cast( + plc_col, plc.types.DataType(plc.types.TypeId.UINT32) + ) + if self.name == "cum_sum": + agg = plc.aggregation.sum() + elif self.name == "cum_prod": + agg = plc.aggregation.product() + elif self.name == "cum_min": + agg = plc.aggregation.min() + elif self.name == "cum_max": + agg = plc.aggregation.max() + + return Column(plc.reduce.scan(plc_col, agg, plc.reduce.ScanType.INCLUSIVE)) raise NotImplementedError( f"Unimplemented unary function {self.name=}" ) # pragma: no cover; init trips first def collect_agg(self, *, depth: int) -> AggInfo: """Collect information about aggregations in groupbys.""" + if self.name in {"unique", "drop_nulls"} | self._supported_cum_aggs: + raise NotImplementedError(f"{self.name} in groupby") if depth == 1: # inside aggregation, need to pre-evaluate, groupby # construction has checked that we don't have nested aggs, @@ -1187,11 +1471,7 @@ class Cast(Expr): def __init__(self, dtype: plc.DataType, value: Expr) -> None: super().__init__(dtype) self.children = (value,) - if not ( - plc.traits.is_fixed_width(self.dtype) - and plc.traits.is_fixed_width(value.dtype) - and plc.unary.is_supported_cast(value.dtype, self.dtype) - ): + if not dtypes.can_cast(value.dtype, self.dtype): raise NotImplementedError( f"Can't cast {self.dtype.id().name} to {value.dtype.id().name}" ) @@ -1255,6 +1535,13 @@ def __init__( req = plc.aggregation.variance(ddof=options) elif name == "count": req = plc.aggregation.count(null_handling=plc.types.NullPolicy.EXCLUDE) + elif name == "quantile": + _, quantile = self.children + if not isinstance(quantile, Literal): + raise NotImplementedError("Only support literal quantile values") + req = plc.aggregation.quantile( + quantiles=[quantile.value.as_py()], interp=Agg.interp_mapping[options] + ) else: raise NotImplementedError( f"Unreachable, {name=} is incorrectly listed in _SUPPORTED" @@ -1286,9 +1573,18 @@ def __init__( "count", "std", "var", + "quantile", ] ) + interp_mapping: ClassVar[dict[str, plc.types.Interpolation]] = { + "nearest": plc.types.Interpolation.NEAREST, + "higher": plc.types.Interpolation.HIGHER, + "lower": plc.types.Interpolation.LOWER, + "midpoint": plc.types.Interpolation.MIDPOINT, + "linear": plc.types.Interpolation.LINEAR, + } + def collect_agg(self, *, depth: int) -> AggInfo: """Collect information about aggregations in groupbys.""" if depth >= 1: @@ -1299,7 +1595,19 @@ def collect_agg(self, *, depth: int) -> AggInfo: raise NotImplementedError("Nan propagation in groupby for min/max") (child,) = self.children ((expr, _, _),) = child.collect_agg(depth=depth + 1).requests - if self.request is None: + request = self.request + # These are handled specially here because we don't set up the + # request for the whole-frame agg because we can avoid a + # reduce for these. 
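# --- Illustrative aside (not part of this diff): the groupby shortcut noted
# above, shown as standalone pylibcudf requests. "first"/"last" map onto
# libcudf's nth_element aggregation at index 0 / -1 with nulls included, so no
# whole-frame reduction request is needed. Hedged sketch; requires pylibcudf.
import pylibcudf as plc

first_req = plc.aggregation.nth_element(0, null_handling=plc.types.NullPolicy.INCLUDE)
last_req = plc.aggregation.nth_element(-1, null_handling=plc.types.NullPolicy.INCLUDE)
# These are handed to plc.groupby.GroupByRequest alongside the grouped column.
# --- end aside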
+ if self.name == "first": + request = plc.aggregation.nth_element( + 0, null_handling=plc.types.NullPolicy.INCLUDE + ) + elif self.name == "last": + request = plc.aggregation.nth_element( + -1, null_handling=plc.types.NullPolicy.INCLUDE + ) + if request is None: raise NotImplementedError( f"Aggregation {self.name} in groupby" ) # pragma: no cover; __init__ trips first @@ -1308,7 +1616,7 @@ def collect_agg(self, *, depth: int) -> AggInfo: # Ignore nans in these groupby aggs, do this by masking # nans in the input expr = UnaryFunction(self.dtype, "mask_nans", (), expr) - return AggInfo([(expr, self.request, self)]) + return AggInfo([(expr, request, self)]) def _reduce( self, column: Column, *, request: plc.aggregation.Aggregation @@ -1380,7 +1688,10 @@ def do_evaluate( raise NotImplementedError( f"Agg in context {context}" ) # pragma: no cover; unreachable - (child,) = self.children + + # Aggregations like quantiles may have additional children that were + # preprocessed into pylibcudf requests. + child = self.children[0] return self.op(child.evaluate(df, context=context, mapping=mapping)) @@ -1425,6 +1736,11 @@ def __init__( right: Expr, ) -> None: super().__init__(dtype) + if plc.traits.is_boolean(self.dtype): + # For boolean output types, bitand and bitor implement + # boolean logic, so translate. bitxor also does, but the + # default behaviour is correct. + op = BinOp._BOOL_KLEENE_MAPPING.get(op, op) self.op = op self.children = (left, right) if not plc.binaryop.is_supported_operation( @@ -1436,6 +1752,15 @@ def __init__( f"with output type {self.dtype.id().name}" ) + _BOOL_KLEENE_MAPPING: ClassVar[ + dict[plc.binaryop.BinaryOperator, plc.binaryop.BinaryOperator] + ] = { + plc.binaryop.BinaryOperator.BITWISE_AND: plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, + plc.binaryop.BinaryOperator.BITWISE_OR: plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, + plc.binaryop.BinaryOperator.LOGICAL_AND: plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, + plc.binaryop.BinaryOperator.LOGICAL_OR: plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, + } + _MAPPING: ClassVar[dict[pl_expr.Operator, plc.binaryop.BinaryOperator]] = { pl_expr.Operator.Eq: plc.binaryop.BinaryOperator.EQUAL, pl_expr.Operator.EqValidity: plc.binaryop.BinaryOperator.NULL_EQUALS, diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index e334e6f5cc5..8cd56c8ee3a 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -15,7 +15,6 @@ import dataclasses import itertools -import types from functools import cache from pathlib import Path from typing import TYPE_CHECKING, Any, ClassVar @@ -28,7 +27,7 @@ import cudf_polars.dsl.expr as expr from cudf_polars.containers import DataFrame, NamedColumn -from cudf_polars.utils import sorting +from cudf_polars.utils import dtypes, sorting if TYPE_CHECKING: from collections.abc import Callable, MutableMapping @@ -133,8 +132,7 @@ class IR: def __post_init__(self): """Validate preconditions.""" - if any(dtype.id() == plc.TypeId.EMPTY for dtype in self.schema.values()): - raise NotImplementedError("Cannot make empty columns.") + pass # noqa: PIE790 def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """ @@ -189,32 +187,42 @@ class Scan(IR): """Cloud-related authentication options, currently ignored.""" paths: list[str] """List of paths to read from.""" - file_options: Any - """Options for reading the file. - - Attributes are: - - ``with_columns: list[str]`` of projected columns to return. 
- - ``n_rows: int``: Number of rows to read. - - ``row_index: tuple[name, offset] | None``: Add an integer index - column with given name. - """ + with_columns: list[str] + """Projected columns to return.""" + skip_rows: int + """Rows to skip at the start when reading.""" + n_rows: int + """Number of rows to read after skipping.""" + row_index: tuple[str, int] | None + """If not None add an integer index column of the given name.""" predicate: expr.NamedExpr | None """Mask to apply to the read dataframe.""" def __post_init__(self) -> None: """Validate preconditions.""" + super().__post_init__() if self.typ not in ("csv", "parquet", "ndjson"): # pragma: no cover # This line is unhittable ATM since IPC/Anonymous scan raise # on the polars side raise NotImplementedError(f"Unhandled scan type: {self.typ}") - if self.typ == "ndjson" and self.file_options.n_rows is not None: - raise NotImplementedError("row limit in scan") + if self.typ == "ndjson" and (self.n_rows != -1 or self.skip_rows != 0): + raise NotImplementedError("row limit in scan for json reader") + if self.skip_rows < 0: + # TODO: polars has this implemented for parquet, + # maybe we can do this too? + raise NotImplementedError("slice pushdown for negative slices") + if self.typ == "csv" and self.skip_rows != 0: # pragma: no cover + # This comes from slice pushdown, but that + # optimization doesn't happen right now + raise NotImplementedError("skipping rows in CSV reader") if self.cloud_options is not None and any( self.cloud_options.get(k) is not None for k in ("aws", "azure", "gcp") ): raise NotImplementedError( "Read from cloud storage" ) # pragma: no cover; no test yet + if any(p.startswith("https://") for p in self.paths): + raise NotImplementedError("Read from https") if self.typ == "csv": if self.reader_options["skip_rows_after_header"] != 0: raise NotImplementedError("Skipping rows after header in CSV reader") @@ -242,13 +250,21 @@ def __post_init__(self) -> None: raise NotImplementedError( "ignore_errors is not supported in the JSON reader" ) + elif ( + self.typ == "parquet" + and self.row_index is not None + and self.with_columns is not None + and len(self.with_columns) == 0 + ): + raise NotImplementedError( + "Reading only parquet metadata to produce row index." 
+ ) def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" - options = self.file_options - with_columns = options.with_columns - row_index = options.row_index - nrows = self.file_options.n_rows if self.file_options.n_rows is not None else -1 + with_columns = self.with_columns + row_index = self.row_index + n_rows = self.n_rows if self.typ == "csv": parse_options = self.reader_options["parse_options"] sep = chr(parse_options["separator"]) @@ -256,7 +272,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: eol = chr(parse_options["eol_char"]) if self.reader_options["schema"] is not None: # Reader schema provides names - column_names = list(self.reader_options["schema"]["inner"].keys()) + column_names = list(self.reader_options["schema"]["fields"].keys()) else: # file provides column names column_names = None @@ -282,6 +298,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: # polars skips blank lines at the beginning of the file pieces = [] + read_partial = n_rows != -1 for p in self.paths: skiprows = self.reader_options["skip_rows"] path = Path(p) @@ -303,9 +320,13 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: comment=comment, decimal=decimal, dtypes=self.schema, - nrows=nrows, + nrows=n_rows, ) pieces.append(tbl_w_meta) + if read_partial: + n_rows -= tbl_w_meta.tbl.num_rows() + if n_rows <= 0: + break tables, colnames = zip( *( (piece.tbl, piece.column_names(include_children=False)) @@ -321,7 +342,8 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: tbl_w_meta = plc.io.parquet.read_parquet( plc.io.SourceInfo(self.paths), columns=with_columns, - nrows=nrows, + nrows=n_rows, + skip_rows=self.skip_rows, ) df = DataFrame.from_table( tbl_w_meta.tbl, @@ -354,12 +376,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: raise NotImplementedError( f"Unhandled scan type: {self.typ}" ) # pragma: no cover; post init trips first - if ( - row_index is not None - # TODO: remove condition when dropping support for polars 1.0 - # https://github.com/pola-rs/polars/pull/17363 - and row_index[0] in self.schema - ): + if row_index is not None: name, offset = row_index dtype = self.schema[name] step = plc.interop.from_arrow( @@ -481,36 +498,6 @@ def evaluate( return DataFrame(columns) -def placeholder_column(n: int) -> plc.Column: - """ - Produce a placeholder pylibcudf column with NO BACKING DATA. - - Parameters - ---------- - n - Number of rows the column will advertise - - Returns - ------- - pylibcudf Column that is almost unusable. DO NOT ACCESS THE DATA BUFFER. - - Notes - ----- - This is used to avoid allocating data for count aggregations. 
- """ - return plc.Column( - plc.DataType(plc.TypeId.INT8), - n, - plc.gpumemoryview( - types.SimpleNamespace(__cuda_array_interface__={"data": (1, True)}) - ), - None, - 0, - 0, - [], - ) - - @dataclasses.dataclass class GroupBy(IR): """Perform a groupby.""" @@ -557,8 +544,7 @@ def check_agg(agg: expr.Expr) -> int: def __post_init__(self) -> None: """Check whether all the aggregations are implemented.""" - if self.options.rolling is None and self.maintain_order: - raise NotImplementedError("Maintaining order in groupby") + super().__post_init__() if self.options.rolling: raise NotImplementedError( "rolling window/groupby" @@ -566,6 +552,8 @@ def __post_init__(self) -> None: if any(GroupBy.check_agg(a.value) > 1 for a in self.agg_requests): raise NotImplementedError("Nested aggregations in groupby") self.agg_infos = [req.collect_agg(depth=0) for req in self.agg_requests] + if len(self.keys) == 0: + raise NotImplementedError("dynamic groupby") def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" @@ -591,7 +579,10 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: for info in self.agg_infos: for pre_eval, req, rep in info.requests: if pre_eval is None: - col = placeholder_column(df.num_rows) + # A count aggregation, doesn't touch the column, + # but we need to have one. Rather than evaluating + # one, just use one of the key columns. + col = keys[0].obj else: col = pre_eval.evaluate(df).obj requests.append(plc.groupby.GroupByRequest(col, [req])) @@ -611,7 +602,34 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: results = [ req.evaluate(result_subs, mapping=mapping) for req in self.agg_requests ] - return DataFrame(broadcast(*result_keys, *results)).slice(self.options.slice) + broadcasted = broadcast(*result_keys, *results) + result_keys = broadcasted[: len(result_keys)] + results = broadcasted[len(result_keys) :] + # Handle order preservation of groups + # like cudf classic does + # https://github.com/rapidsai/cudf/blob/5780c4d8fb5afac2e04988a2ff5531f94c22d3a3/python/cudf/cudf/core/groupby/groupby.py#L723-L743 + if self.maintain_order and not sorted: + left = plc.stream_compaction.stable_distinct( + plc.Table([k.obj for k in keys]), + list(range(group_keys.num_columns())), + plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST, + plc.types.NullEquality.EQUAL, + plc.types.NanEquality.ALL_EQUAL, + ) + right = plc.Table([key.obj for key in result_keys]) + _, indices = plc.join.left_join(left, right, plc.types.NullEquality.EQUAL) + ordered_table = plc.copying.gather( + plc.Table([col.obj for col in broadcasted]), + indices, + plc.copying.OutOfBoundsPolicy.DONT_CHECK, + ) + broadcasted = [ + NamedColumn(reordered, b.name) + for reordered, b in zip( + ordered_table.columns(), broadcasted, strict=True + ) + ] + return DataFrame(broadcasted).slice(self.options.slice) @dataclasses.dataclass @@ -627,7 +645,7 @@ class Join(IR): right_on: list[expr.NamedExpr] """List of expressions used as keys in the right frame.""" options: tuple[ - Literal["inner", "left", "full", "leftsemi", "leftanti", "cross"], + Literal["inner", "left", "right", "full", "leftsemi", "leftanti", "cross"], bool, tuple[int, int] | None, str | None, @@ -644,6 +662,7 @@ class Join(IR): def __post_init__(self) -> None: """Validate preconditions.""" + super().__post_init__() if any( isinstance(e.value, expr.Literal) for e in itertools.chain(self.left_on, self.right_on) @@ -653,7 +672,7 @@ def __post_init__(self) -> None: 
@staticmethod @cache def _joiners( - how: Literal["inner", "left", "full", "leftsemi", "leftanti"], + how: Literal["inner", "left", "right", "full", "leftsemi", "leftanti"], ) -> tuple[ Callable, plc.copying.OutOfBoundsPolicy, plc.copying.OutOfBoundsPolicy | None ]: @@ -663,7 +682,7 @@ def _joiners( plc.copying.OutOfBoundsPolicy.DONT_CHECK, plc.copying.OutOfBoundsPolicy.DONT_CHECK, ) - elif how == "left": + elif how == "left" or how == "right": return ( plc.join.left_join, plc.copying.OutOfBoundsPolicy.DONT_CHECK, @@ -687,8 +706,7 @@ def _joiners( plc.copying.OutOfBoundsPolicy.DONT_CHECK, None, ) - else: - assert_never(how) + assert_never(how) def _reorder_maps( self, @@ -786,8 +804,12 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: table = plc.copying.gather(left.table, lg, left_policy) result = DataFrame.from_table(table, left.column_names) else: + if how == "right": + # Right join is a left join with the tables swapped + left, right = right, left + left_on, right_on = right_on, left_on lg, rg = join_fn(left_on.table, right_on.table, null_equality) - if how == "left": + if how == "left" or how == "right": # Order of left table is preserved lg, rg = self._reorder_maps( left.num_rows, lg, left_policy, right.num_rows, rg, right_policy @@ -815,6 +837,9 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: ) ) right = right.discard_columns(right_on.column_names_set) + if how == "right": + # Undo the swap for right join before gluing together. + left, right = right, left right = right.rename_columns( { name: f"{name}{suffix}" @@ -1065,11 +1090,13 @@ class MapFunction(IR): # "merge_sorted", "rename", "explode", + "unpivot", ] ) def __post_init__(self) -> None: """Validate preconditions.""" + super().__post_init__() if self.name not in MapFunction._NAMES: raise NotImplementedError(f"Unhandled map function {self.name}") if self.name == "explode": @@ -1086,6 +1113,22 @@ def __post_init__(self) -> None: set(new) & (set(self.df.schema.keys() - set(old))) ): raise NotImplementedError("Duplicate new names in rename.") + elif self.name == "unpivot": + indices, pivotees, variable_name, value_name = self.options + value_name = "value" if value_name is None else value_name + variable_name = "variable" if variable_name is None else variable_name + if len(pivotees) == 0: + index = frozenset(indices) + pivotees = [name for name in self.df.schema if name not in index] + if not all( + dtypes.can_cast(self.df.schema[p], self.schema[value_name]) + for p in pivotees + ): + raise NotImplementedError( + "Unpivot cannot cast all input columns to " + f"{self.schema[value_name].id()}" + ) + self.options = (indices, pivotees, variable_name, value_name) def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" @@ -1107,6 +1150,41 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return DataFrame.from_table( plc.lists.explode_outer(df.table, index), df.column_names ).sorted_like(df, subset=subset) + elif self.name == "unpivot": + indices, pivotees, variable_name, value_name = self.options + npiv = len(pivotees) + df = self.df.evaluate(cache=cache) + index_columns = [ + NamedColumn(col, name) + for col, name in zip( + plc.reshape.tile(df.select(indices).table, npiv).columns(), + indices, + strict=True, + ) + ] + (variable_column,) = plc.filling.repeat( + plc.Table( + [ + plc.interop.from_arrow( + pa.array( + pivotees, + type=plc.interop.to_arrow(self.schema[variable_name]), + ), + ) + ] 
+ ), + df.num_rows, + ).columns() + value_column = plc.concatenate.concatenate( + [c.astype(self.schema[value_name]) for c in df.select(pivotees).columns] + ) + return DataFrame( + [ + *index_columns, + NamedColumn(variable_column, variable_name), + NamedColumn(value_column, value_name), + ] + ) else: raise AssertionError("Should never be reached") # pragma: no cover @@ -1122,6 +1200,7 @@ class Union(IR): def __post_init__(self) -> None: """Validate preconditions.""" + super().__post_init__() schema = self.dfs[0].schema if not all(s.schema == schema for s in self.dfs[1:]): raise NotImplementedError("Schema mismatch") diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 6dc97c7cb51..45881afe0c8 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -75,13 +75,12 @@ def _translate_ir( def _( node: pl_ir.PythonScan, visitor: NodeTraverser, schema: dict[str, plc.DataType] ) -> ir.IR: - return ir.PythonScan( - schema, - node.options, - translate_named_expr(visitor, n=node.predicate) - if node.predicate is not None - else None, + scan_fn, with_columns, source_type, predicate, nrows = node.options + options = (scan_fn, with_columns, source_type, nrows) + predicate = ( + translate_named_expr(visitor, n=predicate) if predicate is not None else None ) + return ir.PythonScan(schema, options, predicate) @_translate_ir.register @@ -94,13 +93,35 @@ def _( cloud_options = None else: reader_options, cloud_options = map(json.loads, options) + if ( + typ == "csv" + and visitor.version()[0] == 1 + and reader_options["schema"] is not None + ): + reader_options["schema"] = { + "fields": reader_options["schema"]["inner"] + } # pragma: no cover; CI tests 1.7 + file_options = node.file_options + with_columns = file_options.with_columns + n_rows = file_options.n_rows + if n_rows is None: + n_rows = -1 # All rows + skip_rows = 0 # Don't skip + else: + # TODO: with versioning, rename on the rust side + skip_rows, n_rows = n_rows + + row_index = file_options.row_index return ir.Scan( schema, typ, reader_options, cloud_options, node.paths, - node.file_options, + with_columns, + skip_rows, + n_rows, + row_index, translate_named_expr(visitor, n=node.predicate) if node.predicate is not None else None, @@ -293,10 +314,28 @@ def translate_ir(visitor: NodeTraverser, *, n: int | None = None) -> ir.IR: ctx: AbstractContextManager[None] = ( set_node(visitor, n) if n is not None else noop_context ) + # IR is versioned with major.minor, minor is bumped for backwards + # compatible changes (e.g. adding new nodes), major is bumped for + # incompatible changes (e.g. renaming nodes). + # Polars 1.7 changes definition of the CSV reader options schema name. + if (version := visitor.version()) >= (3, 0): + raise NotImplementedError( + f"No support for polars IR {version=}" + ) # pragma: no cover; no such version for now. + with ctx: + polars_schema = visitor.get_schema() node = visitor.view_current_node() - schema = {k: dtypes.from_polars(v) for k, v in visitor.get_schema().items()} - return _translate_ir(node, visitor, schema) + schema = {k: dtypes.from_polars(v) for k, v in polars_schema.items()} + result = _translate_ir(node, visitor, schema) + if any( + isinstance(dtype, pl.Null) + for dtype in pl.datatypes.unpack_dtypes(*polars_schema.values()) + ): + raise NotImplementedError( + f"No GPU support for {result} with Null column dtype." 
+ ) + return result def translate_named_expr( @@ -345,6 +384,24 @@ def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> ex name, *options = node.function_data options = tuple(options) if isinstance(name, pl_expr.StringFunction): + if name in { + pl_expr.StringFunction.StripChars, + pl_expr.StringFunction.StripCharsStart, + pl_expr.StringFunction.StripCharsEnd, + }: + column, chars = (translate_expr(visitor, n=n) for n in node.input) + if isinstance(chars, expr.Literal): + if chars.value == pa.scalar(""): + # No-op in polars, but libcudf uses empty string + # as signifier to remove whitespace. + return column + elif chars.value == pa.scalar(None): + # Polars uses None to mean "strip all whitespace" + chars = expr.Literal( + column.dtype, + pa.scalar("", type=plc.interop.to_arrow(column.dtype)), + ) + return expr.StringFunction(dtype, name, options, column, chars) return expr.StringFunction( dtype, name, @@ -369,19 +426,43 @@ def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> ex *(translate_expr(visitor, n=n) for n in node.input), ) elif isinstance(name, pl_expr.TemporalFunction): - return expr.TemporalFunction( + # functions for which evaluation of the expression may not return + # the same dtype as polars, either due to libcudf returning a different + # dtype, or due to our internal processing affecting what libcudf returns + needs_cast = { + pl_expr.TemporalFunction.Year, + pl_expr.TemporalFunction.Month, + pl_expr.TemporalFunction.Day, + pl_expr.TemporalFunction.WeekDay, + pl_expr.TemporalFunction.Hour, + pl_expr.TemporalFunction.Minute, + pl_expr.TemporalFunction.Second, + pl_expr.TemporalFunction.Millisecond, + } + result_expr = expr.TemporalFunction( dtype, name, options, *(translate_expr(visitor, n=n) for n in node.input), ) + if name in needs_cast: + return expr.Cast(dtype, result_expr) + return result_expr + elif isinstance(name, str): - return expr.UnaryFunction( - dtype, - name, - options, - *(translate_expr(visitor, n=n) for n in node.input), - ) + children = (translate_expr(visitor, n=n) for n in node.input) + if name == "log": + (base,) = options + (child,) = children + return expr.BinOp( + dtype, + plc.binaryop.BinaryOperator.LOG_BASE, + child, + expr.Literal(dtype, pa.scalar(base, type=plc.interop.to_arrow(dtype))), + ) + elif name == "pow": + return expr.BinOp(dtype, plc.binaryop.BinaryOperator.POW, *children) + return expr.UnaryFunction(dtype, name, options, *children) raise NotImplementedError( f"No handler for Expr function node with {name=}" ) # pragma: no cover; polars raises on the rust side for now diff --git a/python/cudf_polars/cudf_polars/testing/asserts.py b/python/cudf_polars/cudf_polars/testing/asserts.py index d37c96a15de..a79d45899cd 100644 --- a/python/cudf_polars/cudf_polars/testing/asserts.py +++ b/python/cudf_polars/cudf_polars/testing/asserts.py @@ -5,12 +5,11 @@ from __future__ import annotations -from functools import partial from typing import TYPE_CHECKING +from polars import GPUEngine from polars.testing.asserts import assert_frame_equal -from cudf_polars.callback import execute_with_cudf from cudf_polars.dsl.translate import translate_ir if TYPE_CHECKING: @@ -77,21 +76,13 @@ def assert_gpu_result_equal( NotImplementedError If GPU collection failed in some way. 
""" - if collect_kwargs is None: - collect_kwargs = {} - final_polars_collect_kwargs = collect_kwargs.copy() - final_cudf_collect_kwargs = collect_kwargs.copy() - if polars_collect_kwargs is not None: - final_polars_collect_kwargs.update(polars_collect_kwargs) - if cudf_collect_kwargs is not None: # pragma: no cover - # exclude from coverage since not used ATM - # but this is probably still useful - final_cudf_collect_kwargs.update(cudf_collect_kwargs) - expect = lazydf.collect(**final_polars_collect_kwargs) - got = lazydf.collect( - **final_cudf_collect_kwargs, - post_opt_callback=partial(execute_with_cudf, raise_on_fail=True), + final_polars_collect_kwargs, final_cudf_collect_kwargs = _process_kwargs( + collect_kwargs, polars_collect_kwargs, cudf_collect_kwargs ) + + expect = lazydf.collect(**final_polars_collect_kwargs) + engine = GPUEngine(raise_on_fail=True) + got = lazydf.collect(**final_cudf_collect_kwargs, engine=engine) assert_frame_equal( expect, got, @@ -134,3 +125,94 @@ def assert_ir_translation_raises(q: pl.LazyFrame, *exceptions: type[Exception]) raise AssertionError(f"Translation DID NOT RAISE {exceptions}") from e else: raise AssertionError(f"Translation DID NOT RAISE {exceptions}") + + +def _process_kwargs( + collect_kwargs: dict[OptimizationArgs, bool] | None, + polars_collect_kwargs: dict[OptimizationArgs, bool] | None, + cudf_collect_kwargs: dict[OptimizationArgs, bool] | None, +) -> tuple[dict[OptimizationArgs, bool], dict[OptimizationArgs, bool]]: + if collect_kwargs is None: + collect_kwargs = {} + final_polars_collect_kwargs = collect_kwargs.copy() + final_cudf_collect_kwargs = collect_kwargs.copy() + if polars_collect_kwargs is not None: # pragma: no cover; not currently used + final_polars_collect_kwargs.update(polars_collect_kwargs) + if cudf_collect_kwargs is not None: # pragma: no cover; not currently used + final_cudf_collect_kwargs.update(cudf_collect_kwargs) + return final_polars_collect_kwargs, final_cudf_collect_kwargs + + +def assert_collect_raises( + lazydf: pl.LazyFrame, + *, + polars_except: type[Exception] | tuple[type[Exception], ...], + cudf_except: type[Exception] | tuple[type[Exception], ...], + collect_kwargs: dict[OptimizationArgs, bool] | None = None, + polars_collect_kwargs: dict[OptimizationArgs, bool] | None = None, + cudf_collect_kwargs: dict[OptimizationArgs, bool] | None = None, +): + """ + Assert that collecting the result of a query raises the expected exceptions. + + Parameters + ---------- + lazydf + frame to collect. + collect_kwargs + Common keyword arguments to pass to collect for both polars CPU and + cudf-polars. + Useful for controlling optimization settings. + polars_except + Exception or exceptions polars CPU is expected to raise. + cudf_except + Exception or exceptions polars GPU is expected to raise. + collect_kwargs + Common keyword arguments to pass to collect for both polars CPU and + cudf-polars. + Useful for controlling optimization settings. + polars_collect_kwargs + Keyword arguments to pass to collect for execution on polars CPU. + Overrides kwargs in collect_kwargs. + Useful for controlling optimization settings. + cudf_collect_kwargs + Keyword arguments to pass to collect for execution on cudf-polars. + Overrides kwargs in collect_kwargs. + Useful for controlling optimization settings. + + Returns + ------- + None + If both sides raise the expected exceptions. + + Raises + ------ + AssertionError + If either side did not raise the expected exceptions. 
+    """
+    final_polars_collect_kwargs, final_cudf_collect_kwargs = _process_kwargs(
+        collect_kwargs, polars_collect_kwargs, cudf_collect_kwargs
+    )
+
+    try:
+        lazydf.collect(**final_polars_collect_kwargs)
+    except polars_except:
+        pass
+    except Exception as e:
+        raise AssertionError(
+            f"CPU execution RAISED {type(e)}, EXPECTED {polars_except}"
+        ) from e
+    else:
+        raise AssertionError(f"CPU execution DID NOT RAISE {polars_except}")
+
+    engine = GPUEngine(raise_on_fail=True)
+    try:
+        lazydf.collect(**final_cudf_collect_kwargs, engine=engine)
+    except cudf_except:
+        pass
+    except Exception as e:
+        raise AssertionError(
+            f"GPU execution RAISED {type(e)}, EXPECTED {cudf_except}"
+        ) from e
+    else:
+        raise AssertionError(f"GPU execution DID NOT RAISE {cudf_except}")
diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py
new file mode 100644
index 00000000000..c40d59e6d33
--- /dev/null
+++ b/python/cudf_polars/cudf_polars/testing/plugin.py
@@ -0,0 +1,154 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Plugin for running polars test suite setting GPU engine as default."""
+
+from __future__ import annotations
+
+from functools import partialmethod
+from typing import TYPE_CHECKING
+
+import pytest
+
+import polars
+
+if TYPE_CHECKING:
+    from collections.abc import Mapping
+
+
+def pytest_addoption(parser: pytest.Parser):
+    """Add plugin-specific options."""
+    group = parser.getgroup(
+        "cudf-polars", "Plugin to set GPU as default engine for polars tests"
+    )
+    group.addoption(
+        "--cudf-polars-no-fallback",
+        action="store_true",
+        help="Turn off fallback to CPU when running tests (default: use fallback)",
+    )
+
+
+def pytest_configure(config: pytest.Config):
+    """Enable use of this module as a pytest plugin to enable GPU collection."""
+    no_fallback = config.getoption("--cudf-polars-no-fallback")
+    collect = polars.LazyFrame.collect
+    engine = polars.GPUEngine(raise_on_fail=no_fallback)
+    polars.LazyFrame.collect = partialmethod(collect, engine=engine)
+    config.addinivalue_line(
+        "filterwarnings",
+        "ignore:.*GPU engine does not support streaming or background collection",
+    )
+    config.addinivalue_line(
+        "filterwarnings",
+        "ignore:.*Query execution with GPU not supported",
+    )
+
+
+EXPECTED_FAILURES: Mapping[str, str] = {
+    "tests/unit/io/test_csv.py::test_compressed_csv": "Need to determine if file is compressed",
+    "tests/unit/io/test_csv.py::test_read_csv_only_loads_selected_columns": "Memory usage won't be correct due to GPU",
+    "tests/unit/io/test_lazy_count_star.py::test_count_compressed_csv_18057": "Need to determine if file is compressed",
+    "tests/unit/io/test_lazy_csv.py::test_scan_csv_slice_offset_zero": "Integer overflow in sliced read",
+    "tests/unit/io/test_lazy_parquet.py::test_parquet_is_in_statistics": "Debug output on stderr doesn't match",
+    "tests/unit/io/test_lazy_parquet.py::test_parquet_statistics": "Debug output on stderr doesn't match",
+    "tests/unit/io/test_lazy_parquet.py::test_parquet_different_schema[False]": "Needs cudf#16394",
+    "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_mismatch_panic_17067[False]": "Needs cudf#16394",
+    "tests/unit/io/test_lazy_parquet.py::test_parquet_slice_pushdown_non_zero_offset[False]": "Thrift data not handled correctly/slice pushdown wrong?",
+    "tests/unit/io/test_parquet.py::test_read_parquet_only_loads_selected_columns_15098": "Memory usage won't be correct due to GPU",
+
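For orientation, the collect override installed by `pytest_configure` above can be sketched as a standalone snippet. This is an illustrative reconstruction, not an extra part of the plugin, and it assumes cudf-polars is installed so that GPU collection is actually available:

```python
# Sketch of the plugin's approach: bind a default engine onto LazyFrame.collect
# so that every collect() call in the polars test suite uses the GPU engine.
from functools import partialmethod

import polars


def enable_gpu_collection(raise_on_fail: bool = False) -> None:
    original_collect = polars.LazyFrame.collect
    engine = polars.GPUEngine(raise_on_fail=raise_on_fail)
    # Keyword arguments supplied at call time override those bound here, so
    # individual tests can still pass engine=... explicitly.
    polars.LazyFrame.collect = partialmethod(original_collect, engine=engine)


enable_gpu_collection()
q = polars.LazyFrame({"a": [1, 2, 3]}).select(polars.col("a") * 2)
print(q.collect())  # collected with the GPU engine, falling back to CPU on failure
```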
"tests/unit/io/test_scan.py::test_scan[single-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_limit[single-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_filter[single-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_filter_and_limit[single-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_limit_and_filter[single-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_and_limit[single-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_and_filter[single-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_limit_and_filter[single-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan[glob-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_limit[glob-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_filter[glob-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_filter_and_limit[glob-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_limit_and_filter[glob-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_and_limit[glob-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_and_filter[glob-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_limit_and_filter[glob-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_limit[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_filter[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_filter_and_limit[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_limit_and_filter[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_and_limit[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_and_filter[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_limit_and_filter[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_projected_out[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_filter_and_limit[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_limit[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_filter[single-parquet-async]": "Debug output on stderr doesn't match", + 
"tests/unit/io/test_scan.py::test_scan_with_filter_and_limit[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_limit_and_filter[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_and_limit[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_and_filter[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_limit_and_filter[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_projected_out[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_filter_and_limit[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_parquet-write_parquet]": "Need to add include_file_path to IR", + "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_csv-write_csv]": "Need to add include_file_path to IR", + "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_ndjson-write_ndjson]": "Need to add include_file_path to IR", + "tests/unit/lazyframe/test_engine_selection.py::test_engine_import_error_raises[gpu]": "Expect this to pass because cudf-polars is installed", + "tests/unit/lazyframe/test_engine_selection.py::test_engine_import_error_raises[engine1]": "Expect this to pass because cudf-polars is installed", + "tests/unit/lazyframe/test_lazyframe.py::test_round[dtype1-123.55-1-123.6]": "Rounding midpoints is handled incorrectly", + "tests/unit/lazyframe/test_lazyframe.py::test_cast_frame": "Casting that raises not supported on GPU", + "tests/unit/lazyframe/test_lazyframe.py::test_lazy_cache_hit": "Debug output on stderr doesn't match", + "tests/unit/operations/aggregation/test_aggregations.py::test_duration_function_literal": "Broadcasting inside groupby-agg not supported", + "tests/unit/operations/aggregation/test_aggregations.py::test_sum_empty_and_null_set": "libcudf sums column of all nulls to null, not zero", + "tests/unit/operations/aggregation/test_aggregations.py::test_binary_op_agg_context_no_simplify_expr_12423": "groupby-agg of just literals should not produce collect_list", + "tests/unit/operations/aggregation/test_aggregations.py::test_nan_inf_aggregation": "treatment of nans and nulls together is different in libcudf and polars in groupby-agg context", + "tests/unit/operations/test_abs.py::test_abs_duration": "Need to raise for unsupported uops on timelike values", + "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input7-expected7-Float32-Float32]": "Mismatching dtypes, needs cudf#15852", + "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input10-expected10-Date-output_dtype10]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input11-expected11-input_dtype11-output_dtype11]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input12-expected12-input_dtype12-output_dtype12]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input13-expected13-input_dtype13-output_dtype13]": "Unsupported groupby-agg for a particular dtype", + 
"tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input7-expected7-Float32-Float32]": "Mismatching dtypes, needs cudf#15852", + "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input10-expected10-Date-output_dtype10]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input11-expected11-input_dtype11-output_dtype11]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input12-expected12-input_dtype12-output_dtype12]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input13-expected13-input_dtype13-output_dtype13]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input14-expected14-input_dtype14-output_dtype14]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input15-expected15-input_dtype15-output_dtype15]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input16-expected16-input_dtype16-output_dtype16]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_binary_agg_with_literal": "Incorrect broadcasting of literals in groupby-agg", + "tests/unit/operations/test_group_by.py::test_aggregated_scalar_elementwise_15602": "Unsupported boolean function/dtype combination in groupby-agg", + "tests/unit/operations/test_group_by.py::test_schemas[data1-expr1-expected_select1-expected_gb1]": "Mismatching dtypes, needs cudf#15852", + "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_by_monday_and_offset_5444": "IR needs to expose groupby-dynamic information", + "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_label[left-expected0]": "IR needs to expose groupby-dynamic information", + "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_label[right-expected1]": "IR needs to expose groupby-dynamic information", + "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_label[datapoint-expected2]": "IR needs to expose groupby-dynamic information", + "tests/unit/operations/test_group_by_dynamic.py::test_rolling_dynamic_sortedness_check": "IR needs to expose groupby-dynamic information", + "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_validation": "IR needs to expose groupby-dynamic information", + "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_15225": "IR needs to expose groupby-dynamic information", + "tests/unit/operations/test_join.py::test_cross_join_slice_pushdown": "Need to implement slice pushdown for cross joins", + "tests/unit/sql/test_cast.py::test_cast_errors[values0-values::uint8-conversion from `f64` to `u64` failed]": "Casting that raises not supported on GPU", + "tests/unit/sql/test_cast.py::test_cast_errors[values1-values::uint4-conversion from `i64` to `u32` failed]": "Casting that raises not supported on GPU", + "tests/unit/sql/test_cast.py::test_cast_errors[values2-values::int1-conversion from `i64` to `i8` failed]": "Casting that raises not supported on GPU", + "tests/unit/sql/test_miscellaneous.py::test_read_csv": "Incorrect handling of missing_is_null in read_csv", + 
"tests/unit/sql/test_wildcard_opts.py::test_select_wildcard_errors": "Raises correctly but with different exception", + "tests/unit/streaming/test_streaming_io.py::test_parquet_eq_statistics": "Debug output on stderr doesn't match", + "tests/unit/test_cse.py::test_cse_predicate_self_join": "Debug output on stderr doesn't match", + "tests/unit/test_empty.py::test_empty_9137": "Mismatching dtypes, needs cudf#15852", + # Maybe flaky, order-dependent? + "tests/unit/test_projections.py::test_schema_full_outer_join_projection_pd_13287": "Order-specific result check, query is correct but in different order", + "tests/unit/test_queries.py::test_group_by_agg_equals_zero_3535": "libcudf sums all nulls to null, not zero", +} + + +def pytest_collection_modifyitems( + session: pytest.Session, config: pytest.Config, items: list[pytest.Item] +): + """Mark known failing tests.""" + if config.getoption("--cudf-polars-no-fallback"): + # Don't xfail tests if running without fallback + return + for item in items: + if item.nodeid in EXPECTED_FAILURES: + item.add_marker(pytest.mark.xfail(reason=EXPECTED_FAILURES[item.nodeid])) diff --git a/python/cudf_polars/cudf_polars/typing/__init__.py b/python/cudf_polars/cudf_polars/typing/__init__.py index adab10bdded..240b11bdf59 100644 --- a/python/cudf_polars/cudf_polars/typing/__init__.py +++ b/python/cudf_polars/cudf_polars/typing/__init__.py @@ -84,6 +84,10 @@ def view_expression(self, n: int) -> Expr: """Convert the given expression to python rep.""" ... + def version(self) -> tuple[int, int]: + """The IR version as `(major, minor)`.""" + ... + def set_udf( self, callback: Callable[[list[str] | None, str | None, int | None], pl.DataFrame], diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index 7f6ea1edfd9..4154a404e98 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -13,7 +13,7 @@ import polars as pl -__all__ = ["from_polars", "downcast_arrow_lists"] +__all__ = ["from_polars", "downcast_arrow_lists", "can_cast"] def downcast_arrow_lists(typ: pa.DataType) -> pa.DataType: @@ -45,6 +45,28 @@ def downcast_arrow_lists(typ: pa.DataType) -> pa.DataType: return typ +def can_cast(from_: plc.DataType, to: plc.DataType) -> bool: + """ + Can we cast (via :func:`~.pylibcudf.unary.cast`) between two datatypes. 
+ + Parameters + ---------- + from_ + Source datatype + to + Target datatype + + Returns + ------- + True if casting is supported, False otherwise + """ + return ( + plc.traits.is_fixed_width(to) + and plc.traits.is_fixed_width(from_) + and plc.unary.is_supported_cast(from_, to) + ) + + @cache def from_polars(dtype: pl.DataType) -> plc.DataType: """ diff --git a/python/cudf_polars/cudf_polars/utils/versions.py b/python/cudf_polars/cudf_polars/utils/versions.py index 9807cffb384..2e6efde968c 100644 --- a/python/cudf_polars/cudf_polars/utils/versions.py +++ b/python/cudf_polars/cudf_polars/utils/versions.py @@ -12,18 +12,11 @@ POLARS_VERSION = parse(__version__) -POLARS_VERSION_GE_10 = POLARS_VERSION >= parse("1.0") -POLARS_VERSION_GE_11 = POLARS_VERSION >= parse("1.1") -POLARS_VERSION_GE_12 = POLARS_VERSION >= parse("1.2") -POLARS_VERSION_GE_121 = POLARS_VERSION >= parse("1.2.1") -POLARS_VERSION_GT_10 = POLARS_VERSION > parse("1.0") -POLARS_VERSION_GT_11 = POLARS_VERSION > parse("1.1") -POLARS_VERSION_GT_12 = POLARS_VERSION > parse("1.2") - -POLARS_VERSION_LE_12 = POLARS_VERSION <= parse("1.2") -POLARS_VERSION_LE_11 = POLARS_VERSION <= parse("1.1") -POLARS_VERSION_LT_12 = POLARS_VERSION < parse("1.2") -POLARS_VERSION_LT_11 = POLARS_VERSION < parse("1.1") - -if POLARS_VERSION < parse("1.0"): # pragma: no cover - raise ImportError("cudf_polars requires py-polars v1.0 or greater.") +POLARS_VERSION_GE_16 = POLARS_VERSION >= parse("1.6") +POLARS_VERSION_GT_16 = POLARS_VERSION > parse("1.6") +POLARS_VERSION_LT_16 = POLARS_VERSION < parse("1.6") + +if POLARS_VERSION_LT_16: + raise ImportError( + "cudf_polars requires py-polars v1.6 or greater." + ) # pragma: no cover diff --git a/python/cudf_polars/docs/overview.md b/python/cudf_polars/docs/overview.md index 6cd36136bf8..103ac1a674e 100644 --- a/python/cudf_polars/docs/overview.md +++ b/python/cudf_polars/docs/overview.md @@ -15,8 +15,10 @@ You will need: ## Installing polars -We will need to build polars from source. Until things settle down, -live at `HEAD`. +`cudf-polars` works with polars >= 1.3, as long as the internal IR +version doesn't get a major version bump. So `pip install polars>=1.3` +should work. For development, if we're adding things to the polars +side of things, we will need to build polars from source: ```sh git clone https://github.com/pola-rs/polars @@ -59,7 +61,7 @@ The executor for the polars logical plan lives in the cudf repo, in ```sh cd cudf/python/cudf_polars -uv pip install --no-build-isolation --no-deps -e . +pip install --no-build-isolation --no-deps -e . ``` You should now be able to run the tests in the `cudf_polars` package: @@ -69,16 +71,18 @@ pytest -v tests # Executor design -The polars `LazyFrame.collect` functionality offers a -"post-optimization" callback that may be used by a third party library -to replace a node (or more, though we only replace a single node) in the -optimized logical plan with a Python callback that is to deliver the -result of evaluating the plan. This splits the execution of the plan -into two phases. First, a symbolic phase which translates to our -internal representation (IR). Second, an execution phase which executes -using our IR. - -The translation phase receives the a low-level Rust `NodeTraverse` +The polars `LazyFrame.collect` functionality offers configuration of +the engine to use for collection through the `engine` argument. 
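For reference, a quick sketch of how the `can_cast` helper added to `cudf_polars/utils/dtypes.py` above behaves; the `plc.DataType`/`plc.TypeId` spellings are assumed from how pylibcudf is used elsewhere in cudf-polars:

```python
import pylibcudf as plc

from cudf_polars.utils.dtypes import can_cast

int32 = plc.DataType(plc.TypeId.INT32)  # assumed constructor spelling
float64 = plc.DataType(plc.TypeId.FLOAT64)
string = plc.DataType(plc.TypeId.STRING)

# Both types are fixed-width and libcudf supports the cast: allowed.
assert can_cast(int32, float64)
# Strings are not fixed-width, so can_cast is conservative and refuses.
assert not can_cast(string, int32)
```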
At a
+low level, this provides for configuration of a "post-optimization"
+callback that may be used by a third party library to replace a node
+(or more, though we only replace a single node) in the optimized
+logical plan with a Python callback that is to deliver the result of
+evaluating the plan. This splits the execution of the plan into two
+phases. First, a symbolic phase which translates to our internal
+representation (IR). Second, an execution phase which executes using
+our IR.
+
+The translation phase receives a low-level Rust `NodeTraverser`
 object which delivers Python representations of the plan nodes (and
 expressions) one at a time. During translation, we endeavour to raise
 `NotImplementedError` for any unsupported functionality. This way, if
@@ -86,33 +90,60 @@ we can't execute something, we just don't modify the logical plan at
 all: if we can translate the IR, it is assumed that evaluation will
 later succeed.
 
-The usage of the cudf-based executor is therefore, at present:
+The cudf-based executor is therefore selected by requesting the
+GPU engine:
 
 ```python
-from cudf_polars.callback import execute_with_cudf
+import polars as pl
 
-result = q.collect(post_opt_callback=execute_with_cudf)
+result = q.collect(engine="gpu")
 ```
 
 This should either transparently run on the GPU and deliver a polars
 dataframe, or else fail (but be handled) and just run the normal CPU
-execution.
+execution. If `POLARS_VERBOSE` is true, then fallback is logged with a
+`PerformanceWarning`.
 
-If you want to fail during translation, set the keyword argument
-`raise_on_fail` to `True`:
+As well as a string argument, the engine can also be specified with a
+polars `GPUEngine` object. This allows passing more configuration in.
+Currently, the public properties are `device`, to select the device,
+and `memory_resource`, to select the RMM memory resource used for
+allocations during the collection phase.
+For example:
 
 ```python
-from functools import partial
-from cudf_polars.callback import execute_with_cudf
+import polars as pl
 
-result = q.collect(
-    post_opt_callback=partial(execute_with_cudf, raise_on_fail=True)
-)
+result = q.collect(engine=pl.GPUEngine(device=1, memory_resource=mr))
+```
+
+This uses device 1 and the given memory resource. Note that the memory
+resource provided _must_ be valid for allocations on the specified
+device; no checking is performed.
+
+For debugging purposes, we can also pass undocumented keyword
+arguments. At the moment, `raise_on_fail` is supported, which raises
+during translation rather than falling back:
+
+```python
+
+result = q.collect(engine=pl.GPUEngine(raise_on_fail=True))
 ```
 
 This is mostly useful when writing tests, since in that case we want
 any failures to propagate, rather than falling back to the CPU mode.
 
+## IR versioning
+
+On the polars side, the `NodeTraverser` object advertises an internal
+version (via `NodeTraverser.version()` as a `(major, minor)` tuple).
+`minor` version bumps are for backwards compatible changes (e.g.
+exposing new nodes), whereas `major` bumps are for incompatible
+changes. We can therefore attempt to detect the IR version
+(independently of the polars version) and dispatch, or error
+appropriately. This should be done during IR translation in
+`translate.py`.
+
 ## Adding a handler for a new plan node
 
 Plan node definitions live in `cudf_polars/dsl/ir.py`, these are
@@ -175,7 +206,7 @@ around their pylibcudf counterparts. We have four (in
 1. `Scalar` (a wrapper around a pylibcudf `Scalar`)
 2.
`Column` (a wrapper around a pylibcudf `Column`) -3. `NamedColumn` a `Column` with an additional name +3. `NamedColumn` (a `Column` with an additional name) 4. `DataFrame` (a wrapper around a pylibcudf `Table`) The interfaces offered by these are somewhat in flux, but broadly diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index 984b5487b98..857a8c14b2f 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -19,7 +19,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ - "polars>=1.0,<1.3", + "polars>=1.6", "pylibcudf==24.10.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ @@ -58,6 +58,9 @@ exclude_also = [ "class .*\\bProtocol\\):", "assert_never\\(" ] +# The cudf_polars test suite doesn't exercise the plugin, so we omit +# it from coverage checks. +omit = ["cudf_polars/testing/plugin.py"] [tool.ruff] line-length = 88 diff --git a/python/cudf_polars/tests/containers/test_dataframe.py b/python/cudf_polars/tests/containers/test_dataframe.py index 6b470268084..39fb44d55a5 100644 --- a/python/cudf_polars/tests/containers/test_dataframe.py +++ b/python/cudf_polars/tests/containers/test_dataframe.py @@ -9,6 +9,7 @@ import polars as pl from cudf_polars.containers import DataFrame, NamedColumn +from cudf_polars.testing.asserts import assert_gpu_result_equal def test_select_missing_raises(): @@ -140,3 +141,13 @@ def test_sorted_flags_preserved(with_nulls, nulls_last): assert b.null_order == b_null_order assert c.is_sorted == plc.types.Sorted.NO assert df.flags == gf.to_polars().flags + + +def test_empty_name_roundtrips_overlap(): + df = pl.LazyFrame({"": [1, 2, 3], "column_0": [4, 5, 6]}) + assert_gpu_result_equal(df) + + +def test_empty_name_roundtrips_no_overlap(): + df = pl.LazyFrame({"": [1, 2, 3], "b": [4, 5, 6]}) + assert_gpu_result_equal(df) diff --git a/python/cudf_polars/tests/expressions/test_agg.py b/python/cudf_polars/tests/expressions/test_agg.py index 245bde3acab..56055f4c6c2 100644 --- a/python/cudf_polars/tests/expressions/test_agg.py +++ b/python/cudf_polars/tests/expressions/test_agg.py @@ -7,15 +7,38 @@ import polars as pl from cudf_polars.dsl import expr -from cudf_polars.testing.asserts import assert_gpu_result_equal +from cudf_polars.testing.asserts import ( + assert_gpu_result_equal, + assert_ir_translation_raises, +) -@pytest.fixture(params=sorted(expr.Agg._SUPPORTED)) +@pytest.fixture( + params=[ + # regular aggs from Agg + "min", + "max", + "median", + "n_unique", + "first", + "last", + "mean", + "sum", + "count", + "std", + "var", + # scan aggs from UnaryFunction + "cum_min", + "cum_max", + "cum_prod", + "cum_sum", + ] +) def agg(request): return request.param -@pytest.fixture(params=[pl.Int32, pl.Float32, pl.Int16]) +@pytest.fixture(params=[pl.Int32, pl.Float32, pl.Int16, pl.Int8, pl.UInt16]) def dtype(request): return request.param @@ -34,6 +57,11 @@ def df(dtype, with_nulls, is_sorted): if is_sorted: values = sorted(values, key=lambda x: -1000 if x is None else x) + if dtype.is_unsigned_integer(): + values = pl.Series(values).abs() + if is_sorted: + values = values.sort() + df = pl.LazyFrame({"a": values}, schema={"a": dtype}) if is_sorted: return df.set_sorted("a") @@ -52,6 +80,51 @@ def test_agg(df, agg): assert_gpu_result_equal(q, check_dtypes=check_dtypes, check_exact=False) +def test_bool_agg(agg, request): + if agg 
== "cum_min" or agg == "cum_max": + pytest.skip("Does not apply") + request.applymarker( + pytest.mark.xfail( + condition=agg == "n_unique", + reason="Wrong dtype we get Int32, polars gets UInt32", + ) + ) + df = pl.LazyFrame({"a": [True, False, None, True]}) + expr = getattr(pl.col("a"), agg)() + q = df.select(expr) + + assert_gpu_result_equal(q) + + +@pytest.mark.parametrize("cum_agg", expr.UnaryFunction._supported_cum_aggs) +def test_cum_agg_reverse_unsupported(cum_agg): + df = pl.LazyFrame({"a": [1, 2, 3]}) + expr = getattr(pl.col("a"), cum_agg)(reverse=True) + q = df.select(expr) + + assert_ir_translation_raises(q, NotImplementedError) + + +@pytest.mark.parametrize("q", [0.5, pl.lit(0.5)]) +@pytest.mark.parametrize("interp", ["nearest", "higher", "lower", "midpoint", "linear"]) +def test_quantile(df, q, interp): + expr = pl.col("a").quantile(q, interp) + q = df.select(expr) + + # https://github.com/rapidsai/cudf/issues/15852 + check_dtypes = q.collect_schema()["a"] == pl.Float64 + if not check_dtypes: + with pytest.raises(AssertionError): + assert_gpu_result_equal(q) + assert_gpu_result_equal(q, check_dtypes=check_dtypes, check_exact=False) + + +def test_quantile_invalid_q(df): + expr = pl.col("a").quantile(pl.col("a")) + q = df.select(expr) + assert_ir_translation_raises(q, NotImplementedError) + + @pytest.mark.parametrize( "op", [pl.Expr.min, pl.Expr.nan_min, pl.Expr.max, pl.Expr.nan_max] ) diff --git a/python/cudf_polars/tests/expressions/test_booleanfunction.py b/python/cudf_polars/tests/expressions/test_booleanfunction.py index 97421008669..2347021c40e 100644 --- a/python/cudf_polars/tests/expressions/test_booleanfunction.py +++ b/python/cudf_polars/tests/expressions/test_booleanfunction.py @@ -17,15 +17,11 @@ def has_nulls(request): return request.param -@pytest.mark.parametrize( - "ignore_nulls", - [ - pytest.param( - False, marks=pytest.mark.xfail(reason="No support for Kleene logic") - ), - True, - ], -) +@pytest.fixture(params=[False, True], ids=["include_nulls", "ignore_nulls"]) +def ignore_nulls(request): + return request.param + + def test_booleanfunction_reduction(ignore_nulls): ldf = pl.LazyFrame( { @@ -43,6 +39,25 @@ def test_booleanfunction_reduction(ignore_nulls): assert_gpu_result_equal(query) +@pytest.mark.parametrize("expr", [pl.Expr.any, pl.Expr.all]) +def test_booleanfunction_all_any_kleene(expr, ignore_nulls): + ldf = pl.LazyFrame( + { + "a": [False, None], + "b": [False, False], + "c": [False, True], + "d": [None, False], + "e": pl.Series([None, None], dtype=pl.Boolean()), + "f": [None, True], + "g": [True, False], + "h": [True, None], + "i": [True, True], + } + ) + q = ldf.select(expr(pl.col("*"), ignore_nulls=ignore_nulls)) + assert_gpu_result_equal(q) + + @pytest.mark.parametrize( "expr", [ @@ -54,14 +69,7 @@ def test_booleanfunction_reduction(ignore_nulls): ids=lambda f: f"{f.__name__}()", ) @pytest.mark.parametrize("has_nans", [False, True], ids=["no_nans", "nans"]) -def test_boolean_function_unary(request, expr, has_nans, has_nulls): - if has_nulls and expr in (pl.Expr.is_nan, pl.Expr.is_not_nan): - request.applymarker( - pytest.mark.xfail( - reason="Need to copy null mask since is_{not_}nan(null) => null" - ) - ) - +def test_boolean_function_unary(expr, has_nans, has_nulls): values: list[float | None] = [1, 2, 3, 4, 5] if has_nans: values[3] = float("nan") @@ -119,9 +127,7 @@ def test_boolean_isbetween(closed, bounds): "expr", [pl.any_horizontal("*"), pl.all_horizontal("*")], ids=["any", "all"] ) @pytest.mark.parametrize("wide", [False, True], 
ids=["narrow", "wide"]) -def test_boolean_horizontal(request, expr, has_nulls, wide): - if has_nulls: - request.applymarker(pytest.mark.xfail(reason="No support for Kleene logic")) +def test_boolean_horizontal(expr, has_nulls, wide): ldf = pl.LazyFrame( { "a": [False, False, False, False, False, True], @@ -164,6 +170,18 @@ def test_boolean_is_in(expr): assert_gpu_result_equal(q) +@pytest.mark.parametrize("expr", [pl.Expr.and_, pl.Expr.or_, pl.Expr.xor]) +def test_boolean_kleene_logic(expr): + ldf = pl.LazyFrame( + { + "a": [False, False, False, None, None, None, True, True, True], + "b": [False, None, True, False, None, True, False, None, True], + } + ) + q = ldf.select(expr(pl.col("a"), pl.col("b"))) + assert_gpu_result_equal(q) + + def test_boolean_is_in_raises_unsupported(): ldf = pl.LazyFrame({"a": pl.Series([1, 2, 3], dtype=pl.Int64)}) q = ldf.select(pl.col("a").is_in(pl.lit(1, dtype=pl.Int32()))) diff --git a/python/cudf_polars/tests/expressions/test_datetime_basic.py b/python/cudf_polars/tests/expressions/test_datetime_basic.py index 218101bf87c..c6ea29ddd38 100644 --- a/python/cudf_polars/tests/expressions/test_datetime_basic.py +++ b/python/cudf_polars/tests/expressions/test_datetime_basic.py @@ -9,7 +9,11 @@ import polars as pl -from cudf_polars.testing.asserts import assert_gpu_result_equal +from cudf_polars.dsl.expr import TemporalFunction +from cudf_polars.testing.asserts import ( + assert_gpu_result_equal, + assert_ir_translation_raises, +) @pytest.mark.parametrize( @@ -37,26 +41,97 @@ def test_datetime_dataframe_scan(dtype): assert_gpu_result_equal(query) +datetime_extract_fields = [ + "year", + "month", + "day", + "weekday", + "hour", + "minute", + "second", + "millisecond", + "microsecond", + "nanosecond", +] + + +@pytest.fixture( + ids=datetime_extract_fields, + params=[methodcaller(f) for f in datetime_extract_fields], +) +def field(request): + return request.param + + +def test_datetime_extract(field): + ldf = pl.LazyFrame( + { + "datetimes": pl.datetime_range( + datetime.datetime(2020, 1, 1), + datetime.datetime(2021, 12, 30), + "3mo14h15s11ms33us999ns", + eager=True, + ) + } + ) + + q = ldf.select(field(pl.col("datetimes").dt)) + + assert_gpu_result_equal(q) + + +def test_datetime_extra_unsupported(monkeypatch): + ldf = pl.LazyFrame( + { + "datetimes": pl.datetime_range( + datetime.datetime(2020, 1, 1), + datetime.datetime(2021, 12, 30), + "3mo14h15s11ms33us999ns", + eager=True, + ) + } + ) + + def unsupported_name_setter(self, value): + pass + + def unsupported_name_getter(self): + return "unsupported" + + monkeypatch.setattr( + TemporalFunction, + "name", + property(unsupported_name_getter, unsupported_name_setter), + ) + + q = ldf.select(pl.col("datetimes").dt.nanosecond()) + + assert_ir_translation_raises(q, NotImplementedError) + + @pytest.mark.parametrize( "field", [ methodcaller("year"), - pytest.param( - methodcaller("day"), - marks=pytest.mark.xfail(reason="day extraction not implemented"), - ), + methodcaller("month"), + methodcaller("day"), + methodcaller("weekday"), ], ) -def test_datetime_extract(field): +def test_date_extract(field): + ldf = pl.LazyFrame( + { + "dates": [ + datetime.date(2024, 1, 1), + datetime.date(2024, 10, 11), + ] + } + ) + ldf = pl.LazyFrame( {"dates": [datetime.date(2024, 1, 1), datetime.date(2024, 10, 11)]} ) - q = ldf.select(field(pl.col("dates").dt)) - with pytest.raises(AssertionError): - # polars produces int32, libcudf produces int16 for the year extraction - # libcudf can lose data here. 
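The datetime tests above exercise the newly supported `dt` field extraction (with the cast inserted during translation so result dtypes match polars); a small end-to-end sketch with arbitrary dates:

```python
import datetime

import polars as pl

q = pl.LazyFrame(
    {"dates": [datetime.date(2024, 1, 1), datetime.date(2024, 10, 11)]}
).select(
    year=pl.col("dates").dt.year(),
    month=pl.col("dates").dt.month(),
    weekday=pl.col("dates").dt.weekday(),
)
# The GPU engine should agree with the CPU engine, including output dtypes,
# because translation wraps these extractions in a cast where needed.
print(q.collect(engine="gpu"))
```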
- # https://github.com/rapidsai/cudf/issues/16196 - assert_gpu_result_equal(q) + q = ldf.select(field(pl.col("dates").dt)) - assert_gpu_result_equal(q, check_dtypes=False) + assert_gpu_result_equal(q) diff --git a/python/cudf_polars/tests/expressions/test_gather.py b/python/cudf_polars/tests/expressions/test_gather.py index 6bffa3e252c..f7c5d1bf2cd 100644 --- a/python/cudf_polars/tests/expressions/test_gather.py +++ b/python/cudf_polars/tests/expressions/test_gather.py @@ -6,7 +6,6 @@ import polars as pl -from cudf_polars import execute_with_cudf from cudf_polars.testing.asserts import assert_gpu_result_equal @@ -47,4 +46,4 @@ def test_gather_out_of_bounds(negative): query = ldf.select(pl.col("a").gather(pl.col("b"))) with pytest.raises(pl.exceptions.ComputeError): - query.collect(post_opt_callback=execute_with_cudf) + query.collect(engine="gpu") diff --git a/python/cudf_polars/tests/expressions/test_numeric_unaryops.py b/python/cudf_polars/tests/expressions/test_numeric_unaryops.py new file mode 100644 index 00000000000..ac3aecf88e6 --- /dev/null +++ b/python/cudf_polars/tests/expressions/test_numeric_unaryops.py @@ -0,0 +1,91 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import numpy as np +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.fixture( + params=[ + "sin", + "cos", + "tan", + "arcsin", + "arccos", + "arctan", + "sinh", + "cosh", + "tanh", + "arcsinh", + "arccosh", + "arctanh", + "exp", + "sqrt", + "cbrt", + "ceil", + "floor", + "abs", + ] +) +def op(request): + return request.param + + +@pytest.fixture(params=[pl.Int32, pl.Float32]) +def dtype(request): + return request.param + + +@pytest.fixture +def ldf(with_nulls, dtype): + values = [1, 2, 4, 5, -2, -4, 0] + if with_nulls: + values.append(None) + if dtype == pl.Float32: + values.append(-float("inf")) + values.append(float("nan")) + values.append(float("inf")) + elif dtype == pl.Int32: + iinfo = np.iinfo("int32") + values.append(iinfo.min) + values.append(iinfo.max) + return pl.LazyFrame( + { + "a": pl.Series(values, dtype=dtype), + "b": pl.Series([i - 4 for i in range(len(values))], dtype=pl.Float32), + } + ) + + +def test_unary(ldf, op): + expr = getattr(pl.col("a"), op)() + q = ldf.select(expr) + assert_gpu_result_equal(q, check_exact=False) + + +@pytest.mark.parametrize("base_literal", [False, True]) +@pytest.mark.parametrize("exponent_literal", [False, True]) +def test_pow(ldf, base_literal, exponent_literal): + base = pl.lit(2) if base_literal else pl.col("a") + exponent = pl.lit(-3, dtype=pl.Float32) if exponent_literal else pl.col("b") + + q = ldf.select(base.pow(exponent)) + + assert_gpu_result_equal(q, check_exact=False) + + +@pytest.mark.parametrize("natural", [True, False]) +def test_log(ldf, natural): + if natural: + expr = pl.col("a").log() + else: + expr = pl.col("a").log(10) + + q = ldf.select(expr) + + assert_gpu_result_equal(q, check_exact=False) diff --git a/python/cudf_polars/tests/expressions/test_stringfunction.py b/python/cudf_polars/tests/expressions/test_stringfunction.py index df08e15baa4..4f6850ac977 100644 --- a/python/cudf_polars/tests/expressions/test_stringfunction.py +++ b/python/cudf_polars/tests/expressions/test_stringfunction.py @@ -10,6 +10,7 @@ from cudf_polars import execute_with_cudf from cudf_polars.testing.asserts import ( + assert_collect_raises, assert_gpu_result_equal, assert_ir_translation_raises, ) @@ 
-152,3 +153,187 @@ def test_slice_column(slice_column_data): else: query = slice_column_data.select(pl.col("a").str.slice(pl.col("start"))) assert_ir_translation_raises(query, NotImplementedError) + + +@pytest.fixture +def to_datetime_data(): + return pl.LazyFrame( + { + "a": [ + "2021-01-01", + "2021-01-02", + "abcd", + ] + } + ) + + +@pytest.mark.parametrize("cache", [True, False], ids=lambda cache: f"{cache=}") +@pytest.mark.parametrize("strict", [True, False], ids=lambda strict: f"{strict=}") +@pytest.mark.parametrize("exact", [True, False], ids=lambda exact: f"{exact=}") +@pytest.mark.parametrize("format", ["%Y-%m-%d", None], ids=lambda format: f"{format=}") +def test_to_datetime(to_datetime_data, cache, strict, format, exact): + query = to_datetime_data.select( + pl.col("a").str.strptime( + pl.Datetime("ns"), format=format, cache=cache, strict=strict, exact=exact + ) + ) + if cache or format is None or not exact: + assert_ir_translation_raises(query, NotImplementedError) + elif strict: + assert_collect_raises( + query, + polars_except=pl.exceptions.InvalidOperationError, + cudf_except=pl.exceptions.ComputeError, + ) + else: + assert_gpu_result_equal(query) + + +@pytest.mark.parametrize( + "target, repl", + [("a", "a"), ("Wı", "☺"), ("FG", ""), ("doesnotexist", "blahblah")], # noqa: RUF001 +) +@pytest.mark.parametrize("n", [0, 3, -1]) +def test_replace_literal(ldf, target, repl, n): + query = ldf.select(pl.col("a").str.replace(target, repl, literal=True, n=n)) + assert_gpu_result_equal(query) + + +@pytest.mark.parametrize("target, repl", [("", ""), ("a", pl.col("a"))]) +def test_replace_literal_unsupported(ldf, target, repl): + query = ldf.select(pl.col("a").str.replace(target, repl, literal=True)) + assert_ir_translation_raises(query, NotImplementedError) + + +def test_replace_re(ldf): + query = ldf.select(pl.col("a").str.replace("A", "a", literal=False)) + assert_ir_translation_raises(query, NotImplementedError) + + +@pytest.mark.parametrize( + "target,repl", + [ + (["A", "de", "kLm", "awef"], "a"), + (["A", "de", "kLm", "awef"], ""), + (["A", "de", "kLm", "awef"], ["a", "b", "c", "d"]), + (["A", "de", "kLm", "awef"], ["a", "b", "c", ""]), + ( + pl.lit(pl.Series(["A", "de", "kLm", "awef"])), + pl.lit(pl.Series(["a", "b", "c", "d"])), + ), + ], +) +def test_replace_many(ldf, target, repl): + query = ldf.select(pl.col("a").str.replace_many(target, repl)) + + assert_gpu_result_equal(query) + + +@pytest.mark.parametrize( + "target,repl", + [(["A", ""], ["a", "b"]), (pl.col("a").drop_nulls(), pl.col("a").drop_nulls())], +) +def test_replace_many_notimplemented(ldf, target, repl): + query = ldf.select(pl.col("a").str.replace_many(target, repl)) + assert_ir_translation_raises(query, NotImplementedError) + + +def test_replace_many_ascii_case(ldf): + query = ldf.select( + pl.col("a").str.replace_many(["a", "b", "c"], "a", ascii_case_insensitive=True) + ) + + assert_ir_translation_raises(query, NotImplementedError) + + +_strip_data = [ + "AbC", + "123abc", + "", + " ", + None, + "aAaaaAAaa", + " ab c ", + "abc123", + " ", + "\tabc\t", + "\nabc\n", + "\r\nabc\r\n", + "\t\n abc \n\t", + "!@#$%^&*()", + " abc!!! ", + " abc\t\n!!! 
", + "__abc__", + "abc\n\n", + "123abc456", + "abcxyzabc", +] + +strip_chars = [ + "a", + "", + " ", + "\t", + "\n", + "\r\n", + "!", + "@#", + "123", + "xyz", + "abc", + "__", + " \t\n", + "abc123", + None, +] + + +@pytest.fixture +def strip_ldf(): + return pl.DataFrame({"a": _strip_data}).lazy() + + +@pytest.fixture(params=strip_chars) +def to_strip(request): + return request.param + + +def test_strip_chars(strip_ldf, to_strip): + q = strip_ldf.select(pl.col("a").str.strip_chars(to_strip)) + assert_gpu_result_equal(q) + + +def test_strip_chars_start(strip_ldf, to_strip): + q = strip_ldf.select(pl.col("a").str.strip_chars_start(to_strip)) + assert_gpu_result_equal(q) + + +def test_strip_chars_end(strip_ldf, to_strip): + q = strip_ldf.select(pl.col("a").str.strip_chars_end(to_strip)) + assert_gpu_result_equal(q) + + +def test_strip_chars_column(strip_ldf): + q = strip_ldf.select(pl.col("a").str.strip_chars(pl.col("a"))) + assert_ir_translation_raises(q, NotImplementedError) + + +def test_invalid_regex_raises(): + df = pl.LazyFrame({"a": ["abc"]}) + + q = df.select(pl.col("a").str.contains(r"ab)", strict=True)) + + assert_collect_raises( + q, + polars_except=pl.exceptions.ComputeError, + cudf_except=pl.exceptions.ComputeError, + ) + + +@pytest.mark.parametrize("pattern", ["a{1000}", "a(?i:B)"]) +def test_unsupported_regex_raises(pattern): + df = pl.LazyFrame({"a": ["abc"]}) + + q = df.select(pl.col("a").str.contains(pattern, strict=True)) + assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/test_config.py b/python/cudf_polars/tests/test_config.py index 5b4bba55552..3c3986be19b 100644 --- a/python/cudf_polars/tests/test_config.py +++ b/python/cudf_polars/tests/test_config.py @@ -6,6 +6,9 @@ import pytest import polars as pl +from polars.testing.asserts import assert_frame_equal + +import rmm from cudf_polars.dsl.ir import IR from cudf_polars.testing.asserts import ( @@ -32,3 +35,48 @@ def raise_unimplemented(self): ): # And ensure that collecting issues the correct warning. 
assert_gpu_result_equal(q) + + +def test_unsupported_config_raises(): + q = pl.LazyFrame({}) + + with pytest.raises(pl.exceptions.ComputeError): + q.collect(engine=pl.GPUEngine(unknown_key=True)) + + +@pytest.mark.parametrize("device", [-1, "foo"]) +def test_invalid_device_raises(device): + q = pl.LazyFrame({}) + with pytest.raises(pl.exceptions.ComputeError): + q.collect(engine=pl.GPUEngine(device=device)) + + +@pytest.mark.parametrize("mr", [1, object()]) +def test_invalid_memory_resource_raises(mr): + q = pl.LazyFrame({}) + with pytest.raises(pl.exceptions.ComputeError): + q.collect(engine=pl.GPUEngine(memory_resource=mr)) + + +def test_explicit_device_zero(): + q = pl.LazyFrame({"a": [1, 2, 3]}) + + result = q.collect(engine=pl.GPUEngine(device=0)) + assert_frame_equal(q.collect(), result) + + +def test_explicit_memory_resource(): + upstream = rmm.mr.CudaMemoryResource() + n_allocations = 0 + + def allocate(bytes, stream): + nonlocal n_allocations + n_allocations += 1 + return upstream.allocate(bytes, stream) + + mr = rmm.mr.CallbackMemoryResource(allocate, upstream.deallocate) + + q = pl.LazyFrame({"a": [1, 2, 3]}) + result = q.collect(engine=pl.GPUEngine(memory_resource=mr)) + assert_frame_equal(q.collect(), result) + assert n_allocations > 0 diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py index a75825ef3d3..6f996e0e0ec 100644 --- a/python/cudf_polars/tests/test_groupby.py +++ b/python/cudf_polars/tests/test_groupby.py @@ -12,7 +12,6 @@ assert_gpu_result_equal, assert_ir_translation_raises, ) -from cudf_polars.utils import versions @pytest.fixture @@ -31,6 +30,7 @@ def df(): params=[ [pl.col("key1")], [pl.col("key2")], + [pl.col("key1"), pl.lit(1)], [pl.col("key1") * pl.col("key2")], [pl.col("key1"), pl.col("key2")], [pl.col("key1") == pl.col("key2")], @@ -52,6 +52,7 @@ def keys(request): [(pl.col("float") - pl.lit(2)).max()], [pl.col("float").sum().round(decimals=1)], [pl.col("float").round(decimals=1).sum()], + [pl.col("int").first(), pl.col("float").last()], ], ids=lambda aggs: "-".join(map(str, aggs)), ) @@ -60,15 +61,7 @@ def exprs(request): @pytest.fixture( - params=[ - False, - pytest.param( - True, - marks=pytest.mark.xfail( - reason="Maintaining order in groupby not implemented" - ), - ), - ], + params=[False, True], ids=["no_maintain_order", "maintain_order"], ) def maintain_order(request): @@ -98,15 +91,10 @@ def test_groupby_sorted_keys(df: pl.LazyFrame, keys, exprs): # Multiple keys don't do sorting qsorted = q.sort(*sort_keys) if len(keys) > 1: - with pytest.raises(AssertionError): - # https://github.com/pola-rs/polars/issues/17556 - assert_gpu_result_equal(q, check_exact=False) - if versions.POLARS_VERSION_LT_12 and schema[sort_keys[1]] == pl.Boolean(): - # https://github.com/pola-rs/polars/issues/17557 - with pytest.raises(AssertionError): - assert_gpu_result_equal(qsorted, check_exact=False) - else: - assert_gpu_result_equal(qsorted, check_exact=False) + # https://github.com/pola-rs/polars/issues/17556 + # Can't assert that the query without post-sorting fails, + # since it _might_ pass. 
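Group-by with `maintain_order=True` is no longer marked as unimplemented in the fixtures above; a minimal sketch of the behaviour now covered, with arbitrary data:

```python
import polars as pl

df = pl.LazyFrame({"key": [2, 1, 2, None, 1], "value": [10, 20, 30, 40, 50]})

# maintain_order=True keeps groups in order of first appearance, so the GPU
# result can be compared directly against the CPU result without a post-sort.
q = df.group_by("key", maintain_order=True).agg(pl.col("value").sum())
print(q.collect(engine="gpu"))
```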
+ assert_gpu_result_equal(qsorted, check_exact=False) elif schema[sort_keys[0]] == pl.Boolean(): # Boolean keys don't do sorting, so we get random order assert_gpu_result_equal(qsorted, check_exact=False) @@ -133,6 +121,21 @@ def test_groupby_unsupported(df, expr): assert_ir_translation_raises(q, NotImplementedError) +def test_groupby_null_keys(maintain_order): + df = pl.LazyFrame( + { + "key": pl.Series([1, float("nan"), 2, None, 2, None], dtype=pl.Float64()), + "value": [-1, 2, 1, 2, 3, 4], + } + ) + + q = df.group_by("key", maintain_order=maintain_order).agg(pl.col("value").min()) + if not maintain_order: + q = q.sort("key") + + assert_gpu_result_equal(q) + + @pytest.mark.xfail(reason="https://github.com/pola-rs/polars/issues/17513") def test_groupby_minmax_with_nan(): df = pl.LazyFrame( @@ -159,15 +162,7 @@ def test_groupby_nan_minmax_raises(op): @pytest.mark.parametrize( "key", - [ - pytest.param( - 1, - marks=pytest.mark.xfail( - versions.POLARS_VERSION_GE_121, reason="polars 1.2.1 disallows this" - ), - ), - pl.col("key1"), - ], + [1, pl.col("key1")], ) @pytest.mark.parametrize( "expr", @@ -183,3 +178,12 @@ def test_groupby_literal_in_agg(df, key, expr): # so just sort by the group key q = df.group_by(key).agg(expr).sort(key, maintain_order=True) assert_gpu_result_equal(q) + + +@pytest.mark.parametrize( + "expr", + [pl.col("int").unique(), pl.col("int").drop_nulls(), pl.col("int").cum_max()], +) +def test_groupby_unary_non_pointwise_raises(df, expr): + q = df.group_by("key1").agg(expr) + assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/test_groupby_dynamic.py b/python/cudf_polars/tests/test_groupby_dynamic.py new file mode 100644 index 00000000000..38b3ce74ac5 --- /dev/null +++ b/python/cudf_polars/tests/test_groupby_dynamic.py @@ -0,0 +1,29 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +from datetime import datetime + +import polars as pl + +from cudf_polars.testing.asserts import assert_ir_translation_raises + + +def test_groupby_dynamic_raises(): + df = pl.LazyFrame( + { + "dt": [ + datetime(2021, 12, 31, 0, 0, 0), + datetime(2022, 1, 1, 0, 0, 1), + datetime(2022, 3, 31, 0, 0, 1), + datetime(2022, 4, 1, 0, 0, 1), + ] + } + ) + + q = ( + df.sort("dt") + .group_by_dynamic("dt", every="1q") + .agg(pl.col("dt").count().alias("num_values")) + ) + assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py index 1e880cdc6de..7d9ec98db97 100644 --- a/python/cudf_polars/tests/test_join.py +++ b/python/cudf_polars/tests/test_join.py @@ -17,7 +17,7 @@ def join_nulls(request): return request.param -@pytest.fixture(params=["inner", "left", "semi", "anti", "full"]) +@pytest.fixture(params=["inner", "left", "right", "semi", "anti", "full"]) def how(request): return request.param diff --git a/python/cudf_polars/tests/test_mapfunction.py b/python/cudf_polars/tests/test_mapfunction.py index 77032108e6f..e895f27f637 100644 --- a/python/cudf_polars/tests/test_mapfunction.py +++ b/python/cudf_polars/tests/test_mapfunction.py @@ -61,3 +61,48 @@ def test_rename_columns(mapping): q = df.rename(mapping) assert_gpu_result_equal(q) + + +@pytest.mark.parametrize("index", [None, ["a"], ["d", "a"]]) +@pytest.mark.parametrize("variable_name", [None, "names"]) +@pytest.mark.parametrize("value_name", [None, "unpivoted"]) +def test_unpivot(index, variable_name, value_name): + df = pl.LazyFrame( + { + "a": ["x", "y", "z"], + "b": pl.Series([1, 3, 5], dtype=pl.Int16), + "c": pl.Series([2, 4, 6], dtype=pl.Float32), + "d": ["a", "b", "c"], + } + ) + q = df.unpivot( + ["c", "b"], index=index, variable_name=variable_name, value_name=value_name + ) + + assert_gpu_result_equal(q) + + +def test_unpivot_defaults(): + df = pl.LazyFrame( + { + "a": pl.Series([11, 12, 13], dtype=pl.UInt16), + "b": pl.Series([1, 3, 5], dtype=pl.Int16), + "c": pl.Series([2, 4, 6], dtype=pl.Float32), + "d": ["a", "b", "c"], + } + ) + q = df.unpivot(index="d") + assert_gpu_result_equal(q) + + +def test_unpivot_unsupported_cast_raises(): + df = pl.LazyFrame( + { + "a": ["x", "y", "z"], + "b": pl.Series([1, 3, 5], dtype=pl.Int16), + } + ) + + q = df.unpivot(["a", "b"]) + + assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/test_python_scan.py b/python/cudf_polars/tests/test_python_scan.py index fd8453b77c4..0cda89474a8 100644 --- a/python/cudf_polars/tests/test_python_scan.py +++ b/python/cudf_polars/tests/test_python_scan.py @@ -8,7 +8,9 @@ def test_python_scan(): - def source(with_columns, predicate, nrows): + def source(with_columns, predicate, nrows, *batch_size): + # PythonScan interface changes between 1.3 and 1.4 to add an + # extra batch_size argument return pl.DataFrame({"a": pl.Series([1, 2, 3], dtype=pl.Int8())}) q = pl.LazyFrame._scan_python_function({"a": pl.Int8}, source, pyarrow=False) diff --git a/python/cudf_polars/tests/test_scan.py b/python/cudf_polars/tests/test_scan.py index 64acbb076ed..792b136acd8 100644 --- a/python/cudf_polars/tests/test_scan.py +++ b/python/cudf_polars/tests/test_scan.py @@ -12,7 +12,6 @@ assert_gpu_result_equal, assert_ir_translation_raises, ) -from cudf_polars.utils import versions @pytest.fixture( @@ -58,6 +57,22 @@ def mask(request): return request.param +@pytest.fixture( + params=[ + None, + 
(1, 1), + ], + ids=[ + "no-slice", + "slice-second", + ], +) +def slice(request): + # For use in testing that we handle + # polars slice pushdown correctly + return request.param + + def make_source(df, path, format): """ Writes the passed polars df to a file of @@ -79,7 +94,9 @@ def make_source(df, path, format): ("parquet", pl.scan_parquet), ], ) -def test_scan(tmp_path, df, format, scan_fn, row_index, n_rows, columns, mask, request): +def test_scan( + tmp_path, df, format, scan_fn, row_index, n_rows, columns, mask, slice, request +): name, offset = row_index make_source(df, tmp_path / "file", format) request.applymarker( @@ -94,21 +111,23 @@ def test_scan(tmp_path, df, format, scan_fn, row_index, n_rows, columns, mask, r row_index_offset=offset, n_rows=n_rows, ) + if slice is not None: + q = q.slice(*slice) if mask is not None: q = q.filter(mask) if columns is not None: q = q.select(*columns) - polars_collect_kwargs = {} - if versions.POLARS_VERSION_LT_12: - # https://github.com/pola-rs/polars/issues/17553 - polars_collect_kwargs = {"projection_pushdown": False} - assert_gpu_result_equal( - q, - polars_collect_kwargs=polars_collect_kwargs, - # This doesn't work in polars < 1.2 since the row-index - # is in the wrong order in previous polars releases - check_column_order=versions.POLARS_VERSION_LT_12, - ) + assert_gpu_result_equal(q) + + +def test_negative_slice_pushdown_raises(tmp_path): + df = pl.DataFrame({"a": [1, 2, 3]}) + + df.write_parquet(tmp_path / "df.parquet") + q = pl.scan_parquet(tmp_path / "df.parquet") + # Take the last row + q = q.slice(-1, 1) + assert_ir_translation_raises(q, NotImplementedError) def test_scan_unsupported_raises(tmp_path): @@ -127,10 +146,6 @@ def test_scan_ndjson_nrows_notimplemented(tmp_path, df): assert_ir_translation_raises(q, NotImplementedError) -@pytest.mark.xfail( - versions.POLARS_VERSION_LT_11, - reason="https://github.com/pola-rs/polars/issues/15730", -) def test_scan_row_index_projected_out(tmp_path): df = pl.DataFrame({"a": [1, 2, 3]}) @@ -169,15 +184,25 @@ def test_scan_csv_column_renames_projection_schema(tmp_path): ("test*.csv", False), ], ) -def test_scan_csv_multi(tmp_path, filename, glob): +@pytest.mark.parametrize( + "nrows_skiprows", + [ + (None, 0), + (1, 1), + (3, 0), + (4, 2), + ], +) +def test_scan_csv_multi(tmp_path, filename, glob, nrows_skiprows): + n_rows, skiprows = nrows_skiprows with (tmp_path / "test1.csv").open("w") as f: - f.write("""foo,bar,baz\n1,2\n3,4,5""") + f.write("""foo,bar,baz\n1,2,3\n3,4,5""") with (tmp_path / "test2.csv").open("w") as f: - f.write("""foo,bar,baz\n1,2\n3,4,5""") + f.write("""foo,bar,baz\n1,2,3\n3,4,5""") with (tmp_path / "test*.csv").open("w") as f: - f.write("""foo,bar,baz\n1,2\n3,4,5""") + f.write("""foo,bar,baz\n1,2,3\n3,4,5""") os.chdir(tmp_path) - q = pl.scan_csv(filename, glob=glob) + q = pl.scan_csv(filename, glob=glob, n_rows=n_rows, skip_rows=skiprows) assert_gpu_result_equal(q) @@ -280,3 +305,24 @@ def test_scan_ndjson_unsupported(df, tmp_path): make_source(df, tmp_path / "file", "ndjson") q = pl.scan_ndjson(tmp_path / "file", ignore_errors=True) assert_ir_translation_raises(q, NotImplementedError) + + +def test_scan_parquet_nested_null_raises(tmp_path): + df = pl.DataFrame({"a": pl.Series([None], dtype=pl.List(pl.Null))}) + + df.write_parquet(tmp_path / "file.pq") + + q = pl.scan_parquet(tmp_path / "file.pq") + + assert_ir_translation_raises(q, NotImplementedError) + + +def test_scan_parquet_only_row_index_raises(df, tmp_path): + make_source(df, tmp_path / "file", "parquet") + q = 
pl.scan_parquet(tmp_path / "file", row_index_name="index").select("index") + assert_ir_translation_raises(q, NotImplementedError) + + +def test_scan_hf_url_raises(): + q = pl.scan_csv("hf://datasets/scikit-learn/iris/Iris.csv") + assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/test_sort.py b/python/cudf_polars/tests/test_sort.py index ecc02efd967..cfa8e5ff9b9 100644 --- a/python/cudf_polars/tests/test_sort.py +++ b/python/cudf_polars/tests/test_sort.py @@ -13,10 +13,7 @@ "sort_keys", [ (pl.col("a"),), - pytest.param( - (pl.col("d").abs(),), - marks=pytest.mark.xfail(reason="abs not yet implemented"), - ), + (pl.col("d").abs(),), (pl.col("a"), pl.col("d")), (pl.col("b"),), ], diff --git a/python/cudf_polars/tests/testing/test_asserts.py b/python/cudf_polars/tests/testing/test_asserts.py index 5bc2fe1efb7..8e7f1a09d9b 100644 --- a/python/cudf_polars/tests/testing/test_asserts.py +++ b/python/cudf_polars/tests/testing/test_asserts.py @@ -7,7 +7,10 @@ import polars as pl +from cudf_polars.containers import DataFrame +from cudf_polars.dsl.ir import Select from cudf_polars.testing.asserts import ( + assert_collect_raises, assert_gpu_result_equal, assert_ir_translation_raises, ) @@ -26,10 +29,62 @@ def test_translation_assert_raises(): class E(Exception): pass - unsupported = df.group_by("a").agg(pl.col("a").cum_max().alias("b")) + unsupported = df.group_by("a").agg(pl.col("a").upper_bound().alias("b")) # Unsupported query should raise NotImplementedError assert_ir_translation_raises(unsupported, NotImplementedError) with pytest.raises(AssertionError): # This should fail, because we can't translate this query, but it doesn't raise E. assert_ir_translation_raises(unsupported, E) + + +def test_collect_assert_raises(monkeypatch): + df = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) + + with pytest.raises(AssertionError): + # This should raise, because polars CPU can run this query + assert_collect_raises( + df, + polars_except=pl.exceptions.InvalidOperationError, + cudf_except=pl.exceptions.InvalidOperationError, + ) + + # Here's an invalid query that gets caught at IR optimisation time. + q = df.select(pl.col("a") * pl.col("b")) + + # This exception is raised in preprocessing, so is the same for + # both CPU and GPU engines. + assert_collect_raises( + q, + polars_except=pl.exceptions.InvalidOperationError, + cudf_except=pl.exceptions.InvalidOperationError, + ) + + with pytest.raises(AssertionError): + # This should raise because the expected GPU error is wrong + assert_collect_raises( + q, + polars_except=pl.exceptions.InvalidOperationError, + cudf_except=NotImplementedError, + ) + + with pytest.raises(AssertionError): + # This should raise because the expected CPU error is wrong + assert_collect_raises( + q, + polars_except=NotImplementedError, + cudf_except=pl.exceptions.InvalidOperationError, + ) + + with monkeypatch.context() as m: + m.setattr(Select, "evaluate", lambda self, cache: DataFrame([])) + # This query should fail, but we monkeypatch a bad + # implementation of Select which "succeeds" to check that our + # assertion notices this case. 
+ q = df.select(pl.col("a") + pl.Series([1, 2])) + with pytest.raises(AssertionError): + assert_collect_raises( + q, + polars_except=pl.exceptions.ComputeError, + cudf_except=pl.exceptions.ComputeError, + ) diff --git a/python/dask_cudf/README.md b/python/dask_cudf/README.md index 4655d2165f0..69e1524be39 100644 --- a/python/dask_cudf/README.md +++ b/python/dask_cudf/README.md @@ -16,6 +16,7 @@ See the [RAPIDS install page](https://docs.rapids.ai/install) for the most up-to ## Resources - [Dask cuDF documentation](https://docs.rapids.ai/api/dask-cudf/stable/) +- [Best practices](https://docs.rapids.ai/api/dask-cudf/stable/best_practices/) - [cuDF documentation](https://docs.rapids.ai/api/cudf/stable/) - [10 Minutes to cuDF and Dask cuDF](https://docs.rapids.ai/api/cudf/stable/user_guide/10min/) - [Dask-CUDA documentation](https://docs.rapids.ai/api/dask-cuda/stable/) diff --git a/python/pylibcudf/pylibcudf/datetime.pyx b/python/pylibcudf/pylibcudf/datetime.pyx index 0ddc68bcb9d..e8e0caaf42d 100644 --- a/python/pylibcudf/pylibcudf/datetime.pyx +++ b/python/pylibcudf/pylibcudf/datetime.pyx @@ -2,7 +2,19 @@ from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.datetime cimport extract_year as cpp_extract_year +from pylibcudf.libcudf.datetime cimport ( + day_of_year as cpp_day_of_year, + extract_day as cpp_extract_day, + extract_hour as cpp_extract_hour, + extract_microsecond_fraction as cpp_extract_microsecond_fraction, + extract_millisecond_fraction as cpp_extract_millisecond_fraction, + extract_minute as cpp_extract_minute, + extract_month as cpp_extract_month, + extract_nanosecond_fraction as cpp_extract_nanosecond_fraction, + extract_second as cpp_extract_second, + extract_weekday as cpp_extract_weekday, + extract_year as cpp_extract_year, +) from .column cimport Column @@ -28,3 +40,42 @@ cpdef Column extract_year( with nogil: result = move(cpp_extract_year(values.view())) return Column.from_libcudf(move(result)) + + +def extract_datetime_component(Column col, str field): + + cdef unique_ptr[column] c_result + + with nogil: + if field == "year": + c_result = move(cpp_extract_year(col.view())) + elif field == "month": + c_result = move(cpp_extract_month(col.view())) + elif field == "day": + c_result = move(cpp_extract_day(col.view())) + elif field == "weekday": + c_result = move(cpp_extract_weekday(col.view())) + elif field == "hour": + c_result = move(cpp_extract_hour(col.view())) + elif field == "minute": + c_result = move(cpp_extract_minute(col.view())) + elif field == "second": + c_result = move(cpp_extract_second(col.view())) + elif field == "millisecond": + c_result = move( + cpp_extract_millisecond_fraction(col.view()) + ) + elif field == "microsecond": + c_result = move( + cpp_extract_microsecond_fraction(col.view()) + ) + elif field == "nanosecond": + c_result = move( + cpp_extract_nanosecond_fraction(col.view()) + ) + elif field == "day_of_year": + c_result = move(cpp_day_of_year(col.view())) + else: + raise ValueError(f"Invalid datetime field: '{field}'") + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt index bd6e2e0af02..abf4357f862 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. 
# ============================================================================= -set(cython_sources char_types.pyx regex_flags.pyx) +set(cython_sources char_types.pyx regex_flags.pyx side_type.pyx) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd index 3a89299f11a..019ff3f17ba 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd @@ -1,10 +1,10 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. from libc.stdint cimport int32_t cdef extern from "cudf/strings/side_type.hpp" namespace "cudf::strings" nogil: - ctypedef enum side_type: + cpdef enum class side_type(int32_t): LEFT 'cudf::strings::side_type::LEFT' RIGHT 'cudf::strings::side_type::RIGHT' BOTH 'cudf::strings::side_type::BOTH' diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pyx b/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt index d3065cf8667..8b4fbb1932f 100644 --- a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt @@ -12,8 +12,9 @@ # the License. # ============================================================================= -set(cython_sources capitalize.pyx case.pyx char_types.pyx contains.pyx extract.pyx find.pyx - regex_flags.pyx regex_program.pyx repeat.pyx replace.pyx slice.pyx +set(cython_sources + capitalize.pyx case.pyx char_types.pyx contains.pyx extract.pyx find.pyx regex_flags.pyx + regex_program.pyx repeat.pyx replace.pyx side_type.pyx slice.pyx strip.pyx ) set(linked_libraries cudf::cudf) @@ -22,3 +23,5 @@ rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_strings_ ASSOCIATED_TARGETS cudf ) + +add_subdirectory(convert) diff --git a/python/pylibcudf/pylibcudf/strings/__init__.pxd b/python/pylibcudf/pylibcudf/strings/__init__.pxd index 6848c8e6e86..4867d944dc7 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/__init__.pxd @@ -5,10 +5,13 @@ from . cimport ( case, char_types, contains, + convert, extract, find, regex_flags, regex_program, replace, slice, + strip, ) +from .side_type cimport side_type diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py index bba86e818cc..a3bef64d19f 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/__init__.py @@ -5,6 +5,7 @@ case, char_types, contains, + convert, extract, find, regex_flags, @@ -12,4 +13,6 @@ repeat, replace, slice, + strip, ) +from .side_type import SideType diff --git a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt new file mode 100644 index 00000000000..175c9b3738e --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt @@ -0,0 +1,22 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +set(cython_sources convert_durations.pyx convert_datetime.pyx) + +set(linked_libraries cudf::cudf) +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_strings_ ASSOCIATED_TARGETS cudf +) diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd new file mode 100644 index 00000000000..05324cb49df --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd @@ -0,0 +1,2 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from . cimport convert_datetime, convert_durations diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.py b/python/pylibcudf/pylibcudf/strings/convert/__init__.py new file mode 100644 index 00000000000..d803399d53c --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from . import convert_datetime, convert_durations diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd new file mode 100644 index 00000000000..07c84d263d6 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd @@ -0,0 +1,18 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.string cimport string +from pylibcudf.column cimport Column +from pylibcudf.types cimport DataType + + +cpdef Column to_timestamps( + Column input, + DataType timestamp_type, + const string& format +) + +cpdef Column from_timestamps( + Column input, + const string& format, + Column input_strings_names +) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx new file mode 100644 index 00000000000..fcacb096f87 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx @@ -0,0 +1,56 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from libcpp.memory cimport unique_ptr +from libcpp.string cimport string +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings.convert cimport ( + convert_datetime as cpp_convert_datetime, +) + +from pylibcudf.types import DataType + + +cpdef Column to_timestamps( + Column input, + DataType timestamp_type, + const string& format +): + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_convert_datetime.to_timestamps( + input.view(), + timestamp_type.c_obj, + format + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column from_timestamps( + Column input, + const string& format, + Column input_strings_names +): + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_convert_datetime.from_timestamps( + input.view(), + format, + input_strings_names.view() + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column is_timestamp( + Column input, + const string& format +): + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_convert_datetime.is_timestamp( + input.view(), + format + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd new file mode 100644 index 00000000000..ac11b8959ed --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd @@ -0,0 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.string cimport string +from pylibcudf.column cimport Column +from pylibcudf.types cimport DataType + + +cpdef Column to_durations( + Column input, + DataType duration_type, + const string& format +) + +cpdef Column from_durations( + Column input, + const string& format +) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx new file mode 100644 index 00000000000..f3e0b7c9c8e --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx @@ -0,0 +1,41 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.string cimport string +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings.convert cimport ( + convert_durations as cpp_convert_durations, +) + +from pylibcudf.types import DataType + + +cpdef Column to_durations( + Column input, + DataType duration_type, + const string& format +): + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_convert_durations.to_durations( + input.view(), + duration_type.c_obj, + format + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column from_durations( + Column input, + const string& format +): + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_convert_durations.from_durations( + input.view(), + format + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/side_type.pxd b/python/pylibcudf/pylibcudf/strings/side_type.pxd new file mode 100644 index 00000000000..34b7a580380 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/side_type.pxd @@ -0,0 +1,3 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+
+from pylibcudf.libcudf.strings.side_type cimport side_type
diff --git a/python/pylibcudf/pylibcudf/strings/side_type.pyx b/python/pylibcudf/pylibcudf/strings/side_type.pyx
new file mode 100644
index 00000000000..acdc7d6ff1f
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/side_type.pyx
@@ -0,0 +1,4 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.libcudf.strings.side_type import \
+    side_type as SideType  # no-cython-lint
diff --git a/python/pylibcudf/pylibcudf/strings/strip.pxd b/python/pylibcudf/pylibcudf/strings/strip.pxd
new file mode 100644
index 00000000000..8bbe4753edd
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/strip.pxd
@@ -0,0 +1,12 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column cimport Column
+from pylibcudf.scalar cimport Scalar
+from pylibcudf.strings.side_type cimport side_type
+
+
+cpdef Column strip(
+    Column input,
+    side_type side=*,
+    Scalar to_strip=*
+)
diff --git a/python/pylibcudf/pylibcudf/strings/strip.pyx b/python/pylibcudf/pylibcudf/strings/strip.pyx
new file mode 100644
index 00000000000..429a23c3cdf
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/strip.pyx
@@ -0,0 +1,60 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from cython.operator cimport dereference
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.scalar.scalar_factories cimport (
+    make_string_scalar as cpp_make_string_scalar,
+)
+from pylibcudf.libcudf.strings cimport strip as cpp_strip
+from pylibcudf.scalar cimport Scalar
+from pylibcudf.strings.side_type cimport side_type
+
+
+cpdef Column strip(
+    Column input,
+    side_type side=side_type.BOTH,
+    Scalar to_strip=None
+):
+    """Removes the specified characters from the beginning
+    or end (or both) of each string.
+
+    For details, see :cpp:func:`cudf::strings::strip`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings column for this operation
+    side : SideType, default SideType.BOTH
+        Indicates whether characters are stripped from the beginning,
+        the end, or both ends of each string; default is both
+    to_strip : Scalar
+        UTF-8 encoded characters to strip from each string;
+        the default (an empty string) strips whitespace characters
+
+    Returns
+    -------
+    pylibcudf.Column
+        New strings column.
+    """
+
+    if to_strip is None:
+        to_strip = Scalar.from_libcudf(
+            cpp_make_string_scalar("".encode())
+        )
+
+    cdef unique_ptr[column] c_result
+    cdef string_scalar* cpp_to_strip
+    cpp_to_strip = <string_scalar *>(to_strip.c_obj.get())
+
+    with nogil:
+        c_result = cpp_strip.strip(
+            input.view(),
+            side,
+            dereference(cpp_to_strip)
+        )
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/tests/test_datetime.py b/python/pylibcudf/pylibcudf/tests/test_datetime.py
index d3aa6101e2d..89c96829e71 100644
--- a/python/pylibcudf/pylibcudf/tests/test_datetime.py
+++ b/python/pylibcudf/pylibcudf/tests/test_datetime.py
@@ -1,6 +1,7 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
import datetime +import functools import pyarrow as pa import pyarrow.compute as pc @@ -10,7 +11,7 @@ @pytest.fixture -def column(has_nulls): +def date_column(has_nulls): values = [ datetime.date(1999, 1, 1), datetime.date(2024, 10, 12), @@ -22,9 +23,41 @@ def column(has_nulls): return plc.interop.from_arrow(pa.array(values, type=pa.date32())) -def test_extract_year(column): - got = plc.datetime.extract_year(column) +@pytest.fixture(scope="module", params=["s", "ms", "us", "ns"]) +def datetime_column(has_nulls, request): + values = [ + datetime.datetime(1999, 1, 1), + datetime.datetime(2024, 10, 12), + datetime.datetime(1970, 1, 1), + datetime.datetime(2260, 1, 1), + datetime.datetime(2024, 2, 29, 3, 14, 15), + datetime.datetime(2024, 2, 29, 3, 14, 15, 999), + ] + if has_nulls: + values[2] = None + return plc.interop.from_arrow( + pa.array(values, type=pa.timestamp(request.param)) + ) + + +@pytest.mark.parametrize( + "component, pc_fun", + [ + ("year", pc.year), + ("month", pc.month), + ("day", pc.day), + ("weekday", functools.partial(pc.day_of_week, count_from_zero=False)), + ("hour", pc.hour), + ("minute", pc.minute), + ("second", pc.second), + ("millisecond", pc.millisecond), + ("microsecond", pc.microsecond), + ("nanosecond", pc.nanosecond), + ], +) +def test_extraction(datetime_column, component, pc_fun): + got = plc.datetime.extract_datetime_component(datetime_column, component) # libcudf produces an int16, arrow produces an int64 - expect = pc.year(plc.interop.to_arrow(column)).cast(pa.int16()) + expect = pc_fun(plc.interop.to_arrow(datetime_column)).cast(pa.int16()) assert_column_eq(expect, got) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert.py b/python/pylibcudf/pylibcudf/tests/test_string_convert.py new file mode 100644 index 00000000000..e9e95459d0e --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert.py @@ -0,0 +1,85 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from datetime import datetime + +import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import assert_column_eq + + +@pytest.fixture( + scope="module", + params=[ + pa.timestamp("ns"), + pa.timestamp("us"), + pa.timestamp("ms"), + pa.timestamp("s"), + ], +) +def timestamp_type(request): + return request.param + + +@pytest.fixture( + scope="module", + params=[ + pa.duration("ns"), + pa.duration("us"), + pa.duration("ms"), + pa.duration("s"), + ], +) +def duration_type(request): + return request.param + + +@pytest.fixture(scope="module") +def pa_timestamp_col(): + return pa.array(["2011-01-01", "2011-01-02", "2011-01-03"]) + + +@pytest.fixture(scope="module") +def pa_duration_col(): + return pa.array(["05:20:25"]) + + +@pytest.fixture(scope="module") +def plc_timestamp_col(pa_timestamp_col): + return plc.interop.from_arrow(pa_timestamp_col) + + +@pytest.fixture(scope="module") +def plc_duration_col(pa_duration_col): + return plc.interop.from_arrow(pa_duration_col) + + +@pytest.mark.parametrize("format", ["%Y-%m-%d"]) +def test_to_datetime( + pa_timestamp_col, plc_timestamp_col, timestamp_type, format +): + expect = pa.compute.strptime(pa_timestamp_col, format, timestamp_type.unit) + got = plc.strings.convert.convert_datetime.to_timestamps( + plc_timestamp_col, + plc.interop.from_arrow(timestamp_type), + format.encode(), + ) + assert_column_eq(expect, got) + + +@pytest.mark.parametrize("format", ["%H:%M:%S"]) +def test_to_duration(pa_duration_col, plc_duration_col, duration_type, format): + def to_timedelta(duration_str): + date = datetime.strptime(duration_str, format) + return date - datetime(1900, 1, 1) # "%H:%M:%S" zero date + + expect = pa.array([to_timedelta(d.as_py()) for d in pa_duration_col]).cast( + duration_type + ) + + got = plc.strings.convert.convert_durations.to_durations( + plc_duration_col, + plc.interop.from_arrow(duration_type), + format.encode(), + ) + assert_column_eq(expect, got) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_strip.py b/python/pylibcudf/pylibcudf/tests/test_string_strip.py new file mode 100644 index 00000000000..005e5e4a405 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_strip.py @@ -0,0 +1,122 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import assert_column_eq + +data_strings = [ + "AbC", + "123abc", + "", + " ", + None, + "aAaaaAAaa", + " ab c ", + "abc123", + " ", + "\tabc\t", + "\nabc\n", + "\r\nabc\r\n", + "\t\n abc \n\t", + "!@#$%^&*()", + " abc!!! ", + " abc\t\n!!! 
", + "__abc__", + "abc\n\n", + "123abc456", + "abcxyzabc", +] + +strip_chars = [ + "a", + "", + " ", + "\t", + "\n", + "\r\n", + "!", + "@#", + "123", + "xyz", + "abc", + "__", + " \t\n", + "abc123", +] + + +@pytest.fixture +def pa_col(): + return pa.array(data_strings, type=pa.string()) + + +@pytest.fixture +def plc_col(pa_col): + return plc.interop.from_arrow(pa_col) + + +@pytest.fixture(params=strip_chars) +def pa_char(request): + return pa.scalar(request.param, type=pa.string()) + + +@pytest.fixture +def plc_char(pa_char): + return plc.interop.from_arrow(pa_char) + + +def test_strip(pa_col, plc_col, pa_char, plc_char): + def strip_string(st, char): + if st is None: + return None + + elif char == "": + return st.strip() + return st.strip(char) + + expected = pa.array( + [strip_string(x, pa_char.as_py()) for x in pa_col.to_pylist()], + type=pa.string(), + ) + + got = plc.strings.strip.strip(plc_col, plc.strings.SideType.BOTH, plc_char) + assert_column_eq(expected, got) + + +def test_strip_right(pa_col, plc_col, pa_char, plc_char): + def strip_string(st, char): + if st is None: + return None + + elif char == "": + return st.rstrip() + return st.rstrip(char) + + expected = pa.array( + [strip_string(x, pa_char.as_py()) for x in pa_col.to_pylist()], + type=pa.string(), + ) + + got = plc.strings.strip.strip( + plc_col, plc.strings.SideType.RIGHT, plc_char + ) + assert_column_eq(expected, got) + + +def test_strip_left(pa_col, plc_col, pa_char, plc_char): + def strip_string(st, char): + if st is None: + return None + + elif char == "": + return st.lstrip() + return st.lstrip(char) + + expected = pa.array( + [strip_string(x, pa_char.as_py()) for x in pa_col.to_pylist()], + type=pa.string(), + ) + + got = plc.strings.strip.strip(plc_col, plc.strings.SideType.LEFT, plc_char) + assert_column_eq(expected, got) diff --git a/python/pylibcudf/pylibcudf/tests/test_transform.py b/python/pylibcudf/pylibcudf/tests/test_transform.py index 06fc35d8835..d5c618f07e4 100644 --- a/python/pylibcudf/pylibcudf/tests/test_transform.py +++ b/python/pylibcudf/pylibcudf/tests/test_transform.py @@ -29,3 +29,54 @@ def test_nans_to_nulls(has_nans): got = input.with_mask(mask, null_count) assert_column_eq(expect, got) + + +def test_bools_to_mask_roundtrip(): + pa_array = pa.array([True, None, False]) + plc_input = plc.interop.from_arrow(pa_array) + mask, result_null_count = plc.transform.bools_to_mask(plc_input) + + assert result_null_count == 2 + result = plc_input.with_mask(mask, result_null_count) + assert_column_eq(pa.array([True, None, None]), result) + + plc_output = plc.transform.mask_to_bools(mask.ptr, 0, len(pa_array)) + result_pa = plc.interop.to_arrow(plc_output) + expected_pa = pa.chunked_array([[True, False, False]]) + assert result_pa.equals(expected_pa) + + +def test_encode(): + pa_table = pa.table({"a": [1, 3, 4], "b": [1, 2, 4]}) + plc_input = plc.interop.from_arrow(pa_table) + result_table, result_column = plc.transform.encode(plc_input) + pa_table_result = plc.interop.to_arrow(result_table) + pa_column_result = plc.interop.to_arrow(result_column) + + pa_table_expected = pa.table( + [[1, 3, 4], [1, 2, 4]], + schema=pa.schema( + [ + pa.field("", pa.int64(), nullable=False), + pa.field("", pa.int64(), nullable=False), + ] + ), + ) + assert pa_table_result.equals(pa_table_expected) + + pa_column_expected = pa.chunked_array([[0, 1, 2]], type=pa.int32()) + assert pa_column_result.equals(pa_column_expected) + + +def test_one_hot_encode(): + pa_column = pa.array([1, 2, 3]) + pa_categories = pa.array([0, 0, 0]) + 
plc_input = plc.interop.from_arrow(pa_column) + plc_categories = plc.interop.from_arrow(pa_categories) + plc_table = plc.transform.one_hot_encode(plc_input, plc_categories) + result = plc.interop.to_arrow(plc_table) + expected = pa.table( + [[False] * 3] * 3, + schema=pa.schema([pa.field("", pa.bool_(), nullable=False)] * 3), + ) + assert result.equals(expected) diff --git a/python/pylibcudf/pylibcudf/transform.pxd b/python/pylibcudf/pylibcudf/transform.pxd index 4b21feffe25..b530f433c97 100644 --- a/python/pylibcudf/pylibcudf/transform.pxd +++ b/python/pylibcudf/pylibcudf/transform.pxd @@ -1,7 +1,21 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp cimport bool +from pylibcudf.libcudf.types cimport bitmask_type, data_type from .column cimport Column from .gpumemoryview cimport gpumemoryview +from .table cimport Table +from .types cimport DataType cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input) + +cpdef tuple[gpumemoryview, int] bools_to_mask(Column input) + +cpdef Column mask_to_bools(Py_ssize_t bitmask, int begin_bit, int end_bit) + +cpdef Column transform(Column input, str unary_udf, DataType output_type, bool is_ptx) + +cpdef tuple[Table, Column] encode(Table input) + +cpdef Table one_hot_encode(Column input_column, Column categories) diff --git a/python/pylibcudf/pylibcudf/transform.pyx b/python/pylibcudf/pylibcudf/transform.pyx index 100ccb580ce..bcd6185521a 100644 --- a/python/pylibcudf/pylibcudf/transform.pyx +++ b/python/pylibcudf/pylibcudf/transform.pyx @@ -1,14 +1,20 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr +from libcpp.string cimport string from libcpp.utility cimport move, pair from pylibcudf.libcudf cimport transform as cpp_transform -from pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.table.table_view cimport table_view +from pylibcudf.libcudf.types cimport bitmask_type, size_type from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer from .column cimport Column from .gpumemoryview cimport gpumemoryview +from .types cimport DataType +from .utils cimport int_to_bitmask_ptr cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input): @@ -32,3 +38,141 @@ cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input): gpumemoryview(DeviceBuffer.c_from_unique_ptr(move(c_result.first))), c_result.second ) + + +cpdef tuple[gpumemoryview, int] bools_to_mask(Column input): + """Create a bitmask from a column of boolean elements + + Parameters + ---------- + input : Column + Column to produce new mask from. + + Returns + ------- + tuple[gpumemoryview, int] + Two-tuple of a gpumemoryview wrapping the bitmask and the null count. + """ + cdef pair[unique_ptr[device_buffer], size_type] c_result + + with nogil: + c_result = move(cpp_transform.bools_to_mask(input.view())) + + return ( + gpumemoryview(DeviceBuffer.c_from_unique_ptr(move(c_result.first))), + c_result.second + ) + + +cpdef Column mask_to_bools(Py_ssize_t bitmask, int begin_bit, int end_bit): + """Creates a boolean column from given bitmask. 
+
+    Parameters
+    ----------
+    bitmask : int
+        Pointer to the bitmask which needs to be converted
+    begin_bit : int
+        Position of the bit from which the conversion should start
+    end_bit : int
+        Position of the bit before which the conversion should stop
+
+    Returns
+    -------
+    Column
+        Boolean column for the bits in the half-open range [begin_bit, end_bit)
+    """
+    cdef unique_ptr[column] c_result
+    cdef bitmask_type * bitmask_ptr = int_to_bitmask_ptr(bitmask)
+
+    with nogil:
+        c_result = move(cpp_transform.mask_to_bools(bitmask_ptr, begin_bit, end_bit))
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column transform(Column input, str unary_udf, DataType output_type, bool is_ptx):
+    """Create a new column by applying a unary function to every
+    element of an input column.
+
+    Parameters
+    ----------
+    input : Column
+        Column to transform.
+    unary_udf : str
+        The PTX/CUDA string of the unary function to apply.
+    output_type : DataType
+        The output type that is compatible with the output type in the unary_udf.
+    is_ptx : bool
+        If `True`, the UDF is treated as PTX code.
+        If `False`, the UDF is treated as CUDA code.
+
+    Returns
+    -------
+    Column
+        The transformed column having the UDF applied to each element.
+    """
+    cdef unique_ptr[column] c_result
+    cdef string c_unary_udf = unary_udf.encode()
+    cdef bool c_is_ptx = is_ptx
+
+    with nogil:
+        c_result = move(
+            cpp_transform.transform(
+                input.view(), c_unary_udf, output_type.c_obj, c_is_ptx
+            )
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+cpdef tuple[Table, Column] encode(Table input):
+    """Encode the rows of the given table as integers.
+
+    Parameters
+    ----------
+    input : Table
+        Table containing values to be encoded
+
+    Returns
+    -------
+    tuple[Table, Column]
+        The distinct rows of the input table in sorted order,
+        and a column of integer indices representing the encoded rows.
+    """
+    cdef pair[unique_ptr[table], unique_ptr[column]] c_result
+
+    with nogil:
+        c_result = move(cpp_transform.encode(input.view()))
+
+    return (
+        Table.from_libcudf(move(c_result.first)),
+        Column.from_libcudf(move(c_result.second))
+    )
+
+cpdef Table one_hot_encode(Column input, Column categories):
+    """Encodes `input` by generating a new column
+    for each value in `categories` indicating the presence
+    of that value in `input`.
+
+    Parameters
+    ----------
+    input : Column
+        Column containing values to be encoded.
+    categories : Column
+        Column containing categories.
+
+    Returns
+    -------
+    Table
+        A table of the encoded values.
+    """
+    cdef pair[unique_ptr[column], table_view] c_result
+    cdef Table owner_table
+
+    with nogil:
+        c_result = move(cpp_transform.one_hot_encode(input.view(), categories.view()))
+
+    owner_table = Table(
+        [Column.from_libcudf(move(c_result.first))] * c_result.second.num_columns()
+    )
+
+    return Table.from_table_view(c_result.second, owner_table)
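
The new pylibcudf entry points above (datetime component extraction, string stripping via the SideType enum, and the transform bindings) are each exercised in isolation by the tests in this change. As a rough composite sketch, assuming only an installed pylibcudf wheel plus pyarrow and reusing the call patterns from those tests, they can be driven directly like this:

import datetime

import pyarrow as pa
import pylibcudf as plc

# Datetime component extraction via the new extract_datetime_component.
ts = plc.interop.from_arrow(
    pa.array(
        [datetime.datetime(2024, 2, 29, 3, 14, 15), None],
        type=pa.timestamp("us"),
    )
)
years = plc.datetime.extract_datetime_component(ts, "year")

# Whitespace stripping with the new SideType enum and strip binding
# (to_strip defaults to the empty string, i.e. strip whitespace).
strings = plc.interop.from_arrow(pa.array(["  abc  ", None]))
stripped = plc.strings.strip.strip(strings, plc.strings.SideType.BOTH)

# One-hot encoding through the new transform binding (returns a Table).
column = plc.interop.from_arrow(pa.array([1, 2, 3]))
categories = plc.interop.from_arrow(pa.array([1, 2]))
one_hot = plc.transform.one_hot_encode(column, categories)

print(plc.interop.to_arrow(years))
print(plc.interop.to_arrow(stripped))
print(plc.interop.to_arrow(one_hot))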