From 22cefc94d05727607563d6519eb17e1eb95c5478 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Tue, 24 Sep 2024 21:40:11 -0500 Subject: [PATCH 1/6] Fix metadata after implicit array conversion from Dask cuDF (#16842) Temporary workaround for https://github.com/dask/dask/issues/11017 in Dask cuDF (when query-planning is enabled). I will try to move this fix upstream soon. However, the next dask release will probably not be used by 24.10, and it's still unclear whether the same fix works for all CPU cases. Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/16842 --- .../dask_cudf/dask_cudf/expr/_collection.py | 79 +++++++++++++------ python/dask_cudf/dask_cudf/tests/test_core.py | 17 ++-- 2 files changed, 65 insertions(+), 31 deletions(-) diff --git a/python/dask_cudf/dask_cudf/expr/_collection.py b/python/dask_cudf/dask_cudf/expr/_collection.py index 97e1dffc65b..c1dd16eac8d 100644 --- a/python/dask_cudf/dask_cudf/expr/_collection.py +++ b/python/dask_cudf/dask_cudf/expr/_collection.py @@ -202,27 +202,58 @@ class Index(DXIndex, CudfFrameBase): ## -try: - from dask_expr._backends import create_array_collection - - @get_collection_type.register_lazy("cupy") - def _register_cupy(): - import cupy - - @get_collection_type.register(cupy.ndarray) - def get_collection_type_cupy_array(_): - return create_array_collection - - @get_collection_type.register_lazy("cupyx") - def _register_cupyx(): - # Needed for cuml - from cupyx.scipy.sparse import spmatrix - - @get_collection_type.register(spmatrix) - def get_collection_type_csr_matrix(_): - return create_array_collection - -except ImportError: - # Older version of dask-expr. - # Implicit conversion to array wont work. - pass +def _create_array_collection_with_meta(expr): + # NOTE: This is the GPU compatible version of + # `new_dd_object` for DataFrame -> Array conversion. + # This can be removed if dask#11017 is resolved + # (See: https://github.com/dask/dask/issues/11017) + import numpy as np + + import dask.array as da + from dask.blockwise import Blockwise + from dask.highlevelgraph import HighLevelGraph + + result = expr.optimize() + dsk = result.__dask_graph__() + name = result._name + meta = result._meta + divisions = result.divisions + chunks = ((np.nan,) * (len(divisions) - 1),) + tuple( + (d,) for d in meta.shape[1:] + ) + if len(chunks) > 1: + if isinstance(dsk, HighLevelGraph): + layer = dsk.layers[name] + else: + # dask-expr provides a dict only + layer = dsk + if isinstance(layer, Blockwise): + layer.new_axes["j"] = chunks[1][0] + layer.output_indices = layer.output_indices + ("j",) + else: + suffix = (0,) * (len(chunks) - 1) + for i in range(len(chunks[0])): + layer[(name, i) + suffix] = layer.pop((name, i)) + + return da.Array(dsk, name=name, chunks=chunks, meta=meta) + + +@get_collection_type.register_lazy("cupy") +def _register_cupy(): + import cupy + + get_collection_type.register( + cupy.ndarray, + lambda _: _create_array_collection_with_meta, + ) + + +@get_collection_type.register_lazy("cupyx") +def _register_cupyx(): + # Needed for cuml + from cupyx.scipy.sparse import spmatrix + + get_collection_type.register( + spmatrix, + lambda _: _create_array_collection_with_meta, + ) diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index 7aa0f6320f2..9f54aba3e13 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -16,6 +16,7 @@ import dask_cudf from dask_cudf.tests.utils import ( + QUERY_PLANNING_ON, require_dask_expr, skip_dask_expr, xfail_dask_expr, @@ -950,12 +951,16 @@ def test_implicit_array_conversion_cupy(): def func(x): return x.values - # Need to compute the dask collection for now. - # See: https://github.com/dask/dask/issues/11017 - result = ds.map_partitions(func, meta=s.values).compute() - expect = func(s) + result = ds.map_partitions(func, meta=s.values) - dask.array.assert_eq(result, expect) + if QUERY_PLANNING_ON: + # Check Array and round-tripped DataFrame + dask.array.assert_eq(result, func(s)) + dd.assert_eq(result.to_dask_dataframe(), s, check_index=False) + else: + # Legacy version still carries numpy metadata + # See: https://github.com/dask/dask/issues/11017 + dask.array.assert_eq(result.compute(), func(s)) def test_implicit_array_conversion_cupy_sparse(): @@ -967,8 +972,6 @@ def test_implicit_array_conversion_cupy_sparse(): def func(x): return cupyx.scipy.sparse.csr_matrix(x.values) - # Need to compute the dask collection for now. - # See: https://github.com/dask/dask/issues/11017 result = ds.map_partitions(func, meta=s.values).compute() expect = func(s) From 9316309551d13bd258d7cb359cde0cc96019e0cd Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 24 Sep 2024 19:40:46 -0700 Subject: [PATCH 2/6] Remove unnecessary flag from build.sh (#16879) This CMake option was removed by #15483. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/cudf/pull/16879 --- build.sh | 5 ----- 1 file changed, 5 deletions(-) diff --git a/build.sh b/build.sh index 211e1db9fbf..69d6481af42 100755 --- a/build.sh +++ b/build.sh @@ -239,11 +239,6 @@ if hasArg --pydevelop; then PYTHON_ARGS_FOR_INSTALL="${PYTHON_ARGS_FOR_INSTALL} -e" fi -# Append `-DFIND_CUDF_CPP=ON` to EXTRA_CMAKE_ARGS unless a user specified the option. -if [[ "${EXTRA_CMAKE_ARGS}" != *"DFIND_CUDF_CPP"* ]]; then - EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DFIND_CUDF_CPP=ON" -fi - if hasArg --disable_large_strings; then BUILD_DISABLE_LARGE_STRINGS="ON" fi From 03c77c2176ee5f30ef3d10b9332ad9c3612db905 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 24 Sep 2024 18:02:05 -1000 Subject: [PATCH 3/6] Add string.findall APIs to pylibcudf (#16825) Contributes to https://github.com/rapidsai/cudf/issues/15162 Authors: - Matthew Roeschke (https://github.com/mroeschke) - Matthew Murray (https://github.com/Matt711) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/16825 --- .../api_docs/pylibcudf/strings/findall.rst | 6 +++ .../api_docs/pylibcudf/strings/index.rst | 1 + python/cudf/cudf/_lib/strings/findall.pyx | 35 +++++----------- .../pylibcudf/libcudf/strings/findall.pxd | 4 +- .../pylibcudf/strings/CMakeLists.txt | 4 +- .../pylibcudf/pylibcudf/strings/__init__.pxd | 1 + .../pylibcudf/pylibcudf/strings/__init__.py | 1 + .../pylibcudf/pylibcudf/strings/findall.pxd | 7 ++++ .../pylibcudf/pylibcudf/strings/findall.pyx | 40 +++++++++++++++++++ .../pylibcudf/tests/test_string_findall.py | 23 +++++++++++ 10 files changed, 93 insertions(+), 29 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/findall.rst create mode 100644 python/pylibcudf/pylibcudf/strings/findall.pxd create mode 100644 python/pylibcudf/pylibcudf/strings/findall.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/test_string_findall.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/findall.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/findall.rst new file mode 100644 index 00000000000..9850ee10098 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/findall.rst @@ -0,0 +1,6 @@ +==== +find +==== + +.. automodule:: pylibcudf.strings.findall + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst index 003e7c0c35e..9b1a6b72a88 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst @@ -9,6 +9,7 @@ strings contains extract find + findall regex_flags regex_program repeat diff --git a/python/cudf/cudf/_lib/strings/findall.pyx b/python/cudf/cudf/_lib/strings/findall.pyx index 3cf2084e30a..0e758d5b322 100644 --- a/python/cudf/cudf/_lib/strings/findall.pyx +++ b/python/cudf/cudf/_lib/strings/findall.pyx @@ -1,21 +1,13 @@ # Copyright (c) 2019-2024, NVIDIA CORPORATION. -from cython.operator cimport dereference from libc.stdint cimport uint32_t -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.findall cimport findall as cpp_findall -from pylibcudf.libcudf.strings.regex_flags cimport regex_flags -from pylibcudf.libcudf.strings.regex_program cimport regex_program - from cudf._lib.column cimport Column +import pylibcudf as plc + @acquire_spill_lock() def findall(Column source_strings, object pattern, uint32_t flags): @@ -23,18 +15,11 @@ def findall(Column source_strings, object pattern, uint32_t flags): Returns data with all non-overlapping matches of `pattern` in each string of `source_strings` as a lists column. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string pattern_string = str(pattern).encode() - cdef regex_flags c_flags = flags - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_findall( - source_view, - dereference(c_prog) - )) - - return Column.from_unique_ptr(move(c_result)) + prog = plc.strings.regex_program.RegexProgram.create( + str(pattern), flags + ) + plc_result = plc.strings.findall.findall( + source_strings.to_pylibcudf(mode="read"), + prog, + ) + return Column.from_pylibcudf(plc_result) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd index b25724586e1..e0a8b776465 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd @@ -9,5 +9,5 @@ from pylibcudf.libcudf.strings.regex_program cimport regex_program cdef extern from "cudf/strings/findall.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] findall( - column_view source_strings, - regex_program) except + + column_view input, + regex_program prog) except + diff --git a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt index 8b4fbb1932f..77f20b0b917 100644 --- a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt @@ -13,8 +13,8 @@ # ============================================================================= set(cython_sources - capitalize.pyx case.pyx char_types.pyx contains.pyx extract.pyx find.pyx regex_flags.pyx - regex_program.pyx repeat.pyx replace.pyx side_type.pyx slice.pyx strip.pyx + capitalize.pyx case.pyx char_types.pyx contains.pyx extract.pyx find.pyx findall.pyx + regex_flags.pyx regex_program.pyx repeat.pyx replace.pyx side_type.pyx slice.pyx strip.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/strings/__init__.pxd b/python/pylibcudf/pylibcudf/strings/__init__.pxd index 4867d944dc7..91d884b294b 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/__init__.pxd @@ -8,6 +8,7 @@ from . cimport ( convert, extract, find, + findall, regex_flags, regex_program, replace, diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py index a3bef64d19f..b4856784390 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/__init__.py @@ -8,6 +8,7 @@ convert, extract, find, + findall, regex_flags, regex_program, repeat, diff --git a/python/pylibcudf/pylibcudf/strings/findall.pxd b/python/pylibcudf/pylibcudf/strings/findall.pxd new file mode 100644 index 00000000000..54afa088141 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/findall.pxd @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.strings.regex_program cimport RegexProgram + + +cpdef Column findall(Column input, RegexProgram pattern) diff --git a/python/pylibcudf/pylibcudf/strings/findall.pyx b/python/pylibcudf/pylibcudf/strings/findall.pyx new file mode 100644 index 00000000000..03ecb13a50e --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/findall.pyx @@ -0,0 +1,40 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings cimport findall as cpp_findall +from pylibcudf.strings.regex_program cimport RegexProgram + + +cpdef Column findall(Column input, RegexProgram pattern): + """ + Returns a lists column of strings for each matching occurrence using + the regex_program pattern within each string. + + For details, see For details, see :cpp:func:`cudf::strings::findall`. + + Parameters + ---------- + input : Column + Strings instance for this operation + pattern : RegexProgram + Regex pattern + + Returns + ------- + Column + New lists column of strings + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_findall.findall( + input.view(), + pattern.c_obj.get()[0] + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_findall.py b/python/pylibcudf/pylibcudf/tests/test_string_findall.py new file mode 100644 index 00000000000..994552fa276 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_findall.py @@ -0,0 +1,23 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +import re + +import pyarrow as pa +import pylibcudf as plc +from utils import assert_column_eq + + +def test_findall(): + arr = pa.array(["bunny", "rabbit", "hare", "dog"]) + pattern = "[ab]" + result = plc.strings.findall.findall( + plc.interop.from_arrow(arr), + plc.strings.regex_program.RegexProgram.create( + pattern, plc.strings.regex_flags.RegexFlags.DEFAULT + ), + ) + pa_result = plc.interop.to_arrow(result) + expected = pa.array( + [re.findall(pattern, elem) for elem in arr.to_pylist()], + type=pa_result.type, + ) + assert_column_eq(result, expected) From dbe5528706b309a9a21f34e948c22c1c4de9caff Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Wed, 25 Sep 2024 09:14:01 -0400 Subject: [PATCH 4/6] [FEA] Add an environment variable to fail on fallback in `cudf.pandas` (#16562) This PR makes more on #14975 by adding an environment variable that fails when fallback occurs in cudf.pandas. It also adds some tests that do __not__ fallback. Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16562 --- python/cudf/cudf/pandas/fast_slow_proxy.py | 10 ++ .../cudf_pandas_tests/test_cudf_pandas.py | 16 ++- .../test_cudf_pandas_no_fallback.py | 100 ++++++++++++++++++ 3 files changed, 125 insertions(+), 1 deletion(-) create mode 100644 python/cudf/cudf_pandas_tests/test_cudf_pandas_no_fallback.py diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index bf2ee6ae624..0c1cda8810b 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -881,6 +881,12 @@ def _assert_fast_slow_eq(left, right): assert_eq(left, right) +class ProxyFallbackError(Exception): + """Raised when fallback occurs""" + + pass + + def _fast_function_call(): """ Placeholder fast function for pytest profiling purposes. @@ -957,6 +963,10 @@ def _fast_slow_function_call( f"The exception was {e}." ) except Exception as err: + if _env_get_bool("CUDF_PANDAS_FAIL_ON_FALLBACK", False): + raise ProxyFallbackError( + f"The operation failed with cuDF, the reason was {type(err)}: {err}." + ) from err with nvtx.annotate( "EXECUTE_SLOW", color=_CUDF_PANDAS_NVTX_COLORS["EXECUTE_SLOW"], diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index c4ab4b0a853..2bbed40e34e 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -26,7 +26,11 @@ from cudf.core._compat import PANDAS_GE_220 from cudf.pandas import LOADED, Profiler -from cudf.pandas.fast_slow_proxy import _Unusable, is_proxy_object +from cudf.pandas.fast_slow_proxy import ( + ProxyFallbackError, + _Unusable, + is_proxy_object, +) from cudf.testing import assert_eq if not LOADED: @@ -1738,3 +1742,13 @@ def add_one_ufunc(a): return a + 1 assert_eq(cp.asarray(add_one_ufunc(arr1)), cp.asarray(add_one_ufunc(arr2))) + + +@pytest.mark.xfail( + reason="Fallback expected because casting to object is not supported", +) +def test_fallback_raises_error(monkeypatch): + with monkeypatch.context() as monkeycontext: + monkeycontext.setenv("CUDF_PANDAS_FAIL_ON_FALLBACK", "True") + with pytest.raises(ProxyFallbackError): + pd.Series(range(2)).astype(object) diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas_no_fallback.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas_no_fallback.py new file mode 100644 index 00000000000..896256bf6d7 --- /dev/null +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas_no_fallback.py @@ -0,0 +1,100 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +import pytest + +from cudf.pandas import LOADED + +if not LOADED: + raise ImportError("These tests must be run with cudf.pandas loaded") + +import numpy as np +import pandas as pd + + +@pytest.fixture(autouse=True) +def fail_on_fallback(monkeypatch): + monkeypatch.setenv("CUDF_PANDAS_FAIL_ON_FALLBACK", "True") + + +@pytest.fixture +def dataframe(): + df = pd.DataFrame( + { + "a": [1, 1, 1, 2, 3], + "b": [1, 2, 3, 4, 5], + "c": [1.2, 1.3, 1.5, 1.7, 1.11], + } + ) + return df + + +@pytest.fixture +def series(dataframe): + return dataframe["a"] + + +@pytest.fixture +def array(series): + return series.values + + +@pytest.mark.parametrize( + "op", + [ + "sum", + "min", + "max", + "mean", + "std", + "var", + "prod", + "median", + ], +) +def test_no_fallback_in_reduction_ops(series, op): + s = series + getattr(s, op)() + + +def test_groupby(dataframe): + df = dataframe + df.groupby("a", sort=True).max() + + +def test_no_fallback_in_binops(dataframe): + df = dataframe + df + df + df - df + df * df + df**df + df[["a", "b"]] & df[["a", "b"]] + df <= df + + +def test_no_fallback_in_groupby_rolling_sum(dataframe): + df = dataframe + df.groupby("a").rolling(2).sum() + + +def test_no_fallback_in_concat(dataframe): + df = dataframe + pd.concat([df, df]) + + +def test_no_fallback_in_get_shape(dataframe): + df = dataframe + df.shape + + +def test_no_fallback_in_array_ufunc_op(array): + np.add(array, array) + + +def test_no_fallback_in_merge(dataframe): + df = dataframe + pd.merge(df * df, df + df, how="inner") + pd.merge(df * df, df + df, how="outer") + pd.merge(df * df, df + df, how="left") + pd.merge(df * df, df + df, how="right") From 75c5c83f1375213c94527eba1d0488145d7fdce7 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Wed, 25 Sep 2024 09:12:32 -0500 Subject: [PATCH 5/6] Add dask-cudf workaround for missing `rename_axis` support in cudf (#16899) See https://github.com/rapidsai/cudf/issues/16895 Closes https://github.com/rapidsai/cudf/issues/16892 Dask-expr uses `rename_axis`, which is not supported by cudf yet. This is a temporary workaround until #16895 is resolved. Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Mads R. B. Kristensen (https://github.com/madsbk) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16899 --- python/dask_cudf/dask_cudf/expr/_collection.py | 12 ++++++++++++ python/dask_cudf/dask_cudf/expr/_expr.py | 16 +++++++++++++++- python/dask_cudf/dask_cudf/tests/test_core.py | 12 ++++++++++++ 3 files changed, 39 insertions(+), 1 deletion(-) diff --git a/python/dask_cudf/dask_cudf/expr/_collection.py b/python/dask_cudf/dask_cudf/expr/_collection.py index c1dd16eac8d..907abaa2bfc 100644 --- a/python/dask_cudf/dask_cudf/expr/_collection.py +++ b/python/dask_cudf/dask_cudf/expr/_collection.py @@ -15,6 +15,7 @@ from dask import config from dask.dataframe.core import is_dataframe_like +from dask.typing import no_default import cudf @@ -90,6 +91,17 @@ def var( ) ) + def rename_axis( + self, mapper=no_default, index=no_default, columns=no_default, axis=0 + ): + from dask_cudf.expr._expr import RenameAxisCudf + + return new_collection( + RenameAxisCudf( + self, mapper=mapper, index=index, columns=columns, axis=axis + ) + ) + class DataFrame(DXDataFrame, CudfFrameBase): @classmethod diff --git a/python/dask_cudf/dask_cudf/expr/_expr.py b/python/dask_cudf/dask_cudf/expr/_expr.py index 8a2c50d3fe7..b284ab3774d 100644 --- a/python/dask_cudf/dask_cudf/expr/_expr.py +++ b/python/dask_cudf/dask_cudf/expr/_expr.py @@ -4,11 +4,12 @@ import dask_expr._shuffle as _shuffle_module from dask_expr import new_collection from dask_expr._cumulative import CumulativeBlockwise -from dask_expr._expr import Elemwise, Expr, VarColumns +from dask_expr._expr import Elemwise, Expr, RenameAxis, VarColumns from dask_expr._reductions import Reduction, Var from dask.dataframe.core import is_dataframe_like, make_meta, meta_nonempty from dask.dataframe.dispatch import is_categorical_dtype +from dask.typing import no_default import cudf @@ -17,6 +18,19 @@ ## +class RenameAxisCudf(RenameAxis): + # TODO: Remove this after rename_axis is supported in cudf + # (See: https://github.com/rapidsai/cudf/issues/16895) + @staticmethod + def operation(df, index=no_default, **kwargs): + if index != no_default: + df.index.name = index + return df + raise NotImplementedError( + "Only `index` is supported for the cudf backend" + ) + + class ToCudfBackend(Elemwise): # TODO: Inherit from ToBackend when rapids-dask-dependency # is pinned to dask>=2024.8.1 diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index 9f54aba3e13..5f0fae86691 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -1027,3 +1027,15 @@ def test_cov_corr(op, numeric_only): # (See: https://github.com/rapidsai/cudf/issues/12626) expect = getattr(df.to_pandas(), op)(numeric_only=numeric_only) dd.assert_eq(res, expect) + + +def test_rename_axis_after_join(): + df1 = cudf.DataFrame(index=["a", "b", "c"], data=dict(a=[1, 2, 3])) + df1.index.name = "test" + ddf1 = dd.from_pandas(df1, 2) + + df2 = cudf.DataFrame(index=["a", "b", "d"], data=dict(b=[1, 2, 3])) + ddf2 = dd.from_pandas(df2, 2) + result = ddf1.join(ddf2, how="outer") + expected = df1.join(df2, how="outer") + dd.assert_eq(result, expected, check_index=False) From 416042314e16c1bfc7499309ccc4a352b1f47c0a Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 25 Sep 2024 15:42:51 +0100 Subject: [PATCH 6/6] Pin polars for 24.10 and update polars test suite xfail list (#16886) For releases, since the polars release cadence is quite a lot faster than rapids, we propose to hard-pin to a known good version. In this case, 1.8.x. At the same time, remove pin in CI scripts and update list of xfailing tests in the polars test suite. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - James Lamb (https://github.com/jameslamb) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16886 --- ci/run_cudf_polars_polars_tests.sh | 2 +- ci/test_cudf_polars_polars_tests.sh | 3 +- ci/test_wheel_cudf_polars.sh | 5 +-- dependencies.yaml | 2 +- python/cudf_polars/cudf_polars/__init__.py | 8 +++-- .../cudf_polars/cudf_polars/dsl/translate.py | 8 ----- .../cudf_polars/testing/asserts.py | 14 +++++--- .../cudf_polars/cudf_polars/testing/plugin.py | 4 +++ .../cudf_polars/cudf_polars/utils/versions.py | 16 ++++----- python/cudf_polars/pyproject.toml | 2 +- python/cudf_polars/tests/test_groupby.py | 6 +++- .../cudf_polars/tests/testing/test_asserts.py | 35 ++++++++----------- 12 files changed, 51 insertions(+), 54 deletions(-) diff --git a/ci/run_cudf_polars_polars_tests.sh b/ci/run_cudf_polars_polars_tests.sh index 52a827af94c..95f78f17f2f 100755 --- a/ci/run_cudf_polars_polars_tests.sh +++ b/ci/run_cudf_polars_polars_tests.sh @@ -21,7 +21,7 @@ python -m pytest \ -m "" \ -p cudf_polars.testing.plugin \ -v \ - --tb=short \ + --tb=native \ ${DESELECTED_TESTS} \ "$@" \ py-polars/tests diff --git a/ci/test_cudf_polars_polars_tests.sh b/ci/test_cudf_polars_polars_tests.sh index 6c728a9537f..bfc8fd37565 100755 --- a/ci/test_cudf_polars_polars_tests.sh +++ b/ci/test_cudf_polars_polars_tests.sh @@ -33,8 +33,7 @@ python -m pip install ./local-pylibcudf-dep/pylibcudf*.whl rapids-logger "Install cudf_polars" python -m pip install $(echo ./dist/cudf_polars*.whl) -# TAG=$(python -c 'import polars; print(f"py-{polars.__version__}")') -TAG="py-1.7.0" +TAG=$(python -c 'import polars; print(f"py-{polars.__version__}")') rapids-logger "Clone polars to ${TAG}" git clone https://github.com/pola-rs/polars.git --branch ${TAG} --depth 1 diff --git a/ci/test_wheel_cudf_polars.sh b/ci/test_wheel_cudf_polars.sh index b4509bba02e..3116bd820e9 100755 --- a/ci/test_wheel_cudf_polars.sh +++ b/ci/test_wheel_cudf_polars.sh @@ -39,7 +39,7 @@ if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then | tee ./constraints.txt fi -# echo to expand wildcard before adding `[extra]` requires for pip +# echo to expand wildcard before adding `[test]` requires for pip python -m pip install \ -v \ --constraint ./constraints.txt \ @@ -47,9 +47,6 @@ python -m pip install \ "$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \ "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" -rapids-logger "Pin to 1.7.0 Temporarily" -python -m pip install polars==1.7.0 - rapids-logger "Run cudf_polars tests" function set_exitcode() diff --git a/dependencies.yaml b/dependencies.yaml index 7a9c9b8486d..339adbc5ff9 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -663,7 +663,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - polars>=1.6 + - polars>=1.8,<1.9 run_dask_cudf: common: - output_types: [conda, requirements, pyproject] diff --git a/python/cudf_polars/cudf_polars/__init__.py b/python/cudf_polars/cudf_polars/__init__.py index c1317e8f467..66c15f694ee 100644 --- a/python/cudf_polars/cudf_polars/__init__.py +++ b/python/cudf_polars/cudf_polars/__init__.py @@ -10,13 +10,15 @@ from __future__ import annotations -# Check we have a supported polars version -import cudf_polars.utils.versions as v from cudf_polars._version import __git_commit__, __version__ from cudf_polars.callback import execute_with_cudf from cudf_polars.dsl.translate import translate_ir -del v +# Check we have a supported polars version +from cudf_polars.utils.versions import _ensure_polars_version + +_ensure_polars_version() +del _ensure_polars_version __all__: list[str] = [ "execute_with_cudf", diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 45881afe0c8..a0291037f01 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -93,14 +93,6 @@ def _( cloud_options = None else: reader_options, cloud_options = map(json.loads, options) - if ( - typ == "csv" - and visitor.version()[0] == 1 - and reader_options["schema"] is not None - ): - reader_options["schema"] = { - "fields": reader_options["schema"]["inner"] - } # pragma: no cover; CI tests 1.7 file_options = node.file_options with_columns = file_options.with_columns n_rows = file_options.n_rows diff --git a/python/cudf_polars/cudf_polars/testing/asserts.py b/python/cudf_polars/cudf_polars/testing/asserts.py index a79d45899cd..7b6f3848fc4 100644 --- a/python/cudf_polars/cudf_polars/testing/asserts.py +++ b/python/cudf_polars/cudf_polars/testing/asserts.py @@ -164,9 +164,11 @@ def assert_collect_raises( cudf-polars. Useful for controlling optimization settings. polars_except - Exception or exceptions polars CPU is expected to raise. + Exception or exceptions polars CPU is expected to raise. If + None, CPU is not expected to raise an exception. cudf_except - Exception or exceptions polars GPU is expected to raise. + Exception or exceptions polars GPU is expected to raise. If + None, GPU is not expected to raise an exception. collect_kwargs Common keyword arguments to pass to collect for both polars CPU and cudf-polars. @@ -203,7 +205,8 @@ def assert_collect_raises( f"CPU execution RAISED {type(e)}, EXPECTED {polars_except}" ) from e else: - raise AssertionError(f"CPU execution DID NOT RAISE {polars_except}") + if polars_except != (): + raise AssertionError(f"CPU execution DID NOT RAISE {polars_except}") engine = GPUEngine(raise_on_fail=True) try: @@ -212,7 +215,8 @@ def assert_collect_raises( pass except Exception as e: raise AssertionError( - f"GPU execution RAISED {type(e)}, EXPECTED {polars_except}" + f"GPU execution RAISED {type(e)}, EXPECTED {cudf_except}" ) from e else: - raise AssertionError(f"GPU execution DID NOT RAISE {polars_except}") + if cudf_except != (): + raise AssertionError(f"GPU execution DID NOT RAISE {cudf_except}") diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index c40d59e6d33..05b76d76808 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -49,11 +49,15 @@ def pytest_configure(config: pytest.Config): "tests/unit/io/test_csv.py::test_read_csv_only_loads_selected_columns": "Memory usage won't be correct due to GPU", "tests/unit/io/test_lazy_count_star.py::test_count_compressed_csv_18057": "Need to determine if file is compressed", "tests/unit/io/test_lazy_csv.py::test_scan_csv_slice_offset_zero": "Integer overflow in sliced read", + "tests/unit/io/test_lazy_parquet.py::test_dsl2ir_cached_metadata[False]": "cudf-polars doesn't use metadata read by rust preprocessing", "tests/unit/io/test_lazy_parquet.py::test_parquet_is_in_statistics": "Debug output on stderr doesn't match", "tests/unit/io/test_lazy_parquet.py::test_parquet_statistics": "Debug output on stderr doesn't match", "tests/unit/io/test_lazy_parquet.py::test_parquet_different_schema[False]": "Needs cudf#16394", "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_mismatch_panic_17067[False]": "Needs cudf#16394", "tests/unit/io/test_lazy_parquet.py::test_parquet_slice_pushdown_non_zero_offset[False]": "Thrift data not handled correctly/slice pushdown wrong?", + "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read[False]": "Incomplete handling of projected reads with mismatching schemas, cudf#16394", + "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read_dtype_mismatch[False]": "Different exception raised, but correctly raises an exception", + "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read_missing_cols_from_first[False]": "Different exception raised, but correctly raises an exception", "tests/unit/io/test_parquet.py::test_read_parquet_only_loads_selected_columns_15098": "Memory usage won't be correct due to GPU", "tests/unit/io/test_scan.py::test_scan[single-csv-async]": "Debug output on stderr doesn't match", "tests/unit/io/test_scan.py::test_scan_with_limit[single-csv-async]": "Debug output on stderr doesn't match", diff --git a/python/cudf_polars/cudf_polars/utils/versions.py b/python/cudf_polars/cudf_polars/utils/versions.py index 2e6efde968c..4a7ad6b3cf2 100644 --- a/python/cudf_polars/cudf_polars/utils/versions.py +++ b/python/cudf_polars/cudf_polars/utils/versions.py @@ -12,11 +12,11 @@ POLARS_VERSION = parse(__version__) -POLARS_VERSION_GE_16 = POLARS_VERSION >= parse("1.6") -POLARS_VERSION_GT_16 = POLARS_VERSION > parse("1.6") -POLARS_VERSION_LT_16 = POLARS_VERSION < parse("1.6") - -if POLARS_VERSION_LT_16: - raise ImportError( - "cudf_polars requires py-polars v1.6 or greater." - ) # pragma: no cover +POLARS_VERSION_LT_18 = POLARS_VERSION < parse("1.8") + + +def _ensure_polars_version(): + if POLARS_VERSION_LT_18: + raise ImportError( + "cudf_polars requires py-polars v1.8 or greater." + ) # pragma: no cover diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index 857a8c14b2f..268364f72a7 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -19,7 +19,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ - "polars>=1.6", + "polars>=1.8,<1.9", "pylibcudf==24.10.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py index 6f996e0e0ec..74bf8b9e4e2 100644 --- a/python/cudf_polars/tests/test_groupby.py +++ b/python/cudf_polars/tests/test_groupby.py @@ -168,7 +168,11 @@ def test_groupby_nan_minmax_raises(op): "expr", [ pl.lit(1).alias("value"), - pl.lit([[4, 5, 6]]).alias("value"), + pytest.param( + pl.lit([[4, 5, 6]]).alias("value"), + marks=pytest.mark.xfail(reason="Need to expose OtherScalar in rust IR"), + ), + pl.Series("value", [[4, 5, 6]], dtype=pl.List(pl.Int32)), pl.col("float") * (1 - pl.col("int")), [pl.lit(2).alias("value"), pl.col("float") * 2], ], diff --git a/python/cudf_polars/tests/testing/test_asserts.py b/python/cudf_polars/tests/testing/test_asserts.py index 8e7f1a09d9b..ace1c6b8648 100644 --- a/python/cudf_polars/tests/testing/test_asserts.py +++ b/python/cudf_polars/tests/testing/test_asserts.py @@ -7,8 +7,6 @@ import polars as pl -from cudf_polars.containers import DataFrame -from cudf_polars.dsl.ir import Select from cudf_polars.testing.asserts import ( assert_collect_raises, assert_gpu_result_equal, @@ -38,14 +36,24 @@ class E(Exception): assert_ir_translation_raises(unsupported, E) -def test_collect_assert_raises(monkeypatch): +def test_collect_assert_raises(): df = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) - with pytest.raises(AssertionError): - # This should raise, because polars CPU can run this query + with pytest.raises(AssertionError, match="CPU execution DID NOT RAISE"): + # This should raise, because polars CPU can run this query, + # but we expect an error. assert_collect_raises( df, polars_except=pl.exceptions.InvalidOperationError, + cudf_except=(), + ) + + with pytest.raises(AssertionError, match="GPU execution DID NOT RAISE"): + # This should raise, because polars GPU can run this query, + # but we expect an error. + assert_collect_raises( + df, + polars_except=(), cudf_except=pl.exceptions.InvalidOperationError, ) @@ -60,7 +68,7 @@ def test_collect_assert_raises(monkeypatch): cudf_except=pl.exceptions.InvalidOperationError, ) - with pytest.raises(AssertionError): + with pytest.raises(AssertionError, match="GPU execution RAISED"): # This should raise because the expected GPU error is wrong assert_collect_raises( q, @@ -68,23 +76,10 @@ def test_collect_assert_raises(monkeypatch): cudf_except=NotImplementedError, ) - with pytest.raises(AssertionError): + with pytest.raises(AssertionError, match="CPU execution RAISED"): # This should raise because the expected CPU error is wrong assert_collect_raises( q, polars_except=NotImplementedError, cudf_except=pl.exceptions.InvalidOperationError, ) - - with monkeypatch.context() as m: - m.setattr(Select, "evaluate", lambda self, cache: DataFrame([])) - # This query should fail, but we monkeypatch a bad - # implementation of Select which "succeeds" to check that our - # assertion notices this case. - q = df.select(pl.col("a") + pl.Series([1, 2])) - with pytest.raises(AssertionError): - assert_collect_raises( - q, - polars_except=pl.exceptions.ComputeError, - cudf_except=pl.exceptions.ComputeError, - )