Skip to content

Commit

Permalink
Merge branch 'branch-24.10' into cudf-polars-build-script
Browse files Browse the repository at this point in the history
  • Loading branch information
galipremsagar authored Sep 25, 2024
2 parents c62b970 + 4160423 commit 5813e93
Show file tree
Hide file tree
Showing 29 changed files with 373 additions and 121 deletions.
5 changes: 0 additions & 5 deletions build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -240,11 +240,6 @@ if hasArg --pydevelop; then
PYTHON_ARGS_FOR_INSTALL="${PYTHON_ARGS_FOR_INSTALL} -e"
fi

# Append `-DFIND_CUDF_CPP=ON` to EXTRA_CMAKE_ARGS unless a user specified the option.
if [[ "${EXTRA_CMAKE_ARGS}" != *"DFIND_CUDF_CPP"* ]]; then
EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DFIND_CUDF_CPP=ON"
fi

if hasArg --disable_large_strings; then
BUILD_DISABLE_LARGE_STRINGS="ON"
fi
Expand Down
2 changes: 1 addition & 1 deletion ci/run_cudf_polars_polars_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ python -m pytest \
-m "" \
-p cudf_polars.testing.plugin \
-v \
--tb=short \
--tb=native \
${DESELECTED_TESTS} \
"$@" \
py-polars/tests
3 changes: 1 addition & 2 deletions ci/test_cudf_polars_polars_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,7 @@ python -m pip install ./local-pylibcudf-dep/pylibcudf*.whl
rapids-logger "Install cudf_polars"
python -m pip install $(echo ./dist/cudf_polars*.whl)

# TAG=$(python -c 'import polars; print(f"py-{polars.__version__}")')
TAG="py-1.7.0"
TAG=$(python -c 'import polars; print(f"py-{polars.__version__}")')
rapids-logger "Clone polars to ${TAG}"
git clone https://github.com/pola-rs/polars.git --branch ${TAG} --depth 1

Expand Down
5 changes: 1 addition & 4 deletions ci/test_wheel_cudf_polars.sh
Original file line number Diff line number Diff line change
Expand Up @@ -39,17 +39,14 @@ if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then
| tee ./constraints.txt
fi

# echo to expand wildcard before adding `[extra]` requires for pip
# echo to expand wildcard before adding `[test]` requires for pip
python -m pip install \
-v \
--constraint ./constraints.txt \
"$(echo ./dist/cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \
"$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \
"$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)"

rapids-logger "Pin to 1.7.0 Temporarily"
python -m pip install polars==1.7.0

rapids-logger "Run cudf_polars tests"

function set_exitcode()
Expand Down
2 changes: 1 addition & 1 deletion dependencies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -663,7 +663,7 @@ dependencies:
common:
- output_types: [conda, requirements, pyproject]
packages:
- polars>=1.6
- polars>=1.8,<1.9
run_dask_cudf:
common:
- output_types: [conda, requirements, pyproject]
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
=======
findall
=======

.. automodule:: pylibcudf.strings.findall
:members:
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ strings
contains
extract
find
findall
regex_flags
regex_program
repeat
Expand Down
35 changes: 10 additions & 25 deletions python/cudf/cudf/_lib/strings/findall.pyx
Original file line number Diff line number Diff line change
@@ -1,40 +1,25 @@
# Copyright (c) 2019-2024, NVIDIA CORPORATION.

from cython.operator cimport dereference
from libc.stdint cimport uint32_t
from libcpp.memory cimport unique_ptr
from libcpp.string cimport string
from libcpp.utility cimport move

from cudf.core.buffer import acquire_spill_lock

from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.strings.findall cimport findall as cpp_findall
from pylibcudf.libcudf.strings.regex_flags cimport regex_flags
from pylibcudf.libcudf.strings.regex_program cimport regex_program

from cudf._lib.column cimport Column

import pylibcudf as plc


@acquire_spill_lock()
def findall(Column source_strings, object pattern, uint32_t flags):
"""
Returns data with all non-overlapping matches of `pattern`
in each string of `source_strings` as a lists column.
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

cdef string pattern_string = <string>str(pattern).encode()
cdef regex_flags c_flags = <regex_flags>flags
cdef unique_ptr[regex_program] c_prog

with nogil:
c_prog = move(regex_program.create(pattern_string, c_flags))
c_result = move(cpp_findall(
source_view,
dereference(c_prog)
))

return Column.from_unique_ptr(move(c_result))
prog = plc.strings.regex_program.RegexProgram.create(
str(pattern), flags
)
plc_result = plc.strings.findall.findall(
source_strings.to_pylibcudf(mode="read"),
prog,
)
return Column.from_pylibcudf(plc_result)
10 changes: 10 additions & 0 deletions python/cudf/cudf/pandas/fast_slow_proxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -881,6 +881,12 @@ def _assert_fast_slow_eq(left, right):
assert_eq(left, right)


class ProxyFallbackError(Exception):
    """Signal that a fast (cuDF) operation failed and a fallback was needed.

    Raised in place of running the slow path when the
    ``CUDF_PANDAS_FAIL_ON_FALLBACK`` environment variable is enabled; the
    original cuDF exception is chained as the cause.
    """


def _fast_function_call():
"""
Placeholder fast function for pytest profiling purposes.
Expand Down Expand Up @@ -957,6 +963,10 @@ def _fast_slow_function_call(
f"The exception was {e}."
)
except Exception as err:
if _env_get_bool("CUDF_PANDAS_FAIL_ON_FALLBACK", False):
raise ProxyFallbackError(
f"The operation failed with cuDF, the reason was {type(err)}: {err}."
) from err
with nvtx.annotate(
"EXECUTE_SLOW",
color=_CUDF_PANDAS_NVTX_COLORS["EXECUTE_SLOW"],
Expand Down
16 changes: 15 additions & 1 deletion python/cudf/cudf_pandas_tests/test_cudf_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,11 @@

from cudf.core._compat import PANDAS_GE_220
from cudf.pandas import LOADED, Profiler
from cudf.pandas.fast_slow_proxy import _Unusable, is_proxy_object
from cudf.pandas.fast_slow_proxy import (
ProxyFallbackError,
_Unusable,
is_proxy_object,
)
from cudf.testing import assert_eq

if not LOADED:
Expand Down Expand Up @@ -1738,3 +1742,13 @@ def add_one_ufunc(a):
return a + 1

assert_eq(cp.asarray(add_one_ufunc(arr1)), cp.asarray(add_one_ufunc(arr2)))


@pytest.mark.xfail(
    reason="Fallback expected because casting to object is not supported",
)
def test_fallback_raises_error(monkeypatch):
    """Casting to ``object`` with fail-on-fallback enabled should raise.

    The environment variable is only set inside the monkeypatch context,
    so it is restored as soon as the ``with`` block exits.
    """
    with monkeypatch.context() as patched_env:
        patched_env.setenv("CUDF_PANDAS_FAIL_ON_FALLBACK", "True")
        with pytest.raises(ProxyFallbackError):
            ints = pd.Series(range(2))
            ints.astype(object)
100 changes: 100 additions & 0 deletions python/cudf/cudf_pandas_tests/test_cudf_pandas_no_fallback.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import pytest

from cudf.pandas import LOADED

if not LOADED:
raise ImportError("These tests must be run with cudf.pandas loaded")

import numpy as np
import pandas as pd


@pytest.fixture(autouse=True)
def fail_on_fallback(monkeypatch):
    """Set ``CUDF_PANDAS_FAIL_ON_FALLBACK`` for every test in this module."""
    env_var = "CUDF_PANDAS_FAIL_ON_FALLBACK"
    monkeypatch.setenv(env_var, "True")


@pytest.fixture
def dataframe():
    """Return a small five-row frame with two integer columns and one float column."""
    columns = {
        "a": [1, 1, 1, 2, 3],
        "b": [1, 2, 3, 4, 5],
        "c": [1.2, 1.3, 1.5, 1.7, 1.11],
    }
    return pd.DataFrame(columns)


@pytest.fixture
def series(dataframe):
    """Return column ``"a"`` of the ``dataframe`` fixture."""
    column_a = dataframe["a"]
    return column_a


@pytest.fixture
def array(series):
    """Return the ``.values`` of the ``series`` fixture."""
    values = series.values
    return values


@pytest.mark.parametrize(
    "op",
    ["sum", "min", "max", "mean", "std", "var", "prod", "median"],
)
def test_no_fallback_in_reduction_ops(series, op):
    """Call the reduction named by ``op`` on the series; it must not raise."""
    reduction = getattr(series, op)
    reduction()


def test_groupby(dataframe):
    """Run a sorted group-by ``max`` on column ``"a"``; it must not raise."""
    grouped = dataframe.groupby("a", sort=True)
    grouped.max()


def test_no_fallback_in_binops(dataframe):
    """Exercise arithmetic, bitwise and comparison operators on the frame.

    The results are discarded; the test only checks that no operation raises.
    """
    dataframe + dataframe
    dataframe - dataframe
    dataframe * dataframe
    dataframe**dataframe
    # Select the integer columns twice so each operand is its own selection.
    lhs = dataframe[["a", "b"]]
    rhs = dataframe[["a", "b"]]
    lhs & rhs
    dataframe <= dataframe


def test_no_fallback_in_groupby_rolling_sum(dataframe):
    """Run a window-2 rolling sum within groups of ``"a"``; it must not raise."""
    grouped = dataframe.groupby("a")
    windowed = grouped.rolling(2)
    windowed.sum()


def test_no_fallback_in_concat(dataframe):
    """Concatenate the frame with itself; it must not raise."""
    frames = [dataframe, dataframe]
    pd.concat(frames)


def test_no_fallback_in_get_shape(dataframe):
    """Read the frame's ``shape`` attribute; it must not raise."""
    _ = dataframe.shape


def test_no_fallback_in_array_ufunc_op(array):
    """Apply ``np.add`` to the array with itself; it must not raise."""
    operands = (array, array)
    np.add(*operands)


def test_no_fallback_in_merge(dataframe):
    """Merge derived frames with each join type; none may raise."""
    # Operands are rebuilt for every join, in the same order as before.
    for join_kind in ("inner", "outer", "left", "right"):
        pd.merge(
            dataframe * dataframe,
            dataframe + dataframe,
            how=join_kind,
        )
8 changes: 5 additions & 3 deletions python/cudf_polars/cudf_polars/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,15 @@

from __future__ import annotations

# Check we have a supported polars version
import cudf_polars.utils.versions as v
from cudf_polars._version import __git_commit__, __version__
from cudf_polars.callback import execute_with_cudf
from cudf_polars.dsl.translate import translate_ir

del v
# Check we have a supported polars version
from cudf_polars.utils.versions import _ensure_polars_version

_ensure_polars_version()
del _ensure_polars_version

__all__: list[str] = [
"execute_with_cudf",
Expand Down
8 changes: 0 additions & 8 deletions python/cudf_polars/cudf_polars/dsl/translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,14 +93,6 @@ def _(
cloud_options = None
else:
reader_options, cloud_options = map(json.loads, options)
if (
typ == "csv"
and visitor.version()[0] == 1
and reader_options["schema"] is not None
):
reader_options["schema"] = {
"fields": reader_options["schema"]["inner"]
} # pragma: no cover; CI tests 1.7
file_options = node.file_options
with_columns = file_options.with_columns
n_rows = file_options.n_rows
Expand Down
14 changes: 9 additions & 5 deletions python/cudf_polars/cudf_polars/testing/asserts.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,9 +164,11 @@ def assert_collect_raises(
cudf-polars.
Useful for controlling optimization settings.
polars_except
Exception or exceptions polars CPU is expected to raise.
Exception or exceptions polars CPU is expected to raise. If
None, CPU is not expected to raise an exception.
cudf_except
Exception or exceptions polars GPU is expected to raise.
Exception or exceptions polars GPU is expected to raise. If
None, GPU is not expected to raise an exception.
collect_kwargs
Common keyword arguments to pass to collect for both polars CPU and
cudf-polars.
Expand Down Expand Up @@ -203,7 +205,8 @@ def assert_collect_raises(
f"CPU execution RAISED {type(e)}, EXPECTED {polars_except}"
) from e
else:
raise AssertionError(f"CPU execution DID NOT RAISE {polars_except}")
if polars_except != ():
raise AssertionError(f"CPU execution DID NOT RAISE {polars_except}")

engine = GPUEngine(raise_on_fail=True)
try:
Expand All @@ -212,7 +215,8 @@ def assert_collect_raises(
pass
except Exception as e:
raise AssertionError(
f"GPU execution RAISED {type(e)}, EXPECTED {polars_except}"
f"GPU execution RAISED {type(e)}, EXPECTED {cudf_except}"
) from e
else:
raise AssertionError(f"GPU execution DID NOT RAISE {polars_except}")
if cudf_except != ():
raise AssertionError(f"GPU execution DID NOT RAISE {cudf_except}")
4 changes: 4 additions & 0 deletions python/cudf_polars/cudf_polars/testing/plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,15 @@ def pytest_configure(config: pytest.Config):
"tests/unit/io/test_csv.py::test_read_csv_only_loads_selected_columns": "Memory usage won't be correct due to GPU",
"tests/unit/io/test_lazy_count_star.py::test_count_compressed_csv_18057": "Need to determine if file is compressed",
"tests/unit/io/test_lazy_csv.py::test_scan_csv_slice_offset_zero": "Integer overflow in sliced read",
"tests/unit/io/test_lazy_parquet.py::test_dsl2ir_cached_metadata[False]": "cudf-polars doesn't use metadata read by rust preprocessing",
"tests/unit/io/test_lazy_parquet.py::test_parquet_is_in_statistics": "Debug output on stderr doesn't match",
"tests/unit/io/test_lazy_parquet.py::test_parquet_statistics": "Debug output on stderr doesn't match",
"tests/unit/io/test_lazy_parquet.py::test_parquet_different_schema[False]": "Needs cudf#16394",
"tests/unit/io/test_lazy_parquet.py::test_parquet_schema_mismatch_panic_17067[False]": "Needs cudf#16394",
"tests/unit/io/test_lazy_parquet.py::test_parquet_slice_pushdown_non_zero_offset[False]": "Thrift data not handled correctly/slice pushdown wrong?",
"tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read[False]": "Incomplete handling of projected reads with mismatching schemas, cudf#16394",
"tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read_dtype_mismatch[False]": "Different exception raised, but correctly raises an exception",
"tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read_missing_cols_from_first[False]": "Different exception raised, but correctly raises an exception",
"tests/unit/io/test_parquet.py::test_read_parquet_only_loads_selected_columns_15098": "Memory usage won't be correct due to GPU",
"tests/unit/io/test_scan.py::test_scan[single-csv-async]": "Debug output on stderr doesn't match",
"tests/unit/io/test_scan.py::test_scan_with_limit[single-csv-async]": "Debug output on stderr doesn't match",
Expand Down
16 changes: 8 additions & 8 deletions python/cudf_polars/cudf_polars/utils/versions.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@

POLARS_VERSION = parse(__version__)

POLARS_VERSION_GE_16 = POLARS_VERSION >= parse("1.6")
POLARS_VERSION_GT_16 = POLARS_VERSION > parse("1.6")
POLARS_VERSION_LT_16 = POLARS_VERSION < parse("1.6")

if POLARS_VERSION_LT_16:
raise ImportError(
"cudf_polars requires py-polars v1.6 or greater."
) # pragma: no cover
POLARS_VERSION_LT_18 = POLARS_VERSION < parse("1.8")


def _ensure_polars_version():
    """Raise ``ImportError`` when the installed polars is older than 1.8."""
    if not POLARS_VERSION_LT_18:
        return
    raise ImportError(
        "cudf_polars requires py-polars v1.8 or greater."
    )  # pragma: no cover
2 changes: 1 addition & 1 deletion python/cudf_polars/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ authors = [
license = { text = "Apache 2.0" }
requires-python = ">=3.10"
dependencies = [
"polars>=1.6",
"polars>=1.8,<1.9",
"pylibcudf==24.10.*,>=0.0.0a0",
] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
classifiers = [
Expand Down
Loading

0 comments on commit 5813e93

Please sign in to comment.