From d10b8e4c9b437377cb6d231873e8f0fe9f8dc817 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 17 May 2024 11:55:21 -0500 Subject: [PATCH] Handle mixed-like homogeneous types in `isin` (#15771) Fixes: #15768 A host array can have `object` dtype yet contain only values of a single homogeneous type. Column constructors still cannot support this because `cudf` doesn't have a true `object` type, so this PR introduces a workaround for the problem in the `isin` API. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Richard (Rick) Zamora (https://github.com/rjzamora) - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/15771 --- python/cudf/cudf/core/column/numerical.py | 29 ++++++++++++++++--- python/dask_cudf/dask_cudf/tests/test_core.py | 22 ++++++++++++++ 2 files changed, 47 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 12c27ed0bc1..bab862f775f 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -38,6 +38,7 @@ ) from cudf.core.dtypes import CategoricalDtype from cudf.core.mixins import BinaryOperand +from cudf.errors import MixedTypeError from cudf.utils.dtypes import ( min_column_type, min_signed_type, @@ -404,10 +405,30 @@ def _process_values_for_isin( self, values: Sequence ) -> Tuple[ColumnBase, ColumnBase]: lhs = cast("cudf.core.column.ColumnBase", self) - rhs = as_column(values, nan_as_null=False) - - if isinstance(rhs, NumericalColumn): - rhs = rhs.astype(dtype=self.dtype) + try: + rhs = as_column(values, nan_as_null=False) + except (MixedTypeError, TypeError) as e: + # There is a corner where `values` can be of `object` dtype + # but have values of homogeneous type.
+ inferred_dtype = cudf.api.types.infer_dtype(values) + if ( + self.dtype.kind in {"i", "u"} and inferred_dtype == "integer" + ) or ( + self.dtype.kind == "f" + and inferred_dtype in {"floating", "integer"} + ): + rhs = as_column(values, nan_as_null=False, dtype=self.dtype) + elif self.dtype.kind == "f" and inferred_dtype == "integer": + rhs = as_column(values, nan_as_null=False, dtype="int") + elif ( + self.dtype.kind in {"i", "u"} and inferred_dtype == "floating" + ): + rhs = as_column(values, nan_as_null=False, dtype="float") + else: + raise e + else: + if isinstance(rhs, NumericalColumn): + rhs = rhs.astype(dtype=self.dtype) if lhs.null_count == len(lhs): lhs = lhs.astype(rhs.dtype) diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index 981c2c369f1..18a9e3b496f 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -971,3 +971,25 @@ def func(x): # NOTE: The calculation here doesn't need to make sense. # We just need to make sure we get the right type back. assert type(result) == type(expect) + + +@pytest.mark.parametrize("data", [[1, 2, 3], [1.1, 2.3, 4.5]]) +@pytest.mark.parametrize("values", [[1, 5], [1.1, 2.4, 2.3]]) +def test_series_isin(data, values): + ser = cudf.Series(data) + pddf = dd.from_pandas(ser.to_pandas(), 1) + ddf = dask_cudf.from_cudf(ser, 1) + + actual = ddf.isin(values) + expected = pddf.isin(values) + + dd.assert_eq(actual, expected) + + +def test_series_isin_error(): + ser = cudf.Series([1, 2, 3]) + ddf = dask_cudf.from_cudf(ser, 1) + with pytest.raises(TypeError): + ser.isin([1, 5, "a"]) + with pytest.raises(TypeError): + ddf.isin([1, 5, "a"]).compute()