Clean up special casing in as_column for non-typed input (#15276)
Redo of #14636

Clean up special casing for non-typed inputs to essentially do:

```
try:
    arbitrary = pa.array(arbitrary)
except:
    arbitrary = pd.Series(arbitrary)
return as_column(arbitrary)
```
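
For reference, a minimal runnable version of that sketch (a standalone illustration, not the actual `as_column` code; the helper name and the narrowed exception tuple are assumptions based on the diff below):

```
import pandas as pd
import pyarrow as pa


def coerce_untyped(arbitrary):
    # Hypothetical helper mirroring the pattern above: let pyarrow infer a
    # type first, and fall back to pandas type inference when pyarrow
    # rejects the input.
    try:
        return pa.array(arbitrary)
    except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError):
        return pd.Series(arbitrary)


coerce_untyped([1, 2, None])  # -> pyarrow int64 array with one null
coerce_untyped([1, "a"])      # -> pandas object Series; pyarrow rejects the mix
```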

Additionally, this change matches pandas behavior when parsing string data with a `datetime64` dtype, mirroring the pandas 2.2 behavior (fail if the resolution of the dtype does not match the string data).
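
A rough illustration of that datetime behavior (the inputs and the concrete exception are assumptions, not taken from this PR's tests):

```
import cudf

# Whole-second strings fit a second-resolution dtype, so this is expected to parse.
cudf.Series(["2001-01-01 12:00:00"], dtype="datetime64[s]")

# A string carrying sub-second digits is expected to be rejected for
# "datetime64[s]" rather than silently truncated, mirroring pandas 2.2.
try:
    cudf.Series(["2001-01-01 12:00:00.123"], dtype="datetime64[s]")
except Exception as exc:  # the concrete exception type is an assumption
    print(f"rejected: {exc}")
```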

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #15276
mroeschke authored Apr 17, 2024
1 parent 041eaa4 commit e928c4a
Showing 7 changed files with 215 additions and 230 deletions.
246 changes: 83 additions & 163 deletions python/cudf/cudf/core/column/column.py
@@ -4,7 +4,6 @@

import builtins
import pickle
import warnings
from collections import abc
from functools import cached_property
from itertools import chain
@@ -56,7 +55,6 @@
_is_pandas_nullable_extension_dtype,
infer_dtype,
is_bool_dtype,
is_datetime64_dtype,
is_dtype_equal,
is_integer_dtype,
is_scalar,
@@ -82,12 +80,13 @@
from cudf.utils.dtypes import (
_maybe_convert_to_default_type,
cudf_dtype_from_pa_type,
cudf_dtype_to_pa_type,
find_common_type,
get_time_unit,
is_column_like,
is_mixed_with_object_dtype,
min_scalar_type,
min_unsigned_type,
np_to_pa_dtype,
)
from cudf.utils.utils import _array_ufunc, mask_dtype

@@ -1923,7 +1922,7 @@ def as_column(
# pandas arrays define __arrow_array__ for better
# pyarrow.array conversion
arbitrary = arbitrary.array
data = as_column(
return as_column(
pa.array(arbitrary, from_pandas=True),
nan_as_null=nan_as_null,
dtype=dtype,
@@ -1932,7 +1931,7 @@
elif isinstance(
arbitrary.dtype, (pd.CategoricalDtype, pd.IntervalDtype)
):
data = as_column(
return as_column(
pa.array(arbitrary, from_pandas=True),
nan_as_null=nan_as_null,
dtype=dtype,
@@ -1956,7 +1955,7 @@
arbitrary = np.asarray(arbitrary)
else:
arbitrary = cupy.asarray(arbitrary)
data = as_column(
return as_column(
arbitrary, nan_as_null=nan_as_null, dtype=dtype, length=length
)
elif arbitrary.dtype.kind == "O":
@@ -1988,7 +1987,7 @@
arbitrary,
from_pandas=True,
)
data = as_column(
return as_column(
pyarrow_array,
dtype=dtype,
nan_as_null=nan_as_null,
@@ -1999,9 +1998,6 @@
f"{type(arbitrary).__name__} with "
f"{type(arbitrary.dtype).__name__} is not supported."
)
if dtype is not None:
data = data.astype(dtype)

elif is_scalar(arbitrary) and not isinstance(arbitrary, memoryview):
if length is None:
length = 1
@@ -2094,6 +2090,13 @@ def as_column(
return as_column(
np.asarray(view), dtype=dtype, nan_as_null=nan_as_null
)
elif hasattr(arbitrary, "__array__"):
# e.g. test_cuda_array_interface_pytorch
try:
arbitrary = cupy.asarray(arbitrary)
except (ValueError, TypeError):
arbitrary = np.asarray(arbitrary)
return as_column(arbitrary, dtype=dtype, nan_as_null=nan_as_null)
# Start of arbitrary that's not handled above but dtype provided
elif isinstance(dtype, pd.DatetimeTZDtype):
raise NotImplementedError(
@@ -2126,9 +2129,20 @@ def as_column(
pd.IntervalDtype,
cudf.IntervalDtype,
),
) or dtype in {"category", "interval", "str", str, np.str_}:
) or dtype in {
"category",
"interval",
"str",
str,
np.str_,
object,
np.dtype(object),
}:
if isinstance(dtype, (cudf.CategoricalDtype, cudf.IntervalDtype)):
dtype = dtype.to_pandas()
elif dtype == object:
# Unlike pandas, interpret object as "str" instead of "python object"
dtype = "str"
ser = pd.Series(arbitrary, dtype=dtype)
return as_column(ser, nan_as_null=nan_as_null)
elif isinstance(dtype, (cudf.StructDtype, cudf.ListDtype)):
@@ -2140,166 +2154,72 @@ def as_column(
return cudf.core.column.ListColumn.from_sequences(arbitrary)
raise
return as_column(data, nan_as_null=nan_as_null)
else:
pa_type = None
elif not isinstance(arbitrary, (abc.Iterable, abc.Sequence)):
# TODO: This validation should probably be done earlier?
raise TypeError(
f"{type(arbitrary).__name__} must be an iterable or sequence."
)
from_pandas = nan_as_null is None or nan_as_null
if dtype is not None:
dtype = cudf.dtype(dtype)
try:
if dtype is not None:
if is_datetime64_dtype(dtype):
# Error checking only, actual construction happens
# below.
pa_array = pa.array(arbitrary)
if (
isinstance(pa_array.type, pa.TimestampType)
and pa_array.type.tz is not None
):
raise NotImplementedError(
"cuDF does not yet support timezone-aware "
"datetimes"
)
if is_bool_dtype(dtype):
# Need this special case handling for bool dtypes,
# since 'boolean' & 'pd.BooleanDtype' are not
# understood by np.dtype below.
dtype = "bool"
np_dtype = np.dtype(dtype)
if np_dtype.kind in {"m", "M"}:
unit = np.datetime_data(np_dtype)[0]
if unit not in {"ns", "us", "ms", "s", "D"}:
raise NotImplementedError(
f"{dtype=} is not supported."
)
pa_type = np_to_pa_dtype(np_dtype)
else:
# By default cudf constructs a 64-bit column. Setting
# the `default_*_bitwidth` to 32 will result in a 32-bit
# column being created.
if (
cudf.get_option("default_integer_bitwidth")
and infer_dtype(arbitrary) == "integer"
):
pa_type = np_to_pa_dtype(
_maybe_convert_to_default_type("int")
)
if cudf.get_option("default_float_bitwidth") and infer_dtype(
arbitrary
) in (
"floating",
"mixed-integer-float",
):
pa_type = np_to_pa_dtype(
_maybe_convert_to_default_type("float")
)

pyarrow_array = pa.array(
arbitrary = pa.array(
arbitrary,
type=pa_type,
from_pandas=True if nan_as_null is None else nan_as_null,
type=cudf_dtype_to_pa_type(dtype),
from_pandas=from_pandas,
)

if (
isinstance(pyarrow_array, pa.NullArray)
and pa_type is None
and dtype is None
and getattr(arbitrary, "dtype", None) == cudf.dtype("object")
):
# pa.array constructor returns a NullArray
# for empty arrays, instead of a StringArray.
# This issue is only specific to this dtype,
# all other dtypes, result in their corresponding
# arrow array creation.
dtype = cudf.dtype("str")
pyarrow_array = pyarrow_array.cast(np_to_pa_dtype(dtype))

except (pa.ArrowInvalid, pa.ArrowTypeError):
if not isinstance(dtype, np.dtype):
dtype = dtype.to_pandas()
arbitrary = pd.Series(arbitrary, dtype=dtype)
return as_column(arbitrary, nan_as_null=nan_as_null, dtype=dtype)
else:
arbitrary = list(arbitrary)
for element in arbitrary:
# Carve-outs that cannot be parsed by pyarrow/pandas
if is_column_like(element):
# e.g. test_nested_series_from_sequence_data
return cudf.core.column.ListColumn.from_sequences(arbitrary)
elif isinstance(element, cupy.ndarray):
# e.g. test_series_from_cupy_scalars
return as_column(
cupy.array(arbitrary),
dtype=dtype,
nan_as_null=nan_as_null,
length=length,
)
elif not any(element is na for na in (None, pd.NA, np.nan)):
# Might have NA + element like above, but short-circuit if
# an element pyarrow/pandas might be able to parse
break
try:
arbitrary = pa.array(arbitrary, from_pandas=from_pandas)
if (
cudf.get_option("mode.pandas_compatible")
and pa.types.is_integer(pyarrow_array.type)
and pyarrow_array.null_count
and pa.types.is_integer(arbitrary.type)
and arbitrary.null_count > 0
):
pyarrow_array = pyarrow_array.cast("float64").fill_null(np.nan)

data = as_column(
pyarrow_array,
dtype=dtype,
nan_as_null=nan_as_null,
)
except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError) as e:
if isinstance(e, MixedTypeError):
raise TypeError(str(e))
arbitrary = arbitrary.cast(pa.float64())
if cudf.get_option(
"default_integer_bitwidth"
) and pa.types.is_integer(arbitrary.type):
dtype = _maybe_convert_to_default_type("int")
elif cudf.get_option(
"default_float_bitwidth"
) and pa.types.is_floating(arbitrary.type):
dtype = _maybe_convert_to_default_type("float")
except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError):
arbitrary = pd.Series(arbitrary)
if cudf.get_option(
"default_integer_bitwidth"
) and arbitrary.dtype.kind in set("iu"):
dtype = _maybe_convert_to_default_type("int")
elif (
isinstance(arbitrary, Sequence)
and len(arbitrary) > 0
and any(
cudf.utils.dtypes.is_column_like(arb) for arb in arbitrary
)
cudf.get_option("default_float_bitwidth")
and arbitrary.dtype.kind == "f"
):
# TODO: I think can be removed; covered by
# elif isinstance(dtype, (cudf.StructDtype, cudf.ListDtype)):
# above
return cudf.core.column.ListColumn.from_sequences(arbitrary)
elif isinstance(arbitrary, abc.Iterable) or isinstance(
arbitrary, abc.Sequence
):
data = as_column(
_construct_array(arbitrary, dtype),
dtype=dtype,
nan_as_null=nan_as_null,
)
else:
raise e
return data


def _construct_array(
arbitrary: Any, dtype: Optional[Dtype]
) -> Union[np.ndarray, cupy.ndarray, pd.api.extensions.ExtensionArray]:
"""
Construct a CuPy/NumPy/Pandas array from `arbitrary`
"""
try:
dtype = dtype if dtype is None else cudf.dtype(dtype)
arbitrary = cupy.asarray(arbitrary, dtype=dtype)
except (TypeError, ValueError):
native_dtype = dtype
inferred_dtype = infer_dtype(arbitrary, skipna=False)
if (
dtype is None
and not cudf._lib.scalar._is_null_host_scalar(arbitrary)
and inferred_dtype
in (
"mixed",
"mixed-integer",
)
):
native_dtype = "object"
if inferred_dtype == "interval":
# Only way to construct an Interval column.
return pd.array(arbitrary)
elif (
inferred_dtype == "string" and getattr(dtype, "kind", None) == "M"
):
# We may have date-like strings with timezones
try:
with warnings.catch_warnings():
# Need to ignore userwarnings when
# datetime format cannot be inferred.
warnings.simplefilter("ignore", UserWarning)
pd_arbitrary = pd.to_datetime(arbitrary)
if isinstance(pd_arbitrary.dtype, pd.DatetimeTZDtype):
raise NotImplementedError(
"cuDF does not yet support timezone-aware datetimes"
)
return pd_arbitrary.to_numpy()
except pd.errors.OutOfBoundsDatetime:
# https://github.com/pandas-dev/pandas/issues/55096
pass

arbitrary = np.asarray(
arbitrary,
dtype=native_dtype
if native_dtype is None
else np.dtype(native_dtype),
)
return arbitrary
dtype = _maybe_convert_to_default_type("float")
return as_column(arbitrary, nan_as_null=nan_as_null, dtype=dtype)


def _mask_from_cuda_array_interface_desc(obj) -> Union[Buffer, None]:
10 changes: 9 additions & 1 deletion python/cudf/cudf/core/index.py
@@ -1714,7 +1714,15 @@ def __init__(
raise TypeError("dtype must be a datetime type")

name = _setdefault_name(data, name=name)["name"]
data = column.as_column(data, dtype=dtype)
data = column.as_column(data)

# TODO: Remove this if statement and fix tests now that
# there's timezone support
if isinstance(data.dtype, pd.DatetimeTZDtype):
raise NotImplementedError(
"cuDF does not yet support timezone-aware datetimes"
)
data = data.astype(dtype)

if copy:
data = data.copy()
(Diffs for the remaining 5 changed files are not shown.)
