Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix type of empty Index and raise warning in Series constructor #14116

Merged
merged 5 commits into from
Sep 20, 2023
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 14 additions & 7 deletions python/cudf/cudf/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,13 @@
import cupy as cp
import numpy as np

from cudf.core.column import as_column
from cudf.core.copy_types import BooleanMask
from cudf.core.index import Index, RangeIndex
from cudf.core.index import RangeIndex, as_index
from cudf.core.indexed_frame import IndexedFrame
from cudf.core.scalar import Scalar
from cudf.core.series import Series
from cudf.options import get_option
from cudf.utils.dtypes import can_convert_to_column


def factorize(
Expand Down Expand Up @@ -95,7 +96,13 @@ def factorize(

return_cupy_array = isinstance(values, cp.ndarray)

values = Series(values)
if not can_convert_to_column(values):
raise TypeError(
"'values' can only be a Series, Index, or CuPy array, "
f"got {type(values)}"
)

values = as_column(values)

if na_sentinel is None:
na_sentinel = (
Expand Down Expand Up @@ -128,22 +135,22 @@ def factorize(
warnings.warn("size_hint is not applicable for cudf.factorize")

if use_na_sentinel is None or use_na_sentinel:
cats = values._column.dropna()
cats = values.dropna()
else:
cats = values._column
cats = values

cats = cats.unique().astype(values.dtype)

if sort:
cats = cats.sort_values()

labels = values._column._label_encoding(
labels = values._label_encoding(
cats=cats,
na_sentinel=Scalar(na_sentinel),
dtype="int64" if get_option("mode.pandas_compatible") else None,
).values

return labels, cats.values if return_cupy_array else Index(cats)
return labels, cats.values if return_cupy_array else as_index(cats)


def _linear_interpolation(column, index=None):
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -5606,7 +5606,7 @@ def quantile(
result.name = q
return result

result.index = list(map(float, qs))
result.index = cudf.Index(list(map(float, qs)), dtype="float64")
return result

@_cudf_nvtx_annotate
Expand Down
12 changes: 11 additions & 1 deletion python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
List,
MutableMapping,
Optional,
Sequence,
Tuple,
Type,
Union,
Expand Down Expand Up @@ -3467,14 +3468,23 @@ def __new__(
"tupleize_cols != True is not yet supported"
)

return as_index(
res = as_index(
data,
copy=copy,
dtype=dtype,
name=name,
nan_as_null=nan_as_null,
**kwargs,
)
if (
isinstance(data, Sequence)
and not isinstance(data, range)
and len(data) == 0
and dtype is None
and getattr(data, "dtype", None) is None
):
return res.astype("str")
return res

@classmethod
@_cudf_nvtx_annotate
Expand Down
32 changes: 29 additions & 3 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,16 @@
import warnings
from collections import abc
from shutil import get_terminal_size
from typing import Any, Dict, MutableMapping, Optional, Set, Tuple, Union
from typing import (
Any,
Dict,
MutableMapping,
Optional,
Sequence,
Set,
Tuple,
Union,
)

import cupy
import numpy as np
Expand Down Expand Up @@ -500,6 +509,18 @@ def __init__(
copy=False,
nan_as_null=True,
):
if (
isinstance(data, Sequence)
and len(data) == 0
and dtype is None
and getattr(data, "dtype", None) is None
):
warnings.warn(
"The default dtype for empty Series will be 'object' instead "
"of 'float64' in a future version. Specify a dtype explicitly "
"to silence this warning.",
FutureWarning,
)
if isinstance(data, pd.Series):
if name is None:
name = data.name
Expand Down Expand Up @@ -656,7 +677,10 @@ def from_pandas(cls, s, nan_as_null=None):
3 NaN
dtype: float64
"""
return cls(s, nan_as_null=nan_as_null)
with warnings.catch_warnings():
warnings.simplefilter("ignore")
result = cls(s, nan_as_null=nan_as_null)
return result

@property # type: ignore
@_cudf_nvtx_annotate
Expand Down Expand Up @@ -2642,7 +2666,9 @@ def mode(self, dropna=True):
if len(val_counts) > 0:
val_counts = val_counts[val_counts == val_counts.iloc[0]]

return Series(val_counts.index.sort_values(), name=self.name)
return Series._from_data(
{self.name: val_counts.index.sort_values()}, name=self.name
)

@_cudf_nvtx_annotate
def round(self, decimals=0, how="half_even"):
Expand Down
9 changes: 9 additions & 0 deletions python/cudf/cudf/testing/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -406,6 +406,15 @@ def _create_pandas_series(data=None, index=None, dtype=None, *args, **kwargs):
return pd.Series(data=data, index=index, dtype=dtype, *args, **kwargs)


def _create_cudf_series(data=None, index=None, dtype=None, *args, **kwargs):
# Wrapper around cudf.Series using a float64 default dtype for empty data.
if dtype is None and (
data is None or (not is_scalar(data) and len(data) == 0)
):
dtype = "float64"
return cudf.Series(data=data, index=index, dtype=dtype, *args, **kwargs)


parametrize_numeric_dtypes_pairwise = pytest.mark.parametrize(
"left_dtype,right_dtype",
list(itertools.combinations_with_replacement(NUMERIC_TYPES, 2)),
Expand Down
17 changes: 9 additions & 8 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
ALL_TYPES,
DATETIME_TYPES,
NUMERIC_TYPES,
_create_cudf_series,
assert_eq,
assert_exceptions_equal,
assert_neq,
Expand Down Expand Up @@ -2001,7 +2002,7 @@ def test_series_shape():

def test_series_shape_empty():
ps = pd.Series(dtype="float64")
cs = cudf.Series([])
cs = _create_cudf_series([])
galipremsagar marked this conversation as resolved.
Show resolved Hide resolved

assert ps.shape == cs.shape

Expand Down Expand Up @@ -2840,7 +2841,7 @@ def test_series_all_null(num_elements, null_type):
@pytest.mark.parametrize("num_elements", [0, 2, 10, 100])
def test_series_all_valid_nan(num_elements):
data = [np.nan] * num_elements
sr = cudf.Series(data, nan_as_null=False)
sr = _create_cudf_series(data, nan_as_null=False)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this change needed?

For num_elements == 0, the null count should still be zero even if it interprets the dtype as string/object (so the test is still valid, though it might warn?).

For num_elements > 0, the data should be non-empty and thus be correctly interpreted as a floating type.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For num_elements == 0, the null count should still be zero even if it interprets the dtype as string/object (so the test is still valid, though it might warn?).

Yup, this is the reason for this change.

np.testing.assert_equal(sr.null_count, 0)


Expand Down Expand Up @@ -4073,28 +4074,28 @@ def test_empty_dataframe_describe():


def test_as_column_types():
col = column.as_column(cudf.Series([]))
col = column.as_column(_create_cudf_series([]))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here and below -- I'm not sure if I understand why these helper functions are needed. It seems like the test construction might need subtle changes but I don't think this casting helper function is the right answer. It's not announcing what it is actually doing -- the differences between cudf.Series and _create_cudf_series should be in the name, if this is needed (like _create_cudf_series_float64_default).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed, reverted the change here and passed explicit dtypes. Renamed the helper to _create_cudf_series_float64_default and _create_pandas_series_float64_default. These will be removed in pandas-2.0 upgrade.

assert_eq(col.dtype, np.dtype("float64"))
gds = cudf.Series(col)
pds = pd.Series(pd.Series([], dtype="float64"))

assert_eq(pds, gds)

col = column.as_column(cudf.Series([]), dtype="float32")
col = column.as_column(_create_cudf_series([]), dtype="float32")
assert_eq(col.dtype, np.dtype("float32"))
gds = cudf.Series(col)
pds = pd.Series(pd.Series([], dtype="float32"))

assert_eq(pds, gds)

col = column.as_column(cudf.Series([]), dtype="str")
col = column.as_column(_create_cudf_series([]), dtype="str")
assert_eq(col.dtype, np.dtype("object"))
gds = cudf.Series(col)
pds = pd.Series(pd.Series([], dtype="str"))

assert_eq(pds, gds)

col = column.as_column(cudf.Series([]), dtype="object")
col = column.as_column(_create_cudf_series([]), dtype="object")
assert_eq(col.dtype, np.dtype("object"))
gds = cudf.Series(col)
pds = pd.Series(pd.Series([], dtype="object"))
Expand Down Expand Up @@ -4469,7 +4470,7 @@ def test_create_dataframe_column():
)
def test_series_values_host_property(data):
pds = pd.Series(data=data, dtype=None if data else float)
gds = cudf.Series(data)
gds = _create_cudf_series(data)

np.testing.assert_array_equal(pds.values, gds.values_host)

Expand All @@ -4492,7 +4493,7 @@ def test_series_values_host_property(data):
)
def test_series_values_property(data):
pds = pd.Series(data=data, dtype=None if data else float)
gds = cudf.Series(data)
gds = _create_cudf_series(data)
gds_vals = gds.values
assert isinstance(gds_vals, cupy.ndarray)
np.testing.assert_array_equal(gds_vals.get(), pds.values)
Expand Down
10 changes: 9 additions & 1 deletion python/cudf/cudf/tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
SIGNED_INTEGER_TYPES,
SIGNED_TYPES,
UNSIGNED_TYPES,
_create_cudf_series,
_create_pandas_series,
assert_column_memory_eq,
assert_column_memory_ne,
Expand Down Expand Up @@ -1007,7 +1008,7 @@ def test_index_equal_misc(data, other):
assert_eq(expected, actual)

expected = pd_data.equals(_create_pandas_series(pd_other))
actual = gd_data.equals(cudf.Series(gd_other))
actual = gd_data.equals(_create_cudf_series(gd_other))
assert_eq(expected, actual)

expected = pd_data.astype("category").equals(pd_other)
Expand Down Expand Up @@ -2780,6 +2781,13 @@ def test_index_empty_from_pandas(request, dtype):
assert_eq(pidx, gidx)


def test_empty_index_init():
pidx = pd.Index([])
gidx = cudf.Index([])

assert_eq(pidx, gidx)


@pytest.mark.parametrize(
"data", [[1, 2, 3], ["ab", "cd", "e", None], range(0, 10)]
)
Expand Down
27 changes: 18 additions & 9 deletions python/cudf/cudf/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
NUMERIC_TYPES,
SERIES_OR_INDEX_NAMES,
TIMEDELTA_TYPES,
_create_cudf_series,
_create_pandas_series,
assert_eq,
assert_exceptions_equal,
Expand Down Expand Up @@ -401,7 +402,7 @@ def test_series_tolist(data):
)
def test_series_size(data):
psr = _create_pandas_series(data)
gsr = cudf.Series(data)
gsr = cudf.Series(data, dtype="float64" if len(data) == 0 else None)

assert_eq(psr.size, gsr.size)

Expand Down Expand Up @@ -487,7 +488,7 @@ def test_series_describe_other_types(ps):
)
@pytest.mark.parametrize("na_sentinel", [99999, 11, -1, 0])
def test_series_factorize(data, na_sentinel):
gsr = cudf.Series(data)
gsr = _create_cudf_series(data)
psr = gsr.to_pandas()

with pytest.warns(FutureWarning):
Expand All @@ -510,7 +511,7 @@ def test_series_factorize(data, na_sentinel):
)
@pytest.mark.parametrize("use_na_sentinel", [True, False])
def test_series_factorize_use_na_sentinel(data, use_na_sentinel):
gsr = cudf.Series(data)
gsr = _create_cudf_series(data)
psr = gsr.to_pandas(nullable=True)

expected_labels, expected_cats = psr.factorize(
Expand All @@ -534,7 +535,7 @@ def test_series_factorize_use_na_sentinel(data, use_na_sentinel):
)
@pytest.mark.parametrize("sort", [True, False])
def test_series_factorize_sort(data, sort):
gsr = cudf.Series(data)
gsr = _create_cudf_series(data)
psr = gsr.to_pandas(nullable=True)

expected_labels, expected_cats = psr.factorize(sort=sort)
Expand Down Expand Up @@ -734,7 +735,7 @@ def test_series_value_counts_optional_arguments(ascending, dropna, normalize):
],
dtype="datetime64[ns]",
),
cudf.Series(name="empty series"),
cudf.Series(name="empty series", dtype="float64"),
cudf.Series(["a", "b", "c", " ", "a", "b", "z"], dtype="category"),
],
)
Expand Down Expand Up @@ -1415,7 +1416,7 @@ def test_series_hash_values_invalid_method():


def test_set_index_unequal_length():
s = cudf.Series()
s = cudf.Series(dtype="float64")
with pytest.raises(ValueError):
s.index = [1, 2, 3]

Expand Down Expand Up @@ -1682,7 +1683,7 @@ def test_series_nunique_index(data):
],
)
def test_axes(data):
csr = cudf.Series(data)
csr = _create_cudf_series(data)
psr = csr.to_pandas()

expected = psr.axes
Expand Down Expand Up @@ -2099,7 +2100,7 @@ def test_series_to_dict(into):
],
)
def test_series_hasnans(data):
gs = cudf.Series(data, nan_as_null=False)
gs = _create_cudf_series(data, nan_as_null=False)
ps = gs.to_pandas(nullable=True)

assert_eq(gs.hasnans, ps.hasnans)
Expand Down Expand Up @@ -2171,7 +2172,7 @@ def test_series_init_dict_with_index(data, index):
)
def test_series_init_scalar_with_index(data, index):
pandas_series = _create_pandas_series(data, index=index)
cudf_series = cudf.Series(data, index=index)
cudf_series = _create_cudf_series(data, index=index)

assert_eq(
pandas_series,
Expand Down Expand Up @@ -2311,3 +2312,11 @@ def test_series_round_builtin(data, digits):
actual = round(gs, digits)

assert_eq(expected, actual)


def test_series_empty_warning():
with pytest.warns(FutureWarning):
expected = pd.Series([])
with pytest.warns(FutureWarning):
actual = cudf.Series([])
assert_eq(expected, actual)
Loading