rapidsai · rapids-bot · Sep 20, 2023 · Sep 18, 2023 · Sep 18, 2023 · Sep 18, 2023
@@ -4,12 +4,13 @@
 import cupy as cp
 import numpy as np
 
+from cudf.core.column import as_column
 from cudf.core.copy_types import BooleanMask
-from cudf.core.index import Index, RangeIndex
+from cudf.core.index import RangeIndex, as_index
 from cudf.core.indexed_frame import IndexedFrame
 from cudf.core.scalar import Scalar
-from cudf.core.series import Series
 from cudf.options import get_option
+from cudf.utils.dtypes import can_convert_to_column
 
 
 def factorize(
@@ -95,7 +96,13 @@ def factorize(
 
     return_cupy_array = isinstance(values, cp.ndarray)
 
-    values = Series(values)
+    if not can_convert_to_column(values):
+        raise TypeError(
+            "'values' can only be a Series, Index, or CuPy array, "
+            f"got {type(values)}"
+        )
+
+    values = as_column(values)
 
     if na_sentinel is None:
         na_sentinel = (
@@ -128,22 +135,22 @@ def factorize(
         warnings.warn("size_hint is not applicable for cudf.factorize")
 
     if use_na_sentinel is None or use_na_sentinel:
-        cats = values._column.dropna()
+        cats = values.dropna()
     else:
-        cats = values._column
+        cats = values
 
     cats = cats.unique().astype(values.dtype)
 
     if sort:
         cats = cats.sort_values()
 
-    labels = values._column._label_encoding(
+    labels = values._label_encoding(
         cats=cats,
         na_sentinel=Scalar(na_sentinel),
         dtype="int64" if get_option("mode.pandas_compatible") else None,
     ).values
 
-    return labels, cats.values if return_cupy_array else Index(cats)
+    return labels, cats.values if return_cupy_array else as_index(cats)
 
 
 def _linear_interpolation(column, index=None):

@@ -5606,7 +5606,7 @@ def quantile(
                 result.name = q
                 return result
 
-        result.index = list(map(float, qs))
+        result.index = cudf.Index(list(map(float, qs)), dtype="float64")
         return result
 
     @_cudf_nvtx_annotate

@@ -13,6 +13,7 @@
     List,
     MutableMapping,
     Optional,
+    Sequence,
     Tuple,
     Type,
     Union,
@@ -3467,14 +3468,23 @@ def __new__(
                 "tupleize_cols != True is not yet supported"
             )
 
-        return as_index(
+        res = as_index(
             data,
             copy=copy,
             dtype=dtype,
             name=name,
             nan_as_null=nan_as_null,
             **kwargs,
         )
+        if (
+            isinstance(data, Sequence)
+            and not isinstance(data, range)
+            and len(data) == 0
+            and dtype is None
+            and getattr(data, "dtype", None) is None
+        ):
+            return res.astype("str")
+        return res
 
     @classmethod
     @_cudf_nvtx_annotate

@@ -9,7 +9,16 @@
 import warnings
 from collections import abc
 from shutil import get_terminal_size
-from typing import Any, Dict, MutableMapping, Optional, Set, Tuple, Union
+from typing import (
+    Any,
+    Dict,
+    MutableMapping,
+    Optional,
+    Sequence,
+    Set,
+    Tuple,
+    Union,
+)
 
 import cupy
 import numpy as np
@@ -500,6 +509,18 @@ def __init__(
         copy=False,
         nan_as_null=True,
     ):
+        if (
+            isinstance(data, Sequence)
+            and len(data) == 0
+            and dtype is None
+            and getattr(data, "dtype", None) is None
+        ):
+            warnings.warn(
+                "The default dtype for empty Series will be 'object' instead "
+                "of 'float64' in a future version. Specify a dtype explicitly "
+                "to silence this warning.",
+                FutureWarning,
+            )
         if isinstance(data, pd.Series):
             if name is None:
                 name = data.name
@@ -656,7 +677,10 @@ def from_pandas(cls, s, nan_as_null=None):
         3     NaN
         dtype: float64
         """
-        return cls(s, nan_as_null=nan_as_null)
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            result = cls(s, nan_as_null=nan_as_null)
+        return result
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
@@ -2642,7 +2666,9 @@ def mode(self, dropna=True):
         if len(val_counts) > 0:
             val_counts = val_counts[val_counts == val_counts.iloc[0]]
 
-        return Series(val_counts.index.sort_values(), name=self.name)
+        return Series._from_data(
+            {self.name: val_counts.index.sort_values()}, name=self.name
+        )
 
     @_cudf_nvtx_annotate
     def round(self, decimals=0, how="half_even"):

@@ -406,6 +406,15 @@ def _create_pandas_series(data=None, index=None, dtype=None, *args, **kwargs):
     return pd.Series(data=data, index=index, dtype=dtype, *args, **kwargs)
 
 
+def _create_cudf_series(data=None, index=None, dtype=None, *args, **kwargs):
+    # Wrapper around cudf.Series using a float64 default dtype for empty data.
+    if dtype is None and (
+        data is None or (not is_scalar(data) and len(data) == 0)
+    ):
+        dtype = "float64"
+    return cudf.Series(data=data, index=index, dtype=dtype, *args, **kwargs)
+
+
 parametrize_numeric_dtypes_pairwise = pytest.mark.parametrize(
     "left_dtype,right_dtype",
     list(itertools.combinations_with_replacement(NUMERIC_TYPES, 2)),

@@ -30,6 +30,7 @@
     ALL_TYPES,
     DATETIME_TYPES,
     NUMERIC_TYPES,
+    _create_cudf_series,
     assert_eq,
     assert_exceptions_equal,
     assert_neq,
@@ -2001,7 +2002,7 @@ def test_series_shape():
 
 def test_series_shape_empty():
     ps = pd.Series(dtype="float64")
-    cs = cudf.Series([])
+    cs = _create_cudf_series([])
 
     assert ps.shape == cs.shape
 
@@ -2840,7 +2841,7 @@ def test_series_all_null(num_elements, null_type):
 @pytest.mark.parametrize("num_elements", [0, 2, 10, 100])
 def test_series_all_valid_nan(num_elements):
     data = [np.nan] * num_elements
-    sr = cudf.Series(data, nan_as_null=False)
+    sr = _create_cudf_series(data, nan_as_null=False)
     np.testing.assert_equal(sr.null_count, 0)
 
 
@@ -4073,28 +4074,28 @@ def test_empty_dataframe_describe():
 
 
 def test_as_column_types():
-    col = column.as_column(cudf.Series([]))
+    col = column.as_column(_create_cudf_series([]))
     assert_eq(col.dtype, np.dtype("float64"))
     gds = cudf.Series(col)
     pds = pd.Series(pd.Series([], dtype="float64"))
 
     assert_eq(pds, gds)
 
-    col = column.as_column(cudf.Series([]), dtype="float32")
+    col = column.as_column(_create_cudf_series([]), dtype="float32")
     assert_eq(col.dtype, np.dtype("float32"))
     gds = cudf.Series(col)
     pds = pd.Series(pd.Series([], dtype="float32"))
 
     assert_eq(pds, gds)
 
-    col = column.as_column(cudf.Series([]), dtype="str")
+    col = column.as_column(_create_cudf_series([]), dtype="str")
     assert_eq(col.dtype, np.dtype("object"))
     gds = cudf.Series(col)
     pds = pd.Series(pd.Series([], dtype="str"))
 
     assert_eq(pds, gds)
 
-    col = column.as_column(cudf.Series([]), dtype="object")
+    col = column.as_column(_create_cudf_series([]), dtype="object")
     assert_eq(col.dtype, np.dtype("object"))
     gds = cudf.Series(col)
     pds = pd.Series(pd.Series([], dtype="object"))
@@ -4469,7 +4470,7 @@ def test_create_dataframe_column():
 )
 def test_series_values_host_property(data):
     pds = pd.Series(data=data, dtype=None if data else float)
-    gds = cudf.Series(data)
+    gds = _create_cudf_series(data)
 
     np.testing.assert_array_equal(pds.values, gds.values_host)
 
@@ -4492,7 +4493,7 @@ def test_series_values_host_property(data):
 )
 def test_series_values_property(data):
     pds = pd.Series(data=data, dtype=None if data else float)
-    gds = cudf.Series(data)
+    gds = _create_cudf_series(data)
     gds_vals = gds.values
     assert isinstance(gds_vals, cupy.ndarray)
     np.testing.assert_array_equal(gds_vals.get(), pds.values)

@@ -30,6 +30,7 @@
     SIGNED_INTEGER_TYPES,
     SIGNED_TYPES,
     UNSIGNED_TYPES,
+    _create_cudf_series,
     _create_pandas_series,
     assert_column_memory_eq,
     assert_column_memory_ne,
@@ -1007,7 +1008,7 @@ def test_index_equal_misc(data, other):
     assert_eq(expected, actual)
 
     expected = pd_data.equals(_create_pandas_series(pd_other))
-    actual = gd_data.equals(cudf.Series(gd_other))
+    actual = gd_data.equals(_create_cudf_series(gd_other))
     assert_eq(expected, actual)
 
     expected = pd_data.astype("category").equals(pd_other)
@@ -2780,6 +2781,13 @@ def test_index_empty_from_pandas(request, dtype):
     assert_eq(pidx, gidx)
 
 
+def test_empty_index_init():
+    pidx = pd.Index([])
+    gidx = cudf.Index([])
+
+    assert_eq(pidx, gidx)
+
+
 @pytest.mark.parametrize(
     "data", [[1, 2, 3], ["ab", "cd", "e", None], range(0, 10)]
 )

@@ -19,6 +19,7 @@
     NUMERIC_TYPES,
     SERIES_OR_INDEX_NAMES,
     TIMEDELTA_TYPES,
+    _create_cudf_series,
     _create_pandas_series,
     assert_eq,
     assert_exceptions_equal,
@@ -401,7 +402,7 @@ def test_series_tolist(data):
 )
 def test_series_size(data):
     psr = _create_pandas_series(data)
-    gsr = cudf.Series(data)
+    gsr = cudf.Series(data, dtype="float64" if len(data) == 0 else None)
 
     assert_eq(psr.size, gsr.size)
 
@@ -487,7 +488,7 @@ def test_series_describe_other_types(ps):
 )
 @pytest.mark.parametrize("na_sentinel", [99999, 11, -1, 0])
 def test_series_factorize(data, na_sentinel):
-    gsr = cudf.Series(data)
+    gsr = _create_cudf_series(data)
     psr = gsr.to_pandas()
 
     with pytest.warns(FutureWarning):
@@ -510,7 +511,7 @@ def test_series_factorize(data, na_sentinel):
 )
 @pytest.mark.parametrize("use_na_sentinel", [True, False])
 def test_series_factorize_use_na_sentinel(data, use_na_sentinel):
-    gsr = cudf.Series(data)
+    gsr = _create_cudf_series(data)
     psr = gsr.to_pandas(nullable=True)
 
     expected_labels, expected_cats = psr.factorize(
@@ -534,7 +535,7 @@ def test_series_factorize_use_na_sentinel(data, use_na_sentinel):
 )
 @pytest.mark.parametrize("sort", [True, False])
 def test_series_factorize_sort(data, sort):
-    gsr = cudf.Series(data)
+    gsr = _create_cudf_series(data)
     psr = gsr.to_pandas(nullable=True)
 
     expected_labels, expected_cats = psr.factorize(sort=sort)
@@ -734,7 +735,7 @@ def test_series_value_counts_optional_arguments(ascending, dropna, normalize):
             ],
             dtype="datetime64[ns]",
         ),
-        cudf.Series(name="empty series"),
+        cudf.Series(name="empty series", dtype="float64"),
         cudf.Series(["a", "b", "c", " ", "a", "b", "z"], dtype="category"),
     ],
 )
@@ -1415,7 +1416,7 @@ def test_series_hash_values_invalid_method():
 
 
 def test_set_index_unequal_length():
-    s = cudf.Series()
+    s = cudf.Series(dtype="float64")
     with pytest.raises(ValueError):
         s.index = [1, 2, 3]
 
@@ -1682,7 +1683,7 @@ def test_series_nunique_index(data):
     ],
 )
 def test_axes(data):
-    csr = cudf.Series(data)
+    csr = _create_cudf_series(data)
     psr = csr.to_pandas()
 
     expected = psr.axes
@@ -2099,7 +2100,7 @@ def test_series_to_dict(into):
     ],
 )
 def test_series_hasnans(data):
-    gs = cudf.Series(data, nan_as_null=False)
+    gs = _create_cudf_series(data, nan_as_null=False)
     ps = gs.to_pandas(nullable=True)
 
     assert_eq(gs.hasnans, ps.hasnans)
@@ -2171,7 +2172,7 @@ def test_series_init_dict_with_index(data, index):
 )
 def test_series_init_scalar_with_index(data, index):
     pandas_series = _create_pandas_series(data, index=index)
-    cudf_series = cudf.Series(data, index=index)
+    cudf_series = _create_cudf_series(data, index=index)
 
     assert_eq(
         pandas_series,
@@ -2311,3 +2312,11 @@ def test_series_round_builtin(data, digits):
     actual = round(gs, digits)
 
     assert_eq(expected, actual)
+
+
+def test_series_empty_warning():
+    with pytest.warns(FutureWarning):
+        expected = pd.Series([])
+    with pytest.warns(FutureWarning):
+        actual = cudf.Series([])
+    assert_eq(expected, actual)