From cbbaf20b34feab95fecba2daeb329947a703173d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 1 Mar 2021 11:28:37 -0800 Subject: [PATCH] TYP: to_arrays, BUG: from_records empty dtypes (#40121) --- doc/source/whatsnew/v1.3.0.rst | 2 +- pandas/core/frame.py | 2 ++ pandas/core/internals/construction.py | 10 +++++++--- .../frame/constructors/test_from_records.py | 17 +++++++++-------- pandas/tests/frame/test_constructors.py | 3 ++- 5 files changed, 21 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 6878227f6ae9c..41db72612a66b 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -401,7 +401,7 @@ Conversion ^^^^^^^^^^ - Bug in :meth:`Series.to_dict` with ``orient='records'`` now returns python native types (:issue:`25969`) - Bug in :meth:`Series.view` and :meth:`Index.view` when converting between datetime-like (``datetime64[ns]``, ``datetime64[ns, tz]``, ``timedelta64``, ``period``) dtypes (:issue:`39788`) -- +- Bug in creating a :class:`DataFrame` from an empty ``np.recarray`` not retaining the original dtypes (:issue:`40121`) - Strings diff --git a/pandas/core/frame.py b/pandas/core/frame.py index dd3818af9ea9c..830a7f4347132 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -605,6 +605,8 @@ def __init__( if is_dataclass(data[0]): data = dataclasses_to_dicts(data) if treat_as_nested(data): + if columns is not None: + columns = ensure_index(columns) arrays, columns, index = nested_data_to_arrays( data, columns, index, dtype ) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 8cb6d692e070c..7eade970253bf 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -602,7 +602,9 @@ def dataclasses_to_dicts(data): # Conversion of Inputs to Arrays -def to_arrays(data, columns: Optional[Index], dtype: Optional[DtypeObj] = None): +def to_arrays( + data, columns: Optional[Index], dtype: Optional[DtypeObj] = None +) -> Tuple[List[ArrayLike], Index]: """ Return list of arrays, columns. """ @@ -623,8 +625,10 @@ def to_arrays(data, columns: Optional[Index], dtype: Optional[DtypeObj] = None): if isinstance(data, np.ndarray): columns = data.dtype.names if columns is not None: - return [[]] * len(columns), columns - return [], [] # columns if columns is not None else [] + # i.e. numpy structured array + arrays = [data[name] for name in columns] + return arrays, ensure_index(columns) + return [], ensure_index([]) elif isinstance(data[0], Categorical): if columns is None: diff --git a/pandas/tests/frame/constructors/test_from_records.py b/pandas/tests/frame/constructors/test_from_records.py index 0d36f3bd80e26..1cda4b1948c6a 100644 --- a/pandas/tests/frame/constructors/test_from_records.py +++ b/pandas/tests/frame/constructors/test_from_records.py @@ -11,6 +11,7 @@ CategoricalIndex, DataFrame, Index, + Int64Index, Interval, RangeIndex, Series, @@ -437,11 +438,11 @@ def test_from_records_empty(self): def test_from_records_empty_with_nonempty_fields_gh3682(self): a = np.array([(1, 2)], dtype=[("id", np.int64), ("value", np.int64)]) df = DataFrame.from_records(a, index="id") - tm.assert_index_equal(df.index, Index([1], name="id")) - assert df.index.name == "id" - tm.assert_index_equal(df.columns, Index(["value"])) - - b = np.array([], dtype=[("id", np.int64), ("value", np.int64)]) - df = DataFrame.from_records(b, index="id") - tm.assert_index_equal(df.index, Index([], name="id")) - assert df.index.name == "id" + + ex_index = Int64Index([1], name="id") + expected = DataFrame({"value": [2]}, index=ex_index, columns=["value"]) + tm.assert_frame_equal(df, expected) + + b = a[:0] + df2 = DataFrame.from_records(b, index="id") + tm.assert_frame_equal(df2, df.iloc[:0]) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 4f32cec001c5a..3bbe5f9e46efa 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1170,7 +1170,8 @@ def test_constructor_unequal_length_nested_list_column(self): # GH 32173 arrays = [list("abcd"), list("cde")] - msg = "Length of columns passed for MultiIndex columns is different" + # exception raised inside MultiIndex constructor + msg = "all arrays must be same length" with pytest.raises(ValueError, match=msg): DataFrame([[1, 2, 3, 4], [4, 5, 6, 7]], columns=arrays)