Skip to content

Commit

Permalink
Ensure DataFrame column types are preserved during serialization (#14705)
Browse files Browse the repository at this point in the history

xref rapidsai/ucxx#149

Ensures column subclasses (e.g. `RangeIndex`) and data types (e.g. `int8`) are preserved during serialization.

I think this should be backwards compatible, since we're just adding keys to the serialized `header` dict and the deserialization checks whether those new keys exist.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Charles Blackmon-Luca (https://github.com/charlesbluca)

Approvers:
  - Peter Andreas Entschev (https://github.com/pentschev)
  - Charles Blackmon-Luca (https://github.com/charlesbluca)

URL: #14705
  • Loading branch information
mroeschke authored Jan 9, 2024
1 parent 9c9de7c commit d81ca78
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 10 deletions.
22 changes: 20 additions & 2 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2023, NVIDIA CORPORATION.
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

from __future__ import annotations

Expand Down Expand Up @@ -99,9 +99,14 @@ def _has_nulls(self):

@_cudf_nvtx_annotate
def serialize(self):
    """Serialize this frame into a ``(header, frames)`` pair.

    The header is a dict holding the pickled class of ``self``, the
    pickled tuple of column names, and the pickled column-accessor
    metadata (``rangeindex``, ``multiindex``, ``label_dtype`` and the
    level names), plus a ``"columns"`` entry produced by
    ``serialize_columns``. ``frames`` is the second value returned by
    ``serialize_columns`` for ``self._columns``.
    """
    # TODO: See if self._data can be serialized outright
    accessor = self._data

    # Keys are inserted in a fixed order: type, names, then the
    # column-accessor metadata used to rebuild the column labels.
    header = {"type-serialized": pickle.dumps(type(self))}
    header["column_names"] = pickle.dumps(tuple(accessor.names))
    header["column_rangeindex"] = pickle.dumps(accessor.rangeindex)
    header["column_multiindex"] = pickle.dumps(accessor.multiindex)
    header["column_label_dtype"] = pickle.dumps(accessor.label_dtype)
    header["column_level_names"] = pickle.dumps(accessor._level_names)

    column_headers, frames = serialize_columns(self._columns)
    header["columns"] = column_headers
    return header, frames
Expand All @@ -112,7 +117,20 @@ def deserialize(cls, header, frames):
cls_deserialize = pickle.loads(header["type-serialized"])
column_names = pickle.loads(header["column_names"])
columns = deserialize_columns(header["columns"], frames)
return cls_deserialize._from_data(dict(zip(column_names, columns)))
kwargs = {}
for metadata in [
"rangeindex",
"multiindex",
"label_dtype",
"level_names",
]:
key = f"column_{metadata}"
if key in header:
kwargs[metadata] = pickle.loads(header[key])
col_accessor = ColumnAccessor(
data=dict(zip(column_names, columns)), **kwargs
)
return cls_deserialize._from_data(col_accessor)

@classmethod
@_cudf_nvtx_annotate
Expand Down
28 changes: 20 additions & 8 deletions python/cudf/cudf/tests/test_serialize.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2018-2023, NVIDIA CORPORATION.
# Copyright (c) 2018-2024, NVIDIA CORPORATION.

import pickle

Expand All @@ -8,7 +8,6 @@
import pytest

import cudf
from cudf.core._compat import PANDAS_GE_200
from cudf.testing import _utils as utils
from cudf.testing._utils import assert_eq

Expand Down Expand Up @@ -302,12 +301,9 @@ def test_serialize_string():
"frames",
[
(cudf.Series([], dtype="str"), pd.Series([], dtype="str")),
pytest.param(
(cudf.DataFrame([]), pd.DataFrame([])),
marks=pytest.mark.xfail(
not PANDAS_GE_200, reason=".column returns Index[object]"
),
),
(cudf.DataFrame(), pd.DataFrame()),
(cudf.DataFrame([]), pd.DataFrame([])),
(cudf.DataFrame({}), pd.DataFrame({})),
(cudf.DataFrame([1]).head(0), pd.DataFrame([1]).head(0)),
(cudf.DataFrame({"a": []}), pd.DataFrame({"a": []})),
(
Expand Down Expand Up @@ -401,3 +397,19 @@ def test_serialize_sliced_string():

recreated = cudf.Series.deserialize(*sliced.serialize())
assert_eq(recreated.to_pandas(nullable=True), pd_series)


@pytest.mark.parametrize(
    "columns",
    [
        cudf.RangeIndex(2),
        cudf.Index([1, 2], dtype="int8"),
        cudf.MultiIndex(
            levels=[["a", "b"], [1, 2]],
            codes=[[0, 1], [0, 1]],
            names=["a", 0],
        ),
    ],
)
def test_serialize_column_types_preserved(columns):
    """Round-trip a DataFrame through serialize/deserialize.

    The frame is built with the parametrized ``columns`` object as its
    column labels, and the deserialized result is compared against the
    original with ``assert_eq``.
    """
    original = cudf.DataFrame([[10, 11]], columns=columns)
    payload = original.serialize()
    roundtripped = cudf.DataFrame.deserialize(*payload)
    assert_eq(roundtripped, original)

0 comments on commit d81ca78

Please sign in to comment.