Skip to content

Commit 564b7d2

Browse files
authored
PERF: DataFrame.astype where dtype is an ExtensionDtype (#54299)
1 parent aefede9 commit 564b7d2

File tree

3 files changed: 43 additions and 6 deletions.

Diff for: asv_bench/benchmarks/frame_methods.py

+34
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,40 @@
1717
from .pandas_vb_common import tm
1818

1919

20+
class AsType:
    """Benchmark ``DataFrame.astype`` for pairs of source/target dtypes.

    Each benchmark case is a ``(from_dtype, to_dtype)`` pair combined with a
    ``copy`` flag.  The pairs cover identical extension dtypes, NumPy ->
    extension, extension -> NumPy and extension -> extension conversions.
    """

    params = [
        [
            # source and target dtype are identical
            ("Float64", "Float64"),
            ("float64[pyarrow]", "float64[pyarrow]"),
            # non-extension dtype -> extension dtype
            ("float64", "Float64"),
            ("float64", "float64[pyarrow]"),
            # extension dtype -> non-extension dtype
            ("Float64", "float64"),
            ("float64[pyarrow]", "float64"),
            # extension dtype -> a different extension dtype
            ("Int64", "Float64"),
            ("int64[pyarrow]", "float64[pyarrow]"),
        ],
        [False, True],
    ]
    param_names = ["from_to_dtypes", "copy"]

    def setup(self, from_to_dtypes, copy):
        """Create ``self.df``, a 100x100 frame stored with the source dtype.

        Float-like source dtypes are filled from ``np.random.randn`` and
        integer-like ones from ``np.random.randint(0, 1000, ...)``.

        Raises
        ------
        NotImplementedError
            If the source dtype is not one of the known float/int names.
        """
        source_dtype = from_to_dtypes[0]
        is_float_like = source_dtype in ("float64", "Float64", "float64[pyarrow]")
        is_int_like = source_dtype in ("int64", "Int64", "int64[pyarrow]")

        # Reject anything we do not know how to generate data for.
        if not (is_float_like or is_int_like):
            raise NotImplementedError

        if is_float_like:
            values = np.random.randn(100, 100)
        else:
            values = np.random.randint(0, 1000, (100, 100))
        self.df = DataFrame(values, dtype=source_dtype)

    def time_astype(self, from_to_dtypes, copy):
        """Time converting the prepared frame to the target dtype."""
        target_dtype = from_to_dtypes[1]
        self.df.astype(target_dtype, copy=copy)
52+
53+
2054
class Clip:
2155
params = [
2256
["float64", "Float64", "float64[pyarrow]"],

Diff for: doc/source/whatsnew/v2.1.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -451,6 +451,7 @@ Performance improvements
451451
- Performance improvement in :func:`concat` when ``axis=1`` and objects have different indexes (:issue:`52541`)
452452
- Performance improvement in :func:`concat` when the concatenation axis is a :class:`MultiIndex` (:issue:`53574`)
453453
- Performance improvement in :meth:`.DataFrameGroupBy.groups` (:issue:`53088`)
454+
- Performance improvement in :meth:`DataFrame.astype` when ``dtype`` is an extension dtype (:issue:`54299`)
454455
- Performance improvement in :meth:`DataFrame.isin` for extension dtypes (:issue:`53514`)
455456
- Performance improvement in :meth:`DataFrame.loc` when selecting rows and columns (:issue:`53014`)
456457
- Performance improvement in :meth:`DataFrame.transpose` when transposing a DataFrame with a single masked dtype, e.g. :class:`Int64` (:issue:`52836`)

Diff for: pandas/core/generic.py

+8-6
Original file line number | Diff line number | Diff line change
@@ -6511,13 +6511,15 @@ def astype(
65116511
results.append(res_col)
65126512

65136513
elif is_extension_array_dtype(dtype) and self.ndim > 1:
6514-
# GH 18099/22869: columnwise conversion to extension dtype
6515-
# GH 24704: use iloc to handle duplicate column names
65166514
# TODO(EA2D): special case not needed with 2D EAs
6517-
results = [
6518-
self.iloc[:, i].astype(dtype, copy=copy)
6519-
for i in range(len(self.columns))
6520-
]
6515+
dtype = pandas_dtype(dtype)
6516+
if isinstance(dtype, ExtensionDtype) and all(
6517+
arr.dtype == dtype for arr in self._mgr.arrays
6518+
):
6519+
return self.copy(deep=copy)
6520+
# GH 18099/22869: columnwise conversion to extension dtype
6521+
# GH 24704: self.items handles duplicate column names
6522+
results = [ser.astype(dtype, copy=copy) for _, ser in self.items()]
65216523

65226524
else:
65236525
# else, only a single dtype is given

0 commit comments

Comments
 (0)