From 51b363ec3f2e78b8da1d10a48872c9a6cc010cbe Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 15 Feb 2025 18:06:25 -0500 Subject: [PATCH 1/8] ENH(string dtype): Implement cumsum for Python-backed strings --- doc/source/whatsnew/v2.3.0.rst | 2 +- pandas/core/arrays/string_.py | 94 ++++++++++++++++++++++++ pandas/tests/apply/test_str.py | 15 ++-- pandas/tests/extension/test_string.py | 6 +- pandas/tests/groupby/test_categorical.py | 12 +-- pandas/tests/series/test_cumulative.py | 6 +- 6 files changed, 113 insertions(+), 22 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 8bdddb5b7f85d..1bd818a200928 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -37,7 +37,7 @@ Other enhancements updated to work correctly with NumPy >= 2 (:issue:`57739`) - :meth:`Series.str.decode` result now has ``StringDtype`` when ``future.infer_string`` is True (:issue:`60709`) - :meth:`~Series.to_hdf` and :meth:`~DataFrame.to_hdf` now round-trip with ``StringDtype`` (:issue:`60663`) -- The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns when backed by PyArrow (:issue:`60633`) +- The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns (:issue:`60633`, :issue:`???`) - The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 623a6a10c75b5..a472339998944 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -870,6 +870,100 @@ def _reduce( raise TypeError(f"Cannot perform reduction '{name}' with string dtype") + def _accumulate(self, name: str, *, skipna: bool = True, **kwargs) -> StringArray: + """ + Return an ExtensionArray performing an accumulation operation. + + The underlying data type might change. + + Parameters + ---------- + name : str + Name of the function, supported values are: + - cummin + - cummax + - cumsum + - cumprod + skipna : bool, default True + If True, skip NA values. + **kwargs + Additional keyword arguments passed to the accumulation function. + Currently, there is no supported kwarg. + + Returns + ------- + array + + Raises + ------ + NotImplementedError : subclass does not define accumulations + """ + if is_string_dtype(self): + return self._str_accumulate(name=name, skipna=skipna, **kwargs) + return super()._accumulate(name=name, skipna=skipna, **kwargs) + + def _str_accumulate( + self, name: str, *, skipna: bool = True, **kwargs + ) -> StringArray: + """ + Accumulate implementation for strings, see `_accumulate` docstring for details. + + pyarrow.compute does not implement these methods for strings. + """ + if name == "cumprod": + msg = f"operation '{name}' not supported for dtype '{self.dtype}'" + raise TypeError(msg) + + # We may need to strip out trailing NA values + tail: np.array | None = None + na_mask: np.array | None = None + ndarray = self._ndarray + np_func = { + "cumsum": np.cumsum, + "cummin": np.minimum.accumulate, + "cummax": np.maximum.accumulate, + }[name] + + from pandas.core import missing + + if self._hasna: + na_mask = isna(ndarray) + if np.all(na_mask): + return type(self)(ndarray) + if skipna: + if name == "cumsum": + ndarray = np.where(na_mask, "", ndarray) + else: + # We can retain the running min/max by forward/backward filling. + ndarray = ndarray.copy() + missing.pad_or_backfill_inplace( + ndarray.T, + method="pad", + axis=0, + ) + missing.pad_or_backfill_inplace( + ndarray.T, + method="backfill", + axis=0, + ) + else: + # When not skipping NA values, the result should be null from + # the first NA value onward. + idx = np.argmax(na_mask) + tail = np.empty(len(ndarray) - idx, dtype="object") + tail[:] = np.nan + ndarray = ndarray[:idx] + + np_result = np_func(ndarray) + + if tail is not None: + np_result = np.hstack((np_result, tail)) + elif na_mask is not None: + np_result = np.where(na_mask, np.nan, np_result) + + result = type(self)(np_result) + return result + def _wrap_reduction_result(self, axis: AxisInt | None, result) -> Any: if self.dtype.na_value is np.nan and result is libmissing.NA: # the masked_reductions use pd.NA -> convert to np.nan diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py index ce71cfec535e4..b7c454817cb53 100644 --- a/pandas/tests/apply/test_str.py +++ b/pandas/tests/apply/test_str.py @@ -5,7 +5,6 @@ import pytest from pandas.compat import ( - HAS_PYARROW, WASM, ) @@ -166,13 +165,13 @@ def test_agg_cython_table_transform_series(request, series, func, expected): # GH21224 # test transforming functions in # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum) - if series.dtype == "string" and func == "cumsum" and not HAS_PYARROW: - request.applymarker( - pytest.mark.xfail( - raises=NotImplementedError, - reason="TODO(infer_string) cumsum not yet implemented for string", - ) - ) + # if series.dtype == "string" and func == "cumsum" and not HAS_PYARROW: + # request.applymarker( + # pytest.mark.xfail( + # raises=NotImplementedError, + # reason="TODO(infer_string) cumsum not yet implemented for string", + # ) + # ) warn = None if isinstance(func, str) else FutureWarning with tm.assert_produces_warning(warn, match="is currently using Series.*"): result = series.agg(func) diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 6ce48e434d329..25129111180d6 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -196,11 +196,7 @@ def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool: assert isinstance(ser.dtype, StorageExtensionDtype) - return ser.dtype.storage == "pyarrow" and op_name in [ - "cummin", - "cummax", - "cumsum", - ] + return op_name in ["cummin", "cummax", "cumsum"] def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): dtype = cast(StringDtype, tm.get_dtype(obj)) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index e49be8c00b426..6f110ee8f8f21 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -325,9 +325,9 @@ def test_observed(request, using_infer_string, observed): # gh-8138 (back-compat) # gh-8869 - if using_infer_string and not observed: - # TODO(infer_string) this fails with filling the string column with 0 - request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) + # if using_infer_string and not observed: + # # TODO(infer_string) this fails with filling the string column with 0 + # request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True) cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True) @@ -355,10 +355,12 @@ def test_observed(request, using_infer_string, observed): ) result = gb.sum() if not observed: + fill_value = "" if using_infer_string else 0 expected = cartesian_product_for_groupers( - expected, [cat1, cat2], list("AB"), fill_value=0 + expected, [cat1, cat2], list("AB"), fill_value=fill_value ) - + print(result) + print(expected) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py index 89882d9d797c5..9357c1f278b75 100644 --- a/pandas/tests/series/test_cumulative.py +++ b/pandas/tests/series/test_cumulative.py @@ -266,12 +266,12 @@ def test_cumprod_timedelta(self): ], ) def test_cum_methods_pyarrow_strings( - self, pyarrow_string_dtype, data, op, skipna, expected_data + self, string_dtype_no_object, data, op, skipna, expected_data ): # https://github.com/pandas-dev/pandas/pull/60633 - ser = pd.Series(data, dtype=pyarrow_string_dtype) + ser = pd.Series(data, dtype=string_dtype_no_object) method = getattr(ser, op) - expected = pd.Series(expected_data, dtype=pyarrow_string_dtype) + expected = pd.Series(expected_data, dtype=string_dtype_no_object) result = method(skipna=skipna) tm.assert_series_equal(result, expected) From 188f92e349faa457bdbb838f71ef3ee33e9113f8 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 15 Feb 2025 18:10:11 -0500 Subject: [PATCH 2/8] cleanups --- doc/source/whatsnew/v2.3.0.rst | 2 +- pandas/core/arrays/string_.py | 5 +---- pandas/tests/apply/test_str.py | 9 +-------- pandas/tests/groupby/test_categorical.py | 11 ++++------- pandas/tests/series/test_cumulative.py | 5 +++-- 5 files changed, 10 insertions(+), 22 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 1bd818a200928..c21f8e8c46a6d 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -37,7 +37,7 @@ Other enhancements updated to work correctly with NumPy >= 2 (:issue:`57739`) - :meth:`Series.str.decode` result now has ``StringDtype`` when ``future.infer_string`` is True (:issue:`60709`) - :meth:`~Series.to_hdf` and :meth:`~DataFrame.to_hdf` now round-trip with ``StringDtype`` (:issue:`60663`) -- The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns (:issue:`60633`, :issue:`???`) +- The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns (:issue:`60633`, :issue:`60633`) - The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index a472339998944..10b2a62828ddb 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -49,6 +49,7 @@ ) from pandas.core import ( + missing, nanops, ops, ) @@ -907,8 +908,6 @@ def _str_accumulate( ) -> StringArray: """ Accumulate implementation for strings, see `_accumulate` docstring for details. - - pyarrow.compute does not implement these methods for strings. """ if name == "cumprod": msg = f"operation '{name}' not supported for dtype '{self.dtype}'" @@ -924,8 +923,6 @@ def _str_accumulate( "cummax": np.maximum.accumulate, }[name] - from pandas.core import missing - if self._hasna: na_mask = isna(ndarray) if np.all(na_mask): diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py index b7c454817cb53..e5a9492630b13 100644 --- a/pandas/tests/apply/test_str.py +++ b/pandas/tests/apply/test_str.py @@ -161,17 +161,10 @@ def test_agg_cython_table_series(series, func, expected): ), ), ) -def test_agg_cython_table_transform_series(request, series, func, expected): +def test_agg_cython_table_transform_series(series, func, expected): # GH21224 # test transforming functions in # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum) - # if series.dtype == "string" and func == "cumsum" and not HAS_PYARROW: - # request.applymarker( - # pytest.mark.xfail( - # raises=NotImplementedError, - # reason="TODO(infer_string) cumsum not yet implemented for string", - # ) - # ) warn = None if isinstance(func, str) else FutureWarning with tm.assert_produces_warning(warn, match="is currently using Series.*"): result = series.agg(func) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 6f110ee8f8f21..e050d475b744f 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -325,9 +325,9 @@ def test_observed(request, using_infer_string, observed): # gh-8138 (back-compat) # gh-8869 - # if using_infer_string and not observed: - # # TODO(infer_string) this fails with filling the string column with 0 - # request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) + if using_infer_string and not observed: + # TODO(infer_string) this fails with filling the string column with 0 + request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True) cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True) @@ -355,12 +355,9 @@ def test_observed(request, using_infer_string, observed): ) result = gb.sum() if not observed: - fill_value = "" if using_infer_string else 0 expected = cartesian_product_for_groupers( - expected, [cat1, cat2], list("AB"), fill_value=fill_value + expected, [cat1, cat2], list("AB"), fill_value=0 ) - print(result) - print(expected) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py index 9357c1f278b75..db83cf1112e74 100644 --- a/pandas/tests/series/test_cumulative.py +++ b/pandas/tests/series/test_cumulative.py @@ -265,10 +265,11 @@ def test_cumprod_timedelta(self): ([pd.NA, pd.NA, pd.NA], "cummax", False, [pd.NA, pd.NA, pd.NA]), ], ) - def test_cum_methods_pyarrow_strings( + def test_cum_methods_ea_strings( self, string_dtype_no_object, data, op, skipna, expected_data ): - # https://github.com/pandas-dev/pandas/pull/60633 + # https://github.com/pandas-dev/pandas/pull/60633 - pyarrow + # https://github.com/pandas-dev/pandas/pull/60938 - Python ser = pd.Series(data, dtype=string_dtype_no_object) method = getattr(ser, op) expected = pd.Series(expected_data, dtype=string_dtype_no_object) From c04969e61ad4ac13a8ddb4f227571925612ba64f Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 15 Feb 2025 18:11:33 -0500 Subject: [PATCH 3/8] cleanups --- pandas/core/arrays/string_.py | 10 ---------- pandas/tests/groupby/test_categorical.py | 1 + 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 10b2a62828ddb..69e3be87b2b9b 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -899,16 +899,6 @@ def _accumulate(self, name: str, *, skipna: bool = True, **kwargs) -> StringArra ------ NotImplementedError : subclass does not define accumulations """ - if is_string_dtype(self): - return self._str_accumulate(name=name, skipna=skipna, **kwargs) - return super()._accumulate(name=name, skipna=skipna, **kwargs) - - def _str_accumulate( - self, name: str, *, skipna: bool = True, **kwargs - ) -> StringArray: - """ - Accumulate implementation for strings, see `_accumulate` docstring for details. - """ if name == "cumprod": msg = f"operation '{name}' not supported for dtype '{self.dtype}'" raise TypeError(msg) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index e050d475b744f..e49be8c00b426 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -358,6 +358,7 @@ def test_observed(request, using_infer_string, observed): expected = cartesian_product_for_groupers( expected, [cat1, cat2], list("AB"), fill_value=0 ) + tm.assert_frame_equal(result, expected) From 7e116ef90e1cbd131f75bd699c46d2feb9ec9d7e Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 16 Feb 2025 12:46:31 -0500 Subject: [PATCH 4/8] type-hint fixup --- pandas/core/arrays/string_.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 69e3be87b2b9b..806db64d107ee 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -904,8 +904,8 @@ def _accumulate(self, name: str, *, skipna: bool = True, **kwargs) -> StringArra raise TypeError(msg) # We may need to strip out trailing NA values - tail: np.array | None = None - na_mask: np.array | None = None + tail: np.ndarray | None = None + na_mask: np.ndarray | None = None ndarray = self._ndarray np_func = { "cumsum": np.cumsum, From 0629fcfcc2bc7b9eb422a960c2985af5d55ad1ba Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 16 Feb 2025 13:16:07 -0500 Subject: [PATCH 5/8] More type fixes --- pandas/core/arrays/string_.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 806db64d107ee..9c46f2c8c6b87 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -23,6 +23,7 @@ ) from pandas._libs.arrays import NDArrayBacked from pandas._libs.lib import ensure_string_array +from pandas._typing import npt from pandas.compat import ( HAS_PYARROW, pa_version_under10p1, @@ -83,7 +84,6 @@ NumpyValueArrayLike, Scalar, Self, - npt, type_t, ) @@ -914,7 +914,7 @@ def _accumulate(self, name: str, *, skipna: bool = True, **kwargs) -> StringArra }[name] if self._hasna: - na_mask = isna(ndarray) + na_mask = cast(npt.NDArray[np.bool_], isna(ndarray)) if np.all(na_mask): return type(self)(ndarray) if skipna: @@ -941,7 +941,8 @@ def _accumulate(self, name: str, *, skipna: bool = True, **kwargs) -> StringArra tail[:] = np.nan ndarray = ndarray[:idx] - np_result = np_func(ndarray) + # mypy: Cannot call function of unknown type + np_result = np_func(ndarray) # type: ignore[operator] if tail is not None: np_result = np.hstack((np_result, tail)) From d0f6673d94ac3e7cf63c62ef8ff4504d3cef0d01 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 16 Feb 2025 14:02:34 -0500 Subject: [PATCH 6/8] Use quotes for cast --- pandas/core/arrays/string_.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 9c46f2c8c6b87..c8cba904e802d 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -23,7 +23,6 @@ ) from pandas._libs.arrays import NDArrayBacked from pandas._libs.lib import ensure_string_array -from pandas._typing import npt from pandas.compat import ( HAS_PYARROW, pa_version_under10p1, @@ -84,6 +83,7 @@ NumpyValueArrayLike, Scalar, Self, + npt, type_t, ) @@ -914,7 +914,7 @@ def _accumulate(self, name: str, *, skipna: bool = True, **kwargs) -> StringArra }[name] if self._hasna: - na_mask = cast(npt.NDArray[np.bool_], isna(ndarray)) + na_mask = cast("npt.NDArray[np.bool_]", isna(ndarray)) if np.all(na_mask): return type(self)(ndarray) if skipna: From 7f9571de0f13eb37630ae2a2d3e9d65adb1e978f Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 17 Feb 2025 16:21:09 -0500 Subject: [PATCH 7/8] Refinements --- pandas/core/arrays/string_.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index c8cba904e802d..6a57762b6c6bf 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -924,12 +924,12 @@ def _accumulate(self, name: str, *, skipna: bool = True, **kwargs) -> StringArra # We can retain the running min/max by forward/backward filling. ndarray = ndarray.copy() missing.pad_or_backfill_inplace( - ndarray.T, + ndarray, method="pad", axis=0, ) missing.pad_or_backfill_inplace( - ndarray.T, + ndarray, method="backfill", axis=0, ) @@ -938,7 +938,7 @@ def _accumulate(self, name: str, *, skipna: bool = True, **kwargs) -> StringArra # the first NA value onward. idx = np.argmax(na_mask) tail = np.empty(len(ndarray) - idx, dtype="object") - tail[:] = np.nan + tail[:] = self.dtype.na_value ndarray = ndarray[:idx] # mypy: Cannot call function of unknown type @@ -947,7 +947,7 @@ def _accumulate(self, name: str, *, skipna: bool = True, **kwargs) -> StringArra if tail is not None: np_result = np.hstack((np_result, tail)) elif na_mask is not None: - np_result = np.where(na_mask, np.nan, np_result) + np_result = np.where(na_mask, self.dtype.na_value, np_result) result = type(self)(np_result) return result From 2fd977937e12e271256b2fe073c848ae47e2068a Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 17 Feb 2025 17:30:13 -0500 Subject: [PATCH 8/8] type-ignore --- pandas/core/arrays/string_.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 6a57762b6c6bf..7227ea77ca433 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -947,7 +947,8 @@ def _accumulate(self, name: str, *, skipna: bool = True, **kwargs) -> StringArra if tail is not None: np_result = np.hstack((np_result, tail)) elif na_mask is not None: - np_result = np.where(na_mask, self.dtype.na_value, np_result) + # Argument 2 to "where" has incompatible type "NAType | float" + np_result = np.where(na_mask, self.dtype.na_value, np_result) # type: ignore[arg-type] result = type(self)(np_result) return result