Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[backport 2.3.x] BUG (string dtype): replace with non-string to fall back to object dtype (#60285) #60292

Merged
merged 3 commits into from
Nov 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v2.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -107,10 +107,10 @@ Conversion
Strings
^^^^^^^
- Bug in :meth:`Series.rank` for :class:`StringDtype` with ``storage="pyarrow"`` incorrectly returning integer results in case of ``method="average"`` and raising an error if it would truncate results (:issue:`59768`)
- Bug in :meth:`Series.replace` with :class:`StringDtype` when replacing with a non-string value was not upcasting to ``object`` dtype (:issue:`60282`)
- Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`)
- Bug in ``ser.str.slice`` with negative ``step`` with :class:`ArrowDtype` and :class:`StringDtype` with ``storage="pyarrow"`` giving incorrect results (:issue:`59710`)
- Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`)
-

Interval
^^^^^^^^
Expand Down
43 changes: 25 additions & 18 deletions pandas/core/arrays/string_.py
Original file line number Diff line number Diff line change
Expand Up @@ -726,20 +726,9 @@ def _values_for_factorize(self) -> tuple[np.ndarray, libmissing.NAType | float]:

return arr, self.dtype.na_value

def __setitem__(self, key, value) -> None:
value = extract_array(value, extract_numpy=True)
if isinstance(value, type(self)):
# extract_array doesn't extract NumpyExtensionArray subclasses
value = value._ndarray

key = check_array_indexer(self, key)
scalar_key = lib.is_scalar(key)
scalar_value = lib.is_scalar(value)
if scalar_key and not scalar_value:
raise ValueError("setting an array element with a sequence.")

# validate new items
if scalar_value:
def _maybe_convert_setitem_value(self, value):
"""Maybe convert value to be pyarrow compatible."""
if lib.is_scalar(value):
if isna(value):
value = self.dtype.na_value
elif not isinstance(value, str):
Expand All @@ -749,8 +738,11 @@ def __setitem__(self, key, value) -> None:
"instead."
)
else:
value = extract_array(value, extract_numpy=True)
if not is_array_like(value):
value = np.asarray(value, dtype=object)
elif isinstance(value.dtype, type(self.dtype)):
return value
else:
# cast categories and friends to arrays to see if values are
# compatible, compatibility with arrow backed strings
Expand All @@ -760,11 +752,26 @@ def __setitem__(self, key, value) -> None:
"Invalid value for dtype 'str'. Value should be a "
"string or missing value (or array of those)."
)
return value

mask = isna(value)
if mask.any():
value = value.copy()
value[isna(value)] = self.dtype.na_value
def __setitem__(self, key, value) -> None:
value = self._maybe_convert_setitem_value(value)

key = check_array_indexer(self, key)
scalar_key = lib.is_scalar(key)
scalar_value = lib.is_scalar(value)
if scalar_key and not scalar_value:
raise ValueError("setting an array element with a sequence.")

if not scalar_value:
if value.dtype == self.dtype:
value = value._ndarray
else:
value = np.asarray(value)
mask = isna(value)
if mask.any():
value = value.copy()
value[isna(value)] = self.dtype.na_value

super().__setitem__(key, value)

Expand Down
7 changes: 7 additions & 0 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -1754,6 +1754,13 @@ def can_hold_element(arr: ArrayLike, element: Any) -> bool:
except (ValueError, TypeError):
return False

if dtype == "string":
try:
arr._maybe_convert_setitem_value(element) # type: ignore[union-attr]
return True
except (ValueError, TypeError):
return False

# This is technically incorrect, but maintains the behavior of
# ExtensionBlock._can_hold_element
return True
Expand Down
40 changes: 32 additions & 8 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@
ABCNumpyExtensionArray,
ABCSeries,
)
from pandas.core.dtypes.inference import is_re
from pandas.core.dtypes.missing import (
is_valid_na_for_dtype,
isna,
Expand Down Expand Up @@ -115,6 +116,7 @@
PeriodArray,
TimedeltaArray,
)
from pandas.core.arrays.string_ import StringDtype
from pandas.core.base import PandasObject
import pandas.core.common as com
from pandas.core.computation import expressions
Expand Down Expand Up @@ -476,7 +478,9 @@ def split_and_operate(self, func, *args, **kwargs) -> list[Block]:
# Up/Down-casting

@final
def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block:
def coerce_to_target_dtype(
self, other, warn_on_upcast: bool = False, using_cow: bool = False
) -> Block:
"""
coerce the current block to a dtype compat for other
we will return a block, possibly object, and not raise
Expand Down Expand Up @@ -528,7 +532,14 @@ def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block:
f"{self.values.dtype}. Please report a bug at "
"https://github.com/pandas-dev/pandas/issues."
)
return self.astype(new_dtype, copy=False)
copy = False
if (
not using_cow
and isinstance(self.dtype, StringDtype)
and self.dtype.storage == "python"
):
copy = True
return self.astype(new_dtype, copy=copy, using_cow=using_cow)

@final
def _maybe_downcast(
Expand Down Expand Up @@ -879,7 +890,7 @@ def replace(
else:
return [self] if inplace else [self.copy()]

elif self._can_hold_element(value):
elif self._can_hold_element(value) or (self.dtype == "string" and is_re(value)):
# TODO(CoW): Maybe split here as well into columns where mask has True
# and rest?
blk = self._maybe_copy(using_cow, inplace)
Expand Down Expand Up @@ -926,12 +937,13 @@ def replace(
if value is None or value is NA:
blk = self.astype(np.dtype(object))
else:
blk = self.coerce_to_target_dtype(value)
blk = self.coerce_to_target_dtype(value, using_cow=using_cow)
return blk.replace(
to_replace=to_replace,
value=value,
inplace=True,
mask=mask,
using_cow=using_cow,
)

else:
Expand Down Expand Up @@ -980,16 +992,26 @@ def _replace_regex(
-------
List[Block]
"""
if not self._can_hold_element(to_replace):
if not is_re(to_replace) and not self._can_hold_element(to_replace):
# i.e. only if self.is_object is True, but could in principle include a
# String ExtensionBlock
if using_cow:
return [self.copy(deep=False)]
return [self] if inplace else [self.copy()]

rx = re.compile(to_replace)
if is_re(to_replace) and self.dtype not in [object, "string"]:
# only object or string dtype can hold strings, and a regex object
# will only match strings
return [self.copy(deep=False)]

block = self._maybe_copy(using_cow, inplace)
if not (
self._can_hold_element(value) or (self.dtype == "string" and is_re(value))
):
block = self.astype(np.dtype(object))
else:
block = self._maybe_copy(using_cow, inplace)

rx = re.compile(to_replace)

replace_regex(block.values, rx, value, mask)

Expand Down Expand Up @@ -1048,7 +1070,9 @@ def replace_list(

# Exclude anything that we know we won't contain
pairs = [
(x, y) for x, y in zip(src_list, dest_list) if self._can_hold_element(x)
(x, y)
for x, y in zip(src_list, dest_list)
if (self._can_hold_element(x) or (self.dtype == "string" and is_re(x)))
]
if not len(pairs):
if using_cow:
Expand Down
9 changes: 0 additions & 9 deletions pandas/tests/frame/methods/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,6 @@ def test_regex_replace_dict_nested_non_first_character(
expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype)
tm.assert_frame_equal(result, expected)

@pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
def test_regex_replace_dict_nested_gh4115(self):
df = DataFrame({"Type": ["Q", "T", "Q", "Q", "T"], "tmp": 2})
expected = DataFrame({"Type": [0, 1, 0, 0, 1], "tmp": 2})
Expand Down Expand Up @@ -556,7 +555,6 @@ def test_replace_series_dict(self):
result = df.replace(s, df.mean())
tm.assert_frame_equal(result, expected)

@pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
def test_replace_convert(self):
# gh 3907
df = DataFrame([["foo", "bar", "bah"], ["bar", "foo", "bah"]])
Expand Down Expand Up @@ -932,7 +930,6 @@ def test_replace_input_formats_listlike(self):
with pytest.raises(ValueError, match=msg):
df.replace(to_rep, values[1:])

@pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
def test_replace_input_formats_scalar(self):
df = DataFrame(
{"A": [np.nan, 0, np.inf], "B": [0, 2, 5], "C": ["", "asdf", "fd"]}
Expand Down Expand Up @@ -961,7 +958,6 @@ def test_replace_limit(self):
# TODO
pass

@pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
def test_replace_dict_no_regex(self):
answer = Series(
{
Expand All @@ -985,7 +981,6 @@ def test_replace_dict_no_regex(self):
result = answer.replace(weights)
tm.assert_series_equal(result, expected)

@pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
def test_replace_series_no_regex(self):
answer = Series(
{
Expand Down Expand Up @@ -1104,7 +1099,6 @@ def test_replace_swapping_bug(self, using_infer_string):
expect = DataFrame({"a": ["Y", "N", "Y"]})
tm.assert_frame_equal(res, expect)

@pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
def test_replace_period(self):
d = {
"fname": {
Expand Down Expand Up @@ -1141,7 +1135,6 @@ def test_replace_period(self):
result = df.replace(d)
tm.assert_frame_equal(result, expected)

@pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
def test_replace_datetime(self):
d = {
"fname": {
Expand Down Expand Up @@ -1367,7 +1360,6 @@ def test_replace_commutative(self, df, to_replace, exp):
result = df.replace(to_replace)
tm.assert_frame_equal(result, expected)

@pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
@pytest.mark.parametrize(
"replacer",
[
Expand Down Expand Up @@ -1644,7 +1636,6 @@ def test_regex_replace_scalar(
expected.loc[expected["a"] == ".", "a"] = expected_replace_val
tm.assert_frame_equal(result, expected)

@pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
@pytest.mark.parametrize("regex", [False, True])
def test_replace_regex_dtype_frame(self, regex):
# GH-48644
Expand Down
18 changes: 5 additions & 13 deletions pandas/tests/series/indexing/test_setitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -886,24 +886,16 @@ def test_index_where(self, obj, key, expected, warn, val):
mask = np.zeros(obj.shape, dtype=bool)
mask[key] = True

if obj.dtype == "string" and not (isinstance(val, str) or isna(val)):
with pytest.raises(TypeError, match="Invalid value"):
Index(obj, dtype=obj.dtype).where(~mask, val)
else:
res = Index(obj, dtype=obj.dtype).where(~mask, val)
expected_idx = Index(expected, dtype=expected.dtype)
tm.assert_index_equal(res, expected_idx)
res = Index(obj, dtype=obj.dtype).where(~mask, val)
expected_idx = Index(expected, dtype=expected.dtype)
tm.assert_index_equal(res, expected_idx)

def test_index_putmask(self, obj, key, expected, warn, val):
mask = np.zeros(obj.shape, dtype=bool)
mask[key] = True

if obj.dtype == "string" and not (isinstance(val, str) or isna(val)):
with pytest.raises(TypeError, match="Invalid value"):
Index(obj, dtype=obj.dtype).putmask(mask, val)
else:
res = Index(obj, dtype=obj.dtype).putmask(mask, val)
tm.assert_index_equal(res, Index(expected, dtype=expected.dtype))
res = Index(obj, dtype=obj.dtype).putmask(mask, val)
tm.assert_index_equal(res, Index(expected, dtype=expected.dtype))


@pytest.mark.parametrize(
Expand Down
16 changes: 6 additions & 10 deletions pandas/tests/series/methods/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,10 +403,6 @@ def test_replace_categorical(self, categorical, numeric, using_infer_string):
ser = pd.Series(categorical)
msg = "Downcasting behavior in `replace`"
msg = "with CategoricalDtype is deprecated"
if using_infer_string:
with pytest.raises(TypeError, match="Invalid value"):
ser.replace({"A": 1, "B": 2})
return
with tm.assert_produces_warning(FutureWarning, match=msg):
result = ser.replace({"A": 1, "B": 2})
expected = pd.Series(numeric).astype("category")
Expand Down Expand Up @@ -745,13 +741,13 @@ def test_replace_regex_dtype_series(self, regex):
tm.assert_series_equal(result, expected)

@pytest.mark.parametrize("regex", [False, True])
def test_replace_regex_dtype_series_string(self, regex, using_infer_string):
if not using_infer_string:
# then this is object dtype which is already tested above
return
def test_replace_regex_dtype_series_string(self, regex):
series = pd.Series(["0"], dtype="str")
with pytest.raises(TypeError, match="Invalid value"):
series.replace(to_replace="0", value=1, regex=regex)
expected = pd.Series([1], dtype="int64")
msg = "Downcasting behavior in `replace`"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = series.replace(to_replace="0", value=1, regex=regex)
tm.assert_series_equal(result, expected)

def test_replace_different_int_types(self, any_int_numpy_dtype):
# GH#45311
Expand Down
Loading