Skip to content

Commit

Permalink
Fallback to StringDtype(python) instead
Browse files Browse the repository at this point in the history
  • Loading branch information
rhshadrach committed Feb 23, 2025
1 parent d621cd0 commit e3e64f4
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 19 deletions.
66 changes: 49 additions & 17 deletions pandas/io/pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
)
from pandas._libs.lib import is_string_array
from pandas._libs.tslibs import timezones
from pandas.compat import HAS_PYARROW
from pandas.compat._optional import import_optional_dependency
from pandas.compat.pickle_compat import patch_pickle
from pandas.errors import (
Expand Down Expand Up @@ -376,6 +377,13 @@ def read_hdf(
object
The selected object. Return type depends on the object stored.
Notes
-----
When ``errors="surrogatepass"``, ``pd.options.future.infer_string`` is true,
and PyArrow is installed, if a UTF-16 surrogate is encountered when encoding
to UTF-8, the resulting dtype will be
``pd.StringDtype(storage="python", na_value=np.nan)``.
See Also
--------
DataFrame.to_hdf : Write a HDF file from a DataFrame.
Expand Down Expand Up @@ -2257,6 +2265,20 @@ def convert(
# making an Index instance could throw a number of different errors
try:
new_pd_index = factory(values, **kwargs)
except UnicodeEncodeError as err:
if (
errors == "surrogatepass"
and get_option("future.infer_string")
and str(err).endswith("surrogates not allowed")
and HAS_PYARROW
):
new_pd_index = factory(
values,
dtype=StringDtype(storage="python", na_value=np.nan),
**kwargs,
)
else:
raise
except ValueError:
# if the output freq is different than what we recorded,
# it should be None (see also 'doc example part 2')
Expand Down Expand Up @@ -3182,12 +3204,13 @@ def read_index_node(
self.errors == "surrogatepass"
and get_option("future.infer_string")
and str(err).endswith("surrogates not allowed")
and HAS_PYARROW
):
index = factory(
_unconvert_index(
data, kind, encoding=self.encoding, errors=self.errors
),
dtype="object",
dtype=StringDtype(storage="python", na_value=np.nan),
**kwargs,
)
else:
Expand Down Expand Up @@ -3332,11 +3355,16 @@ def read(
except UnicodeEncodeError as err:
if (
self.errors == "surrogatepass"
and using_string_dtype()
and get_option("future.infer_string")
and str(err).endswith("surrogates not allowed")
and HAS_PYARROW
):
result = Series(
values, index=index, name=self.name, copy=False, dtype="object"
values,
index=index,
name=self.name,
copy=False,
dtype=StringDtype(storage="python", na_value=np.nan),
)
else:
raise
Expand Down Expand Up @@ -4786,7 +4814,24 @@ def read(
values = values.reshape((1, values.shape[0]))

if isinstance(values, (np.ndarray, DatetimeArray)):
df = DataFrame(values.T, columns=cols_, index=index_, copy=False)
try:
df = DataFrame(values.T, columns=cols_, index=index_, copy=False)
except UnicodeEncodeError as err:
if (
self.errors == "surrogatepass"
and get_option("future.infer_string")
and str(err).endswith("surrogates not allowed")
and HAS_PYARROW
):
df = DataFrame(
values.T,
columns=cols_,
index=index_,
copy=False,
dtype=StringDtype(storage="python", na_value=np.nan),
)
else:
raise
elif isinstance(values, Index):
df = DataFrame(values, columns=cols_, index=index_)
else:
Expand All @@ -4796,23 +4841,10 @@ def read(
assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype)

# If str / string dtype is stored in meta, use that.
converted = False
for column in cols_:
dtype = getattr(self.table.attrs, f"{column}_meta", None)
if dtype in ["str", "string"]:
df[column] = df[column].astype(dtype)
converted = True
# Otherwise try inference.
if (
not converted
and using_string_dtype()
and isinstance(values, np.ndarray)
and is_string_array(
values,
skipna=True,
)
):
df = df.astype(StringDtype(na_value=np.nan))
frames.append(df)

if len(frames) == 1:
Expand Down
12 changes: 10 additions & 2 deletions pandas/tests/io/pytables/test_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -383,15 +383,23 @@ def test_to_hdf_with_min_itemsize(tmp_path, setup_path):


@pytest.mark.parametrize("format", ["fixed", "table"])
def test_to_hdf_errors(tmp_path, format, setup_path):
def test_to_hdf_errors(tmp_path, format, setup_path, using_infer_string):
data = ["\ud800foo"]
ser = Series(data, index=Index(data, dtype="object"), dtype="object")
path = tmp_path / setup_path
# GH 20835
ser.to_hdf(path, key="table", format=format, errors="surrogatepass")

result = read_hdf(path, "table", errors="surrogatepass")
tm.assert_series_equal(result, ser)

if using_infer_string:
# https://github.com/pandas-dev/pandas/pull/60993
# Surrogates fall back to python storage.
dtype = pd.StringDtype(storage="python", na_value=np.nan)
else:
dtype = "object"
expected = Series(data, index=Index(data, dtype=dtype), dtype=dtype)
tm.assert_series_equal(result, expected)


def test_create_table_index(setup_path):
Expand Down

0 comments on commit e3e64f4

Please sign in to comment.