Fallback to StringDtype(python) instead

rhshadrach · Feb 23, 2025 · e3e64f4 · e3e64f4
1 parent d621cd0
commit e3e64f4
Show file tree

Hide file tree

Showing 2 changed files with 59 additions and 19 deletions.
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
@@ -39,6 +39,7 @@
 )
 from pandas._libs.lib import is_string_array
 from pandas._libs.tslibs import timezones
+from pandas.compat import HAS_PYARROW
 from pandas.compat._optional import import_optional_dependency
 from pandas.compat.pickle_compat import patch_pickle
 from pandas.errors import (
@@ -376,6 +377,13 @@ def read_hdf(
     object
         The selected object. Return type depends on the object stored.
 
+    Notes
+    -----
+    When ``errors="surrogatepass"``, ``pd.options.future.infer_string`` is true,
+    and PyArrow is installed, data containing a UTF-16 surrogate cannot be
+    encoded to UTF-8. In that case the resulting dtype falls back to
+    ``pd.StringDtype(storage="python", na_value=np.nan)``.
+
     See Also
     --------
     DataFrame.to_hdf : Write a HDF file from a DataFrame.
@@ -2257,6 +2265,20 @@ def convert(
         # making an Index instance could throw a number of different errors
         try:
             new_pd_index = factory(values, **kwargs)
+        except UnicodeEncodeError as err:
+            if (
+                errors == "surrogatepass"
+                and get_option("future.infer_string")
+                and str(err).endswith("surrogates not allowed")
+                and HAS_PYARROW
+            ):
+                new_pd_index = factory(
+                    values,
+                    dtype=StringDtype(storage="python", na_value=np.nan),
+                    **kwargs,
+                )
+            else:
+                raise
         except ValueError:
             # if the output freq is different than what we recorded,
             # it should be None (see also 'doc example part 2')
@@ -3182,12 +3204,13 @@ def read_index_node(
                     self.errors == "surrogatepass"
                     and get_option("future.infer_string")
                     and str(err).endswith("surrogates not allowed")
+                    and HAS_PYARROW
                 ):
                     index = factory(
                         _unconvert_index(
                             data, kind, encoding=self.encoding, errors=self.errors
                         ),
-                        dtype="object",
+                        dtype=StringDtype(storage="python", na_value=np.nan),
                         **kwargs,
                     )
                 else:
@@ -3332,11 +3355,16 @@ def read(
         except UnicodeEncodeError as err:
             if (
                 self.errors == "surrogatepass"
-                and using_string_dtype()
+                and get_option("future.infer_string")
                 and str(err).endswith("surrogates not allowed")
+                and HAS_PYARROW
             ):
                 result = Series(
-                    values, index=index, name=self.name, copy=False, dtype="object"
+                    values,
+                    index=index,
+                    name=self.name,
+                    copy=False,
+                    dtype=StringDtype(storage="python", na_value=np.nan),
                 )
             else:
                 raise
@@ -4786,7 +4814,24 @@ def read(
                 values = values.reshape((1, values.shape[0]))
 
             if isinstance(values, (np.ndarray, DatetimeArray)):
-                df = DataFrame(values.T, columns=cols_, index=index_, copy=False)
+                try:
+                    df = DataFrame(values.T, columns=cols_, index=index_, copy=False)
+                except UnicodeEncodeError as err:
+                    if (
+                        self.errors == "surrogatepass"
+                        and get_option("future.infer_string")
+                        and str(err).endswith("surrogates not allowed")
+                        and HAS_PYARROW
+                    ):
+                        df = DataFrame(
+                            values.T,
+                            columns=cols_,
+                            index=index_,
+                            copy=False,
+                            dtype=StringDtype(storage="python", na_value=np.nan),
+                        )
+                    else:
+                        raise
             elif isinstance(values, Index):
                 df = DataFrame(values, columns=cols_, index=index_)
             else:
@@ -4796,23 +4841,10 @@ def read(
                 assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype)
 
             # If str / string dtype is stored in meta, use that.
-            converted = False
             for column in cols_:
                 dtype = getattr(self.table.attrs, f"{column}_meta", None)
                 if dtype in ["str", "string"]:
                     df[column] = df[column].astype(dtype)
-                    converted = True
-            # Otherwise try inference.
-            if (
-                not converted
-                and using_string_dtype()
-                and isinstance(values, np.ndarray)
-                and is_string_array(
-                    values,
-                    skipna=True,
-                )
-            ):
-                df = df.astype(StringDtype(na_value=np.nan))
             frames.append(df)
 
         if len(frames) == 1:

diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py
@@ -383,15 +383,23 @@ def test_to_hdf_with_min_itemsize(tmp_path, setup_path):
 
 
 @pytest.mark.parametrize("format", ["fixed", "table"])
-def test_to_hdf_errors(tmp_path, format, setup_path):
+def test_to_hdf_errors(tmp_path, format, setup_path, using_infer_string):
     data = ["\ud800foo"]
     ser = Series(data, index=Index(data, dtype="object"), dtype="object")
     path = tmp_path / setup_path
     # GH 20835
     ser.to_hdf(path, key="table", format=format, errors="surrogatepass")
 
     result = read_hdf(path, "table", errors="surrogatepass")
-    tm.assert_series_equal(result, ser)
+
+    if using_infer_string:
+        # https://github.com/pandas-dev/pandas/pull/60993
+        # Surrogates fall back to python storage.
+        dtype = pd.StringDtype(storage="python", na_value=np.nan)
+    else:
+        dtype = "object"
+    expected = Series(data, index=Index(data, dtype=dtype), dtype=dtype)
+    tm.assert_series_equal(result, expected)
 
 
 def test_create_table_index(setup_path):