pandas-dev · kastkeepitjumpinlikekangaroos · Oct 25, 2024 · Oct 25, 2024 · Nov 12, 2024 · Nov 12, 2024
diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst
@@ -134,6 +134,7 @@ MultiIndex
 I/O
 ^^^
 - :meth:`DataFrame.to_excel` was storing decimals as strings instead of numbers (:issue:`49598`)
+- Bug in :func:`read_sql` raising an exception when a column containing bytes that cannot be decoded to a string was read with ``dtype_backend="pyarrow"`` or ``dtype_backend="numpy_nullable"`` (:issue:`59242`)
 -
 
 Period

diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
@@ -17,6 +17,7 @@
 from pandas._config import using_string_dtype
 
 from pandas._libs import lib
+from pandas.compat._optional import import_optional_dependency
 
 from pandas.core.dtypes.astype import astype_is_view
 from pandas.core.dtypes.cast import (
@@ -34,7 +35,10 @@
     is_object_dtype,
     is_scalar,
 )
-from pandas.core.dtypes.dtypes import ExtensionDtype
+from pandas.core.dtypes.dtypes import (
+    ArrowDtype,
+    ExtensionDtype,
+)
 from pandas.core.dtypes.generic import (
     ABCDataFrame,
     ABCSeries,
@@ -968,9 +972,31 @@ def convert(arr):
                     # i.e. maybe_convert_objects didn't convert
                     arr = maybe_infer_to_datetimelike(arr)
                     if dtype_backend != "numpy" and arr.dtype == np.dtype("O"):
-                        new_dtype = StringDtype()
-                        arr_cls = new_dtype.construct_array_type()
-                        arr = arr_cls._from_sequence(arr, dtype=new_dtype)
+                        # GH#59242: object arrays can hold raw bytes that are
+                        # not valid UTF-8; casting those to a string dtype
+                        # raises, so fall back instead of propagating the error.
+
+                        # Try to convert to string first
+                        if dtype_backend == "pyarrow":
+                            pa = import_optional_dependency("pyarrow")
+                            try:
+                                str_dtype = ArrowDtype(pa.string())
+                                str_cls = str_dtype.construct_array_type()
+                                arr = str_cls._from_sequence(arr, dtype=str_dtype)
+                            except pa.lib.ArrowInvalid:
+                                # in this case convert to pyarrow binary
+                                bin_dtype = ArrowDtype(pa.binary())
+                                bin_cls = bin_dtype.construct_array_type()
+                                arr = bin_cls._from_sequence(arr, dtype=bin_dtype)
+                        else:
+                            try:
+                                new_dtype = StringDtype()
+                                arr_cls = new_dtype.construct_array_type()
+                                arr = arr_cls._from_sequence(arr, dtype=new_dtype)
+                            except UnicodeDecodeError:
+                                # undecodable bytes: keep arr as the object array
+                                pass
+
                 elif dtype_backend != "numpy" and isinstance(arr, np.ndarray):
                     if arr.dtype.kind in "iufb":
                         arr = pd_array(arr, copy=False)

diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py
@@ -4355,3 +4355,19 @@ def test_xsqlite_if_exists(sqlite_buildin):
         (5, "E"),
     ]
     drop_table(table_name, sqlite_buildin)
+
+
+@pytest.mark.parametrize("dtype_backend", ["pyarrow", "numpy_nullable", lib.no_default])
+def test_bytes_column(sqlite_buildin, dtype_backend):
+    """
+    Regression test for GH#59242.
+
+    A BLOB column holding bytes that are not valid UTF-8 must be returned
+    as raw bytes instead of raising while being converted to a string.
+    """
+    pytest.importorskip("pyarrow")
+    query = """
+    select cast(x'0123456789abcdef0123456789abcdef' as blob) a
+    """
+    df = pd.read_sql(query, sqlite_buildin, dtype_backend=dtype_backend)
+    assert df.a.values[0] == b"\x01#Eg\x89\xab\xcd\xef\x01#Eg\x89\xab\xcd\xef"