From 224486925e5fd278963164064cc7b6d559f586e1 Mon Sep 17 00:00:00 2001
From: Owen Christie <owendot2011@gmail.com>
Date: Fri, 25 Oct 2024 00:35:38 -0400
Subject: [PATCH 1/6] Add fix for #59242

---
 doc/source/whatsnew/v2.3.0.rst        |  2 +-
 pandas/core/internals/construction.py | 12 +++++++++++-
 pandas/tests/io/test_sql.py           | 14 ++++++++++++++
 3 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst
index 01c2ed3821d7a..2880ecec81f66 100644
--- a/doc/source/whatsnew/v2.3.0.rst
+++ b/doc/source/whatsnew/v2.3.0.rst
@@ -130,7 +130,7 @@ MultiIndex
 
 I/O
 ^^^
--
+- Bug in :func:`read_sql` raising ``UnicodeDecodeError`` when a column contains bytes that cannot be decoded to a string and ``dtype_backend="pyarrow"`` is used (:issue:`59242`)
 -
 
 Period
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
index 959e572b2b35b..1900ed282e876 100644
--- a/pandas/core/internals/construction.py
+++ b/pandas/core/internals/construction.py
@@ -970,7 +970,17 @@ def convert(arr):
                     if dtype_backend != "numpy" and arr.dtype == np.dtype("O"):
                         new_dtype = StringDtype()
                         arr_cls = new_dtype.construct_array_type()
-                        arr = arr_cls._from_sequence(arr, dtype=new_dtype)
+                        try:
+                            # Addressing (#59242)
+                            # Byte data that could not be decoded into
+                            # a string would throw a UnicodeDecodeError exception
+
+                            # Try and greedily convert to string
+                            # Will fail if the object is bytes
+                            arr = arr_cls._from_sequence(arr, dtype=new_dtype)
+                        except UnicodeDecodeError:
+                            pass
+
                 elif dtype_backend != "numpy" and isinstance(arr, np.ndarray):
                     if arr.dtype.kind in "iufb":
                         arr = pd_array(arr, copy=False)
diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py
index c28a33069d23f..69ad44d1a5e73 100644
--- a/pandas/tests/io/test_sql.py
+++ b/pandas/tests/io/test_sql.py
@@ -4352,3 +4352,17 @@ def test_xsqlite_if_exists(sqlite_buildin):
         (5, "E"),
     ]
     drop_table(table_name, sqlite_buildin)
+
+
+def test_bytes_column(sqlite_buildin):
+    """
+    Regression test for (#59242)
+    Bytes being returned in a column that could not be converted
+    to a string would raise a UnicodeDecodeError
+    when using dtype_backend='pyarrow'
+    """
+    query = """
+    select cast(x'0123456789abcdef0123456789abcdef' as blob) a
+    """
+    df = pd.read_sql(query, sqlite_buildin, dtype_backend="pyarrow")
+    assert df.a.values[0] == b"\x01#Eg\x89\xab\xcd\xef\x01#Eg\x89\xab\xcd\xef"

From bd00fc545e25a611d97edecb9aac8c0324d17e90 Mon Sep 17 00:00:00 2001
From: Owen Christie <owendot2011@gmail.com>
Date: Fri, 25 Oct 2024 18:09:53 -0400
Subject: [PATCH 2/6] Skip test_bytes_column when pyarrow is not installed

---
 pandas/tests/io/test_sql.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py
index 69ad44d1a5e73..73f9ff42287fc 100644
--- a/pandas/tests/io/test_sql.py
+++ b/pandas/tests/io/test_sql.py
@@ -4355,6 +4355,7 @@ def test_xsqlite_if_exists(sqlite_buildin):
 
 
 def test_bytes_column(sqlite_buildin):
+    pytest.importorskip("pyarrow")
     """
     Regression test for (#59242)
     Bytes being returned in a column that could not be converted

From 6a23f05c282f5fb07f507e09ac2667026d258527 Mon Sep 17 00:00:00 2001
From: Owen Christie <owendot2011@gmail.com>
Date: Tue, 12 Nov 2024 17:40:39 -0500
Subject: [PATCH 3/6] Address comment: use pyarrow string dtype, fall back to pyarrow binary

---
 pandas/core/internals/construction.py | 29 +++++++++++++++++++--------
 1 file changed, 21 insertions(+), 8 deletions(-)

diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
index 1900ed282e876..3f293281a4b53 100644
--- a/pandas/core/internals/construction.py
+++ b/pandas/core/internals/construction.py
@@ -17,6 +17,7 @@
 from pandas._config import using_string_dtype
 
 from pandas._libs import lib
+from pandas.compat._optional import import_optional_dependency
 
 from pandas.core.dtypes.astype import astype_is_view
 from pandas.core.dtypes.cast import (
@@ -34,7 +35,10 @@
     is_object_dtype,
     is_scalar,
 )
-from pandas.core.dtypes.dtypes import ExtensionDtype
+from pandas.core.dtypes.dtypes import (
+    ArrowDtype,
+    ExtensionDtype,
+)
 from pandas.core.dtypes.generic import (
     ABCDataFrame,
     ABCSeries,
@@ -968,18 +972,27 @@ def convert(arr):
                     # i.e. maybe_convert_objects didn't convert
                     arr = maybe_infer_to_datetimelike(arr)
                     if dtype_backend != "numpy" and arr.dtype == np.dtype("O"):
-                        new_dtype = StringDtype()
-                        arr_cls = new_dtype.construct_array_type()
-                        try:
+                        if dtype_backend == "pyarrow":
+                            pa = import_optional_dependency("pyarrow")
                             # Addressing (#59242)
                             # Byte data that could not be decoded into
                             # a string would throw a UnicodeDecodeError exception
 
-                            # Try and greedily convert to string
-                            # Will fail if the object is bytes
+                            # Try and greedily convert to pyarrow string
+                            # Will fail if the object is bytes:
+                            # in this case convert to pyarrow binary
+                            try:
+                                str_dtype = ArrowDtype(pa.string())
+                                str_cls = str_dtype.construct_array_type()
+                                arr = str_cls._from_sequence(arr, dtype=str_dtype)
+                            except pa.lib.ArrowInvalid:
+                                bin_dtype = ArrowDtype(pa.binary())
+                                bin_cls = bin_dtype.construct_array_type()
+                                arr = bin_cls._from_sequence(arr, dtype=bin_dtype)
+                        else:
+                            new_dtype = StringDtype()
+                            arr_cls = new_dtype.construct_array_type()
                             arr = arr_cls._from_sequence(arr, dtype=new_dtype)
-                        except UnicodeDecodeError:
-                            pass
 
                 elif dtype_backend != "numpy" and isinstance(arr, np.ndarray):
                     if arr.dtype.kind in "iufb":

From 8f900b833cd3a495a9246e888e1b2572faf583aa Mon Sep 17 00:00:00 2001
From: Owen Christie <owendot2011@gmail.com>
Date: Wed, 13 Nov 2024 17:32:10 -0500
Subject: [PATCH 4/6] also fix for dtype_backend=numpy_nullable

---
 pandas/core/internals/construction.py | 23 +++++++++++++----------
 pandas/tests/io/test_sql.py           |  7 ++++---
 2 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
index 1828913da65ab..245aa3291fe2f 100644
--- a/pandas/core/internals/construction.py
+++ b/pandas/core/internals/construction.py
@@ -972,27 +972,30 @@ def convert(arr):
                     # i.e. maybe_convert_objects didn't convert
                     arr = maybe_infer_to_datetimelike(arr)
                     if dtype_backend != "numpy" and arr.dtype == np.dtype("O"):
+                        # Addressing (#59242)
+                        # Byte data that could not be decoded into
+                        # a string would throw a UnicodeDecodeError exception
+
+                        # Try and greedily convert to string
                         if dtype_backend == "pyarrow":
                             pa = import_optional_dependency("pyarrow")
-                            # Addressing (#59242)
-                            # Byte data that could not be decoded into
-                            # a string would throw a UnicodeDecodeError exception
-
-                            # Try and greedily convert to pyarrow string
-                            # Will fail if the object is bytes:
-                            # in this case convert to pyarrow binary
                             try:
                                 str_dtype = ArrowDtype(pa.string())
                                 str_cls = str_dtype.construct_array_type()
                                 arr = str_cls._from_sequence(arr, dtype=str_dtype)
                             except pa.lib.ArrowInvalid:
+                                # in this case convert to pyarrow binary
                                 bin_dtype = ArrowDtype(pa.binary())
                                 bin_cls = bin_dtype.construct_array_type()
                                 arr = bin_cls._from_sequence(arr, dtype=bin_dtype)
                         else:
-                            new_dtype = StringDtype()
-                            arr_cls = new_dtype.construct_array_type()
-                            arr = arr_cls._from_sequence(arr, dtype=new_dtype)
+                            try:
+                                new_dtype = StringDtype()
+                                arr_cls = new_dtype.construct_array_type()
+                                arr = arr_cls._from_sequence(arr, dtype=new_dtype)
+                            except UnicodeDecodeError:
+                                # undecodable bytes: leave arr as object dtype
+                                pass
 
                 elif dtype_backend != "numpy" and isinstance(arr, np.ndarray):
                     if arr.dtype.kind in "iufb":
diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py
index 1976973b28f74..f94ac59f6f7b6 100644
--- a/pandas/tests/io/test_sql.py
+++ b/pandas/tests/io/test_sql.py
@@ -4357,16 +4357,17 @@ def test_xsqlite_if_exists(sqlite_buildin):
     drop_table(table_name, sqlite_buildin)
 
 
-def test_bytes_column(sqlite_buildin):
+@pytest.mark.parametrize("dtype_backend", ["pyarrow", "numpy_nullable", lib.no_default])
+def test_bytes_column(sqlite_buildin, dtype_backend):
     pytest.importorskip("pyarrow")
     """
     Regression test for (#59242)
     Bytes being returned in a column that could not be converted
     to a string would raise a UnicodeDecodeError
-    when using dtype_backend='pyarrow'
+    when using dtype_backend='pyarrow' or dtype_backend='numpy_nullable'
     """
     query = """
     select cast(x'0123456789abcdef0123456789abcdef' as blob) a
     """
-    df = pd.read_sql(query, sqlite_buildin, dtype_backend="pyarrow")
+    df = pd.read_sql(query, sqlite_buildin, dtype_backend=dtype_backend)
     assert df.a.values[0] == b"\x01#Eg\x89\xab\xcd\xef\x01#Eg\x89\xab\xcd\xef"

From a32b4a6c9f0f412dbc669ba04e23a34880862446 Mon Sep 17 00:00:00 2001
From: Owen Christie <owendot2011@gmail.com>
Date: Sun, 17 Nov 2024 09:15:30 -0500
Subject: [PATCH 5/6] Remove hard-coded __version__ from pandas/__init__.py

---
 pandas/__init__.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pandas/__init__.py b/pandas/__init__.py
index 08bd1117c456a..6c97baa890777 100644
--- a/pandas/__init__.py
+++ b/pandas/__init__.py
@@ -191,7 +191,6 @@
 
 
 # module level doc-string
-__version__ = "3.0.0"
 __doc__ = """
 pandas - a powerful data analysis and manipulation library for Python
 =====================================================================

From a0200d0411d34fb0599b4bd7788e5e8b937b47fc Mon Sep 17 00:00:00 2001
From: Owen Christie <owendot2011@gmail.com>
Date: Wed, 20 Nov 2024 18:48:32 -0500
Subject: [PATCH 6/6] Address comment: compare with tm.assert_frame_equal in test_bytes_column

---
 pandas/tests/io/test_sql.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py
index 49f4dab138843..1292cf52f42ff 100644
--- a/pandas/tests/io/test_sql.py
+++ b/pandas/tests/io/test_sql.py
@@ -4373,6 +4373,12 @@ def test_bytes_column(sqlite_buildin, dtype_backend):
     select cast(x'0123456789abcdef0123456789abcdef' as blob) a
     """
     df = pd.read_sql(query, sqlite_buildin, dtype_backend=dtype_backend)
-    assert df.a.values[0] == b"\x01#Eg\x89\xab\xcd\xef\x01#Eg\x89\xab\xcd\xef"
-    if dtype_backend == "pyarrow":
-        assert df.a.dtype == pd.ArrowDtype(pa.binary())
+    expected = DataFrame(
+        [
+            {
+                "a": b"\x01#Eg\x89\xab\xcd\xef\x01#Eg\x89\xab\xcd\xef",
+            }
+        ],
+        dtype=(pd.ArrowDtype(pa.binary()) if dtype_backend == "pyarrow" else "O"),
+    )
+    tm.assert_frame_equal(df, expected)