From 224486925e5fd278963164064cc7b6d559f586e1 Mon Sep 17 00:00:00 2001 From: Owen Christie Date: Fri, 25 Oct 2024 00:35:38 -0400 Subject: [PATCH 1/6] Add fix for #59242 --- doc/source/whatsnew/v2.3.0.rst | 2 +- pandas/core/internals/construction.py | 12 +++++++++++- pandas/tests/io/test_sql.py | 14 ++++++++++++++ 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 01c2ed3821d7a..2880ecec81f66 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -130,7 +130,7 @@ MultiIndex I/O ^^^ -- +- Bug in :func:`read_sql` causing an unintended exception when byte data was being converted to string when using the pyarrow dtype_backend (:issue:`59242`) - Period diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 959e572b2b35b..1900ed282e876 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -970,7 +970,17 @@ def convert(arr): if dtype_backend != "numpy" and arr.dtype == np.dtype("O"): new_dtype = StringDtype() arr_cls = new_dtype.construct_array_type() - arr = arr_cls._from_sequence(arr, dtype=new_dtype) + try: + # Addressing (#59242) + # Byte data that could not be decoded into + # a string would throw a UnicodeDecodeError exception + + # Try and greedily convert to string + # Will fail if the object is bytes + arr = arr_cls._from_sequence(arr, dtype=new_dtype) + except UnicodeDecodeError: + pass + elif dtype_backend != "numpy" and isinstance(arr, np.ndarray): if arr.dtype.kind in "iufb": arr = pd_array(arr, copy=False) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index c28a33069d23f..69ad44d1a5e73 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -4352,3 +4352,17 @@ def test_xsqlite_if_exists(sqlite_buildin): (5, "E"), ] drop_table(table_name, sqlite_buildin) + + +def test_bytes_column(sqlite_buildin): + """ + Regression test for (#59242) + Bytes being returned in a column that could not be converted + to a string would raise a UnicodeDecodeError + when using dtype_backend='pyarrow' + """ + query = """ + select cast(x'0123456789abcdef0123456789abcdef' as blob) a + """ + df = pd.read_sql(query, sqlite_buildin, dtype_backend="pyarrow") + assert df.a.values[0] == b"\x01#Eg\x89\xab\xcd\xef\x01#Eg\x89\xab\xcd\xef" From bd00fc545e25a611d97edecb9aac8c0324d17e90 Mon Sep 17 00:00:00 2001 From: Owen Christie Date: Fri, 25 Oct 2024 18:09:53 -0400 Subject: [PATCH 2/6] add skip import --- pandas/tests/io/test_sql.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 69ad44d1a5e73..73f9ff42287fc 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -4355,6 +4355,7 @@ def test_xsqlite_if_exists(sqlite_buildin): def test_bytes_column(sqlite_buildin): + pytest.importorskip("pyarrow") """ Regression test for (#59242) Bytes being returned in a column that could not be converted From 6a23f05c282f5fb07f507e09ac2667026d258527 Mon Sep 17 00:00:00 2001 From: Owen Christie Date: Tue, 12 Nov 2024 17:40:39 -0500 Subject: [PATCH 3/6] address comment --- pandas/core/internals/construction.py | 29 +++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 1900ed282e876..3f293281a4b53 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -17,6 +17,7 @@ from pandas._config import using_string_dtype from pandas._libs import lib +from pandas.compat._optional import import_optional_dependency from pandas.core.dtypes.astype import astype_is_view from pandas.core.dtypes.cast import ( @@ -34,7 +35,10 @@ is_object_dtype, is_scalar, ) -from pandas.core.dtypes.dtypes import ExtensionDtype +from pandas.core.dtypes.dtypes import ( + ArrowDtype, + ExtensionDtype, +) from pandas.core.dtypes.generic import ( ABCDataFrame, ABCSeries, @@ -968,18 +972,27 @@ def convert(arr): # i.e. maybe_convert_objects didn't convert arr = maybe_infer_to_datetimelike(arr) if dtype_backend != "numpy" and arr.dtype == np.dtype("O"): - new_dtype = StringDtype() - arr_cls = new_dtype.construct_array_type() - try: + if dtype_backend == "pyarrow": + pa = import_optional_dependency("pyarrow") # Addressing (#59242) # Byte data that could not be decoded into # a string would throw a UnicodeDecodeError exception - # Try and greedily convert to string - # Will fail if the object is bytes + # Try and greedily convert to pyarrow string + # Will fail if the object is bytes: + # in this case convert to pyarrow binary + try: + str_dtype = ArrowDtype(pa.string()) + str_cls = str_dtype.construct_array_type() + arr = str_cls._from_sequence(arr, dtype=str_dtype) + except pa.lib.ArrowInvalid: + bin_dtype = ArrowDtype(pa.binary()) + bin_cls = bin_dtype.construct_array_type() + arr = bin_cls._from_sequence(arr, dtype=bin_dtype) + else: + new_dtype = StringDtype() + arr_cls = new_dtype.construct_array_type() arr = arr_cls._from_sequence(arr, dtype=new_dtype) - except UnicodeDecodeError: - pass elif dtype_backend != "numpy" and isinstance(arr, np.ndarray): if arr.dtype.kind in "iufb": From 8f900b833cd3a495a9246e888e1b2572faf583aa Mon Sep 17 00:00:00 2001 From: Owen Christie Date: Wed, 13 Nov 2024 17:32:10 -0500 Subject: [PATCH 4/6] also fix for dtype_backend=numpy_nullable --- pandas/core/internals/construction.py | 23 +++++++++++++---------- pandas/tests/io/test_sql.py | 7 ++++--- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 1828913da65ab..245aa3291fe2f 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -972,27 +972,30 @@ def convert(arr): # i.e. maybe_convert_objects didn't convert arr = maybe_infer_to_datetimelike(arr) if dtype_backend != "numpy" and arr.dtype == np.dtype("O"): + # Addressing (#59242) + # Byte data that could not be decoded into + # a string would throw a UnicodeDecodeError exception + + # Try and greedily convert to string if dtype_backend == "pyarrow": pa = import_optional_dependency("pyarrow") - # Addressing (#59242) - # Byte data that could not be decoded into - # a string would throw a UnicodeDecodeError exception - - # Try and greedily convert to pyarrow string - # Will fail if the object is bytes: - # in this case convert to pyarrow binary try: str_dtype = ArrowDtype(pa.string()) str_cls = str_dtype.construct_array_type() arr = str_cls._from_sequence(arr, dtype=str_dtype) except pa.lib.ArrowInvalid: + # in this case convert to pyarrow binary bin_dtype = ArrowDtype(pa.binary()) bin_cls = bin_dtype.construct_array_type() arr = bin_cls._from_sequence(arr, dtype=bin_dtype) else: - new_dtype = StringDtype() - arr_cls = new_dtype.construct_array_type() - arr = arr_cls._from_sequence(arr, dtype=new_dtype) + try: + new_dtype = StringDtype() + arr_cls = new_dtype.construct_array_type() + arr = arr_cls._from_sequence(arr, dtype=new_dtype) + except UnicodeDecodeError: + # in this case do nothing + pass elif dtype_backend != "numpy" and isinstance(arr, np.ndarray): if arr.dtype.kind in "iufb": diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 1976973b28f74..f94ac59f6f7b6 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -4357,16 +4357,17 @@ def test_xsqlite_if_exists(sqlite_buildin): drop_table(table_name, sqlite_buildin) -def test_bytes_column(sqlite_buildin): +@pytest.mark.parametrize("dtype_backend", ["pyarrow", "numpy_nullable", lib.no_default]) +def test_bytes_column(sqlite_buildin, dtype_backend): pytest.importorskip("pyarrow") """ Regression test for (#59242) Bytes being returned in a column that could not be converted to a string would raise a UnicodeDecodeError - when using dtype_backend='pyarrow' + when using dtype_backend='pyarrow' or dtype_backend='numpy_nullable' """ query = """ select cast(x'0123456789abcdef0123456789abcdef' as blob) a """ - df = pd.read_sql(query, sqlite_buildin, dtype_backend="pyarrow") + df = pd.read_sql(query, sqlite_buildin, dtype_backend=dtype_backend) assert df.a.values[0] == b"\x01#Eg\x89\xab\xcd\xef\x01#Eg\x89\xab\xcd\xef" From a32b4a6c9f0f412dbc669ba04e23a34880862446 Mon Sep 17 00:00:00 2001 From: Owen Christie Date: Sun, 17 Nov 2024 09:15:30 -0500 Subject: [PATCH 5/6] fix --- pandas/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/__init__.py b/pandas/__init__.py index 08bd1117c456a..6c97baa890777 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -191,7 +191,6 @@ # module level doc-string -__version__ = "3.0.0" __doc__ = """ pandas - a powerful data analysis and manipulation library for Python ===================================================================== From a0200d0411d34fb0599b4bd7788e5e8b937b47fc Mon Sep 17 00:00:00 2001 From: Owen Christie Date: Wed, 20 Nov 2024 18:48:32 -0500 Subject: [PATCH 6/6] address comment --- pandas/tests/io/test_sql.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 49f4dab138843..1292cf52f42ff 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -4373,6 +4373,12 @@ def test_bytes_column(sqlite_buildin, dtype_backend): select cast(x'0123456789abcdef0123456789abcdef' as blob) a """ df = pd.read_sql(query, sqlite_buildin, dtype_backend=dtype_backend) - assert df.a.values[0] == b"\x01#Eg\x89\xab\xcd\xef\x01#Eg\x89\xab\xcd\xef" - if dtype_backend == "pyarrow": - assert df.a.dtype == pd.ArrowDtype(pa.binary()) + expected = DataFrame( + [ + { + "a": b"\x01#Eg\x89\xab\xcd\xef\x01#Eg\x89\xab\xcd\xef", + } + ], + dtype=(pd.ArrowDtype(pa.binary()) if dtype_backend == "pyarrow" else "O"), + ) + tm.assert_frame_equal(df, expected)