From d834e46d832191093a65fd2276fb6b4aa8141085 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 15 Nov 2024 15:12:07 +0100 Subject: [PATCH 1/3] REF: centralize pyarrow Table to pandas conversions and types_mapper handling --- pandas/io/_util.py | 34 ++++++++++++++++++- pandas/io/feather_format.py | 17 ++-------- pandas/io/json/_json.py | 15 ++------- pandas/io/orc.py | 21 ++---------- pandas/io/parquet.py | 18 ++-------- pandas/io/parsers/arrow_parser_wrapper.py | 27 ++++----------- pandas/io/sql.py | 41 ++++------------------- 7 files changed, 54 insertions(+), 119 deletions(-) diff --git a/pandas/io/_util.py b/pandas/io/_util.py index 9a8c87a738d4c..6f398362f3346 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -4,6 +4,9 @@ import numpy as np +from pandas._config import using_string_dtype + +from pandas._libs import lib from pandas.compat import pa_version_under18p0 from pandas.compat._optional import import_optional_dependency @@ -12,6 +15,8 @@ if TYPE_CHECKING: from collections.abc import Callable + from pandas._typing import DtypeBackend + def _arrow_dtype_mapping() -> dict: pa = import_optional_dependency("pyarrow") @@ -33,7 +38,7 @@ def _arrow_dtype_mapping() -> dict: } -def arrow_string_types_mapper() -> Callable: +def _arrow_string_types_mapper() -> Callable: pa = import_optional_dependency("pyarrow") mapping = { @@ -44,3 +49,30 @@ def arrow_string_types_mapper() -> Callable: mapping[pa.string_view()] = pd.StringDtype(na_value=np.nan) return mapping.get + + +def arrow_table_to_pandas( + table, + dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, + null_to_int64: bool = False, +) -> pd.DataFrame: + pa = import_optional_dependency("pyarrow") + + if dtype_backend == "numpy_nullable": + mapping = _arrow_dtype_mapping() + if null_to_int64: + # Modify the default mapping to also map null to Int64 + # (to match other engines - only for CSV parser) + mapping[pa.null()] = pd.Int64Dtype() + types_mapper = mapping.get + elif dtype_backend == "pyarrow": + types_mapper = pd.ArrowDtype + elif using_string_dtype(): + types_mapper = _arrow_string_types_mapper() + elif dtype_backend is lib.no_default: + types_mapper = None + else: + raise NotImplementedError + + df = table.to_pandas(types_mapper=types_mapper) + return df diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index aaae9857b4fae..7b4c81853eba3 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -15,11 +15,10 @@ from pandas.util._decorators import doc from pandas.util._validators import check_dtype_backend -import pandas as pd from pandas.core.api import DataFrame from pandas.core.shared_docs import _shared_docs -from pandas.io._util import arrow_string_types_mapper +from pandas.io._util import arrow_table_to_pandas from pandas.io.common import get_handle if TYPE_CHECKING: @@ -147,16 +146,4 @@ def read_feather( pa_table = feather.read_table( handles.handle, columns=columns, use_threads=bool(use_threads) ) - - if dtype_backend == "numpy_nullable": - from pandas.io._util import _arrow_dtype_mapping - - return pa_table.to_pandas(types_mapper=_arrow_dtype_mapping().get) - - elif dtype_backend == "pyarrow": - return pa_table.to_pandas(types_mapper=pd.ArrowDtype) - - elif using_string_dtype(): - return pa_table.to_pandas(types_mapper=arrow_string_types_mapper()) - else: - raise NotImplementedError + return arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index e9c9f5ba225a5..983780f81043f 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -36,7 +36,6 @@ from pandas.core.dtypes.dtypes import PeriodDtype from pandas import ( - ArrowDtype, DataFrame, Index, MultiIndex, @@ -48,6 +47,7 @@ from pandas.core.reshape.concat import concat from pandas.core.shared_docs import _shared_docs +from pandas.io._util import arrow_table_to_pandas from pandas.io.common import ( IOHandles, dedup_names, @@ -940,18 +940,7 @@ def read(self) -> DataFrame | Series: if self.engine == "pyarrow": pyarrow_json = import_optional_dependency("pyarrow.json") pa_table = pyarrow_json.read_json(self.data) - - mapping: type[ArrowDtype] | None | Callable - if self.dtype_backend == "pyarrow": - mapping = ArrowDtype - elif self.dtype_backend == "numpy_nullable": - from pandas.io._util import _arrow_dtype_mapping - - mapping = _arrow_dtype_mapping().get - else: - mapping = None - - return pa_table.to_pandas(types_mapper=mapping) + return arrow_table_to_pandas(pa_table, dtype_backend=self.dtype_backend) elif self.engine == "ujson": if self.lines: if self.chunksize: diff --git a/pandas/io/orc.py b/pandas/io/orc.py index f179dafc919e5..a945f3dc38d35 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -9,16 +9,13 @@ Literal, ) -from pandas._config import using_string_dtype - from pandas._libs import lib from pandas.compat._optional import import_optional_dependency from pandas.util._validators import check_dtype_backend -import pandas as pd from pandas.core.indexes.api import default_index -from pandas.io._util import arrow_string_types_mapper +from pandas.io._util import arrow_table_to_pandas from pandas.io.common import ( get_handle, is_fsspec_url, @@ -127,21 +124,7 @@ def read_orc( pa_table = orc.read_table( source=source, columns=columns, filesystem=filesystem, **kwargs ) - if dtype_backend is not lib.no_default: - if dtype_backend == "pyarrow": - df = pa_table.to_pandas(types_mapper=pd.ArrowDtype) - else: - from pandas.io._util import _arrow_dtype_mapping - - mapping = _arrow_dtype_mapping() - df = pa_table.to_pandas(types_mapper=mapping.get) - return df - else: - if using_string_dtype(): - types_mapper = arrow_string_types_mapper() - else: - types_mapper = None - return pa_table.to_pandas(types_mapper=types_mapper) + return arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend) def to_orc( diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 24415299e799b..116f228faca93 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -15,22 +15,19 @@ filterwarnings, ) -from pandas._config import using_string_dtype - from pandas._libs import lib from pandas.compat._optional import import_optional_dependency from pandas.errors import AbstractMethodError from pandas.util._decorators import doc from pandas.util._validators import check_dtype_backend -import pandas as pd from pandas import ( DataFrame, get_option, ) from pandas.core.shared_docs import _shared_docs -from pandas.io._util import arrow_string_types_mapper +from pandas.io._util import arrow_table_to_pandas from pandas.io.common import ( IOHandles, get_handle, @@ -249,17 +246,6 @@ def read( ) -> DataFrame: kwargs["use_pandas_metadata"] = True - to_pandas_kwargs = {} - if dtype_backend == "numpy_nullable": - from pandas.io._util import _arrow_dtype_mapping - - mapping = _arrow_dtype_mapping() - to_pandas_kwargs["types_mapper"] = mapping.get - elif dtype_backend == "pyarrow": - to_pandas_kwargs["types_mapper"] = pd.ArrowDtype # type: ignore[assignment] - elif using_string_dtype(): - to_pandas_kwargs["types_mapper"] = arrow_string_types_mapper() - path_or_handle, handles, filesystem = _get_path_or_handle( path, filesystem, @@ -280,7 +266,7 @@ def read( "make_block is deprecated", DeprecationWarning, ) - result = pa_table.to_pandas(**to_pandas_kwargs) + result = arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend) if pa_table.schema.metadata: if b"PANDAS_ATTRS" in pa_table.schema.metadata: diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 86bb5f190e403..672672490996d 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -3,8 +3,6 @@ from typing import TYPE_CHECKING import warnings -from pandas._config import using_string_dtype - from pandas._libs import lib from pandas.compat._optional import import_optional_dependency from pandas.errors import ( @@ -16,18 +14,14 @@ from pandas.core.dtypes.common import pandas_dtype from pandas.core.dtypes.inference import is_integer -import pandas as pd -from pandas import DataFrame - -from pandas.io._util import ( - _arrow_dtype_mapping, - arrow_string_types_mapper, -) +from pandas.io._util import arrow_table_to_pandas from pandas.io.parsers.base_parser import ParserBase if TYPE_CHECKING: from pandas._typing import ReadBuffer + from pandas import DataFrame + class ArrowParserWrapper(ParserBase): """ @@ -293,17 +287,8 @@ def read(self) -> DataFrame: "make_block is deprecated", DeprecationWarning, ) - if dtype_backend == "pyarrow": - frame = table.to_pandas(types_mapper=pd.ArrowDtype) - elif dtype_backend == "numpy_nullable": - # Modify the default mapping to also - # map null to Int64 (to match other engines) - dtype_mapping = _arrow_dtype_mapping() - dtype_mapping[pa.null()] = pd.Int64Dtype() - frame = table.to_pandas(types_mapper=dtype_mapping.get) - elif using_string_dtype(): - frame = table.to_pandas(types_mapper=arrow_string_types_mapper()) + frame = arrow_table_to_pandas( + table, dtype_backend=dtype_backend, null_to_int64=True + ) - else: - frame = table.to_pandas() return self._finalize_pandas_output(frame) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 125ca51a456d8..3c0c5cc64c24c 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -48,10 +48,7 @@ is_object_dtype, is_string_dtype, ) -from pandas.core.dtypes.dtypes import ( - ArrowDtype, - DatetimeTZDtype, -) +from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.missing import isna from pandas import get_option @@ -67,6 +64,8 @@ from pandas.core.internals.construction import convert_object_array from pandas.core.tools.datetimes import to_datetime +from pandas.io._util import arrow_table_to_pandas + if TYPE_CHECKING: from collections.abc import ( Callable, @@ -2208,23 +2207,10 @@ def read_table( else: stmt = f"SELECT {select_list} FROM {table_name}" - mapping: type[ArrowDtype] | None | Callable - if dtype_backend == "pyarrow": - mapping = ArrowDtype - elif dtype_backend == "numpy_nullable": - from pandas.io._util import _arrow_dtype_mapping - - mapping = _arrow_dtype_mapping().get - elif using_string_dtype(): - from pandas.io._util import arrow_string_types_mapper - - mapping = arrow_string_types_mapper() - else: - mapping = None - with self.con.cursor() as cur: cur.execute(stmt) - df = cur.fetch_arrow_table().to_pandas(types_mapper=mapping) + pa_table = cur.fetch_arrow_table() + df = arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend) return _wrap_result_adbc( df, @@ -2292,23 +2278,10 @@ def read_query( if chunksize: raise NotImplementedError("'chunksize' is not implemented for ADBC drivers") - mapping: type[ArrowDtype] | None | Callable - if dtype_backend == "pyarrow": - mapping = ArrowDtype - elif dtype_backend == "numpy_nullable": - from pandas.io._util import _arrow_dtype_mapping - - mapping = _arrow_dtype_mapping().get - elif using_string_dtype(): - from pandas.io._util import arrow_string_types_mapper - - mapping = arrow_string_types_mapper() - else: - mapping = None - with self.con.cursor() as cur: cur.execute(sql) - df = cur.fetch_arrow_table().to_pandas(types_mapper=mapping) + pa_table = cur.fetch_arrow_table() + df = arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend) return _wrap_result_adbc( df, From 7996aa331f4af7224ba88bd729c8a0d8cf3518ab Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 15 Nov 2024 16:05:13 +0100 Subject: [PATCH 2/3] fix typing and sql default backend case --- pandas/io/_util.py | 1 + pandas/io/sql.py | 6 ++++++ pandas/tests/io/test_sql.py | 4 ++-- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/pandas/io/_util.py b/pandas/io/_util.py index 6f398362f3346..ff924887ade74 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -58,6 +58,7 @@ def arrow_table_to_pandas( ) -> pd.DataFrame: pa = import_optional_dependency("pyarrow") + types_mapper: type[pd.ArrowDtype] | None | Callable if dtype_backend == "numpy_nullable": mapping = _arrow_dtype_mapping() if null_to_int64: diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 3c0c5cc64c24c..d4effdfdce823 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -2210,6 +2210,9 @@ def read_table( with self.con.cursor() as cur: cur.execute(stmt) pa_table = cur.fetch_arrow_table() + dtype_backend = ( + lib.no_default if dtype_backend == "numpy" else dtype_backend + ) df = arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend) return _wrap_result_adbc( @@ -2281,6 +2284,9 @@ def read_query( with self.con.cursor() as cur: cur.execute(sql) pa_table = cur.fetch_arrow_table() + dtype_backend = ( + lib.no_default if dtype_backend == "numpy" else dtype_backend + ) df = arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend) return _wrap_result_adbc( diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 96d63d3fe25e5..7e1220ecee218 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -959,12 +959,12 @@ def sqlite_buildin_types(sqlite_buildin, types_data): adbc_connectable_iris = [ pytest.param("postgresql_adbc_iris", marks=pytest.mark.db), - pytest.param("sqlite_adbc_iris", marks=pytest.mark.db), + "sqlite_adbc_iris", ] adbc_connectable_types = [ pytest.param("postgresql_adbc_types", marks=pytest.mark.db), - pytest.param("sqlite_adbc_types", marks=pytest.mark.db), + "sqlite_adbc_types", ] From a6553473cbd7891ebd5e7943dc0a0fa29d1c61e9 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 15 Nov 2024 16:53:54 +0100 Subject: [PATCH 3/3] try fix typing --- pandas/io/_util.py | 13 +++++++++---- pandas/io/sql.py | 6 ------ 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/pandas/io/_util.py b/pandas/io/_util.py index ff924887ade74..21203ad036fc6 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -1,6 +1,9 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from typing import ( + TYPE_CHECKING, + Literal, +) import numpy as np @@ -15,6 +18,8 @@ if TYPE_CHECKING: from collections.abc import Callable + import pyarrow + from pandas._typing import DtypeBackend @@ -52,8 +57,8 @@ def _arrow_string_types_mapper() -> Callable: def arrow_table_to_pandas( - table, - dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, + table: pyarrow.Table, + dtype_backend: DtypeBackend | Literal["numpy"] | lib.NoDefault = lib.no_default, null_to_int64: bool = False, ) -> pd.DataFrame: pa = import_optional_dependency("pyarrow") @@ -70,7 +75,7 @@ def arrow_table_to_pandas( types_mapper = pd.ArrowDtype elif using_string_dtype(): types_mapper = _arrow_string_types_mapper() - elif dtype_backend is lib.no_default: + elif dtype_backend is lib.no_default or dtype_backend == "numpy": types_mapper = None else: raise NotImplementedError diff --git a/pandas/io/sql.py b/pandas/io/sql.py index d4effdfdce823..3c0c5cc64c24c 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -2210,9 +2210,6 @@ def read_table( with self.con.cursor() as cur: cur.execute(stmt) pa_table = cur.fetch_arrow_table() - dtype_backend = ( - lib.no_default if dtype_backend == "numpy" else dtype_backend - ) df = arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend) return _wrap_result_adbc( @@ -2284,9 +2281,6 @@ def read_query( with self.con.cursor() as cur: cur.execute(sql) pa_table = cur.fetch_arrow_table() - dtype_backend = ( - lib.no_default if dtype_backend == "numpy" else dtype_backend - ) df = arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend) return _wrap_result_adbc(