From 5bd593147e7ecc5707644fd11fa1a74dfc4c1ecb Mon Sep 17 00:00:00 2001 From: rohanjain101 <38412262+rohanjain101@users.noreply.github.com> Date: Fri, 5 Jan 2024 13:08:21 -0500 Subject: [PATCH] Backport PR #56677: Fix integral truediv and floordiv for pyarrow types with large divisor and avoid floating points for floordiv --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/arrays/arrow/array.py | 46 +++++++++++++----- pandas/tests/extension/test_arrow.py | 71 +++++++++++++++++++++++++++- 3 files changed, 104 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 4222de8ce324f..0b04a1d313a6d 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -786,6 +786,7 @@ Timezones Numeric ^^^^^^^ - Bug in :func:`read_csv` with ``engine="pyarrow"`` causing rounding errors for large integers (:issue:`52505`) +- Bug in :meth:`Series.__floordiv__` and :meth:`Series.__truediv__` for :class:`ArrowDtype` with integral dtypes raising for large divisors (:issue:`56706`) - Bug in :meth:`Series.__floordiv__` for :class:`ArrowDtype` with integral dtypes raising for large values (:issue:`56645`) - Bug in :meth:`Series.pow` not filling missing values correctly (:issue:`55512`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 0bc01d2da330a..3858ce4cf0ea1 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -109,30 +109,50 @@ def cast_for_truediv( arrow_array: pa.ChunkedArray, pa_object: pa.Array | pa.Scalar - ) -> pa.ChunkedArray: + ) -> tuple[pa.ChunkedArray, pa.Array | pa.Scalar]: # Ensure int / int -> float mirroring Python/Numpy behavior # as pc.divide_checked(int, int) -> int if pa.types.is_integer(arrow_array.type) and pa.types.is_integer( pa_object.type ): + # GH: 56645. # https://github.com/apache/arrow/issues/35563 - # Arrow does not allow safe casting large integral values to float64. - # Intentionally not using arrow_array.cast because it could be a scalar - # value in reflected case, and safe=False only added to - # scalar cast in pyarrow 13. - return pc.cast(arrow_array, pa.float64(), safe=False) - return arrow_array + return pc.cast(arrow_array, pa.float64(), safe=False), pc.cast( + pa_object, pa.float64(), safe=False + ) + + return arrow_array, pa_object def floordiv_compat( left: pa.ChunkedArray | pa.Array | pa.Scalar, right: pa.ChunkedArray | pa.Array | pa.Scalar, ) -> pa.ChunkedArray: - # Ensure int // int -> int mirroring Python/Numpy behavior - # as pc.floor(pc.divide_checked(int, int)) -> float - converted_left = cast_for_truediv(left, right) - result = pc.floor(pc.divide(converted_left, right)) + # TODO: Replace with pyarrow floordiv kernel. + # https://github.com/apache/arrow/issues/39386 if pa.types.is_integer(left.type) and pa.types.is_integer(right.type): + divided = pc.divide_checked(left, right) + if pa.types.is_signed_integer(divided.type): + # GH 56676 + has_remainder = pc.not_equal(pc.multiply(divided, right), left) + has_one_negative_operand = pc.less( + pc.bit_wise_xor(left, right), + pa.scalar(0, type=divided.type), + ) + result = pc.if_else( + pc.and_( + has_remainder, + has_one_negative_operand, + ), + # GH: 55561 + pc.subtract(divided, pa.scalar(1, type=divided.type)), + divided, + ) + else: + result = divided result = result.cast(left.type) + else: + divided = pc.divide(left, right) + result = pc.floor(divided) return result ARROW_ARITHMETIC_FUNCS = { @@ -142,8 +162,8 @@ def floordiv_compat( "rsub": lambda x, y: pc.subtract_checked(y, x), "mul": pc.multiply_checked, "rmul": lambda x, y: pc.multiply_checked(y, x), - "truediv": lambda x, y: pc.divide(cast_for_truediv(x, y), y), - "rtruediv": lambda x, y: pc.divide(y, cast_for_truediv(x, y)), + "truediv": lambda x, y: pc.divide(*cast_for_truediv(x, y)), + "rtruediv": lambda x, y: pc.divide(*cast_for_truediv(y, x)), "floordiv": lambda x, y: floordiv_compat(x, y), "rfloordiv": lambda x, y: floordiv_compat(y, x), "mod": NotImplemented, diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 6689fb34f2ae3..05a112e464677 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3260,13 +3260,82 @@ def test_arrow_floordiv(): def test_arrow_floordiv_large_values(): - # GH 55561 + # GH 56645 a = pd.Series([1425801600000000000], dtype="int64[pyarrow]") expected = pd.Series([1425801600000], dtype="int64[pyarrow]") result = a // 1_000_000 tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("dtype", ["int64[pyarrow]", "uint64[pyarrow]"]) +def test_arrow_floordiv_large_integral_result(dtype): + # GH 56676 + a = pd.Series([18014398509481983], dtype=dtype) + result = a // 1 + tm.assert_series_equal(result, a) + + +@pytest.mark.parametrize("pa_type", tm.SIGNED_INT_PYARROW_DTYPES) +def test_arrow_floordiv_larger_divisor(pa_type): + # GH 56676 + dtype = ArrowDtype(pa_type) + a = pd.Series([-23], dtype=dtype) + result = a // 24 + expected = pd.Series([-1], dtype=dtype) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("pa_type", tm.SIGNED_INT_PYARROW_DTYPES) +def test_arrow_floordiv_integral_invalid(pa_type): + # GH 56676 + min_value = np.iinfo(pa_type.to_pandas_dtype()).min + a = pd.Series([min_value], dtype=ArrowDtype(pa_type)) + with pytest.raises(pa.lib.ArrowInvalid, match="overflow|not in range"): + a // -1 + with pytest.raises(pa.lib.ArrowInvalid, match="divide by zero"): + a // 0 + + +@pytest.mark.parametrize("dtype", tm.FLOAT_PYARROW_DTYPES_STR_REPR) +def test_arrow_floordiv_floating_0_divisor(dtype): + # GH 56676 + a = pd.Series([2], dtype=dtype) + result = a // 0 + expected = pd.Series([float("inf")], dtype=dtype) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("pa_type", tm.ALL_INT_PYARROW_DTYPES) +def test_arrow_integral_floordiv_large_values(pa_type): + # GH 56676 + max_value = np.iinfo(pa_type.to_pandas_dtype()).max + dtype = ArrowDtype(pa_type) + a = pd.Series([max_value], dtype=dtype) + b = pd.Series([1], dtype=dtype) + result = a // b + tm.assert_series_equal(result, a) + + +@pytest.mark.parametrize("dtype", ["int64[pyarrow]", "uint64[pyarrow]"]) +def test_arrow_true_division_large_divisor(dtype): + # GH 56706 + a = pd.Series([0], dtype=dtype) + b = pd.Series([18014398509481983], dtype=dtype) + expected = pd.Series([0], dtype="float64[pyarrow]") + result = a / b + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("dtype", ["int64[pyarrow]", "uint64[pyarrow]"]) +def test_arrow_floor_division_large_divisor(dtype): + # GH 56706 + a = pd.Series([0], dtype=dtype) + b = pd.Series([18014398509481983], dtype=dtype) + expected = pd.Series([0], dtype=dtype) + result = a // b + tm.assert_series_equal(result, expected) + + def test_string_to_datetime_parsing_cast(): # GH 56266 string_dates = ["2020-01-01 04:30:00", "2020-01-02 00:00:00", "2020-01-03 00:00:00"]