From a3f14bfa373c6fb4e3470a1f3bd8fed1657e09e1 Mon Sep 17 00:00:00 2001
From: Abhishek Chaudhari <91185083+AbhishekChaudharii@users.noreply.github.com>
Date: Fri, 1 Nov 2024 22:26:00 +0530
Subject: [PATCH] BUG: Fixes pd.merge issue with columns of dtype numpy.uintc on windows (#60145)

* bug fix for numpy.uintc in merge operations on windows

Added pytest test case to verify correct behavior with numpy.uintc dtype

* Formatting changes after running pre-commit

* Added tests for numpy.intc

* added whatsnew note

* pre-commit automatic changes and also made changes to test_merge.py file to make pandas namespace consistent

* removed comment

* added the deleted whatsnew note back

* better whatsnew note

Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>

---------

Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
---
 doc/source/whatsnew/v3.0.0.rst           |  1 +
 pandas/core/reshape/merge.py             | 12 +++++++-
 pandas/tests/reshape/merge/test_merge.py | 35 ++++++++++++++++++++++++
 3 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index c61b8f3fb3701..2e64c66812306 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -739,6 +739,7 @@ Reshaping
 - Bug in :meth:`DataFrame.join` when a :class:`DataFrame` with a :class:`MultiIndex` would raise an ``AssertionError`` when :attr:`MultiIndex.names` contained ``None``. (:issue:`58721`)
 - Bug in :meth:`DataFrame.merge` where merging on a column containing only ``NaN`` values resulted in an out-of-bounds array access (:issue:`59421`)
 - Bug in :meth:`DataFrame.unstack` producing incorrect results when ``sort=False`` (:issue:`54987`, :issue:`55516`)
+- Bug in :meth:`DataFrame.merge` when merging two :class:`DataFrame` on ``intc`` or ``uintc`` types on Windows (:issue:`60091`, :issue:`58713`)
 - Bug in :meth:`DataFrame.pivot_table` incorrectly subaggregating results when called without an ``index`` argument (:issue:`58722`)
 - Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating empty :class:`DataFrame` with an :class:`ExtentionDtype` (:issue:`59123`)
 
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
index 07e8fa4841c04..0ca8661ad3b5c 100644
--- a/pandas/core/reshape/merge.py
+++ b/pandas/core/reshape/merge.py
@@ -123,7 +123,17 @@
 
 # See https://github.com/pandas-dev/pandas/issues/52451
 if np.intc is not np.int32:
-    _factorizers[np.intc] = libhashtable.Int64Factorizer
+    if np.dtype(np.intc).itemsize == 4:
+        _factorizers[np.intc] = libhashtable.Int32Factorizer
+    else:
+        _factorizers[np.intc] = libhashtable.Int64Factorizer
+
+if np.uintc is not np.uint32:
+    if np.dtype(np.uintc).itemsize == 4:
+        _factorizers[np.uintc] = libhashtable.UInt32Factorizer
+    else:
+        _factorizers[np.uintc] = libhashtable.UInt64Factorizer
+
 
 _known = (np.ndarray, ExtensionArray, Index, ABCSeries)
 
diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
index d4766242b8460..f0abc1afc6ab0 100644
--- a/pandas/tests/reshape/merge/test_merge.py
+++ b/pandas/tests/reshape/merge/test_merge.py
@@ -1843,6 +1843,41 @@ def test_merge_empty(self, left_empty, how, exp):
 
         tm.assert_frame_equal(result, expected)
 
+    def test_merge_with_uintc_columns(self):
+        df1 = DataFrame({"a": ["foo", "bar"], "b": np.array([1, 2], dtype=np.uintc)})
+        df2 = DataFrame({"a": ["foo", "baz"], "b": np.array([3, 4], dtype=np.uintc)})
+        result = df1.merge(df2, how="outer")
+        expected = DataFrame(
+            {
+                "a": ["bar", "baz", "foo", "foo"],
+                "b": np.array([2, 4, 1, 3], dtype=np.uintc),
+            }
+        )
+        tm.assert_frame_equal(result.reset_index(drop=True), expected)
+
+    def test_merge_with_intc_columns(self):
+        df1 = DataFrame({"a": ["foo", "bar"], "b": np.array([1, 2], dtype=np.intc)})
+        df2 = DataFrame({"a": ["foo", "baz"], "b": np.array([3, 4], dtype=np.intc)})
+        result = df1.merge(df2, how="outer")
+        expected = DataFrame(
+            {
+                "a": ["bar", "baz", "foo", "foo"],
+                "b": np.array([2, 4, 1, 3], dtype=np.intc),
+            }
+        )
+        tm.assert_frame_equal(result.reset_index(drop=True), expected)
+
+    def test_merge_intc_non_monotonic(self):
+        df = DataFrame({"join_key": Series([0, 2, 1], dtype=np.intc)})
+        df_details = DataFrame(
+            {"join_key": Series([0, 1, 2], dtype=np.intc), "value": ["a", "b", "c"]}
+        )
+        merged = df.merge(df_details, on="join_key", how="left")
+        expected = DataFrame(
+            {"join_key": np.array([0, 2, 1], dtype=np.intc), "value": ["a", "c", "b"]}
+        )
+        tm.assert_frame_equal(merged.reset_index(drop=True), expected)
+
 
 @pytest.fixture
 def left():