From 69468cd2e591312fe49c0dca0a949e7ce19e8697 Mon Sep 17 00:00:00 2001 From: arthurlw Date: Fri, 21 Mar 2025 11:53:14 -0700 Subject: [PATCH 01/11] updated indexing.py to allow iloc.__getitem__ --- pandas/core/indexing.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index bcb27d0320c91..7bfafb3e17536 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1582,11 +1582,7 @@ def _validate_key(self, key, axis: AxisInt) -> None: if com.is_bool_indexer(key): if hasattr(key, "index") and isinstance(key.index, Index): if key.index.inferred_type == "integer": - raise NotImplementedError( - "iLocation based boolean " - "indexing on an integer type " - "is not available" - ) + return raise ValueError( "iLocation based boolean indexing cannot use an indexable as a mask" ) From ded44cb8e6bdde2a6a12b0317f8cc2f46641c354 Mon Sep 17 00:00:00 2001 From: arthurlw Date: Fri, 21 Mar 2025 12:07:47 -0700 Subject: [PATCH 02/11] Updated test_iloc_mask test --- pandas/tests/indexing/test_iloc.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 2f6998a85c80b..c95d607bc3438 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -726,16 +726,17 @@ def test_iloc_setitem_with_scalar_index(self, indexer, value): @pytest.mark.filterwarnings("ignore::UserWarning") def test_iloc_mask(self): - # GH 3631, iloc with a mask (of a series) should raise + # GH 60994, iloc with a mask (of a series) should return accordingly df = DataFrame(list(range(5)), index=list("ABCDE"), columns=["a"]) mask = df.a % 2 == 0 msg = "iLocation based boolean indexing cannot use an indexable as a mask" with pytest.raises(ValueError, match=msg): df.iloc[mask] + mask.index = range(len(mask)) - msg = "iLocation based boolean indexing on an integer type is not available" - with pytest.raises(NotImplementedError, match=msg): - df.iloc[mask] + result = df.iloc[mask] + expected = df.iloc[[0, 2, 4]] + tm.assert_frame_equal(result, expected) # ndarray ok result = df.iloc[np.array([True] * len(mask), dtype=bool)] @@ -753,18 +754,14 @@ def test_iloc_mask(self): (None, ".iloc"): "0b1100", ("index", ""): "0b11", ("index", ".loc"): "0b11", - ("index", ".iloc"): ( - "iLocation based boolean indexing cannot use an indexable as a mask" - ), + ("index", ".iloc"): "0b11", ("locs", ""): "Unalignable boolean Series provided as indexer " "(index of the boolean Series and of the indexed " "object do not match).", ("locs", ".loc"): "Unalignable boolean Series provided as indexer " "(index of the boolean Series and of the " "indexed object do not match).", - ("locs", ".iloc"): ( - "iLocation based boolean indexing on an integer type is not available" - ), + ("locs", ".iloc"): "0b1", } # UserWarnings from reindex of a boolean mask @@ -780,7 +777,10 @@ def test_iloc_mask(self): else: accessor = df answer = str(bin(accessor[mask]["nums"].sum())) - except (ValueError, IndexingError, NotImplementedError) as err: + except ( + ValueError, + IndexingError, + ) as err: answer = str(err) key = ( From b4d58e1b397b39ced71454fab59a4a7d0dc8ecc5 Mon Sep 17 00:00:00 2001 From: arthurlw Date: Fri, 21 Mar 2025 12:42:21 -0700 Subject: [PATCH 03/11] bugfix test_iloc_mask test --- pandas/tests/indexing/test_iloc.py | 38 ++++++++++++++++++------------ 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index c95d607bc3438..510ec8260786f 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -735,8 +735,9 @@ def test_iloc_mask(self): mask.index = range(len(mask)) result = df.iloc[mask] - expected = df.iloc[[0, 2, 4]] - tm.assert_frame_equal(result, expected) + msg = "Unalignable boolean Series provided as indexer" + with pytest.raises(IndexingError, match=msg): + df.iloc[mask] # ndarray ok result = df.iloc[np.array([True] * len(mask), dtype=bool)] @@ -754,21 +755,20 @@ def test_iloc_mask(self): (None, ".iloc"): "0b1100", ("index", ""): "0b11", ("index", ".loc"): "0b11", - ("index", ".iloc"): "0b11", - ("locs", ""): "Unalignable boolean Series provided as indexer " - "(index of the boolean Series and of the indexed " - "object do not match).", - ("locs", ".loc"): "Unalignable boolean Series provided as indexer " - "(index of the boolean Series and of the " - "indexed object do not match).", - ("locs", ".iloc"): "0b1", + ( + "index", + ".iloc", + ): "iLocation based boolean indexing cannot use an indexable as a mask", + ("locs", ""): "Unalignable boolean Series provided as indexer", + ("locs", ".loc"): "Unalignable boolean Series provided as indexer", + ("locs", ".iloc"): "Unalignable boolean Series provided as indexer", } # UserWarnings from reindex of a boolean mask for idx in [None, "index", "locs"]: mask = (df.nums > 2).values if idx: - mask_index = getattr(df, idx)[::-1] + mask_index = getattr(df, idx if idx == "index" else "locs")[::-1] mask = Series(mask, list(mask_index)) for method in ["", ".loc", ".iloc"]: try: @@ -787,11 +787,19 @@ def test_iloc_mask(self): idx, method, ) - r = expected.get(key) - if r != answer: - raise AssertionError( - f"[{key}] does not match [{answer}], received [{r}]" + expected_result = expected.get(key) + + # Fix the assertion to check for substring match + if ( + idx is None or (idx == "index" and method != ".iloc") + ) and "0b" in expected_result: + # For successful numeric results, exact match is needed + assert expected_result == answer, ( + f"[{key}] does not match [{answer}]" ) + else: + # For error messages, substring match is sufficient + assert expected_result in answer, f"[{key}] not found in [{answer}]" def test_iloc_non_unique_indexing(self): # GH 4017, non-unique indexing (on the axis) From 326b91cbc89d05ad3b82202c1c62e4ef31ae8aac Mon Sep 17 00:00:00 2001 From: arthurlw Date: Fri, 21 Mar 2025 13:46:31 -0700 Subject: [PATCH 04/11] bugfix test_iloc_mask --- pandas/tests/indexing/test_iloc.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 510ec8260786f..def550ff410a0 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -734,7 +734,6 @@ def test_iloc_mask(self): df.iloc[mask] mask.index = range(len(mask)) - result = df.iloc[mask] msg = "Unalignable boolean Series provided as indexer" with pytest.raises(IndexingError, match=msg): df.iloc[mask] From 2c8174c76aded2402535e1c714bcd84a7c3d34f0 Mon Sep 17 00:00:00 2001 From: arthurlw Date: Fri, 21 Mar 2025 14:57:34 -0700 Subject: [PATCH 05/11] whatsnew --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index b4aa6447c0a1b..e9458046f6cde 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -68,6 +68,7 @@ Other enhancements - :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`) - :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`) - :meth:`DataFrame.apply` supports using third-party execution engines like the Bodo.ai JIT compiler (:issue:`60668`) +- :meth:`DataFrame.iloc` and :meth:`Series.iloc` now support boolean masks in ``__getitem__`` for more consistent indexing behavior (:issue:`60994`) - :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`) - :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`) - :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`) From 9345465f114bf9bbcfc5f1ff596fcbb7486b02da Mon Sep 17 00:00:00 2001 From: arthurlw Date: Sat, 29 Mar 2025 13:46:29 -0700 Subject: [PATCH 06/11] added test to test_iloc_mask --- pandas/tests/indexing/test_iloc.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index def550ff410a0..e88eac79239b4 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -742,6 +742,9 @@ def test_iloc_mask(self): result = df.iloc[np.array([True] * len(mask), dtype=bool)] tm.assert_frame_equal(result, df) + result2 = df.iloc[np.array([True, False, True, False, True], dtype=bool)] + tm.assert_frame_equal(result2, DataFrame({"a": [0, 2, 4]}, index=["A", "C", "E"])) + # the possibilities locs = np.arange(4) nums = 2**locs From 7533f64dfad439b256714c56203c8031e96cc02f Mon Sep 17 00:00:00 2001 From: arthurlw Date: Sat, 29 Mar 2025 13:47:51 -0700 Subject: [PATCH 07/11] formatting --- pandas/tests/indexing/test_iloc.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index e88eac79239b4..0f0611d13243b 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -770,7 +770,7 @@ def test_iloc_mask(self): for idx in [None, "index", "locs"]: mask = (df.nums > 2).values if idx: - mask_index = getattr(df, idx if idx == "index" else "locs")[::-1] + mask_index = getattr(df, idx)[::-1] mask = Series(mask, list(mask_index)) for method in ["", ".loc", ".iloc"]: try: @@ -779,10 +779,7 @@ def test_iloc_mask(self): else: accessor = df answer = str(bin(accessor[mask]["nums"].sum())) - except ( - ValueError, - IndexingError, - ) as err: + except (ValueError, IndexingError) as err: answer = str(err) key = ( From 35bf005cdef7aba53787899e163bbd3bcce81ff6 Mon Sep 17 00:00:00 2001 From: arthurlw Date: Sat, 29 Mar 2025 13:54:07 -0700 Subject: [PATCH 08/11] precommit --- pandas/tests/indexing/test_iloc.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 0f0611d13243b..7e4bf73de8b6b 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -743,7 +743,9 @@ def test_iloc_mask(self): tm.assert_frame_equal(result, df) result2 = df.iloc[np.array([True, False, True, False, True], dtype=bool)] - tm.assert_frame_equal(result2, DataFrame({"a": [0, 2, 4]}, index=["A", "C", "E"])) + tm.assert_frame_equal( + result2, DataFrame({"a": [0, 2, 4]}, index=["A", "C", "E"]) + ) # the possibilities locs = np.arange(4) From 6780260af737311b2047ac784618fa94a5fea48f Mon Sep 17 00:00:00 2001 From: arthurlw Date: Sat, 5 Apr 2025 02:06:52 -0700 Subject: [PATCH 09/11] added tests for series bool mask --- pandas/tests/indexing/test_iloc.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 7e4bf73de8b6b..fc057d3a23a90 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -742,11 +742,28 @@ def test_iloc_mask(self): result = df.iloc[np.array([True] * len(mask), dtype=bool)] tm.assert_frame_equal(result, df) - result2 = df.iloc[np.array([True, False, True, False, True], dtype=bool)] + result = df.iloc[np.array([True, False, True, False, True], dtype=bool)] tm.assert_frame_equal( - result2, DataFrame({"a": [0, 2, 4]}, index=["A", "C", "E"]) + result, DataFrame({"a": [0, 2, 4]}, index=["A", "C", "E"]) ) + # series (index does not match) + msg = "Unalignable boolean Series provided as indexer" + with pytest.raises(IndexingError, match=msg): + df.iloc[Series([True] * len(mask), dtype=bool)] + + df = DataFrame(list(range(5)), columns=["a"]) + + result = df.iloc[Series([True] * len(mask), dtype=bool)] + tm.assert_frame_equal(result, df) + + result = df.iloc[Series([True, False, True, False, True], dtype=bool)] + tm.assert_frame_equal( + result, DataFrame({"a": [0, 2, 4]}, index=[0, 2, 4]) + ) + + df = DataFrame(list(range(5)), index=list("ABCDE"), columns=["a"]) + # the possibilities locs = np.arange(4) nums = 2**locs From 1c92fc8786d46defd85fb3b781d1145066dd4df5 Mon Sep 17 00:00:00 2001 From: arthurlw Date: Sat, 5 Apr 2025 14:25:40 -0700 Subject: [PATCH 10/11] precommit --- pandas/tests/indexing/test_iloc.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index fc057d3a23a90..b4b5ce3a34def 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -758,9 +758,7 @@ def test_iloc_mask(self): tm.assert_frame_equal(result, df) result = df.iloc[Series([True, False, True, False, True], dtype=bool)] - tm.assert_frame_equal( - result, DataFrame({"a": [0, 2, 4]}, index=[0, 2, 4]) - ) + tm.assert_frame_equal(result, DataFrame({"a": [0, 2, 4]}, index=[0, 2, 4])) df = DataFrame(list(range(5)), index=list("ABCDE"), columns=["a"]) From 2481063045ea384bafb5113f0123d642fb165377 Mon Sep 17 00:00:00 2001 From: arthurlw Date: Tue, 8 Apr 2025 18:12:06 -0700 Subject: [PATCH 11/11] reformatted tests --- pandas/tests/indexing/test_iloc.py | 46 +++++++++++++++++------------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index b4b5ce3a34def..3be69617cad43 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -742,26 +742,6 @@ def test_iloc_mask(self): result = df.iloc[np.array([True] * len(mask), dtype=bool)] tm.assert_frame_equal(result, df) - result = df.iloc[np.array([True, False, True, False, True], dtype=bool)] - tm.assert_frame_equal( - result, DataFrame({"a": [0, 2, 4]}, index=["A", "C", "E"]) - ) - - # series (index does not match) - msg = "Unalignable boolean Series provided as indexer" - with pytest.raises(IndexingError, match=msg): - df.iloc[Series([True] * len(mask), dtype=bool)] - - df = DataFrame(list(range(5)), columns=["a"]) - - result = df.iloc[Series([True] * len(mask), dtype=bool)] - tm.assert_frame_equal(result, df) - - result = df.iloc[Series([True, False, True, False, True], dtype=bool)] - tm.assert_frame_equal(result, DataFrame({"a": [0, 2, 4]}, index=[0, 2, 4])) - - df = DataFrame(list(range(5)), index=list("ABCDE"), columns=["a"]) - # the possibilities locs = np.arange(4) nums = 2**locs @@ -817,6 +797,32 @@ def test_iloc_mask(self): # For error messages, substring match is sufficient assert expected_result in answer, f"[{key}] not found in [{answer}]" + def test_iloc_with_numpy_bool_array(self): + df = DataFrame(list(range(5)), index=list("ABCDE"), columns=["a"]) + result = df.iloc[np.array([True, False, True, False, True], dtype=bool)] + expected = DataFrame({"a": [0, 2, 4]}, index=["A", "C", "E"]) + tm.assert_frame_equal(result, expected) + + def test_iloc_series_mask_with_index_mismatch_raises(self): + df = DataFrame(list(range(5)), index=list("ABCDE"), columns=["a"]) + mask = df.a % 2 == 0 + msg = "Unalignable boolean Series provided as indexer" + with pytest.raises(IndexingError, match=msg): + df.iloc[Series([True] * len(mask), dtype=bool)] + + def test_iloc_series_mask_all_true(self): + df = DataFrame(list(range(5)), columns=["a"]) + mask = Series([True] * len(df), dtype=bool) + result = df.iloc[mask] + tm.assert_frame_equal(result, df) + + def test_iloc_series_mask_alternate_true(self): + df = DataFrame(list(range(5)), columns=["a"]) + mask = Series([True, False, True, False, True], dtype=bool) + result = df.iloc[mask] + expected = DataFrame({"a": [0, 2, 4]}, index=[0, 2, 4]) + tm.assert_frame_equal(result, expected) + def test_iloc_non_unique_indexing(self): # GH 4017, non-unique indexing (on the axis) df = DataFrame({"A": [0.1] * 3000, "B": [1] * 3000})