Skip to content

Commit

Permalink
[backport 2.3.x] TST (string dtype): resolve all easy xfails in pandas/tests/groupby (#60314) (#60317)
Browse files Browse the repository at this point in the history

TST (string dtype): resolve all easy xfails in pandas/tests/groupby (#60314)

(cherry picked from commit c4a2026)
  • Loading branch information
jorisvandenbossche authored Nov 14, 2024
1 parent c875a53 commit 9537b20
Show file tree
Hide file tree
Showing 13 changed files with 30 additions and 53 deletions.
8 changes: 2 additions & 6 deletions pandas/tests/groupby/aggregate/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.errors import SpecificationError

from pandas.core.dtypes.common import is_integer_dtype
Expand Down Expand Up @@ -335,12 +333,11 @@ def aggfun_1(ser):
assert len(result) == 0


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_wrap_agg_out(three_group):
grouped = three_group.groupby(["A", "B"])

def func(ser):
if ser.dtype == object:
if ser.dtype in (object, "string"):
raise TypeError("Test error message")
return ser.sum()

Expand Down Expand Up @@ -1101,7 +1098,6 @@ def test_lambda_named_agg(func):
tm.assert_frame_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_aggregate_mixed_types():
# GH 16916
df = DataFrame(
Expand All @@ -1113,7 +1109,7 @@ def test_aggregate_mixed_types():
expected = DataFrame(
expected_data,
index=Index([2, "group 1"], dtype="object", name="grouping"),
columns=Index(["X", "Y", "Z"], dtype="object"),
columns=Index(["X", "Y", "Z"]),
)
tm.assert_frame_equal(result, expected)

Expand Down
7 changes: 3 additions & 4 deletions pandas/tests/groupby/aggregate/test_cython.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.core.dtypes.common import (
is_float_dtype,
is_integer_dtype,
Expand Down Expand Up @@ -95,7 +93,6 @@ def test_cython_agg_boolean():
tm.assert_series_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_cython_agg_nothing_to_agg():
frame = DataFrame(
{"a": np.random.default_rng(2).integers(0, 5, 50), "b": ["foo", "bar"] * 25}
Expand All @@ -111,7 +108,9 @@ def test_cython_agg_nothing_to_agg():

result = frame[["b"]].groupby(frame["a"]).mean(numeric_only=True)
expected = DataFrame(
[], index=frame["a"].sort_values().drop_duplicates(), columns=[]
[],
index=frame["a"].sort_values().drop_duplicates(),
columns=Index([], dtype="str"),
)
tm.assert_frame_equal(result, expected)

Expand Down
6 changes: 2 additions & 4 deletions pandas/tests/groupby/aggregate/test_other.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.errors import SpecificationError

import pandas as pd
Expand Down Expand Up @@ -308,7 +306,6 @@ def test_series_agg_multikey():
tm.assert_series_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_series_agg_multi_pure_python():
data = DataFrame(
{
Expand Down Expand Up @@ -358,7 +355,8 @@ def test_series_agg_multi_pure_python():
)

def bad(x):
assert len(x.values.base) > 0
if isinstance(x.values, np.ndarray):
assert len(x.values.base) > 0
return "foo"

result = data.groupby(["A", "B"]).agg(bad)
Expand Down
5 changes: 1 addition & 4 deletions pandas/tests/groupby/methods/test_quantile.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

import pandas as pd
from pandas import (
DataFrame,
Expand Down Expand Up @@ -170,11 +168,10 @@ def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby,
tm.assert_frame_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_quantile_raises():
df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"])

msg = "dtype 'object' does not support operation 'quantile'"
msg = "dtype '(object|str)' does not support operation 'quantile'"
with pytest.raises(TypeError, match=msg):
df.groupby("key").quantile()

Expand Down
2 changes: 2 additions & 0 deletions pandas/tests/groupby/methods/test_size.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,8 @@ def test_size_series_masked_type_returns_Int64(dtype):
tm.assert_series_equal(result, expected)


# TODO(infer_string) in case the column is object dtype, it should preserve that dtype
# for the result's index
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
def test_size_strings(any_string_dtype):
# GH#55627
Expand Down
9 changes: 5 additions & 4 deletions pandas/tests/groupby/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

import pandas as pd
from pandas import (
Categorical,
Expand Down Expand Up @@ -340,15 +338,18 @@ def test_apply(ordered):
tm.assert_series_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
def test_observed(observed):
def test_observed(request, using_infer_string, observed):
# multiple groupers, don't re-expand the output space
# of the grouper
# gh-14942 (implement)
# gh-10132 (back-compat)
# gh-8138 (back-compat)
# gh-8869

if using_infer_string and not observed:
# TODO(infer_string) this fails with filling the string column with 0
request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))

cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True)
cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True)
df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
Expand Down
9 changes: 3 additions & 6 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1617,7 +1617,6 @@ def test_groupby_two_group_keys_all_nan():
assert result == {}


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_groupby_2d_malformed():
d = DataFrame(index=range(2))
d["group"] = ["g1", "g2"]
Expand All @@ -1626,7 +1625,7 @@ def test_groupby_2d_malformed():
d["label"] = ["l1", "l2"]
tmp = d.groupby(["group"]).mean(numeric_only=True)
res_values = np.array([[0.0, 1.0], [0.0, 1.0]])
tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"]))
tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"], dtype=object))
tm.assert_numpy_array_equal(tmp.values, res_values)


Expand Down Expand Up @@ -2711,7 +2710,6 @@ def test_groupby_all_nan_groups_drop():
tm.assert_series_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize("numeric_only", [True, False])
def test_groupby_empty_multi_column(as_index, numeric_only):
# GH 15106 & GH 41998
Expand All @@ -2720,15 +2718,14 @@ def test_groupby_empty_multi_column(as_index, numeric_only):
result = gb.sum(numeric_only=numeric_only)
if as_index:
index = MultiIndex([[], []], [[], []], names=["A", "B"])
columns = ["C"] if not numeric_only else []
columns = ["C"] if not numeric_only else Index([], dtype="str")
else:
index = RangeIndex(0)
columns = ["A", "B", "C"] if not numeric_only else ["A", "B"]
expected = DataFrame([], columns=columns, index=index)
tm.assert_frame_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_groupby_aggregation_non_numeric_dtype():
# GH #43108
df = DataFrame(
Expand All @@ -2739,7 +2736,7 @@ def test_groupby_aggregation_non_numeric_dtype():
{
"v": [[1, 1], [10, 20]],
},
index=Index(["M", "W"], dtype="object", name="MW"),
index=Index(["M", "W"], name="MW"),
)

gb = df.groupby(by=["MW"])
Expand Down
5 changes: 1 addition & 4 deletions pandas/tests/groupby/test_groupby_dropna.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.compat.pyarrow import pa_version_under10p1

from pandas.core.dtypes.missing import na_value_for_dtype
Expand Down Expand Up @@ -99,7 +97,6 @@ def test_groupby_dropna_multi_index_dataframe_nan_in_two_groups(
tm.assert_frame_equal(grouped, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize(
"dropna, idx, outputs",
[
Expand All @@ -126,7 +123,7 @@ def test_groupby_dropna_normal_index_dataframe(dropna, idx, outputs):
df = pd.DataFrame(df_list, columns=["a", "b", "c", "d"])
grouped = df.groupby("a", dropna=dropna).sum()

expected = pd.DataFrame(outputs, index=pd.Index(idx, dtype="object", name="a"))
expected = pd.DataFrame(outputs, index=pd.Index(idx, name="a"))

tm.assert_frame_equal(grouped, expected)

Expand Down
10 changes: 4 additions & 6 deletions pandas/tests/groupby/test_grouping.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

import pandas as pd
from pandas import (
CategoricalIndex,
Expand Down Expand Up @@ -844,7 +842,6 @@ def test_groupby_empty(self):
expected = ["name"]
assert result == expected

@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_groupby_level_index_value_all_na(self):
# issue 20519
df = DataFrame(
Expand All @@ -854,7 +851,7 @@ def test_groupby_level_index_value_all_na(self):
expected = DataFrame(
data=[],
index=MultiIndex(
levels=[Index(["x"], dtype="object"), Index([], dtype="float64")],
levels=[Index(["x"], dtype="str"), Index([], dtype="float64")],
codes=[[], []],
names=["A", "B"],
),
Expand Down Expand Up @@ -989,12 +986,13 @@ def test_groupby_with_empty(self):
grouped = series.groupby(grouper)
assert next(iter(grouped), None) is None

@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_groupby_with_single_column(self):
df = DataFrame({"a": list("abssbab")})
tm.assert_frame_equal(df.groupby("a").get_group("a"), df.iloc[[0, 5]])
# GH 13530
exp = DataFrame(index=Index(["a", "b", "s"], name="a"), columns=[])
exp = DataFrame(
index=Index(["a", "b", "s"], name="a"), columns=Index([], dtype="str")
)
tm.assert_frame_equal(df.groupby("a").count(), exp)
tm.assert_frame_equal(df.groupby("a").sum(), exp)

Expand Down
6 changes: 1 addition & 5 deletions pandas/tests/groupby/test_pipe.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

import pandas as pd
from pandas import (
Expand All @@ -11,7 +8,6 @@
import pandas._testing as tm


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_pipe():
# Test the pipe method of DataFrameGroupBy.
# Issue #17871
Expand Down Expand Up @@ -39,7 +35,7 @@ def square(srs):
# NDFrame.pipe methods
result = df.groupby("A").pipe(f).pipe(square)

index = Index(["bar", "foo"], dtype="object", name="A")
index = Index(["bar", "foo"], name="A")
expected = pd.Series([3.749306591013693, 6.717707873081384], name="B", index=index)

tm.assert_series_equal(expected, result)
Expand Down
7 changes: 2 additions & 5 deletions pandas/tests/groupby/test_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas._libs.tslibs import iNaT

from pandas.core.dtypes.common import pandas_dtype
Expand Down Expand Up @@ -457,8 +455,7 @@ def test_max_min_non_numeric():
assert "ss" in result


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_max_min_object_multiple_columns(using_array_manager):
def test_max_min_object_multiple_columns(using_array_manager, using_infer_string):
# GH#41111 case where the aggregation is valid for some columns but not
# others; we split object blocks column-wise, consistent with
# DataFrame._reduce
Expand All @@ -472,7 +469,7 @@ def test_max_min_object_multiple_columns(using_array_manager):
)
df._consolidate_inplace() # should already be consolidate, but double-check
if not using_array_manager:
assert len(df._mgr.blocks) == 2
assert len(df._mgr.blocks) == 3 if using_infer_string else 2

gb = df.groupby("A")

Expand Down
2 changes: 2 additions & 0 deletions pandas/tests/groupby/test_timegrouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ def groupby_with_truncated_bingrouper(frame_for_truncated_bingrouper):


class TestGroupBy:
# TODO(infer_string) resample sum introduces 0's
# https://github.com/pandas-dev/pandas/issues/60229
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_groupby_with_timegrouper(self):
# GH 4161
Expand Down
7 changes: 2 additions & 5 deletions pandas/tests/groupby/transform/test_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas._libs import lib

from pandas.core.dtypes.common import ensure_platform_int
Expand Down Expand Up @@ -1229,20 +1227,19 @@ def test_groupby_transform_with_datetimes(func, values):
tm.assert_series_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_groupby_transform_dtype():
# GH 22243
df = DataFrame({"a": [1], "val": [1.35]})

result = df["val"].transform(lambda x: x.map(lambda y: f"+{y}"))
expected1 = Series(["+1.35"], name="val", dtype="object")
expected1 = Series(["+1.35"], name="val")
tm.assert_series_equal(result, expected1)

result = df.groupby("a")["val"].transform(lambda x: x.map(lambda y: f"+{y}"))
tm.assert_series_equal(result, expected1)

result = df.groupby("a")["val"].transform(lambda x: x.map(lambda y: f"+({y})"))
expected2 = Series(["+(1.35)"], name="val", dtype="object")
expected2 = Series(["+(1.35)"], name="val")
tm.assert_series_equal(result, expected2)

df["val"] = df["val"].astype(object)
Expand Down

0 comments on commit 9537b20

Please sign in to comment.