From bb45798f619f29683da452f788e73b40040d1455 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Tue, 19 Nov 2024 14:34:33 +0100
Subject: [PATCH] Fix value_counts with split_out != 1 (#1170)

---
 dask_expr/_reductions.py        | 3 ++-
 dask_expr/tests/test_groupby.py | 7 +++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/dask_expr/_reductions.py b/dask_expr/_reductions.py
index 357b4849..ce4a0de7 100644
--- a/dask_expr/_reductions.py
+++ b/dask_expr/_reductions.py
@@ -244,7 +244,8 @@ def _lower(self):
 
         # Reset the index if we we used it for shuffling
         if split_by_index:
-            shuffled = SetIndexBlockwise(shuffled, split_by, True, None)
+            idx = list(self._meta.index.names) if split_by != ["index"] else split_by
+            shuffled = SetIndexBlockwise(shuffled, idx, True, None)
 
         # Convert back to Series if necessary
         if self.shuffle_by_index is not False:
diff --git a/dask_expr/tests/test_groupby.py b/dask_expr/tests/test_groupby.py
index e5b45e6d..05ca7f3c 100644
--- a/dask_expr/tests/test_groupby.py
+++ b/dask_expr/tests/test_groupby.py
@@ -157,6 +157,13 @@ def test_groupby_no_numeric_only(pdf, func):
     assert_eq(agg, expect)
 
 
+def test_value_counts_split_out(pdf):
+    df = from_pandas(pdf, npartitions=10)
+    result = df.groupby("x").y.value_counts(split_out=True)
+    expected = pdf.groupby("x").y.value_counts()
+    assert_eq(result, expected)
+
+
 def test_unique(df, pdf):
     result = df.groupby("x")["y"].unique()
     expected = pdf.groupby("x")["y"].unique()