Skip to content

Commit

Permalink
Avoid double MultiIndex factorization in groupby index result (#17644)
Browse files Browse the repository at this point in the history
From an offline discussion, it was discovered that a `groupby` with multiple keys experienced a slowdown somewhere between 24.02 and 24.12. An identified hot spot was the creation of the resulting `MultiIndex` where each resulting level undergoes factorization upon construction (which changed somewhere in between these releases to fix consistency bugs)

While the eager factorization is a new performance penalty, I identified that this factorization was being done twice unnecessarily when performing an `agg`. This PR ensures we avoid creating this MultiIndex until necessary and cache this MultiIndex result to avoid double factorization in the future

```python
# PR
In [1]: import cudf
   ...: 
   ...: df_train = cudf.datasets.randomdata(nrows=50_000_000, dtypes={"label": int, "weekday": int, "cat_2": int, "brand": int})

In [2]:     target = "label"
   ...:     col = ['weekday', 'cat_2', 'brand']

In [3]: %%time
   ...: df_train[col + [target]].groupby(col).agg(['mean', 'count'])
   ...: 
   ...: 
# PR
CPU times: user 366 ms, sys: 109 ms, total: 474 ms
Wall time: 482 ms

#branch 25.02
CPU times: user 547 ms, sys: 112 ms, total: 659 ms
Wall time: 658 ms
```

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #17644
  • Loading branch information
mroeschke authored Dec 30, 2024
1 parent e50cde7 commit 88d9251
Showing 1 changed file with 12 additions and 13 deletions.
25 changes: 12 additions & 13 deletions python/cudf/cudf/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from __future__ import annotations

import copy
import functools
import itertools
import textwrap
import types
Expand Down Expand Up @@ -30,7 +31,7 @@
from cudf.core._internals import aggregation, sorting
from cudf.core.abc import Serializable
from cudf.core.buffer import acquire_spill_lock
from cudf.core.column.column import ColumnBase, as_column
from cudf.core.column.column import ColumnBase, as_column, column_empty
from cudf.core.column_accessor import ColumnAccessor
from cudf.core.copy_types import GatherMap
from cudf.core.dtypes import (
Expand Down Expand Up @@ -747,7 +748,7 @@ def _groupby(self) -> types.SimpleNamespace:
plc.Table(
[
col.to_pylibcudf(mode="read")
for col in self.grouping.keys._columns
for col in self.grouping._key_columns
]
),
plc.types.NullPolicy.EXCLUDE
Expand Down Expand Up @@ -1049,8 +1050,8 @@ def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs):
) and not _is_all_scan_aggregate(normalized_aggs):
# Even with `sort=False`, pandas guarantees that
# groupby preserves the order of rows within each group.
left_cols = list(self.grouping.keys.drop_duplicates()._columns)
right_cols = list(result_index._columns)
left_cols = self.grouping.keys.drop_duplicates()._columns
right_cols = result_index._columns
join_keys = [
_match_join_keys(lcol, rcol, "inner")
for lcol, rcol in zip(left_cols, right_cols)
Expand Down Expand Up @@ -2482,7 +2483,7 @@ def _cov_or_corr(self, func, method_name):

column_pair_groupby = cudf.DataFrame._from_data(
column_pair_structs
).groupby(by=self.grouping.keys)
).groupby(by=self.grouping)

try:
gb_cov_corr = column_pair_groupby.agg(func)
Expand Down Expand Up @@ -3506,7 +3507,9 @@ def _handle_by_or_level(self, by=None, level=None):
self._handle_level(level)
else:
by_list = by if isinstance(by, list) else [by]

if not len(self._obj) and not len(by_list):
# We pretend to groupby an empty column
by_list = [cudf.Index._from_column(column_empty(0))]
for by in by_list:
if callable(by):
self._handle_callable(by)
Expand All @@ -3528,16 +3531,12 @@ def _handle_by_or_level(self, by=None, level=None):
except (KeyError, TypeError):
self._handle_misc(by)

@property
@functools.cached_property
def keys(self):
"""Return grouping key columns as index"""
nkeys = len(self._key_columns)

if nkeys == 0:
return cudf.Index([], name=None)
elif nkeys > 1:
if len(self._key_columns) > 1:
return cudf.MultiIndex._from_data(
dict(zip(range(nkeys), self._key_columns))
dict(enumerate(self._key_columns))
)._set_names(self.names)
else:
return cudf.Index._from_column(
Expand Down

0 comments on commit 88d9251

Please sign in to comment.