From 88d925187566cf187a9a4d5c4619e00e2a9fb9b1 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 30 Dec 2024 14:48:13 -0800 Subject: [PATCH] Avoid double MultiIndex factorization in groupby index result (#17644) From an offline discussion, it was discovered that a `groupby` with multiple keys experienced a slowdown somewhere between 24.02 and 24.12. An identified hot spot was the creation of the resulting `MultiIndex` where each resulting level undergoes factorization upon construction (which changed somewhere in between these releases to fix consistency bugs) While the eager factorization is a new performance penalty, I identified that this factorization was being done twice unnecessarily when performing an `agg`. This PR ensures we avoid creating this MultiIndex until necessary and cache this MultiIndex result to avoid double factorization in the future ```python # PR In [1]: import cudf ...: ...: df_train = cudf.datasets.randomdata(nrows=50_000_000, dtypes={"label": int, "weekday": int, "cat_2": int, "brand": int}) In [2]: target = "label" ...: col = ['weekday', 'cat_2', 'brand'] In [3]: %%time ...: df_train[col + [target]].groupby(col).agg(['mean', 'count']) ...: ...: # PR CPU times: user 366 ms, sys: 109 ms, total: 474 ms Wall time: 482 ms #branch 25.02 CPU times: user 547 ms, sys: 112 ms, total: 659 ms Wall time: 658 ms ``` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17644 --- python/cudf/cudf/core/groupby/groupby.py | 25 ++++++++++++------------ 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 9b510d4d308..4137109cc96 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -2,6 +2,7 @@ from __future__ import annotations import copy +import functools import itertools import textwrap import types @@ -30,7 +31,7 @@ from cudf.core._internals import aggregation, sorting from cudf.core.abc import Serializable from cudf.core.buffer import acquire_spill_lock -from cudf.core.column.column import ColumnBase, as_column +from cudf.core.column.column import ColumnBase, as_column, column_empty from cudf.core.column_accessor import ColumnAccessor from cudf.core.copy_types import GatherMap from cudf.core.dtypes import ( @@ -747,7 +748,7 @@ def _groupby(self) -> types.SimpleNamespace: plc.Table( [ col.to_pylibcudf(mode="read") - for col in self.grouping.keys._columns + for col in self.grouping._key_columns ] ), plc.types.NullPolicy.EXCLUDE @@ -1049,8 +1050,8 @@ def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): ) and not _is_all_scan_aggregate(normalized_aggs): # Even with `sort=False`, pandas guarantees that # groupby preserves the order of rows within each group. - left_cols = list(self.grouping.keys.drop_duplicates()._columns) - right_cols = list(result_index._columns) + left_cols = self.grouping.keys.drop_duplicates()._columns + right_cols = result_index._columns join_keys = [ _match_join_keys(lcol, rcol, "inner") for lcol, rcol in zip(left_cols, right_cols) @@ -2482,7 +2483,7 @@ def _cov_or_corr(self, func, method_name): column_pair_groupby = cudf.DataFrame._from_data( column_pair_structs - ).groupby(by=self.grouping.keys) + ).groupby(by=self.grouping) try: gb_cov_corr = column_pair_groupby.agg(func) @@ -3506,7 +3507,9 @@ def _handle_by_or_level(self, by=None, level=None): self._handle_level(level) else: by_list = by if isinstance(by, list) else [by] - + if not len(self._obj) and not len(by_list): + # We pretend to groupby an empty column + by_list = [cudf.Index._from_column(column_empty(0))] for by in by_list: if callable(by): self._handle_callable(by) @@ -3528,16 +3531,12 @@ def _handle_by_or_level(self, by=None, level=None): except (KeyError, TypeError): self._handle_misc(by) - @property + @functools.cached_property def keys(self): """Return grouping key columns as index""" - nkeys = len(self._key_columns) - - if nkeys == 0: - return cudf.Index([], name=None) - elif nkeys > 1: + if len(self._key_columns) > 1: return cudf.MultiIndex._from_data( - dict(zip(range(nkeys), self._key_columns)) + dict(enumerate(self._key_columns)) )._set_names(self.names) else: return cudf.Index._from_column(