Avoid double MultiIndex factorization in groupby index result (#17644)

From an offline discussion, it was discovered that a `groupby` with multiple keys experienced a slowdown somewhere between 24.02 and 24.12. An identified hot spot was the creation of the resulting `MultiIndex` where each resulting level undergoes factorization upon construction (which changed somewhere in between these releases to fix consistency bugs) While the eager factorization is a new performance penalty, I identified that this factorization was being done twice unnecessarily when performing an `agg`. This PR ensures we avoid creating this MultiIndex until necessary and cache this MultiIndex result to avoid double factorization in the future ```python # PR In [1]: import cudf ...: ...: df_train = cudf.datasets.randomdata(nrows=50_000_000, dtypes={"label": int, "weekday": int, "cat_2": int, "brand": int}) In [2]: target = "label" ...: col = ['weekday', 'cat_2', 'brand'] In [3]: %%time ...: df_train[col + [target]].groupby(col).agg(['mean', 'count']) ...: ...: # PR CPU times: user 366 ms, sys: 109 ms, total: 474 ms Wall time: 482 ms #branch 25.02 CPU times: user 547 ms, sys: 112 ms, total: 659 ms Wall time: 658 ms ``` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: #17644
rapidsai · Dec 30, 2024 · 88d9251 · 88d9251
1 parent e50cde7
commit 88d9251
Showing 1 changed file with 12 additions and 13 deletions.
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
@@ -2,6 +2,7 @@
 from __future__ import annotations
 
 import copy
+import functools
 import itertools
 import textwrap
 import types
@@ -30,7 +31,7 @@
 from cudf.core._internals import aggregation, sorting
 from cudf.core.abc import Serializable
 from cudf.core.buffer import acquire_spill_lock
-from cudf.core.column.column import ColumnBase, as_column
+from cudf.core.column.column import ColumnBase, as_column, column_empty
 from cudf.core.column_accessor import ColumnAccessor
 from cudf.core.copy_types import GatherMap
 from cudf.core.dtypes import (
@@ -747,7 +748,7 @@ def _groupby(self) -> types.SimpleNamespace:
                 plc.Table(
                     [
                         col.to_pylibcudf(mode="read")
-                        for col in self.grouping.keys._columns
+                        for col in self.grouping._key_columns
                     ]
                 ),
                 plc.types.NullPolicy.EXCLUDE
@@ -1049,8 +1050,8 @@ def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs):
             ) and not _is_all_scan_aggregate(normalized_aggs):
                 # Even with `sort=False`, pandas guarantees that
                 # groupby preserves the order of rows within each group.
-                left_cols = list(self.grouping.keys.drop_duplicates()._columns)
-                right_cols = list(result_index._columns)
+                left_cols = self.grouping.keys.drop_duplicates()._columns
+                right_cols = result_index._columns
                 join_keys = [
                     _match_join_keys(lcol, rcol, "inner")
                     for lcol, rcol in zip(left_cols, right_cols)
@@ -2482,7 +2483,7 @@ def _cov_or_corr(self, func, method_name):
 
         column_pair_groupby = cudf.DataFrame._from_data(
             column_pair_structs
-        ).groupby(by=self.grouping.keys)
+        ).groupby(by=self.grouping)
 
         try:
             gb_cov_corr = column_pair_groupby.agg(func)
@@ -3506,7 +3507,9 @@ def _handle_by_or_level(self, by=None, level=None):
                 self._handle_level(level)
         else:
             by_list = by if isinstance(by, list) else [by]
-
+            if not len(self._obj) and not len(by_list):
+                # We pretend to groupby an empty column
+                by_list = [cudf.Index._from_column(column_empty(0))]
             for by in by_list:
                 if callable(by):
                     self._handle_callable(by)
@@ -3528,16 +3531,12 @@ def _handle_by_or_level(self, by=None, level=None):
                     except (KeyError, TypeError):
                         self._handle_misc(by)
 
-    @property
+    @functools.cached_property
     def keys(self):
         """Return grouping key columns as index"""
-        nkeys = len(self._key_columns)
-
-        if nkeys == 0:
-            return cudf.Index([], name=None)
-        elif nkeys > 1:
+        if len(self._key_columns) > 1:
             return cudf.MultiIndex._from_data(
-                dict(zip(range(nkeys), self._key_columns))
+                dict(enumerate(self._key_columns))
             )._set_names(self.names)
         else:
             return cudf.Index._from_column(