From 88d925187566cf187a9a4d5c4619e00e2a9fb9b1 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 30 Dec 2024 14:48:13 -0800
Subject: [PATCH] Avoid double MultiIndex factorization in groupby index result
 (#17644)

From an offline discussion, it was discovered that a `groupby` with multiple keys experienced a slowdown somewhere between 24.02 and 24.12. An identified hot spot was the creation of the resulting `MultiIndex` where each resulting level undergoes factorization upon construction (which changed somewhere in between these releases to fix consistency bugs)

While the eager factorization is a new performance penalty, I identified that this factorization was being done twice unnecessarily when performing an `agg`. This PR ensures we avoid creating this MultiIndex until necessary and cache this MultiIndex result to avoid double factorization in the future

```python
# PR
In [1]: import cudf
   ...:
   ...: df_train = cudf.datasets.randomdata(nrows=50_000_000, dtypes={"label": int, "weekday": int, "cat_2": int, "brand": int})

In [2]:     target = "label"
   ...:     col = ['weekday', 'cat_2', 'brand']

In [3]: %%time
   ...: df_train[col + [target]].groupby(col).agg(['mean', 'count'])
   ...:
   ...:
# PR
CPU times: user 366 ms, sys: 109 ms, total: 474 ms
Wall time: 482 ms

#branch 25.02
CPU times: user 547 ms, sys: 112 ms, total: 659 ms
Wall time: 658 ms
```

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/17644
---
 python/cudf/cudf/core/groupby/groupby.py | 25 ++++++++++++------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 9b510d4d308..4137109cc96 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -2,6 +2,7 @@
 from __future__ import annotations
 
 import copy
+import functools
 import itertools
 import textwrap
 import types
@@ -30,7 +31,7 @@
 from cudf.core._internals import aggregation, sorting
 from cudf.core.abc import Serializable
 from cudf.core.buffer import acquire_spill_lock
-from cudf.core.column.column import ColumnBase, as_column
+from cudf.core.column.column import ColumnBase, as_column, column_empty
 from cudf.core.column_accessor import ColumnAccessor
 from cudf.core.copy_types import GatherMap
 from cudf.core.dtypes import (
@@ -747,7 +748,7 @@ def _groupby(self) -> types.SimpleNamespace:
                 plc.Table(
                     [
                         col.to_pylibcudf(mode="read")
-                        for col in self.grouping.keys._columns
+                        for col in self.grouping._key_columns
                     ]
                 ),
                 plc.types.NullPolicy.EXCLUDE
@@ -1049,8 +1050,8 @@ def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs):
             ) and not _is_all_scan_aggregate(normalized_aggs):
                 # Even with `sort=False`, pandas guarantees that
                 # groupby preserves the order of rows within each group.
-                left_cols = list(self.grouping.keys.drop_duplicates()._columns)
-                right_cols = list(result_index._columns)
+                left_cols = self.grouping.keys.drop_duplicates()._columns
+                right_cols = result_index._columns
                 join_keys = [
                     _match_join_keys(lcol, rcol, "inner")
                     for lcol, rcol in zip(left_cols, right_cols)
@@ -2482,7 +2483,7 @@ def _cov_or_corr(self, func, method_name):
 
         column_pair_groupby = cudf.DataFrame._from_data(
             column_pair_structs
-        ).groupby(by=self.grouping.keys)
+        ).groupby(by=self.grouping)
 
         try:
             gb_cov_corr = column_pair_groupby.agg(func)
@@ -3506,7 +3507,9 @@ def _handle_by_or_level(self, by=None, level=None):
                 self._handle_level(level)
         else:
             by_list = by if isinstance(by, list) else [by]
-
+            if not len(self._obj) and not len(by_list):
+                # We pretend to groupby an empty column
+                by_list = [cudf.Index._from_column(column_empty(0))]
             for by in by_list:
                 if callable(by):
                     self._handle_callable(by)
@@ -3528,16 +3531,12 @@ def _handle_by_or_level(self, by=None, level=None):
                     except (KeyError, TypeError):
                         self._handle_misc(by)
 
-    @property
+    @functools.cached_property
     def keys(self):
         """Return grouping key columns as index"""
-        nkeys = len(self._key_columns)
-
-        if nkeys == 0:
-            return cudf.Index([], name=None)
-        elif nkeys > 1:
+        if len(self._key_columns) > 1:
             return cudf.MultiIndex._from_data(
-                dict(zip(range(nkeys), self._key_columns))
+                dict(enumerate(self._key_columns))
             )._set_names(self.names)
         else:
             return cudf.Index._from_column(