Skip to content

Commit

Permalink
Avoid shallow copies in groupby methods (#17646)
Browse files Browse the repository at this point in the history
Noticed while working on #17644 that `diff` and `fillna` were make some unnecessary shallow copies of the `grouping.value` object. Also noticed that `_cov_or_corr` just pulled the column names out of `grouping.value` object, so made a separate API, `values_column_names` to just create the column names without pulling out the actual columns.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: #17646
  • Loading branch information
mroeschke authored Jan 4, 2025
1 parent 07406b3 commit 756d66b
Showing 1 changed file with 17 additions and 15 deletions.
32 changes: 17 additions & 15 deletions python/cudf/cudf/core/groupby/groupby.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
# Copyright (c) 2020-2025, NVIDIA CORPORATION.
from __future__ import annotations

import copy
Expand Down Expand Up @@ -49,7 +49,7 @@
from cudf.utils.utils import GetAttrGetItemMixin

if TYPE_CHECKING:
from collections.abc import Generator, Iterable
from collections.abc import Generator, Hashable, Iterable

from cudf._typing import (
AggType,
Expand Down Expand Up @@ -2448,7 +2448,7 @@ def _cov_or_corr(self, func, method_name):
# create expanded dataframe consisting all combinations of the
# struct columns-pairs to be used in the correlation or covariance
# i.e. (('col1', 'col1'), ('col1', 'col2'), ('col2', 'col2'))
column_names = self.grouping.values._column_names
column_names = self.grouping._values_column_names
num_cols = len(column_names)

column_pair_structs = {}
Expand Down Expand Up @@ -2682,10 +2682,8 @@ def diff(self, periods=1, axis=0):

if not axis == 0:
raise NotImplementedError("Only axis=0 is supported.")

values = self.obj.__class__._from_data(
self.grouping.values._data, self.obj.index
)
values = self.grouping.values
values.index = self.obj.index
return values - self.shift(periods=periods)

def _scan_fill(self, method: str, limit: int) -> DataFrameOrSeries:
Expand Down Expand Up @@ -2789,9 +2787,8 @@ def fillna(
raise ValueError("Method can only be of 'ffill', 'bfill'.")
return getattr(self, method, limit)()

values = self.obj.__class__._from_data(
self.grouping.values._data, self.obj.index
)
values = self.grouping.values
values.index = self.obj.index
return values.fillna(
value=value, inplace=inplace, axis=axis, limit=limit
)
Expand Down Expand Up @@ -3543,6 +3540,13 @@ def keys(self):
self._key_columns[0], name=self.names[0]
)

@property
def _values_column_names(self) -> list[Hashable]:
# If the key columns are in `obj`, filter them out
return [
x for x in self._obj._column_names if x not in self._named_columns
]

@property
def values(self) -> cudf.core.frame.Frame:
"""Return value columns as a frame.
Expand All @@ -3553,11 +3557,9 @@ def values(self) -> cudf.core.frame.Frame:
This is mainly used in transform-like operations.
"""
# If the key columns are in `obj`, filter them out
value_column_names = [
x for x in self._obj._column_names if x not in self._named_columns
]
value_columns = self._obj._data.select_by_label(value_column_names)
value_columns = self._obj._data.select_by_label(
self._values_column_names
)
return self._obj.__class__._from_data(value_columns)

def _handle_callable(self, by):
Expand Down

0 comments on commit 756d66b

Please sign in to comment.