From 07ee82bb48e8c77f268ed9ce705d9a4bd5a8f32b Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Sat, 4 Jan 2025 11:15:07 -0600 Subject: [PATCH] Implement `.dt.total_seconds` (#17659) Fixes: #16802 This PR implements `.dt.total_seconds` Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/17659 --- python/cudf/cudf/core/column/timedelta.py | 13 ++++- python/cudf/cudf/core/index.py | 14 ++--- python/cudf/cudf/core/series.py | 62 ++++++++++++++++++++++- python/cudf/cudf/tests/test_timedelta.py | 24 ++++++++- 4 files changed, 102 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 417fa99dac0..749ab8e837a 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -1,9 +1,10 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
from __future__ import annotations import datetime import functools +import math from typing import TYPE_CHECKING, cast import numpy as np @@ -263,7 +264,15 @@ def time_unit(self) -> str: return np.datetime_data(self.dtype)[0] def total_seconds(self) -> ColumnBase: - raise NotImplementedError("total_seconds is currently not implemented") + conversion = _unit_to_nanoseconds_conversion[self.time_unit] / 1e9 + # Cast to decimal128 and round to avoid floating point precision issues + # https://github.com/rapidsai/cudf/issues/17664 + return ( + (self.astype("int64") * conversion) + .astype(cudf.Decimal128Dtype(38, 9)) + .round(decimals=abs(round(math.log10(conversion)))) + .astype("float64") + ) def ceil(self, freq: str) -> ColumnBase: raise NotImplementedError("ceil is currently not implemented") diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index eac5b9d71ae..85be8d21d27 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. 
from __future__ import annotations @@ -842,14 +842,14 @@ def sort_values( @_performance_tracking def _gather(self, gather_map, nullify=False, check_bounds=True): gather_map = cudf.core.column.as_column(gather_map) - return cudf.Index._from_column( + return Index._from_column( self._column.take(gather_map, nullify, check_bounds), name=self.name, ) @_performance_tracking def _apply_boolean_mask(self, boolean_mask): - return cudf.Index._from_column( + return Index._from_column( self._column.apply_boolean_mask(boolean_mask), name=self.name ) @@ -857,7 +857,7 @@ def repeat(self, repeats, axis=None): return self._as_int_index().repeat(repeats, axis) def _split(self, splits): - return cudf.Index._from_column( + return Index._from_column( self._as_int_index()._split(splits), name=self.name ) @@ -1657,7 +1657,7 @@ def _clean_nulls_from_index(self) -> Index: if isinstance(self, (DatetimeIndex, TimedeltaIndex)) else str(cudf.NA) ) - return cudf.Index._from_column( + return Index._from_column( self._column.astype("str").fillna(fill_value), name=self.name, ) @@ -2964,13 +2964,13 @@ def median(self, *, skipna: bool = True, axis: int | None = 0): def std(self, *, skipna: bool = True, axis: int | None = 0, ddof: int = 1): return self._column.std(skipna=skipna, ddof=ddof) - def total_seconds(self) -> cupy.ndarray: + def total_seconds(self) -> Index: """ Return total duration of each element expressed in seconds. - This method is currently not implemented. + The result is a float64 Index that keeps this index's name. """ - return self._column.total_seconds().values + return Index._from_column(self._column.total_seconds(), name=self.name) def ceil(self, freq: str) -> Self: """ diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 961e5e11bc0..49c2c8cf387 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. 
from __future__ import annotations @@ -5183,6 +5183,66 @@ def components(self) -> cudf.DataFrame: ca, index=self.series.index ) + def total_seconds(self) -> Series: + """ + Return total duration of each element expressed in seconds. + + This method is available directly on TimedeltaIndex + and on Series containing timedelta values under the ``.dt`` namespace. + + Returns + ------- + Index or Series + When the calling object is a TimedeltaIndex, + the return type is an Index with a float64 dtype. When the calling object + is a Series, the return type is Series of type ``float64`` whose + index is the same as the original. + + See Also + -------- + datetime.timedelta.total_seconds : Standard library version + of this method. + TimedeltaIndex.components : Return a DataFrame with components of + each Timedelta. + + Examples + -------- + **Series** + + >>> import cudf + >>> import pandas as pd + >>> import numpy as np + >>> s = cudf.Series(pd.to_timedelta(np.arange(5), unit="D")) + >>> s + 0 0 days 00:00:00 + 1 1 days 00:00:00 + 2 2 days 00:00:00 + 3 3 days 00:00:00 + 4 4 days 00:00:00 + dtype: timedelta64[ns] + + >>> s.dt.total_seconds() + 0 0.0 + 1 86400.0 + 2 172800.0 + 3 259200.0 + 4 345600.0 + dtype: float64 + + **TimedeltaIndex** + + >>> idx = cudf.from_pandas(pd.to_timedelta(np.arange(5), unit="D")) + >>> idx + TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'], + dtype='timedelta64[ns]', freq=None) + + >>> idx.total_seconds() + Index([0.0, 86400.0, 172800.0, 259200.0, 345600.0], dtype='float64') + """ + return self._return_result_like_self( + self.series._column.total_seconds() + ) + @_performance_tracking def _align_indices(series_list, how="outer", allow_non_unique=False): diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py index d622ff6b94e..f1da2a060ec 100644 --- a/python/cudf/cudf/tests/test_timedelta.py +++ b/python/cudf/cudf/tests/test_timedelta.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA 
CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. import datetime import operator @@ -1506,3 +1506,25 @@ def test_tdi_unit(): result = pd_tdi.unit expected = cudf_tdi.unit assert result == expected + + +@pytest.mark.parametrize("data", _TIMEDELTA_DATA) +@pytest.mark.parametrize("dtype", utils.TIMEDELTA_TYPES) +def test_timedelta_series_total_seconds(data, dtype): + gsr = cudf.Series(data, dtype=dtype) + psr = gsr.to_pandas() + + expected = psr.dt.total_seconds() + actual = gsr.dt.total_seconds() + assert_eq(expected, actual) + + +@pytest.mark.parametrize("data", _TIMEDELTA_DATA) +@pytest.mark.parametrize("dtype", utils.TIMEDELTA_TYPES) +def test_timedelta_index_total_seconds(data, dtype): + gi = cudf.Index(data, dtype=dtype) + pi = gi.to_pandas() + + expected = pi.total_seconds() + actual = gi.total_seconds() + assert_eq(expected, actual)