rapidsai · rapids-bot · Jan 29, 2025 · Dec 19, 2024 · Jan 24, 2025 · Jan 24, 2025
diff --git a/docs/cudf/source/cudf_pandas/faq.md b/docs/cudf/source/cudf_pandas/faq.md
@@ -142,6 +142,52 @@ cuDF (learn more in [this
 blog](https://medium.com/rapids-ai/easy-cpu-gpu-arrays-and-dataframes-run-your-dask-code-where-youd-like-e349d92351d)) and the [RAPIDS Accelerator for Apache Spark](https://nvidia.github.io/spark-rapids/)
 provides a similar configuration-based plugin for Spark.
 
+## How do I know if an object is a `cudf.pandas` proxy object?
+
+To determine if an object is a `cudf.pandas` proxy object, you can use the `is_cudf_pandas_obj` API. This function checks if the given object is a proxy object that wraps either a `cudf` or `pandas` object. Here is an example of how to use this API:
+
+```python
+from cudf.pandas import is_cudf_pandas_obj
+
+obj = ...  # Your object here
+if is_cudf_pandas_obj(obj):
+    print("The object is a cudf.pandas proxy object.")
+else:
+    print("The object is not a cudf.pandas proxy object.")
+```
+
+There are APIs to detect `Series`, `DataFrame`, `Index`, and `ndarray` objects separately:
+
+* `is_cudf_pandas_series`: Detects if the object is a `cudf.pandas` proxy `Series`.
+* `is_cudf_pandas_dataframe`: Detects if the object is a `cudf.pandas` proxy `DataFrame`.
+* `is_cudf_pandas_index`: Detects if the object is a `cudf.pandas` proxy `Index`.
+* `is_cudf_pandas_ndarray`: Detects if the object is a `cudf.pandas` proxy `ndarray`.
+
+## How can I access the underlying GPU or CPU objects?
+
+When working with `cudf.pandas` proxy objects, it is sometimes necessary to get true `cudf` or `pandas` objects that reside on GPU or CPU.
+For example, this can be used to ensure that GPU-aware libraries that support both `cudf` and `pandas` can use the `cudf`-optimized code paths that keep data on GPU when processing `cudf.pandas` objects.
+Otherwise, the library might use less-optimized CPU code because it thinks that the `cudf.pandas` object is a plain `pandas` dataframe.
+
+The following methods can be used to retrieve the actual `cudf` or `pandas` objects:
+
+- `as_gpu_object()`: This method returns the `cudf` object from the proxy.
+- `as_cpu_object()`: This method returns the `pandas` object from the proxy.
+
+If `as_gpu_object()` is called on a proxy array, it returns a `cupy` array; likewise, `as_cpu_object()` returns a `numpy` array.
+
+Here is an example of how to use these methods:
+
+```python
+# Assuming `proxy_obj` is a cudf.pandas proxy object
+cudf_obj = proxy_obj.as_gpu_object()
+pandas_obj = proxy_obj.as_cpu_object()
+
+# Now you can use `cudf_obj` and `pandas_obj` with libraries that are cudf or pandas aware
+```
+
+Be aware that if `cudf.pandas` objects are converted to their underlying `cudf` or `pandas` types, the `cudf.pandas` proxy no longer controls them. This means that automatic conversion between GPU and CPU types and automatic fallback from GPU to CPU functionality will not occur.
+
 (are-there-any-known-limitations)=
 ## Are there any known limitations?
 

@@ -1251,7 +1251,7 @@ def as_categorical_column(self, dtype) -> ColumnBase:
             )
 
         # Categories must be unique and sorted in ascending order.
-        cats = self.unique().sort_values().astype(self.dtype)
+        cats = self.unique().sort_values()
         label_dtype = min_unsigned_type(len(cats))
         labels = self._label_encoding(
             cats=cats, dtype=label_dtype, na_sentinel=cudf.Scalar(1)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
@@ -96,6 +96,7 @@
 from cudf.utils.utils import (
     GetAttrGetItemMixin,
     _external_only_api,
+    _extract_from_proxy,
     _is_null_host_scalar,
 )
 
@@ -707,9 +708,23 @@ def __init__(
         if copy is not None:
             raise NotImplementedError("copy is not currently implemented.")
         super().__init__({}, index=cudf.Index([]))
+
         if nan_as_null is no_default:
             nan_as_null = not cudf.get_option("mode.pandas_compatible")
 
+        if cudf.get_option("mode.pandas_compatible"):
+            data, data_extracted = _extract_from_proxy(data)
+            index, index_extracted = _extract_from_proxy(index)
+            columns, columns_extracted = _extract_from_proxy(
+                columns, fast=False
+            )
+            if (
+                (data is None or data_extracted)
+                and (index is None or index_extracted)
+                and (columns is None or columns_extracted)
+            ) and (dtype is None and copy is None):
+                self.__dict__.update(data.__dict__)
+                return
         if isinstance(columns, (Series, cudf.BaseIndex)):
             columns = columns.to_pandas()
 

diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
@@ -57,7 +57,11 @@
     is_mixed_with_object_dtype,
 )
 from cudf.utils.performance_tracking import _performance_tracking
-from cudf.utils.utils import _warn_no_dask_cudf, search_range
+from cudf.utils.utils import (
+    _extract_from_proxy,
+    _warn_no_dask_cudf,
+    search_range,
+)
 
 if TYPE_CHECKING:
     from collections.abc import Generator, Iterable
@@ -1067,6 +1071,12 @@ class Index(SingleColumnFrame, BaseIndex, metaclass=IndexMeta):
     @_performance_tracking
     def __init__(self, data, **kwargs):
         name = _getdefault_name(data, name=kwargs.get("name"))
+        # In pandas-compatible mode, unwrap ``data`` if it exposes the
+        # proxy accessor used by ``_extract_from_proxy``.
+        if cudf.get_option("mode.pandas_compatible"):
+            data, data_extracted = _extract_from_proxy(data)
+            # Fast path: when something was unwrapped and no keyword
+            # arguments were given, adopt the unwrapped object's instance
+            # state wholesale and skip the normal construction below.
+            # NOTE(review): this assumes the unwrapped object is Index-like
+            # and has a ``__dict__`` -- confirm for proxy Series/arrays.
+            if data_extracted and len(kwargs) == 0:
+                self.__dict__.update(data.__dict__)
+                return
+
         super().__init__({name: data})
 
     @_performance_tracking

diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
@@ -68,6 +68,9 @@
     to_cudf_compatible_scalar,
 )
 from cudf.utils.performance_tracking import _performance_tracking
+from cudf.utils.utils import (
+    _extract_from_proxy,
+)
 
 if TYPE_CHECKING:
     from collections.abc import MutableMapping
@@ -626,6 +629,20 @@ def __init__(
     ):
         if nan_as_null is no_default:
             nan_as_null = not cudf.get_option("mode.pandas_compatible")
+
+        if cudf.get_option("mode.pandas_compatible"):
+            data, data_extracted = _extract_from_proxy(data)
+            index, _ = _extract_from_proxy(index)
+
+            if (
+                data_extracted
+                and index is None
+                and dtype is None
+                and name is None
+                and copy is False
+            ):
+                self.__dict__.update(data.__dict__)
+                return
         index_from_data = None
         name_from_data = None
         if data is None:

@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES.
 # All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
@@ -8,6 +8,13 @@
 import pylibcudf
 import rmm.mr
 
+from ._wrappers.numpy import is_cudf_pandas_ndarray
+from ._wrappers.pandas import (
+    is_cudf_pandas_dataframe,
+    is_cudf_pandas_index,
+    is_cudf_pandas_obj,
+    is_cudf_pandas_series,
+)
 from .fast_slow_proxy import is_proxy_object
 from .magics import load_ipython_extension
 from .profiler import Profiler

diff --git a/python/cudf/cudf/pandas/_wrappers/numpy.py b/python/cudf/cudf/pandas/_wrappers/numpy.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES.
 # All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
@@ -176,3 +176,7 @@ def ndarray__array_ufunc__(self, ufunc, method, *inputs, **kwargs):
     cupy._core.flags.Flags,
     _numpy_flagsobj,
 )
+
+
+def is_cudf_pandas_ndarray(obj):
+    """Return whether ``obj`` is a ``cudf.pandas`` proxy ``ndarray``."""
+    is_proxy = is_proxy_object(obj)
+    return is_proxy and isinstance(obj, ndarray)
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES.
 # All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 import abc
@@ -35,7 +35,9 @@
     _fast_slow_function_call,
     _FastSlowAttribute,
     _FunctionProxy,
+    _maybe_wrap_result,
     _Unusable,
+    is_proxy_object,
     make_final_proxy_type as _make_final_proxy_type,
     make_intermediate_proxy_type as _make_intermediate_proxy_type,
     register_proxy_func,
@@ -266,6 +268,12 @@ def custom_repr_html(obj):
     html_formatter.for_type(DataFrame, custom_repr_html)
 
 
+@property
+def _Series_dtype(self):
+    """Return the dtype of the object currently wrapped by this proxy.
+
+    Declared as a property because it is registered under ``"dtype"`` in
+    the proxy ``Series`` attributes; a plain function there would turn
+    ``Series.dtype`` into a method instead of an attribute.
+    """
+    # Fast path: read dtype straight from the wrapped object instead of
+    # round-tripping through a slow<->fast conversion.
+    return _maybe_wrap_result(self._fsproxy_wrapped.dtype, None)
+
+
 Series = make_final_proxy_type(
     "Series",
     cudf.Series,
@@ -285,6 +293,7 @@ def custom_repr_html(obj):
         "_constructor": _FastSlowAttribute("_constructor"),
         "_constructor_expanddim": _FastSlowAttribute("_constructor_expanddim"),
         "_accessors": set(),
+        "dtype": _Series_dtype,
     },
 )
 
@@ -1704,6 +1713,22 @@ def holiday_calendar_factory_wrapper(*args, **kwargs):
     )
 
 
+def is_cudf_pandas_obj(obj):
+    """Return whether ``obj`` is a ``cudf.pandas`` proxy object."""
+    return is_proxy_object(obj)
+
+
+def is_cudf_pandas_dataframe(obj):
+    """Return whether ``obj`` is a ``cudf.pandas`` proxy ``DataFrame``."""
+    is_proxy = is_proxy_object(obj)
+    return is_proxy and isinstance(obj, DataFrame)
+
+
+def is_cudf_pandas_series(obj):
+    """Return whether ``obj`` is a ``cudf.pandas`` proxy ``Series``."""
+    is_proxy = is_proxy_object(obj)
+    return is_proxy and isinstance(obj, Series)
+
+
+def is_cudf_pandas_index(obj):
+    """Return whether ``obj`` is a ``cudf.pandas`` proxy ``Index``."""
+    is_proxy = is_proxy_object(obj)
+    return is_proxy and isinstance(obj, Index)
+
+
 # timestamps and timedeltas are not proxied, but non-proxied
 # pandas types are currently not picklable. Thus, we define
 # custom reducer/unpicker functions for these types:

@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES.
 # All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
@@ -204,6 +204,12 @@ def _fsproxy_fast_to_slow(self):
             return fast_to_slow(self._fsproxy_wrapped)
         return self._fsproxy_wrapped
 
+    def as_gpu_object(self):
+        """Return the result of ``_fsproxy_slow_to_fast()`` for this proxy.
+
+        Public accessor for the fast-side object (documented in the FAQ as
+        the ``cudf`` object, or a ``cupy`` array for proxy arrays).
+        """
+        return self._fsproxy_slow_to_fast()
+
+    def as_cpu_object(self):
+        """Return the result of ``_fsproxy_fast_to_slow()`` for this proxy.
+
+        Public accessor for the slow-side object (documented in the FAQ as
+        the ``pandas`` object, or a ``numpy`` array for proxy arrays).
+        """
+        return self._fsproxy_fast_to_slow()
+
     @property  # type: ignore
     def _fsproxy_state(self) -> _State:
         return (
@@ -221,6 +227,8 @@ def _fsproxy_state(self) -> _State:
         "_fsproxy_slow_type": slow_type,
         "_fsproxy_slow_to_fast": _fsproxy_slow_to_fast,
         "_fsproxy_fast_to_slow": _fsproxy_fast_to_slow,
+        "as_gpu_object": as_gpu_object,
+        "as_cpu_object": as_cpu_object,
         "_fsproxy_state": _fsproxy_state,
     }
 

diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+# Copyright (c) 2020-2025, NVIDIA CORPORATION.
 from __future__ import annotations
 
 import decimal
@@ -451,3 +451,17 @@ def _datetime_timedelta_find_and_replace(
     except TypeError:
         result_col = original_column.copy(deep=True)
     return result_col  # type: ignore
+
+
+def _extract_from_proxy(proxy: Any, fast: bool = True) -> tuple[Any, bool]:
+    """
+    Extract the underlying object from a ``cudf.pandas`` proxy object.
+
+    Parameters
+    ----------
+    proxy
+        Any object. It is treated as a proxy when it exposes the
+        accessor being requested (``as_gpu_object`` or ``as_cpu_object``).
+    fast
+        If True (default), unwrap with ``as_gpu_object()``; otherwise
+        unwrap with ``as_cpu_object()``.
+
+    Returns
+    -------
+    tuple
+        ``(extracted, True)`` when the accessor exists, otherwise
+        ``(proxy, False)`` with the input returned unchanged.
+    """
+    accessor_name = "as_gpu_object" if fast else "as_cpu_object"
+    # Only the attribute lookup is guarded: an AttributeError raised
+    # *inside* the conversion is a real failure and must propagate rather
+    # than being mistaken for "not a proxy".
+    try:
+        accessor = getattr(proxy, accessor_name)
+    except AttributeError:
+        return (proxy, False)
+    return (accessor(), True)
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES.
 # All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
@@ -65,6 +65,14 @@
     get_calendar,
 )
 
+from cudf.pandas import (
+    is_cudf_pandas_dataframe,
+    is_cudf_pandas_index,
+    is_cudf_pandas_ndarray,
+    is_cudf_pandas_obj,
+    is_cudf_pandas_series,
+)
+
 # Accelerated pandas has the real pandas and cudf modules as attributes
 pd = xpd._fsproxy_slow
 cudf = xpd._fsproxy_fast
@@ -1885,3 +1893,40 @@ def test_dataframe_setitem():
     new_df = df + 1
     df[df.columns] = new_df
     tm.assert_equal(df, new_df)
+
+
+def test_dataframe_get_fast_slow_methods():
+    """A proxy DataFrame exposes its GPU and CPU objects via accessors."""
+    proxy_df = xpd.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]})
+    gpu_obj = proxy_df.as_gpu_object()
+    assert isinstance(gpu_obj, cudf.DataFrame)
+    cpu_obj = proxy_df.as_cpu_object()
+    assert isinstance(cpu_obj, pd.DataFrame)
+
+
+def test_is_cudf_pandas():
+    """Proxy-detection helpers accept proxies and reject raw objects."""
+    s = xpd.Series([1, 2, 3])
+    df = xpd.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]})
+    index = xpd.Index([1, 2, 3])
+    assert is_cudf_pandas_obj(s)
+    assert is_cudf_pandas_obj(df)
+    assert is_cudf_pandas_obj(index)
+    assert is_cudf_pandas_obj(index.values)
+
+    assert is_cudf_pandas_series(s)
+    assert is_cudf_pandas_dataframe(df)
+    assert is_cudf_pandas_index(index)
+    assert is_cudf_pandas_ndarray(index.values)
+
+    # The objects behind each proxy are not proxies themselves, so every
+    # helper must reject them.
+    for obj in [s, df, index, index.values]:
+        assert not is_cudf_pandas_obj(obj._fsproxy_slow)
+        assert not is_cudf_pandas_obj(obj._fsproxy_fast)
+
+        assert not is_cudf_pandas_series(obj._fsproxy_slow)
+        assert not is_cudf_pandas_series(obj._fsproxy_fast)
+
+        assert not is_cudf_pandas_dataframe(obj._fsproxy_slow)
+        assert not is_cudf_pandas_dataframe(obj._fsproxy_fast)
+
+        assert not is_cudf_pandas_index(obj._fsproxy_slow)
+        assert not is_cudf_pandas_index(obj._fsproxy_fast)
+
+        # Use the imported helper name; ``is_cudf_pandas_nd_array`` is not
+        # defined anywhere and raised NameError.
+        assert not is_cudf_pandas_ndarray(obj._fsproxy_slow)
+        assert not is_cudf_pandas_ndarray(obj._fsproxy_fast)