From 1ccf8c4d23a5a3cf8322a97f0af08caf37387147 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Enrique=20Gonz=C3=A1lez=20Paredes?= Date: Wed, 25 Sep 2024 11:34:41 +0200 Subject: [PATCH] fix: add ROCm compatibility to storages (#1655) Reimplement some cartesian storage utilities to better handle some corner cases with GPU storages, and to emulate the missing CUDA Array Interface in CuPy-ROCm. The previous hack to support CuPy-ROCm storages (`__hip_array_interface__`) has been removed and therefore it could also be removed from [GridTools-C++] (https://github.com/GridTools/gridtools/blob/master/include/gridtools/storage/adapter/python_sid_adapter.hpp) at some point. --- src/gt4py/_core/definitions.py | 4 ++ src/gt4py/storage/allocators.py | 12 ---- src/gt4py/storage/cartesian/utils.py | 94 ++++++++++++++++++++++++---- 3 files changed, 85 insertions(+), 25 deletions(-) diff --git a/src/gt4py/_core/definitions.py b/src/gt4py/_core/definitions.py index 01fbd51476..9d07b2eb79 100644 --- a/src/gt4py/_core/definitions.py +++ b/src/gt4py/_core/definitions.py @@ -379,6 +379,8 @@ class DeviceType(enum.IntEnum): METAL = 8 VPI = 9 ROCM = 10 + CUDA_MANAGED = 13 + ONE_API = 14 CPUDeviceTyping: TypeAlias = Literal[DeviceType.CPU] @@ -389,6 +391,8 @@ class DeviceType(enum.IntEnum): MetalDeviceTyping: TypeAlias = Literal[DeviceType.METAL] VPIDeviceTyping: TypeAlias = Literal[DeviceType.VPI] ROCMDeviceTyping: TypeAlias = Literal[DeviceType.ROCM] +CUDAManagedDeviceTyping: TypeAlias = Literal[DeviceType.CUDA_MANAGED] +OneApiDeviceTyping: TypeAlias = Literal[DeviceType.ONE_API] DeviceTypeT = TypeVar( diff --git a/src/gt4py/storage/allocators.py b/src/gt4py/storage/allocators.py index fa9005e86b..298b9c2e5a 100644 --- a/src/gt4py/storage/allocators.py +++ b/src/gt4py/storage/allocators.py @@ -259,18 +259,6 @@ def allocate( buffer, dtype, shape, padded_shape, item_size, strides, byte_offset ) - if self.device_type == core_defs.DeviceType.ROCM: - # until we can rely on dlpack - 
ndarray.__hip_array_interface__ = { # type: ignore[attr-defined] - "shape": ndarray.shape, # type: ignore[union-attr] - "typestr": ndarray.dtype.descr[0][1], # type: ignore[union-attr] - "descr": ndarray.dtype.descr, # type: ignore[union-attr] - "stream": 1, - "version": 3, - "strides": ndarray.strides, # type: ignore[union-attr, attr-defined] - "data": (ndarray.data.ptr, False), # type: ignore[union-attr, attr-defined] - } - return TensorBuffer( buffer=buffer, memory_address=memory_address, diff --git a/src/gt4py/storage/cartesian/utils.py b/src/gt4py/storage/cartesian/utils.py index 0d7fcab201..052238fe24 100644 --- a/src/gt4py/storage/cartesian/utils.py +++ b/src/gt4py/storage/cartesian/utils.py @@ -9,6 +9,7 @@ from __future__ import annotations import collections.abc +import functools import math import numbers from typing import Any, Final, Literal, Optional, Sequence, Tuple, Union, cast @@ -52,13 +53,38 @@ if CUPY_DEVICE == core_defs.DeviceType.CUDA: _GPUBufferAllocator = allocators.NDArrayBufferAllocator( - device_type=core_defs.DeviceType.CUDA, array_utils=allocators.cupy_array_utils + device_type=core_defs.DeviceType.CUDA, + array_utils=allocators.cupy_array_utils, ) - else: + elif CUPY_DEVICE == core_defs.DeviceType.ROCM: _GPUBufferAllocator = allocators.NDArrayBufferAllocator( - device_type=core_defs.DeviceType.ROCM, array_utils=allocators.cupy_array_utils + device_type=core_defs.DeviceType.ROCM, + array_utils=allocators.cupy_array_utils, ) + class CUDAArrayInterfaceNDArray(cp.ndarray): + def __new__(cls, input_array: "cp.ndarray") -> CUDAArrayInterfaceNDArray: + return ( + input_array + if isinstance(input_array, CUDAArrayInterfaceNDArray) + else cp.asarray(input_array).view(cls) + ) + + @property + def __cuda_array_interface__(self) -> dict: + return { + "shape": self.shape, + "typestr": self.dtype.descr[0][1], + "descr": self.dtype.descr, + "stream": 1, + "version": 3, + "strides": self.strides, + "data": (self.data.ptr, False), + } + + else: + raise 
ValueError("CuPy is available but no suitable device was found.") + def _idx_from_order(order): return list(np.argsort(order)) @@ -188,14 +214,36 @@ def asarray( # extract the buffer from a gt4py.next.Field # TODO(havogt): probably `Field` should provide the array interface methods when applicable array = array.ndarray - if device == "gpu" or (not device and hasattr(array, "__cuda_array_interface__")): - return cp.asarray(array) - if device == "cpu" or ( - not device and (hasattr(array, "__array_interface__") or hasattr(array, "__array__")) - ): - return np.asarray(array) - - if device: + + xp = None + if device == "cpu": + xp = np + elif device == "gpu": + assert cp is not None + xp = cp + elif not device: + if hasattr(array, "__dlpack_device__"): + kind, _ = array.__dlpack_device__() + if kind in [core_defs.DeviceType.CPU, core_defs.DeviceType.CPU_PINNED]: + xp = np + elif kind in [ + core_defs.DeviceType.CUDA, + core_defs.DeviceType.ROCM, + ]: + if cp is None: + raise RuntimeError("CuPy is required for GPU arrays") + xp = cp + elif hasattr(array, "__cuda_array_interface__"): + if cp is None: + raise RuntimeError("CuPy is required for GPU arrays") + xp = cp + elif hasattr(array, "__array_interface__") or hasattr(array, "__array__"): + xp = np + + if xp: + return xp.asarray(array) + + if device is not None: raise ValueError(f"Invalid device: {device!s}") raise TypeError(f"Cannot convert {type(array)} to ndarray") @@ -241,9 +289,10 @@ def allocate_gpu( alignment_bytes: int, aligned_index: Optional[Sequence[int]], ) -> Tuple["cp.ndarray", "cp.ndarray"]: + assert cp is not None assert _GPUBufferAllocator is not None, "GPU allocation library or device not found" device = core_defs.Device( # type: ignore[type-var] - core_defs.DeviceType.ROCM if gt_config.GT4PY_USE_HIP else core_defs.DeviceType.CUDA, 0 + (core_defs.DeviceType.ROCM if gt_config.GT4PY_USE_HIP else core_defs.DeviceType.CUDA), 0 ) buffer = _GPUBufferAllocator.allocate( shape, @@ -253,4 +302,23 @@ def 
allocate_gpu( byte_alignment=alignment_bytes, aligned_index=aligned_index, ) - return buffer.buffer, cast("cp.ndarray", buffer.ndarray) + + buffer_ndarray = cast("cp.ndarray", buffer.ndarray) + + return buffer.buffer, buffer_ndarray + + +if CUPY_DEVICE == core_defs.DeviceType.ROCM: + + @functools.wraps(allocate_gpu) + def allocate_gpu_rocm( + shape: Sequence[int], + layout_map: allocators.BufferLayoutMap, + dtype: DTypeLike, + alignment_bytes: int, + aligned_index: Optional[Sequence[int]], + ) -> Tuple["cp.ndarray", "cp.ndarray"]: + buffer, ndarray = allocate_gpu(shape, layout_map, dtype, alignment_bytes, aligned_index) + return buffer, CUDAArrayInterfaceNDArray(ndarray) + + allocate_gpu = allocate_gpu_rocm