From 1ccf8c4d23a5a3cf8322a97f0af08caf37387147 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Enrique=20Gonz=C3=A1lez=20Paredes?= Date: Wed, 25 Sep 2024 11:34:41 +0200 Subject: [PATCH] fix: add ROCm compatibility to storages (#1655) Reimplement some cartesian storage utilities to better handle some corner cases with GPU storages, and to emulate the missing CUDA Array Interface in CuPy-ROCm. The previous hack to support CuPy-ROCm storages (`__hip_array_interface__`) has been removed and therefore it could also be removed from [GridTools-C++] (https://github.com/GridTools/gridtools/blob/master/include/gridtools/storage/adapter/python_sid_adapter.hpp) at some point. --- src/gt4py/_core/definitions.py | 4 ++ src/gt4py/storage/allocators.py | 12 ---- src/gt4py/storage/cartesian/utils.py | 94 ++++++++++++++++++++++++---- 3 files changed, 85 insertions(+), 25 deletions(-) diff --git a/src/gt4py/_core/definitions.py b/src/gt4py/_core/definitions.py index 01fbd51476..9d07b2eb79 100644 --- a/src/gt4py/_core/definitions.py +++ b/src/gt4py/_core/definitions.py @@ -379,6 +379,8 @@ class DeviceType(enum.IntEnum): METAL = 8 VPI = 9 ROCM = 10 + CUDA_MANAGED = 13 + ONE_API = 14 CPUDeviceTyping: TypeAlias = Literal[DeviceType.CPU] @@ -389,6 +391,8 @@ class DeviceType(enum.IntEnum): MetalDeviceTyping: TypeAlias = Literal[DeviceType.METAL] VPIDeviceTyping: TypeAlias = Literal[DeviceType.VPI] ROCMDeviceTyping: TypeAlias = Literal[DeviceType.ROCM] +CUDAManagedDeviceTyping: TypeAlias = Literal[DeviceType.CUDA_MANAGED] +OneApiDeviceTyping: TypeAlias = Literal[DeviceType.ONE_API] DeviceTypeT = TypeVar( diff --git a/src/gt4py/storage/allocators.py b/src/gt4py/storage/allocators.py index fa9005e86b..298b9c2e5a 100644 --- a/src/gt4py/storage/allocators.py +++ b/src/gt4py/storage/allocators.py @@ -259,18 +259,6 @@ def allocate( buffer, dtype, shape, padded_shape, item_size, strides, byte_offset ) - if self.device_type == core_defs.DeviceType.ROCM: - # until we can rely on dlpack - 
ndarray.__hip_array_interface__ = { # type: ignore[attr-defined] - "shape": ndarray.shape, # type: ignore[union-attr] - "typestr": ndarray.dtype.descr[0][1], # type: ignore[union-attr] - "descr": ndarray.dtype.descr, # type: ignore[union-attr] - "stream": 1, - "version": 3, - "strides": ndarray.strides, # type: ignore[union-attr, attr-defined] - "data": (ndarray.data.ptr, False), # type: ignore[union-attr, attr-defined] - } - return TensorBuffer( buffer=buffer, memory_address=memory_address, diff --git a/src/gt4py/storage/cartesian/utils.py b/src/gt4py/storage/cartesian/utils.py index 0d7fcab201..052238fe24 100644 --- a/src/gt4py/storage/cartesian/utils.py +++ b/src/gt4py/storage/cartesian/utils.py @@ -9,6 +9,7 @@ from __future__ import annotations import collections.abc +import functools import math import numbers from typing import Any, Final, Literal, Optional, Sequence, Tuple, Union, cast @@ -52,13 +53,38 @@ if CUPY_DEVICE == core_defs.DeviceType.CUDA: _GPUBufferAllocator = allocators.NDArrayBufferAllocator( - device_type=core_defs.DeviceType.CUDA, array_utils=allocators.cupy_array_utils + device_type=core_defs.DeviceType.CUDA, + array_utils=allocators.cupy_array_utils, ) - else: + elif CUPY_DEVICE == core_defs.DeviceType.ROCM: _GPUBufferAllocator = allocators.NDArrayBufferAllocator( - device_type=core_defs.DeviceType.ROCM, array_utils=allocators.cupy_array_utils + device_type=core_defs.DeviceType.ROCM, + array_utils=allocators.cupy_array_utils, ) + class CUDAArrayInterfaceNDArray(cp.ndarray): + def __new__(cls, input_array: "cp.ndarray") -> CUDAArrayInterfaceNDArray: + return ( + input_array + if isinstance(input_array, CUDAArrayInterfaceNDArray) + else cp.asarray(input_array).view(cls) + ) + + @property + def __cuda_array_interface__(self) -> dict: + return { + "shape": self.shape, + "typestr": self.dtype.descr[0][1], + "descr": self.dtype.descr, + "stream": 1, + "version": 3, + "strides": self.strides, + "data": (self.data.ptr, False), + } + + else: + raise 
ValueError("CuPy is available but no suitable device was found.") + def _idx_from_order(order): return list(np.argsort(order)) @@ -188,14 +214,36 @@ def asarray( # extract the buffer from a gt4py.next.Field # TODO(havogt): probably `Field` should provide the array interface methods when applicable array = array.ndarray - if device == "gpu" or (not device and hasattr(array, "__cuda_array_interface__")): - return cp.asarray(array) - if device == "cpu" or ( - not device and (hasattr(array, "__array_interface__") or hasattr(array, "__array__")) - ): - return np.asarray(array) - - if device: + + xp = None + if device == "cpu": + xp = np + elif device == "gpu": + assert cp is not None + xp = cp + elif not device: + if hasattr(array, "__dlpack_device__"): + kind, _ = array.__dlpack_device__() + if kind in [core_defs.DeviceType.CPU, core_defs.DeviceType.CPU_PINNED]: + xp = np + elif kind in [ + core_defs.DeviceType.CUDA, + core_defs.DeviceType.ROCM, + ]: + if cp is None: + raise RuntimeError("CuPy is required for GPU arrays") + xp = cp + elif hasattr(array, "__cuda_array_interface__"): + if cp is None: + raise RuntimeError("CuPy is required for GPU arrays") + xp = cp + elif hasattr(array, "__array_interface__") or hasattr(array, "__array__"): + xp = np + + if xp: + return xp.asarray(array) + + if device is not None: raise ValueError(f"Invalid device: {device!s}") raise TypeError(f"Cannot convert {type(array)} to ndarray") @@ -241,9 +289,10 @@ def allocate_gpu( alignment_bytes: int, aligned_index: Optional[Sequence[int]], ) -> Tuple["cp.ndarray", "cp.ndarray"]: + assert cp is not None assert _GPUBufferAllocator is not None, "GPU allocation library or device not found" device = core_defs.Device( # type: ignore[type-var] - core_defs.DeviceType.ROCM if gt_config.GT4PY_USE_HIP else core_defs.DeviceType.CUDA, 0 + (core_defs.DeviceType.ROCM if gt_config.GT4PY_USE_HIP else core_defs.DeviceType.CUDA), 0 ) buffer = _GPUBufferAllocator.allocate( shape, @@ -253,4 +302,23 @@ def 
allocate_gpu( byte_alignment=alignment_bytes, aligned_index=aligned_index, ) - return buffer.buffer, cast("cp.ndarray", buffer.ndarray) + + buffer_ndarray = cast("cp.ndarray", buffer.ndarray) + + return buffer.buffer, buffer_ndarray + + +if CUPY_DEVICE == core_defs.DeviceType.ROCM: + + @functools.wraps(allocate_gpu) + def allocate_gpu_rocm( + shape: Sequence[int], + layout_map: allocators.BufferLayoutMap, + dtype: DTypeLike, + alignment_bytes: int, + aligned_index: Optional[Sequence[int]], + ) -> Tuple["cp.ndarray", "cp.ndarray"]: + buffer, ndarray = allocate_gpu(shape, layout_map, dtype, alignment_bytes, aligned_index) + return buffer, CUDAArrayInterfaceNDArray(ndarray) + + allocate_gpu = allocate_gpu_rocm