fix: add ROCm compatibility to storages (#1655)
Reimplement some cartesian storage utilities to better handle corner cases with GPU storages and to emulate the CUDA Array Interface that is missing in CuPy-ROCm. The previous hack to support CuPy-ROCm storages (`__hip_array_interface__`) has been removed, so at some point it could also be removed from [GridTools-C++](https://github.com/GridTools/gridtools/blob/master/include/gridtools/storage/adapter/python_sid_adapter.hpp).
egparedes authored Sep 25, 2024
1 parent e10873d commit 1ccf8c4
Showing 3 changed files with 85 additions and 25 deletions.
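For context, a minimal, hedged sketch of what a generic consumer of the CUDA Array Interface protocol does with an array: it only needs the object to expose the `__cuda_array_interface__` dictionary, which is exactly what the emulation added in this commit provides on CuPy-ROCm. The helper name `describe_gpu_buffer` and its return layout are illustrative, not part of GT4Py or GridTools.

# Hedged sketch: a generic consumer of the CUDA Array Interface protocol.
# `arr` can be any object advertising `__cuda_array_interface__`.
def describe_gpu_buffer(arr):
    if not hasattr(arr, "__cuda_array_interface__"):
        raise TypeError("object does not expose __cuda_array_interface__")
    cai = arr.__cuda_array_interface__
    ptr, read_only = cai["data"]        # raw device pointer and read-only flag
    return {
        "ptr": ptr,
        "read_only": read_only,
        "shape": cai["shape"],
        "typestr": cai["typestr"],      # e.g. "<f8" for little-endian float64
        "strides": cai.get("strides"),  # None means C-contiguous
    }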
4 changes: 4 additions & 0 deletions src/gt4py/_core/definitions.py
@@ -379,6 +379,8 @@ class DeviceType(enum.IntEnum):
METAL = 8
VPI = 9
ROCM = 10
CUDA_MANAGED = 13
ONE_API = 14


CPUDeviceTyping: TypeAlias = Literal[DeviceType.CPU]
@@ -389,6 +391,8 @@ class DeviceType(enum.IntEnum):
MetalDeviceTyping: TypeAlias = Literal[DeviceType.METAL]
VPIDeviceTyping: TypeAlias = Literal[DeviceType.VPI]
ROCMDeviceTyping: TypeAlias = Literal[DeviceType.ROCM]
CUDAManagedDeviceTyping: TypeAlias = Literal[DeviceType.CUDA_MANAGED]
OneApiDeviceTyping: TypeAlias = Literal[DeviceType.ONE_API]


DeviceTypeT = TypeVar(
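The two new members follow the DLPack device-type numbering (`kDLCUDAManaged == 13`, `kDLOneAPI == 14`), which is what lets the rewritten `asarray` in `src/gt4py/storage/cartesian/utils.py` below compare the result of `__dlpack_device__()` directly against `DeviceType`. A minimal sketch, assuming NumPy >= 1.22 so that `ndarray` implements `__dlpack_device__`:

import numpy as np

from gt4py._core import definitions as core_defs

# __dlpack_device__() returns (device_type, device_id); the device_type codes
# are the DLPack ones, so they compare equal to the DeviceType IntEnum members.
kind, device_id = np.zeros(3).__dlpack_device__()
assert kind == core_defs.DeviceType.CPU and device_id == 0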
12 changes: 0 additions & 12 deletions src/gt4py/storage/allocators.py
Expand Up @@ -259,18 +259,6 @@ def allocate(
buffer, dtype, shape, padded_shape, item_size, strides, byte_offset
)

if self.device_type == core_defs.DeviceType.ROCM:
# until we can rely on dlpack
ndarray.__hip_array_interface__ = { # type: ignore[attr-defined]
"shape": ndarray.shape, # type: ignore[union-attr]
"typestr": ndarray.dtype.descr[0][1], # type: ignore[union-attr]
"descr": ndarray.dtype.descr, # type: ignore[union-attr]
"stream": 1,
"version": 3,
"strides": ndarray.strides, # type: ignore[union-attr, attr-defined]
"data": (ndarray.data.ptr, False), # type: ignore[union-attr, attr-defined]
}

return TensorBuffer(
buffer=buffer,
memory_address=memory_address,
94 changes: 81 additions & 13 deletions src/gt4py/storage/cartesian/utils.py
@@ -9,6 +9,7 @@
from __future__ import annotations

import collections.abc
import functools
import math
import numbers
from typing import Any, Final, Literal, Optional, Sequence, Tuple, Union, cast
@@ -52,13 +53,38 @@

if CUPY_DEVICE == core_defs.DeviceType.CUDA:
_GPUBufferAllocator = allocators.NDArrayBufferAllocator(
device_type=core_defs.DeviceType.CUDA, array_utils=allocators.cupy_array_utils
device_type=core_defs.DeviceType.CUDA,
array_utils=allocators.cupy_array_utils,
)
else:
elif CUPY_DEVICE == core_defs.DeviceType.ROCM:
_GPUBufferAllocator = allocators.NDArrayBufferAllocator(
device_type=core_defs.DeviceType.ROCM, array_utils=allocators.cupy_array_utils
device_type=core_defs.DeviceType.ROCM,
array_utils=allocators.cupy_array_utils,
)

class CUDAArrayInterfaceNDArray(cp.ndarray):
def __new__(cls, input_array: "cp.ndarray") -> CUDAArrayInterfaceNDArray:
return (
input_array
if isinstance(input_array, CUDAArrayInterfaceNDArray)
else cp.asarray(input_array).view(cls)
)

@property
def __cuda_array_interface__(self) -> dict:
return {
"shape": self.shape,
"typestr": self.dtype.descr[0][1],
"descr": self.dtype.descr,
"stream": 1,
"version": 3,
"strides": self.strides,
"data": (self.data.ptr, False),
}

else:
raise ValueError("CuPy is available but no suitable device was found.")


def _idx_from_order(order):
return list(np.argsort(order))
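A hedged usage sketch of the wrapper defined in this hunk; it is only runnable in a CuPy-ROCm environment, where `CUDAArrayInterfaceNDArray` is actually defined, and the array values are illustrative:

import cupy as cp

# Only defined when CuPy targets ROCm (see the elif branch above).
from gt4py.storage.cartesian.utils import CUDAArrayInterfaceNDArray

raw = cp.arange(12, dtype=cp.float64).reshape(3, 4)
wrapped = CUDAArrayInterfaceNDArray(raw)

cai = wrapped.__cuda_array_interface__
assert cai["shape"] == (3, 4)
assert cai["data"][0] == raw.data.ptr  # same device allocation, zero-copy view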
@@ -188,14 +214,36 @@ def asarray(
# extract the buffer from a gt4py.next.Field
# TODO(havogt): probably `Field` should provide the array interface methods when applicable
array = array.ndarray
if device == "gpu" or (not device and hasattr(array, "__cuda_array_interface__")):
return cp.asarray(array)
if device == "cpu" or (
not device and (hasattr(array, "__array_interface__") or hasattr(array, "__array__"))
):
return np.asarray(array)

if device:

xp = None
if device == "cpu":
xp = np
elif device == "gpu":
assert cp is not None
xp = cp
elif not device:
if hasattr(array, "__dlpack_device__"):
kind, _ = array.__dlpack_device__()
if kind in [core_defs.DeviceType.CPU, core_defs.DeviceType.CPU_PINNED]:
xp = np
elif kind in [
core_defs.DeviceType.CUDA,
core_defs.DeviceType.ROCM,
]:
if cp is None:
raise RuntimeError("CuPy is required for GPU arrays")
xp = cp
elif hasattr(array, "__cuda_array_interface__"):
if cp is None:
raise RuntimeError("CuPy is required for GPU arrays")
xp = cp
elif hasattr(array, "__array_interface__") or hasattr(array, "__array__"):
xp = np

if xp:
return xp.asarray(array)

if device is not None:
raise ValueError(f"Invalid device: {device!s}")

raise TypeError(f"Cannot convert {type(array)} to ndarray")
@@ -241,9 +289,10 @@ def allocate_gpu(
alignment_bytes: int,
aligned_index: Optional[Sequence[int]],
) -> Tuple["cp.ndarray", "cp.ndarray"]:
assert cp is not None
assert _GPUBufferAllocator is not None, "GPU allocation library or device not found"
device = core_defs.Device( # type: ignore[type-var]
core_defs.DeviceType.ROCM if gt_config.GT4PY_USE_HIP else core_defs.DeviceType.CUDA, 0
(core_defs.DeviceType.ROCM if gt_config.GT4PY_USE_HIP else core_defs.DeviceType.CUDA), 0
)
buffer = _GPUBufferAllocator.allocate(
shape,
@@ -253,4 +302,23 @@
byte_alignment=alignment_bytes,
aligned_index=aligned_index,
)
return buffer.buffer, cast("cp.ndarray", buffer.ndarray)

buffer_ndarray = cast("cp.ndarray", buffer.ndarray)

return buffer.buffer, buffer_ndarray


if CUPY_DEVICE == core_defs.DeviceType.ROCM:

@functools.wraps(allocate_gpu)
def allocate_gpu_rocm(
shape: Sequence[int],
layout_map: allocators.BufferLayoutMap,
dtype: DTypeLike,
alignment_bytes: int,
aligned_index: Optional[Sequence[int]],
) -> Tuple["cp.ndarray", "cp.ndarray"]:
buffer, ndarray = allocate_gpu(shape, layout_map, dtype, alignment_bytes, aligned_index)
return buffer, CUDAArrayInterfaceNDArray(ndarray)

allocate_gpu = allocate_gpu_rocm
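Finally, a hedged end-to-end sketch of the allocation path above; it only runs in a CuPy-ROCm environment with `GT4PY_USE_HIP` enabled, and the shape, layout, and alignment values are made up for illustration:

import numpy as np

from gt4py.storage.cartesian import utils as storage_utils

# On ROCm, allocate_gpu has been rebound to allocate_gpu_rocm, so the returned
# device ndarray is wrapped in CUDAArrayInterfaceNDArray and advertises the
# emulated CUDA Array Interface expected by downstream consumers.
buffer, ndarray = storage_utils.allocate_gpu(
    shape=(4, 4, 8),
    layout_map=(0, 1, 2),
    dtype=np.float64,
    alignment_bytes=128,
    aligned_index=(0, 0, 0),
)
assert hasattr(ndarray, "__cuda_array_interface__")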
