forked from Xilinx/mlir-aie
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvector_exp_alt.py
129 lines (103 loc) · 4.13 KB
/
vector_exp_alt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# vector_exp/vector_exp_alt.py -*- Python -*-
#
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates
import numpy as np
from ml_dtypes import bfloat16
from aie.dialects.aie import * # primary mlir-aie dialect definitions
from aie.extras.context import mlir_mod_ctx # mlir ctx wrapper
from aie.dialects.aiex import * # extended mlir-aie dialect definitions
from aie.helpers.dialects.ext.scf import (
_for as range_,
) # scf (structured control flow) dialect
from aie.helpers.util import np_ndarray_type_get_shape
# AI Engine structural design function
def my_eltwise_exp():
    """Emit the AIE structural design for the four-core bf16 `exp_bf16_1024` flow.

    The design is declared on ``AIEDevice.npu1_1col`` and consists of:

    * one shim tile (0, 0), one memory tile (0, 1) and ``n_cores`` compute
      tiles at (0, 2) .. (0, 2 + n_cores - 1);
    * an input object FIFO from shim to memory tile that is linked to one
      per-core FIFO, and the mirrored set of FIFOs for the output;
    * one core body per compute tile that calls the external function
      ``exp_bf16_1024`` (linked from ``kernels.a``) on one tile of ``n``
      elements at a time;
    * a runtime sequence that starts and awaits one shim DMA task for the
      input tensor and one for the output tensor.

    Takes no arguments and returns nothing; all ops are created in the MLIR
    context that is active when this function is called.
    """
    # Problem dimensions: N elements overall, processed n at a time per core.
    N = 65536
    n = 1024
    n_cores = 4
    N_div_n = N // n
    # Number of n-element tiles each individual core handles per run.
    tiles = N_div_n // n_cores
    buffer_depth = 2

    # Device declaration - aie2 device NPU (aka Ryzen AI)
    @device(AIEDevice.npu1_1col)
    def device_body():
        # Buffer type seen by a single compute tile (input and output alike).
        per_core_ty = np.ndarray[(n,), np.dtype[bfloat16]]
        # Buffer type in the memory tile, aggregating across all the cores.
        mem_tile_ty = np.ndarray[(n * n_cores,), np.dtype[bfloat16]]

        def split_offsets(joined_ty):
            """Return the start offset of each core's slice within `joined_ty`."""
            if n_cores <= 1:
                return []
            slice_len = np.prod(np_ndarray_type_get_shape(joined_ty)) // n_cores
            return [slice_len * k for k in range(n_cores)]

        # AIE core function declaration (implemented outside this file).
        exp_bf16_1024 = external_func(
            "exp_bf16_1024", inputs=[per_core_ty, per_core_ty]
        )

        # Tile declarations
        shim_tile = tile(0, 0)
        mem_tile = tile(0, 1)
        compute_tiles = [tile(0, 2 + k) for k in range(n_cores)]

        # Input A: shim -> memory tile, then one FIFO per core linked to it.
        inA = object_fifo("inA", shim_tile, mem_tile, buffer_depth, mem_tile_ty)
        inA_fifos = [
            object_fifo(f"memA{k}", mem_tile, dest, buffer_depth, per_core_ty)
            for k, dest in enumerate(compute_tiles)
        ]
        object_fifo_link(inA, inA_fifos, [], split_offsets(mem_tile_ty))

        # Output C: one FIFO per core into the memory tile, then memory -> shim.
        outC_fifos = [
            object_fifo(f"memC{k}", src, mem_tile, buffer_depth, per_core_ty)
            for k, src in enumerate(compute_tiles)
        ]
        outC = object_fifo("outC", mem_tile, shim_tile, buffer_depth, mem_tile_ty)
        object_fifo_link(outC_fifos, outC, split_offsets(mem_tile_ty), [])

        # Compute tile bodies: each core is paired with its own in/out FIFO.
        for compute_tile, fifo_in, fifo_out in zip(
            compute_tiles, inA_fifos, outC_fifos
        ):

            @core(compute_tile, "kernels.a")
            def core_body():
                # Outer loop bound is the largest 32-bit value; the inner loop
                # covers this core's share of tiles.
                for _ in range_(0xFFFFFFFF):
                    for _ in range_(tiles):
                        # The output buffer is acquired before the input one.
                        dst = fifo_out.acquire(ObjectFifoPort.Produce, 1)
                        src = fifo_in.acquire(ObjectFifoPort.Consume, 1)
                        exp_bf16_1024(src, dst)
                        fifo_in.release(ObjectFifoPort.Consume, 1)
                        fifo_out.release(ObjectFifoPort.Produce, 1)

        # To/from AIE-array data movement
        tensor_ty = np.ndarray[(N,), np.dtype[bfloat16]]

        @runtime_sequence(tensor_ty, tensor_ty)
        def sequence(A, C):
            in_task = shim_dma_single_bd_task(
                inA, A, sizes=[1, 1, 1, N], issue_token=True
            )
            out_task = shim_dma_single_bd_task(
                outC, C, sizes=[1, 1, 1, N], issue_token=True
            )
            # Both transfers are started together and then both are awaited.
            tasks = (in_task, out_task)
            dma_start_task(*tasks)
            dma_await_task(*tasks)
# Build the design inside a fresh MLIR module context, verify it, and print
# either the module (verification returned True) or the verification result.
with mlir_mod_ctx() as ctx:
    my_eltwise_exp()
    verified = ctx.module.operation.verify()
    print(ctx.module if verified == True else verified)