forked from Xilinx/mlir-aie
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvector_exp.py
75 lines (60 loc) · 2.34 KB
/
vector_exp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# vector_exp/vector_exp.py -*- Python -*-
#
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates
import numpy as np
from ml_dtypes import bfloat16
from aie.iron import Kernel, ObjectFifo, Program, Runtime, Worker
from aie.iron.placers import SequentialPlacer
from aie.iron.device import NPU1Col1
from aie.iron.controlflow import range_
def my_eltwise_exp():
    """Describe the multi-core vector design and return the resolved program.

    A vector of ``N`` bfloat16 elements is streamed through the ``inA``
    ObjectFifo in chunks of ``n * n_cores`` elements.  Each chunk is split,
    at offsets that are multiples of ``n``, into ``n_cores`` sub-fifos (one
    per worker).  Every worker hands one ``n``-element tile at a time to the
    externally defined kernel ``exp_bf16_1024`` from ``kernels.a``
    (presumably an element-wise exp, going by its name), and the per-worker
    outputs are joined with the same offsets into the ``outC`` ObjectFifo.

    Returns:
        The result of ``Program.resolve_program`` for an ``NPU1Col1`` device
        placed with ``SequentialPlacer``.
    """
    # Problem size and tiling parameters
    N = 65536  # total number of elements
    n = 1024  # elements handled per kernel call
    n_cores = 4
    chunk = n * n_cores  # elements per object in the inA/outC fifos
    tiles = N // chunk  # kernel calls performed by each worker

    # Tensor types for the full vector, one chunk, and one tile
    bf16 = np.dtype[bfloat16]
    tensor_ty = np.ndarray[(N,), bf16]
    memtile_ty = np.ndarray[(chunk,), bf16]
    tile_ty = np.ndarray[(n,), bf16]

    # Handle to the externally defined kernel function
    exp_bf16_1024 = Kernel("exp_bf16_1024", "kernels.a", [tile_ty, tile_ty])

    # Dataflow: one fifo in, one fifo out, each fanned across the workers
    A_fifo = ObjectFifo(memtile_ty, name="inA")
    C_fifo = ObjectFifo(memtile_ty, name="outC")
    tile_offsets = [core * n for core in range(n_cores)]
    per_core_types = [tile_ty] * n_cores
    # Fresh lists are passed to each call so split and join never share one
    a_fifos = A_fifo.cons().split(
        offsets=list(tile_offsets), obj_types=list(per_core_types)
    )
    c_fifos = C_fifo.prod().join(
        offsets=list(tile_offsets), obj_types=list(per_core_types)
    )

    # Task run by each worker: process `tiles` tiles, one per iteration
    def core_fn(of_in, of_out, kernel):
        for _ in range_(tiles):
            # Acquire the output slot first, then the input tile
            dst = of_out.acquire(1)
            src = of_in.acquire(1)
            kernel(src, dst)
            of_in.release(1)
            of_out.release(1)

    # One worker per core, each wired to its own pair of sub-fifos
    workers = [
        Worker(
            core_fn,
            fn_args=[a_fifos[core].cons(), c_fifos[core].prod(), exp_bf16_1024],
        )
        for core in range(n_cores)
    ]

    # Runtime operations to move data to/from the AIE-array
    rt = Runtime()
    with rt.sequence(tensor_ty, tensor_ty) as (host_in, host_out):
        rt.start(*workers)
        rt.fill(A_fifo.prod(), host_in)
        rt.drain(C_fifo.cons(), host_out, wait=True)

    # Place program components (assign them device resources) and resolve
    program = Program(NPU1Col1(), rt)
    return program.resolve_program(SequentialPlacer())
# Build the design and write the resolved program to stdout
resolved_program = my_eltwise_exp()
print(resolved_program)