From 87a632b9d12a4ce745dff083792112f9373b2616 Mon Sep 17 00:00:00 2001
From: andrej
Date: Fri, 13 Sep 2024 15:58:46 -0600
Subject: [PATCH 1/7] add example with run-time-parametrized matmul

---
 .../basic/matrix_multiplication/rtp/Makefile  | 113 +++
 .../basic/matrix_multiplication/rtp/aie2.py   | 717 ++++++++++++++++++
 .../matrix_multiplication/rtp/await_rtp.cc    |  38 +
 .../basic/matrix_multiplication/rtp/kernel.cc | 192 +++++
 .../basic/matrix_multiplication/rtp/test.cpp  | 399 ++++++++++
 .../basic/matrix_multiplication/rtp/util.py   |  52 ++
 .../basic/matrix_multiplication/rtp/zero.cc   |  40 +
 7 files changed, 1551 insertions(+)
 create mode 100644 programming_examples/basic/matrix_multiplication/rtp/Makefile
 create mode 100644 programming_examples/basic/matrix_multiplication/rtp/aie2.py
 create mode 100644 programming_examples/basic/matrix_multiplication/rtp/await_rtp.cc
 create mode 100644 programming_examples/basic/matrix_multiplication/rtp/kernel.cc
 create mode 100644 programming_examples/basic/matrix_multiplication/rtp/test.cpp
 create mode 100644 programming_examples/basic/matrix_multiplication/rtp/util.py
 create mode 100644 programming_examples/basic/matrix_multiplication/rtp/zero.cc

diff --git a/programming_examples/basic/matrix_multiplication/rtp/Makefile b/programming_examples/basic/matrix_multiplication/rtp/Makefile
new file mode 100644
index 0000000000..d6cdf95c4b
--- /dev/null
+++ b/programming_examples/basic/matrix_multiplication/rtp/Makefile
@@ -0,0 +1,113 @@
+##===- Makefile -----------------------------------------------------------===##
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+##===----------------------------------------------------------------------===##
+
+# ---
+
+# The following environment variables, which point to the Xilinx runtime (XRT)
+# and Vitis tools, should already be set by an environment setup script.
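+# For example, something like the following would set them up (the script
+# names and paths are illustrative and depend on your installation):
+#
+#   source /opt/xilinx/xrt/setup.sh
+#   source /tools/Xilinx/Vitis/2023.2/settings64.sh
+#
+# The assignments below are only fallback defaults.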
+XILINX_XRT?=/opt/xilinx/xrt
+XILINX_VITIS?=/tools/Xilinx/Vitis/2023.2
+BOOST_ROOT?=/usr/include/boost
+Boost_INCLUDE_DIRS=${BOOST_ROOT}
+
+# ---
+
+XILINX_XRT_INCLUDE?=${XILINX_XRT}/include
+XILINX_XRT_LIB?=${XILINX_XRT}/lib
+
+CXX=g++-13
+
+CXXFLAGS+=-std=gnu++23 -ggdb -I${XILINX_XRT_INCLUDE} ${Boost_INCLUDE_DIRS:%=-I%}
+LDFLAGS+=-L${XILINX_XRT_LIB} ${Boost_LIBRARY_DIRS:%=-L%}
+LDLIBS+=-lxrt_coreutil -lboost_program_options -lboost_filesystem -luuid
+
+CHESSCCWRAP2_FLAGS = aie2 -I${XILINX_VITIS}/aietools/include
+
+mlir_target?=build/aie.mlir
+xclbin_target?=build/final.xclbin
+insts_target?=build/insts.txt
+kernel_target?=build/mm_${m}x${k}x${n}.o
+
+M=256
+K=3072
+N=50304
+m=64
+k=64
+n=32
+n_aie_cols=4
+dtype_in=bf16
+dtype_out=f32
+
+aieargs+=-m $m -k $k -n $n --n-aie-cols ${n_aie_cols} --dtype_in ${dtype_in} --dtype_out ${dtype_out}
+
+.PHONY: all
+all: test ${xclbin_target} \
+	build/insts_256x768x2304.txt \
+	build/insts_256x768x768.txt \
+	build/insts_256x768x3072.txt \
+	build/insts_256x3072x768.txt \
+	build/insts_768x256x3072.txt \
+	build/insts_3072x256x768.txt \
+	build/insts_768x256x768.txt \
+	build/insts_256x2304x768.txt \
+	build/insts_2304x256x768.txt
+
+${kernel_target}: kernel.cc await_rtp.cc zero.cc
+	mkdir -p ${@D}
+	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -DDIM_M=$m -DDIM_K=$k -DDIM_N=$n -DBIT_WIDTH=8 -c ${<:%=../%} -o ${@F}
+
+${xclbin_target}: build/aie_256x768x2304.mlir ${kernel_target}
+	mkdir -p ${@D}
+	cd ${@D} && aiecc.py -v --aie-generate-cdo --no-compile-host --xclbin-name=${@F} $(<:%=../%)
+
+build/aie_256x768x2304.mlir: aie2.py
+	mkdir -p ${@D}
+	python3 $< -M 256 -K 768 -N 2304 $(aieargs) > $@
+build/aie_256x768x768.mlir: aie2.py
+	mkdir -p ${@D}
+	python3 $< -M 256 -K 768 -N 768 $(aieargs) > $@
+build/aie_256x768x3072.mlir: aie2.py
+	mkdir -p ${@D}
+	python3 $< -M 256 -K 768 -N 3072 $(aieargs) > $@
+build/aie_256x3072x768.mlir: aie2.py
+	mkdir -p ${@D}
+	python3 $< -M 256 -K 3072 -N 768 $(aieargs) > $@
+build/aie_768x256x3072.mlir: aie2.py
+	mkdir -p ${@D}
+	python3 $< -M 768 -K 256 -N 3072 $(aieargs) > $@
+build/aie_3072x256x768.mlir: aie2.py
+	mkdir -p ${@D}
+	python3 $< -M 3072 -K 256 -N 768 $(aieargs) > $@
+build/aie_768x256x768.mlir: aie2.py
+	mkdir -p ${@D}
+	python3 $< -M 768 -K 256 -N 768 $(aieargs) > $@
+build/aie_256x2304x768.mlir: aie2.py
+	mkdir -p ${@D}
+	python3 $< -M 256 -K 2304 -N 768 $(aieargs) > $@
+build/aie_2304x256x768.mlir: aie2.py
+	mkdir -p ${@D}
+	python3 $< -M 2304 -K 256 -N 768 $(aieargs) > $@
+
+build/insts_%.txt: build/aie_%.mlir
+	cd ${@D} && aiecc.py -v --aie-only-generate-npu --npu-insts-name=${@:build/%=%} $(<:%=../%)
+
+xclbin_sign=${XILINX_XRT}/amdxdna/setup_xclbin_firmware.sh
+.PHONY: sign
+sign: ${xclbin_target}
+	${xclbin_sign} -dev Phoenix -xclbin $<
+
+.PHONY: clean
+clean:
+	@rm -rf build
+
+test: test.cpp
+	${CXX} ${CXXFLAGS} $^ -o $@ ${LDFLAGS} ${LDLIBS}
+
+.PHONY: run
+run: all
+	./test
diff --git a/programming_examples/basic/matrix_multiplication/rtp/aie2.py b/programming_examples/basic/matrix_multiplication/rtp/aie2.py
new file mode 100644
index 0000000000..c998a4c6ad
--- /dev/null
+++ b/programming_examples/basic/matrix_multiplication/rtp/aie2.py
@@ -0,0 +1,717 @@
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2023 AMD Inc.
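+#
+# This is a run-time-parametrized (RTP) variant of the whole-array matrix
+# multiplication design. The tile sizes (m, k, n) and array configuration are
+# baked into the xclbin, but the number of inner-loop iterations (K // k // 2)
+# and the number of output tiles per core are passed to the cores as run-time
+# parameters, so the same xclbin can compute different (M, K, N) problem sizes
+# by swapping only the insts.txt instruction stream (see the list of
+# build/insts_*.txt targets in the Makefile).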
+ +import sys +import argparse + +from aie.extras.context import mlir_mod_ctx + +from aie.dialects.aie import * +from aie.dialects.aiex import * +from aie.dialects.scf import * +import aie.dialects.index as index_dialect +import aie.dialects.arith as arith_dialect + +from util import * + +def get_memref_len_elems(memref): + out = 1 + for s in memref.shape: + out *= s + return out + + +def main(): + argparser = argparse.ArgumentParser( + prog="AIE Matrix Multiplication MLIR Design (Whole Array)", + description="Emits MLIR code for a matrix multiplication design of the given input size", + ) + argparser.add_argument("-M", type=int, default=512) + argparser.add_argument("-K", type=int, default=512) + argparser.add_argument("-N", type=int, default=512) + argparser.add_argument("-m", type=int, default=64) + argparser.add_argument("-k", type=int, default=64) + argparser.add_argument("-n", type=int, default=32) + argparser.add_argument("--n-aie-cols", type=int, choices=[1, 2, 4], default=4) + argparser.add_argument( + "--dtype_in", type=str, choices=["bf16", "i16"], default="i16" + ) + argparser.add_argument( + "--dtype_out", type=str, choices=["bf16", "i16", "f32", "i32"], default="i16" + ) + args = argparser.parse_args() + with mlir_mod_ctx() as ctx: + my_matmul( + args.M, + args.K, + args.N, + args.m, + args.k, + args.n, + args.n_aie_cols, + args.dtype_in, + args.dtype_out, + ) + # print(ctx.module.operation.verify()) + print(ctx.module) + + +def ceildiv(a, b): + return (a + b - 1) // b + + +def my_matmul(M, K, N, m, k, n, n_aie_cols, dtype_in_str, dtype_out_str): + + n_aie_rows = 4 + n_aie_cores = n_aie_rows * n_aie_cols + + dtype_in = None + if dtype_in_str == "bf16": + dtype_in = T.bf16 + elif dtype_in_str == "i16": + dtype_in = T.i16 + dtype_out = None + if dtype_out_str == "bf16": + dtype_out = T.bf16 + elif dtype_out_str == "i16": + dtype_out = T.i16 + elif dtype_out_str == "f32": + dtype_out = T.f32 + elif dtype_out_str == "i32": + dtype_out = T.i32 + + assert dtype_in == T.bf16 + assert dtype_out == T.f32 + + if dtype_in_str == "bf16": + r = 4 + s = 8 + t = 4 + elif dtype_in_str == "i16": + r = 4 + s = 4 + t = 4 + + # Input matrix A: + # Conceptually, we divide input A into (m * n_rows, k)-sized blocks. These + # blocks are _broadcast_ across AIE core columns, then _distributed_ across + # rows, s.t. each of the n_rows compute cores in a column receives a + # contiguous (m, k)-sized block of A. + assert ( + M % (m * n_aie_rows) == 0 + ), """A must be tileable into (m * n_aie_rows, k)-sized blocks""" + + # Both A and B are tiled in the K dimension into size k. + assert K % k == 0 + + # Input matrix B: + # Conceptually, we do the same as with A, but instead of broadcasting + # across columns we broadcast across rows and distribute across columns. + assert ( + N % (n * n_aie_cols) == 0 + ), """B must be tileable into (k, n * n_aie_cols)-sized blocks""" + + # r, s, t are the dimensions required by the microkernel MAC instructions. + assert m % r == 0 + assert k % s == 0 + assert n % t == 0 + + # If you get errors during CDO generation due to running out of program + # memory, it may be because too much code is generated due to ObjectFIFO + # loop unrollings. Reducing the depth to 1 here will work around that at + # a big performance cost. 
+    fifo_depth = 2
+
+    n_tiles_per_core = (M // m) * (N // n) // n_aie_cores
+
+    n_A_tiles_per_shim = n_aie_rows // n_aie_cols
+
+    dev = None
+    if n_aie_cols == 1:
+        dev = AIEDevice.npu1_1col
+    elif n_aie_cols == 2:
+        dev = AIEDevice.npu1_2col
+    elif n_aie_cols == 4:
+        dev = AIEDevice.npu1_4col
+
+    @device(dev)
+    def device_body():
+        A_l3_memref_ty = T.memref(M * K, dtype_in())
+        B_l3_memref_ty = T.memref(K * N, dtype_in())
+        C_l3_memref_ty = T.memref(M * N, dtype_out())
+        A_l2_memref_ty = T.memref(m * k * n_A_tiles_per_shim, dtype_in())
+        B_l2_memref_ty = T.memref(k * n, dtype_in())
+        C_l2_memref_ty = T.memref(m * n * n_aie_rows, dtype_out())
+        A_l1_memref_ty = T.memref(m, k, dtype_in())
+        B_l1_memref_ty = T.memref(k, n, dtype_in())
+        C_l1_memref_ty = T.memref(m, n, dtype_out())
+        rtp_ty = T.memref(3, T.i32())
+
+        # AIE Core Function declarations
+        zero_scalar = external_func(
+            f"zero_scalar_{dtype_out_str}", inputs=[C_l1_memref_ty]
+        )
+        zero = external_func(f"zero_{dtype_out_str}", inputs=[C_l1_memref_ty])
+        matmul_scalar = external_func(
+            f"matmul_scalar_{dtype_in_str}_{dtype_out_str}",
+            inputs=[A_l1_memref_ty, B_l1_memref_ty, C_l1_memref_ty],
+        )
+        matmul = external_func(
+            f"matmul_{dtype_in_str}_{dtype_out_str}",
+            inputs=[A_l1_memref_ty, B_l1_memref_ty, C_l1_memref_ty],
+        )
+        await_rtp = external_func(
+            f"await_rtp",
+            inputs=[rtp_ty]
+        )
+        get_volatile_rtp = external_func(
+            f"get_volatile_rtp",
+            inputs=[rtp_ty, T.i32()],
+            outputs=[T.i32()]
+        )
+
+        # Tile declarations as tile[row][col]
+        tiles = [
+            [tile(col, row) for col in range(0, n_aie_cols)] for row in range(0, 6)
+        ]
+        shim_tiles = tiles[0]
+        mem_tiles = tiles[1]
+        core_tiles = tiles[2:]
+
+        # Run-time parameters: one buffer of three i32s per core
+        rtp_bufs = [[None] * n_aie_cols for _ in range(4)]
+        for col in range(n_aie_cols):
+            for row in range(n_aie_rows):
+                # RTP index 0: "ready" signal
+                # RTP index 1: K // k // 2
+                # RTP index 2: n_tiles_per_core
+                rtp_bufs[row][col] = buffer(core_tiles[row][col], (3,), T.i32(), f"rtp_{row}_{col}")
+
+        # AIE-array data movement with object fifos
+        A_l3l2_fifos = [None] * n_aie_cols
+        A_l2l1_fifos = [None] * n_aie_rows
+
+        B_l3l2_fifos = [None] * n_aie_cols
+        B_l2l1_fifos = [None] * n_aie_cols
+
+        C_l1l2_fifos = [[None] * n_aie_cols for _ in range(n_aie_rows)]
+        C_l2l3_fifos = [None] * n_aie_cols
+
+        # Input A, L2 -> L1
+        for row in range(n_aie_rows):
+            mem_tile = mem_tiles[row // n_A_tiles_per_shim]
+            A_l2l1_fifos[row] = {
+                "prod" : {
+                    "endpoint": (mem_tile, WireBundle.DMA, 0),
+                    "ping_buf": buffer(mem_tile, A_l2_memref_ty.shape, dtype_in(), name=f"A_L3L2_{row}_cons_buff_0"),
+                    "pong_buf": buffer(mem_tile, A_l2_memref_ty.shape, dtype_in(), name=f"A_L3L2_{row}_cons_buff_1"),
+                    "put_lock": lock(mem_tile, init=2, sym_name=f"A_L3L2_{row}_cons_prod_lock", lock_id=0),
+                    "get_lock": lock(mem_tile, init=0, sym_name=f"A_L3L2_{row}_cons_cons_lock", lock_id=1)
+                },
+                "cons" : [
+                    {
+                        "endpoint": (core_tiles[row][col], WireBundle.DMA, 0),
+                        "ping_buf": buffer(core_tiles[row][col], A_l1_memref_ty.shape, dtype_in(), name=f"A_L2L1_{row}_{col}_cons_buff_0"),
+                        "pong_buf": buffer(core_tiles[row][col], A_l1_memref_ty.shape, dtype_in(), name=f"A_L2L1_{row}_{col}_cons_buff_1"),
+                        "put_lock": lock(core_tiles[row][col], init=2, sym_name=f"A_L2L1_{row}_{col}_cons_prod_lock", lock_id=0),
+                        "get_lock": lock(core_tiles[row][col], init=0, sym_name=f"A_L2L1_{row}_{col}_cons_cons_lock", lock_id=1)
+                    }
+                    for col in range(n_aie_cols)
+                ] # broadcast along one row
+            }
+            for col in range(n_aie_cols):
+                src_tile, src_bundle, src_channel = 
A_l2l1_fifos[row]["prod"]["endpoint"] + dst_tile, dst_bundle, dst_channel = A_l2l1_fifos[row]["cons"][col]["endpoint"] + flow(src_tile, src_bundle, src_channel, + dst_tile, dst_bundle, dst_channel) + + # Input A, L3 -> L2 + for col in range(n_aie_cols): + shim_tile = shim_tiles[col] + mem_tile = mem_tiles[col] + A_l3l2_fifos[col] = { + "prod" : { + "endpoint": (shim_tile, WireBundle.DMA, 0), + "shim_memref": memref.global_(sym_name=f"A_L3L2_{col}", sym_visibility="public", type_=A_l3_memref_ty), + "shim_dma_alloc": ShimDMAAllocationOp(f"A_L3L2_{col}", DMAChannelDir.MM2S, 0, col=col) + }, + "cons" : { + "endpoint": (mem_tile, WireBundle.DMA, 0), + } + } + src_tile, src_bundle, src_channel = A_l3l2_fifos[col]["prod"]["endpoint"] + dst_tile, dst_bundle, dst_channel = A_l3l2_fifos[col]["cons"]["endpoint"] + flow(src_tile, src_bundle, src_channel, + dst_tile, dst_bundle, dst_channel) + + # Input B, L2 -> L1 + for col in range(n_aie_cols): + mem_tile = mem_tiles[col] + B_l2l1_fifos[col] = { + "prod" : { + "endpoint": (mem_tile, WireBundle.DMA, 1), + "ping_buf": buffer(mem_tile, B_l2_memref_ty.shape, dtype_in(), name=f"B_L3L2_{col}_cons_buff_0"), + "pong_buf": buffer(mem_tile, B_l2_memref_ty.shape, dtype_in(), name=f"B_L3L2_{col}_cons_buff_1"), + "put_lock": lock(mem_tile, init=2, sym_name=f"B_L3L2_{col}_cons_prod_lock", lock_id=2), + "get_lock": lock(mem_tile, init=0, sym_name=f"B_L3L2_{col}_cons_cons_lock", lock_id=3) + }, + "cons" : [ + { + "endpoint": (core_tiles[row][col], WireBundle.DMA, 1), + "ping_buf": buffer(core_tiles[row][col], B_l1_memref_ty.shape, dtype_in(), name=f"B_L2L1_{col}_{row}_cons_buff_0"), + "pong_buf": buffer(core_tiles[row][col], B_l1_memref_ty.shape, dtype_in(), name=f"B_L2L1_{col}_{row}_cons_buff_1"), + "put_lock": lock(core_tiles[row][col], init=2, sym_name=f"B_L2L1_{col}_{row}_cons_prod_lock", lock_id=2), + "get_lock": lock(core_tiles[row][col], init=0, sym_name=f"B_L2L1_{col}_{row}_cons_cons_lock", lock_id=3) + } + for row in range(n_aie_rows) + ] # broadcast along one column + } + for row in range(n_aie_rows): + src_tile, src_bundle, src_channel = B_l2l1_fifos[col]["prod"]["endpoint"] + dst_tile, dst_bundle, dst_channel = B_l2l1_fifos[col]["cons"][row]["endpoint"] + flow(src_tile, src_bundle, src_channel, + dst_tile, dst_bundle, dst_channel) + + # Input B, L3 -> L2 + for col in range(n_aie_cols): + mem_tile = mem_tiles[col] + shim_tile = shim_tiles[col] + B_l3l2_fifos[col] = { + "prod" : { + "endpoint": (shim_tile, WireBundle.DMA, 1), + "shim_memref": memref.global_(sym_name=f"B_L3L2_{col}", sym_visibility="public", type_=B_l3_memref_ty), + "shim_dma_alloc": ShimDMAAllocationOp(f"B_L3L2_{col}", DMAChannelDir.MM2S, 1, col=col) + }, + "cons" : { + "endpoint": (mem_tile, WireBundle.DMA, 1) + } + } + src_tile, src_bundle, src_channel = B_l3l2_fifos[col]["prod"]["endpoint"] + dst_tile, dst_bundle, dst_channel = B_l3l2_fifos[col]["cons"]["endpoint"] + flow(src_tile, src_bundle, src_channel, + dst_tile, dst_bundle, dst_channel) + + # Output C, L1 -> L2 + for col in range(n_aie_cols): + for row in range(n_aie_rows): + C_l1l2_fifos[row][col] = { + "prod" : { + "endpoint": (core_tiles[row][col], WireBundle.DMA, 0), + "ping_buf": buffer(core_tiles[row][col], C_l1_memref_ty.shape, dtype_out(), name=f"C_L1L2_{col}_{row}_buff_0"), + "pong_buf": buffer(core_tiles[row][col], C_l1_memref_ty.shape, dtype_out(), name=f"C_L1L2_{col}_{row}_buff_1"), + "put_lock": lock(core_tiles[row][col], init=2, sym_name=f"C_L1L2_{col}_{row}_prod_lock", lock_id=4), + "get_lock": 
lock(core_tiles[row][col], init=0, sym_name=f"C_L1L2_{col}_{row}_cons_lock", lock_id=5) + }, + "cons" : { + "endpoint": (mem_tiles[col], WireBundle.DMA, + row + 2 # S2MM channels 0, 1 on memtile are used for A, B coming in from shim + ), + } + } + src_tile, src_bundle, src_channel = C_l1l2_fifos[row][col]["prod"]["endpoint"] + dst_tile, dst_bundle, dst_channel = C_l1l2_fifos[row][col]["cons"]["endpoint"] + flow(src_tile, src_bundle, src_channel, + dst_tile, dst_bundle, dst_channel) + + # Output C, L2 -> L3 + for col in range(n_aie_cols): + C_l2l3_fifos[col] = { + "prod" : { + "endpoint": (mem_tiles[col], WireBundle.DMA, 2), + "ping_buf": buffer(mem_tiles[col], C_l2_memref_ty.shape, dtype_out(), name=f"C_L2L3_{col}_buff_0"), + "pong_buf": buffer(mem_tiles[col], C_l2_memref_ty.shape, dtype_out(), name=f"C_L2L3_{col}_buff_1"), + "put_lock": lock(mem_tiles[col], init=4*2, sym_name=f"C_L2L3_{col}_prod_lock", lock_id=4), + "get_lock": lock(mem_tiles[col], init=0, sym_name=f"C_L2L3_{col}_cons_lock", lock_id=5) + }, + "cons" : { + "endpoint": (shim_tiles[col], WireBundle.DMA, 0), + "shim_memref": memref.global_(sym_name=f"C_L2L3_{col}", sym_visibility="public", type_=C_l3_memref_ty), + "shim_dma_alloc": ShimDMAAllocationOp(f"C_L2L3_{col}", DMAChannelDir.S2MM, 0, col=col) + } + } + src_tile, src_bundle, src_channel = C_l2l3_fifos[col]["prod"]["endpoint"] + dst_tile, dst_bundle, dst_channel = C_l2l3_fifos[col]["cons"]["endpoint"] + flow(src_tile, src_bundle, src_channel, + dst_tile, dst_bundle, dst_channel) + + # Set up the data movement + + # Mem tiles + for col in range(n_aie_cols): + @memtile_dma(mem_tiles[col]) + def memtile_body(block): + + # A input + A_l3l2_fifo = A_l3l2_fifos[col]["cons"] + A_l2l1_fifo = A_l2l1_fifos[col]["prod"] + _, _, a_in_channel = A_l3l2_fifo["endpoint"] + _ = block["a_in_ping"], block["a_in_pong"] + dma_start(DMAChannelDir.S2MM, a_in_channel, dest=block["a_in_ping"], chain=block["a_out"]) + for pp in ["ping", "pong"]: + with block[f"a_in_{pp}"]: + use_lock(A_l2l1_fifo["put_lock"], LockAction.AcquireGreaterEqual, value=1) + dma_bd(A_l2l1_fifo[f"{pp}_buf"], offset=0, len=get_memref_len_elems(A_l2_memref_ty)) + use_lock(A_l2l1_fifo["get_lock"], LockAction.Release, value=1) + next_bd(block[f"a_in_{'pong' if pp == 'ping' else 'ping'}"]) + + # A output + with block["a_out"]: + A_l2l1_fifo = A_l2l1_fifos[col]["prod"] + _, _, a_out_channel = A_l2l1_fifo["endpoint"] + _ = block["a_out_ping"], block["a_out_pong"] + dma_start(DMAChannelDir.MM2S, a_out_channel, dest=block["a_out_ping"], chain=block["b_in"]) + for pp in ["ping", "pong"]: + with block[f"a_out_{pp}"]: + use_lock(A_l2l1_fifo["get_lock"], LockAction.AcquireGreaterEqual, value=1) + assert get_memref_len_elems(A_l1_memref_ty) == get_memref_len_elems(A_l2_memref_ty) + dma_bd(A_l2l1_fifo[f"{pp}_buf"], offset=0, len=get_memref_len_elems(A_l1_memref_ty), + dimensions=[ + (m // r, r * k), + (k // s, s), + (r, k), + (s, 1), + ]) + use_lock(A_l2l1_fifo["put_lock"], LockAction.Release, value=1) + next_bd(block[f"a_out_{'pong' if pp == 'ping' else 'ping'}"]) + + # B input + with block["b_in"]: + B_l3l2_fifo = B_l3l2_fifos[col]["cons"] + B_l2l1_fifo = B_l2l1_fifos[col]["prod"] + _, _, b_in_channel = B_l3l2_fifo["endpoint"] + _ = block["b_in_ping"], block["b_in_pong"] + dma_start(DMAChannelDir.S2MM, b_in_channel, dest=block["b_in_ping"], chain=block["b_out"]) + for pp in ["ping", "pong"]: + with block[f"b_in_{pp}"]: + use_lock(B_l2l1_fifo["put_lock"], LockAction.AcquireGreaterEqual, value=1) + dma_bd(B_l2l1_fifo[f"{pp}_buf"], 
offset=0, len=get_memref_len_elems(B_l2_memref_ty)) + use_lock(B_l2l1_fifo["get_lock"], LockAction.Release, value=1) + next_bd(block[f"b_in_{'pong' if pp == 'ping' else 'ping'}"]) + + # B output + with block["b_out"]: + B_l2l1_fifo = B_l2l1_fifos[col]["prod"] + _, _, b_out_channel = B_l2l1_fifo["endpoint"] + _ = block["b_out_ping"], block["b_out_pong"] + dma_start(DMAChannelDir.MM2S, b_out_channel, dest=block["b_out_ping"], chain=block["c_in_0"]) + for pp in ["ping", "pong"]: + with block[f"b_out_{pp}"]: + use_lock(B_l2l1_fifo["get_lock"], LockAction.AcquireGreaterEqual, value=1) + assert get_memref_len_elems(B_l2_memref_ty) == get_memref_len_elems(B_l1_memref_ty) + dma_bd(B_l2l1_fifo[f"{pp}_buf"], offset=0, len=get_memref_len_elems(B_l1_memref_ty), + dimensions=[ + (n // t, t * k), + (k // s, s), + (t, k), + (s, 1), + ]) + use_lock(B_l2l1_fifo["put_lock"], LockAction.Release, value=1) + next_bd(block[f"b_out_{'pong' if pp == 'ping' else 'ping'}"]) + + # C input + for row in range(n_aie_rows): + C_l2l3_fifo = C_l2l3_fifos[col]["prod"] + with block[f"c_in_{row}"]: + C_l1l2_fifo = C_l1l2_fifos[row][col]["cons"] + _, _, c_in_channel = C_l1l2_fifo["endpoint"] + _ = block[f"c_in_{row}_ping"], block[f"c_in_{row}_pong"] + dma_start(DMAChannelDir.S2MM, c_in_channel, dest=block[f"c_in_{row}_ping"], + chain=block[f"c_in_{row+1}" if row+1 < n_aie_rows else "c_out"]) + for pp in ["ping", "pong"]: + with block[f"c_in_{row}_{pp}"]: + use_lock(C_l2l3_fifo["put_lock"], LockAction.AcquireGreaterEqual, value=1) + dma_bd(C_l2l3_fifo[f"{pp}_buf"], + offset=row * get_memref_len_elems(C_l1_memref_ty), + len=get_memref_len_elems(C_l1_memref_ty)) + use_lock(C_l2l3_fifo["get_lock"], LockAction.Release, value=1) + next_bd(block[f"c_in_{row}_{'pong' if pp == 'ping' else 'ping'}"]) + + # C output + with block["c_out"]: + _, _, c_out_channel = C_l2l3_fifo["endpoint"] + _ = block["c_out_ping"], block["c_out_pong"] + dma_start(DMAChannelDir.MM2S, c_out_channel, dest=block["c_out_ping"], chain=block["end"]) + for pp in ["ping", "pong"]: + with block[f"c_out_{pp}"]: + use_lock(C_l2l3_fifo["get_lock"], LockAction.AcquireGreaterEqual, value=4) + assert get_memref_len_elems(C_l2_memref_ty) == 4*get_memref_len_elems(C_l1_memref_ty) + dma_bd(C_l2l3_fifo[f"{pp}_buf"], offset=0, len=get_memref_len_elems(C_l2_memref_ty), + dimensions=[ + (m // r, r * n), + (r, t), + (n // t, r * t), + (t, 1), + ]) + use_lock(C_l2l3_fifo["put_lock"], LockAction.Release, value=4) + next_bd(block[f"c_out_{'pong' if pp == 'ping' else 'ping'}"]) + + with block["end"]: + EndOp() + + # core DMAs + for row in range(n_aie_rows): + for col in range(n_aie_cols): + @mem(core_tiles[row][col]) + def core_mem_body(block): + + # A input + A_l2l1_fifo = A_l2l1_fifos[row]["cons"][col] + _, _, a_in_channel = A_l2l1_fifo["endpoint"] + _ = block["a_in_ping"], block["a_in_pong"] + dma_start(DMAChannelDir.S2MM, a_in_channel, dest=block["a_in_ping"], chain=block["b_in"]) + for pp in ["ping", "pong"]: + with block[f"a_in_{pp}"]: + use_lock(A_l2l1_fifo["put_lock"], LockAction.AcquireGreaterEqual, value=1) + dma_bd(A_l2l1_fifo[f"{pp}_buf"], offset=0, len=get_memref_len_elems(A_l1_memref_ty)) + use_lock(A_l2l1_fifo["get_lock"], LockAction.Release, value=1) + next_bd(block[f"a_in_{'pong' if pp == 'ping' else 'ping'}"]) + + # B input + with block["b_in"]: + B_l2l1_fifo = B_l2l1_fifos[col]["cons"][row] + _, _, b_in_channel = B_l2l1_fifo["endpoint"] + _ = block["b_in_ping"], block["b_in_pong"] + dma_start(DMAChannelDir.S2MM, b_in_channel, dest=block["b_in_ping"], 
chain=block["c_out"])
+                    for pp in ["ping", "pong"]:
+                        with block[f"b_in_{pp}"]:
+                            use_lock(B_l2l1_fifo["put_lock"], LockAction.AcquireGreaterEqual, value=1)
+                            dma_bd(B_l2l1_fifo[f"{pp}_buf"], offset=0, len=get_memref_len_elems(B_l1_memref_ty))
+                            use_lock(B_l2l1_fifo["get_lock"], LockAction.Release, value=1)
+                            next_bd(block[f"b_in_{'pong' if pp == 'ping' else 'ping'}"])
+
+                    # C output
+                    with block["c_out"]:
+                        C_l1l2_fifo = C_l1l2_fifos[row][col]["prod"]
+                        _, _, c_out_channel = C_l1l2_fifo["endpoint"]
+                        _ = block["c_out_ping"], block["c_out_pong"]
+                        dma_start(DMAChannelDir.MM2S, c_out_channel, dest=block["c_out_ping"], chain=block["end"])
+                        for pp in ["ping", "pong"]:
+                            with block[f"c_out_{pp}"]:
+                                use_lock(C_l1l2_fifo["get_lock"], LockAction.AcquireGreaterEqual, value=1)
+                                dma_bd(C_l1l2_fifo[f"{pp}_buf"], offset=0, len=get_memref_len_elems(C_l1_memref_ty))
+                                use_lock(C_l1l2_fifo["put_lock"], LockAction.Release, value=1)
+                                next_bd(block[f"c_out_{'pong' if pp == 'ping' else 'ping'}"])
+
+                    with block["end"]:
+                        EndOp()
+
+        # Set up compute tiles
+        for row in range(n_aie_rows):
+            for col in range(n_aie_cols):
+
+                @core(core_tiles[row][col], f"mm_{m}x{k}x{n}.o")
+                def core_body():
+                    C_fifo = C_l1l2_fifos[row][col]["prod"]
+                    A_fifo = A_l2l1_fifos[row]["cons"][col]
+                    B_fifo = B_l2l1_fifos[col]["cons"][row]
+
+                    c_0 = index_dialect.constant(0)
+                    c_1 = index_dialect.constant(1)
+                    c_2 = index_dialect.constant(2)
+                    c_maxint = index_dialect.constant(0xFFFFFFFF)
+
+                    run_loop = ForOp(lower_bound=c_0,
+                                     upper_bound=c_maxint,
+                                     step=c_1,
+                                     iter_args=[c_0])
+                    with InsertionPoint(run_loop.body):
+                        c_pp_outer = run_loop.inner_iter_args[0]
+
+                        # Wait for "ready" signal through RTP, then read the RTPs.
+                        call(await_rtp, [rtp_bufs[row][col]])
+                        rtp_K_div_k_div_2_i32 = call(get_volatile_rtp, [rtp_bufs[row][col], 1])
+                        rtp_K_div_k_div_2 = index_dialect.castu(T.index(), rtp_K_div_k_div_2_i32)
+                        rtp_n_tiles_per_core_i32 = call(get_volatile_rtp, [rtp_bufs[row][col], 2])
+                        rtp_n_tiles_per_core = index_dialect.castu(T.index(), rtp_n_tiles_per_core_i32)
+
+                        tile_loop = ForOp(lower_bound=c_0,
+                                          upper_bound=rtp_n_tiles_per_core,
+                                          step=c_1,
+                                          iter_args=[c_pp_outer])
+                        with InsertionPoint(tile_loop.body):
+                            c_pp_inner = tile_loop.inner_iter_args[0]  # this variable flips between 0 and 1 each iteration
+                            c_pp_cond = index_dialect.cmp('eq', c_pp_inner, c_0)
+                            ifop = IfOp(c_pp_cond, [C_l1_memref_ty], hasElse=True)
+                            with InsertionPoint(ifop.thenRegion.blocks[0]):
+                                yield_([C_fifo["ping_buf"]])
+                            with InsertionPoint(ifop.elseRegion.blocks[0]):
+                                yield_([C_fifo["pong_buf"]])
+
+                            use_lock(C_fifo["put_lock"], LockAction.AcquireGreaterEqual, value=1)
+                            elem_out = ifop.results_[0]
+                            call(zero, [elem_out])
+                            for j in for_(rtp_K_div_k_div_2):
+                                for ab_pp in ["ping", "pong"]:
+                                    use_lock(A_fifo["get_lock"], LockAction.AcquireGreaterEqual, value=1)
+                                    use_lock(B_fifo["get_lock"], LockAction.AcquireGreaterEqual, value=1)
+                                    elem_in_a = A_fifo[f"{ab_pp}_buf"]
+                                    elem_in_b = B_fifo[f"{ab_pp}_buf"]
+                                    call(matmul, [elem_in_a, elem_in_b, elem_out])
+                                    use_lock(A_fifo["put_lock"], LockAction.Release, value=1)
+                                    use_lock(B_fifo["put_lock"], LockAction.Release, value=1)
+                                yield_([])
+                            use_lock(C_fifo["get_lock"], LockAction.Release, value=1)
+
+                            c_pp_inner_plus = index_dialect.add(c_pp_inner, c_1)
+                            c_pp_inner_next = index_dialect.rems(c_pp_inner_plus, c_2)
+                            yield_([c_pp_inner_next])
+
+                        yield_([tile_loop.results_[0]])
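+
+        # Informally, each core executes the following control flow (a sketch
+        # for orientation, not part of the generated IR):
+        #
+        #   while True:
+        #       await_rtp(rtp)              # block until host signals "ready"
+        #       K_div_k_div_2 = rtp[1]
+        #       n_tiles = rtp[2]
+        #       for _ in range(n_tiles):
+        #           C = next C ping/pong buffer
+        #           zero(C)
+        #           for _ in range(K_div_k_div_2):
+        #               matmul(A_ping, B_ping, C)
+        #               matmul(A_pong, B_pong, C)
+        #           release C
+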
+        # To/from AIE-array data movement
+        @runtime_sequence(
+            A_l3_memref_ty,
+            B_l3_memref_ty,
+            C_l3_memref_ty
+        )
+        def sequence(A, B, C):
+            # Write the number of inner-loop iterations for the cores to use as
+            # a run-time parameter. This allows processing different problem
+            # sizes by only swapping the insts.txt.
+            assert (K//k)%2 == 0
+            rtp_K_div_k_div_2 = K//k//2
+            for row in range(n_aie_rows):
+                for col in range(n_aie_cols):
+                    sym_ref = FlatSymbolRefAttr.get(rtp_bufs[row][col].get_name()[1:])
+                    npu_rtp_write(sym_ref, 1, rtp_K_div_k_div_2)
+                    npu_rtp_write(sym_ref, 2, n_tiles_per_core)
+                    npu_rtp_write(sym_ref, 0, 1)  # indicate "ready"
+
+            # We are limited in the number of BDs. After synchronizing, we can reuse BDs.
+            # We only transfer 4 rows of tiles at once before starting a new transfer block.
+            tb_max_n_rows = (
+                4  # tb = transfer block; block of transfers before sync call
+            )
+            for tb in range(ceildiv(M // m // n_aie_rows, tb_max_n_rows)):
+                for pingpong in [0, 1]:
+                    row_base = tb * tb_max_n_rows + pingpong * tb_max_n_rows // 2
+                    bd_id_base = 8 * pingpong
+                    tb_n_rows = min(
+                        [tb_max_n_rows // 2, M // m // n_aie_rows - row_base]
+                    )
+                    if tb_n_rows <= 0:
+                        # for small input sizes, we may not even need a "pong" iteration
+                        break
+                    for col in range(n_aie_cols):
+
+                        # C Output Transfer:
+                        # The smallest transfer unit is a (m*n_aie_rows)-x-(n)-sized sub-tile of the matrix.
+                        # Transfer one such tile for every (n_aie_cols)-th column, evenly spaced,
+                        # then repeat that (tb_n_rows) times for the next contiguous blocks of rows.
+                        # Each shim will start at a different column offset, transferring interleaved
+                        # columns. For example, shim 0 may transfer the blocks marked 0 below, and shim 1
+                        # may transfer the blocks marked 1.
+                        #
+                        #             N
+                        #      ----------------
+                        #     |0011    0011    |
+                        #     |0011    0011    |
+                        #     |0011    0011    |
+                        #   M |0011    0011    |
+                        #     |                |
+                        #     |                |
+                        #     |                |
+                        #     |                |
+                        #      ----------------
+                        C_row_offset = row_base * m * n_aie_rows * N
+                        C_col_offset = col * n
+                        C_offset = C_col_offset + C_row_offset
+                        npu_dma_memcpy_nd(
+                            metadata=C_l2l3_fifos[col]["cons"]["shim_dma_alloc"].sym_name.value,
+                            bd_id=bd_id_base,
+                            mem=C,
+                            offsets=[0, 0, 0, C_offset],
+                            sizes=[tb_n_rows, N // n // n_aie_cols, m * n_aie_rows, n],
+                            strides=[m * n_aie_rows * N, n * n_aie_cols, N, 1],
+                        )
+
+                        for tile_row in range(tb_n_rows):
+
+                            # A input transfer:
+                            #
+                            # The smallest transfer unit is a (m*n_A_tiles_per_shim)-sized sub-tile of the input matrix.
+                            # Transfer one such tile for every column, contiguously.
+                            # Repeat this transfer with identical tiles a total of (N//n//n_aie_cols) times.
+                            # Each shim transfers the tiles for separate rows. For example, shim 0 may transfer the
+                            # tiles marked 0 below, and shim 1 may transfer the tiles marked 1.
+                            #
+                            #             K
+                            #      ----------------
+                            #     |0000000000000000|    (repeated N//n//n_aie_cols times)
+                            #     |0000000000000000|
+                            #     |1111111111111111|
+                            #   M |1111111111111111|
+                            #     |                |
+                            #     |                |
+                            #     |                |
+                            #     |                |
+                            #      ----------------
+                            A_block_offset = (
+                                (row_base + tile_row) * n_aie_rows * m * K
+                            )  # base address for this transfer block for all BDs
+                            A_row_offset = (
+                                col * n_A_tiles_per_shim * m * K
+                            )  # base address for the shim in this column
+                            A_offset = A_block_offset + A_row_offset
+                            npu_dma_memcpy_nd(
+                                metadata=A_l3l2_fifos[col]["prod"]["shim_dma_alloc"].sym_name.value,
+                                bd_id=bd_id_base + 2 * tile_row + 1,
+                                mem=A,
+                                offsets=[0, 0, 0, A_offset],
+                                sizes=[
+                                    N // n // n_aie_cols,
+                                    K // k,
+                                    m * n_A_tiles_per_shim,
+                                    k,
+                                ],
+                                strides=[0, k, K, 1],
+                            )
+
+                            # B input transfer:
+                            # Transfer the first (n)-wide block of columns of B,
+                            # then transfer the (n_aie_cols)-th such block, and so on.
+                            # Each shim will start at a different column offset.
+                            # For example, shim 0 may transfer the tiles marked 0 below,
+                            # and shim 1 may transfer the tiles marked 1.
+                            #
+                            #             N
+                            #      ----------------
+                            #     |0011    0011    |
+                            #     |0011    0011    |
+                            #     |0011    0011    |
+                            #   K |0011    0011    |
+                            #     |0011    0011    |
+                            #     |0011    0011    |
+                            #     |0011    0011    |
+                            #     |0011    0011    |
+                            #      ----------------
+                            B_col_offset = col * n * K
+                            npu_dma_memcpy_nd(
+                                metadata=B_l3l2_fifos[col]["prod"]["shim_dma_alloc"].sym_name.value,
+                                bd_id=bd_id_base + 2 * tile_row + 2,
+                                mem=B,
+                                offsets=[0, 0, 0, B_col_offset],
+                                sizes=[N // n // n_aie_cols, K // k, n, k],
+                                strides=[n * n_aie_cols * K, k, K, 1],
+                            )
+                    if tb > 0 or (tb == 0 and pingpong > 0):
+                        for col in range(n_aie_cols):
+                            npu_sync(
+                                column=col, row=0, direction=0, channel=0
+                            )  # C done
+            for col in range(n_aie_cols):
+                npu_sync(column=col, row=0, direction=0, channel=0)
+
+
+if __name__ == "__main__":
+    main()
+else:
+    print("Not meant to be imported")
+    sys.exit(1)
diff --git a/programming_examples/basic/matrix_multiplication/rtp/await_rtp.cc b/programming_examples/basic/matrix_multiplication/rtp/await_rtp.cc
new file mode 100644
index 0000000000..cce1d5a005
--- /dev/null
+++ b/programming_examples/basic/matrix_multiplication/rtp/await_rtp.cc
@@ -0,0 +1,38 @@
+//===- await_rtp.cc ---------------------------------------000---*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AWAIT_RTP_CC
+#define AWAIT_RTP_CC
+
+extern "C" {
+
+/* Polls a run-time parameter until it is set to a value other than -1.
+
+   Dedicate one RTP as the "ready" signal. Once you have set all other RTPs,
+   set this RTP to 1, and this function will unblock. In the core, you can
+   then read the other RTPs *after* this function unblocks.
+
+   Note: There is a small race condition here if the host sets the "ready"
+   RTP *before* the core calls this function. This is unlikely to happen if
+   this function is the first thing called in the core, as the core
+   executes much faster than the host controller can set values in core
+   memory.
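+
+   For reference, the runtime sequence in aie2.py follows exactly this
+   protocol, writing the parameters first and the "ready" flag last:
+
+       npu_rtp_write(rtp, 1, K // k // 2)
+       npu_rtp_write(rtp, 2, n_tiles_per_core)
+       npu_rtp_write(rtp, 0, 1)   # "ready"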
+ */
+void await_rtp(volatile int *rtp) {
+  rtp[0] = -1;
+  while(rtp[0] == -1);
+}
+
+int get_volatile_rtp(volatile int *rtp, int index) {
+  return rtp[index];
+}
+
+}
+
+#endif
\ No newline at end of file
diff --git a/programming_examples/basic/matrix_multiplication/rtp/kernel.cc b/programming_examples/basic/matrix_multiplication/rtp/kernel.cc
new file mode 100644
index 0000000000..905556eece
--- /dev/null
+++ b/programming_examples/basic/matrix_multiplication/rtp/kernel.cc
@@ -0,0 +1,192 @@
+//===- mm.cc ----------------------------------------------000---*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#define __AIENGINE__ 2
+#define NOCPP
+#define __AIEARCH__ 20
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <type_traits>
+
+#define REL_WRITE 0
+#define REL_READ 1
+
+#include <aie_api/aie.hpp>
+
+#include "zero.cc"
+#include "await_rtp.cc"
+
+template <typename T_in, typename T_out, int rowA, int colA, int colB>
+void matmul_scalar(T_in *a, T_in *b, T_out *c) {
+  event0();
+  for (int row = 0; row < rowA; row++) {
+    for (int col = 0; col < colB; col++) {
+      T_out running_sum = 0;
+      for (int i = 0; i < colA; i++) {
+        running_sum += a[row * colA + i] * b[i * colB + col];
+      }
+      c[row * colB + col] += running_sum;
+    }
+  }
+  event1();
+}
+
+template <typename T_in, typename T_out, unsigned rowA, unsigned colA,
+          unsigned colB, unsigned r, unsigned s, unsigned t>
+void matmul_vectorized(const T_in *__restrict pA, const T_in *__restrict pB,
+                       T_out *__restrict pC) {
+  using MMUL = aie::mmul<r, s, t, T_in, T_in, accauto>;
+
+  event0();
+
+  // For int16 (4x4x4), this implementation iterates over the output space in
+  // steps of 4x4 tiles; each iteration makes an r*s, s*t and r*t step in the
+  // input and output space, respectively. The data layout expected is such
+  // that each r*s/s*t/r*t tile's elements are laid out contiguously in
+  // row-major order, and tiles themselves are organized in row-major
+  // order. For example, for 4x4x4 tiles, this means that an element in
+  // row 1, column 0 would be stored at offset 4 (since the first 4x4 tile
+  // is laid out contiguously in row-major). An element in row 0, column 4
+  // would be stored at offset 16 in the same example.
+
+  for (unsigned z = 0; z < rowA; z += 2)
+    chess_loop_range(2, ) {
+      T_out *__restrict pC1 = pC + (z * colB + 0) * MMUL::size_C;
+      T_out *__restrict pC2 = pC + ((z + 1) * colB + 0) * MMUL::size_C;
+
+      for (unsigned j = 0; j < colB; j += 2)
+        // chess_loop_range(2, ) {
+        chess_prepare_for_pipelining chess_loop_range(8, ) {
+          const T_in *__restrict pA1 = pA + (z * colA) * MMUL::size_A;
+          const T_in *__restrict pA2 = pA + ((z + 1) * colA) * MMUL::size_A;
+          const T_in *__restrict pB1 = pB + (j * colA) * MMUL::size_B;
+          const T_in *__restrict pB2 = pB + ((j + 1) * colA) * MMUL::size_B;
+
+          aie::vector<T_in, MMUL::size_A> A0 = aie::load_v<MMUL::size_A>(pA1);
+          pA1 += MMUL::size_A;
+          aie::vector<T_in, MMUL::size_A> A1 = aie::load_v<MMUL::size_A>(pA2);
+          pA2 += MMUL::size_A;
+          aie::vector<T_in, MMUL::size_B> B0 =
+              aie::transpose(aie::load_v<MMUL::size_B>(pB1), t, s);
+          pB1 += MMUL::size_B;
+          aie::vector<T_in, MMUL::size_B> B1 =
+              aie::transpose(aie::load_v<MMUL::size_B>(pB2), t, s);
+          pB2 += MMUL::size_B;
+
+          // We modify the library documentation implementation to accumulate
+          // in the C dimension, since this vectorized kernel will be called
+          // multiple times as we further tile the input at a higher level.
+          aie::vector<T_out, MMUL::size_C> acc_C00 =
+              aie::load_v<MMUL::size_C>(pC1);
+          aie::vector<T_out, MMUL::size_C> acc_C01 =
+              aie::load_v<MMUL::size_C>(pC1 + MMUL::size_C);
+          aie::vector<T_out, MMUL::size_C> acc_C10 =
+              aie::load_v<MMUL::size_C>(pC2);
+          aie::vector<T_out, MMUL::size_C> acc_C11 =
+              aie::load_v<MMUL::size_C>(pC2 + MMUL::size_C);
+
+          MMUL C00(acc_C00);
+          MMUL C01(acc_C01);
+          MMUL C10(acc_C10);
+          MMUL C11(acc_C11);
+
+          C00.mac(A0, B0);
+          C01.mac(A0, B1);
+          C10.mac(A1, B0);
+          C11.mac(A1, B1);
+
+          for (unsigned i = 1; i < colA; ++i)
+            chess_prepare_for_pipelining chess_loop_range(7, ) {
+              // chess_unroll_loop() {
+              A0 = aie::load_v<MMUL::size_A>(pA1);
+              pA1 += MMUL::size_A;
+              A1 = aie::load_v<MMUL::size_A>(pA2);
+              pA2 += MMUL::size_A;
+              B0 = aie::transpose(aie::load_v<MMUL::size_B>(pB1), t, s);
+              pB1 += MMUL::size_B;
+              B1 = aie::transpose(aie::load_v<MMUL::size_B>(pB2), t, s);
+              pB2 += MMUL::size_B;
+              C00.mac(A0, B0);
+              C01.mac(A0, B1);
+              C10.mac(A1, B0);
+              C11.mac(A1, B1);
+            }
+
+          aie::store_v(pC1, C00.template to_vector<T_out>());
+          pC1 += MMUL::size_C;
+          aie::store_v(pC1, C01.template to_vector<T_out>());
+          pC1 += MMUL::size_C;
+          aie::store_v(pC2, C10.template to_vector<T_out>());
+          pC2 += MMUL::size_C;
+          aie::store_v(pC2, C11.template to_vector<T_out>());
+          pC2 += MMUL::size_C;
+        }
+    }
+
+  event1();
+}
+
+template <unsigned m, unsigned k, unsigned n>
+void matmul_vectorized_4x8x4_bf16_f32(const bfloat16 *__restrict pA,
+                                      const bfloat16 *__restrict pB,
+                                      float *__restrict pC) {
+  aie::set_rounding(aie::rounding_mode::conv_even);
+  constexpr int r = 4;
+  constexpr int s = 8;
+  constexpr int t = 4;
+  static_assert(m % (2 * r) == 0 && m / (2 * r) > 0);
+  static_assert(k % (2 * s) == 0 && k / (2 * s) > 0);
+  static_assert(n % (2 * t) == 0 && n / (2 * t) > 0);
+  return matmul_vectorized<bfloat16, float, m / r, k / s, n / t, r, s, t>(
+      pA, pB, pC);
+}
+
+extern "C" {
+
+// If you want to compile microkernels with different inner tile sizes,
+// define DIM_M, DIM_K and DIM_N at compile time using -DDIM_M=32 etc.
+// These dimensions must be divisible by the r, s, t dimensions used in
+// the kernels.
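+//
+// For reference, the Makefile in this example invokes the compiler roughly as
+// follows (shown with the default m/k/n values; the exact command lives in
+// the Makefile's ${kernel_target} rule):
+//
+//   xchesscc_wrapper aie2 -I${XILINX_VITIS}/aietools/include \
+//       -DDIM_M=64 -DDIM_K=64 -DDIM_N=32 -DBIT_WIDTH=8 -c kernel.cc -o mm_64x64x32.o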
+#define combos(X) \
+  X(bfloat16, bf16, float, f32, 4, 8, 4)
+
+#define matmul_vectorized_c_func(ctype_in, mlir_type_in, ctype_out,            \
+                                 mlir_type_out, r, s, t)                       \
+  void matmul_##mlir_type_in##_##mlir_type_out(ctype_in *a_in, ctype_in *b_in, \
+                                               ctype_out *c_out) {             \
+    matmul_vectorized_##r##x##s##x##t##_##mlir_type_in##_##mlir_type_out<      \
+        DIM_M, DIM_K, DIM_N>(a_in, b_in, c_out);                               \
+  }
+
+#define matmul_scalar_c_func(ctype_in, mlir_type_in, ctype_out, mlir_type_out, \
+                             r, s, t)                                          \
+  void matmul_scalar_##mlir_type_in##_##mlir_type_out(                         \
+      ctype_in *a_in, ctype_in *b_in, ctype_out *c_out) {                      \
+    matmul_scalar<ctype_in, ctype_out, DIM_M, DIM_K, DIM_N>(a_in, b_in,        \
+                                                            c_out);            \
+  }
+
+#define zero_vectorized_c_func(ctype_in, mlir_type_in, ctype_out,              \
+                               mlir_type_out, r, s, t)                         \
+  void zero_##mlir_type_out(ctype_out *c_out) {                                \
+    zero_vectorized<ctype_out, DIM_M, DIM_N, 32>(c_out);                       \
+  }
+
+#define zero_scalar_c_func(ctype_in, mlir_type_in, ctype_out, mlir_type_out,   \
+                           r, s, t)                                            \
+  void zero_scalar_##mlir_type_out(ctype_out *c_out) {                         \
+    zero_scalar<ctype_out, DIM_M, DIM_N>(c_out);                               \
+  }
+
+combos(matmul_vectorized_c_func) combos(matmul_scalar_c_func)
+    combos(zero_vectorized_c_func) combos(zero_scalar_c_func)
+
+} // extern "C"
diff --git a/programming_examples/basic/matrix_multiplication/rtp/test.cpp b/programming_examples/basic/matrix_multiplication/rtp/test.cpp
new file mode 100644
index 0000000000..c54b19e4b6
--- /dev/null
+++ b/programming_examples/basic/matrix_multiplication/rtp/test.cpp
@@ -0,0 +1,399 @@
+#include <algorithm>
+#include <cassert>
+#include <chrono>
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <optional>
+#include <sstream>
+#include <stdexcept>
+#include <stdfloat>
+#include <string>
+#include <tuple>
+#include <vector>
+#include <xrt/xrt_bo.h>
+#include <xrt/xrt_device.h>
+#include <xrt/xrt_kernel.h>
+
+#define VALIDATE 1
+
+#define AIEML_MAX_OFFLOAD_M 3072
+#define AIEML_MAX_OFFLOAD_K 3072
+#define AIEML_MAX_OFFLOAD_N 3072
+#define AIE_MAX_INSTR_LEN 4096
+
+#include "../common.h"
+
+// --------------------------------------------------------------------------
+// AIE initialization
+// The following structures and functions are used to initialize multiple
+// AIE kernels. We only switch the insts.txt instruction streams between
+// matmul sizes, so we load all of them ahead of time.
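+// Switching problem sizes at run time then only requires copying a different
+// instruction stream into the instruction buffer object (see aie_init_insts
+// below); the xclbin itself stays loaded.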
+// -------------------------------------------------------------------------- + +#define AIE_MAX_N_BOS 4 + +union aie_bo_map { + char *i8; + int *i32; + std::bfloat16_t *bf16; + float *f32; +}; + +enum aie_bo_dir { IN_ONLY, OUT_ONLY, IN_OUT }; + +struct aie_bo { + xrt::bo *bo; + int group_id; + enum aie_bo_dir dir; + size_t len; // in bytes + union aie_bo_map buf; +}; + +struct aie_global_state { + xrt::device *device; +}; + +struct aie_state { + std::string xclbin_path; + std::string kernel_name; + xrt::xclbin *xclbin; + xrt::hw_context *context; + xrt::kernel *kernel; + size_t n_bos; + struct aie_bo bos[AIE_MAX_N_BOS]; + std::vector *last_loaded_insts; // don't reload if they're the same + int instr_len; +}; + +struct aie_offload_gemm_info { + std::vector *insts; +}; + +struct aie_global_state aie_global; +struct aie_state aie_gemm; +struct aie_state aie_bias; +std::vector aie_gemm_256x768x2304_insts; +std::vector aie_gemm_256x768x768_insts; +std::vector aie_gemm_256x768x3072_insts; +std::vector aie_gemm_256x3072x768_insts; +std::vector aie_gemm_256x768x50304_insts; +std::vector aie_gemm_256x50304x768_insts; +std::vector aie_gemm_768x256x3072_insts; +std::vector aie_gemm_3072x256x768_insts; +std::vector aie_gemm_768x256x768_insts; +std::vector aie_gemm_256x2304x768_insts; +std::vector aie_gemm_2304x256x768_insts; +std::map, struct aie_offload_gemm_info> aie_offload; + +std::vector aie_load_instr_sequence(std::string instr_path) { + std::ifstream instr_file(instr_path); + std::string line; + std::vector instr_v; + while (std::getline(instr_file, line)) { + std::istringstream iss(line); + uint32_t a; + if (!(iss >> std::hex >> a)) { + throw std::runtime_error("Unable to parse instruction file\n"); + } + instr_v.push_back(a); + } + return instr_v; +} + +void aie_init_global() { + // Set up device + unsigned int device_index = 0; + aie_global.device = new xrt::device(device_index); +} + +void aie_init_design(struct aie_state *aie_state) { + // Load xclbin + constexpr int verbosity = 1; + if (verbosity >= 1) { std::cout << "Loading xclbin: " << aie_state->xclbin_path << "\n"; } + aie_state->xclbin = new xrt::xclbin(aie_state->xclbin_path); + if (verbosity >= 1) { std::cout << "Kernel opcode: " << aie_state->kernel_name << "\n"; } + auto xkernels = aie_state->xclbin->get_kernels(); + auto xkernel = *std::find_if(xkernels.begin(), xkernels.end(), + [aie_state](xrt::xclbin::kernel &k) { return k.get_name().rfind(aie_state->kernel_name, 0) == 0; }); + auto kernel_name = xkernel.get_name(); + if (verbosity >= 1) { std::cout << "Registering xclbin: " << aie_state->xclbin_path << "\n"; } + aie_global.device->register_xclbin(*aie_state->xclbin); + if (verbosity >= 1) { std::cout << "Getting hardware context.\n"; } + if (verbosity >= 1) { std::cout << aie_state->xclbin->get_uuid().to_string() << std::endl; } + aie_state->context = new xrt::hw_context(*aie_global.device, aie_state->xclbin->get_uuid()); + if (verbosity >= 1) { std::cout << "Getting handle to kernel:" << kernel_name << "\n"; } + aie_state->kernel = new xrt::kernel(*aie_state->context, kernel_name); + + assert(aie_state->n_bos >= 1 && aie_state->n_bos <= AIE_MAX_N_BOS); // buffer 1 is insts buffer + aie_state->bos[0].len = AIE_MAX_INSTR_LEN * sizeof(int); + aie_state->bos[0].group_id = 1; + aie_state->bos[0].bo = new xrt::bo(*aie_global.device, aie_state->bos[0].len, XCL_BO_FLAGS_CACHEABLE, aie_state->kernel->group_id(aie_state->bos[0].group_id)); + aie_state->bos[0].buf.i32 = aie_state->bos[0].bo->map(); + + for(int i = 1; i < 
aie_state->n_bos; i++) { + aie_state->bos[i].group_id = i + 2; // 1 is insts, 2 is insts_len, other buffers start at 3 + aie_state->bos[i].bo = new xrt::bo(*aie_global.device, aie_state->bos[i].len, XRT_BO_FLAGS_HOST_ONLY, aie_state->kernel->group_id(aie_state->bos[i].group_id)); + aie_state->bos[i].buf.i8 = aie_state->bos[i].bo->map(); + } +} + +std::vector load_insts(const char *insts_txt_path) { + // Load instructions + constexpr int verbosity = 0; + std::vector instr_v = aie_load_instr_sequence(insts_txt_path); + if (verbosity >= 1) { std::cout << "Sequence instr count: " << instr_v.size() << "\n"; } + assert(instr_v.size() < AIE_MAX_INSTR_LEN); + return std::move(instr_v); +} + +void aie_init_insts(struct aie_state *aie_state, std::vector *instr_v) { + if(instr_v == aie_state->last_loaded_insts) { + return; + } + memset(aie_state->bos[0].buf.i8, 0, AIE_MAX_INSTR_LEN * sizeof(int)); + memcpy(aie_state->bos[0].buf.i8, instr_v->data(), instr_v->size() * sizeof(int)); + aie_state->bos[0].bo->sync(XCL_BO_SYNC_BO_TO_DEVICE); + aie_state->instr_len = instr_v->size(); +} + +void aie_run_design(struct aie_state *aie_state) { + // bos[0] is synced in init function + for(int i = 1; i < aie_state->n_bos; i++) { + if(aie_state->bos[i].dir == OUT_ONLY) { + continue; + } + aie_state->bos[i].bo->sync(XCL_BO_SYNC_BO_TO_DEVICE); + } + unsigned int opcode = 3; + auto run = (*aie_state->kernel)(opcode, *aie_state->bos[0].bo, aie_state->instr_len, *aie_state->bos[1].bo, *aie_state->bos[2].bo, *aie_state->bos[3].bo); + ert_cmd_state r = run.wait(); + if (r != ERT_CMD_STATE_COMPLETED) { + std::cout << "AIE Error Status: " << r << std::endl; + exit(1); + } + for(int i = 1; i < aie_state->n_bos; i++) { + if(aie_state->bos[i].dir == IN_ONLY) { + continue; + } + aie_state->bos[i].bo->sync(XCL_BO_SYNC_BO_FROM_DEVICE); + } +} + +void aie_init() { + aie_init_global(); + + // GEMM design + aie_gemm_256x768x2304_insts = load_insts("build/insts_256x768x2304.txt"); + aie_gemm_256x768x768_insts = load_insts("build/insts_256x768x768.txt"); + aie_gemm_256x768x3072_insts = load_insts("build/insts_256x768x3072.txt"); + aie_gemm_256x3072x768_insts = load_insts("build/insts_256x3072x768.txt"); + aie_gemm_768x256x3072_insts = load_insts("build/insts_768x256x3072.txt"); + aie_gemm_3072x256x768_insts = load_insts("build/insts_3072x256x768.txt"); + aie_gemm_768x256x768_insts = load_insts("build/insts_768x256x768.txt"); + aie_gemm_256x2304x768_insts = load_insts("build/insts_256x2304x768.txt"); + aie_gemm_2304x256x768_insts = load_insts("build/insts_2304x256x768.txt"); + aie_gemm.xclbin_path = "build/final.xclbin"; + aie_gemm.kernel_name = "MLIR_AIE"; + aie_gemm.n_bos = 4; + aie_gemm.bos[1].len = AIEML_MAX_OFFLOAD_M*AIEML_MAX_OFFLOAD_K*sizeof(std::bfloat16_t); + aie_gemm.bos[1].dir = IN_ONLY; + aie_gemm.bos[2].len = AIEML_MAX_OFFLOAD_K*AIEML_MAX_OFFLOAD_N*sizeof(std::bfloat16_t); + aie_gemm.bos[2].dir = IN_ONLY; + aie_gemm.bos[3].len = AIEML_MAX_OFFLOAD_M*AIEML_MAX_OFFLOAD_N*sizeof(float); + aie_gemm.bos[3].dir = IN_OUT; + aie_init_design(&aie_gemm); + + aie_offload[std::make_tuple(256, 768, 2304)] = (struct aie_offload_gemm_info){ &aie_gemm_256x768x2304_insts }; + aie_offload[std::make_tuple(256, 768, 768)] = (struct aie_offload_gemm_info){ &aie_gemm_256x768x768_insts }; + aie_offload[std::make_tuple(256, 768, 3072)] = (struct aie_offload_gemm_info){ &aie_gemm_256x768x3072_insts }; + aie_offload[std::make_tuple(256, 3072, 768)] = (struct aie_offload_gemm_info){ &aie_gemm_256x3072x768_insts }; + aie_offload[std::make_tuple(768, 
256, 3072)] = (struct aie_offload_gemm_info){ &aie_gemm_768x256x3072_insts }; + aie_offload[std::make_tuple(3072, 256, 768)] = (struct aie_offload_gemm_info){ &aie_gemm_3072x256x768_insts }; + aie_offload[std::make_tuple(768, 256, 768)] = (struct aie_offload_gemm_info){ &aie_gemm_768x256x768_insts }; + aie_offload[std::make_tuple(256, 2304, 768)] = (struct aie_offload_gemm_info){ &aie_gemm_256x2304x768_insts }; + aie_offload[std::make_tuple(2304, 256, 768)] = (struct aie_offload_gemm_info){ &aie_gemm_2304x256x768_insts }; + +} + + +// -------------------------------------------------------------------------- +// Main matmul implementation +// -------------------------------------------------------------------------- + +template +void aie_do_gemm(long M, long K, long N, const float * __restrict inp, const float * __restrict weight, const float * __restrict bias, float * __restrict out) { + auto info = aie_offload.find(std::make_tuple(M, K, N)); + + std::bfloat16_t *aie_buf_a = aie_gemm.bos[1].buf.bf16; + std::bfloat16_t *aie_buf_b = aie_gemm.bos[2].buf.bf16; + float *aie_buf_c = aie_gemm.bos[3].buf.f32; + // Copy over A + if(inp_is_col_major) { + // design expects inptus to be row major + for (long i = 0; i < M; i++) { + for (long j = 0; j < K; j++) { + aie_buf_a[i* K + j] = (std::bfloat16_t)inp[i + j * M]; + } + } + } else { + for (long i = 0; i < M; i++) { + for (long j = 0; j < K; j++) { + aie_buf_a[i * K + j] = (std::bfloat16_t)inp[i * K + j]; + } + } + } + // Copy B + if(weight_is_col_major) { + // new design expects weight to be col major + for(long i = 0; i < K*N; i++) { + aie_buf_b[i] = (std::bfloat16_t)weight[i]; + } + } else { + // need to transpose for row-major weights design + for (long i = 0; i < K; i++) { + for (long j = 0; j < N; j++) { + aie_buf_b[i + j*K] = (std::bfloat16_t)weight[i * N + j]; + } + } + } + + // Run + aie_init_insts(&aie_gemm, info->second.insts); + aie_run_design(&aie_gemm); + + // Write back results + memcpy(out, aie_buf_c, M*N*sizeof(out[0])); +} + + +// -------------------------------------------------------------------------- +// Verification +// -------------------------------------------------------------------------- + +// forward decl +template +void matmul_reference(float *out, const float* a, const float* b, const float *bias, + long M, long K, long N); + +float out_ref[AIEML_MAX_OFFLOAD_M*AIEML_MAX_OFFLOAD_N]; +template +bool validate_matmul(long M, long K, long N, const float * __restrict inp, const float * __restrict weight, const float * __restrict bias, float * __restrict out_to_test) { + matmul_reference(out_ref, inp, weight, NULL, M, K, N); + std::vector CRef(out_ref, out_ref+(M*N)); + std::vector C(out_to_test, out_to_test+(M*N)); + int n_errors = 0; + std::vector> errors; + float max_rel_error = (float)0.0f; + for (long row = 0; row < M; row++) { + for (long col = 0; col < N; col++) { + std::optional> error = matmul_common::verify_single( + std::cout, row, col, CRef[row * N + col], C[row * N + col], 0.5, 0.05); + if (error.has_value()) { + if (n_errors < 10) { + errors.push_back(*error); + } + float rel_error = + std::abs(error->actual - error->expected) / + std::max(std::abs(error->actual), std::abs(error->expected)); + if (rel_error > max_rel_error) { + max_rel_error = rel_error; + } + n_errors++; + } + } + } + if (n_errors > 0) { + matmul_common::print_error_summary(std::cout, n_errors, errors, max_rel_error); + std::cout << std::endl << "Reference:" << std::endl; + matmul_common::print_matrix(CRef, N); + std::cout << std::endl << 
"Output:" << std::endl; + matmul_common::print_matrix(C, N); + return false; + } + return true; +} + +template +void matmul_reference(float *out, const float* a, const float* b, const float *bias, + long M, long K, long N) +{ + const int LOOP_UNROLL = 8; + assert(M % LOOP_UNROLL == 0); + + for (int obt = 0; obt < M; obt += LOOP_UNROLL) { + for (int o = 0; o < N; o++) { + // we'll keep LOOP_UNROLL many results in registers + float result[LOOP_UNROLL]; + // initialize the bias, if it exists + for (int ibt = 0; ibt < LOOP_UNROLL; ibt++) { + //result[ibt] = (bias != NULL) ? bias[o] : 0.0f; + result[ibt] = (bias != NULL) ? bias[o] : 0.0f; + } + // inner loops. Because we do LOOP_UNROLL steps of inner bt, we can cache + // the value of b[i + o * K] and reuse it. + // we compile with -Ofast, so the compiler will turn the inner loop into FMAs + for (int i = 0; i < K; i++) { + float w = (b_is_col_major ? b[i + o * K] : b[i * N + o]); + for (int ibt = 0; ibt < LOOP_UNROLL; ibt++) { + int bt = obt + ibt; + float inp = (a_is_col_major ? a[bt + i * M] : a[bt * K + i]); + result[ibt] += inp * w; + } + } + // write back results to main memory + for (int ibt = 0; ibt < LOOP_UNROLL; ibt++) { + int bt = obt + ibt; + out[bt * N + o] = result[ibt]; + } + } + } +} + + +// -------------------------------------------------------------------------- +// Main +// -------------------------------------------------------------------------- + +float A[AIEML_MAX_OFFLOAD_M*AIEML_MAX_OFFLOAD_K]; +float B[AIEML_MAX_OFFLOAD_K*AIEML_MAX_OFFLOAD_N]; +float C[AIEML_MAX_OFFLOAD_M*AIEML_MAX_OFFLOAD_N]; + +int main(int argc, char **argv) { + aie_init(); + // do three iterations of switching between sizes + for(int i = 0; i < 3; i++) { + for(auto it = aie_offload.begin(); it != aie_offload.end(); ++it) { + auto [M, K, N] = it->first; + for(int j = 0; j < M*K; j++) { + A[j] = matmul_common::get_random(); + } + for(int j = 0; j < K*N; j++) { + B[j] = matmul_common::get_random(); + } + printf("Running matmul: %4dx%4dx%4d ...", M, K, N); + fflush(stdout); + auto tstart = std::chrono::system_clock::now(); + aie_do_gemm(M, K, N, A, B, NULL, C); + auto tstop = std::chrono::system_clock::now(); + float t = std::chrono::duration_cast(tstop - tstart).count(); + printf(" complete after %6.0fus", t); + fflush(stdout); + #if VALIDATE + if(validate_matmul(M, K, N, A, B, NULL, C)) { + printf(" - pass!\n"); + } else { + printf("FAIL.\n"); + exit(0); + } + #else + printf(" - not validated\n"); + #endif + } + } + printf("PASS!\n"); // We will exit in aie_do_gemm above if verification does not pass. + return 0; +} diff --git a/programming_examples/basic/matrix_multiplication/rtp/util.py b/programming_examples/basic/matrix_multiplication/rtp/util.py new file mode 100644 index 0000000000..4e9a635d79 --- /dev/null +++ b/programming_examples/basic/matrix_multiplication/rtp/util.py @@ -0,0 +1,52 @@ +# These Python bindings are to be merged in #1699 in the future +from typing import Optional, Union +from aie.dialects.aie import * + +def get_dma_region_decorator(op_obj_constructor): + def decorator(f): + f_sig = inspect.signature(f) + op = op_obj_constructor() + entry_block = op.body.blocks.append() + bds_ctx = bds(op) + with InsertionPoint(entry_block): + with bds_ctx as bd: + if len(f_sig.parameters) == 0: + f() + elif len(f_sig.parameters) == 1: + f(bd) + else: + raise RuntimeError( + "Expected function to take zero or one argument(s)." 
+ ) + return op + + return decorator + + +def mem(tile): + return get_dma_region_decorator(lambda: MemOp(T.index(), tile)) + + +def shim_mem(tile): + return get_dma_region_decorator(lambda: ShimDMAOp(T.index(), tile)) + + +def memtile_dma(tile): + return get_dma_region_decorator(lambda: MemTileDMAOp(T.index(), tile)) + + +def dma_start( + channel_dir, + channel_index, + *, + dest: Optional[Union[Successor, Block, ContextManagedBlock]] = None, + chain: Optional[Union[Successor, Block, ContextManagedBlock]] = None, + loc=None, + ip=None, +): + chain_block = chain.block if isinstance(chain, ContextManagedBlock) else chain + dest_block = dest.block if isinstance(dest, ContextManagedBlock) else dest + op = DMAStartOp( + channel_dir, channel_index, dest=dest_block, chain=chain_block, loc=loc, ip=ip + ) + return op.dest, op.chain \ No newline at end of file diff --git a/programming_examples/basic/matrix_multiplication/rtp/zero.cc b/programming_examples/basic/matrix_multiplication/rtp/zero.cc new file mode 100644 index 0000000000..c2e8306a4d --- /dev/null +++ b/programming_examples/basic/matrix_multiplication/rtp/zero.cc @@ -0,0 +1,40 @@ +//===- zero.cc --------------------------------------------000---*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2023, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +#ifndef ZERO_CC +#define ZERO_CC + +#include +#include +#include +#include + +template +void zero_scalar(T *__restrict c) { + for (int i = 0; i < M * N; i++) { + c[i] = 0; + } +} + +template +void zero_vectorized(T *__restrict c) { + const aie::vector zeros = aie::zeros(); + const T *__restrict c_end = c + M * N; + for (; c + r < c_end; c += r) { + aie::store_v(c, zeros); + } + // Do a scalar write for any remainder not divisible by vector instruction + // size r + for (; c < c_end; c++) { + *c = 0; + } +} + +#endif \ No newline at end of file From 8bfd0856f5af3784137e69459a395aea4e1d5852 Mon Sep 17 00:00:00 2001 From: andrej Date: Mon, 18 Nov 2024 16:42:01 -0700 Subject: [PATCH 2/7] update after changes to Python bindings --- .../basic/matrix_multiplication/rtp/aie2.py | 33 ++++++++++--------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/programming_examples/basic/matrix_multiplication/rtp/aie2.py b/programming_examples/basic/matrix_multiplication/rtp/aie2.py index c998a4c6ad..1746e64383 100644 --- a/programming_examples/basic/matrix_multiplication/rtp/aie2.py +++ b/programming_examples/basic/matrix_multiplication/rtp/aie2.py @@ -15,6 +15,7 @@ from aie.dialects.scf import * import aie.dialects.index as index_dialect import aie.dialects.arith as arith_dialect +import aie.dialects.memref as memref_dialect from util import * @@ -188,7 +189,7 @@ def device_body(): for row in range(n_aie_rows): # RTP index 0: "ready" signal # RTP index 1: K // k // 2 - rtp_bufs[row][col] = buffer(core_tiles[row][col], (3,), T.i32(), f"rtp_{row}_{col}") + rtp_bufs[row][col] = buffer(core_tiles[row][col], datatype=T.memref(3, T.i32()), name=f"rtp_{row}_{col}") # AIE-array data movement with object fifos A_l3l2_fifos = [None] * n_aie_cols @@ -206,16 +207,16 @@ def device_body(): A_l2l1_fifos[row] = { "prod" : { "endpoint": (mem_tile, WireBundle.DMA, 0), - "ping_buf": buffer(mem_tile, A_l2_memref_ty.shape, dtype_in(), 
name=f"A_L3L2_{row}_cons_buff_0"), - "pong_buf": buffer(mem_tile, A_l2_memref_ty.shape, dtype_in(), name=f"A_L3L2_{row}_cons_buff_1"), + "ping_buf": buffer(mem_tile, datatype=A_l2_memref_ty, name=f"A_L3L2_{row}_cons_buff_0"), + "pong_buf": buffer(mem_tile, datatype=A_l2_memref_ty, name=f"A_L3L2_{row}_cons_buff_1"), "put_lock": lock(mem_tile, init=2, sym_name=f"A_L3L2_{row}_cons_prod_lock", lock_id=0), "get_lock": lock(mem_tile, init=0, sym_name=f"A_L3L2_{row}_cons_cons_lock", lock_id=1) }, "cons" : [ { "endpoint": (core_tiles[row][col], WireBundle.DMA, 0), - "ping_buf": buffer(core_tiles[row][col], A_l1_memref_ty.shape, dtype_in(), name=f"A_L2L1_{row}_{col}_cons_buff_0"), - "pong_buf": buffer(core_tiles[row][col], A_l1_memref_ty.shape, dtype_in(), name=f"A_L2L1_{row}_{col}_cons_buff_1"), + "ping_buf": buffer(core_tiles[row][col], datatype=A_l1_memref_ty, name=f"A_L2L1_{row}_{col}_cons_buff_0"), + "pong_buf": buffer(core_tiles[row][col], datatype=A_l1_memref_ty, name=f"A_L2L1_{row}_{col}_cons_buff_1"), "put_lock": lock(core_tiles[row][col], init=2, sym_name=f"A_L2L1_{row}_{col}_cons_prod_lock", lock_id=0), "get_lock": lock(core_tiles[row][col], init=0, sym_name=f"A_L2L1_{row}_{col}_cons_cons_lock", lock_id=1) } @@ -235,7 +236,7 @@ def device_body(): A_l3l2_fifos[col] = { "prod" : { "endpoint": (shim_tile, WireBundle.DMA, 0), - "shim_memref": memref.global_(sym_name=f"A_L3L2_{col}", sym_visibility="public", type_=A_l3_memref_ty), + "shim_memref": memref_dialect.global_(sym_name=f"A_L3L2_{col}", sym_visibility="public", type_=A_l3_memref_ty), "shim_dma_alloc": ShimDMAAllocationOp(f"A_L3L2_{col}", DMAChannelDir.MM2S, 0, col=col) }, "cons" : { @@ -253,16 +254,16 @@ def device_body(): B_l2l1_fifos[col] = { "prod" : { "endpoint": (mem_tile, WireBundle.DMA, 1), - "ping_buf": buffer(mem_tile, B_l2_memref_ty.shape, dtype_in(), name=f"B_L3L2_{col}_cons_buff_0"), - "pong_buf": buffer(mem_tile, B_l2_memref_ty.shape, dtype_in(), name=f"B_L3L2_{col}_cons_buff_1"), + "ping_buf": buffer(mem_tile, datatype=B_l2_memref_ty, name=f"B_L3L2_{col}_cons_buff_0"), + "pong_buf": buffer(mem_tile, datatype=B_l2_memref_ty, name=f"B_L3L2_{col}_cons_buff_1"), "put_lock": lock(mem_tile, init=2, sym_name=f"B_L3L2_{col}_cons_prod_lock", lock_id=2), "get_lock": lock(mem_tile, init=0, sym_name=f"B_L3L2_{col}_cons_cons_lock", lock_id=3) }, "cons" : [ { "endpoint": (core_tiles[row][col], WireBundle.DMA, 1), - "ping_buf": buffer(core_tiles[row][col], B_l1_memref_ty.shape, dtype_in(), name=f"B_L2L1_{col}_{row}_cons_buff_0"), - "pong_buf": buffer(core_tiles[row][col], B_l1_memref_ty.shape, dtype_in(), name=f"B_L2L1_{col}_{row}_cons_buff_1"), + "ping_buf": buffer(core_tiles[row][col], datatype=B_l1_memref_ty, name=f"B_L2L1_{col}_{row}_cons_buff_0"), + "pong_buf": buffer(core_tiles[row][col], datatype=B_l1_memref_ty, name=f"B_L2L1_{col}_{row}_cons_buff_1"), "put_lock": lock(core_tiles[row][col], init=2, sym_name=f"B_L2L1_{col}_{row}_cons_prod_lock", lock_id=2), "get_lock": lock(core_tiles[row][col], init=0, sym_name=f"B_L2L1_{col}_{row}_cons_cons_lock", lock_id=3) } @@ -282,7 +283,7 @@ def device_body(): B_l3l2_fifos[col] = { "prod" : { "endpoint": (shim_tile, WireBundle.DMA, 1), - "shim_memref": memref.global_(sym_name=f"B_L3L2_{col}", sym_visibility="public", type_=B_l3_memref_ty), + "shim_memref_dialect": memref_dialect.global_(sym_name=f"B_L3L2_{col}", sym_visibility="public", type_=B_l3_memref_ty), "shim_dma_alloc": ShimDMAAllocationOp(f"B_L3L2_{col}", DMAChannelDir.MM2S, 1, col=col) }, "cons" : { @@ -300,8 +301,8 @@ def 
device_body(): C_l1l2_fifos[row][col] = { "prod" : { "endpoint": (core_tiles[row][col], WireBundle.DMA, 0), - "ping_buf": buffer(core_tiles[row][col], C_l1_memref_ty.shape, dtype_out(), name=f"C_L1L2_{col}_{row}_buff_0"), - "pong_buf": buffer(core_tiles[row][col], C_l1_memref_ty.shape, dtype_out(), name=f"C_L1L2_{col}_{row}_buff_1"), + "ping_buf": buffer(core_tiles[row][col], datatype=C_l1_memref_ty, name=f"C_L1L2_{col}_{row}_buff_0"), + "pong_buf": buffer(core_tiles[row][col], datatype=C_l1_memref_ty, name=f"C_L1L2_{col}_{row}_buff_1"), "put_lock": lock(core_tiles[row][col], init=2, sym_name=f"C_L1L2_{col}_{row}_prod_lock", lock_id=4), "get_lock": lock(core_tiles[row][col], init=0, sym_name=f"C_L1L2_{col}_{row}_cons_lock", lock_id=5) }, @@ -321,14 +322,14 @@ def device_body(): C_l2l3_fifos[col] = { "prod" : { "endpoint": (mem_tiles[col], WireBundle.DMA, 2), - "ping_buf": buffer(mem_tiles[col], C_l2_memref_ty.shape, dtype_out(), name=f"C_L2L3_{col}_buff_0"), - "pong_buf": buffer(mem_tiles[col], C_l2_memref_ty.shape, dtype_out(), name=f"C_L2L3_{col}_buff_1"), + "ping_buf": buffer(mem_tiles[col], datatype=C_l2_memref_ty, name=f"C_L2L3_{col}_buff_0"), + "pong_buf": buffer(mem_tiles[col], datatype=C_l2_memref_ty, name=f"C_L2L3_{col}_buff_1"), "put_lock": lock(mem_tiles[col], init=4*2, sym_name=f"C_L2L3_{col}_prod_lock", lock_id=4), "get_lock": lock(mem_tiles[col], init=0, sym_name=f"C_L2L3_{col}_cons_lock", lock_id=5) }, "cons" : { "endpoint": (shim_tiles[col], WireBundle.DMA, 0), - "shim_memref": memref.global_(sym_name=f"C_L2L3_{col}", sym_visibility="public", type_=C_l3_memref_ty), + "shim_memref": memref_dialect.global_(sym_name=f"C_L2L3_{col}", sym_visibility="public", type_=C_l3_memref_ty), "shim_dma_alloc": ShimDMAAllocationOp(f"C_L2L3_{col}", DMAChannelDir.S2MM, 0, col=col) } } From 284979a25fdb4a2cb934b0167accd00b11070027 Mon Sep 17 00:00:00 2001 From: andrej Date: Mon, 18 Nov 2024 17:47:32 -0700 Subject: [PATCH 3/7] format --- .../basic/matrix_multiplication/rtp/aie2.py | 692 +++++++++++++----- .../matrix_multiplication/rtp/await_rtp.cc | 14 +- .../basic/matrix_multiplication/rtp/kernel.cc | 11 +- .../basic/matrix_multiplication/rtp/test.cpp | 593 ++++++++------- 4 files changed, 836 insertions(+), 474 deletions(-) diff --git a/programming_examples/basic/matrix_multiplication/rtp/aie2.py b/programming_examples/basic/matrix_multiplication/rtp/aie2.py index 1746e64383..e23361e92f 100644 --- a/programming_examples/basic/matrix_multiplication/rtp/aie2.py +++ b/programming_examples/basic/matrix_multiplication/rtp/aie2.py @@ -19,8 +19,9 @@ from util import * + def get_memref_len_elems(memref): - out = 1 + out = 1 for s in memref.shape: out *= s return out @@ -84,7 +85,7 @@ def my_matmul(M, K, N, m, k, n, n_aie_cols, dtype_in_str, dtype_out_str): dtype_out = T.f32 elif dtype_out_str == "i32": dtype_out = T.i32 - + assert dtype_in == T.bf16 assert dtype_out == T.f32 @@ -165,14 +166,9 @@ def device_body(): f"matmul_{dtype_in_str}_{dtype_out_str}", inputs=[A_l1_memref_ty, B_l1_memref_ty, C_l1_memref_ty], ) - await_rtp = external_func( - f"await_rtp", - inputs=[rtp_ty] - ) + await_rtp = external_func(f"await_rtp", inputs=[rtp_ty]) get_volatile_rtp = external_func( - f"get_volatile_rtp", - inputs=[rtp_ty, T.i32()], - outputs=[T.i32()] + f"get_volatile_rtp", inputs=[rtp_ty, T.i32()], outputs=[T.i32()] ) # Tile declarations as tile[row][col] @@ -189,7 +185,11 @@ def device_body(): for row in range(n_aie_rows): # RTP index 0: "ready" signal # RTP index 1: K // k // 2 - rtp_bufs[row][col] = 
buffer(core_tiles[row][col], datatype=T.memref(3, T.i32()), name=f"rtp_{row}_{col}") + rtp_bufs[row][col] = buffer( + core_tiles[row][col], + datatype=T.memref(3, T.i32()), + name=f"rtp_{row}_{col}", + ) # AIE-array data movement with object fifos A_l3l2_fifos = [None] * n_aie_cols @@ -205,143 +205,283 @@ def device_body(): for row in range(n_aie_rows): mem_tile = mem_tiles[row // n_A_tiles_per_shim] A_l2l1_fifos[row] = { - "prod" : { + "prod": { "endpoint": (mem_tile, WireBundle.DMA, 0), - "ping_buf": buffer(mem_tile, datatype=A_l2_memref_ty, name=f"A_L3L2_{row}_cons_buff_0"), - "pong_buf": buffer(mem_tile, datatype=A_l2_memref_ty, name=f"A_L3L2_{row}_cons_buff_1"), - "put_lock": lock(mem_tile, init=2, sym_name=f"A_L3L2_{row}_cons_prod_lock", lock_id=0), - "get_lock": lock(mem_tile, init=0, sym_name=f"A_L3L2_{row}_cons_cons_lock", lock_id=1) + "ping_buf": buffer( + mem_tile, + datatype=A_l2_memref_ty, + name=f"A_L3L2_{row}_cons_buff_0", + ), + "pong_buf": buffer( + mem_tile, + datatype=A_l2_memref_ty, + name=f"A_L3L2_{row}_cons_buff_1", + ), + "put_lock": lock( + mem_tile, + init=2, + sym_name=f"A_L3L2_{row}_cons_prod_lock", + lock_id=0, + ), + "get_lock": lock( + mem_tile, + init=0, + sym_name=f"A_L3L2_{row}_cons_cons_lock", + lock_id=1, + ), }, - "cons" : [ - { + "cons": [ + { "endpoint": (core_tiles[row][col], WireBundle.DMA, 0), - "ping_buf": buffer(core_tiles[row][col], datatype=A_l1_memref_ty, name=f"A_L2L1_{row}_{col}_cons_buff_0"), - "pong_buf": buffer(core_tiles[row][col], datatype=A_l1_memref_ty, name=f"A_L2L1_{row}_{col}_cons_buff_1"), - "put_lock": lock(core_tiles[row][col], init=2, sym_name=f"A_L2L1_{row}_{col}_cons_prod_lock", lock_id=0), - "get_lock": lock(core_tiles[row][col], init=0, sym_name=f"A_L2L1_{row}_{col}_cons_cons_lock", lock_id=1) + "ping_buf": buffer( + core_tiles[row][col], + datatype=A_l1_memref_ty, + name=f"A_L2L1_{row}_{col}_cons_buff_0", + ), + "pong_buf": buffer( + core_tiles[row][col], + datatype=A_l1_memref_ty, + name=f"A_L2L1_{row}_{col}_cons_buff_1", + ), + "put_lock": lock( + core_tiles[row][col], + init=2, + sym_name=f"A_L2L1_{row}_{col}_cons_prod_lock", + lock_id=0, + ), + "get_lock": lock( + core_tiles[row][col], + init=0, + sym_name=f"A_L2L1_{row}_{col}_cons_cons_lock", + lock_id=1, + ), } for col in range(n_aie_cols) - ] # broadcast along one row + ], # broadcast along one row } for col in range(n_aie_cols): - src_tile, src_bundle, src_channel = A_l2l1_fifos[row]["prod"]["endpoint"] - dst_tile, dst_bundle, dst_channel = A_l2l1_fifos[row]["cons"][col]["endpoint"] - flow(src_tile, src_bundle, src_channel, - dst_tile, dst_bundle, dst_channel) + src_tile, src_bundle, src_channel = A_l2l1_fifos[row]["prod"][ + "endpoint" + ] + dst_tile, dst_bundle, dst_channel = A_l2l1_fifos[row]["cons"][col][ + "endpoint" + ] + flow( + src_tile, src_bundle, src_channel, dst_tile, dst_bundle, dst_channel + ) # Input A, L3 -> L2 for col in range(n_aie_cols): shim_tile = shim_tiles[col] mem_tile = mem_tiles[col] A_l3l2_fifos[col] = { - "prod" : { + "prod": { "endpoint": (shim_tile, WireBundle.DMA, 0), - "shim_memref": memref_dialect.global_(sym_name=f"A_L3L2_{col}", sym_visibility="public", type_=A_l3_memref_ty), - "shim_dma_alloc": ShimDMAAllocationOp(f"A_L3L2_{col}", DMAChannelDir.MM2S, 0, col=col) + "shim_memref": memref_dialect.global_( + sym_name=f"A_L3L2_{col}", + sym_visibility="public", + type_=A_l3_memref_ty, + ), + "shim_dma_alloc": ShimDMAAllocationOp( + f"A_L3L2_{col}", DMAChannelDir.MM2S, 0, col=col + ), }, - "cons" : { + "cons": { "endpoint": (mem_tile, 
WireBundle.DMA, 0), - } + }, } src_tile, src_bundle, src_channel = A_l3l2_fifos[col]["prod"]["endpoint"] dst_tile, dst_bundle, dst_channel = A_l3l2_fifos[col]["cons"]["endpoint"] - flow(src_tile, src_bundle, src_channel, - dst_tile, dst_bundle, dst_channel) + flow(src_tile, src_bundle, src_channel, dst_tile, dst_bundle, dst_channel) # Input B, L2 -> L1 for col in range(n_aie_cols): mem_tile = mem_tiles[col] B_l2l1_fifos[col] = { - "prod" : { + "prod": { "endpoint": (mem_tile, WireBundle.DMA, 1), - "ping_buf": buffer(mem_tile, datatype=B_l2_memref_ty, name=f"B_L3L2_{col}_cons_buff_0"), - "pong_buf": buffer(mem_tile, datatype=B_l2_memref_ty, name=f"B_L3L2_{col}_cons_buff_1"), - "put_lock": lock(mem_tile, init=2, sym_name=f"B_L3L2_{col}_cons_prod_lock", lock_id=2), - "get_lock": lock(mem_tile, init=0, sym_name=f"B_L3L2_{col}_cons_cons_lock", lock_id=3) + "ping_buf": buffer( + mem_tile, + datatype=B_l2_memref_ty, + name=f"B_L3L2_{col}_cons_buff_0", + ), + "pong_buf": buffer( + mem_tile, + datatype=B_l2_memref_ty, + name=f"B_L3L2_{col}_cons_buff_1", + ), + "put_lock": lock( + mem_tile, + init=2, + sym_name=f"B_L3L2_{col}_cons_prod_lock", + lock_id=2, + ), + "get_lock": lock( + mem_tile, + init=0, + sym_name=f"B_L3L2_{col}_cons_cons_lock", + lock_id=3, + ), }, - "cons" : [ - { + "cons": [ + { "endpoint": (core_tiles[row][col], WireBundle.DMA, 1), - "ping_buf": buffer(core_tiles[row][col], datatype=B_l1_memref_ty, name=f"B_L2L1_{col}_{row}_cons_buff_0"), - "pong_buf": buffer(core_tiles[row][col], datatype=B_l1_memref_ty, name=f"B_L2L1_{col}_{row}_cons_buff_1"), - "put_lock": lock(core_tiles[row][col], init=2, sym_name=f"B_L2L1_{col}_{row}_cons_prod_lock", lock_id=2), - "get_lock": lock(core_tiles[row][col], init=0, sym_name=f"B_L2L1_{col}_{row}_cons_cons_lock", lock_id=3) + "ping_buf": buffer( + core_tiles[row][col], + datatype=B_l1_memref_ty, + name=f"B_L2L1_{col}_{row}_cons_buff_0", + ), + "pong_buf": buffer( + core_tiles[row][col], + datatype=B_l1_memref_ty, + name=f"B_L2L1_{col}_{row}_cons_buff_1", + ), + "put_lock": lock( + core_tiles[row][col], + init=2, + sym_name=f"B_L2L1_{col}_{row}_cons_prod_lock", + lock_id=2, + ), + "get_lock": lock( + core_tiles[row][col], + init=0, + sym_name=f"B_L2L1_{col}_{row}_cons_cons_lock", + lock_id=3, + ), } for row in range(n_aie_rows) - ] # broadcast along one column + ], # broadcast along one column } for row in range(n_aie_rows): - src_tile, src_bundle, src_channel = B_l2l1_fifos[col]["prod"]["endpoint"] - dst_tile, dst_bundle, dst_channel = B_l2l1_fifos[col]["cons"][row]["endpoint"] - flow(src_tile, src_bundle, src_channel, - dst_tile, dst_bundle, dst_channel) - + src_tile, src_bundle, src_channel = B_l2l1_fifos[col]["prod"][ + "endpoint" + ] + dst_tile, dst_bundle, dst_channel = B_l2l1_fifos[col]["cons"][row][ + "endpoint" + ] + flow( + src_tile, src_bundle, src_channel, dst_tile, dst_bundle, dst_channel + ) + # Input B, L3 -> L2 for col in range(n_aie_cols): mem_tile = mem_tiles[col] shim_tile = shim_tiles[col] B_l3l2_fifos[col] = { - "prod" : { + "prod": { "endpoint": (shim_tile, WireBundle.DMA, 1), - "shim_memref_dialect": memref_dialect.global_(sym_name=f"B_L3L2_{col}", sym_visibility="public", type_=B_l3_memref_ty), - "shim_dma_alloc": ShimDMAAllocationOp(f"B_L3L2_{col}", DMAChannelDir.MM2S, 1, col=col) + "shim_memref_dialect": memref_dialect.global_( + sym_name=f"B_L3L2_{col}", + sym_visibility="public", + type_=B_l3_memref_ty, + ), + "shim_dma_alloc": ShimDMAAllocationOp( + f"B_L3L2_{col}", DMAChannelDir.MM2S, 1, col=col + ), }, - "cons" : { - 
"endpoint": (mem_tile, WireBundle.DMA, 1) - } + "cons": {"endpoint": (mem_tile, WireBundle.DMA, 1)}, } src_tile, src_bundle, src_channel = B_l3l2_fifos[col]["prod"]["endpoint"] dst_tile, dst_bundle, dst_channel = B_l3l2_fifos[col]["cons"]["endpoint"] - flow(src_tile, src_bundle, src_channel, - dst_tile, dst_bundle, dst_channel) + flow(src_tile, src_bundle, src_channel, dst_tile, dst_bundle, dst_channel) # Output C, L1 -> L2 for col in range(n_aie_cols): for row in range(n_aie_rows): C_l1l2_fifos[row][col] = { - "prod" : { + "prod": { "endpoint": (core_tiles[row][col], WireBundle.DMA, 0), - "ping_buf": buffer(core_tiles[row][col], datatype=C_l1_memref_ty, name=f"C_L1L2_{col}_{row}_buff_0"), - "pong_buf": buffer(core_tiles[row][col], datatype=C_l1_memref_ty, name=f"C_L1L2_{col}_{row}_buff_1"), - "put_lock": lock(core_tiles[row][col], init=2, sym_name=f"C_L1L2_{col}_{row}_prod_lock", lock_id=4), - "get_lock": lock(core_tiles[row][col], init=0, sym_name=f"C_L1L2_{col}_{row}_cons_lock", lock_id=5) + "ping_buf": buffer( + core_tiles[row][col], + datatype=C_l1_memref_ty, + name=f"C_L1L2_{col}_{row}_buff_0", + ), + "pong_buf": buffer( + core_tiles[row][col], + datatype=C_l1_memref_ty, + name=f"C_L1L2_{col}_{row}_buff_1", + ), + "put_lock": lock( + core_tiles[row][col], + init=2, + sym_name=f"C_L1L2_{col}_{row}_prod_lock", + lock_id=4, + ), + "get_lock": lock( + core_tiles[row][col], + init=0, + sym_name=f"C_L1L2_{col}_{row}_cons_lock", + lock_id=5, + ), + }, + "cons": { + "endpoint": ( + mem_tiles[col], + WireBundle.DMA, + row + + 2, # S2MM channels 0, 1 on memtile are used for A, B coming in from shim + ), }, - "cons" : { - "endpoint": (mem_tiles[col], WireBundle.DMA, - row + 2 # S2MM channels 0, 1 on memtile are used for A, B coming in from shim - ), - } } - src_tile, src_bundle, src_channel = C_l1l2_fifos[row][col]["prod"]["endpoint"] - dst_tile, dst_bundle, dst_channel = C_l1l2_fifos[row][col]["cons"]["endpoint"] - flow(src_tile, src_bundle, src_channel, - dst_tile, dst_bundle, dst_channel) + src_tile, src_bundle, src_channel = C_l1l2_fifos[row][col]["prod"][ + "endpoint" + ] + dst_tile, dst_bundle, dst_channel = C_l1l2_fifos[row][col]["cons"][ + "endpoint" + ] + flow( + src_tile, src_bundle, src_channel, dst_tile, dst_bundle, dst_channel + ) # Output C, L2 -> L3 for col in range(n_aie_cols): C_l2l3_fifos[col] = { - "prod" : { + "prod": { "endpoint": (mem_tiles[col], WireBundle.DMA, 2), - "ping_buf": buffer(mem_tiles[col], datatype=C_l2_memref_ty, name=f"C_L2L3_{col}_buff_0"), - "pong_buf": buffer(mem_tiles[col], datatype=C_l2_memref_ty, name=f"C_L2L3_{col}_buff_1"), - "put_lock": lock(mem_tiles[col], init=4*2, sym_name=f"C_L2L3_{col}_prod_lock", lock_id=4), - "get_lock": lock(mem_tiles[col], init=0, sym_name=f"C_L2L3_{col}_cons_lock", lock_id=5) + "ping_buf": buffer( + mem_tiles[col], + datatype=C_l2_memref_ty, + name=f"C_L2L3_{col}_buff_0", + ), + "pong_buf": buffer( + mem_tiles[col], + datatype=C_l2_memref_ty, + name=f"C_L2L3_{col}_buff_1", + ), + "put_lock": lock( + mem_tiles[col], + init=4 * 2, + sym_name=f"C_L2L3_{col}_prod_lock", + lock_id=4, + ), + "get_lock": lock( + mem_tiles[col], + init=0, + sym_name=f"C_L2L3_{col}_cons_lock", + lock_id=5, + ), }, - "cons" : { + "cons": { "endpoint": (shim_tiles[col], WireBundle.DMA, 0), - "shim_memref": memref_dialect.global_(sym_name=f"C_L2L3_{col}", sym_visibility="public", type_=C_l3_memref_ty), - "shim_dma_alloc": ShimDMAAllocationOp(f"C_L2L3_{col}", DMAChannelDir.S2MM, 0, col=col) - } + "shim_memref": memref_dialect.global_( + 
sym_name=f"C_L2L3_{col}", + sym_visibility="public", + type_=C_l3_memref_ty, + ), + "shim_dma_alloc": ShimDMAAllocationOp( + f"C_L2L3_{col}", DMAChannelDir.S2MM, 0, col=col + ), + }, } src_tile, src_bundle, src_channel = C_l2l3_fifos[col]["prod"]["endpoint"] dst_tile, dst_bundle, dst_channel = C_l2l3_fifos[col]["cons"]["endpoint"] - flow(src_tile, src_bundle, src_channel, - dst_tile, dst_bundle, dst_channel) + flow(src_tile, src_bundle, src_channel, dst_tile, dst_bundle, dst_channel) # Set up the data movement # Mem tiles for col in range(n_aie_cols): + @memtile_dma(mem_tiles[col]) def memtile_body(block): @@ -350,11 +490,24 @@ def memtile_body(block): A_l2l1_fifo = A_l2l1_fifos[col]["prod"] _, _, a_in_channel = A_l3l2_fifo["endpoint"] _ = block["a_in_ping"], block["a_in_pong"] - dma_start(DMAChannelDir.S2MM, a_in_channel, dest=block["a_in_ping"], chain=block["a_out"]) + dma_start( + DMAChannelDir.S2MM, + a_in_channel, + dest=block["a_in_ping"], + chain=block["a_out"], + ) for pp in ["ping", "pong"]: with block[f"a_in_{pp}"]: - use_lock(A_l2l1_fifo["put_lock"], LockAction.AcquireGreaterEqual, value=1) - dma_bd(A_l2l1_fifo[f"{pp}_buf"], offset=0, len=get_memref_len_elems(A_l2_memref_ty)) + use_lock( + A_l2l1_fifo["put_lock"], + LockAction.AcquireGreaterEqual, + value=1, + ) + dma_bd( + A_l2l1_fifo[f"{pp}_buf"], + offset=0, + len=get_memref_len_elems(A_l2_memref_ty), + ) use_lock(A_l2l1_fifo["get_lock"], LockAction.Release, value=1) next_bd(block[f"a_in_{'pong' if pp == 'ping' else 'ping'}"]) @@ -363,20 +516,39 @@ def memtile_body(block): A_l2l1_fifo = A_l2l1_fifos[col]["prod"] _, _, a_out_channel = A_l2l1_fifo["endpoint"] _ = block["a_out_ping"], block["a_out_pong"] - dma_start(DMAChannelDir.MM2S, a_out_channel, dest=block["a_out_ping"], chain=block["b_in"]) + dma_start( + DMAChannelDir.MM2S, + a_out_channel, + dest=block["a_out_ping"], + chain=block["b_in"], + ) for pp in ["ping", "pong"]: with block[f"a_out_{pp}"]: - use_lock(A_l2l1_fifo["get_lock"], LockAction.AcquireGreaterEqual, value=1) - assert get_memref_len_elems(A_l1_memref_ty) == get_memref_len_elems(A_l2_memref_ty) - dma_bd(A_l2l1_fifo[f"{pp}_buf"], offset=0, len=get_memref_len_elems(A_l1_memref_ty), - dimensions=[ - (m // r, r * k), - (k // s, s), - (r, k), - (s, 1), - ]) - use_lock(A_l2l1_fifo["put_lock"], LockAction.Release, value=1) - next_bd(block[f"a_out_{'pong' if pp == 'ping' else 'ping'}"]) + use_lock( + A_l2l1_fifo["get_lock"], + LockAction.AcquireGreaterEqual, + value=1, + ) + assert get_memref_len_elems( + A_l1_memref_ty + ) == get_memref_len_elems(A_l2_memref_ty) + dma_bd( + A_l2l1_fifo[f"{pp}_buf"], + offset=0, + len=get_memref_len_elems(A_l1_memref_ty), + dimensions=[ + (m // r, r * k), + (k // s, s), + (r, k), + (s, 1), + ], + ) + use_lock( + A_l2l1_fifo["put_lock"], LockAction.Release, value=1 + ) + next_bd( + block[f"a_out_{'pong' if pp == 'ping' else 'ping'}"] + ) # B input with block["b_in"]: @@ -384,34 +556,68 @@ def memtile_body(block): B_l2l1_fifo = B_l2l1_fifos[col]["prod"] _, _, b_in_channel = B_l3l2_fifo["endpoint"] _ = block["b_in_ping"], block["b_in_pong"] - dma_start(DMAChannelDir.S2MM, b_in_channel, dest=block["b_in_ping"], chain=block["b_out"]) + dma_start( + DMAChannelDir.S2MM, + b_in_channel, + dest=block["b_in_ping"], + chain=block["b_out"], + ) for pp in ["ping", "pong"]: with block[f"b_in_{pp}"]: - use_lock(B_l2l1_fifo["put_lock"], LockAction.AcquireGreaterEqual, value=1) - dma_bd(B_l2l1_fifo[f"{pp}_buf"], offset=0, len=get_memref_len_elems(B_l2_memref_ty)) - use_lock(B_l2l1_fifo["get_lock"], 
LockAction.Release, value=1) + use_lock( + B_l2l1_fifo["put_lock"], + LockAction.AcquireGreaterEqual, + value=1, + ) + dma_bd( + B_l2l1_fifo[f"{pp}_buf"], + offset=0, + len=get_memref_len_elems(B_l2_memref_ty), + ) + use_lock( + B_l2l1_fifo["get_lock"], LockAction.Release, value=1 + ) next_bd(block[f"b_in_{'pong' if pp == 'ping' else 'ping'}"]) - + # B output with block["b_out"]: B_l2l1_fifo = B_l2l1_fifos[col]["prod"] _, _, b_out_channel = B_l2l1_fifo["endpoint"] _ = block["b_out_ping"], block["b_out_pong"] - dma_start(DMAChannelDir.MM2S, b_out_channel, dest=block["b_out_ping"], chain=block["c_in_0"]) + dma_start( + DMAChannelDir.MM2S, + b_out_channel, + dest=block["b_out_ping"], + chain=block["c_in_0"], + ) for pp in ["ping", "pong"]: with block[f"b_out_{pp}"]: - use_lock(B_l2l1_fifo["get_lock"], LockAction.AcquireGreaterEqual, value=1) - assert get_memref_len_elems(B_l2_memref_ty) == get_memref_len_elems(B_l1_memref_ty) - dma_bd(B_l2l1_fifo[f"{pp}_buf"], offset=0, len=get_memref_len_elems(B_l1_memref_ty), - dimensions=[ - (n // t, t * k), - (k // s, s), - (t, k), - (s, 1), - ]) - use_lock(B_l2l1_fifo["put_lock"], LockAction.Release, value=1) - next_bd(block[f"b_out_{'pong' if pp == 'ping' else 'ping'}"]) - + use_lock( + B_l2l1_fifo["get_lock"], + LockAction.AcquireGreaterEqual, + value=1, + ) + assert get_memref_len_elems( + B_l2_memref_ty + ) == get_memref_len_elems(B_l1_memref_ty) + dma_bd( + B_l2l1_fifo[f"{pp}_buf"], + offset=0, + len=get_memref_len_elems(B_l1_memref_ty), + dimensions=[ + (n // t, t * k), + (k // s, s), + (t, k), + (s, 1), + ], + ) + use_lock( + B_l2l1_fifo["put_lock"], LockAction.Release, value=1 + ) + next_bd( + block[f"b_out_{'pong' if pp == 'ping' else 'ping'}"] + ) + # C input for row in range(n_aie_rows): C_l2l3_fifo = C_l2l3_fifos[col]["prod"] @@ -419,35 +625,72 @@ def memtile_body(block): C_l1l2_fifo = C_l1l2_fifos[row][col]["cons"] _, _, c_in_channel = C_l1l2_fifo["endpoint"] _ = block[f"c_in_{row}_ping"], block[f"c_in_{row}_pong"] - dma_start(DMAChannelDir.S2MM, c_in_channel, dest=block[f"c_in_{row}_ping"], - chain=block[f"c_in_{row+1}" if row+1 < n_aie_rows else "c_out"]) + dma_start( + DMAChannelDir.S2MM, + c_in_channel, + dest=block[f"c_in_{row}_ping"], + chain=block[ + f"c_in_{row+1}" if row + 1 < n_aie_rows else "c_out" + ], + ) for pp in ["ping", "pong"]: with block[f"c_in_{row}_{pp}"]: - use_lock(C_l2l3_fifo["put_lock"], LockAction.AcquireGreaterEqual, value=1) - dma_bd(C_l2l3_fifo[f"{pp}_buf"], + use_lock( + C_l2l3_fifo["put_lock"], + LockAction.AcquireGreaterEqual, + value=1, + ) + dma_bd( + C_l2l3_fifo[f"{pp}_buf"], offset=row * get_memref_len_elems(C_l1_memref_ty), - len=get_memref_len_elems(C_l1_memref_ty)) - use_lock(C_l2l3_fifo["get_lock"], LockAction.Release, value=1) - next_bd(block[f"c_in_{row}_{'pong' if pp == 'ping' else 'ping'}"]) + len=get_memref_len_elems(C_l1_memref_ty), + ) + use_lock( + C_l2l3_fifo["get_lock"], LockAction.Release, value=1 + ) + next_bd( + block[ + f"c_in_{row}_{'pong' if pp == 'ping' else 'ping'}" + ] + ) # C output with block["c_out"]: _, _, c_out_channel = C_l2l3_fifo["endpoint"] _ = block["c_out_ping"], block["c_out_pong"] - dma_start(DMAChannelDir.MM2S, c_out_channel, dest=block["c_out_ping"], chain=block["end"]) + dma_start( + DMAChannelDir.MM2S, + c_out_channel, + dest=block["c_out_ping"], + chain=block["end"], + ) for pp in ["ping", "pong"]: with block[f"c_out_{pp}"]: - use_lock(C_l2l3_fifo["get_lock"], LockAction.AcquireGreaterEqual, value=4) - assert get_memref_len_elems(C_l2_memref_ty) == 
4*get_memref_len_elems(C_l1_memref_ty) - dma_bd(C_l2l3_fifo[f"{pp}_buf"], offset=0, len=get_memref_len_elems(C_l2_memref_ty), - dimensions=[ - (m // r, r * n), - (r, t), - (n // t, r * t), - (t, 1), - ]) - use_lock(C_l2l3_fifo["put_lock"], LockAction.Release, value=4) - next_bd(block[f"c_out_{'pong' if pp == 'ping' else 'ping'}"]) + use_lock( + C_l2l3_fifo["get_lock"], + LockAction.AcquireGreaterEqual, + value=4, + ) + assert get_memref_len_elems( + C_l2_memref_ty + ) == 4 * get_memref_len_elems(C_l1_memref_ty) + dma_bd( + C_l2l3_fifo[f"{pp}_buf"], + offset=0, + len=get_memref_len_elems(C_l2_memref_ty), + dimensions=[ + (m // r, r * n), + (r, t), + (n // t, r * t), + (t, 1), + ], + ) + use_lock( + C_l2l3_fifo["put_lock"], LockAction.Release, value=4 + ) + next_bd( + block[f"c_out_{'pong' if pp == 'ping' else 'ping'}"] + ) with block["end"]: EndOp() @@ -455,6 +698,7 @@ def memtile_body(block): # core DMAs for row in range(n_aie_rows): for col in range(n_aie_cols): + @mem(core_tiles[row][col]) def core_mem_body(block): @@ -462,12 +706,27 @@ def core_mem_body(block): A_l2l1_fifo = A_l2l1_fifos[row]["cons"][col] _, _, a_in_channel = A_l2l1_fifo["endpoint"] _ = block["a_in_ping"], block["a_in_pong"] - dma_start(DMAChannelDir.S2MM, a_in_channel, dest=block["a_in_ping"], chain=block["b_in"]) + dma_start( + DMAChannelDir.S2MM, + a_in_channel, + dest=block["a_in_ping"], + chain=block["b_in"], + ) for pp in ["ping", "pong"]: with block[f"a_in_{pp}"]: - use_lock(A_l2l1_fifo["put_lock"], LockAction.AcquireGreaterEqual, value=1) - dma_bd(A_l2l1_fifo[f"{pp}_buf"], offset=0, len=get_memref_len_elems(A_l1_memref_ty)) - use_lock(A_l2l1_fifo["get_lock"], LockAction.Release, value=1) + use_lock( + A_l2l1_fifo["put_lock"], + LockAction.AcquireGreaterEqual, + value=1, + ) + dma_bd( + A_l2l1_fifo[f"{pp}_buf"], + offset=0, + len=get_memref_len_elems(A_l1_memref_ty), + ) + use_lock( + A_l2l1_fifo["get_lock"], LockAction.Release, value=1 + ) next_bd(block[f"a_in_{'pong' if pp == 'ping' else 'ping'}"]) # B input @@ -475,32 +734,64 @@ def core_mem_body(block): B_l2l1_fifo = B_l2l1_fifos[col]["cons"][row] _, _, b_in_channel = B_l2l1_fifo["endpoint"] _ = block["b_in_ping"], block["b_in_pong"] - dma_start(DMAChannelDir.S2MM, b_in_channel, dest=block["b_in_ping"], chain=block["c_out"]) + dma_start( + DMAChannelDir.S2MM, + b_in_channel, + dest=block["b_in_ping"], + chain=block["c_out"], + ) for pp in ["ping", "pong"]: with block[f"b_in_{pp}"]: - use_lock(B_l2l1_fifo["put_lock"], LockAction.AcquireGreaterEqual, value=1) - dma_bd(B_l2l1_fifo[f"{pp}_buf"], offset=0, len=get_memref_len_elems(B_l1_memref_ty)) - use_lock(B_l2l1_fifo["get_lock"], LockAction.Release, value=1) - next_bd(block[f"b_in_{'pong' if pp == 'ping' else 'ping'}"]) - + use_lock( + B_l2l1_fifo["put_lock"], + LockAction.AcquireGreaterEqual, + value=1, + ) + dma_bd( + B_l2l1_fifo[f"{pp}_buf"], + offset=0, + len=get_memref_len_elems(B_l1_memref_ty), + ) + use_lock( + B_l2l1_fifo["get_lock"], LockAction.Release, value=1 + ) + next_bd( + block[f"b_in_{'pong' if pp == 'ping' else 'ping'}"] + ) + # C output with block["c_out"]: C_l1l2_fifo = C_l1l2_fifos[row][col]["prod"] _, _, c_out_channel = C_l1l2_fifo["endpoint"] _ = block["c_out_ping"], block["c_out_pong"] - dma_start(DMAChannelDir.MM2S, c_out_channel, dest=block["c_out_ping"], chain=block["end"]) + dma_start( + DMAChannelDir.MM2S, + c_out_channel, + dest=block["c_out_ping"], + chain=block["end"], + ) for pp in ["ping", "pong"]: with block[f"c_out_{pp}"]: - use_lock(C_l1l2_fifo["get_lock"], 
LockAction.AcquireGreaterEqual, value=1) - dma_bd(C_l1l2_fifo[f"{pp}_buf"], offset=0, len=get_memref_len_elems(C_l1_memref_ty)) - use_lock(C_l1l2_fifo["put_lock"], LockAction.Release, value=1) - next_bd(block[f"c_out_{'pong' if pp == 'ping' else 'ping'}"]) + use_lock( + C_l1l2_fifo["get_lock"], + LockAction.AcquireGreaterEqual, + value=1, + ) + dma_bd( + C_l1l2_fifo[f"{pp}_buf"], + offset=0, + len=get_memref_len_elems(C_l1_memref_ty), + ) + use_lock( + C_l1l2_fifo["put_lock"], LockAction.Release, value=1 + ) + next_bd( + block[f"c_out_{'pong' if pp == 'ping' else 'ping'}"] + ) with block["end"]: EndOp() - - # Set up compute tiles for row in range(n_aie_rows): for col in range(n_aie_cols): @@ -515,49 +806,76 @@ def core_body(): c_1 = index_dialect.constant(1) c_2 = index_dialect.constant(2) c_maxint = index_dialect.constant(0xFFFFFFFF) - - run_loop = ForOp(lower_bound=c_0, - upper_bound=c_maxint, - step=c_1, - iter_args=[c_0]) + + run_loop = ForOp( + lower_bound=c_0, upper_bound=c_maxint, step=c_1, iter_args=[c_0] + ) with InsertionPoint(run_loop.body): c_pp_outer = run_loop.inner_iter_args[0] # Wait for "ready" signal through RTP and read RTP. call(await_rtp, [rtp_bufs[row][col]]) - rtp_K_div_k_div_2_i32 = call(get_volatile_rtp, [rtp_bufs[row][col], 1]) - rtp_K_div_k_div_2 = index_dialect.castu(T.index(), rtp_K_div_k_div_2_i32) - rtp_n_tiles_per_core_i32 = call(get_volatile_rtp, [rtp_bufs[row][col], 2]) - rtp_n_tiles_per_core = index_dialect.castu(T.index(), rtp_n_tiles_per_core_i32) + rtp_K_div_k_div_2_i32 = call( + get_volatile_rtp, [rtp_bufs[row][col], 1] + ) + rtp_K_div_k_div_2 = index_dialect.castu( + T.index(), rtp_K_div_k_div_2_i32 + ) + rtp_n_tiles_per_core_i32 = call( + get_volatile_rtp, [rtp_bufs[row][col], 2] + ) + rtp_n_tiles_per_core = index_dialect.castu( + T.index(), rtp_n_tiles_per_core_i32 + ) tile_loop = for_(rtp_n_tiles_per_core, iter_args=[T.index()]) - tile_loop = ForOp(lower_bound=c_0, - upper_bound=rtp_n_tiles_per_core, - step=c_1, - iter_args=[c_pp_outer]) + tile_loop = ForOp( + lower_bound=c_0, + upper_bound=rtp_n_tiles_per_core, + step=c_1, + iter_args=[c_pp_outer], + ) with InsertionPoint(tile_loop.body): - c_pp_inner = tile_loop.inner_iter_args[0] # this variable flips between 0 and 1 each iteration - c_pp_cond = index_dialect.cmp('eq', c_pp_inner, c_0) + c_pp_inner = tile_loop.inner_iter_args[ + 0 + ] # this variable flips between 0 and 1 each iteration + c_pp_cond = index_dialect.cmp("eq", c_pp_inner, c_0) ifop = IfOp(c_pp_cond, [C_l1_memref_ty], hasElse=True) - #ifop.thenRegion.blocks.append() + # ifop.thenRegion.blocks.append() with InsertionPoint(ifop.thenRegion.blocks[0]): yield_([C_fifo["ping_buf"]]) - #ifop.elseRegion.blocks.append() + # ifop.elseRegion.blocks.append() with InsertionPoint(ifop.elseRegion.blocks[0]): yield_([C_fifo["pong_buf"]]) - use_lock(C_fifo["put_lock"], LockAction.AcquireGreaterEqual, value=1) + use_lock( + C_fifo["put_lock"], + LockAction.AcquireGreaterEqual, + value=1, + ) elem_out = ifop.results_[0] call(zero, [elem_out]) for j in for_(rtp_K_div_k_div_2): for ab_pp in ["ping", "pong"]: - use_lock(A_fifo["get_lock"], LockAction.AcquireGreaterEqual, value=1) - use_lock(B_fifo["get_lock"], LockAction.AcquireGreaterEqual, value=1) + use_lock( + A_fifo["get_lock"], + LockAction.AcquireGreaterEqual, + value=1, + ) + use_lock( + B_fifo["get_lock"], + LockAction.AcquireGreaterEqual, + value=1, + ) elem_in_a = A_fifo[f"{ab_pp}_buf"] elem_in_b = B_fifo[f"{ab_pp}_buf"] call(matmul, [elem_in_a, elem_in_b, elem_out]) - 
use_lock(A_fifo["put_lock"], LockAction.Release, value=1) - use_lock(B_fifo["put_lock"], LockAction.Release, value=1) + use_lock( + A_fifo["put_lock"], LockAction.Release, value=1 + ) + use_lock( + B_fifo["put_lock"], LockAction.Release, value=1 + ) yield_([]) use_lock(C_fifo["get_lock"], LockAction.Release, value=1) @@ -568,16 +886,12 @@ def core_body(): yield_([tile_loop.results_[0]]) # To/from AIE-array data movement - @runtime_sequence( - A_l3_memref_ty, - B_l3_memref_ty, - C_l3_memref_ty - ) + @runtime_sequence(A_l3_memref_ty, B_l3_memref_ty, C_l3_memref_ty) def sequence(A, B, C): # Write number of inner loop iterations for cores to use as run-time parameter. # This allows for processing different problem sizes by only swapping the insts.txt. - assert (K//k)%2 == 0 - rtp_K_div_k_div_2 = K//k//2 + assert (K // k) % 2 == 0 + rtp_K_div_k_div_2 = K // k // 2 for row in range(n_aie_rows): for col in range(n_aie_cols): sym_ref = FlatSymbolRefAttr.get(rtp_bufs[row][col].get_name()[1:]) @@ -626,7 +940,9 @@ def sequence(A, B, C): C_col_offset = col * n C_offset = C_col_offset + C_row_offset npu_dma_memcpy_nd( - metadata=C_l2l3_fifos[col]["cons"]["shim_dma_alloc"].sym_name.value, + metadata=C_l2l3_fifos[col]["cons"][ + "shim_dma_alloc" + ].sym_name.value, bd_id=bd_id_base, mem=C, offsets=[0, 0, 0, C_offset], @@ -662,7 +978,9 @@ def sequence(A, B, C): ) # base address for the shim in this column A_offset = A_block_offset + A_row_offset npu_dma_memcpy_nd( - metadata=A_l3l2_fifos[col]["prod"]["shim_dma_alloc"].sym_name.value, + metadata=A_l3l2_fifos[col]["prod"][ + "shim_dma_alloc" + ].sym_name.value, bd_id=bd_id_base + 2 * tile_row + 1, mem=A, offsets=[0, 0, 0, A_offset], @@ -695,12 +1013,14 @@ def sequence(A, B, C): # ---------------- B_col_offset = col * n * K npu_dma_memcpy_nd( - metadata=B_l3l2_fifos[col]["prod"]["shim_dma_alloc"].sym_name.value, + metadata=B_l3l2_fifos[col]["prod"][ + "shim_dma_alloc" + ].sym_name.value, bd_id=bd_id_base + 2 * tile_row + 2, mem=B, offsets=[0, 0, 0, B_col_offset], - sizes=[N // n // n_aie_cols, K // k, n, k], - strides=[n * n_aie_cols * K, k, K, 1], + sizes=[N // n // n_aie_cols, K // k, n, k], + strides=[n * n_aie_cols * K, k, K, 1], ) if tb > 0 or (tb == 0 and pingpong > 0): for col in range(n_aie_cols): diff --git a/programming_examples/basic/matrix_multiplication/rtp/await_rtp.cc b/programming_examples/basic/matrix_multiplication/rtp/await_rtp.cc index cce1d5a005..e83d8de551 100644 --- a/programming_examples/basic/matrix_multiplication/rtp/await_rtp.cc +++ b/programming_examples/basic/matrix_multiplication/rtp/await_rtp.cc @@ -13,26 +13,24 @@ extern "C" { /* Polls a run-time parameter to be set to a value other than -1. - + Dedicate one RTP as the "ready" signal. Once you have set all other RTPs, set this RTP to 1, and this function will unblock. In the core, you can then read the other RTPs *after* this function unblocks. Note: There is a small race condition here if the host sets the "ready" RTP *before* the core calls this function. This is unlikely to happen if - this function is the first thing called in the core, as the core - executes much faster than the host controller can set values in core + this function is the first thing called in the core, as the core + executes much faster than the host controller can set values in core memory. 
*/ void await_rtp(volatile int *rtp) { rtp[0] = -1; - while(rtp[0] == -1); -} - -int get_volatile_rtp(volatile int *rtp, int index) { - return rtp[index]; + while (rtp[0] == -1) + ; } +int get_volatile_rtp(volatile int *rtp, int index) { return rtp[index]; } } #endif \ No newline at end of file diff --git a/programming_examples/basic/matrix_multiplication/rtp/kernel.cc b/programming_examples/basic/matrix_multiplication/rtp/kernel.cc index 905556eece..1ed81c725b 100644 --- a/programming_examples/basic/matrix_multiplication/rtp/kernel.cc +++ b/programming_examples/basic/matrix_multiplication/rtp/kernel.cc @@ -22,8 +22,8 @@ #include -#include "zero.cc" #include "await_rtp.cc" +#include "zero.cc" template void matmul_scalar(T_in *a, T_in *b, T_out *c) { @@ -75,9 +75,11 @@ void matmul_vectorized(const T_in *__restrict pA, const T_in *__restrict pB, pA1 += MMUL::size_A; aie::vector A1 = aie::load_v(pA2); pA2 += MMUL::size_A; - aie::vector B0 = aie::transpose(aie::load_v(pB1), t, s); + aie::vector B0 = + aie::transpose(aie::load_v(pB1), t, s); pB1 += MMUL::size_B; - aie::vector B1 = aie::transpose(aie::load_v(pB2), t, s); + aie::vector B1 = + aie::transpose(aie::load_v(pB2), t, s); pB2 += MMUL::size_B; // We modify the library documentation implementation to accumulate @@ -155,8 +157,7 @@ extern "C" { // These dimensions must be divisible by the r, s, t dimensions used in // the kernels. -#define combos(X) \ - X(bfloat16, bf16, float, f32, 4, 8, 4) +#define combos(X) X(bfloat16, bf16, float, f32, 4, 8, 4) #define matmul_vectorized_c_func(ctype_in, mlir_type_in, ctype_out, \ mlir_type_out, r, s, t) \ diff --git a/programming_examples/basic/matrix_multiplication/rtp/test.cpp b/programming_examples/basic/matrix_multiplication/rtp/test.cpp index c54b19e4b6..c6f42f5d06 100644 --- a/programming_examples/basic/matrix_multiplication/rtp/test.cpp +++ b/programming_examples/basic/matrix_multiplication/rtp/test.cpp @@ -1,21 +1,20 @@ -#include -#include -#include -#include #include +#include +#include +#include #include -#include -#include -#include #include +#include +#include +#include +#include +#include #include -#include -#include +#include #include #include #include - #define VALIDATE 1 #define AIEML_MAX_OFFLOAD_M 3072 @@ -23,10 +22,8 @@ #define AIEML_MAX_OFFLOAD_N 3072 #define AIE_MAX_INSTR_LEN 4096 - #include "../common.h" - // -------------------------------------------------------------------------- // AIE initialization stuff // The following structures and functions are used to initialize multiple @@ -37,40 +34,40 @@ #define AIE_MAX_N_BOS 4 union aie_bo_map { - char *i8; - int *i32; - std::bfloat16_t *bf16; - float *f32; + char *i8; + int *i32; + std::bfloat16_t *bf16; + float *f32; }; enum aie_bo_dir { IN_ONLY, OUT_ONLY, IN_OUT }; struct aie_bo { - xrt::bo *bo; - int group_id; - enum aie_bo_dir dir; - size_t len; // in bytes - union aie_bo_map buf; + xrt::bo *bo; + int group_id; + enum aie_bo_dir dir; + size_t len; // in bytes + union aie_bo_map buf; }; struct aie_global_state { - xrt::device *device; + xrt::device *device; }; struct aie_state { - std::string xclbin_path; - std::string kernel_name; - xrt::xclbin *xclbin; - xrt::hw_context *context; - xrt::kernel *kernel; - size_t n_bos; - struct aie_bo bos[AIE_MAX_N_BOS]; - std::vector *last_loaded_insts; // don't reload if they're the same - int instr_len; + std::string xclbin_path; + std::string kernel_name; + xrt::xclbin *xclbin; + xrt::hw_context *context; + xrt::kernel *kernel; + size_t n_bos; + struct aie_bo bos[AIE_MAX_N_BOS]; + 
std::vector *last_loaded_insts; // don't reload if they're the same + int instr_len; }; struct aie_offload_gemm_info { - std::vector *insts; + std::vector *insts; }; struct aie_global_state aie_global; @@ -105,295 +102,341 @@ std::vector aie_load_instr_sequence(std::string instr_path) { } void aie_init_global() { - // Set up device - unsigned int device_index = 0; - aie_global.device = new xrt::device(device_index); + // Set up device + unsigned int device_index = 0; + aie_global.device = new xrt::device(device_index); } void aie_init_design(struct aie_state *aie_state) { - // Load xclbin - constexpr int verbosity = 1; - if (verbosity >= 1) { std::cout << "Loading xclbin: " << aie_state->xclbin_path << "\n"; } - aie_state->xclbin = new xrt::xclbin(aie_state->xclbin_path); - if (verbosity >= 1) { std::cout << "Kernel opcode: " << aie_state->kernel_name << "\n"; } - auto xkernels = aie_state->xclbin->get_kernels(); - auto xkernel = *std::find_if(xkernels.begin(), xkernels.end(), - [aie_state](xrt::xclbin::kernel &k) { return k.get_name().rfind(aie_state->kernel_name, 0) == 0; }); - auto kernel_name = xkernel.get_name(); - if (verbosity >= 1) { std::cout << "Registering xclbin: " << aie_state->xclbin_path << "\n"; } - aie_global.device->register_xclbin(*aie_state->xclbin); - if (verbosity >= 1) { std::cout << "Getting hardware context.\n"; } - if (verbosity >= 1) { std::cout << aie_state->xclbin->get_uuid().to_string() << std::endl; } - aie_state->context = new xrt::hw_context(*aie_global.device, aie_state->xclbin->get_uuid()); - if (verbosity >= 1) { std::cout << "Getting handle to kernel:" << kernel_name << "\n"; } - aie_state->kernel = new xrt::kernel(*aie_state->context, kernel_name); - - assert(aie_state->n_bos >= 1 && aie_state->n_bos <= AIE_MAX_N_BOS); // buffer 1 is insts buffer - aie_state->bos[0].len = AIE_MAX_INSTR_LEN * sizeof(int); - aie_state->bos[0].group_id = 1; - aie_state->bos[0].bo = new xrt::bo(*aie_global.device, aie_state->bos[0].len, XCL_BO_FLAGS_CACHEABLE, aie_state->kernel->group_id(aie_state->bos[0].group_id)); - aie_state->bos[0].buf.i32 = aie_state->bos[0].bo->map(); - - for(int i = 1; i < aie_state->n_bos; i++) { - aie_state->bos[i].group_id = i + 2; // 1 is insts, 2 is insts_len, other buffers start at 3 - aie_state->bos[i].bo = new xrt::bo(*aie_global.device, aie_state->bos[i].len, XRT_BO_FLAGS_HOST_ONLY, aie_state->kernel->group_id(aie_state->bos[i].group_id)); - aie_state->bos[i].buf.i8 = aie_state->bos[i].bo->map(); - } + // Load xclbin + constexpr int verbosity = 1; + if (verbosity >= 1) { + std::cout << "Loading xclbin: " << aie_state->xclbin_path << "\n"; + } + aie_state->xclbin = new xrt::xclbin(aie_state->xclbin_path); + if (verbosity >= 1) { + std::cout << "Kernel opcode: " << aie_state->kernel_name << "\n"; + } + auto xkernels = aie_state->xclbin->get_kernels(); + auto xkernel = *std::find_if( + xkernels.begin(), xkernels.end(), [aie_state](xrt::xclbin::kernel &k) { + return k.get_name().rfind(aie_state->kernel_name, 0) == 0; + }); + auto kernel_name = xkernel.get_name(); + if (verbosity >= 1) { + std::cout << "Registering xclbin: " << aie_state->xclbin_path << "\n"; + } + aie_global.device->register_xclbin(*aie_state->xclbin); + if (verbosity >= 1) { + std::cout << "Getting hardware context.\n"; + } + if (verbosity >= 1) { + std::cout << aie_state->xclbin->get_uuid().to_string() << std::endl; + } + aie_state->context = + new xrt::hw_context(*aie_global.device, aie_state->xclbin->get_uuid()); + if (verbosity >= 1) { + std::cout << "Getting handle to 
kernel:" << kernel_name << "\n"; + } + aie_state->kernel = new xrt::kernel(*aie_state->context, kernel_name); + + assert(aie_state->n_bos >= 1 && + aie_state->n_bos <= AIE_MAX_N_BOS); // buffer 1 is insts buffer + aie_state->bos[0].len = AIE_MAX_INSTR_LEN * sizeof(int); + aie_state->bos[0].group_id = 1; + aie_state->bos[0].bo = new xrt::bo( + *aie_global.device, aie_state->bos[0].len, XCL_BO_FLAGS_CACHEABLE, + aie_state->kernel->group_id(aie_state->bos[0].group_id)); + aie_state->bos[0].buf.i32 = aie_state->bos[0].bo->map(); + + for (int i = 1; i < aie_state->n_bos; i++) { + aie_state->bos[i].group_id = + i + 2; // 1 is insts, 2 is insts_len, other buffers start at 3 + aie_state->bos[i].bo = new xrt::bo( + *aie_global.device, aie_state->bos[i].len, XRT_BO_FLAGS_HOST_ONLY, + aie_state->kernel->group_id(aie_state->bos[i].group_id)); + aie_state->bos[i].buf.i8 = aie_state->bos[i].bo->map(); + } } std::vector load_insts(const char *insts_txt_path) { - // Load instructions - constexpr int verbosity = 0; - std::vector instr_v = aie_load_instr_sequence(insts_txt_path); - if (verbosity >= 1) { std::cout << "Sequence instr count: " << instr_v.size() << "\n"; } - assert(instr_v.size() < AIE_MAX_INSTR_LEN); - return std::move(instr_v); + // Load instructions + constexpr int verbosity = 0; + std::vector instr_v = aie_load_instr_sequence(insts_txt_path); + if (verbosity >= 1) { + std::cout << "Sequence instr count: " << instr_v.size() << "\n"; + } + assert(instr_v.size() < AIE_MAX_INSTR_LEN); + return std::move(instr_v); } -void aie_init_insts(struct aie_state *aie_state, std::vector *instr_v) { - if(instr_v == aie_state->last_loaded_insts) { - return; - } - memset(aie_state->bos[0].buf.i8, 0, AIE_MAX_INSTR_LEN * sizeof(int)); - memcpy(aie_state->bos[0].buf.i8, instr_v->data(), instr_v->size() * sizeof(int)); - aie_state->bos[0].bo->sync(XCL_BO_SYNC_BO_TO_DEVICE); - aie_state->instr_len = instr_v->size(); +void aie_init_insts(struct aie_state *aie_state, + std::vector *instr_v) { + if (instr_v == aie_state->last_loaded_insts) { + return; + } + memset(aie_state->bos[0].buf.i8, 0, AIE_MAX_INSTR_LEN * sizeof(int)); + memcpy(aie_state->bos[0].buf.i8, instr_v->data(), + instr_v->size() * sizeof(int)); + aie_state->bos[0].bo->sync(XCL_BO_SYNC_BO_TO_DEVICE); + aie_state->instr_len = instr_v->size(); } void aie_run_design(struct aie_state *aie_state) { - // bos[0] is synced in init function - for(int i = 1; i < aie_state->n_bos; i++) { - if(aie_state->bos[i].dir == OUT_ONLY) { - continue; - } - aie_state->bos[i].bo->sync(XCL_BO_SYNC_BO_TO_DEVICE); + // bos[0] is synced in init function + for (int i = 1; i < aie_state->n_bos; i++) { + if (aie_state->bos[i].dir == OUT_ONLY) { + continue; } - unsigned int opcode = 3; - auto run = (*aie_state->kernel)(opcode, *aie_state->bos[0].bo, aie_state->instr_len, *aie_state->bos[1].bo, *aie_state->bos[2].bo, *aie_state->bos[3].bo); - ert_cmd_state r = run.wait(); - if (r != ERT_CMD_STATE_COMPLETED) { - std::cout << "AIE Error Status: " << r << std::endl; - exit(1); - } - for(int i = 1; i < aie_state->n_bos; i++) { - if(aie_state->bos[i].dir == IN_ONLY) { - continue; - } - aie_state->bos[i].bo->sync(XCL_BO_SYNC_BO_FROM_DEVICE); + aie_state->bos[i].bo->sync(XCL_BO_SYNC_BO_TO_DEVICE); + } + unsigned int opcode = 3; + auto run = (*aie_state->kernel)(opcode, *aie_state->bos[0].bo, + aie_state->instr_len, *aie_state->bos[1].bo, + *aie_state->bos[2].bo, *aie_state->bos[3].bo); + ert_cmd_state r = run.wait(); + if (r != ERT_CMD_STATE_COMPLETED) { + std::cout << "AIE Error Status: " 
<< r << std::endl; + exit(1); + } + for (int i = 1; i < aie_state->n_bos; i++) { + if (aie_state->bos[i].dir == IN_ONLY) { + continue; } + aie_state->bos[i].bo->sync(XCL_BO_SYNC_BO_FROM_DEVICE); + } } void aie_init() { - aie_init_global(); - - // GEMM design - aie_gemm_256x768x2304_insts = load_insts("build/insts_256x768x2304.txt"); - aie_gemm_256x768x768_insts = load_insts("build/insts_256x768x768.txt"); - aie_gemm_256x768x3072_insts = load_insts("build/insts_256x768x3072.txt"); - aie_gemm_256x3072x768_insts = load_insts("build/insts_256x3072x768.txt"); - aie_gemm_768x256x3072_insts = load_insts("build/insts_768x256x3072.txt"); - aie_gemm_3072x256x768_insts = load_insts("build/insts_3072x256x768.txt"); - aie_gemm_768x256x768_insts = load_insts("build/insts_768x256x768.txt"); - aie_gemm_256x2304x768_insts = load_insts("build/insts_256x2304x768.txt"); - aie_gemm_2304x256x768_insts = load_insts("build/insts_2304x256x768.txt"); - aie_gemm.xclbin_path = "build/final.xclbin"; - aie_gemm.kernel_name = "MLIR_AIE"; - aie_gemm.n_bos = 4; - aie_gemm.bos[1].len = AIEML_MAX_OFFLOAD_M*AIEML_MAX_OFFLOAD_K*sizeof(std::bfloat16_t); - aie_gemm.bos[1].dir = IN_ONLY; - aie_gemm.bos[2].len = AIEML_MAX_OFFLOAD_K*AIEML_MAX_OFFLOAD_N*sizeof(std::bfloat16_t); - aie_gemm.bos[2].dir = IN_ONLY; - aie_gemm.bos[3].len = AIEML_MAX_OFFLOAD_M*AIEML_MAX_OFFLOAD_N*sizeof(float); - aie_gemm.bos[3].dir = IN_OUT; - aie_init_design(&aie_gemm); - - aie_offload[std::make_tuple(256, 768, 2304)] = (struct aie_offload_gemm_info){ &aie_gemm_256x768x2304_insts }; - aie_offload[std::make_tuple(256, 768, 768)] = (struct aie_offload_gemm_info){ &aie_gemm_256x768x768_insts }; - aie_offload[std::make_tuple(256, 768, 3072)] = (struct aie_offload_gemm_info){ &aie_gemm_256x768x3072_insts }; - aie_offload[std::make_tuple(256, 3072, 768)] = (struct aie_offload_gemm_info){ &aie_gemm_256x3072x768_insts }; - aie_offload[std::make_tuple(768, 256, 3072)] = (struct aie_offload_gemm_info){ &aie_gemm_768x256x3072_insts }; - aie_offload[std::make_tuple(3072, 256, 768)] = (struct aie_offload_gemm_info){ &aie_gemm_3072x256x768_insts }; - aie_offload[std::make_tuple(768, 256, 768)] = (struct aie_offload_gemm_info){ &aie_gemm_768x256x768_insts }; - aie_offload[std::make_tuple(256, 2304, 768)] = (struct aie_offload_gemm_info){ &aie_gemm_256x2304x768_insts }; - aie_offload[std::make_tuple(2304, 256, 768)] = (struct aie_offload_gemm_info){ &aie_gemm_2304x256x768_insts }; - + aie_init_global(); + + // GEMM design + aie_gemm_256x768x2304_insts = load_insts("build/insts_256x768x2304.txt"); + aie_gemm_256x768x768_insts = load_insts("build/insts_256x768x768.txt"); + aie_gemm_256x768x3072_insts = load_insts("build/insts_256x768x3072.txt"); + aie_gemm_256x3072x768_insts = load_insts("build/insts_256x3072x768.txt"); + aie_gemm_768x256x3072_insts = load_insts("build/insts_768x256x3072.txt"); + aie_gemm_3072x256x768_insts = load_insts("build/insts_3072x256x768.txt"); + aie_gemm_768x256x768_insts = load_insts("build/insts_768x256x768.txt"); + aie_gemm_256x2304x768_insts = load_insts("build/insts_256x2304x768.txt"); + aie_gemm_2304x256x768_insts = load_insts("build/insts_2304x256x768.txt"); + aie_gemm.xclbin_path = "build/final.xclbin"; + aie_gemm.kernel_name = "MLIR_AIE"; + aie_gemm.n_bos = 4; + aie_gemm.bos[1].len = + AIEML_MAX_OFFLOAD_M * AIEML_MAX_OFFLOAD_K * sizeof(std::bfloat16_t); + aie_gemm.bos[1].dir = IN_ONLY; + aie_gemm.bos[2].len = + AIEML_MAX_OFFLOAD_K * AIEML_MAX_OFFLOAD_N * sizeof(std::bfloat16_t); + aie_gemm.bos[2].dir = IN_ONLY; + aie_gemm.bos[3].len = + 
AIEML_MAX_OFFLOAD_M * AIEML_MAX_OFFLOAD_N * sizeof(float); + aie_gemm.bos[3].dir = IN_OUT; + aie_init_design(&aie_gemm); + + aie_offload[std::make_tuple(256, 768, 2304)] = + (struct aie_offload_gemm_info){&aie_gemm_256x768x2304_insts}; + aie_offload[std::make_tuple(256, 768, 768)] = + (struct aie_offload_gemm_info){&aie_gemm_256x768x768_insts}; + aie_offload[std::make_tuple(256, 768, 3072)] = + (struct aie_offload_gemm_info){&aie_gemm_256x768x3072_insts}; + aie_offload[std::make_tuple(256, 3072, 768)] = + (struct aie_offload_gemm_info){&aie_gemm_256x3072x768_insts}; + aie_offload[std::make_tuple(768, 256, 3072)] = + (struct aie_offload_gemm_info){&aie_gemm_768x256x3072_insts}; + aie_offload[std::make_tuple(3072, 256, 768)] = + (struct aie_offload_gemm_info){&aie_gemm_3072x256x768_insts}; + aie_offload[std::make_tuple(768, 256, 768)] = + (struct aie_offload_gemm_info){&aie_gemm_768x256x768_insts}; + aie_offload[std::make_tuple(256, 2304, 768)] = + (struct aie_offload_gemm_info){&aie_gemm_256x2304x768_insts}; + aie_offload[std::make_tuple(2304, 256, 768)] = + (struct aie_offload_gemm_info){&aie_gemm_2304x256x768_insts}; } - // -------------------------------------------------------------------------- // Main matmul implementation // -------------------------------------------------------------------------- -template -void aie_do_gemm(long M, long K, long N, const float * __restrict inp, const float * __restrict weight, const float * __restrict bias, float * __restrict out) { - auto info = aie_offload.find(std::make_tuple(M, K, N)); - - std::bfloat16_t *aie_buf_a = aie_gemm.bos[1].buf.bf16; - std::bfloat16_t *aie_buf_b = aie_gemm.bos[2].buf.bf16; - float *aie_buf_c = aie_gemm.bos[3].buf.f32; - // Copy over A - if(inp_is_col_major) { - // design expects inptus to be row major - for (long i = 0; i < M; i++) { - for (long j = 0; j < K; j++) { - aie_buf_a[i* K + j] = (std::bfloat16_t)inp[i + j * M]; - } - } - } else { - for (long i = 0; i < M; i++) { - for (long j = 0; j < K; j++) { - aie_buf_a[i * K + j] = (std::bfloat16_t)inp[i * K + j]; - } - } +template +void aie_do_gemm(long M, long K, long N, const float *__restrict inp, + const float *__restrict weight, const float *__restrict bias, + float *__restrict out) { + auto info = aie_offload.find(std::make_tuple(M, K, N)); + + std::bfloat16_t *aie_buf_a = aie_gemm.bos[1].buf.bf16; + std::bfloat16_t *aie_buf_b = aie_gemm.bos[2].buf.bf16; + float *aie_buf_c = aie_gemm.bos[3].buf.f32; + // Copy over A + if (inp_is_col_major) { + // design expects inptus to be row major + for (long i = 0; i < M; i++) { + for (long j = 0; j < K; j++) { + aie_buf_a[i * K + j] = (std::bfloat16_t)inp[i + j * M]; + } } - // Copy B - if(weight_is_col_major) { - // new design expects weight to be col major - for(long i = 0; i < K*N; i++) { - aie_buf_b[i] = (std::bfloat16_t)weight[i]; - } - } else { - // need to transpose for row-major weights design - for (long i = 0; i < K; i++) { - for (long j = 0; j < N; j++) { - aie_buf_b[i + j*K] = (std::bfloat16_t)weight[i * N + j]; - } - } + } else { + for (long i = 0; i < M; i++) { + for (long j = 0; j < K; j++) { + aie_buf_a[i * K + j] = (std::bfloat16_t)inp[i * K + j]; + } + } + } + // Copy B + if (weight_is_col_major) { + // new design expects weight to be col major + for (long i = 0; i < K * N; i++) { + aie_buf_b[i] = (std::bfloat16_t)weight[i]; + } + } else { + // need to transpose for row-major weights design + for (long i = 0; i < K; i++) { + for (long j = 0; j < N; j++) { + aie_buf_b[i + j * K] = (std::bfloat16_t)weight[i * 
N + j]; + } } + } - // Run - aie_init_insts(&aie_gemm, info->second.insts); - aie_run_design(&aie_gemm); + // Run + aie_init_insts(&aie_gemm, info->second.insts); + aie_run_design(&aie_gemm); - // Write back results - memcpy(out, aie_buf_c, M*N*sizeof(out[0])); + // Write back results + memcpy(out, aie_buf_c, M * N * sizeof(out[0])); } - // -------------------------------------------------------------------------- // Verification // -------------------------------------------------------------------------- // forward decl -template -void matmul_reference(float *out, const float* a, const float* b, const float *bias, - long M, long K, long N); - -float out_ref[AIEML_MAX_OFFLOAD_M*AIEML_MAX_OFFLOAD_N]; -template -bool validate_matmul(long M, long K, long N, const float * __restrict inp, const float * __restrict weight, const float * __restrict bias, float * __restrict out_to_test) { - matmul_reference(out_ref, inp, weight, NULL, M, K, N); - std::vector CRef(out_ref, out_ref+(M*N)); - std::vector C(out_to_test, out_to_test+(M*N)); - int n_errors = 0; - std::vector> errors; - float max_rel_error = (float)0.0f; - for (long row = 0; row < M; row++) { - for (long col = 0; col < N; col++) { - std::optional> error = matmul_common::verify_single( - std::cout, row, col, CRef[row * N + col], C[row * N + col], 0.5, 0.05); - if (error.has_value()) { - if (n_errors < 10) { - errors.push_back(*error); - } - float rel_error = - std::abs(error->actual - error->expected) / - std::max(std::abs(error->actual), std::abs(error->expected)); - if (rel_error > max_rel_error) { - max_rel_error = rel_error; - } - n_errors++; - } +template +void matmul_reference(float *out, const float *a, const float *b, + const float *bias, long M, long K, long N); + +float out_ref[AIEML_MAX_OFFLOAD_M * AIEML_MAX_OFFLOAD_N]; +template +bool validate_matmul(long M, long K, long N, const float *__restrict inp, + const float *__restrict weight, + const float *__restrict bias, + float *__restrict out_to_test) { + matmul_reference(out_ref, inp, weight, + NULL, M, K, N); + std::vector CRef(out_ref, out_ref + (M * N)); + std::vector C(out_to_test, out_to_test + (M * N)); + int n_errors = 0; + std::vector> errors; + float max_rel_error = (float)0.0f; + for (long row = 0; row < M; row++) { + for (long col = 0; col < N; col++) { + std::optional> error = + matmul_common::verify_single(std::cout, row, col, CRef[row * N + col], + C[row * N + col], 0.5, 0.05); + if (error.has_value()) { + if (n_errors < 10) { + errors.push_back(*error); } + float rel_error = + std::abs(error->actual - error->expected) / + std::max(std::abs(error->actual), std::abs(error->expected)); + if (rel_error > max_rel_error) { + max_rel_error = rel_error; + } + n_errors++; + } } - if (n_errors > 0) { - matmul_common::print_error_summary(std::cout, n_errors, errors, max_rel_error); - std::cout << std::endl << "Reference:" << std::endl; - matmul_common::print_matrix(CRef, N); - std::cout << std::endl << "Output:" << std::endl; - matmul_common::print_matrix(C, N); - return false; - } - return true; + } + if (n_errors > 0) { + matmul_common::print_error_summary(std::cout, n_errors, errors, + max_rel_error); + std::cout << std::endl << "Reference:" << std::endl; + matmul_common::print_matrix(CRef, N); + std::cout << std::endl << "Output:" << std::endl; + matmul_common::print_matrix(C, N); + return false; + } + return true; } -template -void matmul_reference(float *out, const float* a, const float* b, const float *bias, - long M, long K, long N) -{ - const int LOOP_UNROLL = 8; - 
assert(M % LOOP_UNROLL == 0); - - for (int obt = 0; obt < M; obt += LOOP_UNROLL) { - for (int o = 0; o < N; o++) { - // we'll keep LOOP_UNROLL many results in registers - float result[LOOP_UNROLL]; - // initialize the bias, if it exists - for (int ibt = 0; ibt < LOOP_UNROLL; ibt++) { - //result[ibt] = (bias != NULL) ? bias[o] : 0.0f; - result[ibt] = (bias != NULL) ? bias[o] : 0.0f; - } - // inner loops. Because we do LOOP_UNROLL steps of inner bt, we can cache - // the value of b[i + o * K] and reuse it. - // we compile with -Ofast, so the compiler will turn the inner loop into FMAs - for (int i = 0; i < K; i++) { - float w = (b_is_col_major ? b[i + o * K] : b[i * N + o]); - for (int ibt = 0; ibt < LOOP_UNROLL; ibt++) { - int bt = obt + ibt; - float inp = (a_is_col_major ? a[bt + i * M] : a[bt * K + i]); - result[ibt] += inp * w; - } - } - // write back results to main memory - for (int ibt = 0; ibt < LOOP_UNROLL; ibt++) { - int bt = obt + ibt; - out[bt * N + o] = result[ibt]; - } +template +void matmul_reference(float *out, const float *a, const float *b, + const float *bias, long M, long K, long N) { + const int LOOP_UNROLL = 8; + assert(M % LOOP_UNROLL == 0); + + for (int obt = 0; obt < M; obt += LOOP_UNROLL) { + for (int o = 0; o < N; o++) { + // we'll keep LOOP_UNROLL many results in registers + float result[LOOP_UNROLL]; + // initialize the bias, if it exists + for (int ibt = 0; ibt < LOOP_UNROLL; ibt++) { + // result[ibt] = (bias != NULL) ? bias[o] : 0.0f; + result[ibt] = (bias != NULL) ? bias[o] : 0.0f; + } + // inner loops. Because we do LOOP_UNROLL steps of inner bt, we can cache + // the value of b[i + o * K] and reuse it. + // we compile with -Ofast, so the compiler will turn the inner loop into + // FMAs + for (int i = 0; i < K; i++) { + float w = (b_is_col_major ? b[i + o * K] : b[i * N + o]); + for (int ibt = 0; ibt < LOOP_UNROLL; ibt++) { + int bt = obt + ibt; + float inp = (a_is_col_major ? 
---
 .../matrix_multiplication/rtp/run_makefile_chess.lit          | 11 +++++++++++
 1 file changed, 11 insertions(+)
 create mode 100644 programming_examples/basic/matrix_multiplication/rtp/run_makefile_chess.lit

diff --git a/programming_examples/basic/matrix_multiplication/rtp/run_makefile_chess.lit b/programming_examples/basic/matrix_multiplication/rtp/run_makefile_chess.lit
new file mode 100644
index 0000000000..6b174c8c64
--- /dev/null
+++ b/programming_examples/basic/matrix_multiplication/rtp/run_makefile_chess.lit
@@ -0,0 +1,11 @@
+// (c) Copyright 2024 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai, chess
+//
+// RUN: mkdir -p test_chess
+// RUN: cd test_chess
+// RUN: make -f %S/Makefile clean
+// RUN: make -f %S/Makefile
+// RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s
+// CHECK: PASS!

From e31bc9ce9f0a0d882a49cb262e79d8c168c12225 Mon Sep 17 00:00:00 2001
From: andrej
Date: Tue, 19 Nov 2024 11:38:01 -0700
Subject: [PATCH 5/7] util.py no longer necessary after Python bindings for
 tasks landed
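
util.py was a stopgap copy of the DMA-region helpers (mem, shim_mem,
memtile_dma, dma_start) pending their upstreaming in #1699. Now that
the Python bindings for tasks have landed, aie2.py can rely on the
upstream definitions instead of the local copies. As a sketch, the
explicit equivalent of what the upstream dialect module now provides,
assuming the upstreamed bindings kept the names this example uses:

    # hypothetical explicit import; names are assumed unchanged from
    # util.py, and in practice they arrive via the existing
    # aie.dialects.aie imports in aie2.py
    from aie.dialects.aie import mem, shim_mem, memtile_dma, dma_start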
---
 .../basic/matrix_multiplication/rtp/aie2.py   |  2 --
 .../basic/matrix_multiplication/rtp/util.py   | 52 -------------------
 2 files changed, 54 deletions(-)
 delete mode 100644 programming_examples/basic/matrix_multiplication/rtp/util.py

diff --git a/programming_examples/basic/matrix_multiplication/rtp/aie2.py b/programming_examples/basic/matrix_multiplication/rtp/aie2.py
index e23361e92f..8a3a2c5726 100644
--- a/programming_examples/basic/matrix_multiplication/rtp/aie2.py
+++ b/programming_examples/basic/matrix_multiplication/rtp/aie2.py
@@ -17,8 +17,6 @@
 import aie.dialects.arith as arith_dialect
 import aie.dialects.memref as memref_dialect
 
-from util import *
-
 
 def get_memref_len_elems(memref):
     out = 1
diff --git a/programming_examples/basic/matrix_multiplication/rtp/util.py b/programming_examples/basic/matrix_multiplication/rtp/util.py
deleted file mode 100644
index 4e9a635d79..0000000000
--- a/programming_examples/basic/matrix_multiplication/rtp/util.py
+++ /dev/null
@@ -1,52 +0,0 @@
-# These Python bindings are to be merged in #1699 in the future
-from typing import Optional, Union
-from aie.dialects.aie import *
-
-def get_dma_region_decorator(op_obj_constructor):
-    def decorator(f):
-        f_sig = inspect.signature(f)
-        op = op_obj_constructor()
-        entry_block = op.body.blocks.append()
-        bds_ctx = bds(op)
-        with InsertionPoint(entry_block):
-            with bds_ctx as bd:
-                if len(f_sig.parameters) == 0:
-                    f()
-                elif len(f_sig.parameters) == 1:
-                    f(bd)
-                else:
-                    raise RuntimeError(
-                        "Expected function to take zero or one argument(s)."
-                    )
-        return op
-
-    return decorator
-
-
-def mem(tile):
-    return get_dma_region_decorator(lambda: MemOp(T.index(), tile))
-
-
-def shim_mem(tile):
-    return get_dma_region_decorator(lambda: ShimDMAOp(T.index(), tile))
-
-
-def memtile_dma(tile):
-    return get_dma_region_decorator(lambda: MemTileDMAOp(T.index(), tile))
-
-
-def dma_start(
-    channel_dir,
-    channel_index,
-    *,
-    dest: Optional[Union[Successor, Block, ContextManagedBlock]] = None,
-    chain: Optional[Union[Successor, Block, ContextManagedBlock]] = None,
-    loc=None,
-    ip=None,
-):
-    chain_block = chain.block if isinstance(chain, ContextManagedBlock) else chain
-    dest_block = dest.block if isinstance(dest, ContextManagedBlock) else dest
-    op = DMAStartOp(
-        channel_dir, channel_index, dest=dest_block, chain=chain_block, loc=loc, ip=ip
-    )
-    return op.dest, op.chain
\ No newline at end of file

From 003eb0130b7e7fa3330fd979d9feb30598985b8a Mon Sep 17 00:00:00 2001
From: andrej
Date: Tue, 19 Nov 2024 12:01:13 -0700
Subject: [PATCH 6/7] oops
---
 programming_examples/basic/matrix_multiplication/rtp/aie2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/programming_examples/basic/matrix_multiplication/rtp/aie2.py b/programming_examples/basic/matrix_multiplication/rtp/aie2.py
index 8a3a2c5726..2777c8702e 100644
--- a/programming_examples/basic/matrix_multiplication/rtp/aie2.py
+++ b/programming_examples/basic/matrix_multiplication/rtp/aie2.py
@@ -368,7 +368,7 @@ def device_body():
             B_l3l2_fifos[col] = {
                 "prod": {
                     "endpoint": (shim_tile, WireBundle.DMA, 1),
-                    "shim_memref_dialect": memref_dialect.global_(
+                    "shim_memref": memref_dialect.global_(
                         sym_name=f"B_L3L2_{col}",
                         sym_visibility="public",
                         type_=B_l3_memref_ty,

From 45b2260d650600d4b59a60e4df8ca1ab1176feb2 Mon Sep 17 00:00:00 2001
From: andrej
Date: Thu, 21 Nov 2024 21:05:44 -0700
Subject: [PATCH 7/7] change rtp buffer name reference in anticipation of
 #1936
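
get_name() on the RTP buffers currently returns the name with a leading
sigil character, which the [1:] slice strips before the name is wrapped
in a FlatSymbolRefAttr; #1936 is expected to make get_name() return the
bare symbol name, so the slice is dropped here. A sketch of the intent
(the exact strings returned are an assumption, not confirmed by #1936):

    # before: get_name() returns a prefixed name, e.g. "%rtp_buf_0_0" (assumed)
    sym_ref = FlatSymbolRefAttr.get(rtp_bufs[row][col].get_name()[1:])
    # after #1936: get_name() is expected to return "rtp_buf_0_0" itself
    sym_ref = FlatSymbolRefAttr.get(rtp_bufs[row][col].get_name())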
---
 programming_examples/basic/matrix_multiplication/rtp/aie2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/programming_examples/basic/matrix_multiplication/rtp/aie2.py b/programming_examples/basic/matrix_multiplication/rtp/aie2.py
index 2777c8702e..d3464494bd 100644
--- a/programming_examples/basic/matrix_multiplication/rtp/aie2.py
+++ b/programming_examples/basic/matrix_multiplication/rtp/aie2.py
@@ -892,7 +892,7 @@ def sequence(A, B, C):
         rtp_K_div_k_div_2 = K // k // 2
         for row in range(n_aie_rows):
             for col in range(n_aie_cols):
-                sym_ref = FlatSymbolRefAttr.get(rtp_bufs[row][col].get_name()[1:])
+                sym_ref = FlatSymbolRefAttr.get(rtp_bufs[row][col].get_name())
                 npu_rtp_write(sym_ref, 1, rtp_K_div_k_div_2)
                 npu_rtp_write(sym_ref, 2, n_tiles_per_core)
                 npu_rtp_write(sym_ref, 0, 1)  # indicate "ready"