From c7fdd9e85e7bb45b2a925e38e9ffb23994227a6a Mon Sep 17 00:00:00 2001
From: Joseph Melber <jgmelber@gmail.com>
Date: Thu, 10 Oct 2024 14:12:52 -0600
Subject: [PATCH 01/35] Strix passthrough

---
 .../basic/passthrough_kernel/Makefile              | 12 ++++++++++--
 .../basic/passthrough_kernel/aie2.py               | 14 ++++++++++++--
 programming_examples/makefile-common               |  1 +
 3 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/programming_examples/basic/passthrough_kernel/Makefile b/programming_examples/basic/passthrough_kernel/Makefile
index 361cfb2d3a..95aea8ad8c 100644
--- a/programming_examples/basic/passthrough_kernel/Makefile
+++ b/programming_examples/basic/passthrough_kernel/Makefile
@@ -12,6 +12,7 @@ srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
 
 include ${srcdir}/../../makefile-common
 
+devicename = npu
 targetname = passThroughKernel
 VPATH := ${srcdir}/../../../aie_kernels/generic
 data_size = 4096
@@ -24,7 +25,7 @@ all: build/final_${data_size}.xclbin
 
 build/aie2_lineBased_8b_${data_size}.mlir: ${srcdir}/aie2.py
 	mkdir -p ${@D}
-	python3 $< ${data_size} 0 > $@
+	python3 $< ${devicename} ${data_size} 0 > $@
 
 build/aie_trace__lineBased_8b_${data_size}.mlir: ${srcdir}/aie2.py
 	mkdir -p ${@D}
@@ -32,7 +33,14 @@ build/aie_trace__lineBased_8b_${data_size}.mlir: ${srcdir}/aie2.py
 
 build/passThrough.cc.o: passThrough.cc
 	mkdir -p ${@D}
-	cd ${@D} && ${PEANO_INSTALL_DIR}/bin/clang++ ${PEANOWRAP2_FLAGS} -DBIT_WIDTH=8 -c $< -o ${@F}
+	ifeq ($(devicename), npu) 
+		cd ${@D} && ${PEANO_INSTALL_DIR}/bin/clang++ ${PEANOWRAP2_FLAGS} -DBIT_WIDTH=8 -c $< -o ${@F}
+	else ifeq ($(devicename), npu2)
+			cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2P_FLAGS} -DBIT_WIDTH=8 -c $< -o ${@F}
+		else
+			echo "Device type not supported"
+		endif
+	endif
 	
 build/final_${data_size}.xclbin: build/aie2_lineBased_8b_${data_size}.mlir build/passThrough.cc.o
 	mkdir -p ${@D}
diff --git a/programming_examples/basic/passthrough_kernel/aie2.py b/programming_examples/basic/passthrough_kernel/aie2.py
index 2bfdbb3066..c86d432622 100644
--- a/programming_examples/basic/passthrough_kernel/aie2.py
+++ b/programming_examples/basic/passthrough_kernel/aie2.py
@@ -19,8 +19,18 @@
 def passthroughKernel(vector_size, trace_size):
     N = vector_size
     lineWidthInBytes = N // 4  # chop input in 4 sub-tensors
-
-    @device(AIEDevice.npu1_1col)
+    
+    if len(sys.argv) != 3:
+        raise ValueError("[ERROR] Need command line arguments (Device name, Vector size)")
+
+    if sys.argv[1] == "npu":
+        dev = AIEDevice.npu1_1col
+    elif sys.argv[1] == "npu2":
+        dev = AIEDevice.npu2
+    else:
+        raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1]))
+
+    @device(dev)
     def device_body():
         # define types
         vector_ty = np.ndarray[(N,), np.dtype[np.uint8]]
diff --git a/programming_examples/makefile-common b/programming_examples/makefile-common
index 4523fe7c5f..a8ee6431ca 100644
--- a/programming_examples/makefile-common
+++ b/programming_examples/makefile-common
@@ -11,6 +11,7 @@ CHESS_FLAGS = -P ${AIE_INCLUDE_DIR}
 
 CHESSCCWRAP1_FLAGS = aie -I ${AIETOOLS_DIR}/include 
 CHESSCCWRAP2_FLAGS = aie2 -I ${AIETOOLS_DIR}/include
+CHESSCCWRAP2P_FLAGS = aie2p -I ${AIETOOLS_DIR}/include 
 PEANOWRAP2_FLAGS = -O2 -v -std=c++20 --target=aie2-none-unknown-elf ${WARNING_FLAGS} -DNDEBUG -I ${AIETOOLS_DIR}/include 
 
 TEST_POWERSHELL := $(shell command -v powershell.exe >/dev/null 2>&1 && echo yes || echo no)

From a287f46e2d298b8be79a197aae11efd5d7247e35 Mon Sep 17 00:00:00 2001
From: Joseph Melber <jgmelber@gmail.com>
Date: Fri, 11 Oct 2024 10:09:57 -0600
Subject: [PATCH 02/35] Fixup test

---
 .../basic/passthrough_kernel/Makefile         | 21 ++++++++--------
 .../basic/passthrough_kernel/aie2.py          | 25 ++++++++-----------
 .../passthrough_kernel/run_strix_makefile.lit | 10 ++++++++
 3 files changed, 31 insertions(+), 25 deletions(-)
 mode change 100644 => 100755 programming_examples/basic/passthrough_kernel/Makefile
 mode change 100644 => 100755 programming_examples/basic/passthrough_kernel/aie2.py
 create mode 100755 programming_examples/basic/passthrough_kernel/run_strix_makefile.lit

diff --git a/programming_examples/basic/passthrough_kernel/Makefile b/programming_examples/basic/passthrough_kernel/Makefile
old mode 100644
new mode 100755
index 95aea8ad8c..5f3f234acf
--- a/programming_examples/basic/passthrough_kernel/Makefile
+++ b/programming_examples/basic/passthrough_kernel/Makefile
@@ -12,7 +12,7 @@ srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
 
 include ${srcdir}/../../makefile-common
 
-devicename = npu
+device = npu
 targetname = passThroughKernel
 VPATH := ${srcdir}/../../../aie_kernels/generic
 data_size = 4096
@@ -25,22 +25,21 @@ all: build/final_${data_size}.xclbin
 
 build/aie2_lineBased_8b_${data_size}.mlir: ${srcdir}/aie2.py
 	mkdir -p ${@D}
-	python3 $< ${devicename} ${data_size} 0 > $@
+	python3 $< ${device} ${data_size} 0 > $@
 
 build/aie_trace__lineBased_8b_${data_size}.mlir: ${srcdir}/aie2.py
 	mkdir -p ${@D}
-	python3 $< ${data_size} ${trace_size} > $@
+	python3 $< ${device} ${data_size} ${trace_size} > $@
 
 build/passThrough.cc.o: passThrough.cc
 	mkdir -p ${@D}
-	ifeq ($(devicename), npu) 
-		cd ${@D} && ${PEANO_INSTALL_DIR}/bin/clang++ ${PEANOWRAP2_FLAGS} -DBIT_WIDTH=8 -c $< -o ${@F}
-	else ifeq ($(devicename), npu2)
-			cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2P_FLAGS} -DBIT_WIDTH=8 -c $< -o ${@F}
-		else
-			echo "Device type not supported"
-		endif
-	endif
+ifeq ($(device),npu)
+	cd ${@D} && ${PEANO_INSTALL_DIR}/bin/clang++ ${PEANOWRAP2_FLAGS} -DBIT_WIDTH=8 -c $< -o ${@F}
+else ifeq ($(device),npu2)
+	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2P_FLAGS} -DBIT_WIDTH=8 -c $< -o ${@F}
+else
+	echo "Device type not supported"
+endif
 	
 build/final_${data_size}.xclbin: build/aie2_lineBased_8b_${data_size}.mlir build/passThrough.cc.o
 	mkdir -p ${@D}
diff --git a/programming_examples/basic/passthrough_kernel/aie2.py b/programming_examples/basic/passthrough_kernel/aie2.py
old mode 100644
new mode 100755
index c86d432622..6ea3c4bb72
--- a/programming_examples/basic/passthrough_kernel/aie2.py
+++ b/programming_examples/basic/passthrough_kernel/aie2.py
@@ -16,20 +16,10 @@
 import aie.utils.trace as trace_utils
 
 
-def passthroughKernel(vector_size, trace_size):
+def passthroughKernel(dev, vector_size, trace_size):
     N = vector_size
     lineWidthInBytes = N // 4  # chop input in 4 sub-tensors
     
-    if len(sys.argv) != 3:
-        raise ValueError("[ERROR] Need command line arguments (Device name, Vector size)")
-
-    if sys.argv[1] == "npu":
-        dev = AIEDevice.npu1_1col
-    elif sys.argv[1] == "npu2":
-        dev = AIEDevice.npu2
-    else:
-        raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1]))
-
     @device(dev)
     def device_body():
         # define types
@@ -95,13 +85,20 @@ def sequence(inTensor, outTensor, notUsed):
 
 
 try:
-    vector_size = int(sys.argv[1])
+    device_name = str(sys.argv[1])
+    if device_name == "npu":
+        dev = AIEDevice.npu1_1col
+    elif device_name == "npu2":
+        dev = AIEDevice.npu2
+    else:
+        raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1]))
+    vector_size = int(sys.argv[2])
     if vector_size % 64 != 0 or vector_size < 512:
         print("Vector size must be a multiple of 64 and greater than or equal to 512")
         raise ValueError
-    trace_size = 0 if (len(sys.argv) != 3) else int(sys.argv[2])
+    trace_size = 0 if (len(sys.argv) != 4) else int(sys.argv[3])
 except ValueError:
     print("Argument has inappropriate value")
 with mlir_mod_ctx() as ctx:
-    passthroughKernel(vector_size, trace_size)
+    passthroughKernel(dev, vector_size, trace_size)
     print(ctx.module)
diff --git a/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit b/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit
new file mode 100755
index 0000000000..1818ba9813
--- /dev/null
+++ b/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit
@@ -0,0 +1,10 @@
+// (c) Copyright 2024 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai, chess
+//
+// RUN: make -f %S/Makefile clean
+// RUN: make -f %S/Makefile device=npu2 
+// RUN: %run_on_npu make -f %S/Makefile run device=npu2 | FileCheck %s
+// CHECK: Running...
+// CHECK: PASS!

From 133ce6c536a198b9677fbe6f3d92e49eb2fc098a Mon Sep 17 00:00:00 2001
From: Joseph Melber <jgmelber@gmail.com>
Date: Thu, 24 Oct 2024 08:54:06 -0600
Subject: [PATCH 03/35] Remove whitespace-only line in passthrough aie2.py

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
 programming_examples/basic/passthrough_kernel/aie2.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/programming_examples/basic/passthrough_kernel/aie2.py b/programming_examples/basic/passthrough_kernel/aie2.py
index 6ea3c4bb72..3468f7e944 100755
--- a/programming_examples/basic/passthrough_kernel/aie2.py
+++ b/programming_examples/basic/passthrough_kernel/aie2.py
@@ -19,7 +19,6 @@
 def passthroughKernel(dev, vector_size, trace_size):
     N = vector_size
     lineWidthInBytes = N // 4  # chop input in 4 sub-tensors
-    
     @device(dev)
     def device_body():
         # define types

From fdbde789b3b3bc4dd50cfc96132fc27fc088dc1f Mon Sep 17 00:00:00 2001
From: Joseph Melber <jgmelber@gmail.com>
Date: Thu, 24 Oct 2024 08:57:48 -0600
Subject: [PATCH 04/35] Apply suggestions from code review

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
 programming_examples/basic/passthrough_kernel/aie2.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/programming_examples/basic/passthrough_kernel/aie2.py b/programming_examples/basic/passthrough_kernel/aie2.py
index 3468f7e944..b225295898 100755
--- a/programming_examples/basic/passthrough_kernel/aie2.py
+++ b/programming_examples/basic/passthrough_kernel/aie2.py
@@ -19,6 +19,7 @@
 def passthroughKernel(dev, vector_size, trace_size):
     N = vector_size
     lineWidthInBytes = N // 4  # chop input in 4 sub-tensors
+
     @device(dev)
     def device_body():
         # define types

From a576bd2dc56d1cd7add925968f92c1fc921e72b9 Mon Sep 17 00:00:00 2001
From: Joseph Melber <jmelber@xilinx.com>
Date: Thu, 24 Oct 2024 19:50:11 -0600
Subject: [PATCH 05/35] Fixup aiecc.py for npu/npu2

---
 programming_examples/basic/passthrough_kernel/Makefile | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/programming_examples/basic/passthrough_kernel/Makefile b/programming_examples/basic/passthrough_kernel/Makefile
index 5f3f234acf..7e74ef39c1 100755
--- a/programming_examples/basic/passthrough_kernel/Makefile
+++ b/programming_examples/basic/passthrough_kernel/Makefile
@@ -43,14 +43,21 @@ endif
 	
 build/final_${data_size}.xclbin: build/aie2_lineBased_8b_${data_size}.mlir build/passThrough.cc.o
 	mkdir -p ${@D}
+ifeq ($(device),npu)
 	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host \
 		--no-xchesscc --no-xbridge \
 		--xclbin-name=${@F} --npu-insts-name=insts_${data_size}.txt $(<:%=../%)
+else
+	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host \
+		--xclbin-name=${@F} --npu-insts-name=insts_${data_size}.txt $(<:%=../%)
+endif
 
 build/final_trace_${data_size}.xclbin: build/aie2_lineBased_8b_${data_size}.mlir build/passThrough.cc.o
 	mkdir -p ${@D}
 	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host \
+ifeq ($(device),npu)
 		--no-xchesscc --no-xbridge \
+endif
 		--xclbin-name=${@F} --npu-insts-name=insts_${data_size}.txt $(<:%=../%)
 
 ${targetname}_${data_size}.exe: ${srcdir}/test.cpp

From 83d705139c44dacedccbd55958afd557877a7e22 Mon Sep 17 00:00:00 2001
From: Joseph Melber <jmelber@xilinx.com>
Date: Thu, 24 Oct 2024 19:57:14 -0600
Subject: [PATCH 06/35] Try enabling run_on_npu2

---
 .../passthrough_kernel/run_strix_makefile.lit  |  2 +-
 programming_examples/lit.cfg.py                | 18 +++++++++++++-----
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit b/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit
index 1818ba9813..81c8264aa0 100755
--- a/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit
+++ b/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit
@@ -5,6 +5,6 @@
 //
 // RUN: make -f %S/Makefile clean
 // RUN: make -f %S/Makefile device=npu2 
-// RUN: %run_on_npu make -f %S/Makefile run device=npu2 | FileCheck %s
+// RUN: %run_on_npu2 make -f %S/Makefile run device=npu2 | FileCheck %s
 // CHECK: Running...
 // CHECK: PASS!
diff --git a/programming_examples/lit.cfg.py b/programming_examples/lit.cfg.py
index 97e1246045..2368b262ea 100755
--- a/programming_examples/lit.cfg.py
+++ b/programming_examples/lit.cfg.py
@@ -46,6 +46,7 @@
 llvm_config.with_environment("PYTHONPATH", os.path.join(config.aie_obj_root, "python"))
 
 run_on_npu = "echo"
+run_on_npu2 = "echo"
 xrt_flags = ""
 
 # Not using run_on_board anymore, need more specific per-platform commands
@@ -131,18 +132,25 @@
         result = result.stdout.decode("utf-8").split("\n")
         # Starting with Linux 6.8 the format is like "[0000:66:00.1]  :  RyzenAI-npu1"
         # Starting with Linux 6.10 the format is like "|[0000:41:00.1]  ||RyzenAI-npu1  |"
-        p = re.compile(r"[\|]?(\[.+:.+:.+\]).+(Phoenix|RyzenAI-(npu\d))")
+        p = re.compile(r"[\|]?(\[.+:.+:.+\]).+(Phoenix|RyzenAI-(npu1\d))")
+        p2 = re.compile(r"[\|]?(\[.+:.+:.+\]).+(Phoenix|RyzenAI-(npu4\d))")
         for l in result:
             m = p.match(l)
-            if not m:
+            m2 = p2.match(l)
+            if not m and not m2:
                 continue
             print("Found Ryzen AI device:", m.group(1))
             if len(m.groups()) == 3:
                 print("\tmodel:", m.group(3))
             config.available_features.add("ryzen_ai")
-            run_on_npu = (
-                f"flock /tmp/npu.lock {config.aie_src_root}/utils/run_on_npu.sh"
-            )
+            if m:
+              run_on_npu = (
+                 f"flock /tmp/npu.lock {config.aie_src_root}/utils/run_on_npu.sh"
+              )
+            if m2:
+              run_on_npu2 = (
+                 f"flock /tmp/npu.lock {config.aie_src_root}/utils/run_on_npu.sh"
+              )
             break
     except:
         print("Failed to run xrt-smi")

From 096d0bc1ceab45d3b1a9b54167faaab7a7a35d41 Mon Sep 17 00:00:00 2001
From: Joseph Melber <jgmelber@gmail.com>
Date: Thu, 24 Oct 2024 20:07:28 -0600
Subject: [PATCH 07/35] Apply suggestions from code review

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
 programming_examples/lit.cfg.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/programming_examples/lit.cfg.py b/programming_examples/lit.cfg.py
index 2368b262ea..07536d000b 100755
--- a/programming_examples/lit.cfg.py
+++ b/programming_examples/lit.cfg.py
@@ -144,13 +144,13 @@
                 print("\tmodel:", m.group(3))
             config.available_features.add("ryzen_ai")
             if m:
-              run_on_npu = (
-                 f"flock /tmp/npu.lock {config.aie_src_root}/utils/run_on_npu.sh"
-              )
+                run_on_npu = (
+                    f"flock /tmp/npu.lock {config.aie_src_root}/utils/run_on_npu.sh"
+                )
             if m2:
-              run_on_npu2 = (
-                 f"flock /tmp/npu.lock {config.aie_src_root}/utils/run_on_npu.sh"
-              )
+                run_on_npu2 = (
+                    f"flock /tmp/npu.lock {config.aie_src_root}/utils/run_on_npu.sh"
+                )
             break
     except:
         print("Failed to run xrt-smi")

From 6ea15fa88ab0e58f6022acdcb7c2211ac28554be Mon Sep 17 00:00:00 2001
From: Joseph Melber <jmelber@xilinx.com>
Date: Thu, 24 Oct 2024 20:24:49 -0600
Subject: [PATCH 08/35] Add lit fixup

---
 programming_examples/lit.cfg.py         | 1 +
 programming_examples/lit.site.cfg.py.in | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/programming_examples/lit.cfg.py b/programming_examples/lit.cfg.py
index 07536d000b..999351429b 100755
--- a/programming_examples/lit.cfg.py
+++ b/programming_examples/lit.cfg.py
@@ -159,6 +159,7 @@
     print("xrt not found")
 
 config.substitutions.append(("%run_on_npu", run_on_npu))
+config.substitutions.append(("%run_on_npu2", run_on_npu2))
 config.substitutions.append(("%xrt_flags", xrt_flags))
 config.substitutions.append(("%XRT_DIR", config.xrt_dir))
 config.environment["XRT_HACK_UNSECURE_LOADING_XCLBIN"] = "1"
diff --git a/programming_examples/lit.site.cfg.py.in b/programming_examples/lit.site.cfg.py.in
index 22a367d1fc..3ba7a457f1 100755
--- a/programming_examples/lit.site.cfg.py.in
+++ b/programming_examples/lit.site.cfg.py.in
@@ -69,6 +69,8 @@ if lit.util.pythonize_bool("@AIETools_AIE_FOUND@"):
     config.vitis_components.append("AIE")
 if lit.util.pythonize_bool("@AIETools_AIE2_FOUND@"):
     config.vitis_components.append("AIE2")
+if lit.util.pythonize_bool("@AIETools_AIE2P_FOUND@"):
+    config.vitis_components.append("AIE2P")
 
 # Support substitution of the tools_dir with user parameters. This is
 # used when we can't determine the tool dir at configuration time.

From d1f543f2957cd86ea0157cfd1cceb7050d13d18b Mon Sep 17 00:00:00 2001
From: Joseph Melber <jmelber@xilinx.com>
Date: Thu, 24 Oct 2024 20:29:27 -0600
Subject: [PATCH 09/35] Fixup lit cfg

---
 programming_examples/lit.cfg.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/programming_examples/lit.cfg.py b/programming_examples/lit.cfg.py
index 999351429b..fcba808776 100755
--- a/programming_examples/lit.cfg.py
+++ b/programming_examples/lit.cfg.py
@@ -132,22 +132,20 @@
         result = result.stdout.decode("utf-8").split("\n")
         # Starting with Linux 6.8 the format is like "[0000:66:00.1]  :  RyzenAI-npu1"
         # Starting with Linux 6.10 the format is like "|[0000:41:00.1]  ||RyzenAI-npu1  |"
-        p = re.compile(r"[\|]?(\[.+:.+:.+\]).+(Phoenix|RyzenAI-(npu1\d))")
-        p2 = re.compile(r"[\|]?(\[.+:.+:.+\]).+(Phoenix|RyzenAI-(npu4\d))")
+        p = re.compile(r"[\|]?(\[.+:.+:.+\]).+(Phoenix|RyzenAI-(npu\d))")
         for l in result:
             m = p.match(l)
-            m2 = p2.match(l)
-            if not m and not m2:
+            if not m:
                 continue
             print("Found Ryzen AI device:", m.group(1))
             if len(m.groups()) == 3:
                 print("\tmodel:", m.group(3))
             config.available_features.add("ryzen_ai")
-            if m:
+            if m.group(3) == "npu1":
                 run_on_npu = (
                     f"flock /tmp/npu.lock {config.aie_src_root}/utils/run_on_npu.sh"
                 )
-            if m2:
+            if m.group(3) == "npu4":
                 run_on_npu2 = (
                     f"flock /tmp/npu.lock {config.aie_src_root}/utils/run_on_npu.sh"
                 )

From 74c689cb5e7179d09d065e9234370c24909f1b49 Mon Sep 17 00:00:00 2001
From: Joseph Melber <jmelber@xilinx.com>
Date: Thu, 24 Oct 2024 20:51:03 -0600
Subject: [PATCH 10/35] Fix Makefile

---
 programming_examples/basic/passthrough_kernel/Makefile | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/programming_examples/basic/passthrough_kernel/Makefile b/programming_examples/basic/passthrough_kernel/Makefile
index 7e74ef39c1..246da39f43 100755
--- a/programming_examples/basic/passthrough_kernel/Makefile
+++ b/programming_examples/basic/passthrough_kernel/Makefile
@@ -54,11 +54,15 @@ endif
 
 build/final_trace_${data_size}.xclbin: build/aie2_lineBased_8b_${data_size}.mlir build/passThrough.cc.o
 	mkdir -p ${@D}
-	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host \
 ifeq ($(device),npu)
+	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host \
 		--no-xchesscc --no-xbridge \
-endif
 		--xclbin-name=${@F} --npu-insts-name=insts_${data_size}.txt $(<:%=../%)
+else
+	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host \
+		--xclbin-name=${@F} --npu-insts-name=insts_${data_size}.txt $(<:%=../%)
+endif
+
 
 ${targetname}_${data_size}.exe: ${srcdir}/test.cpp
 	rm -rf _build

From 85e19eeee79ef748ee3b240b1b7b4de5846705ab Mon Sep 17 00:00:00 2001
From: Joseph Melber <jmelber@xilinx.com>
Date: Thu, 24 Oct 2024 20:55:22 -0600
Subject: [PATCH 11/35] Don't pollute source tree

---
 programming_examples/basic/passthrough_kernel/run_makefile.lit  | 2 ++
 .../basic/passthrough_kernel/run_strix_makefile.lit             | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/programming_examples/basic/passthrough_kernel/run_makefile.lit b/programming_examples/basic/passthrough_kernel/run_makefile.lit
index e8213c5d18..4619c215e6 100644
--- a/programming_examples/basic/passthrough_kernel/run_makefile.lit
+++ b/programming_examples/basic/passthrough_kernel/run_makefile.lit
@@ -3,6 +3,8 @@
 //
 // REQUIRES: ryzen_ai, peano 
 //
+// RUN: mkdir -p test
+// RUN: cd test
 // RUN: make -f %S/Makefile clean
 // RUN: make -f %S/Makefile 
 // RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s
diff --git a/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit b/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit
index 81c8264aa0..456e08c547 100755
--- a/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit
+++ b/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit
@@ -3,6 +3,8 @@
 //
 // REQUIRES: ryzen_ai, chess
 //
+// RUN: mkdir -p test_stx
+// RUN: cd test_stx
 // RUN: make -f %S/Makefile clean
 // RUN: make -f %S/Makefile device=npu2 
 // RUN: %run_on_npu2 make -f %S/Makefile run device=npu2 | FileCheck %s

From 332a00ca6322903a8105ae4704a8190b49da7558 Mon Sep 17 00:00:00 2001
From: Joseph Melber <jmelber@xilinx.com>
Date: Thu, 24 Oct 2024 21:11:09 -0600
Subject: [PATCH 12/35] Compare str() of model match group in lit.cfg.py

---
 programming_examples/lit.cfg.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/programming_examples/lit.cfg.py b/programming_examples/lit.cfg.py
index fcba808776..efa77009a7 100755
--- a/programming_examples/lit.cfg.py
+++ b/programming_examples/lit.cfg.py
@@ -141,11 +141,11 @@
             if len(m.groups()) == 3:
                 print("\tmodel:", m.group(3))
             config.available_features.add("ryzen_ai")
-            if m.group(3) == "npu1":
+            if str(m.group(3)) == "npu1":
                 run_on_npu = (
                     f"flock /tmp/npu.lock {config.aie_src_root}/utils/run_on_npu.sh"
                 )
-            if m.group(3) == "npu4":
+            if str(m.group(3)) == "npu4":
                 run_on_npu2 = (
                     f"flock /tmp/npu.lock {config.aie_src_root}/utils/run_on_npu.sh"
                 )

From 797fda9d5cf1a50f85e6705ba81280abf46645af Mon Sep 17 00:00:00 2001
From: Joseph Melber <jgmelber@gmail.com>
Date: Fri, 25 Oct 2024 13:56:14 -0600
Subject: [PATCH 13/35] Rely on test return value for PASS/fail

---
 .../basic/passthrough_kernel/run_strix_makefile.lit             | 2 --
 1 file changed, 2 deletions(-)

diff --git a/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit b/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit
index 456e08c547..fc1d046b74 100755
--- a/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit
+++ b/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit
@@ -8,5 +8,3 @@
 // RUN: make -f %S/Makefile clean
 // RUN: make -f %S/Makefile device=npu2 
 // RUN: %run_on_npu2 make -f %S/Makefile run device=npu2 | FileCheck %s
-// CHECK: Running...
-// CHECK: PASS!

From 23ca4b5c2fc9aae05f6ec6b59d4e0cd8512b798d Mon Sep 17 00:00:00 2001
From: Joe Melber <jgmelber@gmail.com>
Date: Mon, 2 Dec 2024 23:12:53 +0000
Subject: [PATCH 14/35] Generic mul kernel test on Strix

---
 aie_kernels/aie2/mul.cc                       |  6 +++---
 .../basic/vector_scalar_mul/Makefile          | 11 +++++++++--
 .../basic/vector_scalar_mul/aie2.py           | 19 +++++++++++++------
 3 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/aie_kernels/aie2/mul.cc b/aie_kernels/aie2/mul.cc
index 5745f364dc..0c4290034c 100755
--- a/aie_kernels/aie2/mul.cc
+++ b/aie_kernels/aie2/mul.cc
@@ -8,9 +8,9 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define __AIENGINE__ 2
-#define NOCPP
-#define __AIEARCH__ 20
+//#define __AIENGINE__ 2
+//#define NOCPP
+//#define __AIEARCH__ 20
 
 #include <stdint.h>
 #include <stdio.h>
diff --git a/programming_examples/basic/vector_scalar_mul/Makefile b/programming_examples/basic/vector_scalar_mul/Makefile
index 6d8906bd70..2a4c890c02 100644
--- a/programming_examples/basic/vector_scalar_mul/Makefile
+++ b/programming_examples/basic/vector_scalar_mul/Makefile
@@ -14,6 +14,7 @@ include ${srcdir}/../../makefile-common
 
 VPATH := ${srcdir}/../../../aie_kernels/aie2
 
+device = npu
 targetname = vectorScalar
 data_size = 4096
 trace_size = 8192
@@ -25,19 +26,25 @@ kristof: build/insts_${data_size}.txt
 
 build/%.o: %.cc
 	mkdir -p ${@D}
+ifeq ($(device),npu)
 ifeq ($(CHESS), true)
 	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -c $< -o ${@F}; 
 else 
 	cd ${@D} && ${PEANO_INSTALL_DIR}/bin/clang++ ${PEANOWRAP2_FLAGS} -c $< -o ${@F}; 
 endif
+else ifeq ($(device),npu2)
+	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2P_FLAGS} -DBIT_WIDTH=8 -c $< -o ${@F}
+else
+	echo "Device type not supported"
+endif
 
 build/aie_${data_size}.mlir: ${srcdir}/aie2.py
 	mkdir -p ${@D}
-	python3 $< ${data_size} 0 > $@
+	python3 $< ${device} ${data_size} 0 > $@
 
 build/aie_trace_${data_size}.mlir: ${srcdir}/aie2.py
 	mkdir -p ${@D}
-	python3 $< ${data_size} ${trace_size} > $@
+	python3 $< ${device} ${data_size} ${trace_size} > $@
 
 #build/insts_${data_size}.txt: build/final_${data_size}.xclbin
 build/final_${data_size}.xclbin: build/aie_${data_size}.mlir build/scale.o
diff --git a/programming_examples/basic/vector_scalar_mul/aie2.py b/programming_examples/basic/vector_scalar_mul/aie2.py
index 1d367e5aab..0617dafdad 100644
--- a/programming_examples/basic/vector_scalar_mul/aie2.py
+++ b/programming_examples/basic/vector_scalar_mul/aie2.py
@@ -16,7 +16,7 @@
 import aie.utils.trace as trace_utils
 
 
-def my_vector_scalar(vector_size, trace_size):
+def my_vector_scalar(dev, vector_size, trace_size):
     N = vector_size
     N_in_bytes = N * 2
     N_div_n = 4  # chop input vector into 4 sub-vectors
@@ -26,7 +26,7 @@ def my_vector_scalar(vector_size, trace_size):
 
     vectorized = True
 
-    @device(AIEDevice.npu1_1col)
+    @device(dev)
     def device_body():
         tensor_ty = np.ndarray[(N,), np.dtype[np.int16]]
         tile_ty = np.ndarray[(n,), np.dtype[np.int16]]
@@ -93,13 +93,20 @@ def sequence(A, F, C):
 
 
 try:
-    vector_size = int(sys.argv[1])
+    device_name = str(sys.argv[1])
+    if device_name == "npu":
+        dev = AIEDevice.npu1_1col
+    elif device_name == "npu2":
+        dev = AIEDevice.npu2
+    else:
+        raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1]))
+    vector_size = int(sys.argv[2])
     if vector_size % 64 != 0 or vector_size < 512:
         print("Vector size must be a multiple of 64 and greater than or equal to 512")
         raise ValueError
-    trace_size = 0 if (len(sys.argv) != 3) else int(sys.argv[2])
+    trace_size = 0 if (len(sys.argv) != 4) else int(sys.argv[3])
 except ValueError:
     print("Argument has inappropriate value")
 with mlir_mod_ctx() as ctx:
-    my_vector_scalar(vector_size, trace_size)
-    print(ctx.module)
+    my_vector_scalar(dev, vector_size, trace_size)
+print(ctx.module)

From 28c52f0e665c0a8b84c5bbad2bb7e82db144617d Mon Sep 17 00:00:00 2001
From: Joseph Melber <jgmelber@gmail.com>
Date: Mon, 2 Dec 2024 16:21:49 -0700
Subject: [PATCH 15/35] Update aie_kernels/aie2/mul.cc

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
 aie_kernels/aie2/mul.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/aie_kernels/aie2/mul.cc b/aie_kernels/aie2/mul.cc
index 0c4290034c..a028861c20 100755
--- a/aie_kernels/aie2/mul.cc
+++ b/aie_kernels/aie2/mul.cc
@@ -8,9 +8,9 @@
 //
 //===----------------------------------------------------------------------===//
 
-//#define __AIENGINE__ 2
-//#define NOCPP
-//#define __AIEARCH__ 20
+// #define __AIENGINE__ 2
+// #define NOCPP
+// #define __AIEARCH__ 20
 
 #include <stdint.h>
 #include <stdio.h>

From 797739387f2580c38f40ff557de8c00e61062f0f Mon Sep 17 00:00:00 2001
From: Joseph Melber <jmelber@xilinx.com>
Date: Tue, 3 Dec 2024 11:09:13 -0700
Subject: [PATCH 16/35] Update alt files

---
 .../basic/passthrough_kernel/aie2_alt.py      | 17 ++++++++++++-----
 .../basic/vector_scalar_mul/aie2_alt.py       | 19 +++++++++++++------
 2 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/programming_examples/basic/passthrough_kernel/aie2_alt.py b/programming_examples/basic/passthrough_kernel/aie2_alt.py
index f41965455c..b2835ff997 100644
--- a/programming_examples/basic/passthrough_kernel/aie2_alt.py
+++ b/programming_examples/basic/passthrough_kernel/aie2_alt.py
@@ -16,11 +16,11 @@
 import aie.utils.trace as trace_utils
 
 
-def passthroughKernel(vector_size, trace_size):
+def passthroughKernel(dev, vector_size, trace_size):
     N = vector_size
     lineWidthInBytes = N // 4  # chop input in 4 sub-tensors
 
-    @device(AIEDevice.npu1_1col)
+    @device(dev)
     def device_body():
         # define types
         vector_ty = np.ndarray[(N,), np.dtype[np.uint8]]
@@ -79,13 +79,20 @@ def sequence(inTensor, outTensor, notUsed):
 
 
 try:
-    vector_size = int(sys.argv[1])
+    device_name = str(sys.argv[1])
+    if device_name == "npu":
+        dev = AIEDevice.npu1_1col
+    elif device_name == "npu2":
+        dev = AIEDevice.npu2
+    else:
+        raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1]))
+    vector_size = int(sys.argv[2])
     if vector_size % 64 != 0 or vector_size < 512:
         print("Vector size must be a multiple of 64 and greater than or equal to 512")
         raise ValueError
-    trace_size = 0 if (len(sys.argv) != 3) else int(sys.argv[2])
+    trace_size = 0 if (len(sys.argv) != 4) else int(sys.argv[3])
 except ValueError:
     print("Argument has inappropriate value")
 with mlir_mod_ctx() as ctx:
-    passthroughKernel(vector_size, trace_size)
+    passthroughKernel(dev, vector_size, trace_size)
     print(ctx.module)
diff --git a/programming_examples/basic/vector_scalar_mul/aie2_alt.py b/programming_examples/basic/vector_scalar_mul/aie2_alt.py
index 335e966745..6c52fc1c21 100644
--- a/programming_examples/basic/vector_scalar_mul/aie2_alt.py
+++ b/programming_examples/basic/vector_scalar_mul/aie2_alt.py
@@ -16,7 +16,7 @@
 import aie.utils.trace as trace_utils
 
 
-def my_vector_scalar(vector_size, trace_size):
+def my_vector_scalar(dev, vector_size, trace_size):
     N = vector_size
     N_in_bytes = N * 2
     N_div_n = 4  # chop input vector into 4 sub-vectors
@@ -26,7 +26,7 @@ def my_vector_scalar(vector_size, trace_size):
 
     vectorized = True
 
-    @device(AIEDevice.npu1_1col)
+    @device(dev)
     def device_body():
         tensor_ty = np.ndarray[(N,), np.dtype[np.int16]]
         tile_ty = np.ndarray[(n,), np.dtype[np.int16]]
@@ -97,13 +97,20 @@ def sequence(A, F, C):
 
 
 try:
-    vector_size = int(sys.argv[1])
+    device_name = str(sys.argv[1])
+    if device_name == "npu":
+        dev = AIEDevice.npu1_1col
+    elif device_name == "npu2":
+        dev = AIEDevice.npu2
+    else:
+        raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1]))
+    vector_size = int(sys.argv[2])
     if vector_size % 64 != 0 or vector_size < 512:
         print("Vector size must be a multiple of 64 and greater than or equal to 512")
         raise ValueError
-    trace_size = 0 if (len(sys.argv) != 3) else int(sys.argv[2])
+    trace_size = 0 if (len(sys.argv) != 4) else int(sys.argv[3])
 except ValueError:
     print("Argument has inappropriate value")
 with mlir_mod_ctx() as ctx:
-    my_vector_scalar(vector_size, trace_size)
-    print(ctx.module)
+    my_vector_scalar(dev, vector_size, trace_size)
+    print(ctx.module)

From 9caaef3d58c0cef4d1f6b27591f4a7987af03e6e Mon Sep 17 00:00:00 2001
From: Joseph Melber <jmelber@xilinx.com>
Date: Tue, 3 Dec 2024 11:11:03 -0700
Subject: [PATCH 17/35] Try with words

---
 .../basic/passthrough_kernel/run_strix_makefile.lit         | 2 +-
 programming_examples/lit.cfg.py                             | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit b/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit
index fc1d046b74..91437d212e 100755
--- a/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit
+++ b/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit
@@ -7,4 +7,4 @@
 // RUN: cd test_stx
 // RUN: make -f %S/Makefile clean
 // RUN: make -f %S/Makefile device=npu2 
-// RUN: %run_on_npu2 make -f %S/Makefile run device=npu2 | FileCheck %s
+// RUN: %run_on_npu_two make -f %S/Makefile run device=npu2 | FileCheck %s
diff --git a/programming_examples/lit.cfg.py b/programming_examples/lit.cfg.py
index efa77009a7..b796fff94c 100755
--- a/programming_examples/lit.cfg.py
+++ b/programming_examples/lit.cfg.py
@@ -46,7 +46,7 @@
 llvm_config.with_environment("PYTHONPATH", os.path.join(config.aie_obj_root, "python"))
 
 run_on_npu = "echo"
-run_on_npu2 = "echo"
+run_on_npu_two = "echo"
 xrt_flags = ""
 
 # Not using run_on_board anymore, need more specific per-platform commands
@@ -146,7 +146,7 @@
                     f"flock /tmp/npu.lock {config.aie_src_root}/utils/run_on_npu.sh"
                 )
             if str(m.group(3)) == "npu4":
-                run_on_npu2 = (
+                run_on_npu_two = (
                     f"flock /tmp/npu.lock {config.aie_src_root}/utils/run_on_npu.sh"
                 )
             break
@@ -157,7 +157,7 @@
     print("xrt not found")
 
 config.substitutions.append(("%run_on_npu", run_on_npu))
-config.substitutions.append(("%run_on_npu2", run_on_npu2))
+config.substitutions.append(("%run_on_npu_two", run_on_npu_two))
 config.substitutions.append(("%xrt_flags", xrt_flags))
 config.substitutions.append(("%XRT_DIR", config.xrt_dir))
 config.environment["XRT_HACK_UNSECURE_LOADING_XCLBIN"] = "1"

From 8a37de38a08c8d03b39439ea14079d797a8002e1 Mon Sep 17 00:00:00 2001
From: Joseph Melber <jmelber@xilinx.com>
Date: Tue, 3 Dec 2024 11:22:18 -0700
Subject: [PATCH 18/35] Try reordering

---
 .../basic/passthrough_kernel/run_strix_makefile.lit         | 2 +-
 programming_examples/lit.cfg.py                             | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit b/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit
index 91437d212e..6b1aef3a70 100755
--- a/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit
+++ b/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit
@@ -7,4 +7,4 @@
 // RUN: cd test_stx
 // RUN: make -f %S/Makefile clean
 // RUN: make -f %S/Makefile device=npu2 
-// RUN: %run_on_npu_two make -f %S/Makefile run device=npu2 | FileCheck %s
+// RUN: %run_on_2npu make -f %S/Makefile run device=npu2 | FileCheck %s
diff --git a/programming_examples/lit.cfg.py b/programming_examples/lit.cfg.py
index b796fff94c..28a782d51c 100755
--- a/programming_examples/lit.cfg.py
+++ b/programming_examples/lit.cfg.py
@@ -46,7 +46,7 @@
 llvm_config.with_environment("PYTHONPATH", os.path.join(config.aie_obj_root, "python"))
 
 run_on_npu = "echo"
-run_on_npu_two = "echo"
+run_on_2npu = "echo"
 xrt_flags = ""
 
 # Not using run_on_board anymore, need more specific per-platform commands
@@ -146,7 +146,7 @@
                     f"flock /tmp/npu.lock {config.aie_src_root}/utils/run_on_npu.sh"
                 )
             if str(m.group(3)) == "npu4":
-                run_on_npu_two = (
+                run_on_2npu = (
                     f"flock /tmp/npu.lock {config.aie_src_root}/utils/run_on_npu.sh"
                 )
             break
@@ -157,7 +157,7 @@
     print("xrt not found")
 
 config.substitutions.append(("%run_on_npu", run_on_npu))
-config.substitutions.append(("%run_on_npu_two", run_on_npu_two))
+config.substitutions.append(("%run_on_2npu", run_on_2npu))
 config.substitutions.append(("%xrt_flags", xrt_flags))
 config.substitutions.append(("%XRT_DIR", config.xrt_dir))
 config.environment["XRT_HACK_UNSECURE_LOADING_XCLBIN"] = "1"

From 9d321487447f9a07c79759783b41c8e68b170a10 Mon Sep 17 00:00:00 2001
From: Joseph Melber <jmelber@xilinx.com>
Date: Tue, 3 Dec 2024 14:20:15 -0700
Subject: [PATCH 19/35] Remove FileCheck and use return

---
 .../basic/passthrough_kernel/run_makefile.lit               | 6 ++----
 .../basic/passthrough_kernel/run_makefile_alt.lit           | 3 +--
 .../basic/passthrough_kernel/run_strix_makefile.lit         | 2 +-
 3 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/programming_examples/basic/passthrough_kernel/run_makefile.lit b/programming_examples/basic/passthrough_kernel/run_makefile.lit
index 4619c215e6..4c5bc14c4f 100644
--- a/programming_examples/basic/passthrough_kernel/run_makefile.lit
+++ b/programming_examples/basic/passthrough_kernel/run_makefile.lit
@@ -7,7 +7,5 @@
 // RUN: cd test
 // RUN: make -f %S/Makefile clean
 // RUN: make -f %S/Makefile 
-// RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s
-// RUN: %run_on_npu make -f %S/Makefile run_py | FileCheck %s
-// CHECK: Running...
-// CHECK: PASS!
+// RUN: %run_on_npu make -f %S/Makefile run
+// RUN: %run_on_npu make -f %S/Makefile run_py
diff --git a/programming_examples/basic/passthrough_kernel/run_makefile_alt.lit b/programming_examples/basic/passthrough_kernel/run_makefile_alt.lit
index 4a4a70e117..c37843fa25 100644
--- a/programming_examples/basic/passthrough_kernel/run_makefile_alt.lit
+++ b/programming_examples/basic/passthrough_kernel/run_makefile_alt.lit
@@ -7,6 +7,5 @@
 // RUN: cd test_alt
 // RUN: make -f %S/Makefile clean
 // RUN: env use_alt=1 make -f %S/Makefile 
-// RUN: %run_on_npu make -f %S/Makefile run_py | FileCheck %s
-// CHECK: PASS!
+// RUN: %run_on_npu make -f %S/Makefile run_py
   
\ No newline at end of file
diff --git a/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit b/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit
index 6b1aef3a70..0901bb542f 100755
--- a/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit
+++ b/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit
@@ -7,4 +7,4 @@
 // RUN: cd test_stx
 // RUN: make -f %S/Makefile clean
 // RUN: make -f %S/Makefile device=npu2 
-// RUN: %run_on_2npu make -f %S/Makefile run device=npu2 | FileCheck %s
+// RUN: %run_on_2npu make -f %S/Makefile run device=npu2

From 05fa89853babd619651396c8029f78b430fe506a Mon Sep 17 00:00:00 2001
From: Joseph Melber <jmelber@xilinx.com>
Date: Tue, 3 Dec 2024 14:37:16 -0700
Subject: [PATCH 20/35] [TEST] break the test

---
 programming_examples/basic/passthrough_kernel/aie2.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/programming_examples/basic/passthrough_kernel/aie2.py b/programming_examples/basic/passthrough_kernel/aie2.py
index b225295898..ecdfabaab9 100755
--- a/programming_examples/basic/passthrough_kernel/aie2.py
+++ b/programming_examples/basic/passthrough_kernel/aie2.py
@@ -56,7 +56,7 @@ def core_body():
                 of_out.release(ObjectFifoPort.Produce, 1)
 
         #    print(ctx.module.operation.verify())
-
+        N2 = N // 2
         @runtime_sequence(vector_ty, vector_ty, vector_ty)
         def sequence(inTensor, outTensor, notUsed):
             if trace_size > 0:
@@ -72,14 +72,14 @@ def sequence(inTensor, outTensor, notUsed):
                 metadata=of_in,
                 bd_id=0,
                 mem=inTensor,
-                sizes=[1, 1, 1, N],
+                sizes=[1, 1, 1, N2],
                 issue_token=True,
             )
             npu_dma_memcpy_nd(
                 metadata=of_out,
                 bd_id=1,
                 mem=outTensor,
-                sizes=[1, 1, 1, N],
+                sizes=[1, 1, 1, N2],
             )
             dma_wait(of_in, of_out)
 

From 3730e7893dd09080a87ff88389e4c32429c012f4 Mon Sep 17 00:00:00 2001
From: Joseph Melber <jmelber@xilinx.com>
Date: Tue, 3 Dec 2024 14:48:17 -0700
Subject: [PATCH 21/35] test.py return fix

---
 programming_examples/basic/passthrough_kernel/test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/programming_examples/basic/passthrough_kernel/test.py b/programming_examples/basic/passthrough_kernel/test.py
index f93ddb3ac7..629a1097c1 100644
--- a/programming_examples/basic/passthrough_kernel/test.py
+++ b/programming_examples/basic/passthrough_kernel/test.py
@@ -44,7 +44,7 @@ def main(opts):
     else:
         print("\nError count: ", errors)
         print("\nFailed.\n")
-        exit(-1)
+        exit(1)
 
 
 if __name__ == "__main__":

From 0f4530750cf030519cd6ea20689867c4250440d7 Mon Sep 17 00:00:00 2001
From: Joseph Melber <jmelber@xilinx.com>
Date: Tue, 3 Dec 2024 15:10:10 -0700
Subject: [PATCH 22/35] test.py return fix again

---
 programming_examples/basic/passthrough_kernel/test.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/programming_examples/basic/passthrough_kernel/test.py b/programming_examples/basic/passthrough_kernel/test.py
index 629a1097c1..1e06554a0e 100644
--- a/programming_examples/basic/passthrough_kernel/test.py
+++ b/programming_examples/basic/passthrough_kernel/test.py
@@ -40,11 +40,11 @@ def main(opts):
 
     if not errors:
         print("\nPASS!\n")
-        exit(0)
+        sys.exit(0)
     else:
         print("\nError count: ", errors)
         print("\nFailed.\n")
-        exit(1)
+        sys.exit(-1)
 
 
 if __name__ == "__main__":

From fb36343e34f7e18942db2bbe7355db663045bc78 Mon Sep 17 00:00:00 2001
From: Joseph Melber <jmelber@xilinx.com>
Date: Tue, 3 Dec 2024 16:46:49 -0700
Subject: [PATCH 23/35] test.py return 1

---
 programming_examples/basic/passthrough_kernel/test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/programming_examples/basic/passthrough_kernel/test.py b/programming_examples/basic/passthrough_kernel/test.py
index 1e06554a0e..0987344b6a 100644
--- a/programming_examples/basic/passthrough_kernel/test.py
+++ b/programming_examples/basic/passthrough_kernel/test.py
@@ -44,7 +44,7 @@ def main(opts):
     else:
         print("\nError count: ", errors)
         print("\nFailed.\n")
-        sys.exit(-1)
+        sys.exit(1)
 
 
 if __name__ == "__main__":

From 8464fde50cb73f10a7db3448ef9e2ee3bc52d77b Mon Sep 17 00:00:00 2001
From: Joseph Melber <Joseph.melber@amd.com>
Date: Tue, 3 Dec 2024 22:21:27 -0700
Subject: [PATCH 24/35] Force fail test.py

---
 programming_examples/basic/passthrough_kernel/test.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/programming_examples/basic/passthrough_kernel/test.py b/programming_examples/basic/passthrough_kernel/test.py
index 0987344b6a..f00623fbbf 100644
--- a/programming_examples/basic/passthrough_kernel/test.py
+++ b/programming_examples/basic/passthrough_kernel/test.py
@@ -38,6 +38,7 @@ def main(opts):
         e = np.equal(input, aie_output)
         errors = np.size(e) - np.count_nonzero(e)
 
+    errors = 1
     if not errors:
         print("\nPASS!\n")
         sys.exit(0)

From 007882c684381431830109d12abcea40ba23a494 Mon Sep 17 00:00:00 2001
From: Joseph Melber <jmelber@xilinx.com>
Date: Wed, 4 Dec 2024 09:50:50 -0700
Subject: [PATCH 25/35] Test for CI

---
 programming_examples/basic/passthrough_kernel/test.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/programming_examples/basic/passthrough_kernel/test.py b/programming_examples/basic/passthrough_kernel/test.py
index f00623fbbf..0987344b6a 100644
--- a/programming_examples/basic/passthrough_kernel/test.py
+++ b/programming_examples/basic/passthrough_kernel/test.py
@@ -38,7 +38,6 @@ def main(opts):
         e = np.equal(input, aie_output)
         errors = np.size(e) - np.count_nonzero(e)
 
-    errors = 1
     if not errors:
         print("\nPASS!\n")
         sys.exit(0)

From 8459aef569bcaf23039b2aa26dbe4a37c564d2e4 Mon Sep 17 00:00:00 2001
From: Joseph Melber <jmelber@xilinx.com>
Date: Wed, 4 Dec 2024 10:04:02 -0700
Subject: [PATCH 26/35] Break alt

---
 programming_examples/basic/passthrough_kernel/aie2_alt.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/programming_examples/basic/passthrough_kernel/aie2_alt.py b/programming_examples/basic/passthrough_kernel/aie2_alt.py
index b2835ff997..cd54b499b4 100644
--- a/programming_examples/basic/passthrough_kernel/aie2_alt.py
+++ b/programming_examples/basic/passthrough_kernel/aie2_alt.py
@@ -57,6 +57,7 @@ def core_body():
 
         #    print(ctx.module.operation.verify())
 
+        N2 = N // 2
         @runtime_sequence(vector_ty, vector_ty, vector_ty)
         def sequence(inTensor, outTensor, notUsed):
             if trace_size > 0:
@@ -68,10 +69,10 @@ def sequence(inTensor, outTensor, notUsed):
                     offset=N,
                 )
             in_task = shim_dma_single_bd_task(
-                of_in, inTensor, sizes=[1, 1, 1, N], issue_token=True
+                of_in, inTensor, sizes=[1, 1, 1, N2], issue_token=True
             )
             out_task = shim_dma_single_bd_task(
-                of_out, outTensor, sizes=[1, 1, 1, N], issue_token=True
+                of_out, outTensor, sizes=[1, 1, 1, N2], issue_token=True
             )
 
             dma_start_task(in_task, out_task)

From 08ac2d07bc526e63a39fa123f2e191594b061366 Mon Sep 17 00:00:00 2001
From: Joseph Melber <jmelber@xilinx.com>
Date: Wed, 4 Dec 2024 10:22:05 -0700
Subject: [PATCH 27/35] Cleanup

---
 aie_kernels/aie2/mul.cc                                   | 4 ----
 programming_examples/basic/passthrough_kernel/aie2.py     | 6 ++----
 programming_examples/basic/passthrough_kernel/aie2_alt.py | 5 ++---
 3 files changed, 4 insertions(+), 11 deletions(-)

diff --git a/aie_kernels/aie2/mul.cc b/aie_kernels/aie2/mul.cc
index a028861c20..c5ed109332 100755
--- a/aie_kernels/aie2/mul.cc
+++ b/aie_kernels/aie2/mul.cc
@@ -8,10 +8,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// #define __AIENGINE__ 2
-// #define NOCPP
-// #define __AIEARCH__ 20
-
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
diff --git a/programming_examples/basic/passthrough_kernel/aie2.py b/programming_examples/basic/passthrough_kernel/aie2.py
index ecdfabaab9..ff03ab0bd8 100755
--- a/programming_examples/basic/passthrough_kernel/aie2.py
+++ b/programming_examples/basic/passthrough_kernel/aie2.py
@@ -55,8 +55,6 @@ def core_body():
                 of_in.release(ObjectFifoPort.Consume, 1)
                 of_out.release(ObjectFifoPort.Produce, 1)
 
-        #    print(ctx.module.operation.verify())
-        N2 = N // 2
         @runtime_sequence(vector_ty, vector_ty, vector_ty)
         def sequence(inTensor, outTensor, notUsed):
             if trace_size > 0:
@@ -72,14 +70,14 @@ def sequence(inTensor, outTensor, notUsed):
                 metadata=of_in,
                 bd_id=0,
                 mem=inTensor,
-                sizes=[1, 1, 1, N2],
+                sizes=[1, 1, 1, N],
                 issue_token=True,
             )
             npu_dma_memcpy_nd(
                 metadata=of_out,
                 bd_id=1,
                 mem=outTensor,
-                sizes=[1, 1, 1, N2],
+                sizes=[1, 1, 1, N],
             )
             dma_wait(of_in, of_out)
 
diff --git a/programming_examples/basic/passthrough_kernel/aie2_alt.py b/programming_examples/basic/passthrough_kernel/aie2_alt.py
index cd54b499b4..b2835ff997 100644
--- a/programming_examples/basic/passthrough_kernel/aie2_alt.py
+++ b/programming_examples/basic/passthrough_kernel/aie2_alt.py
@@ -57,7 +57,6 @@ def core_body():
 
         #    print(ctx.module.operation.verify())
 
-        N2 = N // 2
         @runtime_sequence(vector_ty, vector_ty, vector_ty)
         def sequence(inTensor, outTensor, notUsed):
             if trace_size > 0:
@@ -69,10 +68,10 @@ def sequence(inTensor, outTensor, notUsed):
                     offset=N,
                 )
             in_task = shim_dma_single_bd_task(
-                of_in, inTensor, sizes=[1, 1, 1, N2], issue_token=True
+                of_in, inTensor, sizes=[1, 1, 1, N], issue_token=True
             )
             out_task = shim_dma_single_bd_task(
-                of_out, outTensor, sizes=[1, 1, 1, N2], issue_token=True
+                of_out, outTensor, sizes=[1, 1, 1, N], issue_token=True
             )
 
             dma_start_task(in_task, out_task)

From 7434ab95b6f0429834005f53da151a2d66247c94 Mon Sep 17 00:00:00 2001
From: Joseph Melber <jmelber@xilinx.com>
Date: Wed, 4 Dec 2024 10:22:44 -0700
Subject: [PATCH 28/35] vector_scalar_mul stx

---
 .../basic/vector_scalar_mul/run_strix_makefile.lit     | 10 ++++++++++
 1 file changed, 10 insertions(+)
 create mode 100755 programming_examples/basic/vector_scalar_mul/run_strix_makefile.lit

diff --git a/programming_examples/basic/vector_scalar_mul/run_strix_makefile.lit b/programming_examples/basic/vector_scalar_mul/run_strix_makefile.lit
new file mode 100755
index 0000000000..0901bb542f
--- /dev/null
+++ b/programming_examples/basic/vector_scalar_mul/run_strix_makefile.lit
@@ -0,0 +1,10 @@
+// (c) Copyright 2024 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai, chess
+//
+// RUN: mkdir -p test_stx
+// RUN: cd test_stx
+// RUN: make -f %S/Makefile clean
+// RUN: make -f %S/Makefile device=npu2 
+// RUN: %run_on_2npu make -f %S/Makefile run device=npu2

From 9da29ee460beda717924d15b994a3bb01513d546 Mon Sep 17 00:00:00 2001
From: Joseph Melber <jmelber@xilinx.com>
Date: Wed, 4 Dec 2024 10:26:54 -0700
Subject: [PATCH 29/35] Use returns for lit checks

---
 .../basic/vector_scalar_mul/run_makefile.lit             | 9 ++++-----
 .../basic/vector_scalar_mul/run_makefile_alt.lit         | 5 ++---
 .../basic/vector_scalar_mul/run_makefile_chess.lit       | 9 ++++-----
 programming_examples/basic/vector_scalar_mul/test.py     | 4 ++--
 4 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/programming_examples/basic/vector_scalar_mul/run_makefile.lit b/programming_examples/basic/vector_scalar_mul/run_makefile.lit
index d298884111..244bff012a 100644
--- a/programming_examples/basic/vector_scalar_mul/run_makefile.lit
+++ b/programming_examples/basic/vector_scalar_mul/run_makefile.lit
@@ -7,9 +7,8 @@
 // RUN: cd test_peano
 // RUN: make -f %S/Makefile clean
 // RUN: env CHESS=false make -f %S/Makefile 
-// RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s
-// RUN: %run_on_npu make -f %S/Makefile run_py | FileCheck %s
+// RUN: %run_on_npu make -f %S/Makefile run 
+// RUN: %run_on_npu make -f %S/Makefile run_py 
 // RUN: make -f %S/Makefile clean
-// RUN: env CHESS=false %run_on_npu make -f %S/Makefile trace | FileCheck %s
-// RUN: env CHESS=false %run_on_npu make -f %S/Makefile trace_py | FileCheck %s
-// CHECK: PASS!
+// RUN: env CHESS=false %run_on_npu make -f %S/Makefile trace 
+// RUN: env CHESS=false %run_on_npu make -f %S/Makefile trace_py 
diff --git a/programming_examples/basic/vector_scalar_mul/run_makefile_alt.lit b/programming_examples/basic/vector_scalar_mul/run_makefile_alt.lit
index edfe402ec1..f0b5578ffa 100644
--- a/programming_examples/basic/vector_scalar_mul/run_makefile_alt.lit
+++ b/programming_examples/basic/vector_scalar_mul/run_makefile_alt.lit
@@ -7,6 +7,5 @@
 // RUN: cd test_alt
 // RUN: make -f %S/Makefile clean
 // RUN: env CHESS=true use_alt=1 make -f %S/Makefile 
-// RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s
-// CHECK: PASS!
-  
\ No newline at end of file
+// RUN: %run_on_npu make -f %S/Makefile run 
+  
diff --git a/programming_examples/basic/vector_scalar_mul/run_makefile_chess.lit b/programming_examples/basic/vector_scalar_mul/run_makefile_chess.lit
index 481b220165..da7102bcfc 100644
--- a/programming_examples/basic/vector_scalar_mul/run_makefile_chess.lit
+++ b/programming_examples/basic/vector_scalar_mul/run_makefile_chess.lit
@@ -7,9 +7,8 @@
 // RUN: cd test_chess
 // RUN: make -f %S/Makefile clean
 // RUN: env CHESS=true make -f %S/Makefile 
-// RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s
-// RUN: %run_on_npu make -f %S/Makefile run_py | FileCheck %s
+// RUN: %run_on_npu make -f %S/Makefile run
+// RUN: %run_on_npu make -f %S/Makefile run_py
 // RUN: make -f %S/Makefile clean
-// RUN: env CHESS=true %run_on_npu make -f %S/Makefile trace | FileCheck %s
-// RUN: env CHESS=true %run_on_npu make -f %S/Makefile trace_py | FileCheck %s
-// CHECK: PASS!
+// RUN: env CHESS=true %run_on_npu make -f %S/Makefile trace
+// RUN: env CHESS=true %run_on_npu make -f %S/Makefile trace_py
diff --git a/programming_examples/basic/vector_scalar_mul/test.py b/programming_examples/basic/vector_scalar_mul/test.py
index 9ffa7dab1d..c91b53307f 100644
--- a/programming_examples/basic/vector_scalar_mul/test.py
+++ b/programming_examples/basic/vector_scalar_mul/test.py
@@ -71,11 +71,11 @@ def main(opts):
 
     if not errors:
         print("\nPASS!\n")
-        exit(0)
+        sys.exit(0)
     else:
         print("\nError count: ", errors)
         print("\nFailed.\n")
-        exit(-1)
+        sys.exit(1)
 
 
 if __name__ == "__main__":

From ff3f4844856437e12ddd606569e0f7f26ccf7444 Mon Sep 17 00:00:00 2001
From: Joseph Melber <jmelber@xilinx.com>
Date: Wed, 4 Dec 2024 11:35:23 -0700
Subject: [PATCH 30/35] Add some vision examples

---
 .../vision/color_threshold/Makefile           |  14 +-
 .../color_threshold/aie2_colorThreshold.py    | 496 ++++++++---------
 .../color_threshold/run_strix_makefile.lit    |  10 +
 .../vision/edge_detect/Makefile               |  14 +-
 .../vision/edge_detect/aie2_edgeDetect.py     | 513 +++++++++---------
 .../vision/edge_detect/run_strix_makefile.lit |  10 +
 6 files changed, 554 insertions(+), 503 deletions(-)
 create mode 100755 programming_examples/vision/color_threshold/run_strix_makefile.lit
 create mode 100755 programming_examples/vision/edge_detect/run_strix_makefile.lit

diff --git a/programming_examples/vision/color_threshold/Makefile b/programming_examples/vision/color_threshold/Makefile
index a3dfaa8646..84b2d710f3 100644
--- a/programming_examples/vision/color_threshold/Makefile
+++ b/programming_examples/vision/color_threshold/Makefile
@@ -12,6 +12,7 @@ include ${srcdir}/../../makefile-common
 
 VPATH := ${srcdir}/../../../aie_kernels/aie2
 
+device = npu
 COLORTHRESHOLD_WIDTH  = 1920
 COLORTHRESHOLD_HEIGHT = 1080
 
@@ -33,17 +34,28 @@ mlir: build/aie2_${COLORTHRESHOLD_WIDTH}.mlir
 
 build/%.cc.o: %.cc
 	mkdir -p ${@D}
+ifeq ($(device),npu)
 	cd ${@D} && ${PEANO_INSTALL_DIR}/bin/clang++ ${PEANOWRAP2_FLAGS} -DBIT_WIDTH=8 -c $< -o ${@F}
+else ifeq ($(device),npu2)
+	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2P_FLAGS} -DBIT_WIDTH=8 -c $< -o ${@F}
+else
+	echo "Device type not supported" >&2 && exit 1
+endif
 	
 build/aie2_${COLORTHRESHOLD_WIDTH}.mlir: ${srcdir}/${aie_py_src}
 	mkdir -p ${@D}
-	python3 $< ${COLORTHRESHOLD_WIDTH} ${COLORTHRESHOLD_HEIGHT} > $@
+	python3 $< ${device} ${COLORTHRESHOLD_WIDTH} ${COLORTHRESHOLD_HEIGHT} > $@
 
 build/final_${COLORTHRESHOLD_WIDTH}.xclbin: build/aie2_${COLORTHRESHOLD_WIDTH}.mlir build/threshold.cc.o
 	mkdir -p ${@D}
+ifeq ($(device),npu)
 	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host --alloc-scheme=basic-sequential \
 		--no-xchesscc --no-xbridge \
 		--xclbin-name=${@F} --npu-insts-name=insts.txt $(<:%=../%)
+else
+	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host --alloc-scheme=basic-sequential \
+		--xclbin-name=${@F} --npu-insts-name=insts.txt $(<:%=../%)
+endif
 
 ${targetname}.exe: ${srcdir}/test.cpp
 	rm -rf _build
diff --git a/programming_examples/vision/color_threshold/aie2_colorThreshold.py b/programming_examples/vision/color_threshold/aie2_colorThreshold.py
index cbb5c2e631..c8eb7e8657 100644
--- a/programming_examples/vision/color_threshold/aie2_colorThreshold.py
+++ b/programming_examples/vision/color_threshold/aie2_colorThreshold.py
@@ -14,258 +14,262 @@
 from aie.helpers.util import np_ndarray_type_get_shape
 from aie.helpers.dialects.ext.scf import _for as range_
 
-width = 512
-height = 9
-if len(sys.argv) == 3:
-    width = int(sys.argv[1])
-    height = int(sys.argv[2])
 
-lineWidth = width
-lineWidthChannels = width * 4  # 4 channels
-tensorSize = width * height
-
-enableTrace = False
-traceSizeInBytes = 8192
-traceSizeInInt32s = traceSizeInBytes // 4
-
-
-def color_threshold():
-    with mlir_mod_ctx() as ctx:
-
-        @device(AIEDevice.npu1_1col)
-        def device_body():
-            line_channels_ty = np.ndarray[(lineWidthChannels,), np.dtype[np.uint8]]
-            line_ty = np.ndarray[(lineWidth,), np.dtype[np.uint8]]
-
-            # AIE Core Function declarations
-            thresholdLine = external_func(
-                "thresholdLine",
-                inputs=[line_ty, line_ty, np.int32, np.int16, np.int16, np.int8],
-            )
-
-            # Tile declarations
-            ShimTile = tile(0, 0)
-            MemTile = tile(0, 1)
-            ComputeTile2 = tile(0, 2)
-            ComputeTile3 = tile(0, 3)
-            ComputeTile4 = tile(0, 4)
-            ComputeTile5 = tile(0, 5)
+def color_threshold(dev, width, height):
+    lineWidth = width
+    lineWidthChannels = width * 4  # 4 channels
+    tensorSize = width * height
+
+    enableTrace = False
+    traceSizeInBytes = 8192
+    traceSizeInInt32s = traceSizeInBytes // 4
+
+    @device(dev)
+    def device_body():
+        line_channels_ty = np.ndarray[(lineWidthChannels,), np.dtype[np.uint8]]
+        line_ty = np.ndarray[(lineWidth,), np.dtype[np.uint8]]
+
+        # AIE Core Function declarations
+        thresholdLine = external_func(
+            "thresholdLine",
+            inputs=[line_ty, line_ty, np.int32, np.int16, np.int16, np.int8],
+        )
+
+        # Tile declarations
+        ShimTile = tile(0, 0)
+        MemTile = tile(0, 1)
+        ComputeTile2 = tile(0, 2)
+        ComputeTile3 = tile(0, 3)
+        ComputeTile4 = tile(0, 4)
+        ComputeTile5 = tile(0, 5)
+
+        # AIE-array data movement with object fifos
+
+        # Input RGBA broadcast + memtile for skip
+        inOOB_L3L2 = object_fifo(
+            "inOOB_L3L2", ShimTile, MemTile, 2, line_channels_ty
+        )
+        inOOB_L2L1_0 = object_fifo(
+            "inOOB_L2L1_0", MemTile, ComputeTile2, 2, line_ty
+        )
+        inOOB_L2L1_1 = object_fifo(
+            "inOOB_L2L1_1", MemTile, ComputeTile3, 2, line_ty
+        )
+        inOOB_L2L1_2 = object_fifo(
+            "inOOB_L2L1_2", MemTile, ComputeTile4, 2, line_ty
+        )
+        inOOB_L2L1_3 = object_fifo(
+            "inOOB_L2L1_3", MemTile, ComputeTile5, 2, line_ty
+        )
+        of_offsets = [
+            np.prod(np_ndarray_type_get_shape(line_ty)) * i for i in range(4)
+        ]
+        object_fifo_link(
+            inOOB_L3L2,
+            [inOOB_L2L1_0, inOOB_L2L1_1, inOOB_L2L1_2, inOOB_L2L1_3],
+            [],
+            of_offsets,
+        )
+
+        # Output RGBA
+        outOOB_L2L3 = object_fifo(
+            "outOOB_L2L3", MemTile, ShimTile, 2, line_channels_ty
+        )
+        outOOB_L1L2_0 = object_fifo(
+            "outOOB_L1L2_0", ComputeTile2, MemTile, 2, line_ty
+        )
+        outOOB_L1L2_1 = object_fifo(
+            "outOOB_L1L2_1", ComputeTile3, MemTile, 2, line_ty
+        )
+        outOOB_L1L2_2 = object_fifo(
+            "outOOB_L1L2_2", ComputeTile4, MemTile, 2, line_ty
+        )
+        outOOB_L1L2_3 = object_fifo(
+            "outOOB_L1L2_3", ComputeTile5, MemTile, 2, line_ty
+        )
+        object_fifo_link(
+            [outOOB_L1L2_0, outOOB_L1L2_1, outOOB_L1L2_2, outOOB_L1L2_3],
+            outOOB_L2L3,
+            of_offsets,
+            [],
+        )
+
+        # Runtime parameters
+        rtpComputeTile2 = buffer(
+            ComputeTile2,
+            np.ndarray[(16,), np.dtype[np.int32]],
+            "rtpComputeTile2",
+            use_write_rtp=True,
+        )
+        rtpComputeTile3 = buffer(
+            ComputeTile3,
+            np.ndarray[(16,), np.dtype[np.int32]],
+            "rtpComputeTile3",
+            use_write_rtp=True,
+        )
+        rtpComputeTile4 = buffer(
+            ComputeTile4,
+            np.ndarray[(16,), np.dtype[np.int32]],
+            "rtpComputeTile4",
+            use_write_rtp=True,
+        )
+        rtpComputeTile5 = buffer(
+            ComputeTile5,
+            np.ndarray[(16,), np.dtype[np.int32]],
+            "rtpComputeTile5",
+            use_write_rtp=True,
+        )
+
+        # Set up compute tiles
+
+        # Compute tile 2
+        @core(ComputeTile2, "threshold.cc.o")
+        def core_body():
+            for _ in range_(sys.maxsize):
+                elemIn = inOOB_L2L1_0.acquire(ObjectFifoPort.Consume, 1)
+                elemOut = outOOB_L1L2_0.acquire(ObjectFifoPort.Produce, 1)
+
+                # RTPs written from the instruction stream must be read right before the kernel
+                # after the ObjectFIFO acquires
+                thresholdValue = arith.trunci(T.i16(), rtpComputeTile2[0])
+                maxValue = arith.trunci(T.i16(), rtpComputeTile2[1])
+                thresholdType = arith.trunci(T.i8(), rtpComputeTile2[2])
+                thresholdLine(
+                    elemIn,
+                    elemOut,
+                    lineWidth,
+                    thresholdValue,
+                    maxValue,
+                    thresholdType,
+                )
 
-            # AIE-array data movement with object fifos
+                inOOB_L2L1_0.release(ObjectFifoPort.Consume, 1)
+                outOOB_L1L2_0.release(ObjectFifoPort.Produce, 1)
+
+        # Compute tile 3
+        @core(ComputeTile3, "threshold.cc.o")
+        def core_body():
+            for _ in range_(sys.maxsize):
+                elemIn = inOOB_L2L1_1.acquire(ObjectFifoPort.Consume, 1)
+                elemOut = outOOB_L1L2_1.acquire(ObjectFifoPort.Produce, 1)
+                # RTPs written from the instruction stream must be read right before the kernel
+                # after the ObjectFIFO acquires
+                thresholdValue = arith.trunci(T.i16(), rtpComputeTile3[0])
+                maxValue = arith.trunci(T.i16(), rtpComputeTile3[1])
+                thresholdType = arith.trunci(T.i8(), rtpComputeTile3[2])
+                thresholdLine(
+                    elemIn,
+                    elemOut,
+                    lineWidth,
+                    thresholdValue,
+                    maxValue,
+                    thresholdType,
+                )
 
-            # Input RGBA broadcast + memtile for skip
-            inOOB_L3L2 = object_fifo(
-                "inOOB_L3L2", ShimTile, MemTile, 2, line_channels_ty
-            )
-            inOOB_L2L1_0 = object_fifo(
-                "inOOB_L2L1_0", MemTile, ComputeTile2, 2, line_ty
-            )
-            inOOB_L2L1_1 = object_fifo(
-                "inOOB_L2L1_1", MemTile, ComputeTile3, 2, line_ty
-            )
-            inOOB_L2L1_2 = object_fifo(
-                "inOOB_L2L1_2", MemTile, ComputeTile4, 2, line_ty
-            )
-            inOOB_L2L1_3 = object_fifo(
-                "inOOB_L2L1_3", MemTile, ComputeTile5, 2, line_ty
-            )
-            of_offsets = [
-                np.prod(np_ndarray_type_get_shape(line_ty)) * i for i in range(4)
-            ]
-            object_fifo_link(
-                inOOB_L3L2,
-                [inOOB_L2L1_0, inOOB_L2L1_1, inOOB_L2L1_2, inOOB_L2L1_3],
-                [],
-                of_offsets,
-            )
+                inOOB_L2L1_1.release(ObjectFifoPort.Consume, 1)
+                outOOB_L1L2_1.release(ObjectFifoPort.Produce, 1)
+
+        # Compute tile 4
+        @core(ComputeTile4, "threshold.cc.o")
+        def core_body():
+            for _ in range_(sys.maxsize):
+                elemIn = inOOB_L2L1_2.acquire(ObjectFifoPort.Consume, 1)
+                elemOut = outOOB_L1L2_2.acquire(ObjectFifoPort.Produce, 1)
+
+                # RTPs written from the instruction stream must be read right before the kernel
+                # after the ObjectFIFO acquires
+                thresholdValue = arith.trunci(T.i16(), rtpComputeTile4[0])
+                maxValue = arith.trunci(T.i16(), rtpComputeTile4[1])
+                thresholdType = arith.trunci(T.i8(), rtpComputeTile4[2])
+                thresholdLine(
+                    elemIn,
+                    elemOut,
+                    lineWidth,
+                    thresholdValue,
+                    maxValue,
+                    thresholdType,
+                )
 
-            # Output RGBA
-            outOOB_L2L3 = object_fifo(
-                "outOOB_L2L3", MemTile, ShimTile, 2, line_channels_ty
-            )
-            outOOB_L1L2_0 = object_fifo(
-                "outOOB_L1L2_0", ComputeTile2, MemTile, 2, line_ty
-            )
-            outOOB_L1L2_1 = object_fifo(
-                "outOOB_L1L2_1", ComputeTile3, MemTile, 2, line_ty
-            )
-            outOOB_L1L2_2 = object_fifo(
-                "outOOB_L1L2_2", ComputeTile4, MemTile, 2, line_ty
-            )
-            outOOB_L1L2_3 = object_fifo(
-                "outOOB_L1L2_3", ComputeTile5, MemTile, 2, line_ty
-            )
-            object_fifo_link(
-                [outOOB_L1L2_0, outOOB_L1L2_1, outOOB_L1L2_2, outOOB_L1L2_3],
-                outOOB_L2L3,
-                of_offsets,
-                [],
-            )
+                inOOB_L2L1_2.release(ObjectFifoPort.Consume, 1)
+                outOOB_L1L2_2.release(ObjectFifoPort.Produce, 1)
+
+        # Compute tile 5
+        @core(ComputeTile5, "threshold.cc.o")
+        def core_body():
+            for _ in range_(sys.maxsize):
+                elemIn = inOOB_L2L1_3.acquire(ObjectFifoPort.Consume, 1)
+                elemOut = outOOB_L1L2_3.acquire(ObjectFifoPort.Produce, 1)
+
+                # RTPs written from the instruction stream must be read right before the kernel
+                # after the ObjectFIFO acquires
+                thresholdValue = arith.trunci(T.i16(), rtpComputeTile5[0])
+                maxValue = arith.trunci(T.i16(), rtpComputeTile5[1])
+                thresholdType = arith.trunci(T.i8(), rtpComputeTile5[2])
+                thresholdLine(
+                    elemIn,
+                    elemOut,
+                    lineWidth,
+                    thresholdValue,
+                    maxValue,
+                    thresholdType,
+                )
 
-            # Runtime parameters
-            rtpComputeTile2 = buffer(
-                ComputeTile2,
-                np.ndarray[(16,), np.dtype[np.int32]],
-                "rtpComputeTile2",
-                use_write_rtp=True,
-            )
-            rtpComputeTile3 = buffer(
-                ComputeTile3,
-                np.ndarray[(16,), np.dtype[np.int32]],
-                "rtpComputeTile3",
-                use_write_rtp=True,
+                inOOB_L2L1_3.release(ObjectFifoPort.Consume, 1)
+                outOOB_L1L2_3.release(ObjectFifoPort.Produce, 1)
+
+        # To/from AIE-array data movement
+        @runtime_sequence(
+            np.ndarray[(tensorSize,), np.dtype[np.int8]],
+            np.ndarray[(32,), np.dtype[np.int32]],  # not used
+            np.ndarray[(tensorSize,), np.dtype[np.int8]],
+        )
+        def sequence(inTensor, notUsed, outTensor):
+            # thresholdValue, maxValue, thresholdType
+            rtpComputeTile2[0] = 50
+            rtpComputeTile2[1] = 255
+            rtpComputeTile2[2] = 0
+
+            rtpComputeTile3[0] = 50
+            rtpComputeTile3[1] = 255
+            rtpComputeTile3[2] = 0
+
+            rtpComputeTile4[0] = 50
+            rtpComputeTile4[1] = 255
+            rtpComputeTile4[2] = 0
+
+            rtpComputeTile5[0] = 50
+            rtpComputeTile5[1] = 255
+            rtpComputeTile5[2] = 0
+
+            npu_dma_memcpy_nd(
+                metadata=inOOB_L3L2,
+                bd_id=1,
+                mem=inTensor,
+                sizes=[1, 1, 1, tensorSize],
+                issue_token=True,
             )
-            rtpComputeTile4 = buffer(
-                ComputeTile4,
-                np.ndarray[(16,), np.dtype[np.int32]],
-                "rtpComputeTile4",
-                use_write_rtp=True,
+            npu_dma_memcpy_nd(
+                metadata=outOOB_L2L3,
+                bd_id=0,
+                mem=outTensor,
+                sizes=[1, 1, 1, tensorSize],
             )
-            rtpComputeTile5 = buffer(
-                ComputeTile5,
-                np.ndarray[(16,), np.dtype[np.int32]],
-                "rtpComputeTile5",
-                use_write_rtp=True,
-            )
-
-            # Set up compute tiles
-
-            # Compute tile 2
-            @core(ComputeTile2, "threshold.cc.o")
-            def core_body():
-                for _ in range_(sys.maxsize):
-                    elemIn = inOOB_L2L1_0.acquire(ObjectFifoPort.Consume, 1)
-                    elemOut = outOOB_L1L2_0.acquire(ObjectFifoPort.Produce, 1)
-
-                    # RTPs written from the instruction stream must be read right before the kernel
-                    # after the ObjectFIFO acquires
-                    thresholdValue = arith.trunci(T.i16(), rtpComputeTile2[0])
-                    maxValue = arith.trunci(T.i16(), rtpComputeTile2[1])
-                    thresholdType = arith.trunci(T.i8(), rtpComputeTile2[2])
-                    thresholdLine(
-                        elemIn,
-                        elemOut,
-                        lineWidth,
-                        thresholdValue,
-                        maxValue,
-                        thresholdType,
-                    )
-
-                    inOOB_L2L1_0.release(ObjectFifoPort.Consume, 1)
-                    outOOB_L1L2_0.release(ObjectFifoPort.Produce, 1)
-
-            # Compute tile 3
-            @core(ComputeTile3, "threshold.cc.o")
-            def core_body():
-                for _ in range_(sys.maxsize):
-                    elemIn = inOOB_L2L1_1.acquire(ObjectFifoPort.Consume, 1)
-                    elemOut = outOOB_L1L2_1.acquire(ObjectFifoPort.Produce, 1)
-                    # RTPs written from the instruction stream must be read right before the kernel
-                    # after the ObjectFIFO acquires
-                    thresholdValue = arith.trunci(T.i16(), rtpComputeTile3[0])
-                    maxValue = arith.trunci(T.i16(), rtpComputeTile3[1])
-                    thresholdType = arith.trunci(T.i8(), rtpComputeTile3[2])
-                    thresholdLine(
-                        elemIn,
-                        elemOut,
-                        lineWidth,
-                        thresholdValue,
-                        maxValue,
-                        thresholdType,
-                    )
-
-                    inOOB_L2L1_1.release(ObjectFifoPort.Consume, 1)
-                    outOOB_L1L2_1.release(ObjectFifoPort.Produce, 1)
-
-            # Compute tile 4
-            @core(ComputeTile4, "threshold.cc.o")
-            def core_body():
-                for _ in range_(sys.maxsize):
-                    elemIn = inOOB_L2L1_2.acquire(ObjectFifoPort.Consume, 1)
-                    elemOut = outOOB_L1L2_2.acquire(ObjectFifoPort.Produce, 1)
-
-                    # RTPs written from the instruction stream must be read right before the kernel
-                    # after the ObjectFIFO acquires
-                    thresholdValue = arith.trunci(T.i16(), rtpComputeTile4[0])
-                    maxValue = arith.trunci(T.i16(), rtpComputeTile4[1])
-                    thresholdType = arith.trunci(T.i8(), rtpComputeTile4[2])
-                    thresholdLine(
-                        elemIn,
-                        elemOut,
-                        lineWidth,
-                        thresholdValue,
-                        maxValue,
-                        thresholdType,
-                    )
-
-                    inOOB_L2L1_2.release(ObjectFifoPort.Consume, 1)
-                    outOOB_L1L2_2.release(ObjectFifoPort.Produce, 1)
-
-            # Compute tile 5
-            @core(ComputeTile5, "threshold.cc.o")
-            def core_body():
-                for _ in range_(sys.maxsize):
-                    elemIn = inOOB_L2L1_3.acquire(ObjectFifoPort.Consume, 1)
-                    elemOut = outOOB_L1L2_3.acquire(ObjectFifoPort.Produce, 1)
-
-                    # RTPs written from the instruction stream must be read right before the kernel
-                    # after the ObjectFIFO acquires
-                    thresholdValue = arith.trunci(T.i16(), rtpComputeTile5[0])
-                    maxValue = arith.trunci(T.i16(), rtpComputeTile5[1])
-                    thresholdType = arith.trunci(T.i8(), rtpComputeTile5[2])
-                    thresholdLine(
-                        elemIn,
-                        elemOut,
-                        lineWidth,
-                        thresholdValue,
-                        maxValue,
-                        thresholdType,
-                    )
-
-                    inOOB_L2L1_3.release(ObjectFifoPort.Consume, 1)
-                    outOOB_L1L2_3.release(ObjectFifoPort.Produce, 1)
-
-            # To/from AIE-array data movement
-            @runtime_sequence(
-                np.ndarray[(tensorSize,), np.dtype[np.int8]],
-                np.ndarray[(32,), np.dtype[np.int32]],  # not used
-                np.ndarray[(tensorSize,), np.dtype[np.int8]],
-            )
-            def sequence(inTensor, notUsed, outTensor):
-                # thresholdValue, maxValue, thresholdType
-                rtpComputeTile2[0] = 50
-                rtpComputeTile2[1] = 255
-                rtpComputeTile2[2] = 0
-
-                rtpComputeTile3[0] = 50
-                rtpComputeTile3[1] = 255
-                rtpComputeTile3[2] = 0
-
-                rtpComputeTile4[0] = 50
-                rtpComputeTile4[1] = 255
-                rtpComputeTile4[2] = 0
-
-                rtpComputeTile5[0] = 50
-                rtpComputeTile5[1] = 255
-                rtpComputeTile5[2] = 0
-
-                npu_dma_memcpy_nd(
-                    metadata=inOOB_L3L2,
-                    bd_id=1,
-                    mem=inTensor,
-                    sizes=[1, 1, 1, tensorSize],
-                    issue_token=True,
-                )
-                npu_dma_memcpy_nd(
-                    metadata=outOOB_L2L3,
-                    bd_id=0,
-                    mem=outTensor,
-                    sizes=[1, 1, 1, tensorSize],
-                )
-                dma_wait(inOOB_L3L2, outOOB_L2L3)
-
-    # print(ctx.module.operation.verify())
+            dma_wait(inOOB_L3L2, outOOB_L2L3)
+
+try:
+    device_name = str(sys.argv[1])
+    if device_name == "npu":
+        dev = AIEDevice.npu1_1col
+    elif device_name == "npu2":
+        dev = AIEDevice.npu2
+    else:
+        raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1]))
+    width = 512 if (len(sys.argv) != 4) else int(sys.argv[2])
+    height = 9 if (len(sys.argv) != 4) else int(sys.argv[3])
+except (IndexError, ValueError) as err:
+    # Exit non-zero via stderr: stdout is the generated MLIR, and dev would be unset.
+    sys.exit("Argument has inappropriate value: {}".format(err))
+with mlir_mod_ctx() as ctx:
+    color_threshold(dev, width, height)
     print(ctx.module)
 
-
-color_threshold()
diff --git a/programming_examples/vision/color_threshold/run_strix_makefile.lit b/programming_examples/vision/color_threshold/run_strix_makefile.lit
new file mode 100755
index 0000000000..0901bb542f
--- /dev/null
+++ b/programming_examples/vision/color_threshold/run_strix_makefile.lit
@@ -0,0 +1,10 @@
+// (c) Copyright 2024 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai, chess
+// Clean, build, then execute the example via its Makefile with device=npu2.
+// RUN: mkdir -p test_stx
+// RUN: cd test_stx
+// RUN: make -f %S/Makefile clean
+// RUN: make -f %S/Makefile device=npu2 
+// RUN: %run_on_2npu make -f %S/Makefile run device=npu2
diff --git a/programming_examples/vision/edge_detect/Makefile b/programming_examples/vision/edge_detect/Makefile
index e1ed21e0ae..2f6159bd3d 100755
--- a/programming_examples/vision/edge_detect/Makefile
+++ b/programming_examples/vision/edge_detect/Makefile
@@ -12,6 +12,7 @@ include ${srcdir}/../../makefile-common
 
 VPATH := ${srcdir}/../../../aie_kernels/aie2
 
+device = npu
 EDGEDETECT_WIDTH = 1920
 EDGEDETECT_HEIGHT = 1080
 
@@ -36,7 +37,13 @@ mlir: build/aie2_lineBased_8b_${EDGEDETECT_WIDTH}.mlir
 
 build/%.cc.o: %.cc
 	mkdir -p ${@D}
+ifeq ($(device),npu)
 	cd ${@D} && ${PEANO_INSTALL_DIR}/bin/clang++ ${PEANOWRAP2_FLAGS} -DBIT_WIDTH=8 -c $< -o ${@F}
+else ifeq ($(device),npu2)
+	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2P_FLAGS} -DBIT_WIDTH=8 -c $< -o ${@F}
+else # unknown device: fail the recipe instead of "succeeding" without an object file
+	@echo "Device type not supported: $(device)" >&2; exit 1
+endif
 
 build/combined_gray2rgba_addWeighted.a: build/gray2rgba.cc.o build/addWeighted.cc.o
 	mkdir -p ${@D}
@@ -44,13 +51,18 @@ build/combined_gray2rgba_addWeighted.a: build/gray2rgba.cc.o build/addWeighted.c
 
 build/aie2_lineBased_8b_${EDGEDETECT_WIDTH}.mlir: ${srcdir}/${aie_py_src}
 	mkdir -p ${@D}
-	python3 $< ${EDGEDETECT_WIDTH} ${EDGEDETECT_HEIGHT} > $@
+	python3 $< ${device} ${EDGEDETECT_WIDTH} ${EDGEDETECT_HEIGHT} > $@
 
 build/final_${EDGEDETECT_WIDTH}.xclbin: build/aie2_lineBased_8b_${EDGEDETECT_WIDTH}.mlir build/rgba2gray.cc.o build/gray2rgba.cc.o build/filter2d.cc.o build/threshold.cc.o build/addWeighted.cc.o build/combined_gray2rgba_addWeighted.a
 	mkdir -p ${@D}
+ifeq ($(device),npu)
 	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host --alloc-scheme=basic-sequential \
 		--no-xchesscc --no-xbridge \
 		--xclbin-name=${@F} --npu-insts-name=insts.txt $(<:%=../%)
+else # every other device value: same aiecc.py call minus --no-xchesscc --no-xbridge
+	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host --alloc-scheme=basic-sequential \
+		--xclbin-name=${@F} --npu-insts-name=insts.txt $(<:%=../%)
+endif # device
 
 ${targetname}.exe: ${srcdir}/test.cpp
 	rm -rf _build
diff --git a/programming_examples/vision/edge_detect/aie2_edgeDetect.py b/programming_examples/vision/edge_detect/aie2_edgeDetect.py
index 59d7c030f2..23565c8b47 100644
--- a/programming_examples/vision/edge_detect/aie2_edgeDetect.py
+++ b/programming_examples/vision/edge_detect/aie2_edgeDetect.py
@@ -12,278 +12,281 @@
 from aie.helpers.dialects.ext.scf import _for as range_
 from aie.extras.context import mlir_mod_ctx
 
-width = 64
-height = 36
-if len(sys.argv) == 3:
-    width = int(sys.argv[1])
-    height = int(sys.argv[2])
 
-heightMinus1 = height - 1
-lineWidth = width
-lineWidthInBytes = width * 4
-tensorSize = width * height * 4  # 4 channels
-
-enableTrace = False
-traceSizeInBytes = 8192
-traceSizeInInt32s = traceSizeInBytes // 4
-
-
-def edge_detect():
-    with mlir_mod_ctx() as ctx:
-
-        @device(AIEDevice.npu1_1col)
-        def device_body():
-            line_bytes_ty = np.ndarray[(lineWidthInBytes,), np.dtype[np.uint8]]
-            line_ty = np.ndarray[(lineWidth,), np.dtype[np.uint8]]
-            tensor_3x3_ty = np.ndarray[(3, 3), np.dtype[np.int16]]
-
-            tensor_ty = np.ndarray[(tensorSize,), np.dtype[np.int8]]
-            tensor_16x16_ty = np.ndarray[(16, 16), np.dtype[np.int32]]
-
-            # AIE Core Function declarations
-            rgba2gray_line = external_func(
-                "rgba2grayLine", inputs=[line_bytes_ty, line_ty, np.int32]
-            )
-            filter2d_line = external_func(
-                "filter2dLine",
-                inputs=[line_ty, line_ty, line_ty, line_ty, np.int32, tensor_3x3_ty],
-            )
-            threshold_line = external_func(
-                "thresholdLine",
-                inputs=[line_ty, line_ty, np.int32, np.int16, np.int16, np.int8],
-            )
-            gray2rgba_line = external_func(
-                "gray2rgbaLine", inputs=[line_ty, line_bytes_ty, np.int32]
-            )
-            add_weighted_line = external_func(
-                "addWeightedLine",
-                inputs=[
-                    line_bytes_ty,
-                    line_bytes_ty,
-                    line_bytes_ty,
-                    np.int32,
-                    np.int16,
-                    np.int16,
-                    np.int8,
-                ],
-            )
-
-            # Tile declarations
-            ShimTile = tile(0, 0)
-            MemTile = tile(0, 1)
-            ComputeTile2 = tile(0, 2)
-            ComputeTile3 = tile(0, 3)
-            ComputeTile4 = tile(0, 4)
-            ComputeTile5 = tile(0, 5)
-
-            # AIE-array data movement with object fifos
-            # Input
-            inOF_L3L2 = object_fifo(
-                "inOF_L3L2",
-                ShimTile,
-                [ComputeTile2, MemTile],
-                [2, 2, 7],
+def edge_detect(dev, width, height):
+    heightMinus1 = height - 1
+    lineWidth = width
+    lineWidthInBytes = width * 4
+    tensorSize = width * height * 4  # 4 channels
+
+    enableTrace = False
+    traceSizeInBytes = 8192
+    traceSizeInInt32s = traceSizeInBytes // 4
+
+    @device(dev)
+    def device_body():
+        line_bytes_ty = np.ndarray[(lineWidthInBytes,), np.dtype[np.uint8]]
+        line_ty = np.ndarray[(lineWidth,), np.dtype[np.uint8]]
+        tensor_3x3_ty = np.ndarray[(3, 3), np.dtype[np.int16]]
+
+        tensor_ty = np.ndarray[(tensorSize,), np.dtype[np.int8]]
+        tensor_16x16_ty = np.ndarray[(16, 16), np.dtype[np.int32]]
+
+        # AIE Core Function declarations
+        rgba2gray_line = external_func(
+            "rgba2grayLine", inputs=[line_bytes_ty, line_ty, np.int32]
+        )
+        filter2d_line = external_func(
+            "filter2dLine",
+            inputs=[line_ty, line_ty, line_ty, line_ty, np.int32, tensor_3x3_ty],
+        )
+        threshold_line = external_func(
+            "thresholdLine",
+            inputs=[line_ty, line_ty, np.int32, np.int16, np.int16, np.int8],
+        )
+        gray2rgba_line = external_func(
+            "gray2rgbaLine", inputs=[line_ty, line_bytes_ty, np.int32]
+        )
+        add_weighted_line = external_func(
+            "addWeightedLine",
+            inputs=[
                 line_bytes_ty,
-            )
-            inOF_L2L1 = object_fifo(
-                "inOF_L2L1",
-                MemTile,
-                ComputeTile5,
-                7,
-                line_bytes_ty,
-            )
-            object_fifo_link(inOF_L3L2, inOF_L2L1)
-
-            # Output
-            outOF_L2L3 = object_fifo(
-                "outOF_L2L3",
-                MemTile,
-                ShimTile,
-                2,
                 line_bytes_ty,
-            )
-            outOF_L1L2 = object_fifo(
-                "outOF_L1L2",
-                ComputeTile5,
-                MemTile,
-                2,
                 line_bytes_ty,
+                np.int32,
+                np.int16,
+                np.int16,
+                np.int8,
+            ],
+        )
+
+        # Tile declarations
+        ShimTile = tile(0, 0)
+        MemTile = tile(0, 1)
+        ComputeTile2 = tile(0, 2)
+        ComputeTile3 = tile(0, 3)
+        ComputeTile4 = tile(0, 4)
+        ComputeTile5 = tile(0, 5)
+
+        # AIE-array data movement with object fifos
+        # Input
+        inOF_L3L2 = object_fifo(
+            "inOF_L3L2",
+            ShimTile,
+            [ComputeTile2, MemTile],
+            [2, 2, 7],
+            line_bytes_ty,
+        )
+        inOF_L2L1 = object_fifo(
+            "inOF_L2L1",
+            MemTile,
+            ComputeTile5,
+            7,
+            line_bytes_ty,
+        )
+        object_fifo_link(inOF_L3L2, inOF_L2L1)
+
+        # Output
+        outOF_L2L3 = object_fifo(
+            "outOF_L2L3",
+            MemTile,
+            ShimTile,
+            2,
+            line_bytes_ty,
+        )
+        outOF_L1L2 = object_fifo(
+            "outOF_L1L2",
+            ComputeTile5,
+            MemTile,
+            2,
+            line_bytes_ty,
+        )
+        object_fifo_link(outOF_L1L2, outOF_L2L3)
+
+        # Intermediate
+        OF_2to3 = object_fifo(
+            "OF_2to3",
+            ComputeTile2,
+            ComputeTile3,
+            4,
+            line_ty,
+        )
+        OF_3to4 = object_fifo(
+            "OF_3to4",
+            ComputeTile3,
+            ComputeTile4,
+            2,
+            line_ty,
+        )
+        OF_4to5 = object_fifo(
+            "OF_4to5",
+            ComputeTile4,
+            ComputeTile5,
+            2,
+            line_ty,
+        )
+        OF_5to5 = object_fifo(
+            "OF_5to5",
+            ComputeTile5,
+            ComputeTile5,
+            1,
+            line_bytes_ty,
+        )
+
+        # Set up compute tiles
+
+        # Compute tile 2: each iteration runs rgba2gray_line on one inOF_L3L2 element into one OF_2to3 element
+        @core(ComputeTile2, "rgba2gray.cc.o")
+        def core_body():
+            for _ in range_(sys.maxsize):
+                elem_in = inOF_L3L2.acquire(ObjectFifoPort.Consume, 1)
+                elem_out = OF_2to3.acquire(ObjectFifoPort.Produce, 1)
+
+                rgba2gray_line(elem_in, elem_out, lineWidth)
+
+                inOF_L3L2.release(ObjectFifoPort.Consume, 1)
+                OF_2to3.release(ObjectFifoPort.Produce, 1)
+
+        # Compute tile 3
+        @core(ComputeTile3, "filter2d.cc.o")
+        def core_body():
+            v0 = 0
+            v1 = 4096
+            v_minus4 = -16384
+            initial_value = np.array(
+                [[v0, v1, v0], [v1, v_minus4, v1], [v0, v1, v0]], dtype=np.int16
             )
-            object_fifo_link(outOF_L1L2, outOF_L2L3)
-
-            # Intermediate
-            OF_2to3 = object_fifo(
-                "OF_2to3",
-                ComputeTile2,
+            kernel = buffer(
                 ComputeTile3,
-                4,
-                line_ty,
-            )
-            OF_3to4 = object_fifo(
-                "OF_3to4",
-                ComputeTile3,
-                ComputeTile4,
-                2,
-                line_ty,
-            )
-            OF_4to5 = object_fifo(
-                "OF_4to5",
-                ComputeTile4,
-                ComputeTile5,
-                2,
-                line_ty,
+                np.ndarray[(3, 3), np.dtype[np.int16]],
+                "kernel",
+                initial_value=initial_value,
             )
-            OF_5to5 = object_fifo(
-                "OF_5to5",
-                ComputeTile5,
-                ComputeTile5,
-                1,
-                line_bytes_ty,
-            )
-
-            # Set up compute tiles
-
-            # Compute tile 2
-            @core(ComputeTile2, "rgba2gray.cc.o")
-            def core_body():
-                for _ in range_(sys.maxsize):
-                    elem_in = inOF_L3L2.acquire(ObjectFifoPort.Consume, 1)
-                    elem_out = OF_2to3.acquire(ObjectFifoPort.Produce, 1)
-
-                    rgba2gray_line(elem_in, elem_out, lineWidth)
 
-                    inOF_L3L2.release(ObjectFifoPort.Consume, 1)
-                    OF_2to3.release(ObjectFifoPort.Produce, 1)
-
-            # Compute tile 3
-            @core(ComputeTile3, "filter2d.cc.o")
-            def core_body():
-                v0 = 0
-                v1 = 4096
-                v_minus4 = -16384
-                initial_value = np.array(
-                    [[v0, v1, v0], [v1, v_minus4, v1], [v0, v1, v0]], dtype=np.int16
-                )
-                kernel = buffer(
-                    ComputeTile3,
-                    np.ndarray[(3, 3), np.dtype[np.int16]],
-                    "kernel",
-                    initial_value=initial_value,
+            for _ in range_(sys.maxsize):
+                # Preamble : Top Border
+                elems_in_pre = OF_2to3.acquire(ObjectFifoPort.Consume, 2)
+                elem_pre_out = OF_3to4.acquire(ObjectFifoPort.Produce, 1)
+                filter2d_line(
+                    elems_in_pre[0],
+                    elems_in_pre[0],
+                    elems_in_pre[1],
+                    elem_pre_out,
+                    lineWidth,
+                    kernel,
                 )
+                OF_3to4.release(ObjectFifoPort.Produce, 1)
 
-                for _ in range_(sys.maxsize):
-                    # Preamble : Top Border
-                    elems_in_pre = OF_2to3.acquire(ObjectFifoPort.Consume, 2)
-                    elem_pre_out = OF_3to4.acquire(ObjectFifoPort.Produce, 1)
+                # Steady State : Middle
+                for _ in range_(1, heightMinus1):
+                    elems_in = OF_2to3.acquire(ObjectFifoPort.Consume, 3)
+                    elem_out = OF_3to4.acquire(ObjectFifoPort.Produce, 1)
                     filter2d_line(
-                        elems_in_pre[0],
-                        elems_in_pre[0],
-                        elems_in_pre[1],
-                        elem_pre_out,
+                        elems_in[0],
+                        elems_in[1],
+                        elems_in[2],
+                        elem_out,
                         lineWidth,
                         kernel,
                     )
+                    OF_2to3.release(ObjectFifoPort.Consume, 1)
                     OF_3to4.release(ObjectFifoPort.Produce, 1)
 
-                    # Steady State : Middle
-                    for _ in range_(1, heightMinus1):
-                        elems_in = OF_2to3.acquire(ObjectFifoPort.Consume, 3)
-                        elem_out = OF_3to4.acquire(ObjectFifoPort.Produce, 1)
-                        filter2d_line(
-                            elems_in[0],
-                            elems_in[1],
-                            elems_in[2],
-                            elem_out,
-                            lineWidth,
-                            kernel,
-                        )
-                        OF_2to3.release(ObjectFifoPort.Consume, 1)
-                        OF_3to4.release(ObjectFifoPort.Produce, 1)
-
-                    # Postamble : Bottom Border
-                    elems_in_post = OF_2to3.acquire(ObjectFifoPort.Consume, 2)
-                    elem_post_out = OF_3to4.acquire(ObjectFifoPort.Produce, 1)
-                    filter2d_line(
-                        elems_in_post[0],
-                        elems_in_post[1],
-                        elems_in_post[1],
-                        elem_post_out,
-                        lineWidth,
-                        kernel,
-                    )
-                    OF_2to3.release(ObjectFifoPort.Consume, 2)
-                    OF_3to4.release(ObjectFifoPort.Produce, 1)
-
-            # Compute tile 4
-            @core(ComputeTile4, "threshold.cc.o")
-            def core_body():
-                v_thr = 10
-                v_max = 255
-                v_typ = 0
-
-                for _ in range_(sys.maxsize):
-                    elem_in = OF_3to4.acquire(ObjectFifoPort.Consume, 1)
-                    elem_out = OF_4to5.acquire(ObjectFifoPort.Produce, 1)
-
-                    threshold_line(elem_in, elem_out, lineWidth, v_thr, v_max, v_typ)
-
-                    OF_3to4.release(ObjectFifoPort.Consume, 1)
-                    OF_4to5.release(ObjectFifoPort.Produce, 1)
-
-            # Compute tile 5
-            @core(ComputeTile5, "combined_gray2rgba_addWeighted.a")
-            def core_body():
-                for _ in range_(sys.maxsize):
-                    elem_in = OF_4to5.acquire(ObjectFifoPort.Consume, 1)
-                    elem_out = OF_5to5.acquire(ObjectFifoPort.Produce, 1)
-
-                    gray2rgba_line(elem_in, elem_out, lineWidth)
-
-                    OF_4to5.release(ObjectFifoPort.Consume, 1)
-                    OF_5to5.release(ObjectFifoPort.Produce, 1)
-
-                    elem_in1 = OF_5to5.acquire(ObjectFifoPort.Consume, 1)
-                    elem_in2 = inOF_L2L1.acquire(ObjectFifoPort.Consume, 1)
-                    elem_out2 = outOF_L1L2.acquire(ObjectFifoPort.Produce, 1)
-
-                    alpha = 16384
-                    beta = 16384
-                    gamma = 0
-
-                    add_weighted_line(
-                        elem_in1,
-                        elem_in2,
-                        elem_out2,
-                        lineWidthInBytes,
-                        alpha,
-                        beta,
-                        gamma,
-                    )
-
-                    OF_5to5.release(ObjectFifoPort.Consume, 1)
-                    inOF_L2L1.release(ObjectFifoPort.Consume, 1)
-                    outOF_L1L2.release(ObjectFifoPort.Produce, 1)
-
-            # To/from AIE-array data movement
-            @runtime_sequence(tensor_ty, tensor_16x16_ty, tensor_ty)
-            def sequence(I, B, O):
-                npu_dma_memcpy_nd(
-                    metadata=inOF_L3L2,
-                    bd_id=1,
-                    mem=I,
-                    sizes=[1, 1, 1, tensorSize],
+                # Postamble : Bottom Border
+                elems_in_post = OF_2to3.acquire(ObjectFifoPort.Consume, 2)
+                elem_post_out = OF_3to4.acquire(ObjectFifoPort.Produce, 1)
+                filter2d_line(
+                    elems_in_post[0],
+                    elems_in_post[1],
+                    elems_in_post[1],
+                    elem_post_out,
+                    lineWidth,
+                    kernel,
                 )
-                npu_dma_memcpy_nd(
-                    metadata=outOF_L2L3,
-                    bd_id=0,
-                    mem=O,
-                    sizes=[1, 1, 1, tensorSize],
+                OF_2to3.release(ObjectFifoPort.Consume, 2)
+                OF_3to4.release(ObjectFifoPort.Produce, 1)
+
+        # Compute tile 4: each iteration runs threshold_line (fixed v_thr/v_max/v_typ) from OF_3to4 into OF_4to5
+        @core(ComputeTile4, "threshold.cc.o")
+        def core_body():
+            v_thr = 10
+            v_max = 255
+            v_typ = 0
+
+            for _ in range_(sys.maxsize):
+                elem_in = OF_3to4.acquire(ObjectFifoPort.Consume, 1)
+                elem_out = OF_4to5.acquire(ObjectFifoPort.Produce, 1)
+
+                threshold_line(elem_in, elem_out, lineWidth, v_thr, v_max, v_typ)
+
+                OF_3to4.release(ObjectFifoPort.Consume, 1)
+                OF_4to5.release(ObjectFifoPort.Produce, 1)
+
+        # Compute tile 5
+        @core(ComputeTile5, "combined_gray2rgba_addWeighted.a")
+        def core_body():
+            for _ in range_(sys.maxsize):
+                elem_in = OF_4to5.acquire(ObjectFifoPort.Consume, 1)
+                elem_out = OF_5to5.acquire(ObjectFifoPort.Produce, 1)
+
+                gray2rgba_line(elem_in, elem_out, lineWidth)
+
+                OF_4to5.release(ObjectFifoPort.Consume, 1)
+                OF_5to5.release(ObjectFifoPort.Produce, 1)
+
+                elem_in1 = OF_5to5.acquire(ObjectFifoPort.Consume, 1)
+                elem_in2 = inOF_L2L1.acquire(ObjectFifoPort.Consume, 1)
+                elem_out2 = outOF_L1L2.acquire(ObjectFifoPort.Produce, 1)
+
+                alpha = 16384
+                beta = 16384
+                gamma = 0
+
+                add_weighted_line(
+                    elem_in1,
+                    elem_in2,
+                    elem_out2,
+                    lineWidthInBytes,
+                    alpha,
+                    beta,
+                    gamma,
                 )
-                # outOF_L2L3 will only complete after inOF_L3L2 completes, so we just wait on outOF_L2L3 instead of all
-                dma_wait(outOF_L2L3)
 
-    #    print(ctx.module.operation.verify())
+                OF_5to5.release(ObjectFifoPort.Consume, 1)
+                inOF_L2L1.release(ObjectFifoPort.Consume, 1)
+                outOF_L1L2.release(ObjectFifoPort.Produce, 1)
+
+        # To/from AIE-array data movement: I feeds inOF_L3L2, O receives outOF_L2L3; B is not referenced here
+        @runtime_sequence(tensor_ty, tensor_16x16_ty, tensor_ty)
+        def sequence(I, B, O):
+            npu_dma_memcpy_nd(
+                metadata=inOF_L3L2,
+                bd_id=1,
+                mem=I,
+                sizes=[1, 1, 1, tensorSize],
+            )
+            npu_dma_memcpy_nd(
+                metadata=outOF_L2L3,
+                bd_id=0,
+                mem=O,
+                sizes=[1, 1, 1, tensorSize],
+            )
+            # outOF_L2L3 will only complete after inOF_L3L2 completes, so we just wait on outOF_L2L3 instead of all
+            dma_wait(outOF_L2L3)
+
+try:
+    device_name = str(sys.argv[1])
+    if device_name == "npu":
+        dev = AIEDevice.npu1_1col
+    elif device_name == "npu2":
+        dev = AIEDevice.npu2
+    else:
+        raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1]))
+    width = 64 if (len(sys.argv) != 4) else int(sys.argv[2])
+    height = 36 if (len(sys.argv) != 4) else int(sys.argv[3])
+except (IndexError, ValueError) as err:
+    # Exit non-zero via stderr: stdout is the generated MLIR, and dev would be unset.
+    sys.exit("Argument has inappropriate value: {}".format(err))
+with mlir_mod_ctx() as ctx:
+    edge_detect(dev, width, height)
     print(ctx.module)
-
-
-edge_detect()
diff --git a/programming_examples/vision/edge_detect/run_strix_makefile.lit b/programming_examples/vision/edge_detect/run_strix_makefile.lit
new file mode 100755
index 0000000000..0901bb542f
--- /dev/null
+++ b/programming_examples/vision/edge_detect/run_strix_makefile.lit
@@ -0,0 +1,10 @@
+// (c) Copyright 2024 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai, chess
+//
+// RUN: mkdir -p test_stx
+// RUN: cd test_stx
+// RUN: make -f %S/Makefile clean
+// RUN: make -f %S/Makefile device=npu2
+// RUN: %run_on_2npu make -f %S/Makefile run device=npu2

From 5b5ed073482fa599b1fb7e69fc0f4aa22159b838 Mon Sep 17 00:00:00 2001
From: Joseph Melber <jmelber@xilinx.com>
Date: Wed, 4 Dec 2024 12:49:34 -0700
Subject: [PATCH 31/35] Update alt to accept device

---
 .../aie2_colorThreshold_alt.py                | 488 ++++++++---------
 .../vision/edge_detect/aie2_edgeDetect_alt.py | 509 +++++++++---------
 2 files changed, 502 insertions(+), 495 deletions(-)

diff --git a/programming_examples/vision/color_threshold/aie2_colorThreshold_alt.py b/programming_examples/vision/color_threshold/aie2_colorThreshold_alt.py
index fd36516f3b..b676b9168e 100644
--- a/programming_examples/vision/color_threshold/aie2_colorThreshold_alt.py
+++ b/programming_examples/vision/color_threshold/aie2_colorThreshold_alt.py
@@ -14,256 +14,260 @@
 from aie.helpers.util import np_ndarray_type_get_shape
 from aie.helpers.dialects.ext.scf import _for as range_
 
-width = 512
-height = 9
-if len(sys.argv) == 3:
-    width = int(sys.argv[1])
-    height = int(sys.argv[2])
 
-lineWidth = width
-lineWidthChannels = width * 4  # 4 channels
-tensorSize = width * height
-
-enableTrace = False
-traceSizeInBytes = 8192
-traceSizeInInt32s = traceSizeInBytes // 4
-
-
-def color_threshold():
-    with mlir_mod_ctx() as ctx:
-
-        @device(AIEDevice.npu1_1col)
-        def device_body():
-            line_channels_ty = np.ndarray[(lineWidthChannels,), np.dtype[np.uint8]]
-            line_ty = np.ndarray[(lineWidth,), np.dtype[np.uint8]]
-
-            # AIE Core Function declarations
-            thresholdLine = external_func(
-                "thresholdLine",
-                inputs=[line_ty, line_ty, np.int32, np.int16, np.int16, np.int8],
-            )
+def color_threshold(dev, width, height):
+    lineWidth = width
+    lineWidthChannels = width * 4  # 4 channels
+    tensorSize = width * height
+
+    enableTrace = False
+    traceSizeInBytes = 8192
+    traceSizeInInt32s = traceSizeInBytes // 4
+
+    @device(dev)
+    def device_body():
+        line_channels_ty = np.ndarray[(lineWidthChannels,), np.dtype[np.uint8]]
+        line_ty = np.ndarray[(lineWidth,), np.dtype[np.uint8]]
+
+        # AIE Core Function declarations
+        thresholdLine = external_func(
+            "thresholdLine",
+            inputs=[line_ty, line_ty, np.int32, np.int16, np.int16, np.int8],
+        )
+
+        # Tile declarations
+        ShimTile = tile(0, 0)
+        MemTile = tile(0, 1)
+        ComputeTile2 = tile(0, 2)
+        ComputeTile3 = tile(0, 3)
+        ComputeTile4 = tile(0, 4)
+        ComputeTile5 = tile(0, 5)
+
+        # AIE-array data movement with object fifos
+
+        # Input RGBA broadcast + memtile for skip
+        inOOB_L3L2 = object_fifo(
+            "inOOB_L3L2", ShimTile, MemTile, 2, line_channels_ty
+        )
+        inOOB_L2L1_0 = object_fifo(
+            "inOOB_L2L1_0", MemTile, ComputeTile2, 2, line_ty
+        )
+        inOOB_L2L1_1 = object_fifo(
+            "inOOB_L2L1_1", MemTile, ComputeTile3, 2, line_ty
+        )
+        inOOB_L2L1_2 = object_fifo(
+            "inOOB_L2L1_2", MemTile, ComputeTile4, 2, line_ty
+        )
+        inOOB_L2L1_3 = object_fifo(
+            "inOOB_L2L1_3", MemTile, ComputeTile5, 2, line_ty
+        )
+        of_offsets = [
+            np.prod(np_ndarray_type_get_shape(line_ty)) * i for i in range(4)
+        ]
+        object_fifo_link(
+            inOOB_L3L2,
+            [inOOB_L2L1_0, inOOB_L2L1_1, inOOB_L2L1_2, inOOB_L2L1_3],
+            [],
+            of_offsets,
+        )
+
+        # Output RGBA
+        outOOB_L2L3 = object_fifo(
+            "outOOB_L2L3", MemTile, ShimTile, 2, line_channels_ty
+        )
+        outOOB_L1L2_0 = object_fifo(
+            "outOOB_L1L2_0", ComputeTile2, MemTile, 2, line_ty
+        )
+        outOOB_L1L2_1 = object_fifo(
+            "outOOB_L1L2_1", ComputeTile3, MemTile, 2, line_ty
+        )
+        outOOB_L1L2_2 = object_fifo(
+            "outOOB_L1L2_2", ComputeTile4, MemTile, 2, line_ty
+        )
+        outOOB_L1L2_3 = object_fifo(
+            "outOOB_L1L2_3", ComputeTile5, MemTile, 2, line_ty
+        )
+        object_fifo_link(
+            [outOOB_L1L2_0, outOOB_L1L2_1, outOOB_L1L2_2, outOOB_L1L2_3],
+            outOOB_L2L3,
+            of_offsets,
+            [],
+        )
+
+        # Runtime parameters
+        rtpComputeTile2 = buffer(
+            ComputeTile2,
+            np.ndarray[(16,), np.dtype[np.int32]],
+            "rtpComputeTile2",
+            use_write_rtp=True,
+        )
+        rtpComputeTile3 = buffer(
+            ComputeTile3,
+            np.ndarray[(16,), np.dtype[np.int32]],
+            "rtpComputeTile3",
+            use_write_rtp=True,
+        )
+        rtpComputeTile4 = buffer(
+            ComputeTile4,
+            np.ndarray[(16,), np.dtype[np.int32]],
+            "rtpComputeTile4",
+            use_write_rtp=True,
+        )
+        rtpComputeTile5 = buffer(
+            ComputeTile5,
+            np.ndarray[(16,), np.dtype[np.int32]],
+            "rtpComputeTile5",
+            use_write_rtp=True,
+        )
+
+        # Set up compute tiles
+
+        # Compute tile 2
+        @core(ComputeTile2, "threshold.cc.o")
+        def core_body():
+            for _ in range_(sys.maxsize):
+                elemIn = inOOB_L2L1_0.acquire(ObjectFifoPort.Consume, 1)
+                elemOut = outOOB_L1L2_0.acquire(ObjectFifoPort.Produce, 1)
+
+                # RTPs written from the instruction stream must be read right before the kernel
+                # after the ObjectFIFO acquires
+                thresholdValue = arith.trunci(T.i16(), rtpComputeTile2[0])
+                maxValue = arith.trunci(T.i16(), rtpComputeTile2[1])
+                thresholdType = arith.trunci(T.i8(), rtpComputeTile2[2])
+                thresholdLine(
+                    elemIn,
+                    elemOut,
+                    lineWidth,
+                    thresholdValue,
+                    maxValue,
+                    thresholdType,
+                )
 
-            # Tile declarations
-            ShimTile = tile(0, 0)
-            MemTile = tile(0, 1)
-            ComputeTile2 = tile(0, 2)
-            ComputeTile3 = tile(0, 3)
-            ComputeTile4 = tile(0, 4)
-            ComputeTile5 = tile(0, 5)
+                inOOB_L2L1_0.release(ObjectFifoPort.Consume, 1)
+                outOOB_L1L2_0.release(ObjectFifoPort.Produce, 1)
+
+        # Compute tile 3
+        @core(ComputeTile3, "threshold.cc.o")
+        def core_body():
+            for _ in range_(sys.maxsize):
+                elemIn = inOOB_L2L1_1.acquire(ObjectFifoPort.Consume, 1)
+                elemOut = outOOB_L1L2_1.acquire(ObjectFifoPort.Produce, 1)
+                # RTPs written from the instruction stream must be read right before the kernel
+                # after the ObjectFIFO acquires
+                thresholdValue = arith.trunci(T.i16(), rtpComputeTile3[0])
+                maxValue = arith.trunci(T.i16(), rtpComputeTile3[1])
+                thresholdType = arith.trunci(T.i8(), rtpComputeTile3[2])
+                thresholdLine(
+                    elemIn,
+                    elemOut,
+                    lineWidth,
+                    thresholdValue,
+                    maxValue,
+                    thresholdType,
+                )
 
-            # AIE-array data movement with object fifos
+                inOOB_L2L1_1.release(ObjectFifoPort.Consume, 1)
+                outOOB_L1L2_1.release(ObjectFifoPort.Produce, 1)
+
+        # Compute tile 4
+        @core(ComputeTile4, "threshold.cc.o")
+        def core_body():
+            for _ in range_(sys.maxsize):
+                elemIn = inOOB_L2L1_2.acquire(ObjectFifoPort.Consume, 1)
+                elemOut = outOOB_L1L2_2.acquire(ObjectFifoPort.Produce, 1)
+
+                # RTPs written from the instruction stream must be read right before the kernel
+                # after the ObjectFIFO acquires
+                thresholdValue = arith.trunci(T.i16(), rtpComputeTile4[0])
+                maxValue = arith.trunci(T.i16(), rtpComputeTile4[1])
+                thresholdType = arith.trunci(T.i8(), rtpComputeTile4[2])
+                thresholdLine(
+                    elemIn,
+                    elemOut,
+                    lineWidth,
+                    thresholdValue,
+                    maxValue,
+                    thresholdType,
+                )
 
-            # Input RGBA broadcast + memtile for skip
-            inOOB_L3L2 = object_fifo(
-                "inOOB_L3L2", ShimTile, MemTile, 2, line_channels_ty
-            )
-            inOOB_L2L1_0 = object_fifo(
-                "inOOB_L2L1_0", MemTile, ComputeTile2, 2, line_ty
-            )
-            inOOB_L2L1_1 = object_fifo(
-                "inOOB_L2L1_1", MemTile, ComputeTile3, 2, line_ty
-            )
-            inOOB_L2L1_2 = object_fifo(
-                "inOOB_L2L1_2", MemTile, ComputeTile4, 2, line_ty
-            )
-            inOOB_L2L1_3 = object_fifo(
-                "inOOB_L2L1_3", MemTile, ComputeTile5, 2, line_ty
-            )
-            of_offsets = [
-                np.prod(np_ndarray_type_get_shape(line_ty)) * i for i in range(4)
-            ]
-            object_fifo_link(
-                inOOB_L3L2,
-                [inOOB_L2L1_0, inOOB_L2L1_1, inOOB_L2L1_2, inOOB_L2L1_3],
-                [],
-                of_offsets,
-            )
+                inOOB_L2L1_2.release(ObjectFifoPort.Consume, 1)
+                outOOB_L1L2_2.release(ObjectFifoPort.Produce, 1)
+
+        # Compute tile 5
+        @core(ComputeTile5, "threshold.cc.o")
+        def core_body():
+            for _ in range_(sys.maxsize):
+                elemIn = inOOB_L2L1_3.acquire(ObjectFifoPort.Consume, 1)
+                elemOut = outOOB_L1L2_3.acquire(ObjectFifoPort.Produce, 1)
+
+                # RTPs written from the instruction stream must be read right before the kernel
+                # after the ObjectFIFO acquires
+                thresholdValue = arith.trunci(T.i16(), rtpComputeTile5[0])
+                maxValue = arith.trunci(T.i16(), rtpComputeTile5[1])
+                thresholdType = arith.trunci(T.i8(), rtpComputeTile5[2])
+                thresholdLine(
+                    elemIn,
+                    elemOut,
+                    lineWidth,
+                    thresholdValue,
+                    maxValue,
+                    thresholdType,
+                )
 
-            # Output RGBA
-            outOOB_L2L3 = object_fifo(
-                "outOOB_L2L3", MemTile, ShimTile, 2, line_channels_ty
-            )
-            outOOB_L1L2_0 = object_fifo(
-                "outOOB_L1L2_0", ComputeTile2, MemTile, 2, line_ty
-            )
-            outOOB_L1L2_1 = object_fifo(
-                "outOOB_L1L2_1", ComputeTile3, MemTile, 2, line_ty
+                inOOB_L2L1_3.release(ObjectFifoPort.Consume, 1)
+                outOOB_L1L2_3.release(ObjectFifoPort.Produce, 1)
+
+        # To/from AIE-array data movement
+        @runtime_sequence(
+            np.ndarray[(tensorSize,), np.dtype[np.int8]],
+            np.ndarray[(32,), np.dtype[np.int32]],  # not used
+            np.ndarray[(tensorSize,), np.dtype[np.int8]],
+        )
+        def sequence(inTensor, notUsed, outTensor):
+            # thresholdValue, maxValue, thresholdType
+            rtpComputeTile2[0] = 50
+            rtpComputeTile2[1] = 255
+            rtpComputeTile2[2] = 0
+
+            rtpComputeTile3[0] = 50
+            rtpComputeTile3[1] = 255
+            rtpComputeTile3[2] = 0
+
+            rtpComputeTile4[0] = 50
+            rtpComputeTile4[1] = 255
+            rtpComputeTile4[2] = 0
+
+            rtpComputeTile5[0] = 50
+            rtpComputeTile5[1] = 255
+            rtpComputeTile5[2] = 0
+
+            in_task = shim_dma_single_bd_task(
+                inOOB_L3L2, inTensor, sizes=[1, 1, 1, tensorSize], issue_token=True
             )
-            outOOB_L1L2_2 = object_fifo(
-                "outOOB_L1L2_2", ComputeTile4, MemTile, 2, line_ty
-            )
-            outOOB_L1L2_3 = object_fifo(
-                "outOOB_L1L2_3", ComputeTile5, MemTile, 2, line_ty
-            )
-            object_fifo_link(
-                [outOOB_L1L2_0, outOOB_L1L2_1, outOOB_L1L2_2, outOOB_L1L2_3],
+            out_task = shim_dma_single_bd_task(
                 outOOB_L2L3,
-                of_offsets,
-                [],
+                outTensor,
+                sizes=[1, 1, 1, tensorSize],
+                issue_token=True,
             )
 
-            # Runtime parameters
-            rtpComputeTile2 = buffer(
-                ComputeTile2,
-                np.ndarray[(16,), np.dtype[np.int32]],
-                "rtpComputeTile2",
-                use_write_rtp=True,
-            )
-            rtpComputeTile3 = buffer(
-                ComputeTile3,
-                np.ndarray[(16,), np.dtype[np.int32]],
-                "rtpComputeTile3",
-                use_write_rtp=True,
-            )
-            rtpComputeTile4 = buffer(
-                ComputeTile4,
-                np.ndarray[(16,), np.dtype[np.int32]],
-                "rtpComputeTile4",
-                use_write_rtp=True,
-            )
-            rtpComputeTile5 = buffer(
-                ComputeTile5,
-                np.ndarray[(16,), np.dtype[np.int32]],
-                "rtpComputeTile5",
-                use_write_rtp=True,
-            )
-
-            # Set up compute tiles
-
-            # Compute tile 2
-            @core(ComputeTile2, "threshold.cc.o")
-            def core_body():
-                for _ in range_(sys.maxsize):
-                    elemIn = inOOB_L2L1_0.acquire(ObjectFifoPort.Consume, 1)
-                    elemOut = outOOB_L1L2_0.acquire(ObjectFifoPort.Produce, 1)
-
-                    # RTPs written from the instruction stream must be read right before the kernel
-                    # after the ObjectFIFO acquires
-                    thresholdValue = arith.trunci(T.i16(), rtpComputeTile2[0])
-                    maxValue = arith.trunci(T.i16(), rtpComputeTile2[1])
-                    thresholdType = arith.trunci(T.i8(), rtpComputeTile2[2])
-                    thresholdLine(
-                        elemIn,
-                        elemOut,
-                        lineWidth,
-                        thresholdValue,
-                        maxValue,
-                        thresholdType,
-                    )
-
-                    inOOB_L2L1_0.release(ObjectFifoPort.Consume, 1)
-                    outOOB_L1L2_0.release(ObjectFifoPort.Produce, 1)
-
-            # Compute tile 3
-            @core(ComputeTile3, "threshold.cc.o")
-            def core_body():
-                for _ in range_(sys.maxsize):
-                    elemIn = inOOB_L2L1_1.acquire(ObjectFifoPort.Consume, 1)
-                    elemOut = outOOB_L1L2_1.acquire(ObjectFifoPort.Produce, 1)
-                    # RTPs written from the instruction stream must be read right before the kernel
-                    # after the ObjectFIFO acquires
-                    thresholdValue = arith.trunci(T.i16(), rtpComputeTile3[0])
-                    maxValue = arith.trunci(T.i16(), rtpComputeTile3[1])
-                    thresholdType = arith.trunci(T.i8(), rtpComputeTile3[2])
-                    thresholdLine(
-                        elemIn,
-                        elemOut,
-                        lineWidth,
-                        thresholdValue,
-                        maxValue,
-                        thresholdType,
-                    )
-
-                    inOOB_L2L1_1.release(ObjectFifoPort.Consume, 1)
-                    outOOB_L1L2_1.release(ObjectFifoPort.Produce, 1)
-
-            # Compute tile 4
-            @core(ComputeTile4, "threshold.cc.o")
-            def core_body():
-                for _ in range_(sys.maxsize):
-                    elemIn = inOOB_L2L1_2.acquire(ObjectFifoPort.Consume, 1)
-                    elemOut = outOOB_L1L2_2.acquire(ObjectFifoPort.Produce, 1)
-
-                    # RTPs written from the instruction stream must be read right before the kernel
-                    # after the ObjectFIFO acquires
-                    thresholdValue = arith.trunci(T.i16(), rtpComputeTile4[0])
-                    maxValue = arith.trunci(T.i16(), rtpComputeTile4[1])
-                    thresholdType = arith.trunci(T.i8(), rtpComputeTile4[2])
-                    thresholdLine(
-                        elemIn,
-                        elemOut,
-                        lineWidth,
-                        thresholdValue,
-                        maxValue,
-                        thresholdType,
-                    )
-
-                    inOOB_L2L1_2.release(ObjectFifoPort.Consume, 1)
-                    outOOB_L1L2_2.release(ObjectFifoPort.Produce, 1)
-
-            # Compute tile 5
-            @core(ComputeTile5, "threshold.cc.o")
-            def core_body():
-                for _ in range_(sys.maxsize):
-                    elemIn = inOOB_L2L1_3.acquire(ObjectFifoPort.Consume, 1)
-                    elemOut = outOOB_L1L2_3.acquire(ObjectFifoPort.Produce, 1)
-
-                    # RTPs written from the instruction stream must be read right before the kernel
-                    # after the ObjectFIFO acquires
-                    thresholdValue = arith.trunci(T.i16(), rtpComputeTile5[0])
-                    maxValue = arith.trunci(T.i16(), rtpComputeTile5[1])
-                    thresholdType = arith.trunci(T.i8(), rtpComputeTile5[2])
-                    thresholdLine(
-                        elemIn,
-                        elemOut,
-                        lineWidth,
-                        thresholdValue,
-                        maxValue,
-                        thresholdType,
-                    )
-
-                    inOOB_L2L1_3.release(ObjectFifoPort.Consume, 1)
-                    outOOB_L1L2_3.release(ObjectFifoPort.Produce, 1)
-
-            # To/from AIE-array data movement
-            @runtime_sequence(
-                np.ndarray[(tensorSize,), np.dtype[np.int8]],
-                np.ndarray[(32,), np.dtype[np.int32]],  # not used
-                np.ndarray[(tensorSize,), np.dtype[np.int8]],
-            )
-            def sequence(inTensor, notUsed, outTensor):
-                # thresholdValue, maxValue, thresholdType
-                rtpComputeTile2[0] = 50
-                rtpComputeTile2[1] = 255
-                rtpComputeTile2[2] = 0
-
-                rtpComputeTile3[0] = 50
-                rtpComputeTile3[1] = 255
-                rtpComputeTile3[2] = 0
-
-                rtpComputeTile4[0] = 50
-                rtpComputeTile4[1] = 255
-                rtpComputeTile4[2] = 0
-
-                rtpComputeTile5[0] = 50
-                rtpComputeTile5[1] = 255
-                rtpComputeTile5[2] = 0
-
-                in_task = shim_dma_single_bd_task(
-                    inOOB_L3L2, inTensor, sizes=[1, 1, 1, tensorSize], issue_token=True
-                )
-                out_task = shim_dma_single_bd_task(
-                    outOOB_L2L3,
-                    outTensor,
-                    sizes=[1, 1, 1, tensorSize],
-                    issue_token=True,
-                )
-
-                dma_start_task(in_task, out_task)
-                dma_await_task(in_task, out_task)
-
-    # print(ctx.module.operation.verify())
+            dma_start_task(in_task, out_task)
+            dma_await_task(in_task, out_task)
+
+try:  # command line: <device> [<width> <height>]
+    device_name = str(sys.argv[1])
+    if device_name == "npu":
+        dev = AIEDevice.npu1_1col
+    elif device_name == "npu2":
+        dev = AIEDevice.npu2
+    else:
+        raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1]))
+    width = 512 if (len(sys.argv) != 4) else int(sys.argv[2])
+    height = 9 if (len(sys.argv) != 4) else int(sys.argv[3])  # sizes are only read when exactly three arguments are given
+except (IndexError, ValueError) as err:  # no device argument, unknown device name, or non-integer size
+    sys.exit("Argument has inappropriate value: {}".format(err))  # message goes to stderr with a non-zero exit, so stdout only ever carries the printed module
+with mlir_mod_ctx() as ctx:
+    # print(ctx.module.operation.verify())
+    color_threshold(dev, width, height)
     print(ctx.module)
 
-
-color_threshold()
diff --git a/programming_examples/vision/edge_detect/aie2_edgeDetect_alt.py b/programming_examples/vision/edge_detect/aie2_edgeDetect_alt.py
index bbbdc586b6..75b22602be 100644
--- a/programming_examples/vision/edge_detect/aie2_edgeDetect_alt.py
+++ b/programming_examples/vision/edge_detect/aie2_edgeDetect_alt.py
@@ -12,277 +12,280 @@
 from aie.helpers.dialects.ext.scf import _for as range_
 from aie.extras.context import mlir_mod_ctx
 
-width = 64
-height = 36
-if len(sys.argv) == 3:
-    width = int(sys.argv[1])
-    height = int(sys.argv[2])
 
-heightMinus1 = height - 1
-lineWidth = width
-lineWidthInBytes = width * 4
-tensorSize = width * height * 4  # 4 channels
-
-enableTrace = False
-traceSizeInBytes = 8192
-traceSizeInInt32s = traceSizeInBytes // 4
-
-
-def edge_detect():
-    with mlir_mod_ctx() as ctx:
-
-        @device(AIEDevice.npu1_1col)
-        def device_body():
-            line_bytes_ty = np.ndarray[(lineWidthInBytes,), np.dtype[np.uint8]]
-            line_ty = np.ndarray[(lineWidth,), np.dtype[np.uint8]]
-            tensor_3x3_ty = np.ndarray[(3, 3), np.dtype[np.int16]]
-
-            tensor_ty = np.ndarray[(tensorSize,), np.dtype[np.int8]]
-            tensor_16x16_ty = np.ndarray[(16, 16), np.dtype[np.int32]]
-
-            # AIE Core Function declarations
-            rgba2gray_line = external_func(
-                "rgba2grayLine", inputs=[line_bytes_ty, line_ty, np.int32]
-            )
-            filter2d_line = external_func(
-                "filter2dLine",
-                inputs=[line_ty, line_ty, line_ty, line_ty, np.int32, tensor_3x3_ty],
-            )
-            threshold_line = external_func(
-                "thresholdLine",
-                inputs=[line_ty, line_ty, np.int32, np.int16, np.int16, np.int8],
-            )
-            gray2rgba_line = external_func(
-                "gray2rgbaLine", inputs=[line_ty, line_bytes_ty, np.int32]
-            )
-            add_weighted_line = external_func(
-                "addWeightedLine",
-                inputs=[
-                    line_bytes_ty,
-                    line_bytes_ty,
-                    line_bytes_ty,
-                    np.int32,
-                    np.int16,
-                    np.int16,
-                    np.int8,
-                ],
-            )
-
-            # Tile declarations
-            ShimTile = tile(0, 0)
-            MemTile = tile(0, 1)
-            ComputeTile2 = tile(0, 2)
-            ComputeTile3 = tile(0, 3)
-            ComputeTile4 = tile(0, 4)
-            ComputeTile5 = tile(0, 5)
-
-            # AIE-array data movement with object fifos
-            # Input
-            inOF_L3L2 = object_fifo(
-                "inOF_L3L2",
-                ShimTile,
-                [ComputeTile2, MemTile],
-                [2, 2, 7],
+def edge_detect(dev, width, height):
+    heightMinus1 = height - 1
+    lineWidth = width
+    lineWidthInBytes = width * 4
+    tensorSize = width * height * 4  # 4 channels
+
+    enableTrace = False
+    traceSizeInBytes = 8192
+    traceSizeInInt32s = traceSizeInBytes // 4
+
+    @device(dev)
+    def device_body():
+        line_bytes_ty = np.ndarray[(lineWidthInBytes,), np.dtype[np.uint8]]
+        line_ty = np.ndarray[(lineWidth,), np.dtype[np.uint8]]
+        tensor_3x3_ty = np.ndarray[(3, 3), np.dtype[np.int16]]
+
+        tensor_ty = np.ndarray[(tensorSize,), np.dtype[np.int8]]
+        tensor_16x16_ty = np.ndarray[(16, 16), np.dtype[np.int32]]
+
+        # AIE Core Function declarations
+        rgba2gray_line = external_func(
+            "rgba2grayLine", inputs=[line_bytes_ty, line_ty, np.int32]
+        )
+        filter2d_line = external_func(
+            "filter2dLine",
+            inputs=[line_ty, line_ty, line_ty, line_ty, np.int32, tensor_3x3_ty],
+        )
+        threshold_line = external_func(
+            "thresholdLine",
+            inputs=[line_ty, line_ty, np.int32, np.int16, np.int16, np.int8],
+        )
+        gray2rgba_line = external_func(
+            "gray2rgbaLine", inputs=[line_ty, line_bytes_ty, np.int32]
+        )
+        add_weighted_line = external_func(
+            "addWeightedLine",
+            inputs=[
                 line_bytes_ty,
-            )
-            inOF_L2L1 = object_fifo(
-                "inOF_L2L1",
-                MemTile,
-                ComputeTile5,
-                7,
                 line_bytes_ty,
-            )
-            object_fifo_link(inOF_L3L2, inOF_L2L1)
-
-            # Output
-            outOF_L2L3 = object_fifo(
-                "outOF_L2L3",
-                MemTile,
-                ShimTile,
-                2,
-                line_bytes_ty,
-            )
-            outOF_L1L2 = object_fifo(
-                "outOF_L1L2",
-                ComputeTile5,
-                MemTile,
-                2,
                 line_bytes_ty,
+                np.int32,
+                np.int16,
+                np.int16,
+                np.int8,
+            ],
+        )
+
+        # Tile declarations
+        ShimTile = tile(0, 0)
+        MemTile = tile(0, 1)
+        ComputeTile2 = tile(0, 2)
+        ComputeTile3 = tile(0, 3)
+        ComputeTile4 = tile(0, 4)
+        ComputeTile5 = tile(0, 5)
+
+        # AIE-array data movement with object fifos
+        # Input
+        inOF_L3L2 = object_fifo(
+            "inOF_L3L2",
+            ShimTile,
+            [ComputeTile2, MemTile],
+            [2, 2, 7],
+            line_bytes_ty,
+        )
+        inOF_L2L1 = object_fifo(
+            "inOF_L2L1",
+            MemTile,
+            ComputeTile5,
+            7,
+            line_bytes_ty,
+        )
+        object_fifo_link(inOF_L3L2, inOF_L2L1)
+
+        # Output
+        outOF_L2L3 = object_fifo(
+            "outOF_L2L3",
+            MemTile,
+            ShimTile,
+            2,
+            line_bytes_ty,
+        )
+        outOF_L1L2 = object_fifo(
+            "outOF_L1L2",
+            ComputeTile5,
+            MemTile,
+            2,
+            line_bytes_ty,
+        )
+        object_fifo_link(outOF_L1L2, outOF_L2L3)
+
+        # Intermediate
+        OF_2to3 = object_fifo(
+            "OF_2to3",
+            ComputeTile2,
+            ComputeTile3,
+            4,
+            line_ty,
+        )
+        OF_3to4 = object_fifo(
+            "OF_3to4",
+            ComputeTile3,
+            ComputeTile4,
+            2,
+            line_ty,
+        )
+        OF_4to5 = object_fifo(
+            "OF_4to5",
+            ComputeTile4,
+            ComputeTile5,
+            2,
+            line_ty,
+        )
+        OF_5to5 = object_fifo(
+            "OF_5to5",
+            ComputeTile5,
+            ComputeTile5,
+            1,
+            line_bytes_ty,
+        )
+
+        # Set up compute tiles
+
+        # Compute tile 2
+        @core(ComputeTile2, "rgba2gray.cc.o")
+        def core_body():
+            for _ in range_(sys.maxsize):
+                elem_in = inOF_L3L2.acquire(ObjectFifoPort.Consume, 1)
+                elem_out = OF_2to3.acquire(ObjectFifoPort.Produce, 1)
+
+                rgba2gray_line(elem_in, elem_out, lineWidth)
+
+                inOF_L3L2.release(ObjectFifoPort.Consume, 1)
+                OF_2to3.release(ObjectFifoPort.Produce, 1)
+
+        # Compute tile 3
+        @core(ComputeTile3, "filter2d.cc.o")
+        def core_body():
+            v0 = 0
+            v1 = 4096
+            v_minus4 = -16384
+            initial_value = np.array(
+                [[v0, v1, v0], [v1, v_minus4, v1], [v0, v1, v0]], dtype=np.int16
             )
-            object_fifo_link(outOF_L1L2, outOF_L2L3)
-
-            # Intermediate
-            OF_2to3 = object_fifo(
-                "OF_2to3",
-                ComputeTile2,
-                ComputeTile3,
-                4,
-                line_ty,
-            )
-            OF_3to4 = object_fifo(
-                "OF_3to4",
+            kernel = buffer(
                 ComputeTile3,
-                ComputeTile4,
-                2,
-                line_ty,
+                np.ndarray[(3, 3), np.dtype[np.int16]],
+                "kernel",
+                initial_value=initial_value,
             )
-            OF_4to5 = object_fifo(
-                "OF_4to5",
-                ComputeTile4,
-                ComputeTile5,
-                2,
-                line_ty,
-            )
-            OF_5to5 = object_fifo(
-                "OF_5to5",
-                ComputeTile5,
-                ComputeTile5,
-                1,
-                line_bytes_ty,
-            )
-
-            # Set up compute tiles
-
-            # Compute tile 2
-            @core(ComputeTile2, "rgba2gray.cc.o")
-            def core_body():
-                for _ in range_(sys.maxsize):
-                    elem_in = inOF_L3L2.acquire(ObjectFifoPort.Consume, 1)
-                    elem_out = OF_2to3.acquire(ObjectFifoPort.Produce, 1)
-
-                    rgba2gray_line(elem_in, elem_out, lineWidth)
-
-                    inOF_L3L2.release(ObjectFifoPort.Consume, 1)
-                    OF_2to3.release(ObjectFifoPort.Produce, 1)
 
-            # Compute tile 3
-            @core(ComputeTile3, "filter2d.cc.o")
-            def core_body():
-                v0 = 0
-                v1 = 4096
-                v_minus4 = -16384
-                initial_value = np.array(
-                    [[v0, v1, v0], [v1, v_minus4, v1], [v0, v1, v0]], dtype=np.int16
+            for _ in range_(sys.maxsize):
+                # Preamble : Top Border
+                elems_in_pre = OF_2to3.acquire(ObjectFifoPort.Consume, 2)
+                elem_pre_out = OF_3to4.acquire(ObjectFifoPort.Produce, 1)
+                filter2d_line(
+                    elems_in_pre[0],
+                    elems_in_pre[0],
+                    elems_in_pre[1],
+                    elem_pre_out,
+                    lineWidth,
+                    kernel,
                 )
-                kernel = buffer(
-                    ComputeTile3,
-                    np.ndarray[(3, 3), np.dtype[np.int16]],
-                    "kernel",
-                    initial_value=initial_value,
-                )
-
-                for _ in range_(sys.maxsize):
-                    # Preamble : Top Border
-                    elems_in_pre = OF_2to3.acquire(ObjectFifoPort.Consume, 2)
-                    elem_pre_out = OF_3to4.acquire(ObjectFifoPort.Produce, 1)
-                    filter2d_line(
-                        elems_in_pre[0],
-                        elems_in_pre[0],
-                        elems_in_pre[1],
-                        elem_pre_out,
-                        lineWidth,
-                        kernel,
-                    )
-                    OF_3to4.release(ObjectFifoPort.Produce, 1)
+                OF_3to4.release(ObjectFifoPort.Produce, 1)
 
-                    # Steady State : Middle
-                    for _ in range_(1, heightMinus1):
-                        elems_in = OF_2to3.acquire(ObjectFifoPort.Consume, 3)
-                        elem_out = OF_3to4.acquire(ObjectFifoPort.Produce, 1)
-                        filter2d_line(
-                            elems_in[0],
-                            elems_in[1],
-                            elems_in[2],
-                            elem_out,
-                            lineWidth,
-                            kernel,
-                        )
-                        OF_2to3.release(ObjectFifoPort.Consume, 1)
-                        OF_3to4.release(ObjectFifoPort.Produce, 1)
-
-                    # Postamble : Bottom Border
-                    elems_in_post = OF_2to3.acquire(ObjectFifoPort.Consume, 2)
-                    elem_post_out = OF_3to4.acquire(ObjectFifoPort.Produce, 1)
+                # Steady State : Middle
+                for _ in range_(1, heightMinus1):
+                    elems_in = OF_2to3.acquire(ObjectFifoPort.Consume, 3)
+                    elem_out = OF_3to4.acquire(ObjectFifoPort.Produce, 1)
                     filter2d_line(
-                        elems_in_post[0],
-                        elems_in_post[1],
-                        elems_in_post[1],
-                        elem_post_out,
+                        elems_in[0],
+                        elems_in[1],
+                        elems_in[2],
+                        elem_out,
                         lineWidth,
                         kernel,
                     )
-                    OF_2to3.release(ObjectFifoPort.Consume, 2)
+                    OF_2to3.release(ObjectFifoPort.Consume, 1)
                     OF_3to4.release(ObjectFifoPort.Produce, 1)
 
-            # Compute tile 4
-            @core(ComputeTile4, "threshold.cc.o")
-            def core_body():
-                v_thr = 10
-                v_max = 255
-                v_typ = 0
-
-                for _ in range_(sys.maxsize):
-                    elem_in = OF_3to4.acquire(ObjectFifoPort.Consume, 1)
-                    elem_out = OF_4to5.acquire(ObjectFifoPort.Produce, 1)
-
-                    threshold_line(elem_in, elem_out, lineWidth, v_thr, v_max, v_typ)
-
-                    OF_3to4.release(ObjectFifoPort.Consume, 1)
-                    OF_4to5.release(ObjectFifoPort.Produce, 1)
-
-            # Compute tile 5
-            @core(ComputeTile5, "combined_gray2rgba_addWeighted.a")
-            def core_body():
-                for _ in range_(sys.maxsize):
-                    elem_in = OF_4to5.acquire(ObjectFifoPort.Consume, 1)
-                    elem_out = OF_5to5.acquire(ObjectFifoPort.Produce, 1)
-
-                    gray2rgba_line(elem_in, elem_out, lineWidth)
-
-                    OF_4to5.release(ObjectFifoPort.Consume, 1)
-                    OF_5to5.release(ObjectFifoPort.Produce, 1)
-
-                    elem_in1 = OF_5to5.acquire(ObjectFifoPort.Consume, 1)
-                    elem_in2 = inOF_L2L1.acquire(ObjectFifoPort.Consume, 1)
-                    elem_out2 = outOF_L1L2.acquire(ObjectFifoPort.Produce, 1)
-
-                    alpha = 16384
-                    beta = 16384
-                    gamma = 0
-
-                    add_weighted_line(
-                        elem_in1,
-                        elem_in2,
-                        elem_out2,
-                        lineWidthInBytes,
-                        alpha,
-                        beta,
-                        gamma,
-                    )
-
-                    OF_5to5.release(ObjectFifoPort.Consume, 1)
-                    inOF_L2L1.release(ObjectFifoPort.Consume, 1)
-                    outOF_L1L2.release(ObjectFifoPort.Produce, 1)
-
-            # To/from AIE-array data movement
-            @runtime_sequence(tensor_ty, tensor_16x16_ty, tensor_ty)
-            def sequence(I, B, O):
-                in_task = shim_dma_single_bd_task(
-                    inOF_L3L2, I, sizes=[1, 1, 1, tensorSize]
+                # Postamble : Bottom Border
+                elems_in_post = OF_2to3.acquire(ObjectFifoPort.Consume, 2)
+                elem_post_out = OF_3to4.acquire(ObjectFifoPort.Produce, 1)
+                filter2d_line(
+                    elems_in_post[0],
+                    elems_in_post[1],
+                    elems_in_post[1],
+                    elem_post_out,
+                    lineWidth,
+                    kernel,
                 )
-                out_task = shim_dma_single_bd_task(
-                    outOF_L2L3,
-                    O,
-                    sizes=[1, 1, 1, tensorSize],
-                    issue_token=True,
+                OF_2to3.release(ObjectFifoPort.Consume, 2)
+                OF_3to4.release(ObjectFifoPort.Produce, 1)
+
+        # Compute tile 4
+        @core(ComputeTile4, "threshold.cc.o")
+        def core_body():
+            v_thr = 10
+            v_max = 255
+            v_typ = 0
+
+            for _ in range_(sys.maxsize):
+                elem_in = OF_3to4.acquire(ObjectFifoPort.Consume, 1)
+                elem_out = OF_4to5.acquire(ObjectFifoPort.Produce, 1)
+
+                threshold_line(elem_in, elem_out, lineWidth, v_thr, v_max, v_typ)
+
+                OF_3to4.release(ObjectFifoPort.Consume, 1)
+                OF_4to5.release(ObjectFifoPort.Produce, 1)
+
+        # Compute tile 5
+        @core(ComputeTile5, "combined_gray2rgba_addWeighted.a")
+        def core_body():
+            for _ in range_(sys.maxsize):
+                elem_in = OF_4to5.acquire(ObjectFifoPort.Consume, 1)
+                elem_out = OF_5to5.acquire(ObjectFifoPort.Produce, 1)
+
+                gray2rgba_line(elem_in, elem_out, lineWidth)
+
+                OF_4to5.release(ObjectFifoPort.Consume, 1)
+                OF_5to5.release(ObjectFifoPort.Produce, 1)
+
+                elem_in1 = OF_5to5.acquire(ObjectFifoPort.Consume, 1)
+                elem_in2 = inOF_L2L1.acquire(ObjectFifoPort.Consume, 1)
+                elem_out2 = outOF_L1L2.acquire(ObjectFifoPort.Produce, 1)
+
+                alpha = 16384
+                beta = 16384
+                gamma = 0
+
+                add_weighted_line(
+                    elem_in1,
+                    elem_in2,
+                    elem_out2,
+                    lineWidthInBytes,
+                    alpha,
+                    beta,
+                    gamma,
                 )
 
-                dma_start_task(in_task, out_task)
-                dma_await_task(out_task)
-                dma_free_task(in_task)
-
-    #    print(ctx.module.operation.verify())
-    print(ctx.module)
+                OF_5to5.release(ObjectFifoPort.Consume, 1)
+                inOF_L2L1.release(ObjectFifoPort.Consume, 1)
+                outOF_L1L2.release(ObjectFifoPort.Produce, 1)
 
+        # To/from AIE-array data movement
+        @runtime_sequence(tensor_ty, tensor_16x16_ty, tensor_ty)
+        def sequence(I, B, O):
+            in_task = shim_dma_single_bd_task(
+                inOF_L3L2, I, sizes=[1, 1, 1, tensorSize]
+            )
+            out_task = shim_dma_single_bd_task(
+                outOF_L2L3,
+                O,
+                sizes=[1, 1, 1, tensorSize],
+                issue_token=True,
+            )
 
-edge_detect()
+            dma_start_task(in_task, out_task)
+            dma_await_task(out_task)
+            dma_free_task(in_task)
+
+try:
+    device_name = str(sys.argv[1])
+    if device_name == "npu":
+        dev = AIEDevice.npu1_1col
+    elif device_name == "npu2":
+        dev = AIEDevice.npu2
+    else:
+        raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1]))
+    width = 36 if (len(sys.argv) != 4) else int(sys.argv[2])
+    height = 64 if (len(sys.argv) != 4) else int(sys.argv[3])
+except ValueError:
+    print("Argument has inappropriate value")
+with mlir_mod_ctx() as ctx:
+    #print(ctx.module.operation.verify())
+    edge_detect(dev, width, height)
+    print(ctx.module)

From 75ef8d72141e1237f2d86c1b1d210436f6c8bbb1 Mon Sep 17 00:00:00 2001
From: Joseph Melber <jmelber@xilinx.com>
Date: Wed, 4 Dec 2024 12:53:05 -0700
Subject: [PATCH 32/35] Vision tests rely on the exit status of `make run` instead of FileCheck

---
 programming_examples/vision/color_threshold/run_makefile.lit  | 3 +--
 .../vision/color_threshold/run_makefile_alt.lit               | 3 +--
 programming_examples/vision/edge_detect/run_makefile.lit      | 4 +---
 programming_examples/vision/edge_detect/run_makefile_alt.lit  | 3 +--
 4 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/programming_examples/vision/color_threshold/run_makefile.lit b/programming_examples/vision/color_threshold/run_makefile.lit
index c6e18a3da4..40fc6f201d 100644
--- a/programming_examples/vision/color_threshold/run_makefile.lit
+++ b/programming_examples/vision/color_threshold/run_makefile.lit
@@ -5,6 +5,5 @@
  //
  // RUN: make -f %S/Makefile clean
  // RUN: make -f %S/Makefile 
- // RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s
-  // CHECK: PASS!
+ // RUN: %run_on_npu make -f %S/Makefile run
   
\ No newline at end of file
diff --git a/programming_examples/vision/color_threshold/run_makefile_alt.lit b/programming_examples/vision/color_threshold/run_makefile_alt.lit
index 9f5617f16c..19bd34a2d0 100644
--- a/programming_examples/vision/color_threshold/run_makefile_alt.lit
+++ b/programming_examples/vision/color_threshold/run_makefile_alt.lit
@@ -7,6 +7,5 @@
 // RUN: cd test_alt
 // RUN: make -f %S/Makefile clean
 // RUN: env use_alt=1 make -f %S/Makefile 
-// RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s
-// CHECK: PASS!
+// RUN: %run_on_npu make -f %S/Makefile run
   
\ No newline at end of file
diff --git a/programming_examples/vision/edge_detect/run_makefile.lit b/programming_examples/vision/edge_detect/run_makefile.lit
index c6e18a3da4..2368db78ff 100644
--- a/programming_examples/vision/edge_detect/run_makefile.lit
+++ b/programming_examples/vision/edge_detect/run_makefile.lit
@@ -5,6 +5,4 @@
  //
  // RUN: make -f %S/Makefile clean
  // RUN: make -f %S/Makefile 
- // RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s
-  // CHECK: PASS!
-  
\ No newline at end of file
+ // RUN: %run_on_npu make -f %S/Makefile run
\ No newline at end of file
diff --git a/programming_examples/vision/edge_detect/run_makefile_alt.lit b/programming_examples/vision/edge_detect/run_makefile_alt.lit
index 9f5617f16c..19bd34a2d0 100644
--- a/programming_examples/vision/edge_detect/run_makefile_alt.lit
+++ b/programming_examples/vision/edge_detect/run_makefile_alt.lit
@@ -7,6 +7,5 @@
 // RUN: cd test_alt
 // RUN: make -f %S/Makefile clean
 // RUN: env use_alt=1 make -f %S/Makefile 
-// RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s
-// CHECK: PASS!
+// RUN: %run_on_npu make -f %S/Makefile run
   
\ No newline at end of file

From cbf116eb794b6493d3f693d086a97f3597e156a9 Mon Sep 17 00:00:00 2001
From: Joseph Melber <jgmelber@gmail.com>
Date: Wed, 4 Dec 2024 12:55:06 -0700
Subject: [PATCH 33/35] Apply suggestions from code review

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
 .../color_threshold/aie2_colorThreshold.py    | 47 +++++--------------
 .../vision/edge_detect/aie2_edgeDetect.py     |  3 +-
 .../vision/edge_detect/aie2_edgeDetect_alt.py |  7 ++-
 3 files changed, 18 insertions(+), 39 deletions(-)

diff --git a/programming_examples/vision/color_threshold/aie2_colorThreshold.py b/programming_examples/vision/color_threshold/aie2_colorThreshold.py
index c8eb7e8657..277221f309 100644
--- a/programming_examples/vision/color_threshold/aie2_colorThreshold.py
+++ b/programming_examples/vision/color_threshold/aie2_colorThreshold.py
@@ -46,24 +46,12 @@ def device_body():
         # AIE-array data movement with object fifos
 
         # Input RGBA broadcast + memtile for skip
-        inOOB_L3L2 = object_fifo(
-            "inOOB_L3L2", ShimTile, MemTile, 2, line_channels_ty
-        )
-        inOOB_L2L1_0 = object_fifo(
-            "inOOB_L2L1_0", MemTile, ComputeTile2, 2, line_ty
-        )
-        inOOB_L2L1_1 = object_fifo(
-            "inOOB_L2L1_1", MemTile, ComputeTile3, 2, line_ty
-        )
-        inOOB_L2L1_2 = object_fifo(
-            "inOOB_L2L1_2", MemTile, ComputeTile4, 2, line_ty
-        )
-        inOOB_L2L1_3 = object_fifo(
-            "inOOB_L2L1_3", MemTile, ComputeTile5, 2, line_ty
-        )
-        of_offsets = [
-            np.prod(np_ndarray_type_get_shape(line_ty)) * i for i in range(4)
-        ]
+        inOOB_L3L2 = object_fifo("inOOB_L3L2", ShimTile, MemTile, 2, line_channels_ty)
+        inOOB_L2L1_0 = object_fifo("inOOB_L2L1_0", MemTile, ComputeTile2, 2, line_ty)
+        inOOB_L2L1_1 = object_fifo("inOOB_L2L1_1", MemTile, ComputeTile3, 2, line_ty)
+        inOOB_L2L1_2 = object_fifo("inOOB_L2L1_2", MemTile, ComputeTile4, 2, line_ty)
+        inOOB_L2L1_3 = object_fifo("inOOB_L2L1_3", MemTile, ComputeTile5, 2, line_ty)
+        of_offsets = [np.prod(np_ndarray_type_get_shape(line_ty)) * i for i in range(4)]
         object_fifo_link(
             inOOB_L3L2,
             [inOOB_L2L1_0, inOOB_L2L1_1, inOOB_L2L1_2, inOOB_L2L1_3],
@@ -72,21 +60,11 @@ def device_body():
         )
 
         # Output RGBA
-        outOOB_L2L3 = object_fifo(
-            "outOOB_L2L3", MemTile, ShimTile, 2, line_channels_ty
-        )
-        outOOB_L1L2_0 = object_fifo(
-            "outOOB_L1L2_0", ComputeTile2, MemTile, 2, line_ty
-        )
-        outOOB_L1L2_1 = object_fifo(
-            "outOOB_L1L2_1", ComputeTile3, MemTile, 2, line_ty
-        )
-        outOOB_L1L2_2 = object_fifo(
-            "outOOB_L1L2_2", ComputeTile4, MemTile, 2, line_ty
-        )
-        outOOB_L1L2_3 = object_fifo(
-            "outOOB_L1L2_3", ComputeTile5, MemTile, 2, line_ty
-        )
+        outOOB_L2L3 = object_fifo("outOOB_L2L3", MemTile, ShimTile, 2, line_channels_ty)
+        outOOB_L1L2_0 = object_fifo("outOOB_L1L2_0", ComputeTile2, MemTile, 2, line_ty)
+        outOOB_L1L2_1 = object_fifo("outOOB_L1L2_1", ComputeTile3, MemTile, 2, line_ty)
+        outOOB_L1L2_2 = object_fifo("outOOB_L1L2_2", ComputeTile4, MemTile, 2, line_ty)
+        outOOB_L1L2_3 = object_fifo("outOOB_L1L2_3", ComputeTile5, MemTile, 2, line_ty)
         object_fifo_link(
             [outOOB_L1L2_0, outOOB_L1L2_1, outOOB_L1L2_2, outOOB_L1L2_3],
             outOOB_L2L3,
@@ -256,6 +234,7 @@ def sequence(inTensor, notUsed, outTensor):
             )
             dma_wait(inOOB_L3L2, outOOB_L2L3)
 
+
 try:
     device_name = str(sys.argv[1])
     if device_name == "npu":
@@ -269,7 +248,7 @@ def sequence(inTensor, notUsed, outTensor):
 except ValueError:
     print("Argument has inappropriate value")
 with mlir_mod_ctx() as ctx:
-    #print(ctx.module.operation.verify())
+    # print(ctx.module.operation.verify())
     color_threshold(dev, width, height)
     print(ctx.module)
 
diff --git a/programming_examples/vision/edge_detect/aie2_edgeDetect.py b/programming_examples/vision/edge_detect/aie2_edgeDetect.py
index 23565c8b47..4efc78dbf7 100644
--- a/programming_examples/vision/edge_detect/aie2_edgeDetect.py
+++ b/programming_examples/vision/edge_detect/aie2_edgeDetect.py
@@ -274,6 +274,7 @@ def sequence(I, B, O):
             # outOF_L2L3 will only complete after inOF_L3L2 completes, so we just wait on outOF_L2L3 instead of all
             dma_wait(outOF_L2L3)
 
+
 try:
     device_name = str(sys.argv[1])
     if device_name == "npu":
@@ -287,6 +288,6 @@ def sequence(I, B, O):
 except ValueError:
     print("Argument has inappropriate value")
 with mlir_mod_ctx() as ctx:
-    #print(ctx.module.operation.verify())
+    # print(ctx.module.operation.verify())
     edge_detect(dev, width, height)
     print(ctx.module)
diff --git a/programming_examples/vision/edge_detect/aie2_edgeDetect_alt.py b/programming_examples/vision/edge_detect/aie2_edgeDetect_alt.py
index 75b22602be..75a34e5533 100644
--- a/programming_examples/vision/edge_detect/aie2_edgeDetect_alt.py
+++ b/programming_examples/vision/edge_detect/aie2_edgeDetect_alt.py
@@ -259,9 +259,7 @@ def core_body():
         # To/from AIE-array data movement
         @runtime_sequence(tensor_ty, tensor_16x16_ty, tensor_ty)
         def sequence(I, B, O):
-            in_task = shim_dma_single_bd_task(
-                inOF_L3L2, I, sizes=[1, 1, 1, tensorSize]
-            )
+            in_task = shim_dma_single_bd_task(inOF_L3L2, I, sizes=[1, 1, 1, tensorSize])
             out_task = shim_dma_single_bd_task(
                 outOF_L2L3,
                 O,
@@ -273,6 +271,7 @@ def sequence(I, B, O):
             dma_await_task(out_task)
             dma_free_task(in_task)
 
+
 try:
     device_name = str(sys.argv[1])
     if device_name == "npu":
@@ -286,6 +285,6 @@ def sequence(I, B, O):
 except ValueError:
     print("Argument has inappropriate value")
 with mlir_mod_ctx() as ctx:
-    #print(ctx.module.operation.verify())
+    # print(ctx.module.operation.verify())
     edge_detect(dev, width, height)
     print(ctx.module)

From d8982633409c52be758a0145b9480bca1fde48ed Mon Sep 17 00:00:00 2001
From: Joseph Melber <jgmelber@gmail.com>
Date: Wed, 4 Dec 2024 12:55:55 -0700
Subject: [PATCH 34/35] Apply code review

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
 .../vision/color_threshold/aie2_colorThreshold.py                | 1 -
 1 file changed, 1 deletion(-)

diff --git a/programming_examples/vision/color_threshold/aie2_colorThreshold.py b/programming_examples/vision/color_threshold/aie2_colorThreshold.py
index 277221f309..1bd250c281 100644
--- a/programming_examples/vision/color_threshold/aie2_colorThreshold.py
+++ b/programming_examples/vision/color_threshold/aie2_colorThreshold.py
@@ -251,4 +251,3 @@ def sequence(inTensor, notUsed, outTensor):
     # print(ctx.module.operation.verify())
     color_threshold(dev, width, height)
     print(ctx.module)
-

From 9278ee05d431ab4f207c050968ea6ba2e7a8d473 Mon Sep 17 00:00:00 2001
From: Joseph Melber <jgmelber@gmail.com>
Date: Wed, 4 Dec 2024 12:56:31 -0700
Subject: [PATCH 35/35] Apply suggestions from code review

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
 .../aie2_colorThreshold_alt.py                | 48 +++++--------------
 1 file changed, 13 insertions(+), 35 deletions(-)

diff --git a/programming_examples/vision/color_threshold/aie2_colorThreshold_alt.py b/programming_examples/vision/color_threshold/aie2_colorThreshold_alt.py
index b676b9168e..84e2de3895 100644
--- a/programming_examples/vision/color_threshold/aie2_colorThreshold_alt.py
+++ b/programming_examples/vision/color_threshold/aie2_colorThreshold_alt.py
@@ -46,24 +46,12 @@ def device_body():
         # AIE-array data movement with object fifos
 
         # Input RGBA broadcast + memtile for skip
-        inOOB_L3L2 = object_fifo(
-            "inOOB_L3L2", ShimTile, MemTile, 2, line_channels_ty
-        )
-        inOOB_L2L1_0 = object_fifo(
-            "inOOB_L2L1_0", MemTile, ComputeTile2, 2, line_ty
-        )
-        inOOB_L2L1_1 = object_fifo(
-            "inOOB_L2L1_1", MemTile, ComputeTile3, 2, line_ty
-        )
-        inOOB_L2L1_2 = object_fifo(
-            "inOOB_L2L1_2", MemTile, ComputeTile4, 2, line_ty
-        )
-        inOOB_L2L1_3 = object_fifo(
-            "inOOB_L2L1_3", MemTile, ComputeTile5, 2, line_ty
-        )
-        of_offsets = [
-            np.prod(np_ndarray_type_get_shape(line_ty)) * i for i in range(4)
-        ]
+        inOOB_L3L2 = object_fifo("inOOB_L3L2", ShimTile, MemTile, 2, line_channels_ty)
+        inOOB_L2L1_0 = object_fifo("inOOB_L2L1_0", MemTile, ComputeTile2, 2, line_ty)
+        inOOB_L2L1_1 = object_fifo("inOOB_L2L1_1", MemTile, ComputeTile3, 2, line_ty)
+        inOOB_L2L1_2 = object_fifo("inOOB_L2L1_2", MemTile, ComputeTile4, 2, line_ty)
+        inOOB_L2L1_3 = object_fifo("inOOB_L2L1_3", MemTile, ComputeTile5, 2, line_ty)
+        of_offsets = [np.prod(np_ndarray_type_get_shape(line_ty)) * i for i in range(4)]
         object_fifo_link(
             inOOB_L3L2,
             [inOOB_L2L1_0, inOOB_L2L1_1, inOOB_L2L1_2, inOOB_L2L1_3],
@@ -72,21 +60,11 @@ def device_body():
         )
 
         # Output RGBA
-        outOOB_L2L3 = object_fifo(
-            "outOOB_L2L3", MemTile, ShimTile, 2, line_channels_ty
-        )
-        outOOB_L1L2_0 = object_fifo(
-            "outOOB_L1L2_0", ComputeTile2, MemTile, 2, line_ty
-        )
-        outOOB_L1L2_1 = object_fifo(
-            "outOOB_L1L2_1", ComputeTile3, MemTile, 2, line_ty
-        )
-        outOOB_L1L2_2 = object_fifo(
-            "outOOB_L1L2_2", ComputeTile4, MemTile, 2, line_ty
-        )
-        outOOB_L1L2_3 = object_fifo(
-            "outOOB_L1L2_3", ComputeTile5, MemTile, 2, line_ty
-        )
+        outOOB_L2L3 = object_fifo("outOOB_L2L3", MemTile, ShimTile, 2, line_channels_ty)
+        outOOB_L1L2_0 = object_fifo("outOOB_L1L2_0", ComputeTile2, MemTile, 2, line_ty)
+        outOOB_L1L2_1 = object_fifo("outOOB_L1L2_1", ComputeTile3, MemTile, 2, line_ty)
+        outOOB_L1L2_2 = object_fifo("outOOB_L1L2_2", ComputeTile4, MemTile, 2, line_ty)
+        outOOB_L1L2_3 = object_fifo("outOOB_L1L2_3", ComputeTile5, MemTile, 2, line_ty)
         object_fifo_link(
             [outOOB_L1L2_0, outOOB_L1L2_1, outOOB_L1L2_2, outOOB_L1L2_3],
             outOOB_L2L3,
@@ -254,6 +232,7 @@ def sequence(inTensor, notUsed, outTensor):
             dma_start_task(in_task, out_task)
             dma_await_task(in_task, out_task)
 
+
 try:
     device_name = str(sys.argv[1])
     if device_name == "npu":
@@ -267,7 +246,6 @@ def sequence(inTensor, notUsed, outTensor):
 except ValueError:
     print("Argument has inappropriate value")
 with mlir_mod_ctx() as ctx:
-    #print(ctx.module.operation.verify())
+    # print(ctx.module.operation.verify())
     color_threshold(dev, width, height)
     print(ctx.module)
-