From c7fdd9e85e7bb45b2a925e38e9ffb23994227a6a Mon Sep 17 00:00:00 2001 From: Joseph Melber Date: Thu, 10 Oct 2024 14:12:52 -0600 Subject: [PATCH 01/35] Strix passthrough --- .../basic/passthrough_kernel/Makefile | 12 ++++++++++-- .../basic/passthrough_kernel/aie2.py | 14 ++++++++++++-- programming_examples/makefile-common | 1 + 3 files changed, 23 insertions(+), 4 deletions(-) diff --git a/programming_examples/basic/passthrough_kernel/Makefile b/programming_examples/basic/passthrough_kernel/Makefile index 361cfb2d3a..95aea8ad8c 100644 --- a/programming_examples/basic/passthrough_kernel/Makefile +++ b/programming_examples/basic/passthrough_kernel/Makefile @@ -12,6 +12,7 @@ srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) include ${srcdir}/../../makefile-common +devicename = npu targetname = passThroughKernel VPATH := ${srcdir}/../../../aie_kernels/generic data_size = 4096 @@ -24,7 +25,7 @@ all: build/final_${data_size}.xclbin build/aie2_lineBased_8b_${data_size}.mlir: ${srcdir}/aie2.py mkdir -p ${@D} - python3 $< ${data_size} 0 > $@ + python3 $< ${devicename} ${data_size} 0 > $@ build/aie_trace__lineBased_8b_${data_size}.mlir: ${srcdir}/aie2.py mkdir -p ${@D} @@ -32,7 +33,14 @@ build/aie_trace__lineBased_8b_${data_size}.mlir: ${srcdir}/aie2.py build/passThrough.cc.o: passThrough.cc mkdir -p ${@D} - cd ${@D} && ${PEANO_INSTALL_DIR}/bin/clang++ ${PEANOWRAP2_FLAGS} -DBIT_WIDTH=8 -c $< -o ${@F} + ifeq ($(devicename), npu) + cd ${@D} && ${PEANO_INSTALL_DIR}/bin/clang++ ${PEANOWRAP2_FLAGS} -DBIT_WIDTH=8 -c $< -o ${@F} + else ifeq ($(devicename), npu2) + cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2P_FLAGS} -DBIT_WIDTH=8 -c $< -o ${@F} + else + echo "Device type not supported" + endif + endif build/final_${data_size}.xclbin: build/aie2_lineBased_8b_${data_size}.mlir build/passThrough.cc.o mkdir -p ${@D} diff --git a/programming_examples/basic/passthrough_kernel/aie2.py b/programming_examples/basic/passthrough_kernel/aie2.py index 2bfdbb3066..c86d432622 100644 --- a/programming_examples/basic/passthrough_kernel/aie2.py +++ b/programming_examples/basic/passthrough_kernel/aie2.py @@ -19,8 +19,18 @@ def passthroughKernel(vector_size, trace_size): N = vector_size lineWidthInBytes = N // 4 # chop input in 4 sub-tensors - - @device(AIEDevice.npu1_1col) + + if len(sys.argv) != 3: + raise ValueError("[ERROR] Need command line arguments (Device name, Vector size)") + + if sys.argv[1] == "npu": + dev = AIEDevice.npu1_1col + elif sys.argv[1] == "npu2": + dev = AIEDevice.npu2 + else: + raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1])) + + @device(dev) def device_body(): # define types vector_ty = np.ndarray[(N,), np.dtype[np.uint8]] diff --git a/programming_examples/makefile-common b/programming_examples/makefile-common index 4523fe7c5f..a8ee6431ca 100644 --- a/programming_examples/makefile-common +++ b/programming_examples/makefile-common @@ -11,6 +11,7 @@ CHESS_FLAGS = -P ${AIE_INCLUDE_DIR} CHESSCCWRAP1_FLAGS = aie -I ${AIETOOLS_DIR}/include CHESSCCWRAP2_FLAGS = aie2 -I ${AIETOOLS_DIR}/include +CHESSCCWRAP2P_FLAGS = aie2p -I ${AIETOOLS_DIR}/include PEANOWRAP2_FLAGS = -O2 -v -std=c++20 --target=aie2-none-unknown-elf ${WARNING_FLAGS} -DNDEBUG -I ${AIETOOLS_DIR}/include TEST_POWERSHELL := $(shell command -v powershell.exe >/dev/null 2>&1 && echo yes || echo no) From a287f46e2d298b8be79a197aae11efd5d7247e35 Mon Sep 17 00:00:00 2001 From: Joseph Melber Date: Fri, 11 Oct 2024 10:09:57 -0600 Subject: [PATCH 02/35] Fixup test --- .../basic/passthrough_kernel/Makefile | 21 ++++++++-------- .../basic/passthrough_kernel/aie2.py | 25 ++++++++----------- .../passthrough_kernel/run_strix_makefile.lit | 10 ++++++++ 3 files changed, 31 insertions(+), 25 deletions(-) mode change 100644 => 100755 programming_examples/basic/passthrough_kernel/Makefile mode change 100644 => 100755 programming_examples/basic/passthrough_kernel/aie2.py create mode 100755 programming_examples/basic/passthrough_kernel/run_strix_makefile.lit diff --git a/programming_examples/basic/passthrough_kernel/Makefile b/programming_examples/basic/passthrough_kernel/Makefile old mode 100644 new mode 100755 index 95aea8ad8c..5f3f234acf --- a/programming_examples/basic/passthrough_kernel/Makefile +++ b/programming_examples/basic/passthrough_kernel/Makefile @@ -12,7 +12,7 @@ srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) include ${srcdir}/../../makefile-common -devicename = npu +device = npu targetname = passThroughKernel VPATH := ${srcdir}/../../../aie_kernels/generic data_size = 4096 @@ -25,22 +25,21 @@ all: build/final_${data_size}.xclbin build/aie2_lineBased_8b_${data_size}.mlir: ${srcdir}/aie2.py mkdir -p ${@D} - python3 $< ${devicename} ${data_size} 0 > $@ + python3 $< ${device} ${data_size} 0 > $@ build/aie_trace__lineBased_8b_${data_size}.mlir: ${srcdir}/aie2.py mkdir -p ${@D} - python3 $< ${data_size} ${trace_size} > $@ + python3 $< ${device} ${data_size} ${trace_size} > $@ build/passThrough.cc.o: passThrough.cc mkdir -p ${@D} - ifeq ($(devicename), npu) - cd ${@D} && ${PEANO_INSTALL_DIR}/bin/clang++ ${PEANOWRAP2_FLAGS} -DBIT_WIDTH=8 -c $< -o ${@F} - else ifeq ($(devicename), npu2) - cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2P_FLAGS} -DBIT_WIDTH=8 -c $< -o ${@F} - else - echo "Device type not supported" - endif - endif +ifeq ($(device),npu) + cd ${@D} && ${PEANO_INSTALL_DIR}/bin/clang++ ${PEANOWRAP2_FLAGS} -DBIT_WIDTH=8 -c $< -o ${@F} +else ifeq ($(device),npu2) + cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2P_FLAGS} -DBIT_WIDTH=8 -c $< -o ${@F} +else + echo "Device type not supported" +endif build/final_${data_size}.xclbin: build/aie2_lineBased_8b_${data_size}.mlir build/passThrough.cc.o mkdir -p ${@D} diff --git a/programming_examples/basic/passthrough_kernel/aie2.py b/programming_examples/basic/passthrough_kernel/aie2.py old mode 100644 new mode 100755 index c86d432622..6ea3c4bb72 --- a/programming_examples/basic/passthrough_kernel/aie2.py +++ b/programming_examples/basic/passthrough_kernel/aie2.py @@ -16,20 +16,10 @@ import aie.utils.trace as trace_utils -def passthroughKernel(vector_size, trace_size): +def passthroughKernel(dev, vector_size, trace_size): N = vector_size lineWidthInBytes = N // 4 # chop input in 4 sub-tensors - if len(sys.argv) != 3: - raise ValueError("[ERROR] Need command line arguments (Device name, Vector size)") - - if sys.argv[1] == "npu": - dev = AIEDevice.npu1_1col - elif sys.argv[1] == "npu2": - dev = AIEDevice.npu2 - else: - raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1])) - @device(dev) def device_body(): # define types @@ -95,13 +85,20 @@ def sequence(inTensor, outTensor, notUsed): try: - vector_size = int(sys.argv[1]) + device_name = str(sys.argv[1]) + if device_name == "npu": + dev = AIEDevice.npu1_1col + elif device_name == "npu2": + dev = AIEDevice.npu2 + else: + raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1])) + vector_size = int(sys.argv[2]) if vector_size % 64 != 0 or vector_size < 512: print("Vector size must be a multiple of 64 and greater than or equal to 512") raise ValueError - trace_size = 0 if (len(sys.argv) != 3) else int(sys.argv[2]) + trace_size = 0 if (len(sys.argv) != 4) else int(sys.argv[3]) except ValueError: print("Argument has inappropriate value") with mlir_mod_ctx() as ctx: - passthroughKernel(vector_size, trace_size) + passthroughKernel(dev, vector_size, trace_size) print(ctx.module) diff --git a/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit b/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit new file mode 100755 index 0000000000..1818ba9813 --- /dev/null +++ b/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit @@ -0,0 +1,10 @@ +// (c) Copyright 2024 Advanced Micro Devices, Inc. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai, chess +// +// RUN: make -f %S/Makefile clean +// RUN: make -f %S/Makefile device=npu2 +// RUN: %run_on_npu make -f %S/Makefile run device=npu2 | FileCheck %s +// CHECK: Running... +// CHECK: PASS! From 133ce6c536a198b9677fbe6f3d92e49eb2fc098a Mon Sep 17 00:00:00 2001 From: Joseph Melber Date: Thu, 24 Oct 2024 08:54:06 -0600 Subject: [PATCH 03/35] Dog Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- programming_examples/basic/passthrough_kernel/aie2.py | 1 - 1 file changed, 1 deletion(-) diff --git a/programming_examples/basic/passthrough_kernel/aie2.py b/programming_examples/basic/passthrough_kernel/aie2.py index 6ea3c4bb72..3468f7e944 100755 --- a/programming_examples/basic/passthrough_kernel/aie2.py +++ b/programming_examples/basic/passthrough_kernel/aie2.py @@ -19,7 +19,6 @@ def passthroughKernel(dev, vector_size, trace_size): N = vector_size lineWidthInBytes = N // 4 # chop input in 4 sub-tensors - @device(dev) def device_body(): # define types From fdbde789b3b3bc4dd50cfc96132fc27fc088dc1f Mon Sep 17 00:00:00 2001 From: Joseph Melber Date: Thu, 24 Oct 2024 08:57:48 -0600 Subject: [PATCH 04/35] Apply suggestions from code review Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- programming_examples/basic/passthrough_kernel/aie2.py | 1 + 1 file changed, 1 insertion(+) diff --git a/programming_examples/basic/passthrough_kernel/aie2.py b/programming_examples/basic/passthrough_kernel/aie2.py index 3468f7e944..b225295898 100755 --- a/programming_examples/basic/passthrough_kernel/aie2.py +++ b/programming_examples/basic/passthrough_kernel/aie2.py @@ -19,6 +19,7 @@ def passthroughKernel(dev, vector_size, trace_size): N = vector_size lineWidthInBytes = N // 4 # chop input in 4 sub-tensors + @device(dev) def device_body(): # define types From a576bd2dc56d1cd7add925968f92c1fc921e72b9 Mon Sep 17 00:00:00 2001 From: Joseph Melber Date: Thu, 24 Oct 2024 19:50:11 -0600 Subject: [PATCH 05/35] Fixup aiecc.py for npu/npu2 --- programming_examples/basic/passthrough_kernel/Makefile | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/programming_examples/basic/passthrough_kernel/Makefile b/programming_examples/basic/passthrough_kernel/Makefile index 5f3f234acf..7e74ef39c1 100755 --- a/programming_examples/basic/passthrough_kernel/Makefile +++ b/programming_examples/basic/passthrough_kernel/Makefile @@ -43,14 +43,21 @@ endif build/final_${data_size}.xclbin: build/aie2_lineBased_8b_${data_size}.mlir build/passThrough.cc.o mkdir -p ${@D} +ifeq ($(device),npu) cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host \ --no-xchesscc --no-xbridge \ --xclbin-name=${@F} --npu-insts-name=insts_${data_size}.txt $(<:%=../%) +else + cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host \ + --xclbin-name=${@F} --npu-insts-name=insts_${data_size}.txt $(<:%=../%) +endif build/final_trace_${data_size}.xclbin: build/aie2_lineBased_8b_${data_size}.mlir build/passThrough.cc.o mkdir -p ${@D} cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host \ +ifeq ($(device),npu) --no-xchesscc --no-xbridge \ +endif --xclbin-name=${@F} --npu-insts-name=insts_${data_size}.txt $(<:%=../%) ${targetname}_${data_size}.exe: ${srcdir}/test.cpp From 83d705139c44dacedccbd55958afd557877a7e22 Mon Sep 17 00:00:00 2001 From: Joseph Melber Date: Thu, 24 Oct 2024 19:57:14 -0600 Subject: [PATCH 06/35] Try enabling run_on_npu2 --- .../passthrough_kernel/run_strix_makefile.lit | 2 +- programming_examples/lit.cfg.py | 18 +++++++++++++----- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit b/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit index 1818ba9813..81c8264aa0 100755 --- a/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit +++ b/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit @@ -5,6 +5,6 @@ // // RUN: make -f %S/Makefile clean // RUN: make -f %S/Makefile device=npu2 -// RUN: %run_on_npu make -f %S/Makefile run device=npu2 | FileCheck %s +// RUN: %run_on_npu2 make -f %S/Makefile run device=npu2 | FileCheck %s // CHECK: Running... // CHECK: PASS! diff --git a/programming_examples/lit.cfg.py b/programming_examples/lit.cfg.py index 97e1246045..2368b262ea 100755 --- a/programming_examples/lit.cfg.py +++ b/programming_examples/lit.cfg.py @@ -46,6 +46,7 @@ llvm_config.with_environment("PYTHONPATH", os.path.join(config.aie_obj_root, "python")) run_on_npu = "echo" +run_on_npu2 = "echo" xrt_flags = "" # Not using run_on_board anymore, need more specific per-platform commands @@ -131,18 +132,25 @@ result = result.stdout.decode("utf-8").split("\n") # Starting with Linux 6.8 the format is like "[0000:66:00.1] : RyzenAI-npu1" # Starting with Linux 6.10 the format is like "|[0000:41:00.1] ||RyzenAI-npu1 |" - p = re.compile(r"[\|]?(\[.+:.+:.+\]).+(Phoenix|RyzenAI-(npu\d))") + p = re.compile(r"[\|]?(\[.+:.+:.+\]).+(Phoenix|RyzenAI-(npu1\d))") + p2 = re.compile(r"[\|]?(\[.+:.+:.+\]).+(Phoenix|RyzenAI-(npu4\d))") for l in result: m = p.match(l) - if not m: + m2 = p2.match(l) + if not m and not m2: continue print("Found Ryzen AI device:", m.group(1)) if len(m.groups()) == 3: print("\tmodel:", m.group(3)) config.available_features.add("ryzen_ai") - run_on_npu = ( - f"flock /tmp/npu.lock {config.aie_src_root}/utils/run_on_npu.sh" - ) + if m: + run_on_npu = ( + f"flock /tmp/npu.lock {config.aie_src_root}/utils/run_on_npu.sh" + ) + if m2: + run_on_npu2 = ( + f"flock /tmp/npu.lock {config.aie_src_root}/utils/run_on_npu.sh" + ) break except: print("Failed to run xrt-smi") From 096d0bc1ceab45d3b1a9b54167faaab7a7a35d41 Mon Sep 17 00:00:00 2001 From: Joseph Melber Date: Thu, 24 Oct 2024 20:07:28 -0600 Subject: [PATCH 07/35] Apply suggestions from code review Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- programming_examples/lit.cfg.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/programming_examples/lit.cfg.py b/programming_examples/lit.cfg.py index 2368b262ea..07536d000b 100755 --- a/programming_examples/lit.cfg.py +++ b/programming_examples/lit.cfg.py @@ -144,13 +144,13 @@ print("\tmodel:", m.group(3)) config.available_features.add("ryzen_ai") if m: - run_on_npu = ( - f"flock /tmp/npu.lock {config.aie_src_root}/utils/run_on_npu.sh" - ) + run_on_npu = ( + f"flock /tmp/npu.lock {config.aie_src_root}/utils/run_on_npu.sh" + ) if m2: - run_on_npu2 = ( - f"flock /tmp/npu.lock {config.aie_src_root}/utils/run_on_npu.sh" - ) + run_on_npu2 = ( + f"flock /tmp/npu.lock {config.aie_src_root}/utils/run_on_npu.sh" + ) break except: print("Failed to run xrt-smi") From 6ea15fa88ab0e58f6022acdcb7c2211ac28554be Mon Sep 17 00:00:00 2001 From: Joseph Melber Date: Thu, 24 Oct 2024 20:24:49 -0600 Subject: [PATCH 08/35] Add lit fixup --- programming_examples/lit.cfg.py | 1 + programming_examples/lit.site.cfg.py.in | 2 ++ 2 files changed, 3 insertions(+) diff --git a/programming_examples/lit.cfg.py b/programming_examples/lit.cfg.py index 07536d000b..999351429b 100755 --- a/programming_examples/lit.cfg.py +++ b/programming_examples/lit.cfg.py @@ -159,6 +159,7 @@ print("xrt not found") config.substitutions.append(("%run_on_npu", run_on_npu)) +config.substitutions.append(("%run_on_npu2", run_on_npu2)) config.substitutions.append(("%xrt_flags", xrt_flags)) config.substitutions.append(("%XRT_DIR", config.xrt_dir)) config.environment["XRT_HACK_UNSECURE_LOADING_XCLBIN"] = "1" diff --git a/programming_examples/lit.site.cfg.py.in b/programming_examples/lit.site.cfg.py.in index 22a367d1fc..3ba7a457f1 100755 --- a/programming_examples/lit.site.cfg.py.in +++ b/programming_examples/lit.site.cfg.py.in @@ -69,6 +69,8 @@ if lit.util.pythonize_bool("@AIETools_AIE_FOUND@"): config.vitis_components.append("AIE") if lit.util.pythonize_bool("@AIETools_AIE2_FOUND@"): config.vitis_components.append("AIE2") +if lit.util.pythonize_bool("@AIETools_AIE2P_FOUND@"): + config.vitis_components.append("AIE2P") # Support substitution of the tools_dir with user parameters. This is # used when we can't determine the tool dir at configuration time. From d1f543f2957cd86ea0157cfd1cceb7050d13d18b Mon Sep 17 00:00:00 2001 From: Joseph Melber Date: Thu, 24 Oct 2024 20:29:27 -0600 Subject: [PATCH 09/35] Fixup lit cfg --- programming_examples/lit.cfg.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/programming_examples/lit.cfg.py b/programming_examples/lit.cfg.py index 999351429b..fcba808776 100755 --- a/programming_examples/lit.cfg.py +++ b/programming_examples/lit.cfg.py @@ -132,22 +132,20 @@ result = result.stdout.decode("utf-8").split("\n") # Starting with Linux 6.8 the format is like "[0000:66:00.1] : RyzenAI-npu1" # Starting with Linux 6.10 the format is like "|[0000:41:00.1] ||RyzenAI-npu1 |" - p = re.compile(r"[\|]?(\[.+:.+:.+\]).+(Phoenix|RyzenAI-(npu1\d))") - p2 = re.compile(r"[\|]?(\[.+:.+:.+\]).+(Phoenix|RyzenAI-(npu4\d))") + p = re.compile(r"[\|]?(\[.+:.+:.+\]).+(Phoenix|RyzenAI-(npu\d))") for l in result: m = p.match(l) - m2 = p2.match(l) - if not m and not m2: + if not m: continue print("Found Ryzen AI device:", m.group(1)) if len(m.groups()) == 3: print("\tmodel:", m.group(3)) config.available_features.add("ryzen_ai") - if m: + if m.group(3) == "npu1": run_on_npu = ( f"flock /tmp/npu.lock {config.aie_src_root}/utils/run_on_npu.sh" ) - if m2: + if m.group(3) == "npu4": run_on_npu2 = ( f"flock /tmp/npu.lock {config.aie_src_root}/utils/run_on_npu.sh" ) From 74c689cb5e7179d09d065e9234370c24909f1b49 Mon Sep 17 00:00:00 2001 From: Joseph Melber Date: Thu, 24 Oct 2024 20:51:03 -0600 Subject: [PATCH 10/35] Fix Makefile --- programming_examples/basic/passthrough_kernel/Makefile | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/programming_examples/basic/passthrough_kernel/Makefile b/programming_examples/basic/passthrough_kernel/Makefile index 7e74ef39c1..246da39f43 100755 --- a/programming_examples/basic/passthrough_kernel/Makefile +++ b/programming_examples/basic/passthrough_kernel/Makefile @@ -54,11 +54,15 @@ endif build/final_trace_${data_size}.xclbin: build/aie2_lineBased_8b_${data_size}.mlir build/passThrough.cc.o mkdir -p ${@D} - cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host \ ifeq ($(device),npu) + cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host \ --no-xchesscc --no-xbridge \ -endif --xclbin-name=${@F} --npu-insts-name=insts_${data_size}.txt $(<:%=../%) +else + cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host \ + --xclbin-name=${@F} --npu-insts-name=insts_${data_size}.txt $(<:%=../%) +endif + ${targetname}_${data_size}.exe: ${srcdir}/test.cpp rm -rf _build From 85e19eeee79ef748ee3b240b1b7b4de5846705ab Mon Sep 17 00:00:00 2001 From: Joseph Melber Date: Thu, 24 Oct 2024 20:55:22 -0600 Subject: [PATCH 11/35] Don't pollute source tree --- programming_examples/basic/passthrough_kernel/run_makefile.lit | 2 ++ .../basic/passthrough_kernel/run_strix_makefile.lit | 2 ++ 2 files changed, 4 insertions(+) diff --git a/programming_examples/basic/passthrough_kernel/run_makefile.lit b/programming_examples/basic/passthrough_kernel/run_makefile.lit index e8213c5d18..4619c215e6 100644 --- a/programming_examples/basic/passthrough_kernel/run_makefile.lit +++ b/programming_examples/basic/passthrough_kernel/run_makefile.lit @@ -3,6 +3,8 @@ // // REQUIRES: ryzen_ai, peano // +// RUN: mkdir -p test +// RUN: cd test // RUN: make -f %S/Makefile clean // RUN: make -f %S/Makefile // RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s diff --git a/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit b/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit index 81c8264aa0..456e08c547 100755 --- a/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit +++ b/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit @@ -3,6 +3,8 @@ // // REQUIRES: ryzen_ai, chess // +// RUN: mkdir -p test_stx +// RUN: cd test_stx // RUN: make -f %S/Makefile clean // RUN: make -f %S/Makefile device=npu2 // RUN: %run_on_npu2 make -f %S/Makefile run device=npu2 | FileCheck %s From 332a00ca6322903a8105ae4704a8190b49da7558 Mon Sep 17 00:00:00 2001 From: Joseph Melber Date: Thu, 24 Oct 2024 21:11:09 -0600 Subject: [PATCH 12/35] fix --- programming_examples/lit.cfg.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/programming_examples/lit.cfg.py b/programming_examples/lit.cfg.py index fcba808776..efa77009a7 100755 --- a/programming_examples/lit.cfg.py +++ b/programming_examples/lit.cfg.py @@ -141,11 +141,11 @@ if len(m.groups()) == 3: print("\tmodel:", m.group(3)) config.available_features.add("ryzen_ai") - if m.group(3) == "npu1": + if str(m.group(3)) == "npu1": run_on_npu = ( f"flock /tmp/npu.lock {config.aie_src_root}/utils/run_on_npu.sh" ) - if m.group(3) == "npu4": + if str(m.group(3)) == "npu4": run_on_npu2 = ( f"flock /tmp/npu.lock {config.aie_src_root}/utils/run_on_npu.sh" ) From 797fda9d5cf1a50f85e6705ba81280abf46645af Mon Sep 17 00:00:00 2001 From: Joseph Melber Date: Fri, 25 Oct 2024 13:56:14 -0600 Subject: [PATCH 13/35] Rely on test return value for PASS/fail --- .../basic/passthrough_kernel/run_strix_makefile.lit | 2 -- 1 file changed, 2 deletions(-) diff --git a/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit b/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit index 456e08c547..fc1d046b74 100755 --- a/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit +++ b/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit @@ -8,5 +8,3 @@ // RUN: make -f %S/Makefile clean // RUN: make -f %S/Makefile device=npu2 // RUN: %run_on_npu2 make -f %S/Makefile run device=npu2 | FileCheck %s -// CHECK: Running... -// CHECK: PASS! From 23ca4b5c2fc9aae05f6ec6b59d4e0cd8512b798d Mon Sep 17 00:00:00 2001 From: "jgmelber@gmail.com" Date: Mon, 2 Dec 2024 23:12:53 +0000 Subject: [PATCH 14/35] Generic mul kernel test on Strix --- aie_kernels/aie2/mul.cc | 6 +++--- .../basic/vector_scalar_mul/Makefile | 11 +++++++++-- .../basic/vector_scalar_mul/aie2.py | 19 +++++++++++++------ 3 files changed, 25 insertions(+), 11 deletions(-) diff --git a/aie_kernels/aie2/mul.cc b/aie_kernels/aie2/mul.cc index 5745f364dc..0c4290034c 100755 --- a/aie_kernels/aie2/mul.cc +++ b/aie_kernels/aie2/mul.cc @@ -8,9 +8,9 @@ // //===----------------------------------------------------------------------===// -#define __AIENGINE__ 2 -#define NOCPP -#define __AIEARCH__ 20 +//#define __AIENGINE__ 2 +//#define NOCPP +//#define __AIEARCH__ 20 #include #include diff --git a/programming_examples/basic/vector_scalar_mul/Makefile b/programming_examples/basic/vector_scalar_mul/Makefile index 6d8906bd70..2a4c890c02 100644 --- a/programming_examples/basic/vector_scalar_mul/Makefile +++ b/programming_examples/basic/vector_scalar_mul/Makefile @@ -14,6 +14,7 @@ include ${srcdir}/../../makefile-common VPATH := ${srcdir}/../../../aie_kernels/aie2 +device = npu targetname = vectorScalar data_size = 4096 trace_size = 8192 @@ -25,19 +26,25 @@ kristof: build/insts_${data_size}.txt build/%.o: %.cc mkdir -p ${@D} +ifeq ($(device),npu) ifeq ($(CHESS), true) cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -c $< -o ${@F}; else cd ${@D} && ${PEANO_INSTALL_DIR}/bin/clang++ ${PEANOWRAP2_FLAGS} -c $< -o ${@F}; endif +else ifeq ($(device),npu2) + cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2P_FLAGS} -DBIT_WIDTH=8 -c $< -o ${@F} +else + echo "Device type not supported" +endif build/aie_${data_size}.mlir: ${srcdir}/aie2.py mkdir -p ${@D} - python3 $< ${data_size} 0 > $@ + python3 $< ${device} ${data_size} 0 > $@ build/aie_trace_${data_size}.mlir: ${srcdir}/aie2.py mkdir -p ${@D} - python3 $< ${data_size} ${trace_size} > $@ + python3 $< ${device} ${data_size} ${trace_size} > $@ #build/insts_${data_size}.txt: build/final_${data_size}.xclbin build/final_${data_size}.xclbin: build/aie_${data_size}.mlir build/scale.o diff --git a/programming_examples/basic/vector_scalar_mul/aie2.py b/programming_examples/basic/vector_scalar_mul/aie2.py index 1d367e5aab..0617dafdad 100644 --- a/programming_examples/basic/vector_scalar_mul/aie2.py +++ b/programming_examples/basic/vector_scalar_mul/aie2.py @@ -16,7 +16,7 @@ import aie.utils.trace as trace_utils -def my_vector_scalar(vector_size, trace_size): +def my_vector_scalar(dev, vector_size, trace_size): N = vector_size N_in_bytes = N * 2 N_div_n = 4 # chop input vector into 4 sub-vectors @@ -26,7 +26,7 @@ def my_vector_scalar(vector_size, trace_size): vectorized = True - @device(AIEDevice.npu1_1col) + @device(dev) def device_body(): tensor_ty = np.ndarray[(N,), np.dtype[np.int16]] tile_ty = np.ndarray[(n,), np.dtype[np.int16]] @@ -93,13 +93,20 @@ def sequence(A, F, C): try: - vector_size = int(sys.argv[1]) + device_name = str(sys.argv[1]) + if device_name == "npu": + dev = AIEDevice.npu1_1col + elif device_name == "npu2": + dev = AIEDevice.npu2 + else: + raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1])) + vector_size = int(sys.argv[2]) if vector_size % 64 != 0 or vector_size < 512: print("Vector size must be a multiple of 64 and greater than or equal to 512") raise ValueError - trace_size = 0 if (len(sys.argv) != 3) else int(sys.argv[2]) + trace_size = 0 if (len(sys.argv) != 4) else int(sys.argv[3]) except ValueError: print("Argument has inappropriate value") with mlir_mod_ctx() as ctx: - my_vector_scalar(vector_size, trace_size) - print(ctx.module) + my_vector_scalar(dev, vector_size, trace_size) +print(ctx.module) From 28c52f0e665c0a8b84c5bbad2bb7e82db144617d Mon Sep 17 00:00:00 2001 From: Joseph Melber Date: Mon, 2 Dec 2024 16:21:49 -0700 Subject: [PATCH 15/35] Update aie_kernels/aie2/mul.cc Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- aie_kernels/aie2/mul.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/aie_kernels/aie2/mul.cc b/aie_kernels/aie2/mul.cc index 0c4290034c..a028861c20 100755 --- a/aie_kernels/aie2/mul.cc +++ b/aie_kernels/aie2/mul.cc @@ -8,9 +8,9 @@ // //===----------------------------------------------------------------------===// -//#define __AIENGINE__ 2 -//#define NOCPP -//#define __AIEARCH__ 20 +// #define __AIENGINE__ 2 +// #define NOCPP +// #define __AIEARCH__ 20 #include #include From 797739387f2580c38f40ff557de8c00e61062f0f Mon Sep 17 00:00:00 2001 From: Joseph Melber Date: Tue, 3 Dec 2024 11:09:13 -0700 Subject: [PATCH 16/35] Update alt files --- .../basic/passthrough_kernel/aie2_alt.py | 17 ++++++++++++----- .../basic/vector_scalar_mul/aie2_alt.py | 19 +++++++++++++------ 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/programming_examples/basic/passthrough_kernel/aie2_alt.py b/programming_examples/basic/passthrough_kernel/aie2_alt.py index f41965455c..b2835ff997 100644 --- a/programming_examples/basic/passthrough_kernel/aie2_alt.py +++ b/programming_examples/basic/passthrough_kernel/aie2_alt.py @@ -16,11 +16,11 @@ import aie.utils.trace as trace_utils -def passthroughKernel(vector_size, trace_size): +def passthroughKernel(dev, vector_size, trace_size): N = vector_size lineWidthInBytes = N // 4 # chop input in 4 sub-tensors - @device(AIEDevice.npu1_1col) + @device(dev) def device_body(): # define types vector_ty = np.ndarray[(N,), np.dtype[np.uint8]] @@ -79,13 +79,20 @@ def sequence(inTensor, outTensor, notUsed): try: - vector_size = int(sys.argv[1]) + device_name = str(sys.argv[1]) + if device_name == "npu": + dev = AIEDevice.npu1_1col + elif device_name == "npu2": + dev = AIEDevice.npu2 + else: + raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1])) + vector_size = int(sys.argv[2]) if vector_size % 64 != 0 or vector_size < 512: print("Vector size must be a multiple of 64 and greater than or equal to 512") raise ValueError - trace_size = 0 if (len(sys.argv) != 3) else int(sys.argv[2]) + trace_size = 0 if (len(sys.argv) != 4) else int(sys.argv[3]) except ValueError: print("Argument has inappropriate value") with mlir_mod_ctx() as ctx: - passthroughKernel(vector_size, trace_size) + passthroughKernel(dev, vector_size, trace_size) print(ctx.module) diff --git a/programming_examples/basic/vector_scalar_mul/aie2_alt.py b/programming_examples/basic/vector_scalar_mul/aie2_alt.py index 335e966745..6c52fc1c21 100644 --- a/programming_examples/basic/vector_scalar_mul/aie2_alt.py +++ b/programming_examples/basic/vector_scalar_mul/aie2_alt.py @@ -16,7 +16,7 @@ import aie.utils.trace as trace_utils -def my_vector_scalar(vector_size, trace_size): +def my_vector_scalar(dev, vector_size, trace_size): N = vector_size N_in_bytes = N * 2 N_div_n = 4 # chop input vector into 4 sub-vectors @@ -26,7 +26,7 @@ def my_vector_scalar(vector_size, trace_size): vectorized = True - @device(AIEDevice.npu1_1col) + @device(dev) def device_body(): tensor_ty = np.ndarray[(N,), np.dtype[np.int16]] tile_ty = np.ndarray[(n,), np.dtype[np.int16]] @@ -97,13 +97,20 @@ def sequence(A, F, C): try: - vector_size = int(sys.argv[1]) + device_name = str(sys.argv[1]) + if device_name == "npu": + dev = AIEDevice.npu1_1col + elif device_name == "npu2": + dev = AIEDevice.npu2 + else: + raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1])) + vector_size = int(sys.argv[2]) if vector_size % 64 != 0 or vector_size < 512: print("Vector size must be a multiple of 64 and greater than or equal to 512") raise ValueError - trace_size = 0 if (len(sys.argv) != 3) else int(sys.argv[2]) + trace_size = 0 if (len(sys.argv) != 4) else int(sys.argv[3]) except ValueError: print("Argument has inappropriate value") with mlir_mod_ctx() as ctx: - my_vector_scalar(vector_size, trace_size) - print(ctx.module) + my_vector_scalar(dev, vector_size, trace_size) +print(ctx.module) From 9caaef3d58c0cef4d1f6b27591f4a7987af03e6e Mon Sep 17 00:00:00 2001 From: Joseph Melber Date: Tue, 3 Dec 2024 11:11:03 -0700 Subject: [PATCH 17/35] Try with words --- .../basic/passthrough_kernel/run_strix_makefile.lit | 2 +- programming_examples/lit.cfg.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit b/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit index fc1d046b74..91437d212e 100755 --- a/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit +++ b/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit @@ -7,4 +7,4 @@ // RUN: cd test_stx // RUN: make -f %S/Makefile clean // RUN: make -f %S/Makefile device=npu2 -// RUN: %run_on_npu2 make -f %S/Makefile run device=npu2 | FileCheck %s +// RUN: %run_on_npu_two make -f %S/Makefile run device=npu2 | FileCheck %s diff --git a/programming_examples/lit.cfg.py b/programming_examples/lit.cfg.py index efa77009a7..b796fff94c 100755 --- a/programming_examples/lit.cfg.py +++ b/programming_examples/lit.cfg.py @@ -46,7 +46,7 @@ llvm_config.with_environment("PYTHONPATH", os.path.join(config.aie_obj_root, "python")) run_on_npu = "echo" -run_on_npu2 = "echo" +run_on_npu_two = "echo" xrt_flags = "" # Not using run_on_board anymore, need more specific per-platform commands @@ -146,7 +146,7 @@ f"flock /tmp/npu.lock {config.aie_src_root}/utils/run_on_npu.sh" ) if str(m.group(3)) == "npu4": - run_on_npu2 = ( + run_on_npu_two = ( f"flock /tmp/npu.lock {config.aie_src_root}/utils/run_on_npu.sh" ) break @@ -157,7 +157,7 @@ print("xrt not found") config.substitutions.append(("%run_on_npu", run_on_npu)) -config.substitutions.append(("%run_on_npu2", run_on_npu2)) +config.substitutions.append(("%run_on_npu_two", run_on_npu_two)) config.substitutions.append(("%xrt_flags", xrt_flags)) config.substitutions.append(("%XRT_DIR", config.xrt_dir)) config.environment["XRT_HACK_UNSECURE_LOADING_XCLBIN"] = "1" From 8a37de38a08c8d03b39439ea14079d797a8002e1 Mon Sep 17 00:00:00 2001 From: Joseph Melber Date: Tue, 3 Dec 2024 11:22:18 -0700 Subject: [PATCH 18/35] Try reordering --- .../basic/passthrough_kernel/run_strix_makefile.lit | 2 +- programming_examples/lit.cfg.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit b/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit index 91437d212e..6b1aef3a70 100755 --- a/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit +++ b/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit @@ -7,4 +7,4 @@ // RUN: cd test_stx // RUN: make -f %S/Makefile clean // RUN: make -f %S/Makefile device=npu2 -// RUN: %run_on_npu_two make -f %S/Makefile run device=npu2 | FileCheck %s +// RUN: %run_on_2npu make -f %S/Makefile run device=npu2 | FileCheck %s diff --git a/programming_examples/lit.cfg.py b/programming_examples/lit.cfg.py index b796fff94c..28a782d51c 100755 --- a/programming_examples/lit.cfg.py +++ b/programming_examples/lit.cfg.py @@ -46,7 +46,7 @@ llvm_config.with_environment("PYTHONPATH", os.path.join(config.aie_obj_root, "python")) run_on_npu = "echo" -run_on_npu_two = "echo" +run_on_2npu = "echo" xrt_flags = "" # Not using run_on_board anymore, need more specific per-platform commands @@ -146,7 +146,7 @@ f"flock /tmp/npu.lock {config.aie_src_root}/utils/run_on_npu.sh" ) if str(m.group(3)) == "npu4": - run_on_npu_two = ( + run_on_2npu = ( f"flock /tmp/npu.lock {config.aie_src_root}/utils/run_on_npu.sh" ) break @@ -157,7 +157,7 @@ print("xrt not found") config.substitutions.append(("%run_on_npu", run_on_npu)) -config.substitutions.append(("%run_on_npu_two", run_on_npu_two)) +config.substitutions.append(("%run_on_2npu", run_on_2npu)) config.substitutions.append(("%xrt_flags", xrt_flags)) config.substitutions.append(("%XRT_DIR", config.xrt_dir)) config.environment["XRT_HACK_UNSECURE_LOADING_XCLBIN"] = "1" From 9d321487447f9a07c79759783b41c8e68b170a10 Mon Sep 17 00:00:00 2001 From: Joseph Melber Date: Tue, 3 Dec 2024 14:20:15 -0700 Subject: [PATCH 19/35] Remove FileCheck and use return --- .../basic/passthrough_kernel/run_makefile.lit | 6 ++---- .../basic/passthrough_kernel/run_makefile_alt.lit | 3 +-- .../basic/passthrough_kernel/run_strix_makefile.lit | 2 +- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/programming_examples/basic/passthrough_kernel/run_makefile.lit b/programming_examples/basic/passthrough_kernel/run_makefile.lit index 4619c215e6..4c5bc14c4f 100644 --- a/programming_examples/basic/passthrough_kernel/run_makefile.lit +++ b/programming_examples/basic/passthrough_kernel/run_makefile.lit @@ -7,7 +7,5 @@ // RUN: cd test // RUN: make -f %S/Makefile clean // RUN: make -f %S/Makefile -// RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s -// RUN: %run_on_npu make -f %S/Makefile run_py | FileCheck %s -// CHECK: Running... -// CHECK: PASS! +// RUN: %run_on_npu make -f %S/Makefile run +// RUN: %run_on_npu make -f %S/Makefile run_py diff --git a/programming_examples/basic/passthrough_kernel/run_makefile_alt.lit b/programming_examples/basic/passthrough_kernel/run_makefile_alt.lit index 4a4a70e117..c37843fa25 100644 --- a/programming_examples/basic/passthrough_kernel/run_makefile_alt.lit +++ b/programming_examples/basic/passthrough_kernel/run_makefile_alt.lit @@ -7,6 +7,5 @@ // RUN: cd test_alt // RUN: make -f %S/Makefile clean // RUN: env use_alt=1 make -f %S/Makefile -// RUN: %run_on_npu make -f %S/Makefile run_py | FileCheck %s -// CHECK: PASS! +// RUN: %run_on_npu make -f %S/Makefile run_py \ No newline at end of file diff --git a/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit b/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit index 6b1aef3a70..0901bb542f 100755 --- a/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit +++ b/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit @@ -7,4 +7,4 @@ // RUN: cd test_stx // RUN: make -f %S/Makefile clean // RUN: make -f %S/Makefile device=npu2 -// RUN: %run_on_2npu make -f %S/Makefile run device=npu2 | FileCheck %s +// RUN: %run_on_2npu make -f %S/Makefile run device=npu2 From 05fa89853babd619651396c8029f78b430fe506a Mon Sep 17 00:00:00 2001 From: Joseph Melber Date: Tue, 3 Dec 2024 14:37:16 -0700 Subject: [PATCH 20/35] [TEST] break the test --- programming_examples/basic/passthrough_kernel/aie2.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/programming_examples/basic/passthrough_kernel/aie2.py b/programming_examples/basic/passthrough_kernel/aie2.py index b225295898..ecdfabaab9 100755 --- a/programming_examples/basic/passthrough_kernel/aie2.py +++ b/programming_examples/basic/passthrough_kernel/aie2.py @@ -56,7 +56,7 @@ def core_body(): of_out.release(ObjectFifoPort.Produce, 1) # print(ctx.module.operation.verify()) - + N2 = N // 2 @runtime_sequence(vector_ty, vector_ty, vector_ty) def sequence(inTensor, outTensor, notUsed): if trace_size > 0: @@ -72,14 +72,14 @@ def sequence(inTensor, outTensor, notUsed): metadata=of_in, bd_id=0, mem=inTensor, - sizes=[1, 1, 1, N], + sizes=[1, 1, 1, N2], issue_token=True, ) npu_dma_memcpy_nd( metadata=of_out, bd_id=1, mem=outTensor, - sizes=[1, 1, 1, N], + sizes=[1, 1, 1, N2], ) dma_wait(of_in, of_out) From 3730e7893dd09080a87ff88389e4c32429c012f4 Mon Sep 17 00:00:00 2001 From: Joseph Melber Date: Tue, 3 Dec 2024 14:48:17 -0700 Subject: [PATCH 21/35] test.py return fix --- programming_examples/basic/passthrough_kernel/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/programming_examples/basic/passthrough_kernel/test.py b/programming_examples/basic/passthrough_kernel/test.py index f93ddb3ac7..629a1097c1 100644 --- a/programming_examples/basic/passthrough_kernel/test.py +++ b/programming_examples/basic/passthrough_kernel/test.py @@ -44,7 +44,7 @@ def main(opts): else: print("\nError count: ", errors) print("\nFailed.\n") - exit(-1) + exit(1) if __name__ == "__main__": From 0f4530750cf030519cd6ea20689867c4250440d7 Mon Sep 17 00:00:00 2001 From: Joseph Melber Date: Tue, 3 Dec 2024 15:10:10 -0700 Subject: [PATCH 22/35] test.py return fix again --- programming_examples/basic/passthrough_kernel/test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/programming_examples/basic/passthrough_kernel/test.py b/programming_examples/basic/passthrough_kernel/test.py index 629a1097c1..1e06554a0e 100644 --- a/programming_examples/basic/passthrough_kernel/test.py +++ b/programming_examples/basic/passthrough_kernel/test.py @@ -40,11 +40,11 @@ def main(opts): if not errors: print("\nPASS!\n") - exit(0) + sys.exit(0) else: print("\nError count: ", errors) print("\nFailed.\n") - exit(1) + sys.exit(-1) if __name__ == "__main__": From fb36343e34f7e18942db2bbe7355db663045bc78 Mon Sep 17 00:00:00 2001 From: Joseph Melber Date: Tue, 3 Dec 2024 16:46:49 -0700 Subject: [PATCH 23/35] test.py return 1 --- programming_examples/basic/passthrough_kernel/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/programming_examples/basic/passthrough_kernel/test.py b/programming_examples/basic/passthrough_kernel/test.py index 1e06554a0e..0987344b6a 100644 --- a/programming_examples/basic/passthrough_kernel/test.py +++ b/programming_examples/basic/passthrough_kernel/test.py @@ -44,7 +44,7 @@ def main(opts): else: print("\nError count: ", errors) print("\nFailed.\n") - sys.exit(-1) + sys.exit(1) if __name__ == "__main__": From 8464fde50cb73f10a7db3448ef9e2ee3bc52d77b Mon Sep 17 00:00:00 2001 From: Joseph Melber Date: Tue, 3 Dec 2024 22:21:27 -0700 Subject: [PATCH 24/35] Force fail test.py --- programming_examples/basic/passthrough_kernel/test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/programming_examples/basic/passthrough_kernel/test.py b/programming_examples/basic/passthrough_kernel/test.py index 0987344b6a..f00623fbbf 100644 --- a/programming_examples/basic/passthrough_kernel/test.py +++ b/programming_examples/basic/passthrough_kernel/test.py @@ -38,6 +38,7 @@ def main(opts): e = np.equal(input, aie_output) errors = np.size(e) - np.count_nonzero(e) + errors = 1 if not errors: print("\nPASS!\n") sys.exit(0) From 007882c684381431830109d12abcea40ba23a494 Mon Sep 17 00:00:00 2001 From: Joseph Melber Date: Wed, 4 Dec 2024 09:50:50 -0700 Subject: [PATCH 25/35] Test for CI --- programming_examples/basic/passthrough_kernel/test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/programming_examples/basic/passthrough_kernel/test.py b/programming_examples/basic/passthrough_kernel/test.py index f00623fbbf..0987344b6a 100644 --- a/programming_examples/basic/passthrough_kernel/test.py +++ b/programming_examples/basic/passthrough_kernel/test.py @@ -38,7 +38,6 @@ def main(opts): e = np.equal(input, aie_output) errors = np.size(e) - np.count_nonzero(e) - errors = 1 if not errors: print("\nPASS!\n") sys.exit(0) From 8459aef569bcaf23039b2aa26dbe4a37c564d2e4 Mon Sep 17 00:00:00 2001 From: Joseph Melber Date: Wed, 4 Dec 2024 10:04:02 -0700 Subject: [PATCH 26/35] Break alt --- programming_examples/basic/passthrough_kernel/aie2_alt.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/programming_examples/basic/passthrough_kernel/aie2_alt.py b/programming_examples/basic/passthrough_kernel/aie2_alt.py index b2835ff997..cd54b499b4 100644 --- a/programming_examples/basic/passthrough_kernel/aie2_alt.py +++ b/programming_examples/basic/passthrough_kernel/aie2_alt.py @@ -57,6 +57,7 @@ def core_body(): # print(ctx.module.operation.verify()) + N2 = N // 2 @runtime_sequence(vector_ty, vector_ty, vector_ty) def sequence(inTensor, outTensor, notUsed): if trace_size > 0: @@ -68,10 +69,10 @@ def sequence(inTensor, outTensor, notUsed): offset=N, ) in_task = shim_dma_single_bd_task( - of_in, inTensor, sizes=[1, 1, 1, N], issue_token=True + of_in, inTensor, sizes=[1, 1, 1, N2], issue_token=True ) out_task = shim_dma_single_bd_task( - of_out, outTensor, sizes=[1, 1, 1, N], issue_token=True + of_out, outTensor, sizes=[1, 1, 1, N2], issue_token=True ) dma_start_task(in_task, out_task) From 08ac2d07bc526e63a39fa123f2e191594b061366 Mon Sep 17 00:00:00 2001 From: Joseph Melber Date: Wed, 4 Dec 2024 10:22:05 -0700 Subject: [PATCH 27/35] Cleanup --- aie_kernels/aie2/mul.cc | 4 ---- programming_examples/basic/passthrough_kernel/aie2.py | 6 ++---- programming_examples/basic/passthrough_kernel/aie2_alt.py | 5 ++--- 3 files changed, 4 insertions(+), 11 deletions(-) diff --git a/aie_kernels/aie2/mul.cc b/aie_kernels/aie2/mul.cc index a028861c20..c5ed109332 100755 --- a/aie_kernels/aie2/mul.cc +++ b/aie_kernels/aie2/mul.cc @@ -8,10 +8,6 @@ // //===----------------------------------------------------------------------===// -// #define __AIENGINE__ 2 -// #define NOCPP -// #define __AIEARCH__ 20 - #include #include #include diff --git a/programming_examples/basic/passthrough_kernel/aie2.py b/programming_examples/basic/passthrough_kernel/aie2.py index ecdfabaab9..ff03ab0bd8 100755 --- a/programming_examples/basic/passthrough_kernel/aie2.py +++ b/programming_examples/basic/passthrough_kernel/aie2.py @@ -55,8 +55,6 @@ def core_body(): of_in.release(ObjectFifoPort.Consume, 1) of_out.release(ObjectFifoPort.Produce, 1) - # print(ctx.module.operation.verify()) - N2 = N // 2 @runtime_sequence(vector_ty, vector_ty, vector_ty) def sequence(inTensor, outTensor, notUsed): if trace_size > 0: @@ -72,14 +70,14 @@ def sequence(inTensor, outTensor, notUsed): metadata=of_in, bd_id=0, mem=inTensor, - sizes=[1, 1, 1, N2], + sizes=[1, 1, 1, N], issue_token=True, ) npu_dma_memcpy_nd( metadata=of_out, bd_id=1, mem=outTensor, - sizes=[1, 1, 1, N2], + sizes=[1, 1, 1, N], ) dma_wait(of_in, of_out) diff --git a/programming_examples/basic/passthrough_kernel/aie2_alt.py b/programming_examples/basic/passthrough_kernel/aie2_alt.py index cd54b499b4..b2835ff997 100644 --- a/programming_examples/basic/passthrough_kernel/aie2_alt.py +++ b/programming_examples/basic/passthrough_kernel/aie2_alt.py @@ -57,7 +57,6 @@ def core_body(): # print(ctx.module.operation.verify()) - N2 = N // 2 @runtime_sequence(vector_ty, vector_ty, vector_ty) def sequence(inTensor, outTensor, notUsed): if trace_size > 0: @@ -69,10 +68,10 @@ def sequence(inTensor, outTensor, notUsed): offset=N, ) in_task = shim_dma_single_bd_task( - of_in, inTensor, sizes=[1, 1, 1, N2], issue_token=True + of_in, inTensor, sizes=[1, 1, 1, N], issue_token=True ) out_task = shim_dma_single_bd_task( - of_out, outTensor, sizes=[1, 1, 1, N2], issue_token=True + of_out, outTensor, sizes=[1, 1, 1, N], issue_token=True ) dma_start_task(in_task, out_task) From 7434ab95b6f0429834005f53da151a2d66247c94 Mon Sep 17 00:00:00 2001 From: Joseph Melber Date: Wed, 4 Dec 2024 10:22:44 -0700 Subject: [PATCH 28/35] vector_scalar_mul stx --- .../basic/vector_scalar_mul/run_strix_makefile.lit | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100755 programming_examples/basic/vector_scalar_mul/run_strix_makefile.lit diff --git a/programming_examples/basic/vector_scalar_mul/run_strix_makefile.lit b/programming_examples/basic/vector_scalar_mul/run_strix_makefile.lit new file mode 100755 index 0000000000..0901bb542f --- /dev/null +++ b/programming_examples/basic/vector_scalar_mul/run_strix_makefile.lit @@ -0,0 +1,10 @@ +// (c) Copyright 2024 Advanced Micro Devices, Inc. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai, chess +// +// RUN: mkdir -p test_stx +// RUN: cd test_stx +// RUN: make -f %S/Makefile clean +// RUN: make -f %S/Makefile device=npu2 +// RUN: %run_on_2npu make -f %S/Makefile run device=npu2 From 9da29ee460beda717924d15b994a3bb01513d546 Mon Sep 17 00:00:00 2001 From: Joseph Melber Date: Wed, 4 Dec 2024 10:26:54 -0700 Subject: [PATCH 29/35] Use returns for lit checks --- .../basic/vector_scalar_mul/run_makefile.lit | 9 ++++----- .../basic/vector_scalar_mul/run_makefile_alt.lit | 5 ++--- .../basic/vector_scalar_mul/run_makefile_chess.lit | 9 ++++----- programming_examples/basic/vector_scalar_mul/test.py | 4 ++-- 4 files changed, 12 insertions(+), 15 deletions(-) diff --git a/programming_examples/basic/vector_scalar_mul/run_makefile.lit b/programming_examples/basic/vector_scalar_mul/run_makefile.lit index d298884111..244bff012a 100644 --- a/programming_examples/basic/vector_scalar_mul/run_makefile.lit +++ b/programming_examples/basic/vector_scalar_mul/run_makefile.lit @@ -7,9 +7,8 @@ // RUN: cd test_peano // RUN: make -f %S/Makefile clean // RUN: env CHESS=false make -f %S/Makefile -// RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s -// RUN: %run_on_npu make -f %S/Makefile run_py | FileCheck %s +// RUN: %run_on_npu make -f %S/Makefile run +// RUN: %run_on_npu make -f %S/Makefile run_py // RUN: make -f %S/Makefile clean -// RUN: env CHESS=false %run_on_npu make -f %S/Makefile trace | FileCheck %s -// RUN: env CHESS=false %run_on_npu make -f %S/Makefile trace_py | FileCheck %s -// CHECK: PASS! +// RUN: env CHESS=false %run_on_npu make -f %S/Makefile trace +// RUN: env CHESS=false %run_on_npu make -f %S/Makefile trace_py diff --git a/programming_examples/basic/vector_scalar_mul/run_makefile_alt.lit b/programming_examples/basic/vector_scalar_mul/run_makefile_alt.lit index edfe402ec1..f0b5578ffa 100644 --- a/programming_examples/basic/vector_scalar_mul/run_makefile_alt.lit +++ b/programming_examples/basic/vector_scalar_mul/run_makefile_alt.lit @@ -7,6 +7,5 @@ // RUN: cd test_alt // RUN: make -f %S/Makefile clean // RUN: env CHESS=true use_alt=1 make -f %S/Makefile -// RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s -// CHECK: PASS! - \ No newline at end of file +// RUN: %run_on_npu make -f %S/Makefile run + diff --git a/programming_examples/basic/vector_scalar_mul/run_makefile_chess.lit b/programming_examples/basic/vector_scalar_mul/run_makefile_chess.lit index 481b220165..da7102bcfc 100644 --- a/programming_examples/basic/vector_scalar_mul/run_makefile_chess.lit +++ b/programming_examples/basic/vector_scalar_mul/run_makefile_chess.lit @@ -7,9 +7,8 @@ // RUN: cd test_chess // RUN: make -f %S/Makefile clean // RUN: env CHESS=true make -f %S/Makefile -// RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s -// RUN: %run_on_npu make -f %S/Makefile run_py | FileCheck %s +// RUN: %run_on_npu make -f %S/Makefile run +// RUN: %run_on_npu make -f %S/Makefile run_py // RUN: make -f %S/Makefile clean -// RUN: env CHESS=true %run_on_npu make -f %S/Makefile trace | FileCheck %s -// RUN: env CHESS=true %run_on_npu make -f %S/Makefile trace_py | FileCheck %s -// CHECK: PASS! +// RUN: env CHESS=true %run_on_npu make -f %S/Makefile trace +// RUN: env CHESS=true %run_on_npu make -f %S/Makefile trace_py diff --git a/programming_examples/basic/vector_scalar_mul/test.py b/programming_examples/basic/vector_scalar_mul/test.py index 9ffa7dab1d..c91b53307f 100644 --- a/programming_examples/basic/vector_scalar_mul/test.py +++ b/programming_examples/basic/vector_scalar_mul/test.py @@ -71,11 +71,11 @@ def main(opts): if not errors: print("\nPASS!\n") - exit(0) + sys.exit(0) else: print("\nError count: ", errors) print("\nFailed.\n") - exit(-1) + sys.exit(1) if __name__ == "__main__": From ff3f4844856437e12ddd606569e0f7f26ccf7444 Mon Sep 17 00:00:00 2001 From: Joseph Melber Date: Wed, 4 Dec 2024 11:35:23 -0700 Subject: [PATCH 30/35] Add some vision examples --- .../vision/color_threshold/Makefile | 14 +- .../color_threshold/aie2_colorThreshold.py | 496 ++++++++--------- .../color_threshold/run_strix_makefile.lit | 10 + .../vision/edge_detect/Makefile | 14 +- .../vision/edge_detect/aie2_edgeDetect.py | 513 +++++++++--------- .../vision/edge_detect/run_strix_makefile.lit | 10 + 6 files changed, 554 insertions(+), 503 deletions(-) create mode 100755 programming_examples/vision/color_threshold/run_strix_makefile.lit create mode 100755 programming_examples/vision/edge_detect/run_strix_makefile.lit diff --git a/programming_examples/vision/color_threshold/Makefile b/programming_examples/vision/color_threshold/Makefile index a3dfaa8646..84b2d710f3 100644 --- a/programming_examples/vision/color_threshold/Makefile +++ b/programming_examples/vision/color_threshold/Makefile @@ -12,6 +12,7 @@ include ${srcdir}/../../makefile-common VPATH := ${srcdir}/../../../aie_kernels/aie2 +device = npu COLORTHRESHOLD_WIDTH = 1920 COLORTHRESHOLD_HEIGHT = 1080 @@ -33,17 +34,28 @@ mlir: build/aie2_${COLORTHRESHOLD_WIDTH}.mlir build/%.cc.o: %.cc mkdir -p ${@D} +ifeq ($(device),npu) cd ${@D} && ${PEANO_INSTALL_DIR}/bin/clang++ ${PEANOWRAP2_FLAGS} -DBIT_WIDTH=8 -c $< -o ${@F} +else ifeq ($(device),npu2) + cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2P_FLAGS} -DBIT_WIDTH=8 -c $< -o ${@F} +else + echo "Device type not supported" +endif build/aie2_${COLORTHRESHOLD_WIDTH}.mlir: ${srcdir}/${aie_py_src} mkdir -p ${@D} - python3 $< ${COLORTHRESHOLD_WIDTH} ${COLORTHRESHOLD_HEIGHT} > $@ + python3 $< ${device} ${COLORTHRESHOLD_WIDTH} ${COLORTHRESHOLD_HEIGHT} > $@ build/final_${COLORTHRESHOLD_WIDTH}.xclbin: build/aie2_${COLORTHRESHOLD_WIDTH}.mlir build/threshold.cc.o mkdir -p ${@D} +ifeq ($(device),npu) cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host --alloc-scheme=basic-sequential \ --no-xchesscc --no-xbridge \ --xclbin-name=${@F} --npu-insts-name=insts.txt $(<:%=../%) +else + cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host --alloc-scheme=basic-sequential \ + --xclbin-name=${@F} --npu-insts-name=insts.txt $(<:%=../%) +endif ${targetname}.exe: ${srcdir}/test.cpp rm -rf _build diff --git a/programming_examples/vision/color_threshold/aie2_colorThreshold.py b/programming_examples/vision/color_threshold/aie2_colorThreshold.py index cbb5c2e631..c8eb7e8657 100644 --- a/programming_examples/vision/color_threshold/aie2_colorThreshold.py +++ b/programming_examples/vision/color_threshold/aie2_colorThreshold.py @@ -14,258 +14,262 @@ from aie.helpers.util import np_ndarray_type_get_shape from aie.helpers.dialects.ext.scf import _for as range_ -width = 512 -height = 9 -if len(sys.argv) == 3: - width = int(sys.argv[1]) - height = int(sys.argv[2]) -lineWidth = width -lineWidthChannels = width * 4 # 4 channels -tensorSize = width * height - -enableTrace = False -traceSizeInBytes = 8192 -traceSizeInInt32s = traceSizeInBytes // 4 - - -def color_threshold(): - with mlir_mod_ctx() as ctx: - - @device(AIEDevice.npu1_1col) - def device_body(): - line_channels_ty = np.ndarray[(lineWidthChannels,), np.dtype[np.uint8]] - line_ty = np.ndarray[(lineWidth,), np.dtype[np.uint8]] - - # AIE Core Function declarations - thresholdLine = external_func( - "thresholdLine", - inputs=[line_ty, line_ty, np.int32, np.int16, np.int16, np.int8], - ) - - # Tile declarations - ShimTile = tile(0, 0) - MemTile = tile(0, 1) - ComputeTile2 = tile(0, 2) - ComputeTile3 = tile(0, 3) - ComputeTile4 = tile(0, 4) - ComputeTile5 = tile(0, 5) +def color_threshold(dev, width, height): + lineWidth = width + lineWidthChannels = width * 4 # 4 channels + tensorSize = width * height + + enableTrace = False + traceSizeInBytes = 8192 + traceSizeInInt32s = traceSizeInBytes // 4 + + @device(dev) + def device_body(): + line_channels_ty = np.ndarray[(lineWidthChannels,), np.dtype[np.uint8]] + line_ty = np.ndarray[(lineWidth,), np.dtype[np.uint8]] + + # AIE Core Function declarations + thresholdLine = external_func( + "thresholdLine", + inputs=[line_ty, line_ty, np.int32, np.int16, np.int16, np.int8], + ) + + # Tile declarations + ShimTile = tile(0, 0) + MemTile = tile(0, 1) + ComputeTile2 = tile(0, 2) + ComputeTile3 = tile(0, 3) + ComputeTile4 = tile(0, 4) + ComputeTile5 = tile(0, 5) + + # AIE-array data movement with object fifos + + # Input RGBA broadcast + memtile for skip + inOOB_L3L2 = object_fifo( + "inOOB_L3L2", ShimTile, MemTile, 2, line_channels_ty + ) + inOOB_L2L1_0 = object_fifo( + "inOOB_L2L1_0", MemTile, ComputeTile2, 2, line_ty + ) + inOOB_L2L1_1 = object_fifo( + "inOOB_L2L1_1", MemTile, ComputeTile3, 2, line_ty + ) + inOOB_L2L1_2 = object_fifo( + "inOOB_L2L1_2", MemTile, ComputeTile4, 2, line_ty + ) + inOOB_L2L1_3 = object_fifo( + "inOOB_L2L1_3", MemTile, ComputeTile5, 2, line_ty + ) + of_offsets = [ + np.prod(np_ndarray_type_get_shape(line_ty)) * i for i in range(4) + ] + object_fifo_link( + inOOB_L3L2, + [inOOB_L2L1_0, inOOB_L2L1_1, inOOB_L2L1_2, inOOB_L2L1_3], + [], + of_offsets, + ) + + # Output RGBA + outOOB_L2L3 = object_fifo( + "outOOB_L2L3", MemTile, ShimTile, 2, line_channels_ty + ) + outOOB_L1L2_0 = object_fifo( + "outOOB_L1L2_0", ComputeTile2, MemTile, 2, line_ty + ) + outOOB_L1L2_1 = object_fifo( + "outOOB_L1L2_1", ComputeTile3, MemTile, 2, line_ty + ) + outOOB_L1L2_2 = object_fifo( + "outOOB_L1L2_2", ComputeTile4, MemTile, 2, line_ty + ) + outOOB_L1L2_3 = object_fifo( + "outOOB_L1L2_3", ComputeTile5, MemTile, 2, line_ty + ) + object_fifo_link( + [outOOB_L1L2_0, outOOB_L1L2_1, outOOB_L1L2_2, outOOB_L1L2_3], + outOOB_L2L3, + of_offsets, + [], + ) + + # Runtime parameters + rtpComputeTile2 = buffer( + ComputeTile2, + np.ndarray[(16,), np.dtype[np.int32]], + "rtpComputeTile2", + use_write_rtp=True, + ) + rtpComputeTile3 = buffer( + ComputeTile3, + np.ndarray[(16,), np.dtype[np.int32]], + "rtpComputeTile3", + use_write_rtp=True, + ) + rtpComputeTile4 = buffer( + ComputeTile4, + np.ndarray[(16,), np.dtype[np.int32]], + "rtpComputeTile4", + use_write_rtp=True, + ) + rtpComputeTile5 = buffer( + ComputeTile5, + np.ndarray[(16,), np.dtype[np.int32]], + "rtpComputeTile5", + use_write_rtp=True, + ) + + # Set up compute tiles + + # Compute tile 2 + @core(ComputeTile2, "threshold.cc.o") + def core_body(): + for _ in range_(sys.maxsize): + elemIn = inOOB_L2L1_0.acquire(ObjectFifoPort.Consume, 1) + elemOut = outOOB_L1L2_0.acquire(ObjectFifoPort.Produce, 1) + + # RTPs written from the instruction stream must be read right before the kernel + # after the ObjectFIFO acquires + thresholdValue = arith.trunci(T.i16(), rtpComputeTile2[0]) + maxValue = arith.trunci(T.i16(), rtpComputeTile2[1]) + thresholdType = arith.trunci(T.i8(), rtpComputeTile2[2]) + thresholdLine( + elemIn, + elemOut, + lineWidth, + thresholdValue, + maxValue, + thresholdType, + ) - # AIE-array data movement with object fifos + inOOB_L2L1_0.release(ObjectFifoPort.Consume, 1) + outOOB_L1L2_0.release(ObjectFifoPort.Produce, 1) + + # Compute tile 3 + @core(ComputeTile3, "threshold.cc.o") + def core_body(): + for _ in range_(sys.maxsize): + elemIn = inOOB_L2L1_1.acquire(ObjectFifoPort.Consume, 1) + elemOut = outOOB_L1L2_1.acquire(ObjectFifoPort.Produce, 1) + # RTPs written from the instruction stream must be read right before the kernel + # after the ObjectFIFO acquires + thresholdValue = arith.trunci(T.i16(), rtpComputeTile3[0]) + maxValue = arith.trunci(T.i16(), rtpComputeTile3[1]) + thresholdType = arith.trunci(T.i8(), rtpComputeTile3[2]) + thresholdLine( + elemIn, + elemOut, + lineWidth, + thresholdValue, + maxValue, + thresholdType, + ) - # Input RGBA broadcast + memtile for skip - inOOB_L3L2 = object_fifo( - "inOOB_L3L2", ShimTile, MemTile, 2, line_channels_ty - ) - inOOB_L2L1_0 = object_fifo( - "inOOB_L2L1_0", MemTile, ComputeTile2, 2, line_ty - ) - inOOB_L2L1_1 = object_fifo( - "inOOB_L2L1_1", MemTile, ComputeTile3, 2, line_ty - ) - inOOB_L2L1_2 = object_fifo( - "inOOB_L2L1_2", MemTile, ComputeTile4, 2, line_ty - ) - inOOB_L2L1_3 = object_fifo( - "inOOB_L2L1_3", MemTile, ComputeTile5, 2, line_ty - ) - of_offsets = [ - np.prod(np_ndarray_type_get_shape(line_ty)) * i for i in range(4) - ] - object_fifo_link( - inOOB_L3L2, - [inOOB_L2L1_0, inOOB_L2L1_1, inOOB_L2L1_2, inOOB_L2L1_3], - [], - of_offsets, - ) + inOOB_L2L1_1.release(ObjectFifoPort.Consume, 1) + outOOB_L1L2_1.release(ObjectFifoPort.Produce, 1) + + # Compute tile 4 + @core(ComputeTile4, "threshold.cc.o") + def core_body(): + for _ in range_(sys.maxsize): + elemIn = inOOB_L2L1_2.acquire(ObjectFifoPort.Consume, 1) + elemOut = outOOB_L1L2_2.acquire(ObjectFifoPort.Produce, 1) + + # RTPs written from the instruction stream must be read right before the kernel + # after the ObjectFIFO acquires + thresholdValue = arith.trunci(T.i16(), rtpComputeTile4[0]) + maxValue = arith.trunci(T.i16(), rtpComputeTile4[1]) + thresholdType = arith.trunci(T.i8(), rtpComputeTile4[2]) + thresholdLine( + elemIn, + elemOut, + lineWidth, + thresholdValue, + maxValue, + thresholdType, + ) - # Output RGBA - outOOB_L2L3 = object_fifo( - "outOOB_L2L3", MemTile, ShimTile, 2, line_channels_ty - ) - outOOB_L1L2_0 = object_fifo( - "outOOB_L1L2_0", ComputeTile2, MemTile, 2, line_ty - ) - outOOB_L1L2_1 = object_fifo( - "outOOB_L1L2_1", ComputeTile3, MemTile, 2, line_ty - ) - outOOB_L1L2_2 = object_fifo( - "outOOB_L1L2_2", ComputeTile4, MemTile, 2, line_ty - ) - outOOB_L1L2_3 = object_fifo( - "outOOB_L1L2_3", ComputeTile5, MemTile, 2, line_ty - ) - object_fifo_link( - [outOOB_L1L2_0, outOOB_L1L2_1, outOOB_L1L2_2, outOOB_L1L2_3], - outOOB_L2L3, - of_offsets, - [], - ) + inOOB_L2L1_2.release(ObjectFifoPort.Consume, 1) + outOOB_L1L2_2.release(ObjectFifoPort.Produce, 1) + + # Compute tile 5 + @core(ComputeTile5, "threshold.cc.o") + def core_body(): + for _ in range_(sys.maxsize): + elemIn = inOOB_L2L1_3.acquire(ObjectFifoPort.Consume, 1) + elemOut = outOOB_L1L2_3.acquire(ObjectFifoPort.Produce, 1) + + # RTPs written from the instruction stream must be read right before the kernel + # after the ObjectFIFO acquires + thresholdValue = arith.trunci(T.i16(), rtpComputeTile5[0]) + maxValue = arith.trunci(T.i16(), rtpComputeTile5[1]) + thresholdType = arith.trunci(T.i8(), rtpComputeTile5[2]) + thresholdLine( + elemIn, + elemOut, + lineWidth, + thresholdValue, + maxValue, + thresholdType, + ) - # Runtime parameters - rtpComputeTile2 = buffer( - ComputeTile2, - np.ndarray[(16,), np.dtype[np.int32]], - "rtpComputeTile2", - use_write_rtp=True, - ) - rtpComputeTile3 = buffer( - ComputeTile3, - np.ndarray[(16,), np.dtype[np.int32]], - "rtpComputeTile3", - use_write_rtp=True, + inOOB_L2L1_3.release(ObjectFifoPort.Consume, 1) + outOOB_L1L2_3.release(ObjectFifoPort.Produce, 1) + + # To/from AIE-array data movement + @runtime_sequence( + np.ndarray[(tensorSize,), np.dtype[np.int8]], + np.ndarray[(32,), np.dtype[np.int32]], # not used + np.ndarray[(tensorSize,), np.dtype[np.int8]], + ) + def sequence(inTensor, notUsed, outTensor): + # thresholdValue, maxValue, thresholdType + rtpComputeTile2[0] = 50 + rtpComputeTile2[1] = 255 + rtpComputeTile2[2] = 0 + + rtpComputeTile3[0] = 50 + rtpComputeTile3[1] = 255 + rtpComputeTile3[2] = 0 + + rtpComputeTile4[0] = 50 + rtpComputeTile4[1] = 255 + rtpComputeTile4[2] = 0 + + rtpComputeTile5[0] = 50 + rtpComputeTile5[1] = 255 + rtpComputeTile5[2] = 0 + + npu_dma_memcpy_nd( + metadata=inOOB_L3L2, + bd_id=1, + mem=inTensor, + sizes=[1, 1, 1, tensorSize], + issue_token=True, ) - rtpComputeTile4 = buffer( - ComputeTile4, - np.ndarray[(16,), np.dtype[np.int32]], - "rtpComputeTile4", - use_write_rtp=True, + npu_dma_memcpy_nd( + metadata=outOOB_L2L3, + bd_id=0, + mem=outTensor, + sizes=[1, 1, 1, tensorSize], ) - rtpComputeTile5 = buffer( - ComputeTile5, - np.ndarray[(16,), np.dtype[np.int32]], - "rtpComputeTile5", - use_write_rtp=True, - ) - - # Set up compute tiles - - # Compute tile 2 - @core(ComputeTile2, "threshold.cc.o") - def core_body(): - for _ in range_(sys.maxsize): - elemIn = inOOB_L2L1_0.acquire(ObjectFifoPort.Consume, 1) - elemOut = outOOB_L1L2_0.acquire(ObjectFifoPort.Produce, 1) - - # RTPs written from the instruction stream must be read right before the kernel - # after the ObjectFIFO acquires - thresholdValue = arith.trunci(T.i16(), rtpComputeTile2[0]) - maxValue = arith.trunci(T.i16(), rtpComputeTile2[1]) - thresholdType = arith.trunci(T.i8(), rtpComputeTile2[2]) - thresholdLine( - elemIn, - elemOut, - lineWidth, - thresholdValue, - maxValue, - thresholdType, - ) - - inOOB_L2L1_0.release(ObjectFifoPort.Consume, 1) - outOOB_L1L2_0.release(ObjectFifoPort.Produce, 1) - - # Compute tile 3 - @core(ComputeTile3, "threshold.cc.o") - def core_body(): - for _ in range_(sys.maxsize): - elemIn = inOOB_L2L1_1.acquire(ObjectFifoPort.Consume, 1) - elemOut = outOOB_L1L2_1.acquire(ObjectFifoPort.Produce, 1) - # RTPs written from the instruction stream must be read right before the kernel - # after the ObjectFIFO acquires - thresholdValue = arith.trunci(T.i16(), rtpComputeTile3[0]) - maxValue = arith.trunci(T.i16(), rtpComputeTile3[1]) - thresholdType = arith.trunci(T.i8(), rtpComputeTile3[2]) - thresholdLine( - elemIn, - elemOut, - lineWidth, - thresholdValue, - maxValue, - thresholdType, - ) - - inOOB_L2L1_1.release(ObjectFifoPort.Consume, 1) - outOOB_L1L2_1.release(ObjectFifoPort.Produce, 1) - - # Compute tile 4 - @core(ComputeTile4, "threshold.cc.o") - def core_body(): - for _ in range_(sys.maxsize): - elemIn = inOOB_L2L1_2.acquire(ObjectFifoPort.Consume, 1) - elemOut = outOOB_L1L2_2.acquire(ObjectFifoPort.Produce, 1) - - # RTPs written from the instruction stream must be read right before the kernel - # after the ObjectFIFO acquires - thresholdValue = arith.trunci(T.i16(), rtpComputeTile4[0]) - maxValue = arith.trunci(T.i16(), rtpComputeTile4[1]) - thresholdType = arith.trunci(T.i8(), rtpComputeTile4[2]) - thresholdLine( - elemIn, - elemOut, - lineWidth, - thresholdValue, - maxValue, - thresholdType, - ) - - inOOB_L2L1_2.release(ObjectFifoPort.Consume, 1) - outOOB_L1L2_2.release(ObjectFifoPort.Produce, 1) - - # Compute tile 5 - @core(ComputeTile5, "threshold.cc.o") - def core_body(): - for _ in range_(sys.maxsize): - elemIn = inOOB_L2L1_3.acquire(ObjectFifoPort.Consume, 1) - elemOut = outOOB_L1L2_3.acquire(ObjectFifoPort.Produce, 1) - - # RTPs written from the instruction stream must be read right before the kernel - # after the ObjectFIFO acquires - thresholdValue = arith.trunci(T.i16(), rtpComputeTile5[0]) - maxValue = arith.trunci(T.i16(), rtpComputeTile5[1]) - thresholdType = arith.trunci(T.i8(), rtpComputeTile5[2]) - thresholdLine( - elemIn, - elemOut, - lineWidth, - thresholdValue, - maxValue, - thresholdType, - ) - - inOOB_L2L1_3.release(ObjectFifoPort.Consume, 1) - outOOB_L1L2_3.release(ObjectFifoPort.Produce, 1) - - # To/from AIE-array data movement - @runtime_sequence( - np.ndarray[(tensorSize,), np.dtype[np.int8]], - np.ndarray[(32,), np.dtype[np.int32]], # not used - np.ndarray[(tensorSize,), np.dtype[np.int8]], - ) - def sequence(inTensor, notUsed, outTensor): - # thresholdValue, maxValue, thresholdType - rtpComputeTile2[0] = 50 - rtpComputeTile2[1] = 255 - rtpComputeTile2[2] = 0 - - rtpComputeTile3[0] = 50 - rtpComputeTile3[1] = 255 - rtpComputeTile3[2] = 0 - - rtpComputeTile4[0] = 50 - rtpComputeTile4[1] = 255 - rtpComputeTile4[2] = 0 - - rtpComputeTile5[0] = 50 - rtpComputeTile5[1] = 255 - rtpComputeTile5[2] = 0 - - npu_dma_memcpy_nd( - metadata=inOOB_L3L2, - bd_id=1, - mem=inTensor, - sizes=[1, 1, 1, tensorSize], - issue_token=True, - ) - npu_dma_memcpy_nd( - metadata=outOOB_L2L3, - bd_id=0, - mem=outTensor, - sizes=[1, 1, 1, tensorSize], - ) - dma_wait(inOOB_L3L2, outOOB_L2L3) - - # print(ctx.module.operation.verify()) + dma_wait(inOOB_L3L2, outOOB_L2L3) + +try: + device_name = str(sys.argv[1]) + if device_name == "npu": + dev = AIEDevice.npu1_1col + elif device_name == "npu2": + dev = AIEDevice.npu2 + else: + raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1])) + width = 512 if (len(sys.argv) != 4) else int(sys.argv[2]) + height = 9 if (len(sys.argv) != 4) else int(sys.argv[3]) +except ValueError: + print("Argument has inappropriate value") +with mlir_mod_ctx() as ctx: + #print(ctx.module.operation.verify()) + color_threshold(dev, width, height) print(ctx.module) - -color_threshold() diff --git a/programming_examples/vision/color_threshold/run_strix_makefile.lit b/programming_examples/vision/color_threshold/run_strix_makefile.lit new file mode 100755 index 0000000000..0901bb542f --- /dev/null +++ b/programming_examples/vision/color_threshold/run_strix_makefile.lit @@ -0,0 +1,10 @@ +// (c) Copyright 2024 Advanced Micro Devices, Inc. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai, chess +// +// RUN: mkdir -p test_stx +// RUN: cd test_stx +// RUN: make -f %S/Makefile clean +// RUN: make -f %S/Makefile device=npu2 +// RUN: %run_on_2npu make -f %S/Makefile run device=npu2 diff --git a/programming_examples/vision/edge_detect/Makefile b/programming_examples/vision/edge_detect/Makefile index e1ed21e0ae..2f6159bd3d 100755 --- a/programming_examples/vision/edge_detect/Makefile +++ b/programming_examples/vision/edge_detect/Makefile @@ -12,6 +12,7 @@ include ${srcdir}/../../makefile-common VPATH := ${srcdir}/../../../aie_kernels/aie2 +device = npu EDGEDETECT_WIDTH = 1920 EDGEDETECT_HEIGHT = 1080 @@ -36,7 +37,13 @@ mlir: build/aie2_lineBased_8b_${EDGEDETECT_WIDTH}.mlir build/%.cc.o: %.cc mkdir -p ${@D} +ifeq ($(device),npu) cd ${@D} && ${PEANO_INSTALL_DIR}/bin/clang++ ${PEANOWRAP2_FLAGS} -DBIT_WIDTH=8 -c $< -o ${@F} +else ifeq ($(device),npu2) + cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2P_FLAGS} -DBIT_WIDTH=8 -c $< -o ${@F} +else + echo "Device type not supported" +endif build/combined_gray2rgba_addWeighted.a: build/gray2rgba.cc.o build/addWeighted.cc.o mkdir -p ${@D} @@ -44,13 +51,18 @@ build/combined_gray2rgba_addWeighted.a: build/gray2rgba.cc.o build/addWeighted.c build/aie2_lineBased_8b_${EDGEDETECT_WIDTH}.mlir: ${srcdir}/${aie_py_src} mkdir -p ${@D} - python3 $< ${EDGEDETECT_WIDTH} ${EDGEDETECT_HEIGHT} > $@ + python3 $< ${device} ${EDGEDETECT_WIDTH} ${EDGEDETECT_HEIGHT} > $@ build/final_${EDGEDETECT_WIDTH}.xclbin: build/aie2_lineBased_8b_${EDGEDETECT_WIDTH}.mlir build/rgba2gray.cc.o build/gray2rgba.cc.o build/filter2d.cc.o build/threshold.cc.o build/addWeighted.cc.o build/combined_gray2rgba_addWeighted.a mkdir -p ${@D} +ifeq ($(device),npu) cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host --alloc-scheme=basic-sequential \ --no-xchesscc --no-xbridge \ --xclbin-name=${@F} --npu-insts-name=insts.txt $(<:%=../%) +else + cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host --alloc-scheme=basic-sequential \ + --xclbin-name=${@F} --npu-insts-name=insts.txt $(<:%=../%) +endif ${targetname}.exe: ${srcdir}/test.cpp rm -rf _build diff --git a/programming_examples/vision/edge_detect/aie2_edgeDetect.py b/programming_examples/vision/edge_detect/aie2_edgeDetect.py index 59d7c030f2..23565c8b47 100644 --- a/programming_examples/vision/edge_detect/aie2_edgeDetect.py +++ b/programming_examples/vision/edge_detect/aie2_edgeDetect.py @@ -12,278 +12,281 @@ from aie.helpers.dialects.ext.scf import _for as range_ from aie.extras.context import mlir_mod_ctx -width = 64 -height = 36 -if len(sys.argv) == 3: - width = int(sys.argv[1]) - height = int(sys.argv[2]) -heightMinus1 = height - 1 -lineWidth = width -lineWidthInBytes = width * 4 -tensorSize = width * height * 4 # 4 channels - -enableTrace = False -traceSizeInBytes = 8192 -traceSizeInInt32s = traceSizeInBytes // 4 - - -def edge_detect(): - with mlir_mod_ctx() as ctx: - - @device(AIEDevice.npu1_1col) - def device_body(): - line_bytes_ty = np.ndarray[(lineWidthInBytes,), np.dtype[np.uint8]] - line_ty = np.ndarray[(lineWidth,), np.dtype[np.uint8]] - tensor_3x3_ty = np.ndarray[(3, 3), np.dtype[np.int16]] - - tensor_ty = np.ndarray[(tensorSize,), np.dtype[np.int8]] - tensor_16x16_ty = np.ndarray[(16, 16), np.dtype[np.int32]] - - # AIE Core Function declarations - rgba2gray_line = external_func( - "rgba2grayLine", inputs=[line_bytes_ty, line_ty, np.int32] - ) - filter2d_line = external_func( - "filter2dLine", - inputs=[line_ty, line_ty, line_ty, line_ty, np.int32, tensor_3x3_ty], - ) - threshold_line = external_func( - "thresholdLine", - inputs=[line_ty, line_ty, np.int32, np.int16, np.int16, np.int8], - ) - gray2rgba_line = external_func( - "gray2rgbaLine", inputs=[line_ty, line_bytes_ty, np.int32] - ) - add_weighted_line = external_func( - "addWeightedLine", - inputs=[ - line_bytes_ty, - line_bytes_ty, - line_bytes_ty, - np.int32, - np.int16, - np.int16, - np.int8, - ], - ) - - # Tile declarations - ShimTile = tile(0, 0) - MemTile = tile(0, 1) - ComputeTile2 = tile(0, 2) - ComputeTile3 = tile(0, 3) - ComputeTile4 = tile(0, 4) - ComputeTile5 = tile(0, 5) - - # AIE-array data movement with object fifos - # Input - inOF_L3L2 = object_fifo( - "inOF_L3L2", - ShimTile, - [ComputeTile2, MemTile], - [2, 2, 7], +def edge_detect(dev, width, height): + heightMinus1 = height - 1 + lineWidth = width + lineWidthInBytes = width * 4 + tensorSize = width * height * 4 # 4 channels + + enableTrace = False + traceSizeInBytes = 8192 + traceSizeInInt32s = traceSizeInBytes // 4 + + @device(dev) + def device_body(): + line_bytes_ty = np.ndarray[(lineWidthInBytes,), np.dtype[np.uint8]] + line_ty = np.ndarray[(lineWidth,), np.dtype[np.uint8]] + tensor_3x3_ty = np.ndarray[(3, 3), np.dtype[np.int16]] + + tensor_ty = np.ndarray[(tensorSize,), np.dtype[np.int8]] + tensor_16x16_ty = np.ndarray[(16, 16), np.dtype[np.int32]] + + # AIE Core Function declarations + rgba2gray_line = external_func( + "rgba2grayLine", inputs=[line_bytes_ty, line_ty, np.int32] + ) + filter2d_line = external_func( + "filter2dLine", + inputs=[line_ty, line_ty, line_ty, line_ty, np.int32, tensor_3x3_ty], + ) + threshold_line = external_func( + "thresholdLine", + inputs=[line_ty, line_ty, np.int32, np.int16, np.int16, np.int8], + ) + gray2rgba_line = external_func( + "gray2rgbaLine", inputs=[line_ty, line_bytes_ty, np.int32] + ) + add_weighted_line = external_func( + "addWeightedLine", + inputs=[ line_bytes_ty, - ) - inOF_L2L1 = object_fifo( - "inOF_L2L1", - MemTile, - ComputeTile5, - 7, - line_bytes_ty, - ) - object_fifo_link(inOF_L3L2, inOF_L2L1) - - # Output - outOF_L2L3 = object_fifo( - "outOF_L2L3", - MemTile, - ShimTile, - 2, line_bytes_ty, - ) - outOF_L1L2 = object_fifo( - "outOF_L1L2", - ComputeTile5, - MemTile, - 2, line_bytes_ty, + np.int32, + np.int16, + np.int16, + np.int8, + ], + ) + + # Tile declarations + ShimTile = tile(0, 0) + MemTile = tile(0, 1) + ComputeTile2 = tile(0, 2) + ComputeTile3 = tile(0, 3) + ComputeTile4 = tile(0, 4) + ComputeTile5 = tile(0, 5) + + # AIE-array data movement with object fifos + # Input + inOF_L3L2 = object_fifo( + "inOF_L3L2", + ShimTile, + [ComputeTile2, MemTile], + [2, 2, 7], + line_bytes_ty, + ) + inOF_L2L1 = object_fifo( + "inOF_L2L1", + MemTile, + ComputeTile5, + 7, + line_bytes_ty, + ) + object_fifo_link(inOF_L3L2, inOF_L2L1) + + # Output + outOF_L2L3 = object_fifo( + "outOF_L2L3", + MemTile, + ShimTile, + 2, + line_bytes_ty, + ) + outOF_L1L2 = object_fifo( + "outOF_L1L2", + ComputeTile5, + MemTile, + 2, + line_bytes_ty, + ) + object_fifo_link(outOF_L1L2, outOF_L2L3) + + # Intermediate + OF_2to3 = object_fifo( + "OF_2to3", + ComputeTile2, + ComputeTile3, + 4, + line_ty, + ) + OF_3to4 = object_fifo( + "OF_3to4", + ComputeTile3, + ComputeTile4, + 2, + line_ty, + ) + OF_4to5 = object_fifo( + "OF_4to5", + ComputeTile4, + ComputeTile5, + 2, + line_ty, + ) + OF_5to5 = object_fifo( + "OF_5to5", + ComputeTile5, + ComputeTile5, + 1, + line_bytes_ty, + ) + + # Set up compute tiles + + # Compute tile 2 + @core(ComputeTile2, "rgba2gray.cc.o") + def core_body(): + for _ in range_(sys.maxsize): + elem_in = inOF_L3L2.acquire(ObjectFifoPort.Consume, 1) + elem_out = OF_2to3.acquire(ObjectFifoPort.Produce, 1) + + rgba2gray_line(elem_in, elem_out, lineWidth) + + inOF_L3L2.release(ObjectFifoPort.Consume, 1) + OF_2to3.release(ObjectFifoPort.Produce, 1) + + # Compute tile 3 + @core(ComputeTile3, "filter2d.cc.o") + def core_body(): + v0 = 0 + v1 = 4096 + v_minus4 = -16384 + initial_value = np.array( + [[v0, v1, v0], [v1, v_minus4, v1], [v0, v1, v0]], dtype=np.int16 ) - object_fifo_link(outOF_L1L2, outOF_L2L3) - - # Intermediate - OF_2to3 = object_fifo( - "OF_2to3", - ComputeTile2, + kernel = buffer( ComputeTile3, - 4, - line_ty, - ) - OF_3to4 = object_fifo( - "OF_3to4", - ComputeTile3, - ComputeTile4, - 2, - line_ty, - ) - OF_4to5 = object_fifo( - "OF_4to5", - ComputeTile4, - ComputeTile5, - 2, - line_ty, + np.ndarray[(3, 3), np.dtype[np.int16]], + "kernel", + initial_value=initial_value, ) - OF_5to5 = object_fifo( - "OF_5to5", - ComputeTile5, - ComputeTile5, - 1, - line_bytes_ty, - ) - - # Set up compute tiles - - # Compute tile 2 - @core(ComputeTile2, "rgba2gray.cc.o") - def core_body(): - for _ in range_(sys.maxsize): - elem_in = inOF_L3L2.acquire(ObjectFifoPort.Consume, 1) - elem_out = OF_2to3.acquire(ObjectFifoPort.Produce, 1) - - rgba2gray_line(elem_in, elem_out, lineWidth) - inOF_L3L2.release(ObjectFifoPort.Consume, 1) - OF_2to3.release(ObjectFifoPort.Produce, 1) - - # Compute tile 3 - @core(ComputeTile3, "filter2d.cc.o") - def core_body(): - v0 = 0 - v1 = 4096 - v_minus4 = -16384 - initial_value = np.array( - [[v0, v1, v0], [v1, v_minus4, v1], [v0, v1, v0]], dtype=np.int16 - ) - kernel = buffer( - ComputeTile3, - np.ndarray[(3, 3), np.dtype[np.int16]], - "kernel", - initial_value=initial_value, + for _ in range_(sys.maxsize): + # Preamble : Top Border + elems_in_pre = OF_2to3.acquire(ObjectFifoPort.Consume, 2) + elem_pre_out = OF_3to4.acquire(ObjectFifoPort.Produce, 1) + filter2d_line( + elems_in_pre[0], + elems_in_pre[0], + elems_in_pre[1], + elem_pre_out, + lineWidth, + kernel, ) + OF_3to4.release(ObjectFifoPort.Produce, 1) - for _ in range_(sys.maxsize): - # Preamble : Top Border - elems_in_pre = OF_2to3.acquire(ObjectFifoPort.Consume, 2) - elem_pre_out = OF_3to4.acquire(ObjectFifoPort.Produce, 1) + # Steady State : Middle + for _ in range_(1, heightMinus1): + elems_in = OF_2to3.acquire(ObjectFifoPort.Consume, 3) + elem_out = OF_3to4.acquire(ObjectFifoPort.Produce, 1) filter2d_line( - elems_in_pre[0], - elems_in_pre[0], - elems_in_pre[1], - elem_pre_out, + elems_in[0], + elems_in[1], + elems_in[2], + elem_out, lineWidth, kernel, ) + OF_2to3.release(ObjectFifoPort.Consume, 1) OF_3to4.release(ObjectFifoPort.Produce, 1) - # Steady State : Middle - for _ in range_(1, heightMinus1): - elems_in = OF_2to3.acquire(ObjectFifoPort.Consume, 3) - elem_out = OF_3to4.acquire(ObjectFifoPort.Produce, 1) - filter2d_line( - elems_in[0], - elems_in[1], - elems_in[2], - elem_out, - lineWidth, - kernel, - ) - OF_2to3.release(ObjectFifoPort.Consume, 1) - OF_3to4.release(ObjectFifoPort.Produce, 1) - - # Postamble : Bottom Border - elems_in_post = OF_2to3.acquire(ObjectFifoPort.Consume, 2) - elem_post_out = OF_3to4.acquire(ObjectFifoPort.Produce, 1) - filter2d_line( - elems_in_post[0], - elems_in_post[1], - elems_in_post[1], - elem_post_out, - lineWidth, - kernel, - ) - OF_2to3.release(ObjectFifoPort.Consume, 2) - OF_3to4.release(ObjectFifoPort.Produce, 1) - - # Compute tile 4 - @core(ComputeTile4, "threshold.cc.o") - def core_body(): - v_thr = 10 - v_max = 255 - v_typ = 0 - - for _ in range_(sys.maxsize): - elem_in = OF_3to4.acquire(ObjectFifoPort.Consume, 1) - elem_out = OF_4to5.acquire(ObjectFifoPort.Produce, 1) - - threshold_line(elem_in, elem_out, lineWidth, v_thr, v_max, v_typ) - - OF_3to4.release(ObjectFifoPort.Consume, 1) - OF_4to5.release(ObjectFifoPort.Produce, 1) - - # Compute tile 5 - @core(ComputeTile5, "combined_gray2rgba_addWeighted.a") - def core_body(): - for _ in range_(sys.maxsize): - elem_in = OF_4to5.acquire(ObjectFifoPort.Consume, 1) - elem_out = OF_5to5.acquire(ObjectFifoPort.Produce, 1) - - gray2rgba_line(elem_in, elem_out, lineWidth) - - OF_4to5.release(ObjectFifoPort.Consume, 1) - OF_5to5.release(ObjectFifoPort.Produce, 1) - - elem_in1 = OF_5to5.acquire(ObjectFifoPort.Consume, 1) - elem_in2 = inOF_L2L1.acquire(ObjectFifoPort.Consume, 1) - elem_out2 = outOF_L1L2.acquire(ObjectFifoPort.Produce, 1) - - alpha = 16384 - beta = 16384 - gamma = 0 - - add_weighted_line( - elem_in1, - elem_in2, - elem_out2, - lineWidthInBytes, - alpha, - beta, - gamma, - ) - - OF_5to5.release(ObjectFifoPort.Consume, 1) - inOF_L2L1.release(ObjectFifoPort.Consume, 1) - outOF_L1L2.release(ObjectFifoPort.Produce, 1) - - # To/from AIE-array data movement - @runtime_sequence(tensor_ty, tensor_16x16_ty, tensor_ty) - def sequence(I, B, O): - npu_dma_memcpy_nd( - metadata=inOF_L3L2, - bd_id=1, - mem=I, - sizes=[1, 1, 1, tensorSize], + # Postamble : Bottom Border + elems_in_post = OF_2to3.acquire(ObjectFifoPort.Consume, 2) + elem_post_out = OF_3to4.acquire(ObjectFifoPort.Produce, 1) + filter2d_line( + elems_in_post[0], + elems_in_post[1], + elems_in_post[1], + elem_post_out, + lineWidth, + kernel, ) - npu_dma_memcpy_nd( - metadata=outOF_L2L3, - bd_id=0, - mem=O, - sizes=[1, 1, 1, tensorSize], + OF_2to3.release(ObjectFifoPort.Consume, 2) + OF_3to4.release(ObjectFifoPort.Produce, 1) + + # Compute tile 4 + @core(ComputeTile4, "threshold.cc.o") + def core_body(): + v_thr = 10 + v_max = 255 + v_typ = 0 + + for _ in range_(sys.maxsize): + elem_in = OF_3to4.acquire(ObjectFifoPort.Consume, 1) + elem_out = OF_4to5.acquire(ObjectFifoPort.Produce, 1) + + threshold_line(elem_in, elem_out, lineWidth, v_thr, v_max, v_typ) + + OF_3to4.release(ObjectFifoPort.Consume, 1) + OF_4to5.release(ObjectFifoPort.Produce, 1) + + # Compute tile 5 + @core(ComputeTile5, "combined_gray2rgba_addWeighted.a") + def core_body(): + for _ in range_(sys.maxsize): + elem_in = OF_4to5.acquire(ObjectFifoPort.Consume, 1) + elem_out = OF_5to5.acquire(ObjectFifoPort.Produce, 1) + + gray2rgba_line(elem_in, elem_out, lineWidth) + + OF_4to5.release(ObjectFifoPort.Consume, 1) + OF_5to5.release(ObjectFifoPort.Produce, 1) + + elem_in1 = OF_5to5.acquire(ObjectFifoPort.Consume, 1) + elem_in2 = inOF_L2L1.acquire(ObjectFifoPort.Consume, 1) + elem_out2 = outOF_L1L2.acquire(ObjectFifoPort.Produce, 1) + + alpha = 16384 + beta = 16384 + gamma = 0 + + add_weighted_line( + elem_in1, + elem_in2, + elem_out2, + lineWidthInBytes, + alpha, + beta, + gamma, ) - # outOF_L2L3 will only complete after inOF_L3L2 completes, so we just wait on outOF_L2L3 instead of all - dma_wait(outOF_L2L3) - # print(ctx.module.operation.verify()) + OF_5to5.release(ObjectFifoPort.Consume, 1) + inOF_L2L1.release(ObjectFifoPort.Consume, 1) + outOF_L1L2.release(ObjectFifoPort.Produce, 1) + + # To/from AIE-array data movement + @runtime_sequence(tensor_ty, tensor_16x16_ty, tensor_ty) + def sequence(I, B, O): + npu_dma_memcpy_nd( + metadata=inOF_L3L2, + bd_id=1, + mem=I, + sizes=[1, 1, 1, tensorSize], + ) + npu_dma_memcpy_nd( + metadata=outOF_L2L3, + bd_id=0, + mem=O, + sizes=[1, 1, 1, tensorSize], + ) + # outOF_L2L3 will only complete after inOF_L3L2 completes, so we just wait on outOF_L2L3 instead of all + dma_wait(outOF_L2L3) + +try: + device_name = str(sys.argv[1]) + if device_name == "npu": + dev = AIEDevice.npu1_1col + elif device_name == "npu2": + dev = AIEDevice.npu2 + else: + raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1])) + width = 36 if (len(sys.argv) != 4) else int(sys.argv[2]) + height = 64 if (len(sys.argv) != 4) else int(sys.argv[3]) +except ValueError: + print("Argument has inappropriate value") +with mlir_mod_ctx() as ctx: + #print(ctx.module.operation.verify()) + edge_detect(dev, width, height) print(ctx.module) - - -edge_detect() diff --git a/programming_examples/vision/edge_detect/run_strix_makefile.lit b/programming_examples/vision/edge_detect/run_strix_makefile.lit new file mode 100755 index 0000000000..0901bb542f --- /dev/null +++ b/programming_examples/vision/edge_detect/run_strix_makefile.lit @@ -0,0 +1,10 @@ +// (c) Copyright 2024 Advanced Micro Devices, Inc. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai, chess +// +// RUN: mkdir -p test_stx +// RUN: cd test_stx +// RUN: make -f %S/Makefile clean +// RUN: make -f %S/Makefile device=npu2 +// RUN: %run_on_2npu make -f %S/Makefile run device=npu2 From 5b5ed073482fa599b1fb7e69fc0f4aa22159b838 Mon Sep 17 00:00:00 2001 From: Joseph Melber Date: Wed, 4 Dec 2024 12:49:34 -0700 Subject: [PATCH 31/35] Update alt to accept device --- .../aie2_colorThreshold_alt.py | 488 ++++++++--------- .../vision/edge_detect/aie2_edgeDetect_alt.py | 509 +++++++++--------- 2 files changed, 502 insertions(+), 495 deletions(-) diff --git a/programming_examples/vision/color_threshold/aie2_colorThreshold_alt.py b/programming_examples/vision/color_threshold/aie2_colorThreshold_alt.py index fd36516f3b..b676b9168e 100644 --- a/programming_examples/vision/color_threshold/aie2_colorThreshold_alt.py +++ b/programming_examples/vision/color_threshold/aie2_colorThreshold_alt.py @@ -14,256 +14,260 @@ from aie.helpers.util import np_ndarray_type_get_shape from aie.helpers.dialects.ext.scf import _for as range_ -width = 512 -height = 9 -if len(sys.argv) == 3: - width = int(sys.argv[1]) - height = int(sys.argv[2]) -lineWidth = width -lineWidthChannels = width * 4 # 4 channels -tensorSize = width * height - -enableTrace = False -traceSizeInBytes = 8192 -traceSizeInInt32s = traceSizeInBytes // 4 - - -def color_threshold(): - with mlir_mod_ctx() as ctx: - - @device(AIEDevice.npu1_1col) - def device_body(): - line_channels_ty = np.ndarray[(lineWidthChannels,), np.dtype[np.uint8]] - line_ty = np.ndarray[(lineWidth,), np.dtype[np.uint8]] - - # AIE Core Function declarations - thresholdLine = external_func( - "thresholdLine", - inputs=[line_ty, line_ty, np.int32, np.int16, np.int16, np.int8], - ) +def color_threshold(dev, width, height): + lineWidth = width + lineWidthChannels = width * 4 # 4 channels + tensorSize = width * height + + enableTrace = False + traceSizeInBytes = 8192 + traceSizeInInt32s = traceSizeInBytes // 4 + + @device(dev) + def device_body(): + line_channels_ty = np.ndarray[(lineWidthChannels,), np.dtype[np.uint8]] + line_ty = np.ndarray[(lineWidth,), np.dtype[np.uint8]] + + # AIE Core Function declarations + thresholdLine = external_func( + "thresholdLine", + inputs=[line_ty, line_ty, np.int32, np.int16, np.int16, np.int8], + ) + + # Tile declarations + ShimTile = tile(0, 0) + MemTile = tile(0, 1) + ComputeTile2 = tile(0, 2) + ComputeTile3 = tile(0, 3) + ComputeTile4 = tile(0, 4) + ComputeTile5 = tile(0, 5) + + # AIE-array data movement with object fifos + + # Input RGBA broadcast + memtile for skip + inOOB_L3L2 = object_fifo( + "inOOB_L3L2", ShimTile, MemTile, 2, line_channels_ty + ) + inOOB_L2L1_0 = object_fifo( + "inOOB_L2L1_0", MemTile, ComputeTile2, 2, line_ty + ) + inOOB_L2L1_1 = object_fifo( + "inOOB_L2L1_1", MemTile, ComputeTile3, 2, line_ty + ) + inOOB_L2L1_2 = object_fifo( + "inOOB_L2L1_2", MemTile, ComputeTile4, 2, line_ty + ) + inOOB_L2L1_3 = object_fifo( + "inOOB_L2L1_3", MemTile, ComputeTile5, 2, line_ty + ) + of_offsets = [ + np.prod(np_ndarray_type_get_shape(line_ty)) * i for i in range(4) + ] + object_fifo_link( + inOOB_L3L2, + [inOOB_L2L1_0, inOOB_L2L1_1, inOOB_L2L1_2, inOOB_L2L1_3], + [], + of_offsets, + ) + + # Output RGBA + outOOB_L2L3 = object_fifo( + "outOOB_L2L3", MemTile, ShimTile, 2, line_channels_ty + ) + outOOB_L1L2_0 = object_fifo( + "outOOB_L1L2_0", ComputeTile2, MemTile, 2, line_ty + ) + outOOB_L1L2_1 = object_fifo( + "outOOB_L1L2_1", ComputeTile3, MemTile, 2, line_ty + ) + outOOB_L1L2_2 = object_fifo( + "outOOB_L1L2_2", ComputeTile4, MemTile, 2, line_ty + ) + outOOB_L1L2_3 = object_fifo( + "outOOB_L1L2_3", ComputeTile5, MemTile, 2, line_ty + ) + object_fifo_link( + [outOOB_L1L2_0, outOOB_L1L2_1, outOOB_L1L2_2, outOOB_L1L2_3], + outOOB_L2L3, + of_offsets, + [], + ) + + # Runtime parameters + rtpComputeTile2 = buffer( + ComputeTile2, + np.ndarray[(16,), np.dtype[np.int32]], + "rtpComputeTile2", + use_write_rtp=True, + ) + rtpComputeTile3 = buffer( + ComputeTile3, + np.ndarray[(16,), np.dtype[np.int32]], + "rtpComputeTile3", + use_write_rtp=True, + ) + rtpComputeTile4 = buffer( + ComputeTile4, + np.ndarray[(16,), np.dtype[np.int32]], + "rtpComputeTile4", + use_write_rtp=True, + ) + rtpComputeTile5 = buffer( + ComputeTile5, + np.ndarray[(16,), np.dtype[np.int32]], + "rtpComputeTile5", + use_write_rtp=True, + ) + + # Set up compute tiles + + # Compute tile 2 + @core(ComputeTile2, "threshold.cc.o") + def core_body(): + for _ in range_(sys.maxsize): + elemIn = inOOB_L2L1_0.acquire(ObjectFifoPort.Consume, 1) + elemOut = outOOB_L1L2_0.acquire(ObjectFifoPort.Produce, 1) + + # RTPs written from the instruction stream must be read right before the kernel + # after the ObjectFIFO acquires + thresholdValue = arith.trunci(T.i16(), rtpComputeTile2[0]) + maxValue = arith.trunci(T.i16(), rtpComputeTile2[1]) + thresholdType = arith.trunci(T.i8(), rtpComputeTile2[2]) + thresholdLine( + elemIn, + elemOut, + lineWidth, + thresholdValue, + maxValue, + thresholdType, + ) - # Tile declarations - ShimTile = tile(0, 0) - MemTile = tile(0, 1) - ComputeTile2 = tile(0, 2) - ComputeTile3 = tile(0, 3) - ComputeTile4 = tile(0, 4) - ComputeTile5 = tile(0, 5) + inOOB_L2L1_0.release(ObjectFifoPort.Consume, 1) + outOOB_L1L2_0.release(ObjectFifoPort.Produce, 1) + + # Compute tile 3 + @core(ComputeTile3, "threshold.cc.o") + def core_body(): + for _ in range_(sys.maxsize): + elemIn = inOOB_L2L1_1.acquire(ObjectFifoPort.Consume, 1) + elemOut = outOOB_L1L2_1.acquire(ObjectFifoPort.Produce, 1) + # RTPs written from the instruction stream must be read right before the kernel + # after the ObjectFIFO acquires + thresholdValue = arith.trunci(T.i16(), rtpComputeTile3[0]) + maxValue = arith.trunci(T.i16(), rtpComputeTile3[1]) + thresholdType = arith.trunci(T.i8(), rtpComputeTile3[2]) + thresholdLine( + elemIn, + elemOut, + lineWidth, + thresholdValue, + maxValue, + thresholdType, + ) - # AIE-array data movement with object fifos + inOOB_L2L1_1.release(ObjectFifoPort.Consume, 1) + outOOB_L1L2_1.release(ObjectFifoPort.Produce, 1) + + # Compute tile 4 + @core(ComputeTile4, "threshold.cc.o") + def core_body(): + for _ in range_(sys.maxsize): + elemIn = inOOB_L2L1_2.acquire(ObjectFifoPort.Consume, 1) + elemOut = outOOB_L1L2_2.acquire(ObjectFifoPort.Produce, 1) + + # RTPs written from the instruction stream must be read right before the kernel + # after the ObjectFIFO acquires + thresholdValue = arith.trunci(T.i16(), rtpComputeTile4[0]) + maxValue = arith.trunci(T.i16(), rtpComputeTile4[1]) + thresholdType = arith.trunci(T.i8(), rtpComputeTile4[2]) + thresholdLine( + elemIn, + elemOut, + lineWidth, + thresholdValue, + maxValue, + thresholdType, + ) - # Input RGBA broadcast + memtile for skip - inOOB_L3L2 = object_fifo( - "inOOB_L3L2", ShimTile, MemTile, 2, line_channels_ty - ) - inOOB_L2L1_0 = object_fifo( - "inOOB_L2L1_0", MemTile, ComputeTile2, 2, line_ty - ) - inOOB_L2L1_1 = object_fifo( - "inOOB_L2L1_1", MemTile, ComputeTile3, 2, line_ty - ) - inOOB_L2L1_2 = object_fifo( - "inOOB_L2L1_2", MemTile, ComputeTile4, 2, line_ty - ) - inOOB_L2L1_3 = object_fifo( - "inOOB_L2L1_3", MemTile, ComputeTile5, 2, line_ty - ) - of_offsets = [ - np.prod(np_ndarray_type_get_shape(line_ty)) * i for i in range(4) - ] - object_fifo_link( - inOOB_L3L2, - [inOOB_L2L1_0, inOOB_L2L1_1, inOOB_L2L1_2, inOOB_L2L1_3], - [], - of_offsets, - ) + inOOB_L2L1_2.release(ObjectFifoPort.Consume, 1) + outOOB_L1L2_2.release(ObjectFifoPort.Produce, 1) + + # Compute tile 5 + @core(ComputeTile5, "threshold.cc.o") + def core_body(): + for _ in range_(sys.maxsize): + elemIn = inOOB_L2L1_3.acquire(ObjectFifoPort.Consume, 1) + elemOut = outOOB_L1L2_3.acquire(ObjectFifoPort.Produce, 1) + + # RTPs written from the instruction stream must be read right before the kernel + # after the ObjectFIFO acquires + thresholdValue = arith.trunci(T.i16(), rtpComputeTile5[0]) + maxValue = arith.trunci(T.i16(), rtpComputeTile5[1]) + thresholdType = arith.trunci(T.i8(), rtpComputeTile5[2]) + thresholdLine( + elemIn, + elemOut, + lineWidth, + thresholdValue, + maxValue, + thresholdType, + ) - # Output RGBA - outOOB_L2L3 = object_fifo( - "outOOB_L2L3", MemTile, ShimTile, 2, line_channels_ty - ) - outOOB_L1L2_0 = object_fifo( - "outOOB_L1L2_0", ComputeTile2, MemTile, 2, line_ty - ) - outOOB_L1L2_1 = object_fifo( - "outOOB_L1L2_1", ComputeTile3, MemTile, 2, line_ty + inOOB_L2L1_3.release(ObjectFifoPort.Consume, 1) + outOOB_L1L2_3.release(ObjectFifoPort.Produce, 1) + + # To/from AIE-array data movement + @runtime_sequence( + np.ndarray[(tensorSize,), np.dtype[np.int8]], + np.ndarray[(32,), np.dtype[np.int32]], # not used + np.ndarray[(tensorSize,), np.dtype[np.int8]], + ) + def sequence(inTensor, notUsed, outTensor): + # thresholdValue, maxValue, thresholdType + rtpComputeTile2[0] = 50 + rtpComputeTile2[1] = 255 + rtpComputeTile2[2] = 0 + + rtpComputeTile3[0] = 50 + rtpComputeTile3[1] = 255 + rtpComputeTile3[2] = 0 + + rtpComputeTile4[0] = 50 + rtpComputeTile4[1] = 255 + rtpComputeTile4[2] = 0 + + rtpComputeTile5[0] = 50 + rtpComputeTile5[1] = 255 + rtpComputeTile5[2] = 0 + + in_task = shim_dma_single_bd_task( + inOOB_L3L2, inTensor, sizes=[1, 1, 1, tensorSize], issue_token=True ) - outOOB_L1L2_2 = object_fifo( - "outOOB_L1L2_2", ComputeTile4, MemTile, 2, line_ty - ) - outOOB_L1L2_3 = object_fifo( - "outOOB_L1L2_3", ComputeTile5, MemTile, 2, line_ty - ) - object_fifo_link( - [outOOB_L1L2_0, outOOB_L1L2_1, outOOB_L1L2_2, outOOB_L1L2_3], + out_task = shim_dma_single_bd_task( outOOB_L2L3, - of_offsets, - [], + outTensor, + sizes=[1, 1, 1, tensorSize], + issue_token=True, ) - # Runtime parameters - rtpComputeTile2 = buffer( - ComputeTile2, - np.ndarray[(16,), np.dtype[np.int32]], - "rtpComputeTile2", - use_write_rtp=True, - ) - rtpComputeTile3 = buffer( - ComputeTile3, - np.ndarray[(16,), np.dtype[np.int32]], - "rtpComputeTile3", - use_write_rtp=True, - ) - rtpComputeTile4 = buffer( - ComputeTile4, - np.ndarray[(16,), np.dtype[np.int32]], - "rtpComputeTile4", - use_write_rtp=True, - ) - rtpComputeTile5 = buffer( - ComputeTile5, - np.ndarray[(16,), np.dtype[np.int32]], - "rtpComputeTile5", - use_write_rtp=True, - ) - - # Set up compute tiles - - # Compute tile 2 - @core(ComputeTile2, "threshold.cc.o") - def core_body(): - for _ in range_(sys.maxsize): - elemIn = inOOB_L2L1_0.acquire(ObjectFifoPort.Consume, 1) - elemOut = outOOB_L1L2_0.acquire(ObjectFifoPort.Produce, 1) - - # RTPs written from the instruction stream must be read right before the kernel - # after the ObjectFIFO acquires - thresholdValue = arith.trunci(T.i16(), rtpComputeTile2[0]) - maxValue = arith.trunci(T.i16(), rtpComputeTile2[1]) - thresholdType = arith.trunci(T.i8(), rtpComputeTile2[2]) - thresholdLine( - elemIn, - elemOut, - lineWidth, - thresholdValue, - maxValue, - thresholdType, - ) - - inOOB_L2L1_0.release(ObjectFifoPort.Consume, 1) - outOOB_L1L2_0.release(ObjectFifoPort.Produce, 1) - - # Compute tile 3 - @core(ComputeTile3, "threshold.cc.o") - def core_body(): - for _ in range_(sys.maxsize): - elemIn = inOOB_L2L1_1.acquire(ObjectFifoPort.Consume, 1) - elemOut = outOOB_L1L2_1.acquire(ObjectFifoPort.Produce, 1) - # RTPs written from the instruction stream must be read right before the kernel - # after the ObjectFIFO acquires - thresholdValue = arith.trunci(T.i16(), rtpComputeTile3[0]) - maxValue = arith.trunci(T.i16(), rtpComputeTile3[1]) - thresholdType = arith.trunci(T.i8(), rtpComputeTile3[2]) - thresholdLine( - elemIn, - elemOut, - lineWidth, - thresholdValue, - maxValue, - thresholdType, - ) - - inOOB_L2L1_1.release(ObjectFifoPort.Consume, 1) - outOOB_L1L2_1.release(ObjectFifoPort.Produce, 1) - - # Compute tile 4 - @core(ComputeTile4, "threshold.cc.o") - def core_body(): - for _ in range_(sys.maxsize): - elemIn = inOOB_L2L1_2.acquire(ObjectFifoPort.Consume, 1) - elemOut = outOOB_L1L2_2.acquire(ObjectFifoPort.Produce, 1) - - # RTPs written from the instruction stream must be read right before the kernel - # after the ObjectFIFO acquires - thresholdValue = arith.trunci(T.i16(), rtpComputeTile4[0]) - maxValue = arith.trunci(T.i16(), rtpComputeTile4[1]) - thresholdType = arith.trunci(T.i8(), rtpComputeTile4[2]) - thresholdLine( - elemIn, - elemOut, - lineWidth, - thresholdValue, - maxValue, - thresholdType, - ) - - inOOB_L2L1_2.release(ObjectFifoPort.Consume, 1) - outOOB_L1L2_2.release(ObjectFifoPort.Produce, 1) - - # Compute tile 5 - @core(ComputeTile5, "threshold.cc.o") - def core_body(): - for _ in range_(sys.maxsize): - elemIn = inOOB_L2L1_3.acquire(ObjectFifoPort.Consume, 1) - elemOut = outOOB_L1L2_3.acquire(ObjectFifoPort.Produce, 1) - - # RTPs written from the instruction stream must be read right before the kernel - # after the ObjectFIFO acquires - thresholdValue = arith.trunci(T.i16(), rtpComputeTile5[0]) - maxValue = arith.trunci(T.i16(), rtpComputeTile5[1]) - thresholdType = arith.trunci(T.i8(), rtpComputeTile5[2]) - thresholdLine( - elemIn, - elemOut, - lineWidth, - thresholdValue, - maxValue, - thresholdType, - ) - - inOOB_L2L1_3.release(ObjectFifoPort.Consume, 1) - outOOB_L1L2_3.release(ObjectFifoPort.Produce, 1) - - # To/from AIE-array data movement - @runtime_sequence( - np.ndarray[(tensorSize,), np.dtype[np.int8]], - np.ndarray[(32,), np.dtype[np.int32]], # not used - np.ndarray[(tensorSize,), np.dtype[np.int8]], - ) - def sequence(inTensor, notUsed, outTensor): - # thresholdValue, maxValue, thresholdType - rtpComputeTile2[0] = 50 - rtpComputeTile2[1] = 255 - rtpComputeTile2[2] = 0 - - rtpComputeTile3[0] = 50 - rtpComputeTile3[1] = 255 - rtpComputeTile3[2] = 0 - - rtpComputeTile4[0] = 50 - rtpComputeTile4[1] = 255 - rtpComputeTile4[2] = 0 - - rtpComputeTile5[0] = 50 - rtpComputeTile5[1] = 255 - rtpComputeTile5[2] = 0 - - in_task = shim_dma_single_bd_task( - inOOB_L3L2, inTensor, sizes=[1, 1, 1, tensorSize], issue_token=True - ) - out_task = shim_dma_single_bd_task( - outOOB_L2L3, - outTensor, - sizes=[1, 1, 1, tensorSize], - issue_token=True, - ) - - dma_start_task(in_task, out_task) - dma_await_task(in_task, out_task) - - # print(ctx.module.operation.verify()) + dma_start_task(in_task, out_task) + dma_await_task(in_task, out_task) + +try: + device_name = str(sys.argv[1]) + if device_name == "npu": + dev = AIEDevice.npu1_1col + elif device_name == "npu2": + dev = AIEDevice.npu2 + else: + raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1])) + width = 512 if (len(sys.argv) != 4) else int(sys.argv[2]) + height = 9 if (len(sys.argv) != 4) else int(sys.argv[3]) +except ValueError: + print("Argument has inappropriate value") +with mlir_mod_ctx() as ctx: + #print(ctx.module.operation.verify()) + color_threshold(dev, width, height) print(ctx.module) - -color_threshold() diff --git a/programming_examples/vision/edge_detect/aie2_edgeDetect_alt.py b/programming_examples/vision/edge_detect/aie2_edgeDetect_alt.py index bbbdc586b6..75b22602be 100644 --- a/programming_examples/vision/edge_detect/aie2_edgeDetect_alt.py +++ b/programming_examples/vision/edge_detect/aie2_edgeDetect_alt.py @@ -12,277 +12,280 @@ from aie.helpers.dialects.ext.scf import _for as range_ from aie.extras.context import mlir_mod_ctx -width = 64 -height = 36 -if len(sys.argv) == 3: - width = int(sys.argv[1]) - height = int(sys.argv[2]) -heightMinus1 = height - 1 -lineWidth = width -lineWidthInBytes = width * 4 -tensorSize = width * height * 4 # 4 channels - -enableTrace = False -traceSizeInBytes = 8192 -traceSizeInInt32s = traceSizeInBytes // 4 - - -def edge_detect(): - with mlir_mod_ctx() as ctx: - - @device(AIEDevice.npu1_1col) - def device_body(): - line_bytes_ty = np.ndarray[(lineWidthInBytes,), np.dtype[np.uint8]] - line_ty = np.ndarray[(lineWidth,), np.dtype[np.uint8]] - tensor_3x3_ty = np.ndarray[(3, 3), np.dtype[np.int16]] - - tensor_ty = np.ndarray[(tensorSize,), np.dtype[np.int8]] - tensor_16x16_ty = np.ndarray[(16, 16), np.dtype[np.int32]] - - # AIE Core Function declarations - rgba2gray_line = external_func( - "rgba2grayLine", inputs=[line_bytes_ty, line_ty, np.int32] - ) - filter2d_line = external_func( - "filter2dLine", - inputs=[line_ty, line_ty, line_ty, line_ty, np.int32, tensor_3x3_ty], - ) - threshold_line = external_func( - "thresholdLine", - inputs=[line_ty, line_ty, np.int32, np.int16, np.int16, np.int8], - ) - gray2rgba_line = external_func( - "gray2rgbaLine", inputs=[line_ty, line_bytes_ty, np.int32] - ) - add_weighted_line = external_func( - "addWeightedLine", - inputs=[ - line_bytes_ty, - line_bytes_ty, - line_bytes_ty, - np.int32, - np.int16, - np.int16, - np.int8, - ], - ) - - # Tile declarations - ShimTile = tile(0, 0) - MemTile = tile(0, 1) - ComputeTile2 = tile(0, 2) - ComputeTile3 = tile(0, 3) - ComputeTile4 = tile(0, 4) - ComputeTile5 = tile(0, 5) - - # AIE-array data movement with object fifos - # Input - inOF_L3L2 = object_fifo( - "inOF_L3L2", - ShimTile, - [ComputeTile2, MemTile], - [2, 2, 7], +def edge_detect(dev, width, height): + heightMinus1 = height - 1 + lineWidth = width + lineWidthInBytes = width * 4 + tensorSize = width * height * 4 # 4 channels + + enableTrace = False + traceSizeInBytes = 8192 + traceSizeInInt32s = traceSizeInBytes // 4 + + @device(dev) + def device_body(): + line_bytes_ty = np.ndarray[(lineWidthInBytes,), np.dtype[np.uint8]] + line_ty = np.ndarray[(lineWidth,), np.dtype[np.uint8]] + tensor_3x3_ty = np.ndarray[(3, 3), np.dtype[np.int16]] + + tensor_ty = np.ndarray[(tensorSize,), np.dtype[np.int8]] + tensor_16x16_ty = np.ndarray[(16, 16), np.dtype[np.int32]] + + # AIE Core Function declarations + rgba2gray_line = external_func( + "rgba2grayLine", inputs=[line_bytes_ty, line_ty, np.int32] + ) + filter2d_line = external_func( + "filter2dLine", + inputs=[line_ty, line_ty, line_ty, line_ty, np.int32, tensor_3x3_ty], + ) + threshold_line = external_func( + "thresholdLine", + inputs=[line_ty, line_ty, np.int32, np.int16, np.int16, np.int8], + ) + gray2rgba_line = external_func( + "gray2rgbaLine", inputs=[line_ty, line_bytes_ty, np.int32] + ) + add_weighted_line = external_func( + "addWeightedLine", + inputs=[ line_bytes_ty, - ) - inOF_L2L1 = object_fifo( - "inOF_L2L1", - MemTile, - ComputeTile5, - 7, line_bytes_ty, - ) - object_fifo_link(inOF_L3L2, inOF_L2L1) - - # Output - outOF_L2L3 = object_fifo( - "outOF_L2L3", - MemTile, - ShimTile, - 2, - line_bytes_ty, - ) - outOF_L1L2 = object_fifo( - "outOF_L1L2", - ComputeTile5, - MemTile, - 2, line_bytes_ty, + np.int32, + np.int16, + np.int16, + np.int8, + ], + ) + + # Tile declarations + ShimTile = tile(0, 0) + MemTile = tile(0, 1) + ComputeTile2 = tile(0, 2) + ComputeTile3 = tile(0, 3) + ComputeTile4 = tile(0, 4) + ComputeTile5 = tile(0, 5) + + # AIE-array data movement with object fifos + # Input + inOF_L3L2 = object_fifo( + "inOF_L3L2", + ShimTile, + [ComputeTile2, MemTile], + [2, 2, 7], + line_bytes_ty, + ) + inOF_L2L1 = object_fifo( + "inOF_L2L1", + MemTile, + ComputeTile5, + 7, + line_bytes_ty, + ) + object_fifo_link(inOF_L3L2, inOF_L2L1) + + # Output + outOF_L2L3 = object_fifo( + "outOF_L2L3", + MemTile, + ShimTile, + 2, + line_bytes_ty, + ) + outOF_L1L2 = object_fifo( + "outOF_L1L2", + ComputeTile5, + MemTile, + 2, + line_bytes_ty, + ) + object_fifo_link(outOF_L1L2, outOF_L2L3) + + # Intermediate + OF_2to3 = object_fifo( + "OF_2to3", + ComputeTile2, + ComputeTile3, + 4, + line_ty, + ) + OF_3to4 = object_fifo( + "OF_3to4", + ComputeTile3, + ComputeTile4, + 2, + line_ty, + ) + OF_4to5 = object_fifo( + "OF_4to5", + ComputeTile4, + ComputeTile5, + 2, + line_ty, + ) + OF_5to5 = object_fifo( + "OF_5to5", + ComputeTile5, + ComputeTile5, + 1, + line_bytes_ty, + ) + + # Set up compute tiles + + # Compute tile 2 + @core(ComputeTile2, "rgba2gray.cc.o") + def core_body(): + for _ in range_(sys.maxsize): + elem_in = inOF_L3L2.acquire(ObjectFifoPort.Consume, 1) + elem_out = OF_2to3.acquire(ObjectFifoPort.Produce, 1) + + rgba2gray_line(elem_in, elem_out, lineWidth) + + inOF_L3L2.release(ObjectFifoPort.Consume, 1) + OF_2to3.release(ObjectFifoPort.Produce, 1) + + # Compute tile 3 + @core(ComputeTile3, "filter2d.cc.o") + def core_body(): + v0 = 0 + v1 = 4096 + v_minus4 = -16384 + initial_value = np.array( + [[v0, v1, v0], [v1, v_minus4, v1], [v0, v1, v0]], dtype=np.int16 ) - object_fifo_link(outOF_L1L2, outOF_L2L3) - - # Intermediate - OF_2to3 = object_fifo( - "OF_2to3", - ComputeTile2, - ComputeTile3, - 4, - line_ty, - ) - OF_3to4 = object_fifo( - "OF_3to4", + kernel = buffer( ComputeTile3, - ComputeTile4, - 2, - line_ty, + np.ndarray[(3, 3), np.dtype[np.int16]], + "kernel", + initial_value=initial_value, ) - OF_4to5 = object_fifo( - "OF_4to5", - ComputeTile4, - ComputeTile5, - 2, - line_ty, - ) - OF_5to5 = object_fifo( - "OF_5to5", - ComputeTile5, - ComputeTile5, - 1, - line_bytes_ty, - ) - - # Set up compute tiles - - # Compute tile 2 - @core(ComputeTile2, "rgba2gray.cc.o") - def core_body(): - for _ in range_(sys.maxsize): - elem_in = inOF_L3L2.acquire(ObjectFifoPort.Consume, 1) - elem_out = OF_2to3.acquire(ObjectFifoPort.Produce, 1) - - rgba2gray_line(elem_in, elem_out, lineWidth) - - inOF_L3L2.release(ObjectFifoPort.Consume, 1) - OF_2to3.release(ObjectFifoPort.Produce, 1) - # Compute tile 3 - @core(ComputeTile3, "filter2d.cc.o") - def core_body(): - v0 = 0 - v1 = 4096 - v_minus4 = -16384 - initial_value = np.array( - [[v0, v1, v0], [v1, v_minus4, v1], [v0, v1, v0]], dtype=np.int16 + for _ in range_(sys.maxsize): + # Preamble : Top Border + elems_in_pre = OF_2to3.acquire(ObjectFifoPort.Consume, 2) + elem_pre_out = OF_3to4.acquire(ObjectFifoPort.Produce, 1) + filter2d_line( + elems_in_pre[0], + elems_in_pre[0], + elems_in_pre[1], + elem_pre_out, + lineWidth, + kernel, ) - kernel = buffer( - ComputeTile3, - np.ndarray[(3, 3), np.dtype[np.int16]], - "kernel", - initial_value=initial_value, - ) - - for _ in range_(sys.maxsize): - # Preamble : Top Border - elems_in_pre = OF_2to3.acquire(ObjectFifoPort.Consume, 2) - elem_pre_out = OF_3to4.acquire(ObjectFifoPort.Produce, 1) - filter2d_line( - elems_in_pre[0], - elems_in_pre[0], - elems_in_pre[1], - elem_pre_out, - lineWidth, - kernel, - ) - OF_3to4.release(ObjectFifoPort.Produce, 1) + OF_3to4.release(ObjectFifoPort.Produce, 1) - # Steady State : Middle - for _ in range_(1, heightMinus1): - elems_in = OF_2to3.acquire(ObjectFifoPort.Consume, 3) - elem_out = OF_3to4.acquire(ObjectFifoPort.Produce, 1) - filter2d_line( - elems_in[0], - elems_in[1], - elems_in[2], - elem_out, - lineWidth, - kernel, - ) - OF_2to3.release(ObjectFifoPort.Consume, 1) - OF_3to4.release(ObjectFifoPort.Produce, 1) - - # Postamble : Bottom Border - elems_in_post = OF_2to3.acquire(ObjectFifoPort.Consume, 2) - elem_post_out = OF_3to4.acquire(ObjectFifoPort.Produce, 1) + # Steady State : Middle + for _ in range_(1, heightMinus1): + elems_in = OF_2to3.acquire(ObjectFifoPort.Consume, 3) + elem_out = OF_3to4.acquire(ObjectFifoPort.Produce, 1) filter2d_line( - elems_in_post[0], - elems_in_post[1], - elems_in_post[1], - elem_post_out, + elems_in[0], + elems_in[1], + elems_in[2], + elem_out, lineWidth, kernel, ) - OF_2to3.release(ObjectFifoPort.Consume, 2) + OF_2to3.release(ObjectFifoPort.Consume, 1) OF_3to4.release(ObjectFifoPort.Produce, 1) - # Compute tile 4 - @core(ComputeTile4, "threshold.cc.o") - def core_body(): - v_thr = 10 - v_max = 255 - v_typ = 0 - - for _ in range_(sys.maxsize): - elem_in = OF_3to4.acquire(ObjectFifoPort.Consume, 1) - elem_out = OF_4to5.acquire(ObjectFifoPort.Produce, 1) - - threshold_line(elem_in, elem_out, lineWidth, v_thr, v_max, v_typ) - - OF_3to4.release(ObjectFifoPort.Consume, 1) - OF_4to5.release(ObjectFifoPort.Produce, 1) - - # Compute tile 5 - @core(ComputeTile5, "combined_gray2rgba_addWeighted.a") - def core_body(): - for _ in range_(sys.maxsize): - elem_in = OF_4to5.acquire(ObjectFifoPort.Consume, 1) - elem_out = OF_5to5.acquire(ObjectFifoPort.Produce, 1) - - gray2rgba_line(elem_in, elem_out, lineWidth) - - OF_4to5.release(ObjectFifoPort.Consume, 1) - OF_5to5.release(ObjectFifoPort.Produce, 1) - - elem_in1 = OF_5to5.acquire(ObjectFifoPort.Consume, 1) - elem_in2 = inOF_L2L1.acquire(ObjectFifoPort.Consume, 1) - elem_out2 = outOF_L1L2.acquire(ObjectFifoPort.Produce, 1) - - alpha = 16384 - beta = 16384 - gamma = 0 - - add_weighted_line( - elem_in1, - elem_in2, - elem_out2, - lineWidthInBytes, - alpha, - beta, - gamma, - ) - - OF_5to5.release(ObjectFifoPort.Consume, 1) - inOF_L2L1.release(ObjectFifoPort.Consume, 1) - outOF_L1L2.release(ObjectFifoPort.Produce, 1) - - # To/from AIE-array data movement - @runtime_sequence(tensor_ty, tensor_16x16_ty, tensor_ty) - def sequence(I, B, O): - in_task = shim_dma_single_bd_task( - inOF_L3L2, I, sizes=[1, 1, 1, tensorSize] + # Postamble : Bottom Border + elems_in_post = OF_2to3.acquire(ObjectFifoPort.Consume, 2) + elem_post_out = OF_3to4.acquire(ObjectFifoPort.Produce, 1) + filter2d_line( + elems_in_post[0], + elems_in_post[1], + elems_in_post[1], + elem_post_out, + lineWidth, + kernel, ) - out_task = shim_dma_single_bd_task( - outOF_L2L3, - O, - sizes=[1, 1, 1, tensorSize], - issue_token=True, + OF_2to3.release(ObjectFifoPort.Consume, 2) + OF_3to4.release(ObjectFifoPort.Produce, 1) + + # Compute tile 4 + @core(ComputeTile4, "threshold.cc.o") + def core_body(): + v_thr = 10 + v_max = 255 + v_typ = 0 + + for _ in range_(sys.maxsize): + elem_in = OF_3to4.acquire(ObjectFifoPort.Consume, 1) + elem_out = OF_4to5.acquire(ObjectFifoPort.Produce, 1) + + threshold_line(elem_in, elem_out, lineWidth, v_thr, v_max, v_typ) + + OF_3to4.release(ObjectFifoPort.Consume, 1) + OF_4to5.release(ObjectFifoPort.Produce, 1) + + # Compute tile 5 + @core(ComputeTile5, "combined_gray2rgba_addWeighted.a") + def core_body(): + for _ in range_(sys.maxsize): + elem_in = OF_4to5.acquire(ObjectFifoPort.Consume, 1) + elem_out = OF_5to5.acquire(ObjectFifoPort.Produce, 1) + + gray2rgba_line(elem_in, elem_out, lineWidth) + + OF_4to5.release(ObjectFifoPort.Consume, 1) + OF_5to5.release(ObjectFifoPort.Produce, 1) + + elem_in1 = OF_5to5.acquire(ObjectFifoPort.Consume, 1) + elem_in2 = inOF_L2L1.acquire(ObjectFifoPort.Consume, 1) + elem_out2 = outOF_L1L2.acquire(ObjectFifoPort.Produce, 1) + + alpha = 16384 + beta = 16384 + gamma = 0 + + add_weighted_line( + elem_in1, + elem_in2, + elem_out2, + lineWidthInBytes, + alpha, + beta, + gamma, ) - dma_start_task(in_task, out_task) - dma_await_task(out_task) - dma_free_task(in_task) - - # print(ctx.module.operation.verify()) - print(ctx.module) + OF_5to5.release(ObjectFifoPort.Consume, 1) + inOF_L2L1.release(ObjectFifoPort.Consume, 1) + outOF_L1L2.release(ObjectFifoPort.Produce, 1) + # To/from AIE-array data movement + @runtime_sequence(tensor_ty, tensor_16x16_ty, tensor_ty) + def sequence(I, B, O): + in_task = shim_dma_single_bd_task( + inOF_L3L2, I, sizes=[1, 1, 1, tensorSize] + ) + out_task = shim_dma_single_bd_task( + outOF_L2L3, + O, + sizes=[1, 1, 1, tensorSize], + issue_token=True, + ) -edge_detect() + dma_start_task(in_task, out_task) + dma_await_task(out_task) + dma_free_task(in_task) + +try: + device_name = str(sys.argv[1]) + if device_name == "npu": + dev = AIEDevice.npu1_1col + elif device_name == "npu2": + dev = AIEDevice.npu2 + else: + raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1])) + width = 36 if (len(sys.argv) != 4) else int(sys.argv[2]) + height = 64 if (len(sys.argv) != 4) else int(sys.argv[3]) +except ValueError: + print("Argument has inappropriate value") +with mlir_mod_ctx() as ctx: + #print(ctx.module.operation.verify()) + edge_detect(dev, width, height) + print(ctx.module) From 75ef8d72141e1237f2d86c1b1d210436f6c8bbb1 Mon Sep 17 00:00:00 2001 From: Joseph Melber Date: Wed, 4 Dec 2024 12:53:05 -0700 Subject: [PATCH 32/35] Vision tests use return not FileCheck --- programming_examples/vision/color_threshold/run_makefile.lit | 3 +-- .../vision/color_threshold/run_makefile_alt.lit | 3 +-- programming_examples/vision/edge_detect/run_makefile.lit | 4 +--- programming_examples/vision/edge_detect/run_makefile_alt.lit | 3 +-- 4 files changed, 4 insertions(+), 9 deletions(-) diff --git a/programming_examples/vision/color_threshold/run_makefile.lit b/programming_examples/vision/color_threshold/run_makefile.lit index c6e18a3da4..40fc6f201d 100644 --- a/programming_examples/vision/color_threshold/run_makefile.lit +++ b/programming_examples/vision/color_threshold/run_makefile.lit @@ -5,6 +5,5 @@ // // RUN: make -f %S/Makefile clean // RUN: make -f %S/Makefile - // RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s - // CHECK: PASS! + // RUN: %run_on_npu make -f %S/Makefile run \ No newline at end of file diff --git a/programming_examples/vision/color_threshold/run_makefile_alt.lit b/programming_examples/vision/color_threshold/run_makefile_alt.lit index 9f5617f16c..19bd34a2d0 100644 --- a/programming_examples/vision/color_threshold/run_makefile_alt.lit +++ b/programming_examples/vision/color_threshold/run_makefile_alt.lit @@ -7,6 +7,5 @@ // RUN: cd test_alt // RUN: make -f %S/Makefile clean // RUN: env use_alt=1 make -f %S/Makefile -// RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s -// CHECK: PASS! +// RUN: %run_on_npu make -f %S/Makefile run \ No newline at end of file diff --git a/programming_examples/vision/edge_detect/run_makefile.lit b/programming_examples/vision/edge_detect/run_makefile.lit index c6e18a3da4..2368db78ff 100644 --- a/programming_examples/vision/edge_detect/run_makefile.lit +++ b/programming_examples/vision/edge_detect/run_makefile.lit @@ -5,6 +5,4 @@ // // RUN: make -f %S/Makefile clean // RUN: make -f %S/Makefile - // RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s - // CHECK: PASS! - \ No newline at end of file + // RUN: %run_on_npu make -f %S/Makefile run \ No newline at end of file diff --git a/programming_examples/vision/edge_detect/run_makefile_alt.lit b/programming_examples/vision/edge_detect/run_makefile_alt.lit index 9f5617f16c..19bd34a2d0 100644 --- a/programming_examples/vision/edge_detect/run_makefile_alt.lit +++ b/programming_examples/vision/edge_detect/run_makefile_alt.lit @@ -7,6 +7,5 @@ // RUN: cd test_alt // RUN: make -f %S/Makefile clean // RUN: env use_alt=1 make -f %S/Makefile -// RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s -// CHECK: PASS! +// RUN: %run_on_npu make -f %S/Makefile run \ No newline at end of file From cbf116eb794b6493d3f693d086a97f3597e156a9 Mon Sep 17 00:00:00 2001 From: Joseph Melber Date: Wed, 4 Dec 2024 12:55:06 -0700 Subject: [PATCH 33/35] Apply suggestions from code review Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- .../color_threshold/aie2_colorThreshold.py | 47 +++++-------------- .../vision/edge_detect/aie2_edgeDetect.py | 3 +- .../vision/edge_detect/aie2_edgeDetect_alt.py | 7 ++- 3 files changed, 18 insertions(+), 39 deletions(-) diff --git a/programming_examples/vision/color_threshold/aie2_colorThreshold.py b/programming_examples/vision/color_threshold/aie2_colorThreshold.py index c8eb7e8657..277221f309 100644 --- a/programming_examples/vision/color_threshold/aie2_colorThreshold.py +++ b/programming_examples/vision/color_threshold/aie2_colorThreshold.py @@ -46,24 +46,12 @@ def device_body(): # AIE-array data movement with object fifos # Input RGBA broadcast + memtile for skip - inOOB_L3L2 = object_fifo( - "inOOB_L3L2", ShimTile, MemTile, 2, line_channels_ty - ) - inOOB_L2L1_0 = object_fifo( - "inOOB_L2L1_0", MemTile, ComputeTile2, 2, line_ty - ) - inOOB_L2L1_1 = object_fifo( - "inOOB_L2L1_1", MemTile, ComputeTile3, 2, line_ty - ) - inOOB_L2L1_2 = object_fifo( - "inOOB_L2L1_2", MemTile, ComputeTile4, 2, line_ty - ) - inOOB_L2L1_3 = object_fifo( - "inOOB_L2L1_3", MemTile, ComputeTile5, 2, line_ty - ) - of_offsets = [ - np.prod(np_ndarray_type_get_shape(line_ty)) * i for i in range(4) - ] + inOOB_L3L2 = object_fifo("inOOB_L3L2", ShimTile, MemTile, 2, line_channels_ty) + inOOB_L2L1_0 = object_fifo("inOOB_L2L1_0", MemTile, ComputeTile2, 2, line_ty) + inOOB_L2L1_1 = object_fifo("inOOB_L2L1_1", MemTile, ComputeTile3, 2, line_ty) + inOOB_L2L1_2 = object_fifo("inOOB_L2L1_2", MemTile, ComputeTile4, 2, line_ty) + inOOB_L2L1_3 = object_fifo("inOOB_L2L1_3", MemTile, ComputeTile5, 2, line_ty) + of_offsets = [np.prod(np_ndarray_type_get_shape(line_ty)) * i for i in range(4)] object_fifo_link( inOOB_L3L2, [inOOB_L2L1_0, inOOB_L2L1_1, inOOB_L2L1_2, inOOB_L2L1_3], @@ -72,21 +60,11 @@ def device_body(): ) # Output RGBA - outOOB_L2L3 = object_fifo( - "outOOB_L2L3", MemTile, ShimTile, 2, line_channels_ty - ) - outOOB_L1L2_0 = object_fifo( - "outOOB_L1L2_0", ComputeTile2, MemTile, 2, line_ty - ) - outOOB_L1L2_1 = object_fifo( - "outOOB_L1L2_1", ComputeTile3, MemTile, 2, line_ty - ) - outOOB_L1L2_2 = object_fifo( - "outOOB_L1L2_2", ComputeTile4, MemTile, 2, line_ty - ) - outOOB_L1L2_3 = object_fifo( - "outOOB_L1L2_3", ComputeTile5, MemTile, 2, line_ty - ) + outOOB_L2L3 = object_fifo("outOOB_L2L3", MemTile, ShimTile, 2, line_channels_ty) + outOOB_L1L2_0 = object_fifo("outOOB_L1L2_0", ComputeTile2, MemTile, 2, line_ty) + outOOB_L1L2_1 = object_fifo("outOOB_L1L2_1", ComputeTile3, MemTile, 2, line_ty) + outOOB_L1L2_2 = object_fifo("outOOB_L1L2_2", ComputeTile4, MemTile, 2, line_ty) + outOOB_L1L2_3 = object_fifo("outOOB_L1L2_3", ComputeTile5, MemTile, 2, line_ty) object_fifo_link( [outOOB_L1L2_0, outOOB_L1L2_1, outOOB_L1L2_2, outOOB_L1L2_3], outOOB_L2L3, @@ -256,6 +234,7 @@ def sequence(inTensor, notUsed, outTensor): ) dma_wait(inOOB_L3L2, outOOB_L2L3) + try: device_name = str(sys.argv[1]) if device_name == "npu": @@ -269,7 +248,7 @@ def sequence(inTensor, notUsed, outTensor): except ValueError: print("Argument has inappropriate value") with mlir_mod_ctx() as ctx: - #print(ctx.module.operation.verify()) + # print(ctx.module.operation.verify()) color_threshold(dev, width, height) print(ctx.module) diff --git a/programming_examples/vision/edge_detect/aie2_edgeDetect.py b/programming_examples/vision/edge_detect/aie2_edgeDetect.py index 23565c8b47..4efc78dbf7 100644 --- a/programming_examples/vision/edge_detect/aie2_edgeDetect.py +++ b/programming_examples/vision/edge_detect/aie2_edgeDetect.py @@ -274,6 +274,7 @@ def sequence(I, B, O): # outOF_L2L3 will only complete after inOF_L3L2 completes, so we just wait on outOF_L2L3 instead of all dma_wait(outOF_L2L3) + try: device_name = str(sys.argv[1]) if device_name == "npu": @@ -287,6 +288,6 @@ def sequence(I, B, O): except ValueError: print("Argument has inappropriate value") with mlir_mod_ctx() as ctx: - #print(ctx.module.operation.verify()) + # print(ctx.module.operation.verify()) edge_detect(dev, width, height) print(ctx.module) diff --git a/programming_examples/vision/edge_detect/aie2_edgeDetect_alt.py b/programming_examples/vision/edge_detect/aie2_edgeDetect_alt.py index 75b22602be..75a34e5533 100644 --- a/programming_examples/vision/edge_detect/aie2_edgeDetect_alt.py +++ b/programming_examples/vision/edge_detect/aie2_edgeDetect_alt.py @@ -259,9 +259,7 @@ def core_body(): # To/from AIE-array data movement @runtime_sequence(tensor_ty, tensor_16x16_ty, tensor_ty) def sequence(I, B, O): - in_task = shim_dma_single_bd_task( - inOF_L3L2, I, sizes=[1, 1, 1, tensorSize] - ) + in_task = shim_dma_single_bd_task(inOF_L3L2, I, sizes=[1, 1, 1, tensorSize]) out_task = shim_dma_single_bd_task( outOF_L2L3, O, @@ -273,6 +271,7 @@ def sequence(I, B, O): dma_await_task(out_task) dma_free_task(in_task) + try: device_name = str(sys.argv[1]) if device_name == "npu": @@ -286,6 +285,6 @@ def sequence(I, B, O): except ValueError: print("Argument has inappropriate value") with mlir_mod_ctx() as ctx: - #print(ctx.module.operation.verify()) + # print(ctx.module.operation.verify()) edge_detect(dev, width, height) print(ctx.module) From d8982633409c52be758a0145b9480bca1fde48ed Mon Sep 17 00:00:00 2001 From: Joseph Melber Date: Wed, 4 Dec 2024 12:55:55 -0700 Subject: [PATCH 34/35] Apply code review Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- .../vision/color_threshold/aie2_colorThreshold.py | 1 - 1 file changed, 1 deletion(-) diff --git a/programming_examples/vision/color_threshold/aie2_colorThreshold.py b/programming_examples/vision/color_threshold/aie2_colorThreshold.py index 277221f309..1bd250c281 100644 --- a/programming_examples/vision/color_threshold/aie2_colorThreshold.py +++ b/programming_examples/vision/color_threshold/aie2_colorThreshold.py @@ -251,4 +251,3 @@ def sequence(inTensor, notUsed, outTensor): # print(ctx.module.operation.verify()) color_threshold(dev, width, height) print(ctx.module) - From 9278ee05d431ab4f207c050968ea6ba2e7a8d473 Mon Sep 17 00:00:00 2001 From: Joseph Melber Date: Wed, 4 Dec 2024 12:56:31 -0700 Subject: [PATCH 35/35] Apply suggestions from code review Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- .../aie2_colorThreshold_alt.py | 48 +++++-------------- 1 file changed, 13 insertions(+), 35 deletions(-) diff --git a/programming_examples/vision/color_threshold/aie2_colorThreshold_alt.py b/programming_examples/vision/color_threshold/aie2_colorThreshold_alt.py index b676b9168e..84e2de3895 100644 --- a/programming_examples/vision/color_threshold/aie2_colorThreshold_alt.py +++ b/programming_examples/vision/color_threshold/aie2_colorThreshold_alt.py @@ -46,24 +46,12 @@ def device_body(): # AIE-array data movement with object fifos # Input RGBA broadcast + memtile for skip - inOOB_L3L2 = object_fifo( - "inOOB_L3L2", ShimTile, MemTile, 2, line_channels_ty - ) - inOOB_L2L1_0 = object_fifo( - "inOOB_L2L1_0", MemTile, ComputeTile2, 2, line_ty - ) - inOOB_L2L1_1 = object_fifo( - "inOOB_L2L1_1", MemTile, ComputeTile3, 2, line_ty - ) - inOOB_L2L1_2 = object_fifo( - "inOOB_L2L1_2", MemTile, ComputeTile4, 2, line_ty - ) - inOOB_L2L1_3 = object_fifo( - "inOOB_L2L1_3", MemTile, ComputeTile5, 2, line_ty - ) - of_offsets = [ - np.prod(np_ndarray_type_get_shape(line_ty)) * i for i in range(4) - ] + inOOB_L3L2 = object_fifo("inOOB_L3L2", ShimTile, MemTile, 2, line_channels_ty) + inOOB_L2L1_0 = object_fifo("inOOB_L2L1_0", MemTile, ComputeTile2, 2, line_ty) + inOOB_L2L1_1 = object_fifo("inOOB_L2L1_1", MemTile, ComputeTile3, 2, line_ty) + inOOB_L2L1_2 = object_fifo("inOOB_L2L1_2", MemTile, ComputeTile4, 2, line_ty) + inOOB_L2L1_3 = object_fifo("inOOB_L2L1_3", MemTile, ComputeTile5, 2, line_ty) + of_offsets = [np.prod(np_ndarray_type_get_shape(line_ty)) * i for i in range(4)] object_fifo_link( inOOB_L3L2, [inOOB_L2L1_0, inOOB_L2L1_1, inOOB_L2L1_2, inOOB_L2L1_3], @@ -72,21 +60,11 @@ def device_body(): ) # Output RGBA - outOOB_L2L3 = object_fifo( - "outOOB_L2L3", MemTile, ShimTile, 2, line_channels_ty - ) - outOOB_L1L2_0 = object_fifo( - "outOOB_L1L2_0", ComputeTile2, MemTile, 2, line_ty - ) - outOOB_L1L2_1 = object_fifo( - "outOOB_L1L2_1", ComputeTile3, MemTile, 2, line_ty - ) - outOOB_L1L2_2 = object_fifo( - "outOOB_L1L2_2", ComputeTile4, MemTile, 2, line_ty - ) - outOOB_L1L2_3 = object_fifo( - "outOOB_L1L2_3", ComputeTile5, MemTile, 2, line_ty - ) + outOOB_L2L3 = object_fifo("outOOB_L2L3", MemTile, ShimTile, 2, line_channels_ty) + outOOB_L1L2_0 = object_fifo("outOOB_L1L2_0", ComputeTile2, MemTile, 2, line_ty) + outOOB_L1L2_1 = object_fifo("outOOB_L1L2_1", ComputeTile3, MemTile, 2, line_ty) + outOOB_L1L2_2 = object_fifo("outOOB_L1L2_2", ComputeTile4, MemTile, 2, line_ty) + outOOB_L1L2_3 = object_fifo("outOOB_L1L2_3", ComputeTile5, MemTile, 2, line_ty) object_fifo_link( [outOOB_L1L2_0, outOOB_L1L2_1, outOOB_L1L2_2, outOOB_L1L2_3], outOOB_L2L3, @@ -254,6 +232,7 @@ def sequence(inTensor, notUsed, outTensor): dma_start_task(in_task, out_task) dma_await_task(in_task, out_task) + try: device_name = str(sys.argv[1]) if device_name == "npu": @@ -267,7 +246,6 @@ def sequence(inTensor, notUsed, outTensor): except ValueError: print("Argument has inappropriate value") with mlir_mod_ctx() as ctx: - #print(ctx.module.operation.verify()) + # print(ctx.module.operation.verify()) color_threshold(dev, width, height) print(ctx.module) -