Add a pass to chain DMA BDs #931

Yu-Zhewen · 2024-11-26T00:23:12Z

Controlcode before

amdaie.npu.write_bd {bd_id = 12 : ui32, buffer_length = 8192 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = false, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 16, 32, 16>, strides = array<i32: 16, 256, 1>, use_next_bd = false, valid_bd = true}
amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 12 : ui32, col = 0 : ui32, offset = 0 : ui32}
%99 = amdaie.npu.push_to_queue async {bd_id = 12 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32}
amdaie.npu.write_bd {bd_id = 12 : ui32, buffer_length = 8192 : ui32, buffer_offset = 0 : ui32, col = 1 : ui32, enable_packet = false, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 16, 32, 16>, strides = array<i32: 16, 256, 1>, use_next_bd = false, valid_bd = true}
amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 12 : ui32, col = 1 : ui32, offset = 32768 : ui32}
%100 = amdaie.npu.push_to_queue async {bd_id = 12 : ui32, channel = 0 : ui32, col = 1 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32}
amdaie.npu.dma_wait(%99 : !amdaie.async_token)
amdaie.npu.dma_wait(%100 : !amdaie.async_token)
amdaie.npu.write_bd {bd_id = 13 : ui32, buffer_length = 8192 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = false, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 16, 32, 16>, strides = array<i32: 16, 256, 1>, use_next_bd = false, valid_bd = true}
amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 13 : ui32, col = 0 : ui32, offset = 0 : ui32}
%101 = amdaie.npu.push_to_queue async {bd_id = 13 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32}
amdaie.npu.write_bd {bd_id = 13 : ui32, buffer_length = 8192 : ui32, buffer_offset = 0 : ui32, col = 1 : ui32, enable_packet = false, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 16, 32, 16>, strides = array<i32: 16, 256, 1>, use_next_bd = false, valid_bd = true}
amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 13 : ui32, col = 1 : ui32, offset = 32768 : ui32}
%102 = amdaie.npu.push_to_queue async {bd_id = 13 : ui32, channel = 0 : ui32, col = 1 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32}
amdaie.npu.dma_wait(%101 : !amdaie.async_token)
amdaie.npu.dma_wait(%102 : !amdaie.async_token)
amdaie.npu.write_bd {bd_id = 14 : ui32, buffer_length = 8192 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = false, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 16, 32, 16>, strides = array<i32: 16, 256, 1>, use_next_bd = false, valid_bd = true}
amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 14 : ui32, col = 0 : ui32, offset = 0 : ui32}
%103 = amdaie.npu.push_to_queue async {bd_id = 14 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32}
amdaie.npu.write_bd {bd_id = 14 : ui32, buffer_length = 8192 : ui32, buffer_offset = 0 : ui32, col = 1 : ui32, enable_packet = false, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 16, 32, 16>, strides = array<i32: 16, 256, 1>, use_next_bd = false, valid_bd = true}
amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 14 : ui32, col = 1 : ui32, offset = 32768 : ui32}
%104 = amdaie.npu.push_to_queue async {bd_id = 14 : ui32, channel = 0 : ui32, col = 1 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32}
amdaie.npu.dma_wait(%103 : !amdaie.async_token)
amdaie.npu.dma_wait(%104 : !amdaie.async_token)

Controlcode After:

amdaie.npu.write_bd {bd_id = 12 : ui32, buffer_length = 8192 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = false, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 13 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 16, 32, 16>, strides = array<i32: 16, 256, 1>, use_next_bd = true, valid_bd = true}
amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 12 : ui32, col = 0 : ui32, offset = 0 : ui32}
amdaie.npu.write_bd {bd_id = 12 : ui32, buffer_length = 8192 : ui32, buffer_offset = 0 : ui32, col = 1 : ui32, enable_packet = false, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 13 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 16, 32, 16>, strides = array<i32: 16, 256, 1>, use_next_bd = true, valid_bd = true}
amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 12 : ui32, col = 1 : ui32, offset = 32768 : ui32}
amdaie.npu.write_bd {bd_id = 13 : ui32, buffer_length = 8192 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = false, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 14 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 16, 32, 16>, strides = array<i32: 16, 256, 1>, use_next_bd = true, valid_bd = true}
amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 13 : ui32, col = 0 : ui32, offset = 0 : ui32}
amdaie.npu.write_bd {bd_id = 13 : ui32, buffer_length = 8192 : ui32, buffer_offset = 0 : ui32, col = 1 : ui32, enable_packet = false, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 14 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 16, 32, 16>, strides = array<i32: 16, 256, 1>, use_next_bd = true, valid_bd = true}
amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 13 : ui32, col = 1 : ui32, offset = 32768 : ui32}
amdaie.npu.write_bd {bd_id = 14 : ui32, buffer_length = 8192 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = false, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 16, 32, 16>, strides = array<i32: 16, 256, 1>, use_next_bd = false, valid_bd = true}
amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 14 : ui32, col = 0 : ui32, offset = 0 : ui32}
%79 = amdaie.npu.push_to_queue async {bd_id = 2 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32}
amdaie.npu.write_bd {bd_id = 14 : ui32, buffer_length = 8192 : ui32, buffer_offset = 0 : ui32, col = 1 : ui32, enable_packet = false, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 16, 32, 16>, strides = array<i32: 16, 256, 1>, use_next_bd = false, valid_bd = true}
amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 14 : ui32, col = 1 : ui32, offset = 32768 : ui32}
%80 = amdaie.npu.push_to_queue async {bd_id = 2 : ui32, channel = 0 : ui32, col = 1 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32}
amdaie.npu.dma_wait(%79 : !amdaie.async_token)
amdaie.npu.dma_wait(%80 : !amdaie.async_token)

The numbers for push_to_queue and dma_wait operations are reduced.

Results:

Test (MxKxN)	Instruction Size Before (Words)	Instruction Size After (Words)	Kernel Time Before (ms)	Kernel Time After (ms)
512x512x4096	34956	25516	17.79	17.77
512x512x4096 uKernel	34956	25516	13.03	9.58

jtuyls

Thanks @Yu-Zhewen ! It looks like there is some potential here to reduce the latency, but I think it would be good to compare the average on a large number of runs, see one of my comments below on how to do that.

build_tools/ci/cpu_comparison/performance_summarizer.py

jtuyls · 2024-11-26T10:09:49Z

compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignNpuDmaBdIds.cpp

@@ -58,6 +58,13 @@ LogicalResult assignNpuDmaBdIds(AMDAIE::WorkgroupOp workgroupOp) {
    return success();
  };

+  // TODO(jornt): Temporarily use channel 0 for all DMAs. This should


I think we can move this pass after AMDAIEAssignChannelsPass now by reordering a bunch of the passes in the pipeline. This can be done here or a follow=up PR though if you prefer.

I will have it in a separate PR

jtuyls · 2024-11-26T10:20:22Z

compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp

+  passManager.addPass(createAMDAIEControlCodeLoopUnrollPass());
  passManager.addPass(createCSEPass());
  passManager.addPass(createCanonicalizerPass());
+  passManager.addPass(createAMDAIEDmaCSEPass());

-  passManager.addPass(createAMDAIEControlCodeLoopUnrollPass());
+  passManager.addPass(createAMDAIEAssignNpuDmaBdIdsPass());


Re-arranging the order of these loops will significantly impact compilation time if there is a lot of control code. This might not be visible right now because we have spent quite a bit of effort already on optimizing control code and the matmuls being run in CI are not the largest, but we should consider this. I think you could avoid this by making the BD ID an semi-affine expression on the loop induction variable.

I would prefer a separate PR to investigate rearranging the order of these passes. Ideally, I want to have BD assignment, loop unrolling and BD chaining close to each other.

I would prefer a separate PR to investigate rearranging the order of these passes.

My comment was on that you are already re-arranging here and therefore this does have an impact. As mentioned, we should try to have all controlcode transformations operating on non-unrolled IR. Could you try adjust the AssignNpuDmaBdIds pass to insert bd_id that are a semi-affine expression on the loop induction value instead?

jtuyls · 2024-11-26T10:26:47Z

compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaUtils.cpp

@@ -396,4 +396,37 @@ LogicalResult moveNpuDmaSyncUsersAfterAncestorInSameBlock(
  return success();
 }

+LogicalResult moveNpuSourceDmaSyncAfterTargetDmaCpy(RewriterBase &rewriter,


Could you provide the rationale for why this transformation is needed with an example?

added some comments in the file, but a bit more explaination here

%0 = dma_cpy_nd (arg_idx = 0) dma_wait %0 %1 = dma_cpy_nd (arg_idx = 1) dma_wait %1 %2 = dma_cpy_nd (arg_idx = 0) dma_wait %2 %3 = dma_cpy_nd (arg_idx = 1) dma_wait %3

Somehow during the DMA chaining, I wish to keep dma_wait %2. Since everything is unrolled, it is hard to tell if dma_wait %1 and dma_wait %2 have any data dependency. Therefore, the safest option is to stop %3 = dma_cpy_nd (arg_idx = 1) being chained with %1 = dma_cpy_nd (arg_idx = 1) so that dma_wait %1 can be kept for synchroinzation. After applying this transformation, we get

%0 = dma_cpy_nd (arg_idx = 0) %1 = dma_cpy_nd (arg_idx = 1) dma_wait %0 dma_wait %1 %2 = dma_cpy_nd (arg_idx = 0) %3 = dma_cpy_nd (arg_idx = 1) dma_wait %2 dma_wait %3

where dma_wait is grouped together, and it is easy to tell if we wish to keep dma_wait %2, we don't have to keep dma_wait %1 at the same time, so more chaining can happen

I don't think you need to be concerned about the dma_wait ops while chaining, just about the resources (connections the dma ops are operating on). If two dma_cpy_nd are operating on the same connection with no other operation in between, they can be chained and any wait on a dma_cpy_nd that is not the last one can be removed.

I think the reason is when further lowering the code, all chained DMA operations will share one push_to_queue at the end of chain. This means their repeat_count, direction, channel, etc., must also be identical. Therefore, my current assumption is to build separate chains based on arg_idx.

direction and channel are part of the connection and repeat_count is part of the dma_cpy_nd, so there's no need to depend on arg_idx at all. In principal, you could build a chain operating on different args, there's no issue with that.

jtuyls · 2024-11-26T10:50:04Z

compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaBdChain.cpp

+
+namespace {
+
+LogicalResult dmaBdChain(AMDAIE::WorkgroupOp workgroupOp) {


Extending the IR ops at a higher level to accommodate a BD chain will likely lead to simpler and more extensible logic. I think NpuHalfDmaCpyNdOp might be a good op to add a BD chaining capabilities to (you would need to extract that partially from the ControlCodeLowering pass). This could accomplished with a next_bd_id operand, although some transformations might not be ideal for that, so at some point we need a more expressively explicit way to represent a BD chains, like NpuHalfDmaCpyNdOp referring to each other (the hardest issue is they can be cyclic).

Thanks, I am going to split ControlCodeLowering into two separate passes, one lowering DmaCpyNd to HalfDmaCpyNd, and the other further lowering HalfDmaCpyNd to WriteBd + AdressPatch + PushToQueue. I will add the DmaBdChain pass between

jtuyls · 2024-11-26T10:55:17Z

compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignNpuDmaBdIds.cpp

-        // refactoring.
-        std::optional<uint32_t> bdId = generator.getAndAssignBdId(0);
+        std::optional<uint32_t> bdId = generator.getAndAssignBdId(
+            channel, BdIdAssignmentMode::Incremental);


What's the reasoning behind using incremental mode?

Since dma_wait releases bd_id , previously we had:

%0 = dma_cpy_nd(bd_id=0) dma_wait(%0) %1 = dma_cpy_nd(bd_id=0) dma_wait(%1) ...

using incremental code, the goal is to have different ids which can be chained later

%0 = dma_cpy_nd(bd_id=0) dma_wait(%0) %1 = dma_cpy_nd(bd_id=1) dma_wait(%1) ...

During the AMDAIEAssignNpuDmaBdIds pass, I am currently not getting rid of any redundant dma_wait, which is handled later in AMDAIEDmaBdChain where the chains are actually built

During the AMDAIEAssignNpuDmaBdIds pass, I am currently not getting rid of any redundant dma_wait

Right, yeah, I see. Otherwise, 0 will always be the smallest available bd_id.

jtuyls · 2024-11-26T10:57:39Z

compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignNpuDmaBdIds.cpp

@@ -160,6 +159,34 @@ LogicalResult assignNpuDmaBdIds(AMDAIE::WorkgroupOp workgroupOp) {
            shimTileToGeneratorMap[tileOp.getResult()];
        uint32_t value = bdIdOp.getValue();
        generator.releaseBdId(value);
+        // TODO(zhewen): This is a temporary solution to make any BD ID wrap


What's the issue here? I would think you can just wrap and not care about the DMA wait ops?

I was only allowing the chaining if next_bd > current_bd, so I may skip some values (by wrapping earlier) to avoid a short chain. However, such a constraint as well as this piece of code has been removed, after refactoring the BD ID chaining pass.

jtuyls · 2024-11-26T10:58:38Z

compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignNpuDmaBdIds.cpp

+        //
+        // Not allowed (wrap occurs before NpuDmaWaitOps):
+        //     NpuDmaCpyNdOps (id=15),
+        //     NpuDmaCpyNdOps (id=0),  // Wrap happens here


Why would this happen? id == 0 should only be released after the wait ops, so how can it be assigned before?

This could happen since I am using an incremental code, and 'id==0' might have been released before. Anyway, this code is no longer neeeded

jtuyls · 2024-11-26T11:02:28Z

runtime/src/iree-amd-aie/aie_runtime/Utils/ChannelBdIdGenerator.h

+  // Resets the last used index for Incremental mode
+  void resetLastUsedBdId(uint32_t channel, uint32_t reservedNum) {
+    size_t maxBdId = channelToValidBdIds[channel].size() - 1;
+    if (lastUsedBdId + reservedNum > maxBdId) {
+      lastUsedBdId = std::numeric_limits<uint32_t>::max();
+    }
+  }


Why do we need this? I think the incremental mode is already circular, so when is a reset really needed?

removed, same reason as before

README.md

This reverts commit 2940ed5.

…ull1.log

jtuyls

I think it's helpful to separate this PR in 3/4different ones:

Add incremental mode to AssignNpuDmaBdIds
Separate DmaToHalfDma from ControlCodeLowering
Add the BD chaining pass
I don't understand yet why/whether moveNpuSourceDmaSyncAfterTargetDmaCpy is needed, but that might warrant its own pass.

jtuyls · 2024-11-29T12:48:31Z

compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td

+    If `use_next_bd` is `true`, the `next_bd` operand specifies the BD ID of 
+    the next DMA operation in the chain. Within a chain, the `start_bd` operand
+    identifies the BD ID of the first DMA operation in the sequence. 
+    When `use_next_bd` is `false`, the `start_bd` is set to the same value as `bd_id`.


It would be good to check that in the verifier.

jtuyls · 2024-11-29T12:50:06Z

compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td

@@ -620,7 +628,10 @@ def AMDAIE_NpuHalfDmaCpyNdOp
        DenseI64ArrayAttr:$static_sizes,
        DenseI64ArrayAttr:$static_strides,
        Optional<Index>:$bd_id,
-        Optional<Index>:$channel
+        Optional<Index>:$channel,
+        BoolAttr:$use_next_bd,


I think it might be useful to make this optional as well.

jtuyls · 2024-11-29T12:53:43Z

compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir

+// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [%[[C0]], 0] [%[[C0]], 64] [%[[C0]], 1] channel = %[[CHANNEL]] use_next_bd = false) : !amdaie.logicalobjectfifo<memref<2048xi32>>
+  amdaie.npu.half_dma_cpy_nd %0(%arg0[%c0, 0] [%c0, 64] [%c0, 1] channel = %channel use_next_bd = false) : !amdaie.logicalobjectfifo<memref<2048xi32>>
+// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [] [] [] bd_id = %[[BD_ID]] channel = %[[CHANNEL]] use_next_bd = false) : !amdaie.logicalobjectfifo<memref<2048xi32>>
+  amdaie.npu.half_dma_cpy_nd %0(%arg0[] [] [] bd_id = %bd_id channel = %channel use_next_bd = false) : !amdaie.logicalobjectfifo<memref<2048xi32>>


Add some new roundtrip checks with next_bd and start_bd as well.

jtuyls · 2024-11-29T12:57:07Z

compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp

+  passManager.addPass(createAMDAIEControlCodeLoopUnrollPass());
  passManager.addPass(createCSEPass());
  passManager.addPass(createCanonicalizerPass());
+  passManager.addPass(createAMDAIEDmaCSEPass());

-  passManager.addPass(createAMDAIEControlCodeLoopUnrollPass());
+  passManager.addPass(createAMDAIEAssignNpuDmaBdIdsPass());


I would prefer a separate PR to investigate rearranging the order of these passes.

My comment was on that you are already re-arranging here and therefore this does have an impact. As mentioned, we should try to have all controlcode transformations operating on non-unrolled IR. Could you try adjust the AssignNpuDmaBdIds pass to insert bd_id that are a semi-affine expression on the loop induction value instead?

jtuyls · 2024-11-29T12:58:46Z

compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h

@@ -121,8 +121,12 @@ std::unique_ptr<Pass> createAMDAIEControlCodeForallToForPass();
 /// Pass to unroll the loops within the control code regions.
 std::unique_ptr<Pass> createAMDAIEControlCodeLoopUnrollPass();

-/// Pass to convert control code DMA operations into NPU writes and syncs.
-std::unique_ptr<Pass> createAMDAIEControlCodeLoweringPass();
+/// Pass to convert control code DMA operations into HalfDmaCpyNd


Suggested change

/// Pass to convert control code DMA operations into HalfDmaCpyNd

/// Pass to convert control code DMA operations into HalfDmaCpyNd.

jtuyls · 2024-11-29T13:10:30Z

compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToNpu.cpp

+    bool useNextBd = op.getUseNextBd();
+    if (useNextBd)
+      // Erase if not end of chain.
+      rewriter.eraseOp(*npuPushToQueueOp);
+    else {


Suggested change

bool useNextBd = op.getUseNextBd();

if (useNextBd)

// Erase if not end of chain.

rewriter.eraseOp(*npuPushToQueueOp);

else {

bool useNextBd = op.getUseNextBd();

if (useNextBd) {

// Erase if not end of chain.

rewriter.eraseOp(*npuPushToQueueOp);

} else {

https://google.github.io/styleguide/cppguide.html#Formatting_Looping_Branching

jtuyls · 2024-11-29T13:23:36Z

compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaBdChain.cpp

+
+namespace {
+
+LogicalResult dmaBdChain(AMDAIE::AMDAIEDeviceModel deviceModel,


A couple of high-level comments before reviewing in depth:

The chaining should be done based on connections instead of arg_idx.

I am ok doing this as a follow-up, but at some point this transformation should really operate on non-unrolled IR. That likely needs quite a bit more thought and work, so I think this is a good POC, but add it as a TODO somewhere in the file.

jtuyls · 2024-11-29T13:24:17Z

compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaBdChain.cpp

+  // Only keep DMA Wait Ops if at the end of a chain, erase others
+  res = controlCodeOp->walk([&](Operation *op) {
+    if (auto npuDmaWaitOp = dyn_cast<AMDAIE::NpuDmaWaitOp>(op)) {
+      bool toErase = true;
+      for (Value token : npuDmaWaitOp.getAsyncTokens()) {
+        auto npuHalfDmaCpyNdOp = dyn_cast_if_present<AMDAIE::NpuHalfDmaCpyNdOp>(
+            token.getDefiningOp());
+        bool chaining = npuHalfDmaCpyNdOp && npuHalfDmaCpyNdOp.getUseNextBd();
+        if (!chaining) {
+          toErase = false;
+          break;
+        }
+      }
+      if (toErase) {
+        rewriter.eraseOp(npuDmaWaitOp);
+      }
+    }
+    return WalkResult::advance();


Separate into a standalone function.

jtuyls · 2024-11-29T13:25:12Z

compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td

-    Pass<"iree-amdaie-controlcode-lowering", ""> {
+def AMDAIEControlCodeToHalfDmaCpyNd :
+    Pass<"iree-amdaie-controlcode-to-half-dma-cpy-nd", ""> {
+  let summary = "Lower control code ops to HalfDmaCpyNd operations";


Suggested change

let summary = "Lower control code ops to HalfDmaCpyNd operations";

let summary = "Lower DmaCpNd ops to HalfDmaCpyNd ops";

jtuyls · 2024-11-29T13:27:45Z

runtime/src/iree-amd-aie/aie_runtime/Utils/ChannelBdIdGenerator.cpp

@@ -9,19 +9,39 @@
 namespace mlir::iree_compiler::AMDAIE {

 std::optional<uint32_t> ChannelBdIdGenerator::getAndAssignBdId(
-    uint32_t channel) {
+    uint32_t channel, BdIdAssignmentMode mode) {


Add new tests for the Incremental mode in ChannelBdIdGeneratorTest.cpp

Yu-Zhewen added 5 commits November 20, 2024 11:26

fix typos

5b9b1d4

add AMDAIEDmaBdChain pass

fc2f42f

fix test

33aa97b

merge main

3d850db

support multi columns

2c857a6

Yu-Zhewen requested review from makslevental, MaheshRavishankar, nirvedhmeshram, yzhang93, Abhishek-Varma and jtuyls as code owners November 26, 2024 00:23

update performance_summarizer script

2940ed5

jtuyls requested changes Nov 26, 2024

View reviewed changes

jtuyls reviewed Nov 26, 2024

View reviewed changes

README.md Outdated Show resolved Hide resolved

Yu-Zhewen mentioned this pull request Nov 26, 2024

[CI] Additionally print mean time of runs in performance_summarizer #935

Closed

Yu-Zhewen added 8 commits November 27, 2024 12:51

split controlcode_lowering into two passes

bb56113

fix CMakeLists.txt

30c65f7

add next_bd operand to HalfDmaCpyNd

c527a09

refactor

49a0a5b

Revert "update performance_summarizer script"

a8c52b5

This reverts commit 2940ed5.

Merge branch 'main' into zhewen_bdchain

770fd34

cleanup

3104c84

Delete compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/f…

358a3d7

…ull1.log

Yu-Zhewen requested a review from jtuyls November 29, 2024 11:22

jtuyls reviewed Nov 29, 2024

View reviewed changes

Yu-Zhewen mentioned this pull request Nov 29, 2024

[AMDAIEControlCodeLowering] Separate DmaCpyNdToHalfDmaCpyNd to a new pass #945

Open

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Add a pass to chain DMA BDs #931

Add a pass to chain DMA BDs #931

Yu-Zhewen commented Nov 26, 2024

jtuyls left a comment

jtuyls Nov 26, 2024

Yu-Zhewen Nov 26, 2024

jtuyls Nov 26, 2024

Yu-Zhewen Nov 29, 2024

jtuyls Nov 29, 2024

jtuyls Nov 26, 2024

Yu-Zhewen Nov 29, 2024 •

edited

Loading

jtuyls Nov 29, 2024

Yu-Zhewen Nov 29, 2024

jtuyls Nov 29, 2024

jtuyls Nov 26, 2024

Yu-Zhewen Nov 26, 2024

Yu-Zhewen Nov 29, 2024

jtuyls Nov 26, 2024

Yu-Zhewen Nov 26, 2024 •

edited

Loading

jtuyls Nov 26, 2024

jtuyls Nov 26, 2024

Yu-Zhewen Nov 29, 2024

jtuyls Nov 26, 2024

Yu-Zhewen Nov 29, 2024

jtuyls Nov 26, 2024

Yu-Zhewen Nov 29, 2024

jtuyls left a comment

jtuyls Nov 29, 2024

jtuyls Nov 29, 2024

jtuyls Nov 29, 2024

jtuyls Nov 29, 2024

jtuyls Nov 29, 2024

jtuyls Nov 29, 2024

jtuyls Nov 29, 2024

jtuyls Nov 29, 2024

jtuyls Nov 29, 2024

jtuyls Nov 29, 2024


		namespace {

		LogicalResult dmaBdChain(AMDAIE::WorkgroupOp workgroupOp) {

	/// Pass to convert control code DMA operations into HalfDmaCpyNd
	/// Pass to convert control code DMA operations into HalfDmaCpyNd.


		namespace {

		LogicalResult dmaBdChain(AMDAIE::AMDAIEDeviceModel deviceModel,

	let summary = "Lower control code ops to HalfDmaCpyNd operations";
	let summary = "Lower DmaCpNd ops to HalfDmaCpyNd ops";

Add a pass to chain DMA BDs #931

Are you sure you want to change the base?

Add a pass to chain DMA BDs #931

Conversation

Yu-Zhewen commented Nov 26, 2024

jtuyls left a comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Yu-Zhewen Nov 29, 2024 • edited Loading

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Yu-Zhewen Nov 26, 2024 • edited Loading

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

jtuyls left a comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Yu-Zhewen Nov 29, 2024 •

edited

Loading

Yu-Zhewen Nov 26, 2024 •

edited

Loading