
Add a pass to chain DMA BDs #931 (Open, 14 commits into main)
Conversation

Yu-Zhewen (Contributor):

Control code before:

amdaie.npu.write_bd {bd_id = 12 : ui32, buffer_length = 8192 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = false, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 16, 32, 16>, strides = array<i32: 16, 256, 1>, use_next_bd = false, valid_bd = true}
amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 12 : ui32, col = 0 : ui32, offset = 0 : ui32}
%99 = amdaie.npu.push_to_queue async {bd_id = 12 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32}
amdaie.npu.write_bd {bd_id = 12 : ui32, buffer_length = 8192 : ui32, buffer_offset = 0 : ui32, col = 1 : ui32, enable_packet = false, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 16, 32, 16>, strides = array<i32: 16, 256, 1>, use_next_bd = false, valid_bd = true}
amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 12 : ui32, col = 1 : ui32, offset = 32768 : ui32}
%100 = amdaie.npu.push_to_queue async {bd_id = 12 : ui32, channel = 0 : ui32, col = 1 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32}
amdaie.npu.dma_wait(%99 : !amdaie.async_token)
amdaie.npu.dma_wait(%100 : !amdaie.async_token)
amdaie.npu.write_bd {bd_id = 13 : ui32, buffer_length = 8192 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = false, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 16, 32, 16>, strides = array<i32: 16, 256, 1>, use_next_bd = false, valid_bd = true}
amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 13 : ui32, col = 0 : ui32, offset = 0 : ui32}
%101 = amdaie.npu.push_to_queue async {bd_id = 13 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32}
amdaie.npu.write_bd {bd_id = 13 : ui32, buffer_length = 8192 : ui32, buffer_offset = 0 : ui32, col = 1 : ui32, enable_packet = false, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 16, 32, 16>, strides = array<i32: 16, 256, 1>, use_next_bd = false, valid_bd = true}
amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 13 : ui32, col = 1 : ui32, offset = 32768 : ui32}
%102 = amdaie.npu.push_to_queue async {bd_id = 13 : ui32, channel = 0 : ui32, col = 1 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32}
amdaie.npu.dma_wait(%101 : !amdaie.async_token)
amdaie.npu.dma_wait(%102 : !amdaie.async_token)
amdaie.npu.write_bd {bd_id = 14 : ui32, buffer_length = 8192 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = false, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 16, 32, 16>, strides = array<i32: 16, 256, 1>, use_next_bd = false, valid_bd = true}
amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 14 : ui32, col = 0 : ui32, offset = 0 : ui32}
%103 = amdaie.npu.push_to_queue async {bd_id = 14 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32}
amdaie.npu.write_bd {bd_id = 14 : ui32, buffer_length = 8192 : ui32, buffer_offset = 0 : ui32, col = 1 : ui32, enable_packet = false, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 16, 32, 16>, strides = array<i32: 16, 256, 1>, use_next_bd = false, valid_bd = true}
amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 14 : ui32, col = 1 : ui32, offset = 32768 : ui32}
%104 = amdaie.npu.push_to_queue async {bd_id = 14 : ui32, channel = 0 : ui32, col = 1 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32}
amdaie.npu.dma_wait(%103 : !amdaie.async_token)
amdaie.npu.dma_wait(%104 : !amdaie.async_token)

Control code after:

amdaie.npu.write_bd {bd_id = 12 : ui32, buffer_length = 8192 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = false, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 13 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 16, 32, 16>, strides = array<i32: 16, 256, 1>, use_next_bd = true, valid_bd = true}
amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 12 : ui32, col = 0 : ui32, offset = 0 : ui32}
amdaie.npu.write_bd {bd_id = 12 : ui32, buffer_length = 8192 : ui32, buffer_offset = 0 : ui32, col = 1 : ui32, enable_packet = false, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 13 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 16, 32, 16>, strides = array<i32: 16, 256, 1>, use_next_bd = true, valid_bd = true}
amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 12 : ui32, col = 1 : ui32, offset = 32768 : ui32}
amdaie.npu.write_bd {bd_id = 13 : ui32, buffer_length = 8192 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = false, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 14 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 16, 32, 16>, strides = array<i32: 16, 256, 1>, use_next_bd = true, valid_bd = true}
amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 13 : ui32, col = 0 : ui32, offset = 0 : ui32}
amdaie.npu.write_bd {bd_id = 13 : ui32, buffer_length = 8192 : ui32, buffer_offset = 0 : ui32, col = 1 : ui32, enable_packet = false, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 14 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 16, 32, 16>, strides = array<i32: 16, 256, 1>, use_next_bd = true, valid_bd = true}
amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 13 : ui32, col = 1 : ui32, offset = 32768 : ui32}
amdaie.npu.write_bd {bd_id = 14 : ui32, buffer_length = 8192 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = false, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 16, 32, 16>, strides = array<i32: 16, 256, 1>, use_next_bd = false, valid_bd = true}
amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 14 : ui32, col = 0 : ui32, offset = 0 : ui32}
%79 = amdaie.npu.push_to_queue async {bd_id = 2 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32}
amdaie.npu.write_bd {bd_id = 14 : ui32, buffer_length = 8192 : ui32, buffer_offset = 0 : ui32, col = 1 : ui32, enable_packet = false, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array<i32>, paddings_before = array<i32>, row = 0 : ui32, sizes = array<i32: 16, 32, 16>, strides = array<i32: 16, 256, 1>, use_next_bd = false, valid_bd = true}
amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 14 : ui32, col = 1 : ui32, offset = 32768 : ui32}
%80 = amdaie.npu.push_to_queue async {bd_id = 2 : ui32, channel = 0 : ui32, col = 1 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32}
amdaie.npu.dma_wait(%79 : !amdaie.async_token)
amdaie.npu.dma_wait(%80 : !amdaie.async_token)

Chained BDs set use_next_bd = true and point next_bd at the following BD, so each chain only needs a single push_to_queue and dma_wait at its end. The number of push_to_queue and dma_wait operations is therefore reduced.

Results:

| Test (MxKxN) | Instruction Size Before (Words) | Instruction Size After (Words) | Kernel Time Before (ms) | Kernel Time After (ms) |
| --- | --- | --- | --- | --- |
| 512x512x4096 | 34956 | 25516 | 17.79 | 17.77 |
| 512x512x4096 uKernel | 34956 | 25516 | 13.03 | 9.58 |

jtuyls (Collaborator) left a comment:


Thanks @Yu-Zhewen! It looks like there is some potential here to reduce the latency, but I think it would be good to compare the average over a large number of runs; see one of my comments below on how to do that.

build_tools/ci/cpu_comparison/performance_summarizer.py (two outdated comment threads, resolved)
@@ -58,6 +58,13 @@ LogicalResult assignNpuDmaBdIds(AMDAIE::WorkgroupOp workgroupOp) {
return success();
};

// TODO(jornt): Temporarily use channel 0 for all DMAs. This should
jtuyls (Collaborator):

I think we can move this pass after AMDAIEAssignChannelsPass now by reordering a bunch of the passes in the pipeline. This can be done here or in a follow-up PR if you prefer.

Yu-Zhewen (Contributor, author):

I will do that in a separate PR.

Comment on lines +643 to +648
passManager.addPass(createAMDAIEControlCodeLoopUnrollPass());
passManager.addPass(createCSEPass());
passManager.addPass(createCanonicalizerPass());
passManager.addPass(createAMDAIEDmaCSEPass());

passManager.addPass(createAMDAIEControlCodeLoopUnrollPass());
passManager.addPass(createAMDAIEAssignNpuDmaBdIdsPass());
jtuyls (Collaborator):

Re-arranging the order of these passes will significantly impact compilation time if there is a lot of control code. This might not be visible right now, because we have already spent quite a bit of effort on optimizing control code and the matmuls being run in CI are not the largest, but we should consider it. I think you could avoid this by making the BD ID a semi-affine expression of the loop induction variable.

Yu-Zhewen (Contributor, author):

I would prefer a separate PR to investigate rearranging the order of these passes. Ideally, I want to keep BD assignment, loop unrolling, and BD chaining close to each other.

jtuyls (Collaborator):

> I would prefer a separate PR to investigate rearranging the order of these passes.

My comment was that you are already re-arranging the passes here, so this does have an impact. As mentioned, we should try to have all control code transformations operate on non-unrolled IR. Could you try adjusting the AssignNpuDmaBdIds pass to insert bd_ids that are a semi-affine expression of the loop induction variable instead?
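For illustration, a rough sketch of what that could look like inside the AssignNpuDmaBdIds pass, assuming the pass has access to the loop induction variable; the helper name and surrounding plumbing are hypothetical, not part of this PR:

#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/IR/Builders.h"

// Hypothetical helper: instead of materializing a distinct constant BD ID per
// unrolled iteration, emit bdId = startBdId + (iv mod numBdIds), so the
// assignment also works on non-unrolled control code.
static mlir::Value buildSemiAffineBdId(mlir::OpBuilder &b, mlir::Location loc,
                                       mlir::Value iv, int64_t startBdId,
                                       int64_t numBdIds) {
  mlir::Value size = b.create<mlir::arith::ConstantIndexOp>(loc, numBdIds);
  mlir::Value start = b.create<mlir::arith::ConstantIndexOp>(loc, startBdId);
  // Wrap the induction variable into the range of available BD IDs.
  mlir::Value wrapped = b.create<mlir::arith::RemUIOp>(loc, iv, size);
  return b.create<mlir::arith::AddIOp>(loc, wrapped, start);
}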

@@ -396,4 +396,37 @@ LogicalResult moveNpuDmaSyncUsersAfterAncestorInSameBlock(
return success();
}

LogicalResult moveNpuSourceDmaSyncAfterTargetDmaCpy(RewriterBase &rewriter,
jtuyls (Collaborator):

Could you provide the rationale for why this transformation is needed with an example?

Yu-Zhewen (Contributor, author), Nov 29, 2024:

I added some comments in the file, but here is a bit more explanation.

%0 = dma_cpy_nd (arg_idx = 0)
dma_wait %0 
%1 = dma_cpy_nd (arg_idx = 1)
dma_wait %1 
%2 = dma_cpy_nd (arg_idx = 0)
dma_wait %2 
%3 = dma_cpy_nd (arg_idx = 1)
dma_wait %3 

Suppose that during the DMA chaining I wish to keep dma_wait %2. Since everything is unrolled, it is hard to tell whether dma_wait %1 and dma_wait %2 have any data dependency. Therefore, the safest option is to prevent %3 = dma_cpy_nd (arg_idx = 1) from being chained with %1 = dma_cpy_nd (arg_idx = 1), so that dma_wait %1 can be kept for synchronization. After applying this transformation, we get:

%0 = dma_cpy_nd (arg_idx = 0)
%1 = dma_cpy_nd (arg_idx = 1)
dma_wait %0 
dma_wait %1 
%2 = dma_cpy_nd (arg_idx = 0)
%3 = dma_cpy_nd (arg_idx = 1)
dma_wait %2 
dma_wait %3 

where the dma_wait ops are grouped together. Now it is easy to see that if we wish to keep dma_wait %2, we do not also have to keep dma_wait %1, so more chaining can happen.

jtuyls (Collaborator):

I don't think you need to be concerned about the dma_wait ops while chaining, just about the resources (the connections the DMA ops are operating on). If two dma_cpy_nd ops are operating on the same connection with no other operation in between, they can be chained, and any wait on a dma_cpy_nd that is not the last one can be removed.

Yu-Zhewen (Contributor, author):

I think the reason is that, when further lowering the code, all chained DMA operations will share one push_to_queue at the end of the chain. This means their repeat_count, direction, channel, etc. must also be identical. Therefore, my current assumption is to build separate chains based on arg_idx.

jtuyls (Collaborator):

direction and channel are part of the connection, and repeat_count is part of the dma_cpy_nd, so there's no need to depend on arg_idx at all. In principle, you could build a chain operating on different args; there's no issue with that.


namespace {

LogicalResult dmaBdChain(AMDAIE::WorkgroupOp workgroupOp) {
jtuyls (Collaborator):

Extending the IR ops at a higher level to accommodate a BD chain will likely lead to simpler and more extensible logic. I think NpuHalfDmaCpyNdOp might be a good op to add BD chaining capabilities to (you would need to extract it partially from the ControlCodeLowering pass). This could be accomplished with a next_bd_id operand, although some transformations might not be ideal for that, so at some point we need a more explicit and expressive way to represent BD chains, like NpuHalfDmaCpyNdOp ops referring to each other (the hardest issue is that they can be cyclic).

Yu-Zhewen (Contributor, author):

Thanks, I am going to split ControlCodeLowering into two separate passes, one lowering DmaCpyNd to HalfDmaCpyNd, and the other further lowering HalfDmaCpyNd to WriteBd + AddressPatch + PushToQueue. I will add the DmaBdChain pass in between.

Yu-Zhewen (Contributor, author):

done

// refactoring.
std::optional<uint32_t> bdId = generator.getAndAssignBdId(0);
std::optional<uint32_t> bdId = generator.getAndAssignBdId(
channel, BdIdAssignmentMode::Incremental);
jtuyls (Collaborator):

What's the reasoning behind using incremental mode?

Yu-Zhewen (Contributor, author), Nov 26, 2024:

Since dma_wait releases the bd_id, previously we had:

%0 = dma_cpy_nd(bd_id=0)
dma_wait(%0)
%1 = dma_cpy_nd(bd_id=0)
dma_wait(%1)
...

Using incremental mode, the goal is to have different IDs, which can be chained later:

%0 = dma_cpy_nd(bd_id=0)
dma_wait(%0)
%1 = dma_cpy_nd(bd_id=1)
dma_wait(%1)
...

During the AMDAIEAssignNpuDmaBdIds pass, I am currently not getting rid of any redundant dma_wait ops; that is handled later in AMDAIEDmaBdChain, where the chains are actually built.

jtuyls (Collaborator):

> During the AMDAIEAssignNpuDmaBdIds pass, I am currently not getting rid of any redundant dma_wait ops

Right, yeah, I see. Otherwise, 0 will always be the smallest available bd_id.

@@ -160,6 +159,34 @@ LogicalResult assignNpuDmaBdIds(AMDAIE::WorkgroupOp workgroupOp) {
shimTileToGeneratorMap[tileOp.getResult()];
uint32_t value = bdIdOp.getValue();
generator.releaseBdId(value);
// TODO(zhewen): This is a temporary solution to make any BD ID wrap
jtuyls (Collaborator):

What's the issue here? I would think you can just wrap and not care about the DMA wait ops?

Yu-Zhewen (Contributor, author):

I was only allowing the chaining if next_bd > current_bd, so I might skip some values (by wrapping earlier) to avoid a short chain. However, that constraint, as well as this piece of code, has been removed after refactoring the BD chaining pass.

//
// Not allowed (wrap occurs before NpuDmaWaitOps):
// NpuDmaCpyNdOps (id=15),
// NpuDmaCpyNdOps (id=0), // Wrap happens here
jtuyls (Collaborator):

Why would this happen? id == 0 should only be released after the wait ops, so how can it be assigned before?

Yu-Zhewen (Contributor, author):

This could happen since I am using incremental mode, and id == 0 might have been released before. Anyway, this code is no longer needed.

Comment on lines 53 to 59
// Resets the last used index for Incremental mode
void resetLastUsedBdId(uint32_t channel, uint32_t reservedNum) {
size_t maxBdId = channelToValidBdIds[channel].size() - 1;
if (lastUsedBdId + reservedNum > maxBdId) {
lastUsedBdId = std::numeric_limits<uint32_t>::max();
}
}
jtuyls (Collaborator):

Why do we need this? I think the incremental mode is already circular, so when is a reset really needed?

Yu-Zhewen (Contributor, author):

removed, same reason as before

README.md (outdated comment thread, resolved)
jtuyls (Collaborator) left a comment:

I think it's helpful to separate this PR into 3 or 4 different ones:

  • Add incremental mode to AssignNpuDmaBdIds
  • Separate DmaToHalfDma from ControlCodeLowering
  • Add the BD chaining pass
  • I don't understand yet why/whether moveNpuSourceDmaSyncAfterTargetDmaCpy is needed, but that might warrant its own pass.

If `use_next_bd` is `true`, the `next_bd` operand specifies the BD ID of
the next DMA operation in the chain. Within a chain, the `start_bd` operand
identifies the BD ID of the first DMA operation in the sequence.
When `use_next_bd` is `false`, the `start_bd` is set to the same value as `bd_id`.
jtuyls (Collaborator):

It would be good to check that in the verifier.
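For illustration, a minimal sketch of such a check, assuming `hasVerifier = 1` on the op and the usual ODS-generated accessors for `use_next_bd`, `next_bd`, `start_bd`, and `bd_id`; the accessor names and diagnostics are assumptions, not from this PR:

// Hypothetical verifier sketch for NpuHalfDmaCpyNdOp.
LogicalResult NpuHalfDmaCpyNdOp::verify() {
  if (getUseNextBd()) {
    if (!getNextBd())
      return emitOpError("expects `next_bd` when `use_next_bd` is true");
  } else {
    if (getNextBd())
      return emitOpError("expects no `next_bd` when `use_next_bd` is false");
    // Without chaining, `start_bd` should be the same value as `bd_id`.
    if (getStartBd() != getBdId())
      return emitOpError(
          "expects `start_bd` to equal `bd_id` when `use_next_bd` is false");
  }
  return success();
}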

@@ -620,7 +628,10 @@ def AMDAIE_NpuHalfDmaCpyNdOp
DenseI64ArrayAttr:$static_sizes,
DenseI64ArrayAttr:$static_strides,
Optional<Index>:$bd_id,
Optional<Index>:$channel
Optional<Index>:$channel,
BoolAttr:$use_next_bd,
jtuyls (Collaborator):

I think it might be useful to make this optional as well.

// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [%[[C0]], 0] [%[[C0]], 64] [%[[C0]], 1] channel = %[[CHANNEL]] use_next_bd = false) : !amdaie.logicalobjectfifo<memref<2048xi32>>
amdaie.npu.half_dma_cpy_nd %0(%arg0[%c0, 0] [%c0, 64] [%c0, 1] channel = %channel use_next_bd = false) : !amdaie.logicalobjectfifo<memref<2048xi32>>
// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [] [] [] bd_id = %[[BD_ID]] channel = %[[CHANNEL]] use_next_bd = false) : !amdaie.logicalobjectfifo<memref<2048xi32>>
amdaie.npu.half_dma_cpy_nd %0(%arg0[] [] [] bd_id = %bd_id channel = %channel use_next_bd = false) : !amdaie.logicalobjectfifo<memref<2048xi32>>
jtuyls (Collaborator):

Add some new roundtrip checks with next_bd and start_bd as well.


@@ -121,8 +121,12 @@ std::unique_ptr<Pass> createAMDAIEControlCodeForallToForPass();
/// Pass to unroll the loops within the control code regions.
std::unique_ptr<Pass> createAMDAIEControlCodeLoopUnrollPass();

/// Pass to convert control code DMA operations into NPU writes and syncs.
std::unique_ptr<Pass> createAMDAIEControlCodeLoweringPass();
/// Pass to convert control code DMA operations into HalfDmaCpyNd
jtuyls (Collaborator):

Suggested change
/// Pass to convert control code DMA operations into HalfDmaCpyNd
/// Pass to convert control code DMA operations into HalfDmaCpyNd.

Comment on lines +219 to +223
bool useNextBd = op.getUseNextBd();
if (useNextBd)
// Erase if not end of chain.
rewriter.eraseOp(*npuPushToQueueOp);
else {
jtuyls (Collaborator):

Suggested change
bool useNextBd = op.getUseNextBd();
if (useNextBd)
// Erase if not end of chain.
rewriter.eraseOp(*npuPushToQueueOp);
else {
bool useNextBd = op.getUseNextBd();
if (useNextBd) {
// Erase if not end of chain.
rewriter.eraseOp(*npuPushToQueueOp);
} else {

https://google.github.io/styleguide/cppguide.html#Formatting_Looping_Branching


namespace {

LogicalResult dmaBdChain(AMDAIE::AMDAIEDeviceModel deviceModel,
jtuyls (Collaborator):

A couple of high-level comments before reviewing in depth:

  • The chaining should be done based on connections instead of arg_idx.
  • I am ok doing this as a follow-up, but at some point this transformation should really operate on non-unrolled IR. That likely needs quite a bit more thought and work, so I think this is a good POC, but add it as a TODO somewhere in the file.

Comment on lines +272 to +289
// Only keep DMA Wait Ops if at the end of a chain, erase others
res = controlCodeOp->walk([&](Operation *op) {
if (auto npuDmaWaitOp = dyn_cast<AMDAIE::NpuDmaWaitOp>(op)) {
bool toErase = true;
for (Value token : npuDmaWaitOp.getAsyncTokens()) {
auto npuHalfDmaCpyNdOp = dyn_cast_if_present<AMDAIE::NpuHalfDmaCpyNdOp>(
token.getDefiningOp());
bool chaining = npuHalfDmaCpyNdOp && npuHalfDmaCpyNdOp.getUseNextBd();
if (!chaining) {
toErase = false;
break;
}
}
if (toErase) {
rewriter.eraseOp(npuDmaWaitOp);
}
}
return WalkResult::advance();
jtuyls (Collaborator):

Separate into a standalone function.
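For example, the walk above could be pulled into a helper along these lines (a sketch; the function name and the ControlCodeOp parameter type are assumptions based on the surrounding code):

// Sketch: erase every DMA wait op whose async tokens are all produced by
// chained (use_next_bd = true) half-DMA ops, i.e. waits that are not at the
// end of a chain.
static void eraseNonEndOfChainDmaWaits(RewriterBase &rewriter,
                                       AMDAIE::ControlCodeOp controlCodeOp) {
  controlCodeOp->walk([&](AMDAIE::NpuDmaWaitOp npuDmaWaitOp) {
    bool toErase = true;
    for (Value token : npuDmaWaitOp.getAsyncTokens()) {
      auto npuHalfDmaCpyNdOp = dyn_cast_if_present<AMDAIE::NpuHalfDmaCpyNdOp>(
          token.getDefiningOp());
      // Keep the wait if any producer is not chained to a following BD.
      bool chaining = npuHalfDmaCpyNdOp && npuHalfDmaCpyNdOp.getUseNextBd();
      if (!chaining) {
        toErase = false;
        break;
      }
    }
    if (toErase) rewriter.eraseOp(npuDmaWaitOp);
  });
}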

Pass<"iree-amdaie-controlcode-lowering", ""> {
def AMDAIEControlCodeToHalfDmaCpyNd :
Pass<"iree-amdaie-controlcode-to-half-dma-cpy-nd", ""> {
let summary = "Lower control code ops to HalfDmaCpyNd operations";
jtuyls (Collaborator):

Suggested change
let summary = "Lower control code ops to HalfDmaCpyNd operations";
let summary = "Lower DmaCpNd ops to HalfDmaCpyNd ops";

@@ -9,19 +9,39 @@
namespace mlir::iree_compiler::AMDAIE {

std::optional<uint32_t> ChannelBdIdGenerator::getAndAssignBdId(
uint32_t channel) {
uint32_t channel, BdIdAssignmentMode mode) {
jtuyls (Collaborator):

Add new tests for the Incremental mode in ChannelBdIdGeneratorTest.cpp
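For illustration, a minimal sketch of such a test, assuming the generator can be constructed from a channel-to-valid-BD-IDs map; the constructor, include path, and exact asserted behavior are assumptions based on this discussion, not confirmed by the PR:

#include <cstdint>
#include <optional>

#include "gtest/gtest.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
// Assumed include path for ChannelBdIdGenerator.
#include "iree-amd-aie/Transforms/AMDAIEDmaUtils.h"

using namespace mlir::iree_compiler::AMDAIE;

TEST(ChannelBdIdGeneratorTest, IncrementalDoesNotReuseReleasedId) {
  // Hypothetical setup: channel 0 has BD IDs 0..3 available.
  llvm::DenseMap<uint32_t, llvm::SmallVector<uint32_t>> channelToValidBdIds;
  channelToValidBdIds[0] = {0, 1, 2, 3};
  ChannelBdIdGenerator generator(channelToValidBdIds);

  std::optional<uint32_t> first =
      generator.getAndAssignBdId(0, BdIdAssignmentMode::Incremental);
  ASSERT_TRUE(first.has_value());
  generator.releaseBdId(first.value());

  // In incremental mode the next assignment should move on to a new ID
  // rather than immediately reusing the released one.
  std::optional<uint32_t> second =
      generator.getAndAssignBdId(0, BdIdAssignmentMode::Incremental);
  ASSERT_TRUE(second.has_value());
  EXPECT_NE(second.value(), first.value());
}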
