[compiler] support emit byre.alias in remove-copy (#425)
As the title says: teach the remove-copy pass to emit byre.alias when folding a copy whose source still carries a non-zero offset, enabled only for byre entry-point functions.

Co-authored-by: Chenhui Huang <huangchenhui.yellow@bytedance.com>
qingyunqu and YellowHCH authored Aug 14, 2024
1 parent 13ee48a commit 3b1ad12
Showing 10 changed files with 68 additions and 40 deletions.
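
In effect, when the function is a byre entry point, remove-copy now folds a memref.copy whose source view still carries a non-zero offset into a byre.alias of that source instead of giving up. A minimal before/after sketch, distilled from the new @byre_alias test case added below (remaining operands and attributes elided with "..."):

// before remove-copy: the offset-2000 view must be materialized through an alloc and a copy
%alloc_3 = memref.alloc() : memref<100x256xf32>
memref.copy %expand_shape_2, %alloc_3 : memref<100x256xf32, strided<[256, 1], offset: 2000>> to memref<100x256xf32>
byre.compute @MatmulOp_f32f32_f32(%expand_shape, %alloc_3, %alloc) ...

// after remove-copy with enableByreAlias = true: the alloc and the copy collapse into a single alias
%0 = "byre.alias"(%expand_shape_2) <{offset = 2000 : i64}> : (memref<100x256xf32, strided<[256, 1], offset: 2000>>) -> memref<100x256xf32>
byre.compute @MatmulOp_f32f32_f32(%expand_shape, %0, %alloc) ...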
@@ -30,7 +30,8 @@ class FuncOp;
 } // namespace func

 void populateRemoveCopyAfterBufferizationPattern(RewritePatternSet &patterns,
-                                                 DominanceInfo &domInfo);
+                                                 DominanceInfo &domInfo,
+                                                 bool enableByreAlias);

 std::unique_ptr<OperationPass<func::FuncOp>> createRemoveCopyPass();

34 changes: 27 additions & 7 deletions compiler/lib/Dialect/MemRef/Transforms/RemoveCopy.cpp
@@ -16,9 +16,11 @@
 //===----------------------------------------------------------------------===//

 #include "byteir/Dialect/MemRef/Transforms/RemoveCopy.h"
+#include "byteir/Dialect/Byre/ByreDialect.h"
 #include "byteir/Dialect/MemRef/Utils/MemEffect.h"
 #include "byteir/Utils/Hoist.h"
 #include "byteir/Utils/MemUtils.h"
+#include "byteir/Utils/Utils.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
@@ -144,8 +146,10 @@ int64_t extractOffset(MemRefType memref) {

 class RemoveCopyPattern : public OpRewritePattern<memref::CopyOp> {
 public:
-  RemoveCopyPattern(MLIRContext *context, DominanceInfo &dom)
-      : OpRewritePattern(context), domInfo(dom) {}
+  RemoveCopyPattern(MLIRContext *context, DominanceInfo &dom,
+                    bool enableByreAlias)
+      : OpRewritePattern(context), domInfo(dom),
+        enableByreAlias(enableByreAlias) {}

   LogicalResult matchAndRewrite(memref::CopyOp copyOp,
                                 PatternRewriter &rewriter) const override {
@@ -286,6 +290,8 @@ class RemoveCopyPattern : public OpRewritePattern<memref::CopyOp> {

     auto sourceMemref = src.getType().cast<MemRefType>();
     auto targetMemref = target.getType().cast<MemRefType>();
+    // The target comes from memref.alloc(), so its layout must be the identity.
+    assert(targetMemref.getLayout().isIdentity());
     int64_t srcMemrefOffset = 0;
     int64_t tgtMemrefOffset = 0;
     SmallVector<int64_t> srcStrides;
@@ -310,8 +316,13 @@ class RemoveCopyPattern : public OpRewritePattern<memref::CopyOp> {
           copyOp.getLoc(), targetMemref, src, tgtMemrefOffset,
           targetMemref.getShape(), tgtStrides);
     } else {
-      // TODO: use some op like memref.reinterpret_cast to handle offset
-      return failure();
+      if (this->enableByreAlias) {
+        // use byre.alias to decouple offset from memref type
+        srcCast = rewriter.create<byre::AliasOp>(
+            copyOp.getLoc(), targetMemref, src, srcMemrefOffset);
+      } else {
+        return failure();
+      }
     }
   } else {
     srcCast = rewriter.create<memref::CastOp>(copyOp.getLoc(),
@@ -392,6 +403,7 @@ class RemoveCopyPattern : public OpRewritePattern<memref::CopyOp> {

 private:
   DominanceInfo &domInfo;
+  bool enableByreAlias;
 };

 struct RemoveCopyPass : public RemoveCopyBase<RemoveCopyPass> {
@@ -400,10 +412,17 @@ struct RemoveCopyPass : public RemoveCopyBase<RemoveCopyPass> {
   void runOnOperation() override {

     func::FuncOp funcOp = getOperation();
+    bool isByreEntryFunc =
+        funcOp->hasAttrOfType<UnitAttr>(
+            byre::ByreDialect::getEntryPointFunctionAttrName()) ||
+        funcOp->hasAttrOfType<UnitAttr>(getAttrPlaceholderName(
+            byre::ByreDialect::getEntryPointFunctionAttrName()));
+
     auto &domInfo = getAnalysis<DominanceInfo>();
     auto &ctx = getContext();
     RewritePatternSet patterns(&ctx);
-    populateRemoveCopyAfterBufferizationPattern(patterns, domInfo);
+    populateRemoveCopyAfterBufferizationPattern(
+        patterns, domInfo, /*enableByreAlias=*/isByreEntryFunc);

     // also insert related canonicalizer
     memref::AllocOp::getCanonicalizationPatterns(patterns, &ctx);
@@ -429,8 +448,9 @@ struct RemoveCopyPass : public RemoveCopyBase<RemoveCopyPass> {
 } // namespace

 void mlir::populateRemoveCopyAfterBufferizationPattern(
-    RewritePatternSet &patterns, DominanceInfo &domInfo) {
-  patterns.add<RemoveCopyPattern>(patterns.getContext(), domInfo);
+    RewritePatternSet &patterns, DominanceInfo &domInfo, bool enableByreAlias) {
+  patterns.add<RemoveCopyPattern>(patterns.getContext(), domInfo,
+                                  enableByreAlias);
 }

 std::unique_ptr<OperationPass<func::FuncOp>> mlir::createRemoveCopyPass() {
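For downstream users, the extra boolean on populateRemoveCopyAfterBufferizationPattern is the only API change; the pass itself derives the value from the byre entry-point attribute. A minimal sketch of wiring the pass into a pipeline, assuming a hypothetical pipeline hook (only the byteir header and the two declared entry points come from this diff):

#include "byteir/Dialect/MemRef/Transforms/RemoveCopy.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Pass/PassManager.h"

// Illustrative pipeline hook: createRemoveCopyPass() decides per function
// whether to enable byre.alias emission by checking the byre entry-point
// attribute (or its placeholder form) on the func.func it runs on.
void addRemoveCopyAfterBufferization(mlir::OpPassManager &pm) {
  pm.addNestedPass<mlir::func::FuncOp>(mlir::createRemoveCopyPass());
}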
20 changes: 20 additions & 0 deletions compiler/test/Dialect/MemRef/removeCopy.mlir
@@ -667,3 +667,23 @@ func.func @stride_copy(%arg0: memref<32x64xf32>) -> (memref<1x16x1xf32>) attrib

 // CHECK-LABEL: func.func @stride_copy
 // CHECK-NOT: memref.copy
+
+// -----
+
+func.func @byre_alias(%arg0: memref<512x200xf32>, %arg1: memref<512x200xf32>) -> (memref<256x256xf32>) attributes {__placeholder__byre.entry_point} {
+  %subview = memref.subview %arg0[0, 0] [128, 200] [1, 1] : memref<512x200xf32> to memref<128x200xf32, strided<[200, 1]>>
+  %subview_0 = memref.subview %arg1[10, 0] [128, 200] [1, 1] : memref<512x200xf32> to memref<128x200xf32, strided<[200, 1], offset: 2000>>
+  %collapse_shape = memref.collapse_shape %subview [[0, 1]] : memref<128x200xf32, strided<[200, 1]>> into memref<25600xf32, strided<[1]>>
+  %expand_shape = memref.expand_shape %collapse_shape [[0, 1]] output_shape [256, 100] : memref<25600xf32, strided<[1]>> into memref<256x100xf32>
+  %collapse_shape_1 = memref.collapse_shape %subview_0 [[0, 1]] : memref<128x200xf32, strided<[200, 1], offset: 2000>> into memref<25600xf32, strided<[1], offset: 2000>>
+  %expand_shape_2 = memref.expand_shape %collapse_shape_1 [[0, 1]] output_shape [100, 256] : memref<25600xf32, strided<[1], offset: 2000>> into memref<100x256xf32, strided<[256, 1], offset: 2000>>
+  %alloc = memref.alloc() : memref<256x256xf32>
+  %alloc_3 = memref.alloc() : memref<100x256xf32>
+  memref.copy %expand_shape_2, %alloc_3 : memref<100x256xf32, strided<[256, 1], offset: 2000>> to memref<100x256xf32>
+  byre.compute @MatmulOp_f32f32_f32(%expand_shape, %alloc_3, %alloc) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<256x100xf32>, memref<100x256xf32>, memref<256x256xf32>
+  return %alloc : memref<256x256xf32>
+}
+
+// CHECK-LABEL: func.func @byre_alias
+// CHECK-NOT: memref.copy
+// CHECK: byre.alias
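
The attribute on the test function is what opts it into the new behavior: byre.alias is only emitted when the func.func carries the byre entry-point attribute, either directly or in its placeholder form, as checked in runOnOperation() above. A sketch of both accepted spellings, taken from this diff (arguments and bodies elided):

// entry point after byre conversion
func.func @byre_alias(...) attributes {byre.entry_point} { ... }
// placeholder form, as used by this test
func.func @byre_alias(...) attributes {__placeholder__byre.entry_point} { ... }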
@@ -32,10 +32,9 @@ module {
     %collapse_shape_1 = memref.collapse_shape %subview_0 [[0, 1]] : memref<128x200xf32, strided<[200, 1], offset: 2000>> into memref<25600xf32, strided<[1], offset: 2000>>
     %expand_shape_2 = memref.expand_shape %collapse_shape_1 [[0, 1]] output_shape [100, 256] : memref<25600xf32, strided<[1], offset: 2000>> into memref<100x256xf32, strided<[256, 1], offset: 2000>>
     %alloc = memref.alloc() : memref<256x256xf32>
-    %alloc_3 = memref.alloc() : memref<100x256xf32>
-    memref.copy %expand_shape_2, %alloc_3 : memref<100x256xf32, strided<[256, 1], offset: 2000>> to memref<100x256xf32>
-    byre.compute @MatmulOp_f32f32_f32(%expand_shape, %alloc_3, %alloc) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<256x100xf32>, memref<100x256xf32>, memref<256x256xf32>
-    %0 = call @Unknown0(%arg0, %arg1) : (memref<512x200xf32>, memref<512x200xf32>) -> memref<512x200xf32>
-    return %alloc, %0 : memref<256x256xf32>, memref<512x200xf32>
+    %0 = "byre.alias"(%expand_shape_2) <{offset = 2000 : i64}> : (memref<100x256xf32, strided<[256, 1], offset: 2000>>) -> memref<100x256xf32>
+    byre.compute @MatmulOp_f32f32_f32(%expand_shape, %0, %alloc) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<256x100xf32>, memref<100x256xf32>, memref<256x256xf32>
+    %1 = call @Unknown0(%arg0, %arg1) : (memref<512x200xf32>, memref<512x200xf32>) -> memref<512x200xf32>
+    return %alloc, %1 : memref<256x256xf32>, memref<512x200xf32>
   }
 }
9 changes: 4 additions & 5 deletions compiler/test/E2E/CUDA/AliasLikeGPU/6_gpu_opt.mlir
@@ -27,10 +27,9 @@ module {
     %collapse_shape_1 = memref.collapse_shape %subview_0 [[0, 1]] : memref<128x200xf32, strided<[200, 1], offset: 2000>> into memref<25600xf32, strided<[1], offset: 2000>>
     %expand_shape_2 = memref.expand_shape %collapse_shape_1 [[0, 1]] output_shape [100, 256] : memref<25600xf32, strided<[1], offset: 2000>> into memref<100x256xf32, strided<[256, 1], offset: 2000>>
     %alloc = memref.alloc() : memref<256x256xf32>
-    %alloc_3 = memref.alloc() : memref<100x256xf32>
-    memref.copy %expand_shape_2, %alloc_3 : memref<100x256xf32, strided<[256, 1], offset: 2000>> to memref<100x256xf32>
-    byre.compute @MatmulOp_f32f32_f32(%expand_shape, %alloc_3, %alloc) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<256x100xf32>, memref<100x256xf32>, memref<256x256xf32>
-    %0 = call @Unknown0(%arg0, %arg1) : (memref<512x200xf32>, memref<512x200xf32>) -> memref<512x200xf32>
-    return %alloc, %0 : memref<256x256xf32>, memref<512x200xf32>
+    %0 = "byre.alias"(%expand_shape_2) <{offset = 2000 : i64}> : (memref<100x256xf32, strided<[256, 1], offset: 2000>>) -> memref<100x256xf32>
+    byre.compute @MatmulOp_f32f32_f32(%expand_shape, %0, %alloc) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<256x100xf32>, memref<100x256xf32>, memref<256x256xf32>
+    %1 = call @Unknown0(%arg0, %arg1) : (memref<512x200xf32>, memref<512x200xf32>) -> memref<512x200xf32>
+    return %alloc, %1 : memref<256x256xf32>, memref<512x200xf32>
   }
 }
9 changes: 4 additions & 5 deletions compiler/test/E2E/CUDA/AliasLikeGPU/7_set_space_opt.mlir
@@ -41,10 +41,9 @@ module attributes {gpu.container_module} {
     %collapse_shape_1 = memref.collapse_shape %subview_0 [[0, 1]] : memref<128x200xf32, strided<[200, 1], offset: 2000>> into memref<25600xf32, strided<[1], offset: 2000>>
     %expand_shape_2 = memref.expand_shape %collapse_shape_1 [[0, 1]] output_shape [100, 256] : memref<25600xf32, strided<[1], offset: 2000>> into memref<100x256xf32, strided<[256, 1], offset: 2000>>
     %alloc = memref.alloc() : memref<256x256xf32>
-    %alloc_3 = memref.alloc() : memref<100x256xf32>
-    memref.copy %expand_shape_2, %alloc_3 : memref<100x256xf32, strided<[256, 1], offset: 2000>> to memref<100x256xf32>
-    byre.compute @MatmulOp_f32f32_f32(%expand_shape, %alloc_3, %alloc) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<256x100xf32>, memref<100x256xf32>, memref<256x256xf32>
-    %0 = call @Unknown0(%arg0, %arg1) : (memref<512x200xf32>, memref<512x200xf32>) -> memref<512x200xf32>
-    return %alloc, %0 : memref<256x256xf32>, memref<512x200xf32>
+    %0 = "byre.alias"(%expand_shape_2) <{offset = 2000 : i64}> : (memref<100x256xf32, strided<[256, 1], offset: 2000>>) -> memref<100x256xf32>
+    byre.compute @MatmulOp_f32f32_f32(%expand_shape, %0, %alloc) {lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<256x100xf32>, memref<100x256xf32>, memref<256x256xf32>
+    %1 = call @Unknown0(%arg0, %arg1) : (memref<512x200xf32>, memref<512x200xf32>) -> memref<512x200xf32>
+    return %alloc, %1 : memref<256x256xf32>, memref<512x200xf32>
   }
 }
9 changes: 4 additions & 5 deletions compiler/test/E2E/CUDA/AliasLikeGPU/8_byre_opt.mlir
@@ -34,10 +34,9 @@ module attributes {gpu.container_module} {
     %collapse_shape_1 = memref.collapse_shape %subview_0 [[0, 1]] : memref<128x200xf32, strided<[200, 1], offset: 2000>, "cuda"> into memref<25600xf32, strided<[1], offset: 2000>, "cuda">
     %expand_shape_2 = memref.expand_shape %collapse_shape_1 [[0, 1]] output_shape [100, 256] : memref<25600xf32, strided<[1], offset: 2000>, "cuda"> into memref<100x256xf32, strided<[256, 1], offset: 2000>, "cuda">
     %alloc = memref.alloc() : memref<256x256xf32, "cuda">
-    %alloc_3 = memref.alloc() : memref<100x256xf32, "cuda">
-    memref.copy %expand_shape_2, %alloc_3 : memref<100x256xf32, strided<[256, 1], offset: 2000>, "cuda"> to memref<100x256xf32, "cuda">
-    byre.compute @MatmulOp_f32f32_f32(%expand_shape, %alloc_3, %alloc) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<256x100xf32, "cuda">, memref<100x256xf32, "cuda">, memref<256x256xf32, "cuda">
-    %0 = call @Unknown0(%arg0, %arg1) : (memref<512x200xf32, "cuda">, memref<512x200xf32, "cuda">) -> memref<512x200xf32, "cuda">
-    return %alloc, %0 : memref<256x256xf32, "cuda">, memref<512x200xf32, "cuda">
+    %0 = "byre.alias"(%expand_shape_2) <{offset = 2000 : i64}> {device = "cuda"} : (memref<100x256xf32, strided<[256, 1], offset: 2000>, "cuda">) -> memref<100x256xf32, "cuda">
+    byre.compute @MatmulOp_f32f32_f32(%expand_shape, %0, %alloc) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<256x100xf32, "cuda">, memref<100x256xf32, "cuda">, memref<256x256xf32, "cuda">
+    %1 = call @Unknown0(%arg0, %arg1) : (memref<512x200xf32, "cuda">, memref<512x200xf32, "cuda">) -> memref<512x200xf32, "cuda">
+    return %alloc, %1 : memref<256x256xf32, "cuda">, memref<512x200xf32, "cuda">
   }
 }
5 changes: 1 addition & 4 deletions compiler/test/E2E/CUDA/AliasLikeGPU/9a_byre_host.mlir
@@ -26,11 +26,8 @@ module attributes {byre.container_module, gpu.container_module} {
     }
   }
   func.func @main(%arg0: memref<512x200xf32, "cuda"> {byre.argname = "Input0", byre.argtype = 1 : i32}, %arg1: memref<512x200xf32, "cuda"> {byre.argname = "Input1", byre.argtype = 1 : i32}, %arg2: memref<256x256xf32, "cuda"> {byre.argname = "Output0", byre.argtype = 2 : i32}, %arg3: memref<512x200xf32, "cuda"> {byre.argname = "Output1", byre.argtype = 2 : i32}) attributes {byre.entry_point} {
-    %alloc = memref.alloc() : memref<102400xi8, "cuda">
     %0 = "byre.alias"(%arg0) <{offset = 0 : i64}> : (memref<512x200xf32, "cuda">) -> memref<256x100xf32, "cuda">
-    %1 = "byre.alias"(%alloc) <{offset = 0 : i64}> : (memref<102400xi8, "cuda">) -> memref<100x256xf32, "cuda">
-    %2 = "byre.alias"(%arg1) <{offset = 2000 : i64}> : (memref<512x200xf32, "cuda">) -> memref<100x256xf32, "cuda">
-    byre.copy(%2, %1) {callee = "cuda2cuda"} : memref<100x256xf32, "cuda">, memref<100x256xf32, "cuda">
+    %1 = "byre.alias"(%arg1) <{offset = 2000 : i64}> : (memref<512x200xf32, "cuda">) -> memref<100x256xf32, "cuda">
     byre.compute @MatmulOp_f32f32_f32(%0, %1, %arg2) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<256x100xf32, "cuda">, memref<100x256xf32, "cuda">, memref<256x256xf32, "cuda">
     byre.compute @PTXOp(%arg0, %arg1, %arg3) {BlockSize.x = 256 : i32, GridSize.x = 100 : i32, arg_ranks = [2 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown0", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512x200xf32, "cuda">, memref<512x200xf32, "cuda">, memref<512x200xf32, "cuda">
     return
5 changes: 1 addition & 4 deletions compiler/test/E2E/CUDA/AliasLikeGPU/9b_nvvm_codegen.mlir
@@ -26,11 +26,8 @@ module attributes {byre.container_module, gpu.container_module} {
     }
   }
   func.func @main(%arg0: memref<512x200xf32, "cuda"> {byre.argname = "Input0", byre.argtype = 1 : i32}, %arg1: memref<512x200xf32, "cuda"> {byre.argname = "Input1", byre.argtype = 1 : i32}, %arg2: memref<256x256xf32, "cuda"> {byre.argname = "Output0", byre.argtype = 2 : i32}, %arg3: memref<512x200xf32, "cuda"> {byre.argname = "Output1", byre.argtype = 2 : i32}) attributes {byre.entry_point} {
-    %alloc = memref.alloc() : memref<102400xi8, "cuda">
     %0 = "byre.alias"(%arg0) <{offset = 0 : i64}> : (memref<512x200xf32, "cuda">) -> memref<256x100xf32, "cuda">
-    %1 = "byre.alias"(%alloc) <{offset = 0 : i64}> : (memref<102400xi8, "cuda">) -> memref<100x256xf32, "cuda">
-    %2 = "byre.alias"(%arg1) <{offset = 2000 : i64}> : (memref<512x200xf32, "cuda">) -> memref<100x256xf32, "cuda">
-    byre.copy(%2, %1) {callee = "cuda2cuda"} : memref<100x256xf32, "cuda">, memref<100x256xf32, "cuda">
+    %1 = "byre.alias"(%arg1) <{offset = 2000 : i64}> : (memref<512x200xf32, "cuda">) -> memref<100x256xf32, "cuda">
     byre.compute @MatmulOp_f32f32_f32(%0, %1, %arg2) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<256x100xf32, "cuda">, memref<100x256xf32, "cuda">, memref<256x256xf32, "cuda">
     byre.compute @PTXOp(%arg0, %arg1, %arg3) {BlockSize.x = 256 : i32, GridSize.x = 100 : i32, arg_ranks = [2 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown0", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512x200xf32, "cuda">, memref<512x200xf32, "cuda">, memref<512x200xf32, "cuda">
     return
5 changes: 1 addition & 4 deletions compiler/test/E2E/CUDA/AliasLikeGPU/host_output.mlir
@@ -4,11 +4,8 @@

 module attributes {byre.container_module, gpu.container_module} {
   func.func @main(%arg0: memref<512x200xf32, "cuda"> {byre.argname = "Input0", byre.argtype = 1 : i32}, %arg1: memref<512x200xf32, "cuda"> {byre.argname = "Input1", byre.argtype = 1 : i32}, %arg2: memref<256x256xf32, "cuda"> {byre.argname = "Output0", byre.argtype = 2 : i32}, %arg3: memref<512x200xf32, "cuda"> {byre.argname = "Output1", byre.argtype = 2 : i32}) attributes {byre.entry_point, device_file_name = "your_file"} {
-    %alloc = memref.alloc() : memref<102400xi8, "cuda">
     %0 = "byre.alias"(%arg0) <{offset = 0 : i64}> : (memref<512x200xf32, "cuda">) -> memref<256x100xf32, "cuda">
-    %1 = "byre.alias"(%alloc) <{offset = 0 : i64}> : (memref<102400xi8, "cuda">) -> memref<100x256xf32, "cuda">
-    %2 = "byre.alias"(%arg1) <{offset = 2000 : i64}> : (memref<512x200xf32, "cuda">) -> memref<100x256xf32, "cuda">
-    byre.copy(%2, %1) {callee = "cuda2cuda"} : memref<100x256xf32, "cuda">, memref<100x256xf32, "cuda">
+    %1 = "byre.alias"(%arg1) <{offset = 2000 : i64}> : (memref<512x200xf32, "cuda">) -> memref<100x256xf32, "cuda">
     byre.compute @MatmulOp_f32f32_f32(%0, %1, %arg2) {device = "cuda", lhs_contracting_dimension = 1 : i64, memory_effects = [1 : i32, 1 : i32, 2 : i32], rhs_contracting_dimension = 0 : i64} : memref<256x100xf32, "cuda">, memref<100x256xf32, "cuda">, memref<256x256xf32, "cuda">
     byre.compute @PTXOp(%arg0, %arg1, %arg3) {BlockSize.x = 256 : i32, GridSize.x = 100 : i32, arg_ranks = [2 : i32, 2 : i32, 2 : i32], kernel_name = "Unknown0", memory_effects = [1 : i32, 1 : i32, 2 : i32]} : memref<512x200xf32, "cuda">, memref<512x200xf32, "cuda">, memref<512x200xf32, "cuda">
     return
