From 82611a987b616f0b47c8ac1e340ee94c7919c947 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Wed, 18 Oct 2023 16:57:45 -0700 Subject: [PATCH] Making execution region results queue-ordered allocas. (#15149) We don't currently insert deallocas and don't track live ranges but that can come in the future as we support more control flow. For now this at least gets all of the common allocations within an invocation into the queue-ordered bucket so that we can do proper async execution and use native queue-ordered (e.g. stream-ordered allocations in CUDA) functionality. With this change the caching allocator is no longer needed for CUDA in almost all cases (besides exported function results). --- .../HAL/Conversion/StreamToHAL/Patterns.cpp | 45 +++---- .../StreamToHAL/test/resource_ops.mlir | 32 ++--- .../MaterializeDispatchInstrumentation.cpp | 8 +- .../compiler/Dialect/Stream/IR/StreamOps.cpp | 126 ++++++++++++++++-- .../compiler/Dialect/Stream/IR/StreamOps.td | 50 ++++--- .../Dialect/Stream/IR/test/resource_ops.mlir | 8 +- .../Dialect/Stream/Transforms/BUILD.bazel | 1 - .../Dialect/Stream/Transforms/CMakeLists.txt | 1 - .../Stream/Transforms/PackAllocations.cpp | 119 ----------------- .../Stream/Transforms/PackConstants.cpp | 2 +- .../Dialect/Stream/Transforms/Passes.cpp | 3 - .../Dialect/Stream/Transforms/Passes.h | 1 - .../Dialect/Stream/Transforms/Passes.td | 8 -- .../Stream/Transforms/ScheduleAllocation.cpp | 26 ++-- .../Stream/Transforms/test/BUILD.bazel | 1 - .../Stream/Transforms/test/CMakeLists.txt | 1 - .../Transforms/test/pack_allocations.mlir | 38 ------ .../Transforms/test/schedule_allocation.mlir | 81 ++++++----- .../Conversion/StreamToHALInline/Patterns.cpp | 13 +- experimental/cuda2/cuda_buffer.c | 8 ++ experimental/cuda2/cuda_device.c | 2 +- .../src/iree/hal/drivers/cuda/cuda_buffer.c | 8 ++ .../src/iree/hal/drivers/cuda/cuda_device.c | 2 +- 23 files changed, 264 insertions(+), 320 deletions(-) delete mode 100644 compiler/src/iree/compiler/Dialect/Stream/Transforms/PackAllocations.cpp delete mode 100644 compiler/src/iree/compiler/Dialect/Stream/Transforms/test/pack_allocations.mlir diff --git a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/Patterns.cpp b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/Patterns.cpp index 9265878ec664..8d31a0282297 100644 --- a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/Patterns.cpp +++ b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/Patterns.cpp @@ -332,26 +332,19 @@ struct ResourceAllocOpPattern lookupAllocatorAndQueueAffinityFor(allocOp, rewriter); auto bufferType = rewriter.getType(); - SmallVector results; - for (auto [resourceResult, storageSize] : - llvm::zip_equal(allocOp.getResults(), allocOp.getStorageSizes())) { - auto resourceType = - llvm::cast(resourceResult.getType()); - - auto memoryTypes = IREE::HAL::MemoryTypeBitfield::None; - auto bufferUsage = IREE::HAL::BufferUsageBitfield::None; - if (failed(deriveAllowedResourceBufferBits(allocOp.getLoc(), resourceType, - memoryTypes, bufferUsage))) { - return failure(); - } + auto resourceType = + cast(allocOp.getResult().getType()); - auto allocateOp = rewriter.create( - allocOp.getLoc(), bufferType, allocator, queueAffinity, memoryTypes, - bufferUsage, storageSize); - results.push_back(allocateOp.getResult()); + auto memoryTypes = IREE::HAL::MemoryTypeBitfield::None; + auto bufferUsage = IREE::HAL::BufferUsageBitfield::None; + if (failed(deriveAllowedResourceBufferBits(allocOp.getLoc(), resourceType, + memoryTypes, bufferUsage))) { + return failure(); } - rewriter.replaceOp(allocOp, results); + rewriter.replaceOpWithNewOp( + allocOp, bufferType, allocator, queueAffinity, memoryTypes, bufferUsage, + adaptor.getStorageSize()); return success(); } }; @@ -367,16 +360,14 @@ struct ResourceAllocaOpPattern lookupDeviceAndQueueAffinityFor(allocaOp, rewriter); auto bufferType = rewriter.getType(); - // Transient allocations are device-local. Copies are required to get their - // contents back on the host/another device. - auto memoryTypes = IREE::HAL::MemoryTypeBitfield::DeviceLocal; - - // TODO(benvanik): refine usage. - // We know by construction that transient buffers are not host visible and - // as such can only be used for device commands. We should be able to more - // closely limit to just dispatch or transfer though. - auto bufferUsage = IREE::HAL::BufferUsageBitfield::Transfer | - IREE::HAL::BufferUsageBitfield::DispatchStorage; + auto resourceType = + cast(allocaOp.getResult().getType()); + auto memoryTypes = IREE::HAL::MemoryTypeBitfield::None; + auto bufferUsage = IREE::HAL::BufferUsageBitfield::None; + if (failed(deriveAllowedResourceBufferBits(loc, resourceType, memoryTypes, + bufferUsage))) { + return failure(); + } // Gather wait/signal fence, which are optional. Value waitFence = diff --git a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/resource_ops.mlir b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/resource_ops.mlir index e70f8b04263a..88cb014dff42 100644 --- a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/resource_ops.mlir +++ b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/test/resource_ops.mlir @@ -1,25 +1,21 @@ // RUN: iree-opt --split-input-file --iree-hal-conversion %s | FileCheck %s // CHECK-LABEL: @resourceAlloc -func.func @resourceAlloc(%arg0: index, %arg1: index) -> (!stream.resource, !stream.resource) { +func.func @resourceAlloc(%arg0: index) -> !stream.resource { // CHECK: %[[RET0:.+]] = hal.allocator.allocate // CHECK-SAME: type("DeviceVisible|DeviceLocal") // CHECK-SAME: usage("{{.+}}Transfer{{.+}}Dispatch{{.+}}") // CHECK-SAME: : !hal.buffer{%arg0} - // CHECK-NEXT: %[[RET1:.+]] = hal.allocator.allocate - // CHECK-SAME: type("DeviceVisible|DeviceLocal") - // CHECK-SAME: usage("{{.+}}Transfer{{.+}}Dispatch{{.+}}") - // CHECK-SAME: : !hal.buffer{%arg1} - %0:2 = stream.resource.alloc uninitialized : !stream.resource{%arg0}, !stream.resource{%arg1} - // CHECK: return %[[RET0]], %[[RET1]] - return %0#0, %0#1 : !stream.resource, !stream.resource + %0 = stream.resource.alloc uninitialized : !stream.resource{%arg0} + // CHECK: return %[[RET0]] + return %0 : !stream.resource } // ----- // CHECK-LABEL: @resourceAlloca // CHECK-SAME: (%[[SIZE:.+]]: index) -func.func @resourceAlloca(%size: index) -> (!stream.resource, !stream.timepoint) { +func.func @resourceAlloca(%size: index) -> (!stream.resource, !stream.timepoint) { // CHECK: %[[WAIT_FENCE:.+]] = util.null : !hal.fence // CHECK: %[[SIGNAL_FENCE:.+]] = hal.fence.create // CHECK: %[[RET0:.+]] = hal.device.queue.alloca @@ -30,16 +26,16 @@ func.func @resourceAlloca(%size: index) -> (!stream.resource, !stream.t // CHECK-SAME: type("DeviceVisible|DeviceLocal") // CHECK-SAME: usage("{{.+}}Transfer{{.+}}Dispatch{{.+}}") // CHECK-SAME: : !hal.buffer{%[[SIZE]]} - %0:2 = stream.resource.alloca uninitialized : !stream.resource{%size} => !stream.timepoint + %0:2 = stream.resource.alloca uninitialized : !stream.resource{%size} => !stream.timepoint // CHECK: return %[[RET0]], %[[SIGNAL_FENCE]] - return %0#0, %0#1 : !stream.resource, !stream.timepoint + return %0#0, %0#1 : !stream.resource, !stream.timepoint } // ----- // CHECK-LABEL: @resourceAllocaAwait // CHECK-SAME: (%[[SIZE:.+]]: index, %[[WAIT_FENCE:.+]]: !hal.fence) -func.func @resourceAllocaAwait(%size: index, %await_timepoint: !stream.timepoint) -> (!stream.resource, !stream.timepoint) { +func.func @resourceAllocaAwait(%size: index, %await_timepoint: !stream.timepoint) -> (!stream.resource, !stream.timepoint) { // CHECK: %[[SIGNAL_FENCE:.+]] = hal.fence.create // CHECK: %[[RET0:.+]] = hal.device.queue.alloca // CHECK-SAME: affinity(%c-1 @@ -49,16 +45,16 @@ func.func @resourceAllocaAwait(%size: index, %await_timepoint: !stream.timepoint // CHECK-SAME: type("DeviceVisible|DeviceLocal") // CHECK-SAME: usage("{{.+}}Transfer{{.+}}Dispatch{{.+}}") // CHECK-SAME: : !hal.buffer{%[[SIZE]]} - %0:2 = stream.resource.alloca uninitialized await(%await_timepoint) => !stream.resource{%size} => !stream.timepoint + %0:2 = stream.resource.alloca uninitialized await(%await_timepoint) => !stream.resource{%size} => !stream.timepoint // CHECK: return %[[RET0]], %[[SIGNAL_FENCE]] - return %0#0, %0#1 : !stream.resource, !stream.timepoint + return %0#0, %0#1 : !stream.resource, !stream.timepoint } // ----- // CHECK-LABEL: @resourceDealloca // CHECK-SAME: (%[[SIZE:.+]]: index, %[[RESOURCE:.+]]: !hal.buffer) -func.func @resourceDealloca(%size: index, %resource: !stream.resource) -> !stream.timepoint { +func.func @resourceDealloca(%size: index, %resource: !stream.resource) -> !stream.timepoint { // CHECK: %[[WAIT_FENCE:.+]] = util.null : !hal.fence // CHECK: %[[SIGNAL_FENCE:.+]] = hal.fence.create // CHECK: hal.device.queue.dealloca @@ -66,7 +62,7 @@ func.func @resourceDealloca(%size: index, %resource: !stream.resource) // CHECK-SAME: wait(%[[WAIT_FENCE]]) // CHECK-SAME: signal(%[[SIGNAL_FENCE]]) // CHECK-SAME: buffer(%[[RESOURCE]] : !hal.buffer) - %0 = stream.resource.dealloca %resource : !stream.resource{%size} => !stream.timepoint + %0 = stream.resource.dealloca %resource : !stream.resource{%size} => !stream.timepoint // CHECK: return %[[SIGNAL_FENCE]] return %0 : !stream.timepoint } @@ -77,14 +73,14 @@ func.func @resourceDealloca(%size: index, %resource: !stream.resource) // CHECK-LABEL: @resourceDeallocaAwait // CHECK-SAME: (%[[SIZE:.+]]: index, %[[RESOURCE:.+]]: !hal.buffer, %[[WAIT_FENCE:.+]]: !hal.fence) -func.func @resourceDeallocaAwait(%size: index, %resource: !stream.resource, %await_timepoint: !stream.timepoint) -> !stream.timepoint { +func.func @resourceDeallocaAwait(%size: index, %resource: !stream.resource, %await_timepoint: !stream.timepoint) -> !stream.timepoint { // CHECK: %[[SIGNAL_FENCE:.+]] = hal.fence.create // CHECK: hal.device.queue.dealloca // CHECK-SAME: affinity(%c-1 // CHECK-SAME: wait(%[[WAIT_FENCE]]) // CHECK-SAME: signal(%[[SIGNAL_FENCE]]) // CHECK-SAME: buffer(%[[RESOURCE]] : !hal.buffer) - %0 = stream.resource.dealloca await(%await_timepoint) => %resource : !stream.resource{%size} => !stream.timepoint + %0 = stream.resource.dealloca await(%await_timepoint) => %resource : !stream.resource{%size} => !stream.timepoint // CHECK: return %[[SIGNAL_FENCE]] return %0 : !stream.timepoint } diff --git a/compiler/src/iree/compiler/Dialect/HAL/Transforms/MaterializeDispatchInstrumentation.cpp b/compiler/src/iree/compiler/Dialect/HAL/Transforms/MaterializeDispatchInstrumentation.cpp index ec7dc3064412..e3ce36f7cd9c 100644 --- a/compiler/src/iree/compiler/Dialect/HAL/Transforms/MaterializeDispatchInstrumentation.cpp +++ b/compiler/src/iree/compiler/Dialect/HAL/Transforms/MaterializeDispatchInstrumentation.cpp @@ -161,11 +161,9 @@ class MaterializeDispatchInstrumentationPass OpBuilder::atBlockBegin(initializerOp.addEntryBlock()); Value bufferSize = initializerBuilder.create(loc, bufferSizeAttr); - Value buffer = initializerBuilder - .create( - loc, globalOp.getType(), bufferSize, - /*uninitialized=*/true, /*affinity=*/nullptr) - .getResult(0); + Value buffer = initializerBuilder.create( + loc, globalOp.getType(), bufferSize, + /*uninitialized=*/true, /*affinity=*/nullptr); initializerBuilder.create(loc, buffer, globalOp); initializerBuilder.create(loc); diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.cpp b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.cpp index 8d6db3798f69..addada816aae 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.cpp +++ b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.cpp @@ -620,22 +620,124 @@ static void printWorkgroupCountRegion(OpAsmPrinter &p, Operation *op, // stream.resource.alloc //===----------------------------------------------------------------------===// -LogicalResult ResourceAllocOp::verify() { - ResourceAllocOp op = *this; - if (failed(verifyOpValueSizes(op, op.getResults(), op.getStorageSizes()))) { - return failure(); +// static +std::pair> +ResourceAllocOp::createSuballocations( + Type resourceType, ArrayRef locs, ValueRange storageSizes, + bool uninitialized, AffinityAttr affinityAttr, OpBuilder &builder) { + assert(locs.size() == storageSizes.size() && + "expect locs and storageSizes to match"); + if (locs.empty()) + return {}; + if (locs.size() == 1) { + auto allocOp = builder.create( + locs.front(), resourceType, storageSizes.front(), uninitialized, + affinityAttr); + return {allocOp, {allocOp.getResult()}}; } + auto fusedLoc = builder.getFusedLoc(locs); - // All allocated resources must have the same lifetime. - auto anyType = op.getResults().front().getType(); - for (auto type : op.getResultTypes()) { - if (type != anyType) { - return op.emitError() - << "all allocated resources must have the same lifetime"; - } + // NOTE: this is risky: we are assuming right now that all of the + // allocations will fit within the constraints of the system. This is not + // guaranteed: a very low maximum buffer range may lead to packed slabs + // that are not fully addressable. For now we are processing models with + // small enough workloads and our target devices are relatively lax on + // things so long as we stay under UINT32_MAX boundaries. + + // All slices are 0-0 (overlapping). + size_t sliceCount = locs.size(); + SmallVector lifetimeIntervals(sliceCount * 2, 0); + + // Compute total size and the offsets of all suballocated resources via the + // pack op. + auto indexType = builder.getIndexType(); + SmallVector packedOffsetTypes(sliceCount, indexType); + auto packOp = builder.create( + fusedLoc, indexType, packedOffsetTypes, /*offset=*/nullptr, + builder.getIndexArrayAttr(lifetimeIntervals), storageSizes, affinityAttr); + + // Create the new alloca based on the total required size. + auto allocOp = builder.create( + fusedLoc, resourceType, packOp.getTotalLength(), uninitialized, + affinityAttr); + auto slab = allocOp.getResult(); + auto slabSize = packOp.getTotalLength(); + + // Create subviews for all of the suballocated resources. + SmallVector results; + for (auto [loc, subviewOffset, subviewLength] : + llvm::zip_equal(locs, packOp.getPackedOffsets(), storageSizes)) { + results.push_back(builder + .create( + loc, slab, slabSize, subviewOffset, subviewLength) + .getResult()); } + return {allocOp, results}; +} - return success(); +//===----------------------------------------------------------------------===// +// stream.resource.alloca +//===----------------------------------------------------------------------===// + +// static +std::pair> +ResourceAllocaOp::createSuballocations(Type timepointType, Type resourceType, + ArrayRef locs, + ValueRange storageSizes, + Value awaitTimepoint, + AffinityAttr affinityAttr, + OpBuilder &builder) { + assert(locs.size() == storageSizes.size() && + "expect locs and storageSizes to match"); + if (locs.empty()) + return {}; + if (locs.size() == 1) { + auto allocaOp = builder.create( + locs.front(), resourceType, timepointType, storageSizes.front(), + awaitTimepoint, affinityAttr); + return {allocaOp, {allocaOp.getResult()}}; + } + auto fusedLoc = builder.getFusedLoc(locs); + + // NOTE: this is risky: we are assuming right now that all of the + // allocations will fit within the constraints of the system. This is not + // guaranteed: a very low maximum buffer range may lead to packed slabs + // that are not fully addressable. For now we are processing models with + // small enough workloads and our target devices are relatively lax on + // things so long as we stay under UINT32_MAX boundaries. If a user starts + // hitting this the solution is to do in-place outputs such that we don't + // need to allocate them; when possible that's always going to be better than + // leaving them to the IREE compiled program to deal with. + + // All slices are 0-0 (overlapping). + size_t sliceCount = locs.size(); + SmallVector lifetimeIntervals(sliceCount * 2, 0); + + // Compute total size and the offsets of all suballocated resources via the + // pack op. + auto indexType = builder.getIndexType(); + SmallVector packedOffsetTypes(sliceCount, indexType); + auto packOp = builder.create( + fusedLoc, indexType, packedOffsetTypes, /*offset=*/nullptr, + builder.getIndexArrayAttr(lifetimeIntervals), storageSizes, affinityAttr); + + // Create the new alloca based on the total required size. + auto allocaOp = builder.create( + fusedLoc, resourceType, timepointType, packOp.getTotalLength(), + awaitTimepoint, affinityAttr); + auto slab = allocaOp.getResult(); + auto slabSize = packOp.getTotalLength(); + + // Create subviews for all of the suballocated resources. + SmallVector results; + for (auto [loc, subviewOffset, subviewLength] : + llvm::zip_equal(locs, packOp.getPackedOffsets(), storageSizes)) { + results.push_back(builder + .create( + loc, slab, slabSize, subviewOffset, subviewLength) + .getResult()); + } + return {allocaOp, results}; } //===----------------------------------------------------------------------===// diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.td b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.td index 7f3ed5ba0ec9..275c178ff812 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.td +++ b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.td @@ -42,7 +42,7 @@ def Stream_ResourceAllocOp : Stream_Op<"resource.alloc", [ AlwaysSpeculatable, MemoryEffects<[MemAlloc]>, ]> { - let summary = [{allocates a persistent value with undefined contents}]; + let summary = [{allocates a persistent resource}]; let description = [{ Allocates a persistent value (one that is long-lived and possibly external to the program) with undefined contents. Consumers of the allocated @@ -58,31 +58,39 @@ def Stream_ResourceAllocOp : Stream_Op<"resource.alloc", [ separate allocations may be fused into one or more slab allocations in order to reduce overheads. How many allocations can be fused is based on the size of the individual resources and the target constraints (how large any single - buffer may be, etc). At the stream dialect level treat a multi-result alloc - as a way to indicate similar lifetimes. + buffer may be, etc). }]; let arguments = (ins - Variadic:$storage_sizes, + Stream_Size:$storage_size, UnitAttr:$uninitialized, OptionalAttr:$affinity ); let results = (outs - Variadic:$results + Stream_AnyResource:$result ); let assemblyFormat = [{ (`on` `(` $affinity^ `)`)? (`uninitialized` $uninitialized^)? - attr-dict `:` custom(type($results), $storage_sizes) + attr-dict `:` + type($result) `{` $storage_size `}` }]; let extraClassDeclaration = [{ Value getOperandSize(unsigned idx) { return {}; } - Value getResultSize(unsigned idx) { return getStorageSizes()[idx]; } - }]; + Value getResultSize(unsigned idx) { return getStorageSize(); } - let hasVerifier = 1; + // Creates a single shared allocation for multiple suballocations. + // Suballocations are defined by entries in the struct-of-arrays-style + // `{locs, storageSizes}` set. Currently all result types must match. + // Returns the allocation and subviews into all suballocated resources. + static std::pair> + createSuballocations( + Type resourceType, + ArrayRef locs, ValueRange storageSizes, + bool uninitialized, AffinityAttr affinityAttr, OpBuilder &builder); + }]; let hasCanonicalizer = 1; } @@ -113,10 +121,7 @@ def Stream_ResourceAllocaOp : Stream_Op<"resource.alloca", [ OptionalAttr:$affinity ); let results = (outs - AnyTypeOf<[ - Stream_StagingResource, - Stream_TransientResource, - ]>:$result, + Stream_AnyResource:$result, Stream_Timepoint:$result_timepoint ); @@ -136,6 +141,16 @@ def Stream_ResourceAllocaOp : Stream_Op<"resource.alloca", [ SmallVector getAwaitTimepoints() { if (getAwaitTimepoint()) return {getAwaitTimepoint()}; else return {}; } + + // Creates a single shared allocation for multiple suballocations. + // Suballocations are defined by entries in the struct-of-arrays-style + // `{locs, storageSizes}` set. Currently all result types must match. + // Returns the allocation and subviews into all suballocated resources. + static std::pair> + createSuballocations( + Type timepointType, Type resourceType, + ArrayRef locs, ValueRange storageSizes, + Value awaitTimepoint, AffinityAttr affinityAttr, OpBuilder &builder); }]; let hasCanonicalizer = 1; @@ -161,10 +176,7 @@ def Stream_ResourceDeallocaOp : Stream_Op<"resource.dealloca", [ }]; let arguments = (ins - AnyTypeOf<[ - Stream_StagingResource, - Stream_TransientResource, - ]>:$operand, + Stream_AnyResource:$operand, Stream_Size:$operand_size, Optional:$await_timepoint, OptionalAttr:$affinity @@ -776,7 +788,7 @@ def Stream_TensorImportOp : Stream_PureOp<"tensor.import", [ OptionalAttr:$affinity ); let results = (outs - Stream_ExternalResource:$result + AnyTypeOf<[Stream_AnyStreamResource, Stream_StagingResource]>:$result ); let assemblyFormat = [{ @@ -822,7 +834,7 @@ def Stream_TensorExportOp : Stream_PureOp<"tensor.export", [ }]; let arguments = (ins - Stream_ExternalResource:$source, + AnyTypeOf<[Stream_AnyStreamResource, Stream_StagingResource]>:$source, TypeAttr:$source_encoding, Stream_ShapeDynamicDims:$source_encoding_dims, Stream_Size:$source_size, diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/test/resource_ops.mlir b/compiler/src/iree/compiler/Dialect/Stream/IR/test/resource_ops.mlir index c42ccfee0a14..f19f53007d03 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/IR/test/resource_ops.mlir +++ b/compiler/src/iree/compiler/Dialect/Stream/IR/test/resource_ops.mlir @@ -1,10 +1,10 @@ // RUN: iree-opt --split-input-file %s | iree-opt --split-input-file | FileCheck %s // CHECK-LABEL: @resourceAlloc -func.func @resourceAlloc(%arg0: index, %arg1: index) -> (!stream.resource<*>, !stream.resource<*>) { - // CHECK: = stream.resource.alloc uninitialized : !stream.resource<*>{%arg0}, !stream.resource<*>{%arg1} - %0:2 = stream.resource.alloc uninitialized : !stream.resource<*>{%arg0}, !stream.resource<*>{%arg1} - return %0#0, %0#1 : !stream.resource<*>, !stream.resource<*> +func.func @resourceAlloc(%arg0: index) -> !stream.resource<*> { + // CHECK: = stream.resource.alloc uninitialized : !stream.resource<*>{%arg0} + %0 = stream.resource.alloc uninitialized : !stream.resource<*>{%arg0} + return %0 : !stream.resource<*> } // ----- diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/BUILD.bazel b/compiler/src/iree/compiler/Dialect/Stream/Transforms/BUILD.bazel index f3693a8868cf..9144ae498e37 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/BUILD.bazel +++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/BUILD.bazel @@ -27,7 +27,6 @@ iree_compiler_cc_library( "LayoutSlices.cpp", "MaterializeBuiltins.cpp", "MaterializeCopyOnWrite.cpp", - "PackAllocations.cpp", "PackConstants.cpp", "PackDispatchOperands.cpp", "PassDetail.h", diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/CMakeLists.txt b/compiler/src/iree/compiler/Dialect/Stream/Transforms/CMakeLists.txt index 11074572de61..80dec36d1ff7 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/CMakeLists.txt +++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/CMakeLists.txt @@ -29,7 +29,6 @@ iree_cc_library( "LayoutSlices.cpp" "MaterializeBuiltins.cpp" "MaterializeCopyOnWrite.cpp" - "PackAllocations.cpp" "PackConstants.cpp" "PackDispatchOperands.cpp" "PassDetail.h" diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/PackAllocations.cpp b/compiler/src/iree/compiler/Dialect/Stream/Transforms/PackAllocations.cpp deleted file mode 100644 index 4e4ed6246d28..000000000000 --- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/PackAllocations.cpp +++ /dev/null @@ -1,119 +0,0 @@ -// Copyright 2021 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include "iree/compiler/Dialect/Stream/IR/StreamDialect.h" -#include "iree/compiler/Dialect/Stream/IR/StreamOps.h" -#include "iree/compiler/Dialect/Stream/IR/StreamTypes.h" -#include "iree/compiler/Dialect/Stream/Transforms/PassDetail.h" -#include "iree/compiler/Dialect/Stream/Transforms/Passes.h" -#include "iree/compiler/Dialect/Util/IR/UtilDialect.h" -#include "iree/compiler/Dialect/Util/IR/UtilOps.h" -#include "iree/compiler/Dialect/Util/IR/UtilTypes.h" -#include "llvm/Support/Debug.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/IR/AsmState.h" -#include "mlir/IR/Attributes.h" -#include "mlir/IR/Builders.h" -#include "mlir/IR/BuiltinOps.h" -#include "mlir/IR/Matchers.h" -#include "mlir/Pass/Pass.h" - -#define DEBUG_TYPE "iree-stream-pack-allocations" - -namespace mlir { -namespace iree_compiler { -namespace IREE { -namespace Stream { -namespace { - -//===----------------------------------------------------------------------===// -// -iree-stream-pack-allocations -//===----------------------------------------------------------------------===// - -class PackAllocationsPass : public PackAllocationsBase { -public: - void getDependentDialects(DialectRegistry ®istry) const override { - registry.insert(); - registry.insert(); - registry.insert(); - } - - void runOnOperation() override { - auto parentOp = getOperation(); - if (!parentOp.getCallableRegion() || - parentOp.getCallableRegion()->empty()) { - return; - } - - // This is pretty lazy: we just turn stream.resource.alloc ops into a - // stream.resource.pack + stream.resource.alloc of a single resource. - // This way we reuse all the resource constraints stuff that the pack op - // provides even though all of the resources we allocate have perfectly - // overlapping lifetime spans. - // - // In the future, we should be doing deeper lifetime analysis here and - // subdividing the allocs based on which resources travel together. We can - // also do things like overlap the lifetime of inputs and outputs to - // execution regions as usually inputs end their lifetime before the outputs - // are produced. In this way we'd use the slice intervals to denote which - // are mutually exclusive. - parentOp.walk([&](IREE::Stream::ResourceAllocOp allocOp) { - // If just one result then ignore (nothing to pack). - if (allocOp.getResults().size() == 1) - return; - auto resourceType = allocOp.getResults().front().getType(); - - // NOTE: this is risky: we are assuming right now that all of the - // allocations will fit within the constraints of the system. This is not - // guaranteed: a very low maximum buffer range may lead to packed slabs - // that are not fully addressable. For now we are processing models with - // small enough workloads and our target devices are relatively lax on - // things so long as we stay under UINT32_MAX boundaries. - - // All slices are 0-0 (overlapping). - size_t sliceCount = allocOp.getResults().size(); - SmallVector lifetimeIntervals(sliceCount * 2, 0); - - OpBuilder builder(allocOp); - auto indexType = builder.getIndexType(); - SmallVector packedOffsetTypes(sliceCount, indexType); - auto packOp = builder.create( - allocOp.getLoc(), indexType, packedOffsetTypes, /*offset=*/nullptr, - builder.getIndexArrayAttr(lifetimeIntervals), - allocOp.getStorageSizes(), allocOp.getAffinityAttr()); - - // Change the alloc to build just a single resource. - auto newOp = builder.create( - allocOp.getLoc(), resourceType, packOp.getTotalLength(), - allocOp.getUninitializedAttr(), allocOp.getAffinityAttr()); - auto slab = newOp.getResults().front(); - auto slabSize = packOp.getTotalLength(); - - // Replace all resources with subviews into the new slab. - for (auto [originalValue, subviewOffset, subviewLength] : - llvm::zip_equal(allocOp.getResults(), packOp.getPackedOffsets(), - allocOp.getStorageSizes())) { - auto subviewOp = builder.create( - allocOp.getLoc(), slab, slabSize, subviewOffset, subviewLength); - originalValue.replaceAllUsesWith(subviewOp.getResult()); - } - - allocOp.erase(); - }); - } -}; - -} // namespace - -std::unique_ptr> -createPackAllocationsPass() { - return std::make_unique(); -} - -} // namespace Stream -} // namespace IREE -} // namespace iree_compiler -} // namespace mlir diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/PackConstants.cpp b/compiler/src/iree/compiler/Dialect/Stream/Transforms/PackConstants.cpp index 74ad0dd93266..87c02fa8ef45 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/PackConstants.cpp +++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/PackConstants.cpp @@ -262,7 +262,7 @@ static TimepointResource buildFileRead( auto zeroI64 = builder.create(storageResource.loc, 0, 64); auto readOp = builder.create( - storageResource.loc, fileOp.getResult(), zeroI64, allocOp.getResult(0), + storageResource.loc, fileOp.getResult(), zeroI64, allocOp.getResult(), allocOp.getResultSize(0), indexSet.get(0), storageResourceSize, awaitTimepoint, affinityAttr); diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/Passes.cpp b/compiler/src/iree/compiler/Dialect/Stream/Transforms/Passes.cpp index 797ac47732a5..fbfa6631ec37 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/Passes.cpp +++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/Passes.cpp @@ -206,9 +206,6 @@ void buildStreamCmdPassPipeline(OpPassManager &passManager, // storage buffers and upload logic. .addPass(IREE::Stream::createPackConstantsPass) - // Pack fused allocations based on lifetime. - .addPass(IREE::Stream::createPackAllocationsPass) - // Layout packed slices to emit the arithmetic required for all resource // offsets. This enables us to propagate the subviews across the program // below. diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/Passes.h b/compiler/src/iree/compiler/Dialect/Stream/Transforms/Passes.h index 324937fd8d54..ca8c861f85ea 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/Passes.h +++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/Passes.h @@ -128,7 +128,6 @@ std::unique_ptr> createElideTimepointsPass(); std::unique_ptr> createScheduleAllocationPass(); std::unique_ptr> createPackConstantsPass(); -std::unique_ptr> createPackAllocationsPass(); std::unique_ptr> createLayoutSlicesPass(); //===----------------------------------------------------------------------===// diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/Passes.td b/compiler/src/iree/compiler/Dialect/Stream/Transforms/Passes.td index c11e3959e6aa..1a19d4136e92 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/Passes.td +++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/Passes.td @@ -137,14 +137,6 @@ def PackConstants : }]; } -def PackAllocations : - InterfacePass<"iree-stream-pack-allocations", "mlir::CallableOpInterface"> { - let summary = "Packs fused allocations based on lifetime."; - let constructor = [{ - mlir::iree_compiler::IREE::Stream::createPackAllocationsPass() - }]; -} - def LayoutSlices : InterfacePass<"iree-stream-layout-slices", "mlir::CallableOpInterface"> { let summary = "Lays out packed slices and produces arithmetic required for all offsets."; diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/ScheduleAllocation.cpp b/compiler/src/iree/compiler/Dialect/Stream/Transforms/ScheduleAllocation.cpp index 59c104ae60fa..ba99599656c4 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/ScheduleAllocation.cpp +++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/ScheduleAllocation.cpp @@ -1578,28 +1578,28 @@ allocateExecutionRegion(IREE::Stream::AsyncExecuteOp executeOp) { auto resultAllocation = reserveResultAllocation(resultReservations); for (auto &reservationSet : resultAllocation.reservationSets) { // Allocate and tie an operand to the result. - // TODO(benvanik): change this to an alloca. We may need a higher-level - // analysis to decide when to deallocate, or just leave it to be deallocated - // as part of garbage collection. - auto allocOp = externalBuilder.create( - externalBuilder.getFusedLoc(reservationSet.reservationLocs), - reservationSet.reservationTypes, reservationSet.reservationSizes, - /*uninitialized=*/externalBuilder.getUnitAttr(), - executeOp.getAffinityAttr()); + auto timepointType = externalBuilder.getType(); + auto [allocaOp, suballocations] = + IREE::Stream::ResourceAllocaOp::createSuballocations( + timepointType, reservationSet.reservationTypes.front(), + reservationSet.reservationLocs, reservationSet.reservationSizes, + executeOp.getAwaitTimepoint(), executeOp.getAffinityAttr(), + externalBuilder); + newAwaitTimepoints.push_back(allocaOp.getResultTimepoint()); auto asmState = getRootAsmState(executeOp->getParentOp()); LLVM_DEBUG({ llvm::dbgs() << " + alloc for result reservation set: "; - allocOp.print(llvm::dbgs(), *asmState); + allocaOp.print(llvm::dbgs(), *asmState); llvm::dbgs() << ":\n"; }); - for (auto [reservation, allocResult] : - llvm::zip_equal(reservationSet.reservations, allocOp.getResults())) { - newOperands.push_back(allocResult); + for (auto [reservation, suballocation] : + llvm::zip_equal(reservationSet.reservations, suballocations)) { + newOperands.push_back(suballocation); newOperandSizes.push_back(reservation.resultSize); resultReplacements.push_back( - std::make_pair(reservation.result, allocResult)); + std::make_pair(reservation.result, suballocation)); // Insert entry arg for the new operand tied all the way to the yield. auto arg = diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/BUILD.bazel b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/BUILD.bazel index d0a3103a3a40..965b61f2cc63 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/BUILD.bazel +++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/BUILD.bazel @@ -34,7 +34,6 @@ iree_lit_test_suite( "layout_slices.mlir", "materialize_builtins.mlir", "materialize_copy_on_write.mlir", - "pack_allocations.mlir", "pack_constants.mlir", "pack_dispatch_operands.mlir", "propagate_subviews.mlir", diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/CMakeLists.txt b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/CMakeLists.txt index 2a1d56a05432..2e2294a00054 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/CMakeLists.txt +++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/CMakeLists.txt @@ -32,7 +32,6 @@ iree_lit_test_suite( "layout_slices.mlir" "materialize_builtins.mlir" "materialize_copy_on_write.mlir" - "pack_allocations.mlir" "pack_constants.mlir" "pack_dispatch_operands.mlir" "propagate_subviews.mlir" diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/pack_allocations.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/pack_allocations.mlir deleted file mode 100644 index e98e5ae9533f..000000000000 --- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/pack_allocations.mlir +++ /dev/null @@ -1,38 +0,0 @@ -// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(func.func(iree-stream-pack-allocations))' %s | FileCheck %s - -// CHECK-LABEL: @packAllocations -// CHECK-SAME: (%[[SIZE_A:.+]]: index, %[[SIZE_B:.+]]: index) -func.func @packAllocations(%size_a: index, %size_b: index) { - // CHECK: %[[SLICES:.+]]:3 = stream.resource.pack slices({ - // CHECK-NEXT: [0, 0] = %[[SIZE_A]], - // CHECK-NEXT: [0, 0] = %[[SIZE_B]] - // CHECK-NEXT: }) : index - // CHECK: %[[ALLOC:.+]] = stream.resource.alloc uninitialized : !stream.resource{%[[SLICES]]#0} - %0:2 = stream.resource.alloc uninitialized : - !stream.resource{%size_a}, - !stream.resource{%size_b} - - // CHECK: %[[SLICE_A:.+]] = stream.resource.subview %[[ALLOC]][%[[SLICES]]#1] - // CHECK-SAME: !stream.resource{%[[SLICES]]#0} -> !stream.resource{%[[SIZE_A]]} - // CHECK: %[[SLICE_B:.+]] = stream.resource.subview %[[ALLOC]][%[[SLICES]]#2] - // CHECK-SAME: !stream.resource{%[[SLICES]]#0} -> !stream.resource{%[[SIZE_B]]} - - // CHECK: util.optimization_barrier %[[SLICE_A]] - util.optimization_barrier %0#0 : !stream.resource - // CHECK: util.optimization_barrier %[[SLICE_B]] - util.optimization_barrier %0#1 : !stream.resource - return -} - -// ----- - -// CHECK-LABEL: @packEmpty -func.func @packEmpty() { - // CHECK: %[[ALLOC:.+]] = stream.resource.alloc : !stream.resource{%c0} - %c0 = arith.constant 0 : index - %0 = stream.resource.alloc : !stream.resource{%c0} - - // CHECK: util.optimization_barrier %[[ALLOC]] - util.optimization_barrier %0 : !stream.resource - return -} diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/schedule_allocation.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/schedule_allocation.mlir index 08477f30547b..1398f799813d 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/schedule_allocation.mlir +++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/schedule_allocation.mlir @@ -183,10 +183,16 @@ func.func @aliasPropagation(%operand: !stream.resource, %size: index, func.func @producedResults(%size0: index, %size1: index) { %c254_i32 = arith.constant 254 : i32 %c255_i32 = arith.constant 255 : i32 - // CHECK: %[[ALLOC_RETS:.+]]:2 = stream.resource.alloc uninitialized : !stream.resource{%[[SIZE0]]}, !stream.resource{%[[SIZE1]]} - // CHECK: %[[TIMEPOINT:.+]] = stream.cmd.execute - // CHECK-SAME: with(%[[ALLOC_RETS]]#0 as %[[CAPTURE0:.+]]: !stream.resource{%[[SIZE0]]}, - // CHECK-SAME: %[[ALLOC_RETS]]#1 as %[[CAPTURE1:.+]]: !stream.resource{%[[SIZE1]]}) + // CHECK: %[[PACK:.+]]:3 = stream.resource.pack slices({ + // CHECK-NEXT: [0, 0] = %[[SIZE0]], + // CHECK-NEXT: [0, 0] = %[[SIZE1]] + // CHECK-NEXT: }) : index + // CHECK: %[[ALLOCA:.+]], %[[ALLOCA_TIMEPOINT:.+]] = stream.resource.alloca uninitialized : !stream.resource{%[[PACK]]#0} + // CHECK: %[[SUBALLOCA0:.+]] = stream.resource.subview %[[ALLOCA]][%[[PACK]]#1] : !stream.resource{%[[PACK]]#0} -> !stream.resource{%[[SIZE0]]} + // CHECK: %[[SUBALLOCA1:.+]] = stream.resource.subview %[[ALLOCA]][%[[PACK]]#2] : !stream.resource{%[[PACK]]#0} -> !stream.resource{%[[SIZE1]]} + // CHECK: %[[EXECUTE_TIMEPOINT:.+]] = stream.cmd.execute await(%[[ALLOCA_TIMEPOINT]]) + // CHECK-SAME: with(%[[SUBALLOCA0]] as %[[CAPTURE0:.+]]: !stream.resource{%[[SIZE0]]}, + // CHECK-SAME: %[[SUBALLOCA1]] as %[[CAPTURE1:.+]]: !stream.resource{%[[SIZE1]]}) %results:2, %result_timepoint = stream.async.execute with() -> (!stream.resource{%size0}, !stream.resource{%size1}) { // CHECK: stream.cmd.fill %c254_i32, %[[CAPTURE0]] %0 = stream.async.splat %c254_i32 : i32 -> !stream.resource{%size0} @@ -194,11 +200,11 @@ func.func @producedResults(%size0: index, %size1: index) { %1 = stream.async.splat %c255_i32 : i32 -> !stream.resource{%size1} stream.yield %0, %1 : !stream.resource{%size0}, !stream.resource{%size1} } => !stream.timepoint - // CHECK: util.optimization_barrier %[[TIMEPOINT]] + // CHECK: util.optimization_barrier %[[EXECUTE_TIMEPOINT]] util.optimization_barrier %result_timepoint : !stream.timepoint - // CHECK: util.optimization_barrier %[[ALLOC_RETS]]#0 + // CHECK: util.optimization_barrier %[[SUBALLOCA0]] util.optimization_barrier %results#0 : !stream.resource - // CHECK: util.optimization_barrier %[[ALLOC_RETS]]#1 + // CHECK: util.optimization_barrier %[[SUBALLOCA1]] util.optimization_barrier %results#1 : !stream.resource return } @@ -248,10 +254,10 @@ func.func @concurrentRegions(%operand: !stream.resource, %size: index %c128 = arith.constant 128 : index %c254_i32 = arith.constant 254 : i32 %c255_i32 = arith.constant 255 : i32 - // CHECK: %[[ALLOC:.+]] = stream.resource.alloc uninitialized : !stream.resource{%[[SIZE]]} - // CHECK: stream.cmd.execute + // CHECK: %[[ALLOCA:.+]], %[[ALLOCA_TIMEPOINT:.+]] = stream.resource.alloca uninitialized : !stream.resource{%[[SIZE]]} + // CHECK: stream.cmd.execute await(%[[ALLOCA_TIMEPOINT]]) // CHECK-SAME: with(%[[OPERAND]] as %[[OPERAND_CAPTURE:.+]]: !stream.resource{%[[SIZE]]}, - // CHECK-SAME: %[[ALLOC]] as %[[ALLOC_CAPTURE:.+]]: !stream.resource{%[[SIZE]]}) + // CHECK-SAME: %[[ALLOCA]] as %[[ALLOC_CAPTURE:.+]]: !stream.resource{%[[SIZE]]}) %results:2, %result_timepoint = stream.async.execute with(%operand as %capture: !stream.resource{%size}) -> (!stream.resource{%size}, !stream.resource{%size}) { // CHECK: stream.cmd.concurrent %0:2 = stream.async.concurrent with(%capture as %concurrent_capture: !stream.resource{%size}) -> (%capture as !stream.resource{%size}, !stream.resource{%size}) { @@ -265,7 +271,7 @@ func.func @concurrentRegions(%operand: !stream.resource, %size: index } => !stream.timepoint // CHECK: util.optimization_barrier %[[OPERAND]] util.optimization_barrier %results#0 : !stream.resource - // CHECK: util.optimization_barrier %[[ALLOC]] + // CHECK: util.optimization_barrier %[[ALLOCA]] util.optimization_barrier %results#1 : !stream.resource return } @@ -276,14 +282,15 @@ func.func @concurrentRegions(%operand: !stream.resource, %size: index // CHECK-SAME: (%[[SIZE:.+]]: index) func.func @applyAsyncSplatOp(%size: index) { %c255_i32 = arith.constant 255 : i32 - // CHECK: %[[ALLOC:.+]] = stream.resource.alloc uninitialized : !stream.resource{%[[SIZE]]} - // CHECK: stream.cmd.execute with(%[[ALLOC]] as %[[CAPTURE:.+]]: !stream.resource{%[[SIZE]]}) + // CHECK: %[[ALLOCA:.+]], %[[ALLOCA_TIMEPOINT:.+]] = stream.resource.alloca uninitialized : !stream.resource{%[[SIZE]]} + // CHECK: stream.cmd.execute await(%[[ALLOCA_TIMEPOINT]]) + // CHECK-SAME: with(%[[ALLOCA]] as %[[CAPTURE:.+]]: !stream.resource{%[[SIZE]]}) %result, %result_timepoint = stream.async.execute with() -> (!stream.resource{%size}) { // CHECK: stream.cmd.fill %c255_i32, %[[CAPTURE]][%c0 for %[[SIZE]]] : i32 -> !stream.resource{%[[SIZE]]} %0 = stream.async.splat %c255_i32 : i32 -> !stream.resource{%size} stream.yield %0 : !stream.resource{%size} } => !stream.timepoint - // CHECK: util.optimization_barrier %[[ALLOC]] + // CHECK: util.optimization_barrier %[[ALLOCA]] util.optimization_barrier %result : !stream.resource return } @@ -293,17 +300,17 @@ func.func @applyAsyncSplatOp(%size: index) { // CHECK-LABEL: @applyAsyncCloneOp // CHECK-SAME: (%[[OPERAND:.+]]: !stream.resource, %[[SIZE:.+]]: index) func.func @applyAsyncCloneOp(%operand: !stream.resource, %size: index) { - // CHECK: %[[ALLOC:.+]] = stream.resource.alloc uninitialized : !stream.resource{%[[SIZE]]} - // CHECK: stream.cmd.execute + // CHECK: %[[ALLOCA:.+]], %[[ALLOCA_TIMEPOINT:.+]] = stream.resource.alloca uninitialized : !stream.resource{%[[SIZE]]} + // CHECK: stream.cmd.execute await(%[[ALLOCA_TIMEPOINT]]) // CHECK-SAME: with(%[[OPERAND]] as %[[OPERAND_CAPTURE:.+]]: !stream.resource{%[[SIZE]]}, - // CHECK-SAME: %[[ALLOC]] as %[[ALLOC_CAPTURE:.+]]: !stream.resource{%[[SIZE]]}) + // CHECK-SAME: %[[ALLOCA]] as %[[ALLOC_CAPTURE:.+]]: !stream.resource{%[[SIZE]]}) %result, %result_timepoint = stream.async.execute with(%operand as %capture: !stream.resource{%size}) -> !stream.resource{%size} { // CHECK: stream.cmd.copy %[[OPERAND_CAPTURE]][%c0], %[[ALLOC_CAPTURE]][%c0], %[[SIZE]] // CHECK-SAME: : !stream.resource{%[[SIZE]]} -> !stream.resource{%[[SIZE]]} %0 = stream.async.clone %capture : !stream.resource{%size} -> !stream.resource{%size} stream.yield %0 : !stream.resource{%size} } => !stream.timepoint - // CHECK: util.optimization_barrier %[[ALLOC]] + // CHECK: util.optimization_barrier %[[ALLOCA]] util.optimization_barrier %result : !stream.resource return } @@ -319,17 +326,17 @@ func.func @applyAsyncSliceOp(%operand: !stream.resource, %size: index %c16 = arith.constant 16 : index %c128 = arith.constant 128 : index %c144 = arith.constant 144 : index - // CHECK: %[[ALLOC:.+]] = stream.resource.alloc uninitialized : !stream.resource{%c128} - // CHECK: stream.cmd.execute + // CHECK: %[[ALLOCA:.+]], %[[ALLOCA_TIMEPOINT:.+]] = stream.resource.alloca uninitialized : !stream.resource{%c128} + // CHECK: stream.cmd.execute await(%[[ALLOCA_TIMEPOINT]]) // CHECK-SAME: with(%[[OPERAND]] as %[[OPERAND_CAPTURE:.+]]: !stream.resource{%[[SIZE]]}, - // CHECK-SAME: %[[ALLOC]] as %[[ALLOC_CAPTURE:.+]]: !stream.resource{%c128}) + // CHECK-SAME: %[[ALLOCA]] as %[[ALLOC_CAPTURE:.+]]: !stream.resource{%c128}) %result, %result_timepoint = stream.async.execute with(%operand as %capture: !stream.resource{%size}) -> !stream.resource{%c128} { // CHECK: stream.cmd.copy %[[OPERAND_CAPTURE]][%c16], %[[ALLOC_CAPTURE]][%c0], %c128 // CHECK-SAME: : !stream.resource{%[[SIZE]]} -> !stream.resource{%c128} %0 = stream.async.slice %capture[%c16 to %c144] : !stream.resource{%size} -> !stream.resource{%c128} stream.yield %0 : !stream.resource{%c128} } => !stream.timepoint - // CHECK: util.optimization_barrier %[[ALLOC]] + // CHECK: util.optimization_barrier %[[ALLOCA]] util.optimization_barrier %result : !stream.resource return } @@ -477,17 +484,17 @@ func.func @applyAsyncCollectiveOpOutOfPlace(%channel: !stream.channel, %send: !s // CHECK-LABEL: @applyAsyncTransferOp // CHECK-SAME: (%[[OPERAND:.+]]: !stream.resource, %[[SIZE:.+]]: index) func.func @applyAsyncTransferOp(%operand: !stream.resource, %size: index) { - // CHECK: %[[ALLOC:.+]] = stream.resource.alloc uninitialized : !stream.resource{%[[SIZE]]} - // CHECK: stream.cmd.execute + // CHECK: %[[ALLOCA:.+]], %[[ALLOCA_TIMEPOINT:.+]] = stream.resource.alloca uninitialized : !stream.resource{%[[SIZE]]} + // CHECK: stream.cmd.execute await(%[[ALLOCA_TIMEPOINT]]) // CHECK-SAME: with(%[[OPERAND]] as %[[OPERAND_CAPTURE:.+]]: !stream.resource{%[[SIZE]]}, - // CHECK-SAME: %[[ALLOC]] as %[[ALLOC_CAPTURE:.+]]: !stream.resource{%[[SIZE]]}) + // CHECK-SAME: %[[ALLOCA]] as %[[ALLOCA_CAPTURE:.+]]: !stream.resource{%[[SIZE]]}) %result, %result_timepoint = stream.async.execute with(%operand as %capture: !stream.resource{%size}) -> !stream.resource{%size} { - // CHECK: stream.cmd.copy %[[OPERAND_CAPTURE]][%c0], %[[ALLOC_CAPTURE]][%c0], %[[SIZE]] + // CHECK: stream.cmd.copy %[[OPERAND_CAPTURE]][%c0], %[[ALLOCA_CAPTURE]][%c0], %[[SIZE]] // CHECK-SAME: : !stream.resource{%[[SIZE]]} -> !stream.resource{%[[SIZE]]} %0 = stream.async.transfer %capture : !stream.resource{%size} -> !stream.resource{%size} stream.yield %0 : !stream.resource{%size} } => !stream.timepoint - // CHECK: util.optimization_barrier %[[ALLOC]] + // CHECK: util.optimization_barrier %[[ALLOCA]] util.optimization_barrier %result : !stream.resource return } @@ -500,14 +507,14 @@ func.func @applyAsyncDispatchOp(%operand: !stream.resource, %size: in %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %c4 = arith.constant 4 : index - // CHECK: %[[ALLOC:.+]] = stream.resource.alloc uninitialized : !stream.resource{%[[SIZE]]} - // CHECK: %[[TIMEPOINT:.+]] = stream.cmd.execute + // CHECK: %[[ALLOCA:.+]], %[[ALLOCA_TIMEPOINT:.+]] = stream.resource.alloca uninitialized : !stream.resource{%[[SIZE]]} + // CHECK: %[[TIMEPOINT:.+]] = stream.cmd.execute await(%[[ALLOCA_TIMEPOINT]]) // CHECK-SAME: with(%[[OPERAND]] as %[[OPERAND_CAPTURE:.+]]: !stream.resource{%[[SIZE]]}, - // CHECK-SAME: %[[ALLOC]] as %[[ALLOC_CAPTURE:.+]]: !stream.resource{%[[SIZE]]}) + // CHECK-SAME: %[[ALLOCA]] as %[[ALLOCA_CAPTURE:.+]]: !stream.resource{%[[SIZE]]}) %results:2, %result_timepoint = stream.async.execute with(%operand as %capture: !stream.resource{%size}) -> (%operand as !stream.resource{%size}, !stream.resource{%size}) { // CHECK-NEXT: stream.cmd.dispatch @executable::@dispatch[%c1, %c1, %c1](%c4 : index) { // CHECK-NEXT: rw %[[OPERAND_CAPTURE]][%[[OFFSET]] for %[[LENGTH]]] : !stream.resource{%[[SIZE]]}, - // CHECK-NEXT: wo %[[ALLOC_CAPTURE]][%c0{{[_0-9]*}} for %[[SIZE]]] : !stream.resource{%[[SIZE]]} + // CHECK-NEXT: wo %[[ALLOCA_CAPTURE]][%c0{{[_0-9]*}} for %[[SIZE]]] : !stream.resource{%[[SIZE]]} // CHECK-NEXT: } %0:2 = stream.async.dispatch @executable::@dispatch[%c1, %c1, %c1](%capture[%offset to %end for %length], %c4) : (!stream.resource{%size}, index) -> (%capture{%size}, !stream.resource{%size}) stream.yield %0#0, %0#1 : !stream.resource{%size}, !stream.resource{%size} @@ -516,7 +523,7 @@ func.func @applyAsyncDispatchOp(%operand: !stream.resource, %size: in util.optimization_barrier %result_timepoint : !stream.timepoint // CHECK: util.optimization_barrier %[[OPERAND]] util.optimization_barrier %results#0 : !stream.resource - // CHECK: util.optimization_barrier %[[ALLOC]] + // CHECK: util.optimization_barrier %[[ALLOCA]] util.optimization_barrier %results#1 : !stream.resource return } @@ -571,12 +578,12 @@ func.func @applyAsyncCallOp(%operand: !stream.resource, %size: index, %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %c4 = arith.constant 4 : index - // CHECK: %[[ALLOC:.+]] = stream.resource.alloc uninitialized : !stream.resource{%[[SIZE]]} - // CHECK: %[[TIMEPOINT:.+]] = stream.cmd.execute + // CHECK: %[[ALLOCA:.+]], %[[ALLOCA_TIMEPOINT:.+]] = stream.resource.alloca uninitialized : !stream.resource{%[[SIZE]]} + // CHECK: %[[TIMEPOINT:.+]] = stream.cmd.execute await(%[[ALLOCA_TIMEPOINT]]) // CHECK-SAME: with(%[[OPERAND]] as %[[OPERAND_CAPTURE:.+]]: !stream.resource{%[[SIZE]]}, - // CHECK-SAME: %[[ALLOC]] as %[[ALLOC_CAPTURE:.+]]: !stream.resource{%[[SIZE]]}) + // CHECK-SAME: %[[ALLOCA]] as %[[ALLOCA_CAPTURE:.+]]: !stream.resource{%[[SIZE]]}) %results:2, %result_timepoint = stream.async.execute with(%operand as %capture: !stream.resource{%size}) -> (%operand as !stream.resource{%size}, !stream.resource{%size}) { - // CHECK-NEXT: stream.cmd.call @asyncExtern(rw %[[OPERAND_CAPTURE]][%[[OFFSET]] for %[[LENGTH]]], %c4, wo %[[ALLOC_CAPTURE]][%c0{{[_0-9]*}} for %[[SIZE]]]) : + // CHECK-NEXT: stream.cmd.call @asyncExtern(rw %[[OPERAND_CAPTURE]][%[[OFFSET]] for %[[LENGTH]]], %c4, wo %[[ALLOCA_CAPTURE]][%c0{{[_0-9]*}} for %[[SIZE]]]) : // CHECK-SAME: (!stream.resource{%[[SIZE]]}, index, !stream.resource{%[[SIZE]]}) -> () %0:2 = stream.async.call @asyncExtern(%capture[%offset to %end for %length], %c4) : (!stream.resource{%size}, index) -> (%capture{%size}, !stream.resource{%size}) stream.yield %0#0, %0#1 : !stream.resource{%size}, !stream.resource{%size} @@ -585,7 +592,7 @@ func.func @applyAsyncCallOp(%operand: !stream.resource, %size: index, util.optimization_barrier %result_timepoint : !stream.timepoint // CHECK: util.optimization_barrier %[[OPERAND]] util.optimization_barrier %results#0 : !stream.resource - // CHECK: util.optimization_barrier %[[ALLOC]] + // CHECK: util.optimization_barrier %[[ALLOCA]] util.optimization_barrier %results#1 : !stream.resource return } diff --git a/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/StreamToHALInline/Patterns.cpp b/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/StreamToHALInline/Patterns.cpp index 5db024976658..fda46c464872 100644 --- a/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/StreamToHALInline/Patterns.cpp +++ b/compiler/src/iree/compiler/Modules/HAL/Inline/Conversion/StreamToHALInline/Patterns.cpp @@ -70,16 +70,11 @@ struct ResourceAllocOpPattern Value minAlignment = rewriter.create(allocOp.getLoc(), 64); - SmallVector results; - for (auto [resourceResult, storageSize] : - llvm::zip_equal(allocOp.getResults(), allocOp.getStorageSizes())) { - auto allocateOp = rewriter.create( - allocOp.getLoc(), deviceBufferType, hostBufferType, minAlignment, - storageSize); - results.push_back(allocateOp.getResult()); - } + auto allocateOp = rewriter.create( + allocOp.getLoc(), deviceBufferType, hostBufferType, minAlignment, + adaptor.getStorageSize()); + rewriter.replaceOp(allocOp, allocateOp.getResult()); - rewriter.replaceOp(allocOp, results); return success(); } }; diff --git a/experimental/cuda2/cuda_buffer.c b/experimental/cuda2/cuda_buffer.c index ff5a254e5d25..d1d017fed202 100644 --- a/experimental/cuda2/cuda_buffer.c +++ b/experimental/cuda2/cuda_buffer.c @@ -43,6 +43,13 @@ iree_status_t iree_hal_cuda2_buffer_wrap( void* host_ptr, iree_hal_buffer_release_callback_t release_callback, iree_allocator_t host_allocator, iree_hal_buffer_t** out_buffer) { IREE_ASSERT_ARGUMENT(out_buffer); + if (!host_ptr && iree_any_bit_set(allowed_usage, + IREE_HAL_BUFFER_USAGE_MAPPING_PERSISTENT | + IREE_HAL_BUFFER_USAGE_MAPPING_SCOPED)) { + return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, + "mappable buffers require host pointers"); + } + IREE_TRACE_ZONE_BEGIN(z0); iree_hal_cuda2_buffer_t* buffer = NULL; @@ -95,6 +102,7 @@ static iree_status_t iree_hal_cuda2_buffer_map_range( ? IREE_HAL_BUFFER_USAGE_MAPPING_PERSISTENT : IREE_HAL_BUFFER_USAGE_MAPPING_SCOPED)); + IREE_ASSERT(buffer->host_ptr, "mappable buffers require host pointers"); uint8_t* data_ptr = (uint8_t*)(buffer->host_ptr) + local_byte_offset; // If we mapped for discard scribble over the bytes. This is not a mandated // behavior but it will make debugging issues easier. Alternatively for diff --git a/experimental/cuda2/cuda_device.c b/experimental/cuda2/cuda_device.c index 660fa1f823ea..b53bcd00f667 100644 --- a/experimental/cuda2/cuda_device.c +++ b/experimental/cuda2/cuda_device.c @@ -623,7 +623,7 @@ static iree_status_t iree_hal_cuda2_device_queue_alloca( // allocator is set on the device. iree_status_t status = iree_ok_status(); if (device->supports_memory_pools && - !iree_any_bit_set(params.access, IREE_HAL_MEMORY_TYPE_HOST_VISIBLE)) { + !iree_any_bit_set(params.type, IREE_HAL_MEMORY_TYPE_HOST_VISIBLE)) { status = iree_hal_cuda2_memory_pools_alloca( &device->memory_pools, device->dispatch_cu_stream, pool, params, allocation_size, out_buffer); diff --git a/runtime/src/iree/hal/drivers/cuda/cuda_buffer.c b/runtime/src/iree/hal/drivers/cuda/cuda_buffer.c index bcb1ad742536..f9f33b4f1068 100644 --- a/runtime/src/iree/hal/drivers/cuda/cuda_buffer.c +++ b/runtime/src/iree/hal/drivers/cuda/cuda_buffer.c @@ -43,6 +43,13 @@ iree_status_t iree_hal_cuda_buffer_wrap( void* host_ptr, iree_hal_buffer_release_callback_t release_callback, iree_allocator_t host_allocator, iree_hal_buffer_t** out_buffer) { IREE_ASSERT_ARGUMENT(out_buffer); + if (!host_ptr && iree_any_bit_set(allowed_usage, + IREE_HAL_BUFFER_USAGE_MAPPING_PERSISTENT | + IREE_HAL_BUFFER_USAGE_MAPPING_SCOPED)) { + return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, + "mappable buffers require host pointers"); + } + IREE_TRACE_ZONE_BEGIN(z0); iree_hal_cuda_buffer_t* buffer = NULL; @@ -93,6 +100,7 @@ static iree_status_t iree_hal_cuda_buffer_map_range( ? IREE_HAL_BUFFER_USAGE_MAPPING_PERSISTENT : IREE_HAL_BUFFER_USAGE_MAPPING_SCOPED)); + IREE_ASSERT(buffer->host_ptr, "mappable buffers require host pointers"); uint8_t* data_ptr = (uint8_t*)(buffer->host_ptr) + local_byte_offset; // If we mapped for discard scribble over the bytes. This is not a mandated // behavior but it will make debugging issues easier. Alternatively for diff --git a/runtime/src/iree/hal/drivers/cuda/cuda_device.c b/runtime/src/iree/hal/drivers/cuda/cuda_device.c index cd6b3e4125f8..4aaba55ed5e1 100644 --- a/runtime/src/iree/hal/drivers/cuda/cuda_device.c +++ b/runtime/src/iree/hal/drivers/cuda/cuda_device.c @@ -560,7 +560,7 @@ static iree_status_t iree_hal_cuda_device_queue_alloca( // allocator is set on the device. iree_status_t status = iree_ok_status(); if (device->supports_memory_pools && - !iree_any_bit_set(params.access, IREE_HAL_MEMORY_TYPE_HOST_VISIBLE)) { + !iree_any_bit_set(params.type, IREE_HAL_MEMORY_TYPE_HOST_VISIBLE)) { status = iree_hal_cuda_memory_pools_alloca(&device->memory_pools, device->stream, pool, params, allocation_size, out_buffer);