From 06779af4e1bc9663d0dd3936fbfff15bf0c55d32 Mon Sep 17 00:00:00 2001 From: sunway513 Date: Sat, 30 Sep 2017 15:30:52 +0000 Subject: [PATCH] Fix memory fence handling for barriers - release_scope now fence_scope and controls both acquire and release fence. - override aquire fence on dispatch_hsa_kernel path, if needed. --- include/hc.hpp | 30 +++++++++++++------------- lib/hsa/mcwamp_hsa.cpp | 49 ++++++++++++++++++++++++++---------------- lib/rpt | 2 +- 3 files changed, 47 insertions(+), 34 deletions(-) diff --git a/include/hc.hpp b/include/hc.hpp index 39ad5eed7b6..e29f6d737c6 100644 --- a/include/hc.hpp +++ b/include/hc.hpp @@ -247,15 +247,15 @@ class accelerator_view { * is marked ready. Thus, markers provide a mechanism to enforce order between * commands in an execute_any_order accelerator_view. * - * release_scope controls the scope of the release fence applied after the marker executes. Options are: - * - no_scope : No release fence operation is performed. - * - accelerator_scope: Memory is released to the accelerator scope where the marker executes. - * - system_scope: Memory is released to system scope (all accelerators including CPUs) + * fence_scope controls the scope of the acquire and release fences applied after the marker executes. Options are: + * - no_scope : No fence operation is performed. + * - accelerator_scope: Memory is acquired from and released to the accelerator scope where the marker executes. + * - system_scope: Memory is acquired from and released to system scope (all accelerators including CPUs) * * @return A future which can be waited on, and will block until the * current batch of commands has completed. */ - completion_future create_marker(memory_scope release_scope=system_scope) const; + completion_future create_marker(memory_scope fence_scope=system_scope) const; /** * This command inserts a marker event into the accelerator_view's command @@ -270,10 +270,10 @@ class accelerator_view { * is marked ready. Thus, markers provide a mechanism to enforce order between * commands in an execute_any_order accelerator_view. * - * release_scope controls the scope of the release fence applied after the marker executes. Options are: - * - no_scope : No release fence operation is performed. - * - accelerator_scope: Memory is released to the accelerator scope where the marker executes. - * - system_scope: Memory is released to system scope (all accelerators including CPUs) + * fence_scope controls the scope of the acquire and release fences applied after the marker executes. Options are: + * - no_scope : No fence operation is performed. + * - accelerator_scope: Memory is acquired from and released to the accelerator scope where the marker executes. + * - system_scope: Memory is acquired from and released to system scope (all accelerators including CPUs) * * dependent_futures may be recorded in another queue or another accelerator. If in another accelerator, * the runtime performs cross-accelerator sychronization. @@ -282,7 +282,7 @@ class accelerator_view { * current batch of commands, plus the dependent event have * been completed. */ - completion_future create_blocking_marker(completion_future& dependent_future, memory_scope release_scope=system_scope) const; + completion_future create_blocking_marker(completion_future& dependent_future, memory_scope fence_scope=system_scope) const; /** * This command inserts a marker event into the accelerator_view's command @@ -297,16 +297,16 @@ class accelerator_view { * is marked ready. Thus, markers provide a mechanism to enforce order between * commands in an execute_any_order accelerator_view. * - * release_scope controls the scope of the release fence applied after the marker executes. Options are: - * - no_scope : No release fence operation is performed. - * - accelerator_scope: Memory is released to the accelerator scope where the marker executes. - * - system_scope: Memory is released to system scope (all accelerators including CPUs) + * fence_scope controls the scope of the acquire and release fences applied after the marker executes. Options are: + * - no_scope : No fence operation is performed. + * - accelerator_scope: Memory is acquired from and released to the accelerator scope where the marker executes. + * - system_scope: Memory is acquired from and released to system scope (all accelerators including CPUs) * * @return A future which can be waited on, and will block until the * current batch of commands, plus the dependent event have * been completed. */ - completion_future create_blocking_marker(std::initializer_list dependent_future_list, memory_scope release_scope=system_scope) const; + completion_future create_blocking_marker(std::initializer_list dependent_future_list, memory_scope fence_scope=system_scope) const; /** diff --git a/lib/hsa/mcwamp_hsa.cpp b/lib/hsa/mcwamp_hsa.cpp index 865aa8c1d79..3d01665987f 100644 --- a/lib/hsa/mcwamp_hsa.cpp +++ b/lib/hsa/mcwamp_hsa.cpp @@ -784,6 +784,7 @@ class HSADispatch : public HSAOp { } + void overrideAcquireFenceIfNeeded(); hsa_status_t setLaunchConfiguration(int dims, size_t *globalDims, size_t *localDims, int dynamicGroupSize); @@ -1755,11 +1756,11 @@ class HSAQueue final : public KalmarQueue // depOps specifies the other ops that this marker will depend on. These // can be in any queue on any GPU . // - // releaseScope specifies the scope of the release fence that will be + // fenceScope specifies the scope of the acquire and release fence that will be // applied after the marker executes. See hc::memory_scope std::shared_ptr EnqueueMarkerWithDependency(int count, std::shared_ptr *depOps, - hc::memory_scope releaseScope) override { + hc::memory_scope fenceScope) override { hsa_status_t status = HSA_STATUS_SUCCESS; @@ -1804,7 +1805,7 @@ class HSAQueue final : public KalmarQueue } // enqueue the barrier - status = barrier.get()->enqueueAsync(releaseScope); + status = barrier.get()->enqueueAsync(fenceScope); STATUS_CHECK(status, __LINE__); @@ -3814,6 +3815,10 @@ HSAQueue::dispatch_hsa_kernel(const hsa_kernel_dispatch_packet_t *aql, Kalmar::HSADevice* device = static_cast(this->getDev()); //HSADispatch *dispatch = new HSADispatch(device, nullptr, aql); std::shared_ptr sp_dispatch = std::make_shared(device, this/*queue*/, nullptr, aql); + if (HCC_OPT_FLUSH) { + sp_dispatch->overrideAcquireFenceIfNeeded(); + } + pushAsyncOp(sp_dispatch); HSADispatch *dispatch = sp_dispatch.get(); dispatch->setKernelName(kernelName); @@ -4228,6 +4233,15 @@ HSADispatch::getEndTimestamp() override { return time.end; } +void HSADispatch::overrideAcquireFenceIfNeeded() +{ + if (hsaQueue()->nextKernelNeedsSysAcquire()) { + DBOUT( DB_CMD2, " kernel AQL packet adding system-scope acquire\n"); + // Pick up system acquire if needed. + aql.header |= ((HSA_FENCE_SCOPE_SYSTEM) << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) ; + hsaQueue()->setNextKernelNeedsSysAcquire(false); + } +} inline hsa_status_t HSADispatch::setLaunchConfiguration(int dims, size_t *globalDims, size_t *localDims, @@ -4302,15 +4316,9 @@ HSADispatch::setLaunchConfiguration(int dims, size_t *globalDims, size_t *localD aql.header = 0; if (HCC_OPT_FLUSH) { - if (hsaQueue()->nextKernelNeedsSysAcquire()) { - DBOUT( DB_CMD2, " kernel AQL packet adding system-scope acquire\n"); - // Pick up system acquire if needed. - aql.header |= ((HSA_FENCE_SCOPE_SYSTEM) << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) ; - hsaQueue()->setNextKernelNeedsSysAcquire(false); - } else { - aql.header |= ((HSA_FENCE_SCOPE_AGENT) << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE); - } - aql.header |= ((HSA_FENCE_SCOPE_AGENT) << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); + aql.header = ((HSA_FENCE_SCOPE_AGENT) << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | + ((HSA_FENCE_SCOPE_AGENT) << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); + overrideAcquireFenceIfNeeded(); } else { aql.header = ((HSA_FENCE_SCOPE_SYSTEM) << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | ((HSA_FENCE_SCOPE_SYSTEM) << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); @@ -4355,15 +4363,20 @@ HSABarrier::waitComplete() { // TODO - remove hsaQueue parm. inline hsa_status_t -HSABarrier::enqueueAsync(hc::memory_scope releaseScope) { +HSABarrier::enqueueAsync(hc::memory_scope fenceScope) { // extract hsa_queue_t from HSAQueue // - if (releaseScope == hc::system_scope) { + if (fenceScope == hc::system_scope) { hsaQueue()->setNextSyncNeedsSysRelease(false); }; + if (fenceScope > _acquire_scope) { + DBOUT( DB_CMD2, " marker overriding acquireScope(old:" << _acquire_scope << ") to match fenceScope = " << fenceScope); + _acquire_scope = fenceScope; + } + // set acquire scope: unsigned fenceBits = 0; @@ -4381,7 +4394,7 @@ HSABarrier::enqueueAsync(hc::memory_scope releaseScope) { STATUS_CHECK(HSA_STATUS_ERROR_INVALID_ARGUMENT, __LINE__); } - switch (releaseScope) { + switch (fenceScope) { case hc::no_scope: fenceBits |= ((HSA_FENCE_SCOPE_NONE) << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); break; @@ -4805,7 +4818,7 @@ HSACopy::enqueueAsyncCopyCommand(const Kalmar::HSADevice *copyDevice, const hc:: hsa_signal_t depSignal; setCommandKind (resolveMemcpyDirection(srcPtrInfo._isInDeviceMem, dstPtrInfo._isInDeviceMem)); - auto releaseScope = (hsaQueue()->nextSyncNeedsSysRelease()) ? hc::system_scope : hc::no_scope; + auto fenceScope = (hsaQueue()->nextSyncNeedsSysRelease()) ? hc::system_scope : hc::no_scope; depAsyncOp = std::static_pointer_cast (hsaQueue()->detectStreamDeps(this->getCommandKind(), this)); // We need to ensure the copy waits for preceding commands the HCC queue to complete, if those commands exist. @@ -4820,11 +4833,11 @@ HSACopy::enqueueAsyncCopyCommand(const Kalmar::HSADevice *copyDevice, const hc:: // in streams that we depend on. // For both of these cases, we create an additional barrier packet in the source, and attach the desired fence. // Then we make the copy depend on the signal written by this command. - if ((depSignal.handle == 0x0) || (releaseScope != hc::no_scope)) { + if ((depSignal.handle == 0x0) || (fenceScope != hc::no_scope)) { DBOUT( DB_CMD2, " asyncCopy adding marker for needed dependency or release\n"); // Set depAsyncOp for use by the async copy below: - depAsyncOp = std::static_pointer_cast (hsaQueue()->EnqueueMarkerWithDependency(0, nullptr, releaseScope)); + depAsyncOp = std::static_pointer_cast (hsaQueue()->EnqueueMarkerWithDependency(0, nullptr, fenceScope)); depSignal = * (static_cast (depAsyncOp->getNativeHandle())); }; diff --git a/lib/rpt b/lib/rpt index 35a73554760..303c23a0587 100755 --- a/lib/rpt +++ b/lib/rpt @@ -65,7 +65,7 @@ parser.add_argument('--roi_stop', parser.add_argument('--topn', - type=int, nargs=1, # TODO - fix this so it is int + type=int, default=20, help="print top N entries in summary. Default=10. -1=all")