Skip to content

Commit

Permalink
Merge pull request #477 from RadeonOpenCompute/fix_event_fence
Browse files Browse the repository at this point in the history
Fix memory fence handling for barriers
  • Loading branch information
whchung authored Oct 1, 2017
2 parents 31bd221 + 06779af commit 3c314ed
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 34 deletions.
30 changes: 15 additions & 15 deletions include/hc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -247,15 +247,15 @@ class accelerator_view {
* is marked ready. Thus, markers provide a mechanism to enforce order between
* commands in an execute_any_order accelerator_view.
*
* release_scope controls the scope of the release fence applied after the marker executes. Options are:
* - no_scope : No release fence operation is performed.
* - accelerator_scope: Memory is released to the accelerator scope where the marker executes.
* - system_scope: Memory is released to system scope (all accelerators including CPUs)
* fence_scope controls the scope of the acquire and release fences applied after the marker executes. Options are:
* - no_scope : No fence operation is performed.
* - accelerator_scope: Memory is acquired from and released to the accelerator scope where the marker executes.
* - system_scope: Memory is acquired from and released to system scope (all accelerators including CPUs)
*
* @return A future which can be waited on, and will block until the
* current batch of commands has completed.
*/
completion_future create_marker(memory_scope release_scope=system_scope) const;
completion_future create_marker(memory_scope fence_scope=system_scope) const;

/**
* This command inserts a marker event into the accelerator_view's command
Expand All @@ -270,10 +270,10 @@ class accelerator_view {
* is marked ready. Thus, markers provide a mechanism to enforce order between
* commands in an execute_any_order accelerator_view.
*
* release_scope controls the scope of the release fence applied after the marker executes. Options are:
* - no_scope : No release fence operation is performed.
* - accelerator_scope: Memory is released to the accelerator scope where the marker executes.
* - system_scope: Memory is released to system scope (all accelerators including CPUs)
* fence_scope controls the scope of the acquire and release fences applied after the marker executes. Options are:
* - no_scope : No fence operation is performed.
* - accelerator_scope: Memory is acquired from and released to the accelerator scope where the marker executes.
* - system_scope: Memory is acquired from and released to system scope (all accelerators including CPUs)
*
* dependent_futures may be recorded in another queue or another accelerator. If in another accelerator,
* the runtime performs cross-accelerator sychronization.
Expand All @@ -282,7 +282,7 @@ class accelerator_view {
* current batch of commands, plus the dependent event have
* been completed.
*/
completion_future create_blocking_marker(completion_future& dependent_future, memory_scope release_scope=system_scope) const;
completion_future create_blocking_marker(completion_future& dependent_future, memory_scope fence_scope=system_scope) const;

/**
* This command inserts a marker event into the accelerator_view's command
Expand All @@ -297,16 +297,16 @@ class accelerator_view {
* is marked ready. Thus, markers provide a mechanism to enforce order between
* commands in an execute_any_order accelerator_view.
*
* release_scope controls the scope of the release fence applied after the marker executes. Options are:
* - no_scope : No release fence operation is performed.
* - accelerator_scope: Memory is released to the accelerator scope where the marker executes.
* - system_scope: Memory is released to system scope (all accelerators including CPUs)
* fence_scope controls the scope of the acquire and release fences applied after the marker executes. Options are:
* - no_scope : No fence operation is performed.
* - accelerator_scope: Memory is acquired from and released to the accelerator scope where the marker executes.
* - system_scope: Memory is acquired from and released to system scope (all accelerators including CPUs)
*
* @return A future which can be waited on, and will block until the
* current batch of commands, plus the dependent event have
* been completed.
*/
completion_future create_blocking_marker(std::initializer_list<completion_future> dependent_future_list, memory_scope release_scope=system_scope) const;
completion_future create_blocking_marker(std::initializer_list<completion_future> dependent_future_list, memory_scope fence_scope=system_scope) const;


/**
Expand Down
49 changes: 31 additions & 18 deletions lib/hsa/mcwamp_hsa.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -784,6 +784,7 @@ class HSADispatch : public HSAOp {
}


void overrideAcquireFenceIfNeeded();
hsa_status_t setLaunchConfiguration(int dims, size_t *globalDims, size_t *localDims,
int dynamicGroupSize);

Expand Down Expand Up @@ -1755,11 +1756,11 @@ class HSAQueue final : public KalmarQueue
// depOps specifies the other ops that this marker will depend on. These
// can be in any queue on any GPU .
//
// releaseScope specifies the scope of the release fence that will be
// fenceScope specifies the scope of the acquire and release fence that will be
// applied after the marker executes. See hc::memory_scope
std::shared_ptr<KalmarAsyncOp> EnqueueMarkerWithDependency(int count,
std::shared_ptr <KalmarAsyncOp> *depOps,
hc::memory_scope releaseScope) override {
hc::memory_scope fenceScope) override {

hsa_status_t status = HSA_STATUS_SUCCESS;

Expand Down Expand Up @@ -1804,7 +1805,7 @@ class HSAQueue final : public KalmarQueue
}

// enqueue the barrier
status = barrier.get()->enqueueAsync(releaseScope);
status = barrier.get()->enqueueAsync(fenceScope);
STATUS_CHECK(status, __LINE__);


Expand Down Expand Up @@ -3814,6 +3815,10 @@ HSAQueue::dispatch_hsa_kernel(const hsa_kernel_dispatch_packet_t *aql,
Kalmar::HSADevice* device = static_cast<Kalmar::HSADevice*>(this->getDev());
//HSADispatch *dispatch = new HSADispatch(device, nullptr, aql);
std::shared_ptr<HSADispatch> sp_dispatch = std::make_shared<HSADispatch>(device, this/*queue*/, nullptr, aql);
if (HCC_OPT_FLUSH) {
sp_dispatch->overrideAcquireFenceIfNeeded();
}

pushAsyncOp(sp_dispatch);
HSADispatch *dispatch = sp_dispatch.get();
dispatch->setKernelName(kernelName);
Expand Down Expand Up @@ -4228,6 +4233,15 @@ HSADispatch::getEndTimestamp() override {
return time.end;
}

void HSADispatch::overrideAcquireFenceIfNeeded()
{
if (hsaQueue()->nextKernelNeedsSysAcquire()) {
DBOUT( DB_CMD2, " kernel AQL packet adding system-scope acquire\n");
// Pick up system acquire if needed.
aql.header |= ((HSA_FENCE_SCOPE_SYSTEM) << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) ;
hsaQueue()->setNextKernelNeedsSysAcquire(false);
}
}

inline hsa_status_t
HSADispatch::setLaunchConfiguration(int dims, size_t *globalDims, size_t *localDims,
Expand Down Expand Up @@ -4302,15 +4316,9 @@ HSADispatch::setLaunchConfiguration(int dims, size_t *globalDims, size_t *localD

aql.header = 0;
if (HCC_OPT_FLUSH) {
if (hsaQueue()->nextKernelNeedsSysAcquire()) {
DBOUT( DB_CMD2, " kernel AQL packet adding system-scope acquire\n");
// Pick up system acquire if needed.
aql.header |= ((HSA_FENCE_SCOPE_SYSTEM) << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) ;
hsaQueue()->setNextKernelNeedsSysAcquire(false);
} else {
aql.header |= ((HSA_FENCE_SCOPE_AGENT) << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE);
}
aql.header |= ((HSA_FENCE_SCOPE_AGENT) << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
aql.header = ((HSA_FENCE_SCOPE_AGENT) << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
((HSA_FENCE_SCOPE_AGENT) << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
overrideAcquireFenceIfNeeded();
} else {
aql.header = ((HSA_FENCE_SCOPE_SYSTEM) << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
((HSA_FENCE_SCOPE_SYSTEM) << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
Expand Down Expand Up @@ -4355,15 +4363,20 @@ HSABarrier::waitComplete() {

// TODO - remove hsaQueue parm.
inline hsa_status_t
HSABarrier::enqueueAsync(hc::memory_scope releaseScope) {
HSABarrier::enqueueAsync(hc::memory_scope fenceScope) {

// extract hsa_queue_t from HSAQueue
//

if (releaseScope == hc::system_scope) {
if (fenceScope == hc::system_scope) {
hsaQueue()->setNextSyncNeedsSysRelease(false);
};

if (fenceScope > _acquire_scope) {
DBOUT( DB_CMD2, " marker overriding acquireScope(old:" << _acquire_scope << ") to match fenceScope = " << fenceScope);
_acquire_scope = fenceScope;
}

// set acquire scope:
unsigned fenceBits = 0;

Expand All @@ -4381,7 +4394,7 @@ HSABarrier::enqueueAsync(hc::memory_scope releaseScope) {
STATUS_CHECK(HSA_STATUS_ERROR_INVALID_ARGUMENT, __LINE__);
}

switch (releaseScope) {
switch (fenceScope) {
case hc::no_scope:
fenceBits |= ((HSA_FENCE_SCOPE_NONE) << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
break;
Expand Down Expand Up @@ -4805,7 +4818,7 @@ HSACopy::enqueueAsyncCopyCommand(const Kalmar::HSADevice *copyDevice, const hc::
hsa_signal_t depSignal;
setCommandKind (resolveMemcpyDirection(srcPtrInfo._isInDeviceMem, dstPtrInfo._isInDeviceMem));

auto releaseScope = (hsaQueue()->nextSyncNeedsSysRelease()) ? hc::system_scope : hc::no_scope;
auto fenceScope = (hsaQueue()->nextSyncNeedsSysRelease()) ? hc::system_scope : hc::no_scope;
depAsyncOp = std::static_pointer_cast<HSAOp> (hsaQueue()->detectStreamDeps(this->getCommandKind(), this));

// We need to ensure the copy waits for preceding commands the HCC queue to complete, if those commands exist.
Expand All @@ -4820,11 +4833,11 @@ HSACopy::enqueueAsyncCopyCommand(const Kalmar::HSADevice *copyDevice, const hc::
// in streams that we depend on.
// For both of these cases, we create an additional barrier packet in the source, and attach the desired fence.
// Then we make the copy depend on the signal written by this command.
if ((depSignal.handle == 0x0) || (releaseScope != hc::no_scope)) {
if ((depSignal.handle == 0x0) || (fenceScope != hc::no_scope)) {
DBOUT( DB_CMD2, " asyncCopy adding marker for needed dependency or release\n");

// Set depAsyncOp for use by the async copy below:
depAsyncOp = std::static_pointer_cast<HSAOp> (hsaQueue()->EnqueueMarkerWithDependency(0, nullptr, releaseScope));
depAsyncOp = std::static_pointer_cast<HSAOp> (hsaQueue()->EnqueueMarkerWithDependency(0, nullptr, fenceScope));
depSignal = * (static_cast <hsa_signal_t*> (depAsyncOp->getNativeHandle()));
};

Expand Down
2 changes: 1 addition & 1 deletion lib/rpt
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ parser.add_argument('--roi_stop',


parser.add_argument('--topn',
type=int, nargs=1, # TODO - fix this so it is int
type=int,
default=20,
help="print top N entries in summary. Default=10. -1=all")

Expand Down

0 comments on commit 3c314ed

Please sign in to comment.