diff --git a/include/hc.hpp b/include/hc.hpp
index 39ad5eed7b6..e29f6d737c6 100644
--- a/include/hc.hpp
+++ b/include/hc.hpp
@@ -247,15 +247,15 @@ class accelerator_view {
      * is marked ready.   Thus, markers provide a mechanism to enforce order between
      * commands in an execute_any_order accelerator_view.
      *
-     * release_scope controls the scope of the release fence applied after the marker executes.  Options are:
-     *   - no_scope : No release fence operation is performed.
-     *   - accelerator_scope: Memory is released to the accelerator scope where the marker executes.
-     *   - system_scope: Memory is released to system scope (all accelerators including CPUs)
+     * fence_scope controls the scope of the acquire and release fences applied after the marker executes.  Options are:
+     *   - no_scope : No fence operation is performed.
+     *   - accelerator_scope: Memory is acquired from and released to the accelerator scope where the marker executes.
+     *   - system_scope: Memory is acquired from and released to system scope (all accelerators including CPUs)
      *
      * @return A future which can be waited on, and will block until the
      *         current batch of commands has completed.
      */
-    completion_future create_marker(memory_scope release_scope=system_scope) const;
+    completion_future create_marker(memory_scope fence_scope=system_scope) const;
 
     /**
      * This command inserts a marker event into the accelerator_view's command
@@ -270,10 +270,10 @@ class accelerator_view {
      * is marked ready.   Thus, markers provide a mechanism to enforce order between
      * commands in an execute_any_order accelerator_view.
      *
-     * release_scope controls the scope of the release fence applied after the marker executes.  Options are:
-     *   - no_scope : No release fence operation is performed.
-     *   - accelerator_scope: Memory is released to the accelerator scope where the marker executes.
-     *   - system_scope: Memory is released to system scope (all accelerators including CPUs)
+     * fence_scope controls the scope of the acquire and release fences applied after the marker executes.  Options are:
+     *   - no_scope : No fence operation is performed.
+     *   - accelerator_scope: Memory is acquired from and released to the accelerator scope where the marker executes.
+     *   - system_scope: Memory is acquired from and released to system scope (all accelerators including CPUs)
      *
      * dependent_futures may be recorded in another queue or another accelerator.  If in another accelerator,
      * the runtime performs cross-accelerator sychronization.  
@@ -282,7 +282,7 @@ class accelerator_view {
      *         current batch of commands, plus the dependent event have
      *         been completed.
      */
-    completion_future create_blocking_marker(completion_future& dependent_future, memory_scope release_scope=system_scope) const;
+    completion_future create_blocking_marker(completion_future& dependent_future, memory_scope fence_scope=system_scope) const;
 
     /**
      * This command inserts a marker event into the accelerator_view's command
@@ -297,16 +297,16 @@ class accelerator_view {
      * is marked ready.   Thus, markers provide a mechanism to enforce order between
      * commands in an execute_any_order accelerator_view.
      *
-     * release_scope controls the scope of the release fence applied after the marker executes.  Options are:
-     *   - no_scope : No release fence operation is performed.
-     *   - accelerator_scope: Memory is released to the accelerator scope where the marker executes.
-     *   - system_scope: Memory is released to system scope (all accelerators including CPUs)
+     * fence_scope controls the scope of the acquire and release fences applied after the marker executes.  Options are:
+     *   - no_scope : No fence operation is performed.
+     *   - accelerator_scope: Memory is acquired from and released to the accelerator scope where the marker executes.
+     *   - system_scope: Memory is acquired from and released to system scope (all accelerators including CPUs)
      *
      * @return A future which can be waited on, and will block until the
      *         current batch of commands, plus the dependent event have
      *         been completed.
      */
-    completion_future create_blocking_marker(std::initializer_list<completion_future> dependent_future_list, memory_scope release_scope=system_scope) const;
+    completion_future create_blocking_marker(std::initializer_list<completion_future> dependent_future_list, memory_scope fence_scope=system_scope) const;
 
 
     /**
diff --git a/lib/hsa/mcwamp_hsa.cpp b/lib/hsa/mcwamp_hsa.cpp
index 865aa8c1d79..3d01665987f 100644
--- a/lib/hsa/mcwamp_hsa.cpp
+++ b/lib/hsa/mcwamp_hsa.cpp
@@ -784,6 +784,7 @@ class HSADispatch : public HSAOp {
     }
 
 
+	void overrideAcquireFenceIfNeeded();
     hsa_status_t setLaunchConfiguration(int dims, size_t *globalDims, size_t *localDims,
                                      int dynamicGroupSize);
 
@@ -1755,11 +1756,11 @@ class HSAQueue final : public KalmarQueue
     // depOps specifies the other ops that this marker will depend on.  These
     // can be in any queue on any GPU .
     //
-    // releaseScope specifies the scope of the release fence that will be
+    // fenceScope specifies the scope of the acquire and release fence that will be
     // applied after the marker executes.  See hc::memory_scope
     std::shared_ptr<KalmarAsyncOp> EnqueueMarkerWithDependency(int count,
             std::shared_ptr <KalmarAsyncOp> *depOps,
-            hc::memory_scope releaseScope) override {
+            hc::memory_scope fenceScope) override {
 
         hsa_status_t status = HSA_STATUS_SUCCESS;
 
@@ -1804,7 +1805,7 @@ class HSAQueue final : public KalmarQueue
             }
 
             // enqueue the barrier
-            status = barrier.get()->enqueueAsync(releaseScope);
+            status = barrier.get()->enqueueAsync(fenceScope);
             STATUS_CHECK(status, __LINE__);
 
 
@@ -3814,6 +3815,10 @@ HSAQueue::dispatch_hsa_kernel(const hsa_kernel_dispatch_packet_t *aql,
     Kalmar::HSADevice* device = static_cast<Kalmar::HSADevice*>(this->getDev());
     //HSADispatch *dispatch = new HSADispatch(device, nullptr, aql);
     std::shared_ptr<HSADispatch> sp_dispatch = std::make_shared<HSADispatch>(device, this/*queue*/, nullptr, aql);
+    if (HCC_OPT_FLUSH) {
+		sp_dispatch->overrideAcquireFenceIfNeeded();
+    }
+
     pushAsyncOp(sp_dispatch);
     HSADispatch *dispatch = sp_dispatch.get();
     dispatch->setKernelName(kernelName);
@@ -4228,6 +4233,15 @@ HSADispatch::getEndTimestamp() override {
     return time.end;
 }
 
+void HSADispatch::overrideAcquireFenceIfNeeded()
+{
+    if (hsaQueue()->nextKernelNeedsSysAcquire())  {
+       DBOUT( DB_CMD2, "  kernel AQL packet adding system-scope acquire\n");
+       // Pick up system acquire if needed.
+       aql.header |= ((HSA_FENCE_SCOPE_SYSTEM) << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) ;
+       hsaQueue()->setNextKernelNeedsSysAcquire(false);
+    } 
+}
 
 inline hsa_status_t
 HSADispatch::setLaunchConfiguration(int dims, size_t *globalDims, size_t *localDims,
@@ -4302,15 +4316,9 @@ HSADispatch::setLaunchConfiguration(int dims, size_t *globalDims, size_t *localD
 
     aql.header = 0;
     if (HCC_OPT_FLUSH) {
-        if (hsaQueue()->nextKernelNeedsSysAcquire())  {
-            DBOUT( DB_CMD2, "  kernel AQL packet adding system-scope acquire\n");
-            // Pick up system acquire if needed.
-            aql.header |= ((HSA_FENCE_SCOPE_SYSTEM) << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) ;
-            hsaQueue()->setNextKernelNeedsSysAcquire(false);
-        } else {
-            aql.header |= ((HSA_FENCE_SCOPE_AGENT) << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE);
-        }
-        aql.header |= ((HSA_FENCE_SCOPE_AGENT) << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
+        aql.header = ((HSA_FENCE_SCOPE_AGENT) << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
+                     ((HSA_FENCE_SCOPE_AGENT) << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
+		overrideAcquireFenceIfNeeded();
     } else {
         aql.header = ((HSA_FENCE_SCOPE_SYSTEM) << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
                      ((HSA_FENCE_SCOPE_SYSTEM) << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
@@ -4355,15 +4363,20 @@ HSABarrier::waitComplete() {
 
 // TODO - remove hsaQueue parm.
 inline hsa_status_t
-HSABarrier::enqueueAsync(hc::memory_scope releaseScope) {
+HSABarrier::enqueueAsync(hc::memory_scope fenceScope) {
 
     // extract hsa_queue_t from HSAQueue
     //
 
-    if (releaseScope == hc::system_scope) {
+    if (fenceScope == hc::system_scope) {
         hsaQueue()->setNextSyncNeedsSysRelease(false);
     };
 
+    if (fenceScope > _acquire_scope) {
+		DBOUT( DB_CMD2, "  marker overriding acquireScope(old:" << _acquire_scope << ") to match fenceScope = " << fenceScope);
+		_acquire_scope = fenceScope;
+    }
+
     // set acquire scope:
     unsigned fenceBits = 0;
 
@@ -4381,7 +4394,7 @@ HSABarrier::enqueueAsync(hc::memory_scope releaseScope) {
             STATUS_CHECK(HSA_STATUS_ERROR_INVALID_ARGUMENT, __LINE__);
     }
 
-    switch (releaseScope) {
+    switch (fenceScope) {
         case hc::no_scope:
             fenceBits |= ((HSA_FENCE_SCOPE_NONE) << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
             break;
@@ -4805,7 +4818,7 @@ HSACopy::enqueueAsyncCopyCommand(const Kalmar::HSADevice *copyDevice, const hc::
         hsa_signal_t depSignal;
         setCommandKind (resolveMemcpyDirection(srcPtrInfo._isInDeviceMem, dstPtrInfo._isInDeviceMem));
 
-        auto releaseScope = (hsaQueue()->nextSyncNeedsSysRelease()) ? hc::system_scope : hc::no_scope;
+        auto fenceScope = (hsaQueue()->nextSyncNeedsSysRelease()) ? hc::system_scope : hc::no_scope;
         depAsyncOp = std::static_pointer_cast<HSAOp> (hsaQueue()->detectStreamDeps(this->getCommandKind(), this));
 
         // We need to ensure the copy waits for preceding commands the HCC queue to complete, if those commands exist.
@@ -4820,11 +4833,11 @@ HSACopy::enqueueAsyncCopyCommand(const Kalmar::HSADevice *copyDevice, const hc::
             //      in streams that we depend on.
             // For both of these cases, we create an additional barrier packet in the source, and attach the desired fence.
             // Then we make the copy depend on the signal written by this command.
-            if ((depSignal.handle == 0x0) || (releaseScope != hc::no_scope)) {
+            if ((depSignal.handle == 0x0) || (fenceScope != hc::no_scope)) {
                 DBOUT( DB_CMD2, "  asyncCopy adding marker for needed dependency or release\n");
 
                 // Set depAsyncOp for use by the async copy below:
-                depAsyncOp = std::static_pointer_cast<HSAOp> (hsaQueue()->EnqueueMarkerWithDependency(0, nullptr, releaseScope));
+                depAsyncOp = std::static_pointer_cast<HSAOp> (hsaQueue()->EnqueueMarkerWithDependency(0, nullptr, fenceScope));
                 depSignal = * (static_cast <hsa_signal_t*> (depAsyncOp->getNativeHandle()));
             };
 
diff --git a/lib/rpt b/lib/rpt
index 35a73554760..303c23a0587 100755
--- a/lib/rpt
+++ b/lib/rpt
@@ -65,7 +65,7 @@ parser.add_argument('--roi_stop',
 
 
 parser.add_argument('--topn', 
-        type=int, nargs=1,   # TODO - fix this so it is int
+        type=int, 
         default=20,
         help="print top N entries in summary.  Default=10.  -1=all")