diff --git a/lib/hsa/mcwamp_hsa.cpp b/lib/hsa/mcwamp_hsa.cpp
index 6d902203616..b3b30621d1a 100644
--- a/lib/hsa/mcwamp_hsa.cpp
+++ b/lib/hsa/mcwamp_hsa.cpp
@@ -61,7 +61,10 @@
 // kernel dispatch speed optimization flags
 /////////////////////////////////////////////////
 
-// size of default kernarg buffer in the kernarg pool in HSAContext
+// Size, in bytes, of each default kernarg buffer in the HSAContext kernarg pool.
+// Increased from 512 to 4096 to match CUDA's 4 KB kernel-parameter limit; see
+// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#function-parameters
+// Kernels whose arguments exceed this size fall back to slow on-demand allocation.
 #define KERNARG_BUFFER_SIZE (4096)
 
 // number of pre-allocated kernarg buffers in HSAContext
@@ -75,7 +78,9 @@
 // MUST be a power of 2.
 #define MAX_INFLIGHT_COMMANDS_PER_QUEUE  (2*8192)
 
-// threshold to clean up finished kernel in HSAQueue.asyncOps
+// Threshold for cleaning up finished kernels in HSAQueue.asyncOps.
+// Reduced from 16k to 1k at the same time KERNARG_BUFFER_SIZE was
+// increased, in order to offset the resulting increase in memory pressure.
 int HCC_ASYNCOPS_SIZE = (1024);