WebAssembly · tlively · Feb 4, 2025 · Feb 1, 2025 · Feb 1, 2025 · Feb 1, 2025
diff --git a/src/passes/ConstantFieldPropagation.cpp b/src/passes/ConstantFieldPropagation.cpp
@@ -143,9 +143,11 @@ struct FunctionOptimizer : public WalkerPass<PostWalker<FunctionOptimizer>> {
   // If an optimized access is sequentially consistent, then it synchronizes
   // with other threads at least by participating in the global order of
   // sequentially consistent operations. Preserve that effect by replacing the
-  // access with a fence.
+  // access with a fence. On the other hand, if we're optimizing an
+  // acquire-release operation, then we know the accessed field is constant and
+  // will not be modified, so the operation does not necessarily synchronize
+  // with other threads and no fence is required.
   Block* maybeAddFence(Block* block, MemoryOrder order) {
-    assert(order != MemoryOrder::AcqRel);
     if (order == MemoryOrder::SeqCst) {
       block->list.push_back(Builder(*getModule()).makeAtomicFence());
     }
@@ -214,14 +216,6 @@ struct FunctionOptimizer : public WalkerPass<PostWalker<FunctionOptimizer>> {
       return;
     }
 
-    if (curr->order == MemoryOrder::AcqRel) {
-      // Removing an acquire get and preserving its synchronization properties
-      // would require inserting an acquire fence, but the fence would have
-      // stronger synchronization properties so might be more expensive.
-      // Instead, just skip the optimization.
-      return;
-    }
-
     // If the value is not a constant, then it is unknown and we must give up
     // on simply applying a constant. However, we can try to use a ref.test, if
     // that is allowed.
@@ -259,11 +253,6 @@ struct FunctionOptimizer : public WalkerPass<PostWalker<FunctionOptimizer>> {
     PossibleConstantValues info = getInfo(heapType, curr->index);
     assert(info.hasNoted() && "unexpected lack of info for RMW");
 
-    if (curr->order == MemoryOrder::AcqRel) {
-      // See comment on visitStructGet for why we don't optimize here.
-      return std::nullopt;
-    }
-
     if (!info.isConstant()) {
       // Optimizing using ref.test is not an option here because that only works
       // on immutable fields, but RMW operations always access mutable fields.

diff --git a/test/lit/passes/cfp-rmw.wast b/test/lit/passes/cfp-rmw.wast
@@ -107,14 +107,19 @@
   )
 
   ;; CHECK:      (func $rmw-xchg-acqrel (type $1) (param $0 (ref $A)) (result i32)
-  ;; CHECK-NEXT:  (struct.atomic.rmw.xchg acqrel acqrel $A 0
-  ;; CHECK-NEXT:   (local.get $0)
+  ;; CHECK-NEXT:  (drop
+  ;; CHECK-NEXT:   (ref.as_non_null
+  ;; CHECK-NEXT:    (local.get $0)
+  ;; CHECK-NEXT:   )
+  ;; CHECK-NEXT:  )
+  ;; CHECK-NEXT:  (drop
   ;; CHECK-NEXT:   (i32.const 0)
   ;; CHECK-NEXT:  )
+  ;; CHECK-NEXT:  (i32.const 0)
   ;; CHECK-NEXT: )
   (func $rmw-xchg-acqrel (param (ref $A)) (result i32)
-    ;; Making the accesses acqrel instead of seqcst means that the replacement
-    ;; fence could be more expensive than the original op, so we don't optimize.
+    ;; Making the accesses acqrel instead of seqcst means that we don't need a
+    ;; fence when we optimize.
     (struct.atomic.rmw.xchg acqrel acqrel $A 0
       (local.get 0)
       (i32.const 0)
@@ -220,16 +225,26 @@
   )
 
   ;; CHECK:      (func $rmw-xchg-copy-acqrel (type $1) (param $0 (ref $A)) (param $1 (ref $A)) (result i32)
-  ;; CHECK-NEXT:  (struct.atomic.rmw.xchg acqrel acqrel $A 0
-  ;; CHECK-NEXT:   (local.get $0)
-  ;; CHECK-NEXT:   (struct.atomic.get acqrel $A 0
-  ;; CHECK-NEXT:    (local.get $1)
+  ;; CHECK-NEXT:  (drop
+  ;; CHECK-NEXT:   (ref.as_non_null
+  ;; CHECK-NEXT:    (local.get $0)
+  ;; CHECK-NEXT:   )
+  ;; CHECK-NEXT:  )
+  ;; CHECK-NEXT:  (drop
+  ;; CHECK-NEXT:   (block (result i32)
+  ;; CHECK-NEXT:    (drop
+  ;; CHECK-NEXT:     (ref.as_non_null
+  ;; CHECK-NEXT:      (local.get $1)
+  ;; CHECK-NEXT:     )
+  ;; CHECK-NEXT:    )
+  ;; CHECK-NEXT:    (i32.const 0)
   ;; CHECK-NEXT:   )
   ;; CHECK-NEXT:  )
+  ;; CHECK-NEXT:  (i32.const 0)
   ;; CHECK-NEXT: )
   (func $rmw-xchg-copy-acqrel (param (ref $A) (ref $A)) (result i32)
-    ;; Making the accesses acqrel instead of seqcst means that the replacement
-    ;; fence could be more expensive than the original op, so we don't optimize.
+    ;; Making the accesses acqrel instead of seqcst means that we don't need a
+    ;; fence when we optimize.
     (struct.atomic.rmw.xchg acqrel acqrel $A 0
       (local.get 0)
       (struct.atomic.get acqrel $A 0
@@ -511,15 +526,22 @@
   )
 
   ;; CHECK:      (func $rmw-cmpxchg-acqrel (type $1) (param $0 (ref $A)) (param $1 i32) (result i32)
-  ;; CHECK-NEXT:  (struct.atomic.rmw.cmpxchg acqrel acqrel $A 0
-  ;; CHECK-NEXT:   (local.get $0)
+  ;; CHECK-NEXT:  (drop
+  ;; CHECK-NEXT:   (ref.as_non_null
+  ;; CHECK-NEXT:    (local.get $0)
+  ;; CHECK-NEXT:   )
+  ;; CHECK-NEXT:  )
+  ;; CHECK-NEXT:  (drop
   ;; CHECK-NEXT:   (local.get $1)
+  ;; CHECK-NEXT:  )
+  ;; CHECK-NEXT:  (drop
   ;; CHECK-NEXT:   (i32.const 0)
   ;; CHECK-NEXT:  )
+  ;; CHECK-NEXT:  (i32.const 0)
   ;; CHECK-NEXT: )
   (func $rmw-cmpxchg-acqrel (param (ref $A) i32) (result i32)
-    ;; Making the accesses acqrel instead of seqcst means that the replacement
-    ;; fence could be more expensive than the original op, so we don't optimize.
+    ;; Acqrel accesses to constant fields do not synchronize with anything, so
+    ;; we can optimize without fences.
     (struct.atomic.rmw.cmpxchg acqrel acqrel $A 0
       (local.get 0)
       (local.get 1)
@@ -626,15 +648,22 @@
   )
 
   ;; CHECK:      (func $rmw-cmpxchg-acqrel (type $1) (param $0 (ref $A)) (param $1 i32) (result i32)
-  ;; CHECK-NEXT:  (struct.atomic.rmw.cmpxchg acqrel acqrel $A 0
-  ;; CHECK-NEXT:   (local.get $0)
+  ;; CHECK-NEXT:  (drop
+  ;; CHECK-NEXT:   (ref.as_non_null
+  ;; CHECK-NEXT:    (local.get $0)
+  ;; CHECK-NEXT:   )
+  ;; CHECK-NEXT:  )
+  ;; CHECK-NEXT:  (drop
   ;; CHECK-NEXT:   (local.get $1)
+  ;; CHECK-NEXT:  )
+  ;; CHECK-NEXT:  (drop
   ;; CHECK-NEXT:   (i32.const 0)
   ;; CHECK-NEXT:  )
+  ;; CHECK-NEXT:  (i32.const 0)
   ;; CHECK-NEXT: )
   (func $rmw-cmpxchg-acqrel (param (ref $A) i32) (result i32)
-    ;; Making the accesses acqrel instead of seqcst means that the replacement
-    ;; fence could be more expensive than the original op, so we don't optimize.
+    ;; Acqrel accesses to constant fields do not synchronize with anything, so
+    ;; we can optimize without fences.
     (struct.atomic.rmw.cmpxchg acqrel acqrel $A 0
       (local.get 0)
       (local.get 1)

diff --git a/test/lit/passes/cfp.wast b/test/lit/passes/cfp.wast
@@ -2870,8 +2870,13 @@
   ;; CHECK-NEXT:   )
   ;; CHECK-NEXT:  )
   ;; CHECK-NEXT:  (drop
-  ;; CHECK-NEXT:   (struct.atomic.get acqrel $shared 0
-  ;; CHECK-NEXT:    (local.get $0)
+  ;; CHECK-NEXT:   (block (result i32)
+  ;; CHECK-NEXT:    (drop
+  ;; CHECK-NEXT:     (ref.as_non_null
+  ;; CHECK-NEXT:      (local.get $0)
+  ;; CHECK-NEXT:     )
+  ;; CHECK-NEXT:    )
+  ;; CHECK-NEXT:    (i32.const 0)
   ;; CHECK-NEXT:   )
   ;; CHECK-NEXT:  )
   ;; CHECK-NEXT:  (drop
@@ -2893,8 +2898,7 @@
       )
     )
     (drop
-      ;; This is not optimized because we wouldn't want to replace it with a
-      ;; stronger acquire fence.
+      ;; This can be optimized and does not require a fence.
       (struct.atomic.get acqrel $shared 0
         (local.get 0)
       )