From 575ca6744b755f75799c1d092f56953e776a80a6 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad@amd.com>
Date: Mon, 11 Mar 2024 15:36:22 +0000
Subject: [PATCH 01/95] [CodeGen] Remove unused MachineRegisterInfo methods

---
 llvm/include/llvm/CodeGen/MachineRegisterInfo.h |  9 ---------
 llvm/lib/CodeGen/MachineRegisterInfo.cpp        | 12 ------------
 2 files changed, 21 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
index 3f0fc160f9ea4..09d9a0b4ec402 100644
--- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -243,15 +243,6 @@ class MachineRegisterInfo {
   /// Returns true if the updated CSR list was initialized and false otherwise.
   bool isUpdatedCSRsInitialized() const { return IsUpdatedCSRsInitialized; }
 
-  /// Returns true if a register can be used as an argument to a function.
-  bool isArgumentRegister(MCRegister Reg) const;
-
-  /// Returns true if a register is a fixed register.
-  bool isFixedRegister(MCRegister Reg) const;
-
-  /// Returns true if a register is a general purpose register.
-  bool isGeneralPurposeRegister(MCRegister Reg) const;
-
   /// Disables the register from the list of CSRs.
   /// I.e. the register will not appear as part of the CSR mask.
   /// \see UpdatedCalleeSavedRegs.
diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
index 55d7c8370e9c4..b0c1838b3ff0e 100644
--- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
@@ -659,15 +659,3 @@ bool MachineRegisterInfo::isReservedRegUnit(unsigned Unit) const {
   }
   return false;
 }
-
-bool MachineRegisterInfo::isArgumentRegister(MCRegister Reg) const {
-  return getTargetRegisterInfo()->isArgumentRegister(*MF, Reg);
-}
-
-bool MachineRegisterInfo::isFixedRegister(MCRegister Reg) const {
-  return getTargetRegisterInfo()->isFixedRegister(*MF, Reg);
-}
-
-bool MachineRegisterInfo::isGeneralPurposeRegister(MCRegister Reg) const {
-  return getTargetRegisterInfo()->isGeneralPurposeRegister(*MF, Reg);
-}

From a924da6d4b8733e5bf08098b18dd7ad1a5ba5f46 Mon Sep 17 00:00:00 2001
From: Marius Brehler <marius.brehler@iml.fraunhofer.de>
Date: Mon, 11 Mar 2024 16:47:06 +0100
Subject: [PATCH 02/95] [mlir][IR] Add `isInteger()` (without width) (#84467)

For signless and signed integers, overloads already exist so that the
width does not need to be specified as an argument. This adds the same
for integers without checking for signedness.
---
 mlir/include/mlir/IR/Types.h | 3 ++-
 mlir/lib/IR/Types.cpp        | 2 ++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/mlir/include/mlir/IR/Types.h b/mlir/include/mlir/IR/Types.h
index 46bb733101c12..a89e13b625bf4 100644
--- a/mlir/include/mlir/IR/Types.h
+++ b/mlir/include/mlir/IR/Types.h
@@ -133,7 +133,8 @@ class Type {
   bool isF80() const;
   bool isF128() const;
 
-  /// Return true if this is an integer type with the specified width.
+  /// Return true if this is an integer type (with the specified width).
+  bool isInteger() const;
   bool isInteger(unsigned width) const;
   /// Return true if this is a signless integer type (with the specified width).
   bool isSignlessInteger() const;
diff --git a/mlir/lib/IR/Types.cpp b/mlir/lib/IR/Types.cpp
index 32dfef9e81049..1d1ba6df4db2f 100644
--- a/mlir/lib/IR/Types.cpp
+++ b/mlir/lib/IR/Types.cpp
@@ -55,6 +55,8 @@ bool Type::isF128() const { return llvm::isa<Float128Type>(*this); }
 
 bool Type::isIndex() const { return llvm::isa<IndexType>(*this); }
 
+bool Type::isInteger() const { return llvm::isa<IntegerType>(*this); }
+
 /// Return true if this is an integer type with the specified width.
 bool Type::isInteger(unsigned width) const {
   if (auto intTy = llvm::dyn_cast<IntegerType>(*this))

From 0858c906db008e02163e159158c082d9fc82dcca Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Mon, 11 Mar 2024 14:18:58 +0000
Subject: [PATCH 03/95] [X86] Add missing register qualifier to the
 VBLENDVPD/VBLENDVPS/VPBLENDVB instruction names

Matches the SSE variants (which have a 0 qualifier to indicate the xmm0 explicit dependency)
---
 llvm/lib/Target/X86/X86FastISel.cpp           |  2 +-
 llvm/lib/Target/X86/X86InstrSSE.td            | 48 +++++++++----------
 llvm/lib/Target/X86/X86SchedAlderlakeP.td     |  8 ++--
 llvm/lib/Target/X86/X86SchedSapphireRapids.td | 14 +++---
 llvm/test/TableGen/x86-fold-tables.inc        | 12 ++---
 5 files changed, 42 insertions(+), 42 deletions(-)

diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp
index 9f0b5f32df20a..48d3b68b1823a 100644
--- a/llvm/lib/Target/X86/X86FastISel.cpp
+++ b/llvm/lib/Target/X86/X86FastISel.cpp
@@ -2230,7 +2230,7 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) {
     unsigned CmpOpcode =
       (RetVT == MVT::f32) ? X86::VCMPSSrri : X86::VCMPSDrri;
     unsigned BlendOpcode =
-      (RetVT == MVT::f32) ? X86::VBLENDVPSrr : X86::VBLENDVPDrr;
+      (RetVT == MVT::f32) ? X86::VBLENDVPSrrr : X86::VBLENDVPDrrr;
 
     Register CmpReg = fastEmitInst_rri(CmpOpcode, RC, CmpLHSReg, CmpRHSReg,
                                        CC);
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 4a542b7e5a1bb..69d45366a1dbc 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -6266,27 +6266,27 @@ multiclass SS41I_quaternary_avx<bits<8> opc, string OpcodeStr, RegisterClass RC,
                                 X86MemOperand x86memop, ValueType VT,
                                 PatFrag mem_frag, SDNode OpNode,
                                 X86FoldableSchedWrite sched> {
-  def rr : Ii8Reg<opc, MRMSrcReg, (outs RC:$dst),
-                  (ins RC:$src1, RC:$src2, RC:$src3),
-                  !strconcat(OpcodeStr,
-                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
-                  [(set RC:$dst, (VT (OpNode RC:$src3, RC:$src2, RC:$src1)))],
-                  SSEPackedInt>, TA, PD, VEX, VVVV,
-                Sched<[sched]>;
+  def rrr : Ii8Reg<opc, MRMSrcReg, (outs RC:$dst),
+                   (ins RC:$src1, RC:$src2, RC:$src3),
+                   !strconcat(OpcodeStr,
+                     "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+                   [(set RC:$dst, (VT (OpNode RC:$src3, RC:$src2, RC:$src1)))],
+                   SSEPackedInt>, TA, PD, VEX, VVVV,
+                 Sched<[sched]>;
 
-  def rm : Ii8Reg<opc, MRMSrcMem, (outs RC:$dst),
-                  (ins RC:$src1, x86memop:$src2, RC:$src3),
-                  !strconcat(OpcodeStr,
-                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
-                  [(set RC:$dst,
-                        (OpNode RC:$src3, (mem_frag addr:$src2),
-                                RC:$src1))], SSEPackedInt>, TA, PD, VEX, VVVV,
-                Sched<[sched.Folded, sched.ReadAfterFold,
-                       // x86memop:$src2
-                       ReadDefault, ReadDefault, ReadDefault, ReadDefault,
-                       ReadDefault,
-                       // RC::$src3
-                       sched.ReadAfterFold]>;
+  def rmr : Ii8Reg<opc, MRMSrcMem, (outs RC:$dst),
+                   (ins RC:$src1, x86memop:$src2, RC:$src3),
+                   !strconcat(OpcodeStr,
+                     "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+                   [(set RC:$dst,
+                         (OpNode RC:$src3, (mem_frag addr:$src2),
+                                 RC:$src1))], SSEPackedInt>, TA, PD, VEX, VVVV,
+                 Sched<[sched.Folded, sched.ReadAfterFold,
+                        // x86memop:$src2
+                        ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+                        ReadDefault,
+                        // RC::$src3
+                        sched.ReadAfterFold]>;
 }
 
 let Predicates = [HasAVX] in {
@@ -6320,16 +6320,16 @@ defm VPBLENDVBY : SS41I_quaternary_avx<0x4C, "vpblendvb", VR256, i256mem,
 let Predicates = [HasAVX] in {
   def : Pat<(v4i32 (X86Blendv (v4i32 VR128:$mask), (v4i32 VR128:$src1),
                               (v4i32 VR128:$src2))),
-            (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
+            (VBLENDVPSrrr VR128:$src2, VR128:$src1, VR128:$mask)>;
   def : Pat<(v2i64 (X86Blendv (v2i64 VR128:$mask), (v2i64 VR128:$src1),
                               (v2i64 VR128:$src2))),
-            (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
+            (VBLENDVPDrrr VR128:$src2, VR128:$src1, VR128:$mask)>;
   def : Pat<(v8i32 (X86Blendv (v8i32 VR256:$mask), (v8i32 VR256:$src1),
                               (v8i32 VR256:$src2))),
-            (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
+            (VBLENDVPSYrrr VR256:$src2, VR256:$src1, VR256:$mask)>;
   def : Pat<(v4i64 (X86Blendv (v4i64 VR256:$mask), (v4i64 VR256:$src1),
                               (v4i64 VR256:$src2))),
-            (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
+            (VBLENDVPDYrrr VR256:$src2, VR256:$src1, VR256:$mask)>;
 }
 
 // Prefer a movss or movsd over a blendps when optimizing for size. these were
diff --git a/llvm/lib/Target/X86/X86SchedAlderlakeP.td b/llvm/lib/Target/X86/X86SchedAlderlakeP.td
index 4dc5ea3c86112..6f9d2cf7ffdf4 100644
--- a/llvm/lib/Target/X86/X86SchedAlderlakeP.td
+++ b/llvm/lib/Target/X86/X86SchedAlderlakeP.td
@@ -2158,16 +2158,16 @@ def ADLPWriteResGroup244 : SchedWriteRes<[ADLPPort00_01_05, ADLPPort02_03_11]> {
   let Latency = 9;
   let NumMicroOps = 4;
 }
-def : InstRW<[ADLPWriteResGroup244, ReadAfterVecXLd, ReadAfterVecXLd, ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault], (instregex "^VBLENDVP(D|S)rm$")>;
-def : InstRW<[ADLPWriteResGroup244, ReadAfterVecXLd, ReadAfterVecXLd, ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault], (instrs VPBLENDVBrm)>;
+def : InstRW<[ADLPWriteResGroup244, ReadAfterVecXLd, ReadAfterVecXLd, ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault], (instregex "^VBLENDVP(D|S)rmr$")>;
+def : InstRW<[ADLPWriteResGroup244, ReadAfterVecXLd, ReadAfterVecXLd, ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault], (instrs VPBLENDVBrmr)>;
 
 def ADLPWriteResGroup245 : SchedWriteRes<[ADLPPort00_01_05]> {
   let ReleaseAtCycles = [3];
   let Latency = 3;
   let NumMicroOps = 3;
 }
-def : InstRW<[ADLPWriteResGroup245], (instregex "^VBLENDVP(D|S)rr$")>;
-def : InstRW<[ADLPWriteResGroup245], (instrs VPBLENDVBrr)>;
+def : InstRW<[ADLPWriteResGroup245], (instregex "^VBLENDVP(D|S)rrr$")>;
+def : InstRW<[ADLPWriteResGroup245], (instrs VPBLENDVBrrr)>;
 
 def ADLPWriteResGroup246 : SchedWriteRes<[ADLPPort00, ADLPPort01, ADLPPort02_03_11]> {
   let ReleaseAtCycles = [6, 7, 18];
diff --git a/llvm/lib/Target/X86/X86SchedSapphireRapids.td b/llvm/lib/Target/X86/X86SchedSapphireRapids.td
index 3c698d2c9f7a0..88bb9ad8f1d74 100644
--- a/llvm/lib/Target/X86/X86SchedSapphireRapids.td
+++ b/llvm/lib/Target/X86/X86SchedSapphireRapids.td
@@ -2673,25 +2673,25 @@ def SPRWriteResGroup259 : SchedWriteRes<[SPRPort00_01_05, SPRPort02_03_11]> {
   let Latency = 10;
   let NumMicroOps = 4;
 }
-def : InstRW<[SPRWriteResGroup259, ReadAfterVecYLd, ReadAfterVecYLd, ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault], (instregex "^VBLENDVP(D|S)Yrm$")>;
-def : InstRW<[SPRWriteResGroup259, ReadAfterVecYLd, ReadAfterVecYLd, ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault], (instrs VPBLENDVBYrm)>;
+def : InstRW<[SPRWriteResGroup259, ReadAfterVecYLd, ReadAfterVecYLd, ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault], (instregex "^VBLENDVP(D|S)Yrmr$")>;
+def : InstRW<[SPRWriteResGroup259, ReadAfterVecYLd, ReadAfterVecYLd, ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault], (instrs VPBLENDVBYrmr)>;
 
 def SPRWriteResGroup260 : SchedWriteRes<[SPRPort00_01_05]> {
   let ReleaseAtCycles = [3];
   let Latency = 3;
   let NumMicroOps = 3;
 }
-def : InstRW<[SPRWriteResGroup260], (instregex "^VBLENDVP(S|DY)rr$",
-                                               "^VBLENDVP(D|SY)rr$",
-                                               "^VPBLENDVB(Y?)rr$")>;
+def : InstRW<[SPRWriteResGroup260], (instregex "^VBLENDVP(S|DY)rrr$",
+                                               "^VBLENDVP(D|SY)rrr$",
+                                               "^VPBLENDVB(Y?)rrr$")>;
 
 def SPRWriteResGroup261 : SchedWriteRes<[SPRPort00_01_05, SPRPort02_03_11]> {
   let ReleaseAtCycles = [3, 1];
   let Latency = 9;
   let NumMicroOps = 4;
 }
-def : InstRW<[SPRWriteResGroup261, ReadAfterVecXLd, ReadAfterVecXLd, ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault], (instregex "^VBLENDVP(D|S)rm$")>;
-def : InstRW<[SPRWriteResGroup261, ReadAfterVecXLd, ReadAfterVecXLd, ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault], (instrs VPBLENDVBrm)>;
+def : InstRW<[SPRWriteResGroup261, ReadAfterVecXLd, ReadAfterVecXLd, ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault], (instregex "^VBLENDVP(D|S)rmr$")>;
+def : InstRW<[SPRWriteResGroup261, ReadAfterVecXLd, ReadAfterVecXLd, ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault], (instrs VPBLENDVBrmr)>;
 
 def SPRWriteResGroup262 : SchedWriteRes<[SPRPort00_01_05, SPRPort02_03_11]> {
   let Latency = 9;
diff --git a/llvm/test/TableGen/x86-fold-tables.inc b/llvm/test/TableGen/x86-fold-tables.inc
index e0fccd42e47f7..eea4f87cae9ce 100644
--- a/llvm/test/TableGen/x86-fold-tables.inc
+++ b/llvm/test/TableGen/x86-fold-tables.inc
@@ -2363,10 +2363,10 @@ static const X86FoldTableEntry Table2[] = {
   {X86::VBLENDPDrri, X86::VBLENDPDrmi, 0},
   {X86::VBLENDPSYrri, X86::VBLENDPSYrmi, 0},
   {X86::VBLENDPSrri, X86::VBLENDPSrmi, 0},
-  {X86::VBLENDVPDYrr, X86::VBLENDVPDYrm, 0},
-  {X86::VBLENDVPDrr, X86::VBLENDVPDrm, 0},
-  {X86::VBLENDVPSYrr, X86::VBLENDVPSYrm, 0},
-  {X86::VBLENDVPSrr, X86::VBLENDVPSrm, 0},
+  {X86::VBLENDVPDYrrr, X86::VBLENDVPDYrmr, 0},
+  {X86::VBLENDVPDrrr, X86::VBLENDVPDrmr, 0},
+  {X86::VBLENDVPSYrrr, X86::VBLENDVPSYrmr, 0},
+  {X86::VBLENDVPSrrr, X86::VBLENDVPSrmr, 0},
   {X86::VBROADCASTF32X2Z256rrkz, X86::VBROADCASTF32X2Z256rmkz, TB_NO_REVERSE},
   {X86::VBROADCASTF32X2Zrrkz, X86::VBROADCASTF32X2Zrmkz, TB_NO_REVERSE},
   {X86::VBROADCASTI32X2Z128rrkz, X86::VBROADCASTI32X2Z128rmkz, TB_NO_REVERSE},
@@ -3042,8 +3042,8 @@ static const X86FoldTableEntry Table2[] = {
   {X86::VPBLENDMWZ128rr, X86::VPBLENDMWZ128rm, 0},
   {X86::VPBLENDMWZ256rr, X86::VPBLENDMWZ256rm, 0},
   {X86::VPBLENDMWZrr, X86::VPBLENDMWZrm, 0},
-  {X86::VPBLENDVBYrr, X86::VPBLENDVBYrm, 0},
-  {X86::VPBLENDVBrr, X86::VPBLENDVBrm, 0},
+  {X86::VPBLENDVBYrrr, X86::VPBLENDVBYrmr, 0},
+  {X86::VPBLENDVBrrr, X86::VPBLENDVBrmr, 0},
   {X86::VPBLENDWYrri, X86::VPBLENDWYrmi, 0},
   {X86::VPBLENDWrri, X86::VPBLENDWrmi, 0},
   {X86::VPBROADCASTBZ128rrkz, X86::VPBROADCASTBZ128rmkz, TB_NO_REVERSE},

From ad8c8281363261929b53b0a519cd20e9e2445343 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Mon, 11 Mar 2024 14:36:46 +0000
Subject: [PATCH 04/95] [X86] (V)MPSADBW instructions can run on Port1 or Port5
 for one uop stage

When we copied the IceLake model from the SkylakeServer model we missed this diff

Confirmed with uops.info and Agner
---
 llvm/lib/Target/X86/X86SchedIceLake.td                 |  6 +++---
 .../tools/llvm-mca/X86/IceLakeServer/resources-avx1.s  | 10 +++++-----
 .../tools/llvm-mca/X86/IceLakeServer/resources-avx2.s  | 10 +++++-----
 .../tools/llvm-mca/X86/IceLakeServer/resources-sse41.s | 10 +++++-----
 4 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Target/X86/X86SchedIceLake.td b/llvm/lib/Target/X86/X86SchedIceLake.td
index c9ae9901ed5b8..3981279abc363 100644
--- a/llvm/lib/Target/X86/X86SchedIceLake.td
+++ b/llvm/lib/Target/X86/X86SchedIceLake.td
@@ -402,9 +402,9 @@ defm : ICXWriteResPair<WriteBlendZ,[ICXPort15], 1, [1], 1, 7>;
 defm : ICXWriteResPair<WriteVarBlend, [ICXPort015], 2, [2], 2, 6>; // Vector variable blends.
 defm : ICXWriteResPair<WriteVarBlendY,[ICXPort015], 2, [2], 2, 6>;
 defm : ICXWriteResPair<WriteVarBlendZ,[ICXPort05],  2, [1], 1, 6>;
-defm : ICXWriteResPair<WriteMPSAD,   [ICXPort5], 4, [2], 2, 6>; // Vector MPSAD.
-defm : ICXWriteResPair<WriteMPSADY,  [ICXPort5], 4, [2], 2, 7>;
-defm : ICXWriteResPair<WriteMPSADZ,  [ICXPort5], 4, [2], 2, 7>;
+defm : ICXWriteResPair<WriteMPSAD,   [ICXPort15,ICXPort5], 4, [1,1], 2, 6>; // Vector MPSAD.
+defm : ICXWriteResPair<WriteMPSADY,  [ICXPort15,ICXPort5], 4, [1,1], 2, 7>;
+defm : ICXWriteResPair<WriteMPSADZ,  [ICXPort15,ICXPort5], 4, [1,1], 2, 7>;
 defm : ICXWriteResPair<WritePSADBW,  [ICXPort5], 3, [1], 1, 5>; // Vector PSADBW.
 defm : ICXWriteResPair<WritePSADBWX, [ICXPort5], 3, [1], 1, 6>;
 defm : ICXWriteResPair<WritePSADBWY, [ICXPort5], 3, [1], 1, 7>;
diff --git a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx1.s b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx1.s
index e467c4e48ebd2..f184d5579d06e 100644
--- a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx1.s
+++ b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx1.s
@@ -1337,8 +1337,8 @@ vzeroupper
 # CHECK-NEXT:  1      1     0.33                        vmovups	%ymm0, %ymm2
 # CHECK-NEXT:  2      1     0.50           *            vmovups	%ymm0, (%rax)
 # CHECK-NEXT:  1      7     0.50    *                   vmovups	(%rax), %ymm2
-# CHECK-NEXT:  2      4     2.00                        vmpsadbw	$1, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  3      10    2.00    *                   vmpsadbw	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      4     1.00                        vmpsadbw	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  3      10    1.00    *                   vmpsadbw	$1, (%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      4     0.50                        vmulpd	%xmm0, %xmm1, %xmm2
 # CHECK-NEXT:  2      10    0.50    *                   vmulpd	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      4     0.50                        vmulpd	%ymm0, %ymm1, %ymm2
@@ -1738,7 +1738,7 @@ vzeroupper
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]
-# CHECK-NEXT:  -     126.00 322.92 232.92 160.50 160.50 19.00  296.92 6.25   19.00  19.00  19.00
+# CHECK-NEXT:  -     126.00 322.92 233.92 160.50 160.50 19.00  295.92 6.25   19.00  19.00  19.00
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   Instructions:
@@ -2049,8 +2049,8 @@ vzeroupper
 # CHECK-NEXT:  -      -     0.33   0.33    -      -      -     0.33    -      -      -      -     vmovups	%ymm0, %ymm2
 # CHECK-NEXT:  -      -      -      -      -      -     0.50    -      -     0.50   0.50   0.50   vmovups	%ymm0, (%rax)
 # CHECK-NEXT:  -      -      -      -     0.50   0.50    -      -      -      -      -      -     vmovups	(%rax), %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -     2.00    -      -      -      -     vmpsadbw	$1, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -     0.50   0.50    -     2.00    -      -      -      -     vmpsadbw	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -      -      -     1.50    -      -      -      -     vmpsadbw	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50   0.50   0.50    -     1.50    -      -      -      -     vmpsadbw	$1, (%rax), %xmm1, %xmm2
 # CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -      -      -      -      -     vmulpd	%xmm0, %xmm1, %xmm2
 # CHECK-NEXT:  -      -     0.50   0.50   0.50   0.50    -      -      -      -      -      -     vmulpd	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -      -      -      -      -     vmulpd	%ymm0, %ymm1, %ymm2
diff --git a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx2.s b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx2.s
index 97f0d052f4552..dcf883445ba4e 100644
--- a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx2.s
+++ b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx2.s
@@ -476,8 +476,8 @@ vpxor           (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      3     1.00                        vinserti128	$1, %xmm0, %ymm1, %ymm2
 # CHECK-NEXT:  2      7     0.50    *                   vinserti128	$1, (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      7     0.50    *                   vmovntdqa	(%rax), %ymm0
-# CHECK-NEXT:  2      4     2.00                        vmpsadbw	$1, %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  3      11    2.00    *                   vmpsadbw	$1, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  2      4     1.00                        vmpsadbw	$1, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  3      11    1.00    *                   vmpsadbw	$1, (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      1     0.50                        vpabsb	%ymm0, %ymm2
 # CHECK-NEXT:  2      8     0.50    *                   vpabsb	(%rax), %ymm2
 # CHECK-NEXT:  1      1     0.50                        vpabsd	%ymm0, %ymm2
@@ -778,7 +778,7 @@ vpxor           (%rax), %ymm1, %ymm2
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]
-# CHECK-NEXT:  -      -     110.33 103.33 98.00  98.00  2.50   150.33  -     2.50   2.50   2.50
+# CHECK-NEXT:  -      -     110.33 104.33 98.00  98.00  2.50   149.33  -     2.50   2.50   2.50
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   Instructions:
@@ -798,8 +798,8 @@ vpxor           (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -      -      -      -     vinserti128	$1, %xmm0, %ymm1, %ymm2
 # CHECK-NEXT:  -      -     0.33   0.33   0.50   0.50    -     0.33    -      -      -      -     vinserti128	$1, (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  -      -      -      -     0.50   0.50    -      -      -      -      -      -     vmovntdqa	(%rax), %ymm0
-# CHECK-NEXT:  -      -      -      -      -      -      -     2.00    -      -      -      -     vmpsadbw	$1, %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -     0.50   0.50    -     2.00    -      -      -      -     vmpsadbw	$1, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -     0.50    -      -      -     1.50    -      -      -      -     vmpsadbw	$1, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -     0.50   0.50   0.50    -     1.50    -      -      -      -     vmpsadbw	$1, (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -      -      -      -      -     vpabsb	%ymm0, %ymm2
 # CHECK-NEXT:  -      -     0.50   0.50   0.50   0.50    -      -      -      -      -      -     vpabsb	(%rax), %ymm2
 # CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -      -      -      -      -     vpabsd	%ymm0, %ymm2
diff --git a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-sse41.s b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-sse41.s
index 554d7aad54bad..05c208b1c622b 100644
--- a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-sse41.s
+++ b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-sse41.s
@@ -172,8 +172,8 @@ roundss     $1, (%rax), %xmm2
 # CHECK-NEXT:  1      1     1.00                        insertps	$1, %xmm0, %xmm2
 # CHECK-NEXT:  2      7     1.00    *                   insertps	$1, (%rax), %xmm2
 # CHECK-NEXT:  1      6     0.50    *                   movntdqa	(%rax), %xmm2
-# CHECK-NEXT:  2      4     2.00                        mpsadbw	$1, %xmm0, %xmm2
-# CHECK-NEXT:  3      10    2.00    *                   mpsadbw	$1, (%rax), %xmm2
+# CHECK-NEXT:  2      4     1.00                        mpsadbw	$1, %xmm0, %xmm2
+# CHECK-NEXT:  3      10    1.00    *                   mpsadbw	$1, (%rax), %xmm2
 # CHECK-NEXT:  1      3     1.00                        packusdw	%xmm0, %xmm2
 # CHECK-NEXT:  2      10    1.00    *                   packusdw	(%rax), %xmm2
 # CHECK-NEXT:  2      2     0.67                        pblendvb	%xmm0, %xmm0, %xmm2
@@ -268,7 +268,7 @@ roundss     $1, (%rax), %xmm2
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]
-# CHECK-NEXT:  -      -     36.67  41.67  22.00  22.00  2.50   53.67   -     2.50   2.50   2.50
+# CHECK-NEXT:  -      -     36.67  42.67  22.00  22.00  2.50   52.67   -     2.50   2.50   2.50
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   Instructions:
@@ -289,8 +289,8 @@ roundss     $1, (%rax), %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -      -      -      -     insertps	$1, %xmm0, %xmm2
 # CHECK-NEXT:  -      -      -      -     0.50   0.50    -     1.00    -      -      -      -     insertps	$1, (%rax), %xmm2
 # CHECK-NEXT:  -      -      -      -     0.50   0.50    -      -      -      -      -      -     movntdqa	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -     2.00    -      -      -      -     mpsadbw	$1, %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -     0.50   0.50    -     2.00    -      -      -      -     mpsadbw	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -      -      -     1.50    -      -      -      -     mpsadbw	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50   0.50   0.50    -     1.50    -      -      -      -     mpsadbw	$1, (%rax), %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -      -      -      -     packusdw	%xmm0, %xmm2
 # CHECK-NEXT:  -      -      -      -     0.50   0.50    -     1.00    -      -      -      -     packusdw	(%rax), %xmm2
 # CHECK-NEXT:  -      -     0.67   0.67    -      -      -     0.67    -      -      -      -     pblendvb	%xmm0, %xmm0, %xmm2

From 2c93beccdf8e026534a737eddaf8f5f26f3a23c3 Mon Sep 17 00:00:00 2001
From: Cyndy Ishida <cyndy_ishida@apple.com>
Date: Mon, 11 Mar 2024 09:02:43 -0700
Subject: [PATCH 05/95] [InstallAPI] Collect C++ Decls (#84403)

This includes capturing symbols for global variables, functions,
classes, and templated definitions. As pre-determining what symbols are
generated from C++ declarations can be non-trivial, InstallAPI only
parses select declarations for symbol generation when parsing C++.

For example, InstallAPI only looks at explicit template instantiations
or full template specializations, instead of general function or class
templates, for symbol emission.
---
 clang/include/clang/InstallAPI/Visitor.h |  14 +
 clang/lib/InstallAPI/Frontend.cpp        |   4 +-
 clang/lib/InstallAPI/Visitor.cpp         | 426 +++++++++++++++++-
 clang/test/InstallAPI/cpp.test           | 530 +++++++++++++++++++++++
 clang/tools/clang-installapi/Options.cpp |  33 +-
 clang/tools/clang-installapi/Options.h   |   7 +
 6 files changed, 1008 insertions(+), 6 deletions(-)
 create mode 100644 clang/test/InstallAPI/cpp.test

diff --git a/clang/include/clang/InstallAPI/Visitor.h b/clang/include/clang/InstallAPI/Visitor.h
index 71d4d9894f420..9ac948ded3e33 100644
--- a/clang/include/clang/InstallAPI/Visitor.h
+++ b/clang/include/clang/InstallAPI/Visitor.h
@@ -21,6 +21,7 @@
 #include "llvm/ADT/Twine.h"
 
 namespace clang {
+struct AvailabilityInfo;
 namespace installapi {
 
 /// ASTVisitor for collecting declarations that represent global symbols.
@@ -33,6 +34,7 @@ class InstallAPIVisitor final : public ASTConsumer,
         MC(ItaniumMangleContext::create(ASTCtx, ASTCtx.getDiagnostics())),
         Layout(ASTCtx.getTargetInfo().getDataLayoutString()) {}
   void HandleTranslationUnit(ASTContext &ASTCtx) override;
+  bool shouldVisitTemplateInstantiations() const { return true; }
 
   /// Collect global variables.
   bool VisitVarDecl(const VarDecl *D);
@@ -51,9 +53,19 @@ class InstallAPIVisitor final : public ASTConsumer,
   /// is therefore itself not collected.
   bool VisitObjCCategoryDecl(const ObjCCategoryDecl *D);
 
+  /// Collect global c++ declarations.
+  bool VisitCXXRecordDecl(const CXXRecordDecl *D);
+
 private:
   std::string getMangledName(const NamedDecl *D) const;
   std::string getBackendMangledName(llvm::Twine Name) const;
+  std::string getMangledCXXVTableName(const CXXRecordDecl *D) const;
+  std::string getMangledCXXThunk(const GlobalDecl &D,
+                                 const ThunkInfo &Thunk) const;
+  std::string getMangledCXXRTTI(const CXXRecordDecl *D) const;
+  std::string getMangledCXXRTTIName(const CXXRecordDecl *D) const;
+  std::string getMangledCtorDtor(const CXXMethodDecl *D, int Type) const;
+
   std::optional<HeaderType> getAccessForDecl(const NamedDecl *D) const;
   void recordObjCInstanceVariables(
       const ASTContext &ASTCtx, llvm::MachO::ObjCContainerRecord *Record,
@@ -61,6 +73,8 @@ class InstallAPIVisitor final : public ASTConsumer,
       const llvm::iterator_range<
           DeclContext::specific_decl_iterator<ObjCIvarDecl>>
           Ivars);
+  void emitVTableSymbols(const CXXRecordDecl *D, const AvailabilityInfo &Avail,
+                         const HeaderType Access, bool EmittedVTable = false);
 
   InstallAPIContext &Ctx;
   SourceManager &SrcMgr;
diff --git a/clang/lib/InstallAPI/Frontend.cpp b/clang/lib/InstallAPI/Frontend.cpp
index 1edbdf5bb9836..0d526fe1da666 100644
--- a/clang/lib/InstallAPI/Frontend.cpp
+++ b/clang/lib/InstallAPI/Frontend.cpp
@@ -137,9 +137,9 @@ std::unique_ptr<MemoryBuffer> createInputBuffer(InstallAPIContext &Ctx) {
     else
       OS << "#import ";
     if (H.useIncludeName())
-      OS << "<" << H.getIncludeName() << ">";
+      OS << "<" << H.getIncludeName() << ">\n";
     else
-      OS << "\"" << H.getPath() << "\"";
+      OS << "\"" << H.getPath() << "\"\n";
 
     Ctx.addKnownHeader(H);
   }
diff --git a/clang/lib/InstallAPI/Visitor.cpp b/clang/lib/InstallAPI/Visitor.cpp
index 1f2ef08e5aa25..aded94f7a94a3 100644
--- a/clang/lib/InstallAPI/Visitor.cpp
+++ b/clang/lib/InstallAPI/Visitor.cpp
@@ -7,7 +7,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/InstallAPI/Visitor.h"
+#include "clang/AST/Availability.h"
 #include "clang/AST/ParentMapContext.h"
+#include "clang/AST/VTableBuilder.h"
 #include "clang/Basic/Linkage.h"
 #include "clang/InstallAPI/Frontend.h"
 #include "llvm/ADT/SmallString.h"
@@ -18,6 +20,15 @@
 using namespace llvm;
 using namespace llvm::MachO;
 
+namespace {
+enum class CXXLinkage {
+  ExternalLinkage,
+  LinkOnceODRLinkage,
+  WeakODRLinkage,
+  PrivateLinkage,
+};
+}
+
 namespace clang::installapi {
 
 // Exported NamedDecl needs to have external linkage and
@@ -53,7 +64,7 @@ static bool isInlined(const FunctionDecl *D) {
   return true;
 }
 
-static SymbolFlags getFlags(bool WeakDef, bool ThreadLocal) {
+static SymbolFlags getFlags(bool WeakDef, bool ThreadLocal = false) {
   SymbolFlags Result = SymbolFlags::None;
   if (WeakDef)
     Result |= SymbolFlags::WeakDefined;
@@ -277,8 +288,417 @@ bool InstallAPIVisitor::VisitFunctionDecl(const FunctionDecl *D) {
                                     ? RecordLinkage::Internal
                                     : RecordLinkage::Exported;
   Ctx.Slice->addGlobal(Name, Linkage, GlobalRecord::Kind::Function, Avail, D,
-                       *Access, getFlags(WeakDef, /*ThreadLocal=*/false),
-                       Inlined);
+                       *Access, getFlags(WeakDef), Inlined);
+  return true;
+}
+
+static bool hasVTable(const CXXRecordDecl *D) {
+  // Check if vtable symbols should be emitted; only dynamic classes need
+  // vtables.
+  if (!D->hasDefinition() || !D->isDynamicClass())
+    return false;
+
+  assert(D->isExternallyVisible() && "Should be externally visible");
+  assert(D->isCompleteDefinition() && "Only works on complete definitions");
+
+  const CXXMethodDecl *KeyFunctionD =
+      D->getASTContext().getCurrentKeyFunction(D);
+  // If this class has a key function, then there is a vtable, possibly internal
+  // though.
+  if (KeyFunctionD) {
+    switch (KeyFunctionD->getTemplateSpecializationKind()) {
+    case TSK_Undeclared:
+    case TSK_ExplicitSpecialization:
+    case TSK_ImplicitInstantiation:
+    case TSK_ExplicitInstantiationDefinition:
+      return true;
+    case TSK_ExplicitInstantiationDeclaration:
+      llvm_unreachable(
+          "Unexpected TemplateSpecializationKind for key function");
+    }
+  } else if (D->isAbstract()) {
+    // If the class is abstract and it doesn't have a key function, it is a
+    // 'pure' virtual class. It doesn't need a vtable.
+    return false;
+  }
+
+  switch (D->getTemplateSpecializationKind()) {
+  case TSK_Undeclared:
+  case TSK_ExplicitSpecialization:
+  case TSK_ImplicitInstantiation:
+    return false;
+
+  case TSK_ExplicitInstantiationDeclaration:
+  case TSK_ExplicitInstantiationDefinition:
+    return true;
+  }
+
+  llvm_unreachable("Invalid TemplateSpecializationKind!");
+}
+
+static CXXLinkage getVTableLinkage(const CXXRecordDecl *D) {
+  assert((D->hasDefinition() && D->isDynamicClass()) && "Record has no vtable");
+  assert(D->isExternallyVisible() && "Record should be externally visible");
+  if (D->getVisibility() == HiddenVisibility)
+    return CXXLinkage::PrivateLinkage;
+
+  const CXXMethodDecl *KeyFunctionD =
+      D->getASTContext().getCurrentKeyFunction(D);
+  if (KeyFunctionD) {
+    // If this class has a key function, use that to determine the
+    // linkage of the vtable.
+    switch (KeyFunctionD->getTemplateSpecializationKind()) {
+    case TSK_Undeclared:
+    case TSK_ExplicitSpecialization:
+      if (isInlined(KeyFunctionD))
+        return CXXLinkage::LinkOnceODRLinkage;
+      return CXXLinkage::ExternalLinkage;
+    case TSK_ImplicitInstantiation:
+      llvm_unreachable("No external vtable for implicit instantiations");
+    case TSK_ExplicitInstantiationDefinition:
+      return CXXLinkage::WeakODRLinkage;
+    case TSK_ExplicitInstantiationDeclaration:
+      llvm_unreachable(
+          "Unexpected TemplateSpecializationKind for key function");
+    }
+  }
+
+  switch (D->getTemplateSpecializationKind()) {
+  case TSK_Undeclared:
+  case TSK_ExplicitSpecialization:
+  case TSK_ImplicitInstantiation:
+    return CXXLinkage::LinkOnceODRLinkage;
+  case TSK_ExplicitInstantiationDeclaration:
+  case TSK_ExplicitInstantiationDefinition:
+    return CXXLinkage::WeakODRLinkage;
+  }
+
+  llvm_unreachable("Invalid TemplateSpecializationKind!");
+}
+
+static bool isRTTIWeakDef(const CXXRecordDecl *D) {
+  if (D->hasAttr<WeakAttr>())
+    return true;
+
+  if (D->isAbstract() && D->getASTContext().getCurrentKeyFunction(D) == nullptr)
+    return true;
+
+  if (D->isDynamicClass())
+    return getVTableLinkage(D) != CXXLinkage::ExternalLinkage;
+
+  return false;
+}
+
+static bool hasRTTI(const CXXRecordDecl *D) {
+  if (!D->getASTContext().getLangOpts().RTTI)
+    return false;
+
+  if (!D->hasDefinition())
+    return false;
+
+  if (!D->isDynamicClass())
+    return false;
+
+  // Don't emit weak-def RTTI information. InstallAPI cannot reliably determine
+  // if the final binary will have those weak defined RTTI symbols. This depends
+  // on the optimization level and if the class has been instantiated and used.
+  //
+  // Luckily, the Apple static linker doesn't need those weak defined RTTI
+  // symbols for linking. They are only needed by the runtime linker. That means
+  // they can be safely dropped.
+  if (isRTTIWeakDef(D))
+    return false;
+
+  return true;
+}
+
+std::string
+InstallAPIVisitor::getMangledCXXRTTIName(const CXXRecordDecl *D) const {
+  SmallString<256> Name;
+  raw_svector_ostream NameStream(Name);
+  MC->mangleCXXRTTIName(QualType(D->getTypeForDecl(), 0), NameStream);
+
+  return getBackendMangledName(Name);
+}
+
+std::string InstallAPIVisitor::getMangledCXXRTTI(const CXXRecordDecl *D) const {
+  SmallString<256> Name;
+  raw_svector_ostream NameStream(Name);
+  MC->mangleCXXRTTI(QualType(D->getTypeForDecl(), 0), NameStream);
+
+  return getBackendMangledName(Name);
+}
+
+std::string
+InstallAPIVisitor::getMangledCXXVTableName(const CXXRecordDecl *D) const {
+  SmallString<256> Name;
+  raw_svector_ostream NameStream(Name);
+  MC->mangleCXXVTable(D, NameStream);
+
+  return getBackendMangledName(Name);
+}
+
+std::string
+InstallAPIVisitor::getMangledCXXThunk(const GlobalDecl &D,
+                                      const ThunkInfo &Thunk) const {
+  SmallString<256> Name;
+  raw_svector_ostream NameStream(Name);
+  const auto *Method = cast<CXXMethodDecl>(D.getDecl());
+  if (const auto *Dtor = dyn_cast<CXXDestructorDecl>(Method))
+    MC->mangleCXXDtorThunk(Dtor, D.getDtorType(), Thunk.This, NameStream);
+  else
+    MC->mangleThunk(Method, Thunk, NameStream);
+
+  return getBackendMangledName(Name);
+}
+
+std::string InstallAPIVisitor::getMangledCtorDtor(const CXXMethodDecl *D,
+                                                  int Type) const {
+  SmallString<256> Name;
+  raw_svector_ostream NameStream(Name);
+  GlobalDecl GD;
+  if (const auto *Ctor = dyn_cast<CXXConstructorDecl>(D))
+    GD = GlobalDecl(Ctor, CXXCtorType(Type));
+  else {
+    const auto *Dtor = cast<CXXDestructorDecl>(D);
+    GD = GlobalDecl(Dtor, CXXDtorType(Type));
+  }
+  MC->mangleName(GD, NameStream);
+  return getBackendMangledName(Name);
+}
+
+void InstallAPIVisitor::emitVTableSymbols(const CXXRecordDecl *D,
+                                          const AvailabilityInfo &Avail,
+                                          const HeaderType Access,
+                                          bool EmittedVTable) {
+  if (hasVTable(D)) {
+    EmittedVTable = true;
+    const CXXLinkage VTableLinkage = getVTableLinkage(D);
+    if (VTableLinkage == CXXLinkage::ExternalLinkage ||
+        VTableLinkage == CXXLinkage::WeakODRLinkage) {
+      const std::string Name = getMangledCXXVTableName(D);
+      const bool WeakDef = VTableLinkage == CXXLinkage::WeakODRLinkage;
+      Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
+                           GlobalRecord::Kind::Variable, Avail, D, Access,
+                           getFlags(WeakDef));
+      if (!D->getDescribedClassTemplate() && !D->isInvalidDecl()) {
+        VTableContextBase *VTable = D->getASTContext().getVTableContext();
+        auto AddThunk = [&](GlobalDecl GD) {
+          const ItaniumVTableContext::ThunkInfoVectorTy *Thunks =
+              VTable->getThunkInfo(GD);
+          if (!Thunks)
+            return;
+
+          for (const auto &Thunk : *Thunks) {
+            const std::string Name = getMangledCXXThunk(GD, Thunk);
+            Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
+                                 GlobalRecord::Kind::Function, Avail,
+                                 GD.getDecl(), Access);
+          }
+        };
+
+        for (const auto *Method : D->methods()) {
+          if (isa<CXXConstructorDecl>(Method) || !Method->isVirtual())
+            continue;
+
+          if (auto Dtor = dyn_cast<CXXDestructorDecl>(Method)) {
+            // Skip default destructor.
+            if (Dtor->isDefaulted())
+              continue;
+            AddThunk({Dtor, Dtor_Deleting});
+            AddThunk({Dtor, Dtor_Complete});
+          } else
+            AddThunk(Method);
+        }
+      }
+    }
+  }
+
+  if (!EmittedVTable)
+    return;
+
+  if (hasRTTI(D)) {
+    std::string Name = getMangledCXXRTTI(D);
+    Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
+                         GlobalRecord::Kind::Variable, Avail, D, Access);
+
+    Name = getMangledCXXRTTIName(D);
+    Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
+                         GlobalRecord::Kind::Variable, Avail, D, Access);
+  }
+
+  for (const auto &It : D->bases()) {
+    const CXXRecordDecl *Base =
+        cast<CXXRecordDecl>(It.getType()->castAs<RecordType>()->getDecl());
+    const auto BaseAccess = getAccessForDecl(Base);
+    if (!BaseAccess)
+      continue;
+    const AvailabilityInfo BaseAvail = AvailabilityInfo::createFromDecl(Base);
+    emitVTableSymbols(Base, BaseAvail, *BaseAccess, /*EmittedVTable=*/true);
+  }
+}
+
+bool InstallAPIVisitor::VisitCXXRecordDecl(const CXXRecordDecl *D) {
+  if (!D->isCompleteDefinition())
+    return true;
+
+  // Skip templated classes.
+  if (D->getDescribedClassTemplate() != nullptr)
+    return true;
+
+  // Skip partial templated classes too.
+  if (isa<ClassTemplatePartialSpecializationDecl>(D))
+    return true;
+
+  auto Access = getAccessForDecl(D);
+  if (!Access)
+    return true;
+  const AvailabilityInfo Avail = AvailabilityInfo::createFromDecl(D);
+
+  // Check whether to emit the vtable/rtti symbols.
+  if (isExported(D))
+    emitVTableSymbols(D, Avail, *Access);
+
+  TemplateSpecializationKind ClassSK = TSK_Undeclared;
+  bool KeepInlineAsWeak = false;
+  if (auto *Templ = dyn_cast<ClassTemplateSpecializationDecl>(D)) {
+    ClassSK = Templ->getTemplateSpecializationKind();
+    if (ClassSK == TSK_ExplicitInstantiationDeclaration)
+      KeepInlineAsWeak = true;
+  }
+
+  // Record the class methods.
+  for (const auto *M : D->methods()) {
+    // Inlined methods are usually not emitted, except when the class is an
+    // explicit template instantiation declaration (recorded as weak-defined).
+    bool WeakDef = false;
+    if (isInlined(M)) {
+      if (!KeepInlineAsWeak)
+        continue;
+
+      WeakDef = true;
+    }
+
+    if (!isExported(M))
+      continue;
+
+    switch (M->getTemplateSpecializationKind()) {
+    case TSK_Undeclared:
+    case TSK_ExplicitSpecialization:
+      break;
+    case TSK_ImplicitInstantiation:
+      continue;
+    case TSK_ExplicitInstantiationDeclaration:
+      if (ClassSK == TSK_ExplicitInstantiationDeclaration)
+        WeakDef = true;
+      break;
+    case TSK_ExplicitInstantiationDefinition:
+      WeakDef = true;
+      break;
+    }
+
+    if (!M->isUserProvided())
+      continue;
+
+    // Methods that are deleted are not exported.
+    if (M->isDeleted())
+      continue;
+
+    const auto Access = getAccessForDecl(M);
+    if (!Access)
+      return true;
+    const AvailabilityInfo Avail = AvailabilityInfo::createFromDecl(M);
+
+    if (const auto *Ctor = dyn_cast<CXXConstructorDecl>(M)) {
+      // Defaulted constructors are not exported.
+      if (Ctor->isDefaulted())
+        continue;
+
+      std::string Name = getMangledCtorDtor(M, Ctor_Base);
+      Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
+                           GlobalRecord::Kind::Function, Avail, D, *Access,
+                           getFlags(WeakDef));
+
+      if (!D->isAbstract()) {
+        std::string Name = getMangledCtorDtor(M, Ctor_Complete);
+        Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
+                             GlobalRecord::Kind::Function, Avail, D, *Access,
+                             getFlags(WeakDef));
+      }
+
+      continue;
+    }
+
+    if (const auto *Dtor = dyn_cast<CXXDestructorDecl>(M)) {
+      // Defaulted destructors are not exported.
+      if (Dtor->isDefaulted())
+        continue;
+
+      std::string Name = getMangledCtorDtor(M, Dtor_Base);
+      Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
+                           GlobalRecord::Kind::Function, Avail, D, *Access,
+                           getFlags(WeakDef));
+
+      Name = getMangledCtorDtor(M, Dtor_Complete);
+      Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
+                           GlobalRecord::Kind::Function, Avail, D, *Access,
+                           getFlags(WeakDef));
+
+      if (Dtor->isVirtual()) {
+        Name = getMangledCtorDtor(M, Dtor_Deleting);
+        Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
+                             GlobalRecord::Kind::Function, Avail, D, *Access,
+                             getFlags(WeakDef));
+      }
+
+      continue;
+    }
+
+    // Though abstract methods can map to exports, this is generally unexpected.
+    // Except in the case of destructors. Only ignore pure virtuals after
+    // checking if the member function was a destructor.
+    if (M->isPureVirtual())
+      continue;
+
+    std::string Name = getMangledName(M);
+    Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
+                         GlobalRecord::Kind::Function, Avail, D, *Access,
+                         getFlags(WeakDef));
+  }
+
+  if (auto *Templ = dyn_cast<ClassTemplateSpecializationDecl>(D)) {
+    if (!Templ->isExplicitInstantiationOrSpecialization())
+      return true;
+  }
+
+  using var_iter = CXXRecordDecl::specific_decl_iterator<VarDecl>;
+  using var_range = iterator_range<var_iter>;
+  for (const auto *Var : var_range(D->decls())) {
+    // Skip const static member variables.
+    // \code
+    // struct S {
+    //   static const int x = 0;
+    // };
+    // \endcode
+    if (Var->isStaticDataMember() && Var->hasInit())
+      continue;
+
+    // Skip unexported var decls.
+    if (!isExported(Var))
+      continue;
+
+    const std::string Name = getMangledName(Var);
+    const auto Access = getAccessForDecl(Var);
+    if (!Access)
+      return true;
+    const AvailabilityInfo Avail = AvailabilityInfo::createFromDecl(Var);
+    const bool WeakDef = Var->hasAttr<WeakAttr>() || KeepInlineAsWeak;
+
+    Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
+                         GlobalRecord::Kind::Variable, Avail, D, *Access,
+                         getFlags(WeakDef));
+  }
+
   return true;
 }
 
diff --git a/clang/test/InstallAPI/cpp.test b/clang/test/InstallAPI/cpp.test
new file mode 100644
index 0000000000000..4817899095302
--- /dev/null
+++ b/clang/test/InstallAPI/cpp.test
@@ -0,0 +1,530 @@
+// RUN: rm -rf %t
+// RUN: split-file %s %t
+// RUN: sed -e "s|DSTROOT|%/t|g" %t/inputs.json.in > %t/inputs.json
+
+// Invoke C++ with no-rtti.
+// RUN: clang-installapi -target arm64-apple-macos13.1 \
+// RUN: -I%t/usr/include -I%t/usr/local/include -x c++ \
+// RUN: -install_name @rpath/lib/libcpp.dylib -fno-rtti \
+// RUN: %t/inputs.json -o %t/no-rtti.tbd 2>&1 | FileCheck %s --allow-empty
+
+// RUN: llvm-readtapi -compare %t/no-rtti.tbd \
+// RUN: %t/expected-no-rtti.tbd 2>&1 | FileCheck %s --allow-empty
+
+// Invoke C++ with rtti.
+// RUN: clang-installapi -target arm64-apple-macos13.1 \
+// RUN: -I%t/usr/include -I%t/usr/local/include -x c++ \
+// RUN: -install_name @rpath/lib/libcpp.dylib -frtti \
+// RUN: %t/inputs.json -o %t/rtti.tbd 2>&1 | FileCheck %s --allow-empty
+// RUN: llvm-readtapi -compare %t/rtti.tbd \
+// RUN: %t/expected-rtti.tbd 2>&1 | FileCheck %s --allow-empty
+
+// CHECK-NOT: error: 
+// CHECK-NOT: warning: 
+
+//--- usr/include/basic.h
+#ifndef CPP_H
+#define CPP_H
+
+inline int foo(int x) { return x + 1; }
+
+extern int bar(int x) { return x + 1; }
+
+inline int baz(int x) {
+  static const int a[] = {1, 2, 3};
+  return a[x];
+}
+
+extern "C" {
+  int cFunc(const char*);
+}
+
+class Bar {
+public:
+  static const int x = 0;
+  static int y;
+
+  inline int func1(int x) { return x + 2; }
+  inline int func2(int x);
+  int func3(int x);
+};
+
+class __attribute__((visibility("hidden"))) BarI {
+  static const int x = 0;
+  static int y;
+
+  inline int func1(int x) { return x + 2; }
+  inline int func2(int x);
+  int func3(int x);
+};
+
+int Bar::func2(int x) { return x + 3; }
+inline int Bar::func3(int x) { return x + 4; }
+
+int BarI::func2(int x) { return x + 3; }
+inline int BarI::func3(int x) { return x + 4; }
+#endif
+
+//--- usr/local/include/vtable.h
+// Simple test class with no virtual functions. There should be no vtable or
+// RTTI.
+namespace test1 {
+class Simple {
+public:
+  void run();
+};
+} // end namespace test1
+
+// Simple test class with virtual function. There should be an external vtable
+// and RTTI.
+namespace test2 {
+class Simple {
+public:
+  virtual void run();
+};
+} // end namespace test2
+
+// Abstract class with no sub classes. There should be no vtable or RTTI.
+namespace test3 {
+class Abstract {
+public:
+  virtual ~Abstract() {}
+  virtual void run() = 0;
+};
+} // end namespace test3
+
+// Abstract base class with a sub class. There should be weak-def RTTI for the
+// abstract base class.
+// The sub-class should have vtable and RTTI.
+namespace test4 {
+class Base {
+public:
+  virtual ~Base() {}
+  virtual void run() = 0;
+};
+
+class Sub : public Base {
+public:
+  void run() override;
+};
+} // end namespace test4
+
+// Abstract base class with a sub class. Same as above, but with a user defined
+// inlined destructor.
+namespace test5 {
+class Base {
+public:
+  virtual ~Base() {}
+  virtual void run() = 0;
+};
+
+class Sub : public Base {
+public:
+  virtual ~Sub() {}
+  void run() override;
+};
+} // end namespace test5
+
+// Abstract base class with a sub class. Same as above, but with a different
+// inlined key method.
+namespace test6 {
+class Base {
+public:
+  virtual ~Base() {}
+  virtual void run() = 0;
+};
+
+class Sub : public Base {
+public:
+  virtual void foo() {}
+  void run() override;
+};
+} // end namespace test6
+
+// Abstract base class with a sub class. Overridden method is implemented
+// inline. No vtable or RTTI.
+namespace test7 {
+class Base {
+public:
+  virtual ~Base() {}
+  virtual bool run() = 0;
+};
+
+class Sub : public Base {
+public:
+  bool run() override { return true; }
+};
+} // end namespace test7
+
+// Abstract base class with a sub class. Overridden method has no inline
+// attribute and is recognized as the key method,
+// but is later implemented inline. Weak-def RTTI only.
+namespace test8 {
+class Base {
+public:
+  virtual ~Base() {}
+  virtual void run() = 0;
+};
+
+class Sub : public Base {
+public:
+  void run() override;
+};
+
+inline void Sub::run() {}
+} // end namespace test8
+
+namespace test9 {
+class Base {
+public:
+  virtual ~Base() {}
+  virtual void run1() = 0;
+  virtual void run2() = 0;
+};
+
+class Sub : public Base {
+public:
+  void run1() override {}
+  void run2() override;
+};
+
+inline void Sub::run2() {}
+} // end namespace test9
+
+namespace test10 {
+class Base {
+public:
+  virtual ~Base() {}
+  virtual void run1() = 0;
+  virtual void run2() = 0;
+};
+
+class Sub : public Base {
+public:
+  void run1() override {}
+  inline void run2() override;
+};
+
+void Sub::run2() {}
+} // end namespace test10
+
+namespace test11 {
+class Base {
+public:
+  virtual ~Base() {}
+  virtual void run1() = 0;
+  virtual void run2() = 0;
+  virtual void run3() = 0;
+};
+
+class Sub : public Base {
+public:
+  void run1() override {}
+  void run2() override;
+  void run3() override;
+};
+
+inline void Sub::run2() {}
+} // end namespace test11
+
+namespace test12 {
+template <class T> class Simple {
+public:
+  virtual void foo() {}
+};
+extern template class Simple<int>;
+} // end namespace test12
+
+namespace test13 {
+class Base {
+public:
+  virtual ~Base() {}
+  virtual void run1() = 0;
+  virtual void run2() {};
+  virtual void run3(); // key function.
+};
+
+class Sub : public Base {
+public:
+  void run1() override {}
+  void run2() override {}
+};
+
+} // end namespace test13
+
+namespace test14 {
+
+class __attribute__((visibility("hidden"))) Base
+{
+public:
+    Base() {}
+    virtual ~Base(); // key function.
+    virtual void run1() const = 0;
+};
+
+class Sub : public Base
+{
+public:
+    Sub();
+    virtual ~Sub();
+    virtual void run1() const;
+    void run2() const {}
+};
+
+} // end namespace test14
+
+namespace test15 {
+
+class Base {
+public:
+  virtual ~Base() {}
+  virtual void run() {};
+};
+
+class Base1 {
+public:
+  virtual ~Base1() {}
+  virtual void run1() {};
+};
+
+class Sub : public Base, public Base1 {
+public:
+  Sub() {}
+  ~Sub();
+  void run() override;
+  void run1() override;
+};
+
+class Sub1 : public Base, public Base1 {
+public:
+  Sub1() {}
+  ~Sub1() = default;
+  void run() override;
+  void run1() override;
+};
+
+} // end namespace test15
+
+//--- usr/local/include/templates.h
+#ifndef TEMPLATES_H
+#define TEMPLATES_H
+
+namespace templates {
+
+// Full specialization.
+template <class T> int foo1(T a) { return 1; }
+template <> int foo1<int>(int a);
+extern template int foo1<short>(short a);
+
+template <class T> int foo2(T a);
+
+// Partial specialization.
+template <class A, class B> class Partial {
+  static int run(A a, B b) { return a + b; }
+};
+
+template <class A> class Partial<A, int> {
+  static int run(A a, int b) { return a - b; }
+};
+
+template <class T> class Foo {
+public:
+  Foo();
+  ~Foo();
+};
+
+template <class T> class Bar {
+public:
+  Bar();
+  ~Bar() {}
+
+  inline int bazinga() { return 7; }
+};
+
+extern template class Bar<int>;
+
+class Bazz {
+public:
+  Bazz() {}
+
+  template <class T> int buzz(T a);
+
+  float implicit() const { return foo1(0.0f); }
+};
+
+template <class T> int Bazz::buzz(T a) { return sizeof(T); }
+
+template <class T> struct S { static int x; };
+
+template <class T> int S<T>::x = 0;
+
+} // end namespace templates.
+
+#endif
+
+
+//--- inputs.json.in
+{
+  "headers": [ {
+    "path" : "DSTROOT/usr/include/basic.h",
+    "type" : "public"
+  }, 
+  {
+    "path" : "DSTROOT/usr/local/include/vtable.h",
+    "type" : "private"
+  },
+  {
+    "path" : "DSTROOT/usr/local/include/templates.h",
+    "type" : "private"
+  }
+  ],
+  "version": "3"
+}
+
+//--- expected-no-rtti.tbd
+{
+  "main_library": {
+    "compatibility_versions": [
+      {
+        "version": "0"
+      }
+    ],
+    "current_versions": [
+      {
+        "version": "0"
+      }
+    ],
+    "exported_symbols": [
+      {
+        "data": {
+          "global": [
+            "__ZTVN6test143SubE", "__ZTVN6test113SubE", "__ZTVN5test26SimpleE",
+            "__ZTVN5test53SubE", "__ZTVN6test154Sub1E", "__ZTVN6test153SubE",
+            "__ZN3Bar1yE", "__ZTVN5test43SubE", "__ZTVN5test63SubE",
+            "__ZTVN6test134BaseE"
+          ],
+          "weak": [
+            "__ZTVN6test126SimpleIiEE"
+          ]
+        },
+        "text": {
+          "global": [
+            "__ZN6test153Sub3runEv", "__ZN6test154Sub13runEv",
+            "__Z3bari", "__ZThn8_N6test153SubD1Ev",
+            "__ZNK6test143Sub4run1Ev", "__ZN6test154Sub14run1Ev",
+            "__ZThn8_N6test153Sub4run1Ev", "__ZN6test143SubD1Ev",
+            "__ZN6test134Base4run3Ev", "__ZN5test16Simple3runEv",
+            "__ZN5test43Sub3runEv", "__ZN6test113Sub4run3Ev", "__ZN6test153SubD2Ev",
+            "__ZN5test53Sub3runEv", "__ZN6test153SubD1Ev", "__ZN6test143SubC1Ev",
+            "__ZN9templates4foo1IiEEiT_", "__ZN6test143SubC2Ev", "__ZN5test63Sub3runEv",
+            "__ZN5test26Simple3runEv", "__ZN6test153SubD0Ev",
+            "__ZN6test143SubD2Ev", "__ZN6test153Sub4run1Ev", "__ZN6test143SubD0Ev",
+            "__ZThn8_N6test153SubD0Ev", "__ZThn8_N6test154Sub14run1Ev", "_cFunc"
+          ],
+          "weak": [
+            "__ZN9templates3BarIiED2Ev", "__ZN9templates3BarIiEC2Ev",
+            "__ZN9templates3BarIiEC1Ev", "__ZN9templates3BarIiED1Ev",
+            "__ZN6test126SimpleIiE3fooEv", "__ZN9templates3BarIiE7bazingaEv",
+            "__ZN9templates4foo1IsEEiT_"
+          ]
+        }
+      }
+    ],
+    "flags": [
+      {
+        "attributes": [
+          "not_app_extension_safe"
+        ]
+      }
+    ],
+    "install_names": [
+      {
+        "name": "@rpath/lib/libcpp.dylib"
+      }
+    ],
+    "target_info": [
+      {
+        "min_deployment": "13.1",
+        "target": "arm64-macos"
+      }
+    ]
+  },
+  "tapi_tbd_version": 5
+}
+
+//--- expected-rtti.tbd
+{
+  "main_library": {
+    "compatibility_versions": [
+      {
+        "version": "0"
+      }
+    ],
+    "current_versions": [
+      {
+        "version": "0"
+      }
+    ],
+    "exported_symbols": [
+      {
+        "data": {
+          "global": [
+            "__ZTVN6test143SubE", "__ZTIN5test63SubE", "__ZTSN5test26SimpleE",
+            "__ZTIN6test153SubE", "__ZTVN6test113SubE", "__ZTIN5test43SubE",
+            "__ZTIN6test134BaseE", "__ZTVN5test26SimpleE", "__ZTIN5test26SimpleE",
+            "__ZTSN6test134BaseE", "__ZTVN6test154Sub1E", "__ZTVN5test43SubE",
+            "__ZTVN5test63SubE", "__ZTSN5test43SubE", "__ZTSN6test113SubE",
+            "__ZTIN6test154Sub1E", "__ZTSN6test153SubE", "__ZTSN5test63SubE",
+            "__ZTSN6test154Sub1E", "__ZTIN6test113SubE", "__ZTSN6test143SubE",
+            "__ZTVN5test53SubE", "__ZTIN6test143SubE", "__ZTVN6test153SubE",
+            "__ZTIN5test53SubE", "__ZN3Bar1yE", "__ZTVN6test134BaseE",
+            "__ZTSN5test53SubE"
+          ],
+          "weak": [
+            "__ZTVN6test126SimpleIiEE"
+          ]
+        },
+        "text": {
+          "global": [
+            "__ZN6test154Sub13runEv", "__ZN6test153Sub3runEv", "__ZNK6test143Sub4run1Ev",
+            "__ZN6test134Base4run3Ev", "__ZN5test16Simple3runEv", "__ZN6test153SubD2Ev",
+            "__ZN6test143SubC2Ev", "__ZN5test63Sub3runEv", "__ZN6test153SubD0Ev", 
+            "__ZN6test143SubD2Ev", "__ZThn8_N6test154Sub14run1Ev",
+            "__ZThn8_N6test153SubD0Ev", "__Z3bari", "__ZThn8_N6test153SubD1Ev",
+            "__ZN6test154Sub14run1Ev", "__ZThn8_N6test153Sub4run1Ev",
+            "__ZN6test143SubD1Ev", "__ZN5test43Sub3runEv",
+            "__ZN6test113Sub4run3Ev", "__ZN5test53Sub3runEv", "__ZN6test143SubC1Ev",
+            "__ZN6test153SubD1Ev", "__ZN9templates4foo1IiEEiT_", "__ZN5test26Simple3runEv",
+            "__ZN6test153Sub4run1Ev", "__ZN6test143SubD0Ev", "_cFunc"
+          ],
+          "weak": [
+            "__ZN9templates3BarIiEC2Ev", "__ZN9templates3BarIiEC1Ev",
+            "__ZN9templates3BarIiED1Ev", "__ZN6test126SimpleIiE3fooEv",
+            "__ZN9templates4foo1IsEEiT_", "__ZN9templates3BarIiED2Ev",
+            "__ZN9templates3BarIiE7bazingaEv"
+          ]
+        }
+      }
+    ],
+    "flags": [
+      {
+        "attributes": [
+          "not_app_extension_safe"
+        ]
+      }
+    ],
+    "install_names": [
+      {
+        "name": "@rpath/lib/libcpp.dylib"
+      }
+    ],
+    "target_info": [
+      {
+        "min_deployment": "13.1",
+        "target": "arm64-macos"
+      }
+    ]
+  },
+  "tapi_tbd_version": 5
+}
+
diff --git a/clang/tools/clang-installapi/Options.cpp b/clang/tools/clang-installapi/Options.cpp
index b9c36eab2ad3b..701ab81c57c3d 100644
--- a/clang/tools/clang-installapi/Options.cpp
+++ b/clang/tools/clang-installapi/Options.cpp
@@ -99,6 +99,33 @@ bool Options::processLinkerOptions(InputArgList &Args) {
   return true;
 }
 
+bool Options::processFrontendOptions(InputArgList &Args) {
+  // Do not claim any arguments, as they will be passed along for CC1
+  // invocations.
+  if (auto *A = Args.getLastArgNoClaim(OPT_x)) {
+    FEOpts.LangMode = llvm::StringSwitch<clang::Language>(A->getValue())
+                          .Case("c", clang::Language::C)
+                          .Case("c++", clang::Language::CXX)
+                          .Case("objective-c", clang::Language::ObjC)
+                          .Case("objective-c++", clang::Language::ObjCXX)
+                          .Default(clang::Language::Unknown);
+
+    if (FEOpts.LangMode == clang::Language::Unknown) {
+      Diags->Report(clang::diag::err_drv_invalid_value)
+          << A->getAsString(Args) << A->getValue();
+      return false;
+    }
+  }
+  for (auto *A : Args.filtered(OPT_ObjC, OPT_ObjCXX)) {
+    if (A->getOption().matches(OPT_ObjC))
+      FEOpts.LangMode = clang::Language::ObjC;
+    else
+      FEOpts.LangMode = clang::Language::ObjCXX;
+  }
+
+  return true;
+}
+
 Options::Options(DiagnosticsEngine &Diag, FileManager *FM,
                  InputArgList &ArgList)
     : Diags(&Diag), FM(FM) {
@@ -108,7 +135,10 @@ Options::Options(DiagnosticsEngine &Diag, FileManager *FM,
   if (!processLinkerOptions(ArgList))
     return;
 
-  /// Any remaining arguments should be handled by invoking the clang frontend.
+  if (!processFrontendOptions(ArgList))
+    return;
+
+  /// Any unclaimed arguments should be handled by invoking the clang frontend.
   for (const Arg *A : ArgList) {
     if (A->isClaimed())
       continue;
@@ -132,6 +162,7 @@ InstallAPIContext Options::createContext() {
   Ctx.BA.AppExtensionSafe = LinkerOpts.AppExtensionSafe;
   Ctx.FT = DriverOpts.OutFT;
   Ctx.OutputLoc = DriverOpts.OutputPath;
+  Ctx.LangMode = FEOpts.LangMode;
 
   // Process inputs.
   for (const std::string &ListPath : DriverOpts.FileLists) {
diff --git a/clang/tools/clang-installapi/Options.h b/clang/tools/clang-installapi/Options.h
index f68addf197288..9d4d841284fd1 100644
--- a/clang/tools/clang-installapi/Options.h
+++ b/clang/tools/clang-installapi/Options.h
@@ -62,15 +62,22 @@ struct LinkerOptions {
   bool IsDylib = false;
 };
 
+struct FrontendOptions {
+  /// \brief The language mode to parse headers in.
+  Language LangMode = Language::ObjC;
+};
+
 class Options {
 private:
   bool processDriverOptions(llvm::opt::InputArgList &Args);
   bool processLinkerOptions(llvm::opt::InputArgList &Args);
+  bool processFrontendOptions(llvm::opt::InputArgList &Args);
 
 public:
   /// The various options grouped together.
   DriverOptions DriverOpts;
   LinkerOptions LinkerOpts;
+  FrontendOptions FEOpts;
 
   Options() = delete;
 

From 34acdb3ec2113265ea221fb20747ecbffb4f6a2d Mon Sep 17 00:00:00 2001
From: annamthomas <anna@azul.com>
Date: Mon, 11 Mar 2024 12:16:52 -0400
Subject: [PATCH 06/95] Precommit testcase for pr81872 (#84782)

Testcase shows a miscompile when dropping the disjoint flag from a disjoint
`or` during vectorization.
---
 .../Transforms/LoopVectorize/X86/pr81872.ll   | 109 ++++++++++++++++++
 1 file changed, 109 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/X86/pr81872.ll

diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll b/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll
new file mode 100644
index 0000000000000..c6b1944b20090
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll
@@ -0,0 +1,109 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -passes=loop-vectorize < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@global = external global ptr addrspace(1), align 8
+
+; PR 81872 explains the issue.
+
+; If we vectorize, we have a miscompile where the array IV, and thereby the value stored
+; in (arr[99], arr[98]), is calculated incorrectly, since the `or disjoint` was only
+; disjoint because of dominating conditions. Dropping the disjoint flag to avoid poison
+; still changes the behaviour, since the `or` is then no longer equivalent to the add.
+; Function Attrs: uwtable
+define void @test(ptr noundef align 8 dereferenceable_or_null(16) %arr) #0 {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: ptr noundef align 8 dereferenceable_or_null(16) [[ARR:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  bb5:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF0:![0-9]+]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 99, i64 98, i64 97, i64 96>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 99, [[INDEX]]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule <4 x i64> [[VEC_IV]], <i64 8, i64 8, i64 8, i64 8>
+; CHECK-NEXT:    [[TMP2:%.*]] = and <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <4 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = select <4 x i1> [[TMP1]], <4 x i1> [[TMP3]], <4 x i1> zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = or i64 [[TMP0]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i64, ptr [[ARR]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i64, ptr [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i64, ptr [[TMP7]], i32 -3
+; CHECK-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i1> [[TMP4]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 1, i64 1, i64 1, i64 1>, ptr [[TMP8]], i32 8, <4 x i1> [[REVERSE]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 -4, i64 -4, i64 -4, i64 -4>
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 12
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[BB6:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 87, [[MIDDLE_BLOCK]] ], [ 99, [[BB5:%.*]] ]
+; CHECK-NEXT:    br label [[BB15:%.*]]
+; CHECK:       bb15:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[BB20:%.*]] ]
+; CHECK-NEXT:    [[AND:%.*]] = and i64 [[IV]], 1
+; CHECK-NEXT:    [[ICMP17:%.*]] = icmp eq i64 [[AND]], 0
+; CHECK-NEXT:    br i1 [[ICMP17]], label [[BB18:%.*]], label [[BB20]], !prof [[PROF5:![0-9]+]]
+; CHECK:       bb18:
+; CHECK-NEXT:    [[OR:%.*]] = or disjoint i64 [[IV]], 1
+; CHECK-NEXT:    [[GETELEMENTPTR19:%.*]] = getelementptr inbounds i64, ptr [[ARR]], i64 [[OR]]
+; CHECK-NEXT:    store i64 1, ptr [[GETELEMENTPTR19]], align 8
+; CHECK-NEXT:    br label [[BB20]]
+; CHECK:       bb20:
+; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; CHECK-NEXT:    [[ICMP22:%.*]] = icmp eq i64 [[IV_NEXT]], 90
+; CHECK-NEXT:    br i1 [[ICMP22]], label [[BB6]], label [[BB15]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK:       bb6:
+; CHECK-NEXT:    ret void
+;
+bb5:
+  br label %bb15
+
+bb15:                                             ; preds = %bb20, %bb5
+  %iv = phi i64 [ 99, %bb5 ], [ %iv.next, %bb20 ]
+  %and = and i64 %iv, 1
+  %icmp17 = icmp eq i64 %and, 0
+  br i1 %icmp17, label %bb18, label %bb20, !prof !21
+
+bb18:                                             ; preds = %bb15
+  %or = or disjoint i64 %iv, 1
+  %getelementptr19 = getelementptr inbounds i64, ptr %arr, i64 %or
+  store i64 1, ptr %getelementptr19, align 8
+  br label %bb20
+
+bb20:                                             ; preds = %bb18, %bb15
+  %iv.next = add nsw i64 %iv, -1
+  %icmp22 = icmp eq i64 %iv.next, 90
+  br i1 %icmp22, label %bb6, label %bb15, !prof !22
+
+bb6:
+  ret void
+}
+
+attributes #0 = {"target-cpu"="haswell" "target-features"="+avx2" }
+
+!4 = !{}
+!10 = !{i32 1}
+!16 = !{i64 864}
+!17 = !{i64 8}
+!21 = !{!"branch_weights", i32 1, i32 1}
+!22 = !{!"branch_weights", i32 1, i32 95}
+
+
+;.
+; CHECK: [[PROF0]] = !{!"branch_weights", i32 1, i32 127}
+; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 23}
+; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]], [[META4:![0-9]+]]}
+; CHECK: [[META3]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META4]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[PROF5]] = !{!"branch_weights", i32 1, i32 1}
+; CHECK: [[PROF6]] = !{!"branch_weights", i32 0, i32 0}
+; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META4]], [[META3]]}
+;.

From 7dc4d5f6a0012d6a2485640f6c3c9ca388a02433 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Mon, 11 Mar 2024 16:22:18 +0000
Subject: [PATCH 07/95] [X86] Add AVX512 (x86-64-v4) coverage to generic shift
 combines tests

---
 llvm/test/CodeGen/X86/combine-shl.ll | 248 +++++++++++++++++----------
 llvm/test/CodeGen/X86/combine-sra.ll |  79 ++++++---
 llvm/test/CodeGen/X86/combine-srl.ll | 108 ++++++++----
 3 files changed, 293 insertions(+), 142 deletions(-)

diff --git a/llvm/test/CodeGen/X86/combine-shl.ll b/llvm/test/CodeGen/X86/combine-shl.ll
index b485a9b10f26c..5472e1e6c0833 100644
--- a/llvm/test/CodeGen/X86/combine-shl.ll
+++ b/llvm/test/CodeGen/X86/combine-shl.ll
@@ -1,9 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX-FAST-ALL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX-FAST-PERLANE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST-ALL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST-PERLANE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
 
 ; fold (shl 0, x) -> 0
 define <4 x i32> @combine_vec_shl_zero(<4 x i32> %x) {
@@ -137,32 +138,40 @@ define <4 x i32> @combine_vec_shl_trunc_and(<4 x i32> %x, <4 x i64> %y) {
 ; SSE41-NEXT:    pmulld %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX-SLOW-LABEL: combine_vec_shl_trunc_and:
-; AVX-SLOW:       # %bb.0:
-; AVX-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX-SLOW-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX-SLOW-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
-; AVX-SLOW-NEXT:    vzeroupper
-; AVX-SLOW-NEXT:    retq
-;
-; AVX-FAST-ALL-LABEL: combine_vec_shl_trunc_and:
-; AVX-FAST-ALL:       # %bb.0:
-; AVX-FAST-ALL-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,2,4,6,0,0,0,0]
-; AVX-FAST-ALL-NEXT:    vpermd %ymm1, %ymm2, %ymm1
-; AVX-FAST-ALL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX-FAST-ALL-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
-; AVX-FAST-ALL-NEXT:    vzeroupper
-; AVX-FAST-ALL-NEXT:    retq
-;
-; AVX-FAST-PERLANE-LABEL: combine_vec_shl_trunc_and:
-; AVX-FAST-PERLANE:       # %bb.0:
-; AVX-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX-FAST-PERLANE-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX-FAST-PERLANE-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
-; AVX-FAST-PERLANE-NEXT:    vzeroupper
-; AVX-FAST-PERLANE-NEXT:    retq
+; AVX2-SLOW-LABEL: combine_vec_shl_trunc_and:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-SLOW-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-ALL-LABEL: combine_vec_shl_trunc_and:
+; AVX2-FAST-ALL:       # %bb.0:
+; AVX2-FAST-ALL-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,2,4,6,0,0,0,0]
+; AVX2-FAST-ALL-NEXT:    vpermd %ymm1, %ymm2, %ymm1
+; AVX2-FAST-ALL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-FAST-ALL-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
+; AVX2-FAST-ALL-NEXT:    vzeroupper
+; AVX2-FAST-ALL-NEXT:    retq
+;
+; AVX2-FAST-PERLANE-LABEL: combine_vec_shl_trunc_and:
+; AVX2-FAST-PERLANE:       # %bb.0:
+; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-FAST-PERLANE-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-FAST-PERLANE-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
+; AVX2-FAST-PERLANE-NEXT:    vzeroupper
+; AVX2-FAST-PERLANE-NEXT:    retq
+;
+; AVX512-LABEL: combine_vec_shl_trunc_and:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovqd %ymm1, %xmm1
+; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %1 = and <4 x i64> %y, <i64 15, i64 255, i64 4095, i64 65535>
   %2 = trunc <4 x i64> %1 to <4 x i32>
   %3 = shl <4 x i32> %x, %2
@@ -353,11 +362,17 @@ define <8 x i32> @combine_vec_shl_zext_lshr0(<8 x i16> %x) {
 ; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: combine_vec_shl_zext_lshr0:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX-NEXT:    retq
+; AVX2-LABEL: combine_vec_shl_zext_lshr0:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: combine_vec_shl_zext_lshr0:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT:    retq
   %1 = lshr <8 x i16> %x, <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>
   %2 = zext <8 x i16> %1 to <8 x i32>
   %3 = shl <8 x i32> %2, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
@@ -504,12 +519,18 @@ define <4 x i32> @combine_vec_shl_gt_lshr0(<4 x i32> %x) {
 ; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: combine_vec_shl_gt_lshr0:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpslld $2, %xmm0, %xmm0
-; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4294967264,4294967264,4294967264,4294967264]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX2-LABEL: combine_vec_shl_gt_lshr0:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpslld $2, %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4294967264,4294967264,4294967264,4294967264]
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: combine_vec_shl_gt_lshr0:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpslld $2, %xmm0, %xmm0
+; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT:    retq
   %1 = lshr <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
   %2 = shl <4 x i32> %1, <i32 5, i32 5, i32 5, i32 5>
   ret <4 x i32> %2
@@ -540,12 +561,18 @@ define <4 x i32> @combine_vec_shl_le_lshr0(<4 x i32> %x) {
 ; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: combine_vec_shl_le_lshr0:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpsrld $2, %xmm0, %xmm0
-; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1073741816,1073741816,1073741816,1073741816]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX2-LABEL: combine_vec_shl_le_lshr0:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrld $2, %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1073741816,1073741816,1073741816,1073741816]
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: combine_vec_shl_le_lshr0:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrld $2, %xmm0, %xmm0
+; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT:    retq
   %1 = lshr <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
   %2 = shl <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
   ret <4 x i32> %2
@@ -587,11 +614,16 @@ define <4 x i32> @combine_vec_shl_ashr0(<4 x i32> %x) {
 ; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: combine_vec_shl_ashr0:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [4294967264,4294967264,4294967264,4294967264]
-; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX2-LABEL: combine_vec_shl_ashr0:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [4294967264,4294967264,4294967264,4294967264]
+; AVX2-NEXT:    vandps %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: combine_vec_shl_ashr0:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT:    retq
   %1 = ashr <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
   %2 = shl <4 x i32> %1, <i32 5, i32 5, i32 5, i32 5>
   ret <4 x i32> %2
@@ -620,12 +652,18 @@ define <4 x i32> @combine_vec_shl_add0(<4 x i32> %x) {
 ; SSE-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: combine_vec_shl_add0:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpslld $2, %xmm0, %xmm0
-; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20]
-; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX2-LABEL: combine_vec_shl_add0:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpslld $2, %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20]
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: combine_vec_shl_add0:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpslld $2, %xmm0, %xmm0
+; AVX512-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT:    retq
   %1 = add <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
   %2 = shl <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>
   ret <4 x i32> %2
@@ -667,12 +705,18 @@ define <4 x i32> @combine_vec_shl_or0(<4 x i32> %x) {
 ; SSE-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: combine_vec_shl_or0:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpslld $2, %xmm0, %xmm0
-; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20]
-; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX2-LABEL: combine_vec_shl_or0:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpslld $2, %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20]
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: combine_vec_shl_or0:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpslld $2, %xmm0, %xmm0
+; AVX512-NEXT:    vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT:    retq
   %1 = or  <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
   %2 = shl <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>
   ret <4 x i32> %2
@@ -724,11 +768,16 @@ define <4 x i32> @combine_vec_shl_mul0(<4 x i32> %x) {
 ; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: combine_vec_shl_mul0:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20]
-; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX2-LABEL: combine_vec_shl_mul0:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20]
+; AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: combine_vec_shl_mul0:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT:    retq
   %1 = mul <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
   %2 = shl <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>
   ret <4 x i32> %2
@@ -778,12 +827,18 @@ define <4 x i32> @combine_vec_add_shl_nonsplat(<4 x i32> %a0)  {
 ; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: combine_vec_add_shl_nonsplat:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3]
-; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX2-LABEL: combine_vec_add_shl_nonsplat:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3]
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: combine_vec_add_shl_nonsplat:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT:    retq
   %1 = shl <4 x i32> %a0, <i32 2, i32 3, i32 4, i32 5>
   %2 = add <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
   ret <4 x i32> %2
@@ -812,14 +867,22 @@ define <4 x i32> @combine_vec_add_shl_and_nonsplat(<4 x i32> %a0)  {
 ; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: combine_vec_add_shl_and_nonsplat:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
-; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15]
-; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX2-LABEL: combine_vec_add_shl_and_nonsplat:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
+; AVX2-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15]
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: combine_vec_add_shl_and_nonsplat:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
+; AVX512-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT:    retq
   %1 = and <4 x i32> %a0, <i32 4294901760, i32 4294901760, i32 4294901760, i32 4294901760>
   %2 = shl <4 x i32> %1, <i32 2, i32 3, i32 4, i32 5>
   %3 = add <4 x i32> %2, <i32 15, i32 15, i32 15, i32 15>
@@ -847,13 +910,20 @@ define <4 x i32> @combine_vec_add_shuffle_shl(<4 x i32> %a0)  {
 ; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: combine_vec_add_shuffle_shl:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
-; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3]
-; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX2-LABEL: combine_vec_add_shuffle_shl:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
+; AVX2-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3]
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: combine_vec_add_shuffle_shl:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
+; AVX512-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT:    retq
   %1 = shl <4 x i32> %a0, <i32 2, i32 3, i32 0, i32 1>
   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
   %3 = add <4 x i32> %2, <i32 3, i32 3, i32 3, i32 3>
diff --git a/llvm/test/CodeGen/X86/combine-sra.ll b/llvm/test/CodeGen/X86/combine-sra.ll
index cc0ed2b8268c6..0aac99457d7de 100644
--- a/llvm/test/CodeGen/X86/combine-sra.ll
+++ b/llvm/test/CodeGen/X86/combine-sra.ll
@@ -1,8 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2-FAST-ALL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2-FAST-PERLANE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST-ALL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST-PERLANE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
 
 ; fold (sra 0, x) -> 0
 define <4 x i32> @combine_vec_ashr_zero(<4 x i32> %x) {
@@ -193,6 +194,14 @@ define <4 x i32> @combine_vec_ashr_trunc_and(<4 x i32> %x, <4 x i64> %y) {
 ; AVX2-FAST-PERLANE-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
 ; AVX2-FAST-PERLANE-NEXT:    vzeroupper
 ; AVX2-FAST-PERLANE-NEXT:    retq
+;
+; AVX512-LABEL: combine_vec_ashr_trunc_and:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovqd %ymm1, %xmm1
+; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %1 = and <4 x i64> %y, <i64 15, i64 255, i64 4095, i64 65535>
   %2 = trunc <4 x i64> %1 to <4 x i32>
   %3 = ashr <4 x i32> %x, %2
@@ -237,6 +246,14 @@ define <4 x i32> @combine_vec_ashr_trunc_lshr(<4 x i64> %x) {
 ; AVX2-FAST-PERLANE-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX2-FAST-PERLANE-NEXT:    vzeroupper
 ; AVX2-FAST-PERLANE-NEXT:    retq
+;
+; AVX512-LABEL: combine_vec_ashr_trunc_lshr:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlq $32, %ymm0, %ymm0
+; AVX512-NEXT:    vpmovqd %ymm0, %xmm0
+; AVX512-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %1 = lshr <4 x i64> %x, <i64 32, i64 32, i64 32, i64 32>
   %2 = trunc <4 x i64> %1 to <4 x i32>
   %3 = ashr <4 x i32> %2, <i32 0, i32 1, i32 2, i32 3>
@@ -255,16 +272,23 @@ define <16 x i8> @combine_vec_ashr_trunc_lshr_splat(<16 x i32> %x) {
 ; SSE-NEXT:    packsswb %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: combine_vec_ashr_trunc_lshr_splat:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpsrad $26, %ymm1, %ymm1
-; AVX-NEXT:    vpsrad $26, %ymm0, %ymm0
-; AVX-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retq
+; AVX2-LABEL: combine_vec_ashr_trunc_lshr_splat:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrad $26, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrad $26, %ymm0, %ymm0
+; AVX2-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: combine_vec_ashr_trunc_lshr_splat:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrad $26, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %1 = lshr <16 x i32> %x, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
   %2 = trunc <16 x i32> %1 to <16 x i8>
   %3 = ashr <16 x i8> %2, <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
@@ -309,6 +333,14 @@ define <4 x i32> @combine_vec_ashr_trunc_ashr(<4 x i64> %x) {
 ; AVX2-FAST-PERLANE-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX2-FAST-PERLANE-NEXT:    vzeroupper
 ; AVX2-FAST-PERLANE-NEXT:    retq
+;
+; AVX512-LABEL: combine_vec_ashr_trunc_ashr:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlq $32, %ymm0, %ymm0
+; AVX512-NEXT:    vpmovqd %ymm0, %xmm0
+; AVX512-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %1 = ashr <4 x i64> %x, <i64 32, i64 32, i64 32, i64 32>
   %2 = trunc <4 x i64> %1 to <4 x i32>
   %3 = ashr <4 x i32> %2, <i32 0, i32 1, i32 2, i32 3>
@@ -323,13 +355,20 @@ define <8 x i16> @combine_vec_ashr_trunc_ashr_splat(<8 x i32> %x) {
 ; SSE-NEXT:    packssdw %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: combine_vec_ashr_trunc_ashr_splat:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpsrad $19, %ymm0, %ymm0
-; AVX-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retq
+; AVX2-LABEL: combine_vec_ashr_trunc_ashr_splat:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrad $19, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: combine_vec_ashr_trunc_ashr_splat:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrad $19, %ymm0, %ymm0
+; AVX512-NEXT:    vpmovdw %ymm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %1 = ashr <8 x i32> %x, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
   %2 = trunc <8 x i32> %1 to <8 x i16>
   %3 = ashr <8 x i16> %2, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
diff --git a/llvm/test/CodeGen/X86/combine-srl.ll b/llvm/test/CodeGen/X86/combine-srl.ll
index b38ab5d262814..79c86a6b012e9 100644
--- a/llvm/test/CodeGen/X86/combine-srl.ll
+++ b/llvm/test/CodeGen/X86/combine-srl.ll
@@ -1,8 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2-FAST-ALL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2-FAST-PERLANE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST-ALL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST-PERLANE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
 
 ; fold (srl 0, x) -> 0
 define <4 x i32> @combine_vec_lshr_zero(<4 x i32> %x) {
@@ -188,6 +189,13 @@ define <4 x i32> @combine_vec_lshr_trunc_lshr0(<4 x i64> %x) {
 ; AVX2-FAST-PERLANE-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
 ; AVX2-FAST-PERLANE-NEXT:    vzeroupper
 ; AVX2-FAST-PERLANE-NEXT:    retq
+;
+; AVX512-LABEL: combine_vec_lshr_trunc_lshr0:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlq $48, %ymm0, %ymm0
+; AVX512-NEXT:    vpmovqd %ymm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %1 = lshr <4 x i64> %x, <i64 32, i64 32, i64 32, i64 32>
   %2 = trunc <4 x i64> %1 to <4 x i32>
   %3 = lshr <4 x i32> %2, <i32 16, i32 16, i32 16, i32 16>
@@ -243,6 +251,14 @@ define <4 x i32> @combine_vec_lshr_trunc_lshr1(<4 x i64> %x) {
 ; AVX2-FAST-PERLANE-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX2-FAST-PERLANE-NEXT:    vzeroupper
 ; AVX2-FAST-PERLANE-NEXT:    retq
+;
+; AVX512-LABEL: combine_vec_lshr_trunc_lshr1:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT:    vpmovqd %ymm0, %xmm0
+; AVX512-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %1 = lshr <4 x i64> %x, <i64 32, i64 33, i64 34, i64 35>
   %2 = trunc <4 x i64> %1 to <4 x i32>
   %3 = lshr <4 x i32> %2, <i32 16, i32 17, i32 18, i32 19>
@@ -289,11 +305,16 @@ define <4 x i32> @combine_vec_lshr_shl_mask0(<4 x i32> %x) {
 ; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: combine_vec_lshr_shl_mask0:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1073741823,1073741823,1073741823,1073741823]
-; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX2-LABEL: combine_vec_lshr_shl_mask0:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1073741823,1073741823,1073741823,1073741823]
+; AVX2-NEXT:    vandps %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: combine_vec_lshr_shl_mask0:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT:    retq
   %1 =  shl <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
   %2 = lshr <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>
   ret <4 x i32> %2
@@ -338,12 +359,18 @@ define <4 x i32> @combine_vec_lshr_lzcnt_bit0(<4 x i32> %x) {
 ; SSE-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: combine_vec_lshr_lzcnt_bit0:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpsrld $4, %xmm0, %xmm0
-; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
-; AVX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX2-LABEL: combine_vec_lshr_lzcnt_bit0:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrld $4, %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
+; AVX2-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: combine_vec_lshr_lzcnt_bit0:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrld $4, %xmm0, %xmm0
+; AVX512-NEXT:    vpandnd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT:    retq
   %1 = and <4 x i32> %x, <i32 16, i32 16, i32 16, i32 16>
   %2 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %1, i1 0)
   %3 = lshr <4 x i32> %2, <i32 5, i32 5, i32 5, i32 5>
@@ -373,25 +400,32 @@ define <4 x i32> @combine_vec_lshr_lzcnt_bit1(<4 x i32> %x) {
 ; SSE-NEXT:    psrld $5, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: combine_vec_lshr_lzcnt_bit1:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT:    vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
-; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX-NEXT:    vpcmpeqb %xmm3, %xmm0, %xmm4
-; AVX-NEXT:    vpand %xmm4, %xmm2, %xmm2
-; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
-; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
-; AVX-NEXT:    vpsrld $16, %xmm0, %xmm0
-; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpsrld $5, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX2-LABEL: combine_vec_lshr_lzcnt_bit1:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
+; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpeqb %xmm3, %xmm0, %xmm4
+; AVX2-NEXT:    vpand %xmm4, %xmm2, %xmm2
+; AVX2-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
+; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrld $5, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: combine_vec_lshr_lzcnt_bit1:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vplzcntd %xmm0, %xmm0
+; AVX512-NEXT:    vpsrld $5, %xmm0, %xmm0
+; AVX512-NEXT:    retq
   %1 = and <4 x i32> %x, <i32 4, i32 32, i32 64, i32 128>
   %2 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %1, i1 0)
   %3 = lshr <4 x i32> %2, <i32 5, i32 5, i32 5, i32 5>
@@ -448,6 +482,14 @@ define <4 x i32> @combine_vec_lshr_trunc_and(<4 x i32> %x, <4 x i64> %y) {
 ; AVX2-FAST-PERLANE-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
 ; AVX2-FAST-PERLANE-NEXT:    vzeroupper
 ; AVX2-FAST-PERLANE-NEXT:    retq
+;
+; AVX512-LABEL: combine_vec_lshr_trunc_and:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovqd %ymm1, %xmm1
+; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %1 = and <4 x i64> %y, <i64 15, i64 255, i64 4095, i64 65535>
   %2 = trunc <4 x i64> %1 to <4 x i32>
   %3 = lshr <4 x i32> %x, %2

From 6cd68c2f87832ef39eb502a20d358b4c7fa37b9e Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Mon, 11 Mar 2024 16:25:05 +0000
Subject: [PATCH 08/95] [X86] Add base SSE2 coverage to SRL/SRA combines tests

---
 llvm/test/CodeGen/X86/combine-sra.ll | 270 +++++++++++++++++---------
 llvm/test/CodeGen/X86/combine-srl.ll | 275 +++++++++++++++++++--------
 2 files changed, 378 insertions(+), 167 deletions(-)

diff --git a/llvm/test/CodeGen/X86/combine-sra.ll b/llvm/test/CodeGen/X86/combine-sra.ll
index 0aac99457d7de..0675ced68d7a7 100644
--- a/llvm/test/CodeGen/X86/combine-sra.ll
+++ b/llvm/test/CodeGen/X86/combine-sra.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-SLOW
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST-ALL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST-PERLANE
@@ -86,19 +87,33 @@ define <4 x i32> @combine_vec_ashr_ashr0(<4 x i32> %x) {
 }
 
 define <4 x i32> @combine_vec_ashr_ashr1(<4 x i32> %x) {
-; SSE-LABEL: combine_vec_ashr_ashr1:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    psrad $10, %xmm1
-; SSE-NEXT:    movdqa %xmm0, %xmm2
-; SSE-NEXT:    psrad $6, %xmm2
-; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    psrad $8, %xmm1
-; SSE-NEXT:    psrad $4, %xmm0
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; SSE-NEXT:    retq
+; SSE2-LABEL: combine_vec_ashr_ashr1:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrad $10, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psrad $8, %xmm2
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrad $6, %xmm1
+; SSE2-NEXT:    psrad $4, %xmm0
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: combine_vec_ashr_ashr1:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrad $10, %xmm1
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psrad $6, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrad $8, %xmm1
+; SSE41-NEXT:    psrad $4, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_ashr_ashr1:
 ; AVX:       # %bb.0:
@@ -125,16 +140,30 @@ define <4 x i32> @combine_vec_ashr_ashr2(<4 x i32> %x) {
 }
 
 define <4 x i32> @combine_vec_ashr_ashr3(<4 x i32> %x) {
-; SSE-LABEL: combine_vec_ashr_ashr3:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    psrad $27, %xmm1
-; SSE-NEXT:    movdqa %xmm0, %xmm2
-; SSE-NEXT:    psrad $15, %xmm2
-; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; SSE-NEXT:    psrad $31, %xmm0
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; SSE-NEXT:    retq
+; SSE2-LABEL: combine_vec_ashr_ashr3:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psrad $27, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
+; SSE2-NEXT:    psrad $15, %xmm0
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: combine_vec_ashr_ashr3:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrad $27, %xmm1
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psrad $15, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    psrad $31, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_ashr_ashr3:
 ; AVX:       # %bb.0:
@@ -147,26 +176,48 @@ define <4 x i32> @combine_vec_ashr_ashr3(<4 x i32> %x) {
 
 ; fold (sra x, (trunc (and y, c))) -> (sra x, (and (trunc y), (trunc c))).
 define <4 x i32> @combine_vec_ashr_trunc_and(<4 x i32> %x, <4 x i64> %y) {
-; SSE-LABEL: combine_vec_ashr_trunc_and:
-; SSE:       # %bb.0:
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
-; SSE-NEXT:    movdqa %xmm0, %xmm3
-; SSE-NEXT:    psrad %xmm2, %xmm3
-; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
-; SSE-NEXT:    movdqa %xmm0, %xmm5
-; SSE-NEXT:    psrad %xmm4, %xmm5
-; SSE-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
-; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
-; SSE-NEXT:    movdqa %xmm0, %xmm3
-; SSE-NEXT:    psrad %xmm1, %xmm3
-; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
-; SSE-NEXT:    psrad %xmm1, %xmm0
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
-; SSE-NEXT:    retq
+; SSE2-LABEL: combine_vec_ashr_trunc_and:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; SSE2-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    psrad %xmm2, %xmm3
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psrad %xmm4, %xmm2
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    psrad %xmm3, %xmm4
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; SSE2-NEXT:    psrad %xmm1, %xmm0
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: combine_vec_ashr_trunc_and:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; SSE41-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    psrad %xmm2, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm5
+; SSE41-NEXT:    psrad %xmm4, %xmm5
+; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    psrad %xmm1, %xmm3
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
+; SSE41-NEXT:    psrad %xmm1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
+; SSE41-NEXT:    retq
 ;
 ; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_and:
 ; AVX2-SLOW:       # %bb.0:
@@ -211,17 +262,31 @@ define <4 x i32> @combine_vec_ashr_trunc_and(<4 x i32> %x, <4 x i64> %y) {
 ; fold (sra (trunc (srl x, c1)), c2) -> (trunc (sra x, c1 + c2))
 ;      if c1 is equal to the number of bits the trunc removes
 define <4 x i32> @combine_vec_ashr_trunc_lshr(<4 x i64> %x) {
-; SSE-LABEL: combine_vec_ashr_trunc_lshr:
-; SSE:       # %bb.0:
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; SSE-NEXT:    movaps %xmm0, %xmm2
-; SSE-NEXT:    psrad $2, %xmm2
-; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7]
-; SSE-NEXT:    psrad $1, %xmm0
-; SSE-NEXT:    psrad $3, %xmm1
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
-; SSE-NEXT:    retq
+; SSE2-LABEL: combine_vec_ashr_trunc_lshr:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    psrad $3, %xmm1
+; SSE2-NEXT:    movaps %xmm0, %xmm2
+; SSE2-NEXT:    psrad $2, %xmm2
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    psrad $1, %xmm1
+; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: combine_vec_ashr_trunc_lshr:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; SSE41-NEXT:    movaps %xmm0, %xmm2
+; SSE41-NEXT:    psrad $2, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT:    psrad $1, %xmm0
+; SSE41-NEXT:    psrad $3, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
+; SSE41-NEXT:    retq
 ;
 ; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_lshr:
 ; AVX2-SLOW:       # %bb.0:
@@ -298,17 +363,31 @@ define <16 x i8> @combine_vec_ashr_trunc_lshr_splat(<16 x i32> %x) {
 ; fold (sra (trunc (sra x, c1)), c2) -> (trunc (sra x, c1 + c2))
 ;      if c1 is equal to the number of bits the trunc removes
 define <4 x i32> @combine_vec_ashr_trunc_ashr(<4 x i64> %x) {
-; SSE-LABEL: combine_vec_ashr_trunc_ashr:
-; SSE:       # %bb.0:
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; SSE-NEXT:    movaps %xmm0, %xmm2
-; SSE-NEXT:    psrad $2, %xmm2
-; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7]
-; SSE-NEXT:    psrad $1, %xmm0
-; SSE-NEXT:    psrad $3, %xmm1
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
-; SSE-NEXT:    retq
+; SSE2-LABEL: combine_vec_ashr_trunc_ashr:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    psrad $3, %xmm1
+; SSE2-NEXT:    movaps %xmm0, %xmm2
+; SSE2-NEXT:    psrad $2, %xmm2
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    psrad $1, %xmm1
+; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: combine_vec_ashr_trunc_ashr:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; SSE41-NEXT:    movaps %xmm0, %xmm2
+; SSE41-NEXT:    psrad $2, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT:    psrad $1, %xmm0
+; SSE41-NEXT:    psrad $3, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
+; SSE41-NEXT:    retq
 ;
 ; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_ashr:
 ; AVX2-SLOW:       # %bb.0:
@@ -377,25 +456,46 @@ define <8 x i16> @combine_vec_ashr_trunc_ashr_splat(<8 x i32> %x) {
 
 ; If the sign bit is known to be zero, switch this to a SRL.
 define <4 x i32> @combine_vec_ashr_positive(<4 x i32> %x, <4 x i32> %y) {
-; SSE-LABEL: combine_vec_ashr_positive:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
-; SSE-NEXT:    movdqa %xmm0, %xmm3
-; SSE-NEXT:    psrld %xmm2, %xmm3
-; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
-; SSE-NEXT:    movdqa %xmm0, %xmm5
-; SSE-NEXT:    psrld %xmm4, %xmm5
-; SSE-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
-; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
-; SSE-NEXT:    movdqa %xmm0, %xmm3
-; SSE-NEXT:    psrld %xmm1, %xmm3
-; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
-; SSE-NEXT:    psrld %xmm1, %xmm0
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
-; SSE-NEXT:    retq
+; SSE2-LABEL: combine_vec_ashr_positive:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    psrld %xmm2, %xmm3
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psrld %xmm4, %xmm2
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    psrld %xmm3, %xmm4
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; SSE2-NEXT:    psrld %xmm1, %xmm0
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: combine_vec_ashr_positive:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    psrld %xmm2, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm5
+; SSE41-NEXT:    psrld %xmm4, %xmm5
+; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    psrld %xmm1, %xmm3
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
+; SSE41-NEXT:    psrld %xmm1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_ashr_positive:
 ; AVX:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/combine-srl.ll b/llvm/test/CodeGen/X86/combine-srl.ll
index 79c86a6b012e9..33649e6d87b91 100644
--- a/llvm/test/CodeGen/X86/combine-srl.ll
+++ b/llvm/test/CodeGen/X86/combine-srl.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-SLOW
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST-ALL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST-PERLANE
@@ -102,19 +103,33 @@ define <4 x i32> @combine_vec_lshr_lshr0(<4 x i32> %x) {
 }
 
 define <4 x i32> @combine_vec_lshr_lshr1(<4 x i32> %x) {
-; SSE-LABEL: combine_vec_lshr_lshr1:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    psrld $10, %xmm1
-; SSE-NEXT:    movdqa %xmm0, %xmm2
-; SSE-NEXT:    psrld $6, %xmm2
-; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    psrld $8, %xmm1
-; SSE-NEXT:    psrld $4, %xmm0
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; SSE-NEXT:    retq
+; SSE2-LABEL: combine_vec_lshr_lshr1:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $10, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psrld $8, %xmm2
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $6, %xmm1
+; SSE2-NEXT:    psrld $4, %xmm0
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: combine_vec_lshr_lshr1:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrld $10, %xmm1
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psrld $6, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrld $8, %xmm1
+; SSE41-NEXT:    psrld $4, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_lshr_lshr1:
 ; AVX:       # %bb.0:
@@ -158,12 +173,19 @@ define <4 x i32> @combine_vec_lshr_lshr_zero1(<4 x i32> %x) {
 
 ; fold (srl (trunc (srl x, c1)), c2) -> (trunc (srl x, (add c1, c2)))
 define <4 x i32> @combine_vec_lshr_trunc_lshr0(<4 x i64> %x) {
-; SSE-LABEL: combine_vec_lshr_trunc_lshr0:
-; SSE:       # %bb.0:
-; SSE-NEXT:    psrlq $48, %xmm1
-; SSE-NEXT:    psrlq $48, %xmm0
-; SSE-NEXT:    packusdw %xmm1, %xmm0
-; SSE-NEXT:    retq
+; SSE2-LABEL: combine_vec_lshr_trunc_lshr0:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    psrlq $48, %xmm1
+; SSE2-NEXT:    psrlq $48, %xmm0
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: combine_vec_lshr_trunc_lshr0:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    psrlq $48, %xmm1
+; SSE41-NEXT:    psrlq $48, %xmm0
+; SSE41-NEXT:    packusdw %xmm1, %xmm0
+; SSE41-NEXT:    retq
 ;
 ; AVX2-SLOW-LABEL: combine_vec_lshr_trunc_lshr0:
 ; AVX2-SLOW:       # %bb.0:
@@ -203,27 +225,50 @@ define <4 x i32> @combine_vec_lshr_trunc_lshr0(<4 x i64> %x) {
 }
 
 define <4 x i32> @combine_vec_lshr_trunc_lshr1(<4 x i64> %x) {
-; SSE-LABEL: combine_vec_lshr_trunc_lshr1:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa %xmm1, %xmm2
-; SSE-NEXT:    psrlq $35, %xmm2
-; SSE-NEXT:    psrlq $34, %xmm1
-; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
-; SSE-NEXT:    movdqa %xmm0, %xmm2
-; SSE-NEXT:    psrlq $33, %xmm2
-; SSE-NEXT:    psrlq $32, %xmm0
-; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7]
-; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
-; SSE-NEXT:    movaps %xmm2, %xmm1
-; SSE-NEXT:    psrld $19, %xmm1
-; SSE-NEXT:    movaps %xmm2, %xmm3
-; SSE-NEXT:    psrld $17, %xmm3
-; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
-; SSE-NEXT:    psrld $18, %xmm2
-; SSE-NEXT:    psrld $16, %xmm0
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
-; SSE-NEXT:    retq
+; SSE2-LABEL: combine_vec_lshr_trunc_lshr1:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psrlq $34, %xmm2
+; SSE2-NEXT:    psrlq $35, %xmm1
+; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psrlq $32, %xmm2
+; SSE2-NEXT:    psrlq $33, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    psrld $19, %xmm1
+; SSE2-NEXT:    movaps %xmm0, %xmm3
+; SSE2-NEXT:    psrld $18, %xmm3
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm1[1]
+; SSE2-NEXT:    psrld $17, %xmm0
+; SSE2-NEXT:    psrld $16, %xmm2
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm3[0,3]
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: combine_vec_lshr_trunc_lshr1:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psrlq $35, %xmm2
+; SSE41-NEXT:    psrlq $34, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psrlq $33, %xmm2
+; SSE41-NEXT:    psrlq $32, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
+; SSE41-NEXT:    movaps %xmm2, %xmm1
+; SSE41-NEXT:    psrld $19, %xmm1
+; SSE41-NEXT:    movaps %xmm2, %xmm3
+; SSE41-NEXT:    psrld $17, %xmm3
+; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    psrld $18, %xmm2
+; SSE41-NEXT:    psrld $16, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
+; SSE41-NEXT:    retq
 ;
 ; AVX2-SLOW-LABEL: combine_vec_lshr_trunc_lshr1:
 ; AVX2-SLOW:       # %bb.0:
@@ -378,27 +423,71 @@ define <4 x i32> @combine_vec_lshr_lzcnt_bit0(<4 x i32> %x) {
 }
 
 define <4 x i32> @combine_vec_lshr_lzcnt_bit1(<4 x i32> %x) {
-; SSE-LABEL: combine_vec_lshr_lzcnt_bit1:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT:    movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; SSE-NEXT:    movdqa %xmm1, %xmm2
-; SSE-NEXT:    pshufb %xmm0, %xmm2
-; SSE-NEXT:    psrlw $4, %xmm0
-; SSE-NEXT:    pxor %xmm3, %xmm3
-; SSE-NEXT:    pshufb %xmm0, %xmm1
-; SSE-NEXT:    pcmpeqb %xmm3, %xmm0
-; SSE-NEXT:    pand %xmm2, %xmm0
-; SSE-NEXT:    paddb %xmm1, %xmm0
-; SSE-NEXT:    pmovzxbw {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    psrlw $8, %xmm0
-; SSE-NEXT:    paddw %xmm1, %xmm0
-; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
-; SSE-NEXT:    psrld $16, %xmm0
-; SSE-NEXT:    paddd %xmm3, %xmm0
-; SSE-NEXT:    psrld $5, %xmm0
-; SSE-NEXT:    retq
+; SSE2-LABEL: combine_vec_lshr_lzcnt_bit1:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $1, %xmm1
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $2, %xmm1
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $4, %xmm1
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $8, %xmm1
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $1, %xmm1
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT:    psubb %xmm1, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    psrlw $2, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    paddb %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $4, %xmm1
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    psadbw %xmm1, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    psrld $5, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: combine_vec_lshr_lzcnt_bit1:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    pshufb %xmm0, %xmm2
+; SSE41-NEXT:    psrlw $4, %xmm0
+; SSE41-NEXT:    pxor %xmm3, %xmm3
+; SSE41-NEXT:    pshufb %xmm0, %xmm1
+; SSE41-NEXT:    pcmpeqb %xmm3, %xmm0
+; SSE41-NEXT:    pand %xmm2, %xmm0
+; SSE41-NEXT:    paddb %xmm1, %xmm0
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT:    pand %xmm0, %xmm1
+; SSE41-NEXT:    psrlw $8, %xmm0
+; SSE41-NEXT:    paddw %xmm1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
+; SSE41-NEXT:    psrld $16, %xmm0
+; SSE41-NEXT:    paddd %xmm3, %xmm0
+; SSE41-NEXT:    psrld $5, %xmm0
+; SSE41-NEXT:    retq
 ;
 ; AVX2-LABEL: combine_vec_lshr_lzcnt_bit1:
 ; AVX2:       # %bb.0:
@@ -435,26 +524,48 @@ declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1)
 
 ; fold (srl x, (trunc (and y, c))) -> (srl x, (and (trunc y), (trunc c))).
 define <4 x i32> @combine_vec_lshr_trunc_and(<4 x i32> %x, <4 x i64> %y) {
-; SSE-LABEL: combine_vec_lshr_trunc_and:
-; SSE:       # %bb.0:
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
-; SSE-NEXT:    movdqa %xmm0, %xmm3
-; SSE-NEXT:    psrld %xmm2, %xmm3
-; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
-; SSE-NEXT:    movdqa %xmm0, %xmm5
-; SSE-NEXT:    psrld %xmm4, %xmm5
-; SSE-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
-; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
-; SSE-NEXT:    movdqa %xmm0, %xmm3
-; SSE-NEXT:    psrld %xmm1, %xmm3
-; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
-; SSE-NEXT:    psrld %xmm1, %xmm0
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
-; SSE-NEXT:    retq
+; SSE2-LABEL: combine_vec_lshr_trunc_and:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; SSE2-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    psrld %xmm2, %xmm3
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psrld %xmm4, %xmm2
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    psrld %xmm3, %xmm4
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; SSE2-NEXT:    psrld %xmm1, %xmm0
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: combine_vec_lshr_trunc_and:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; SSE41-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    psrld %xmm2, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm5
+; SSE41-NEXT:    psrld %xmm4, %xmm5
+; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    psrld %xmm1, %xmm3
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
+; SSE41-NEXT:    psrld %xmm1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
+; SSE41-NEXT:    retq
 ;
 ; AVX2-SLOW-LABEL: combine_vec_lshr_trunc_and:
 ; AVX2-SLOW:       # %bb.0:

From 81e20472a0c5a4a8edc5ec38dc345d580681af81 Mon Sep 17 00:00:00 2001
From: Mark de Wever <koraq@xs4all.nl>
Date: Mon, 11 Mar 2024 17:43:14 +0100
Subject: [PATCH 09/95] [cmake] Exposes LLVM version number in the runtimes.
 (#84641)

This allows sharing the LLVM version number in libc++.
---
 cmake/Modules/LLVMVersion.cmake | 15 +++++++++++++++
 llvm/CMakeLists.txt             | 13 +------------
 runtimes/CMakeLists.txt         |  2 ++
 3 files changed, 18 insertions(+), 12 deletions(-)
 create mode 100644 cmake/Modules/LLVMVersion.cmake

diff --git a/cmake/Modules/LLVMVersion.cmake b/cmake/Modules/LLVMVersion.cmake
new file mode 100644
index 0000000000000..5e28283fbc1c6
--- /dev/null
+++ b/cmake/Modules/LLVMVersion.cmake
@@ -0,0 +1,15 @@
+# The LLVM Version number information
+
+if(NOT DEFINED LLVM_VERSION_MAJOR)
+  set(LLVM_VERSION_MAJOR 19)
+endif()
+if(NOT DEFINED LLVM_VERSION_MINOR)
+  set(LLVM_VERSION_MINOR 0)
+endif()
+if(NOT DEFINED LLVM_VERSION_PATCH)
+  set(LLVM_VERSION_PATCH 0)
+endif()
+if(NOT DEFINED LLVM_VERSION_SUFFIX)
+  set(LLVM_VERSION_SUFFIX git)
+endif()
+
diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index 494d8abeb64d2..d9a17a869acfa 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -15,18 +15,7 @@ if(NOT LLVM_NO_INSTALL_NAME_DIR_FOR_BUILD_TREE)
   set(CMAKE_BUILD_WITH_INSTALL_NAME_DIR ON)
 endif()
 
-if(NOT DEFINED LLVM_VERSION_MAJOR)
-  set(LLVM_VERSION_MAJOR 19)
-endif()
-if(NOT DEFINED LLVM_VERSION_MINOR)
-  set(LLVM_VERSION_MINOR 0)
-endif()
-if(NOT DEFINED LLVM_VERSION_PATCH)
-  set(LLVM_VERSION_PATCH 0)
-endif()
-if(NOT DEFINED LLVM_VERSION_SUFFIX)
-  set(LLVM_VERSION_SUFFIX git)
-endif()
+include(${LLVM_COMMON_CMAKE_UTILS}/Modules/LLVMVersion.cmake)
 
 set_directory_properties(PROPERTIES LLVM_VERSION_MAJOR "${LLVM_VERSION_MAJOR}")
 
diff --git a/runtimes/CMakeLists.txt b/runtimes/CMakeLists.txt
index 29b47b862c219..6f24fbcccec95 100644
--- a/runtimes/CMakeLists.txt
+++ b/runtimes/CMakeLists.txt
@@ -6,6 +6,8 @@ set(LLVM_COMMON_CMAKE_UTILS "${CMAKE_CURRENT_SOURCE_DIR}/../cmake")
 include(${LLVM_COMMON_CMAKE_UTILS}/Modules/CMakePolicy.cmake
   NO_POLICY_SCOPE)
 
+include(${LLVM_COMMON_CMAKE_UTILS}/Modules/LLVMVersion.cmake)
+
 project(Runtimes C CXX ASM)
 
 list(INSERT CMAKE_MODULE_PATH 0

From 9a9aa41dea83039154601082b1aa2c56e35a5a17 Mon Sep 17 00:00:00 2001
From: Mark de Wever <koraq@xs4all.nl>
Date: Mon, 11 Mar 2024 17:45:48 +0100
Subject: [PATCH 10/95] [LLDB][doc] Updates build instructions. (#84630)

Recently building libc++ requires building libunwind too. This updates
the LLDB instructions.

I noticed this recently and it was separately filed as
https://github.com/llvm/llvm-project/issues/84053
---
 lldb/docs/resources/build.rst | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/lldb/docs/resources/build.rst b/lldb/docs/resources/build.rst
index 995273a97b653..09d3d15a94083 100644
--- a/lldb/docs/resources/build.rst
+++ b/lldb/docs/resources/build.rst
@@ -331,7 +331,7 @@ macOS
 ^^^^^
 
 On macOS the LLDB test suite requires libc++. Either add
-``LLVM_ENABLE_RUNTIMES="libcxx;libcxxabi"`` or disable the test suite with
+``LLVM_ENABLE_RUNTIMES="libcxx;libcxxabi;libunwind"`` or disable the test suite with
 ``LLDB_INCLUDE_TESTS=OFF``. Further useful options:
 
 * ``LLDB_BUILD_FRAMEWORK:BOOL``: Builds the LLDB.framework.
@@ -370,7 +370,7 @@ LLVM <https://llvm.org/docs/BuildingADistribution.html>`_):
   $ cmake -B /path/to/lldb-build -G Ninja \
           -C /path/to/llvm-project/lldb/cmake/caches/Apple-lldb-macOS.cmake \
           -DLLVM_ENABLE_PROJECTS="clang;lldb" \
-          -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi" \
+          -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi;libunwind" \
           llvm-project/llvm
 
   $ DESTDIR=/path/to/lldb-install ninja -C /path/to/lldb-build check-lldb install-distribution
@@ -386,7 +386,7 @@ Build LLDB standalone for development with Xcode:
   $ cmake -B /path/to/llvm-build -G Ninja \
           -C /path/to/llvm-project/lldb/cmake/caches/Apple-lldb-base.cmake \
           -DLLVM_ENABLE_PROJECTS="clang" \
-          -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi" \
+          -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi;libunwind" \
           llvm-project/llvm
   $ ninja -C /path/to/llvm-build
 

From 501bc101c04675969ab673b247f2a58fa72bd09e Mon Sep 17 00:00:00 2001
From: karzan <61278770+karzanWang@users.noreply.github.com>
Date: Tue, 12 Mar 2024 01:07:12 +0800
Subject: [PATCH 11/95] [lldb] Save the edited line before clearing it in
 Editline::PrintAsync (#84154)

If the `m_editor_status` is `EditorStatus::Editing`, PrintAsync clears
the currently edited line. In some situations, the edited line is not
saved. After the stream flushes, PrintAsync tries to display the unsaved
line, causing the loss of the edited line.

The issue arose while I was debugging REPRLRun in
[Fuzzilli](https://github.com/googleprojectzero/fuzzilli). I started
LLDB and attempted to set a breakpoint in libreprl-posix.c. I entered
`breakpoint set -f lib` and used the "tab" key for command completion.
After completion, the edited line was flushed, leaving a blank line.
---
 lldb/source/Host/common/Editline.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lldb/source/Host/common/Editline.cpp b/lldb/source/Host/common/Editline.cpp
index e66271e8a6ee9..ed61aecc23b9b 100644
--- a/lldb/source/Host/common/Editline.cpp
+++ b/lldb/source/Host/common/Editline.cpp
@@ -1597,6 +1597,7 @@ bool Editline::GetLines(int first_line_number, StringList &lines,
 void Editline::PrintAsync(Stream *stream, const char *s, size_t len) {
   std::lock_guard<std::recursive_mutex> guard(m_output_mutex);
   if (m_editor_status == EditorStatus::Editing) {
+    SaveEditedLine();
     MoveCursor(CursorLocation::EditingCursor, CursorLocation::BlockStart);
     fprintf(m_output_file, ANSI_CLEAR_BELOW);
   }

From 07d7b9c255078edc6f04bd4e68416bdf3e8735ab Mon Sep 17 00:00:00 2001
From: Guillaume Chatelet <gchatelet@google.com>
Date: Mon, 11 Mar 2024 18:17:18 +0100
Subject: [PATCH 12/95] [libc] Fix forward arm32 buildbot (#84794)

Introduced by https://github.com/llvm/llvm-project/pull/83441.
---
 libc/test/src/string/memory_utils/CMakeLists.txt | 1 +
 libc/test/src/string/memory_utils/op_tests.cpp   | 1 +
 2 files changed, 2 insertions(+)

diff --git a/libc/test/src/string/memory_utils/CMakeLists.txt b/libc/test/src/string/memory_utils/CMakeLists.txt
index 567f85e37bfab..a0dddd2f97b58 100644
--- a/libc/test/src/string/memory_utils/CMakeLists.txt
+++ b/libc/test/src/string/memory_utils/CMakeLists.txt
@@ -12,6 +12,7 @@ add_libc_test(
     libc.src.__support.CPP.array
     libc.src.__support.CPP.cstddef
     libc.src.__support.CPP.span
+    libc.src.__support.macros.properties.types
     libc.src.__support.macros.sanitizer
     libc.src.string.memory_utils.memory_utils
   UNIT_TEST_ONLY
diff --git a/libc/test/src/string/memory_utils/op_tests.cpp b/libc/test/src/string/memory_utils/op_tests.cpp
index 95a04755eb4db..703a26b16b03f 100644
--- a/libc/test/src/string/memory_utils/op_tests.cpp
+++ b/libc/test/src/string/memory_utils/op_tests.cpp
@@ -10,6 +10,7 @@
 #include "src/__support/macros/properties/types.h" // LIBC_TYPES_HAS_INT64
 #include "src/string/memory_utils/op_aarch64.h"
 #include "src/string/memory_utils/op_builtin.h"
+#include "src/string/memory_utils/op_generic.h"
 #include "src/string/memory_utils/op_riscv.h"
 #include "src/string/memory_utils/op_x86.h"
 #include "test/UnitTest/Test.h"

From bdbad0d07bb600301cb324e87a6be37ca4af591a Mon Sep 17 00:00:00 2001
From: Jason Molenda <jmolenda@apple.com>
Date: Mon, 11 Mar 2024 10:21:07 -0700
Subject: [PATCH 13/95] Turn off instruction flow control annotations by
 default (#84607)

Walter Erquinigo added optional instruction annotations for x86
instructions in 2022 for the `thread trace dump instruction` command,
and code to DisassemblerLLVMC to add annotations for instructions that
change flow control, v. https://reviews.llvm.org/D128477

This was added as an option to `disassemble`, and the trace dump command
enables it by default, but several other instruction dumpers were
changed to display them by default as well. These are only implemented
for Intel instructions, so our disassembly on other targets ends up
looking like

```
(lldb) x/5i 0x1000086e4
0x1000086e4: 0xa9be6ffc   unknown     stp    x28, x27, [sp, #-0x20]!
0x1000086e8: 0xa9017bfd   unknown     stp    x29, x30, [sp, #0x10]
0x1000086ec: 0x910043fd   unknown     add    x29, sp, #0x10
0x1000086f0: 0xd11843ff   unknown     sub    sp, sp, #0x610
0x1000086f4: 0x910c63e8   unknown     add    x8, sp, #0x318
```

instead of `disassemble`'s output style of

```
lldb`main:
lldb[0x1000086e4] <+0>:  stp    x28, x27, [sp, #-0x20]!
lldb[0x1000086e8] <+4>:  stp    x29, x30, [sp, #0x10]
lldb[0x1000086ec] <+8>:  add    x29, sp, #0x10
lldb[0x1000086f0] <+12>: sub    sp, sp, #0x610
lldb[0x1000086f4] <+16>: add    x8, sp, #0x318
```

Adding symbolic annotations for assembly instructions is something I'm
interested in too, because we may have users investigating a crash or
apparent-incorrect behavior who must debug optimized assembly and they
may not be familiar with the ISA they're using, so short of flipping
through a many-thousand-page PDF to understand each instruction, they're
lost. They don't write assembly or work at that level, but to understand
a bug, they have to understand what the instructions are actually doing.

But the annotations that exist today don't move us forward much on that
front - I'd argue that the flow control instructions on Intel are not
hard to understand from their names, but that might just be my personal
bias. Much trickier instructions exist in any event.

Displaying this information by default for all targets when we only have
one class of instructions on one target is not a good default.

Also, in 2011 when Greg implemented the `memory read -f i` (aka `x/i`)
command
```
commit 5009f9d5010a7e34ae15f962dac8505ea11a8716
Author: Greg Clayton <gclayton@apple.com>
Date:   Thu Oct 27 17:55:14 2011 +0000
[...]
    eFormatInstruction will print out disassembly with bytes and it will use the
    current target's architecture. The format character for this is "i" (which
    used to be being used for the integer format, but the integer format also has
    "d", so we gave the "i" format to disassembly), the long format is
    "instruction".
```

he had DumpDataExtractor's DumpInstructions print the bytes of the
instruction -- that's the first field we see above for the `x/5i` after
the address -- and this is only useful for people who are debugging the
disassembler itself, I would argue. I don't want this displayed by
default either.

tl;dr this patch removes both fields from `memory read -f i` and I
think this is the right call today. While I'm really interested in
instruction annotation, I don't think `x/i` is the right place to have
it enabled by default unless it's really compelling on at least some of
our major targets.
---
 lldb/source/Core/DumpDataExtractor.cpp                        | 4 ++--
 lldb/source/Expression/IRExecutionUnit.cpp                    | 2 +-
 .../InstEmulation/UnwindAssemblyInstEmulation.cpp             | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/lldb/source/Core/DumpDataExtractor.cpp b/lldb/source/Core/DumpDataExtractor.cpp
index 986c9a181919e..826edd7bab046 100644
--- a/lldb/source/Core/DumpDataExtractor.cpp
+++ b/lldb/source/Core/DumpDataExtractor.cpp
@@ -150,8 +150,8 @@ static lldb::offset_t DumpInstructions(const DataExtractor &DE, Stream *s,
       if (bytes_consumed) {
         offset += bytes_consumed;
         const bool show_address = base_addr != LLDB_INVALID_ADDRESS;
-        const bool show_bytes = true;
-        const bool show_control_flow_kind = true;
+        const bool show_bytes = false;
+        const bool show_control_flow_kind = false;
         ExecutionContext exe_ctx;
         exe_scope->CalculateExecutionContext(exe_ctx);
         disassembler_sp->GetInstructionList().Dump(
diff --git a/lldb/source/Expression/IRExecutionUnit.cpp b/lldb/source/Expression/IRExecutionUnit.cpp
index 0682746e448e3..e4e131d70d431 100644
--- a/lldb/source/Expression/IRExecutionUnit.cpp
+++ b/lldb/source/Expression/IRExecutionUnit.cpp
@@ -201,7 +201,7 @@ Status IRExecutionUnit::DisassembleFunction(Stream &stream,
                                       UINT32_MAX, false, false);
 
   InstructionList &instruction_list = disassembler_sp->GetInstructionList();
-  instruction_list.Dump(&stream, true, true, /*show_control_flow_kind=*/true,
+  instruction_list.Dump(&stream, true, true, /*show_control_flow_kind=*/false,
                         &exe_ctx);
 
   return ret;
diff --git a/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp b/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp
index 7ff5cd2c23b07..c4a171ec7d01b 100644
--- a/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp
+++ b/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp
@@ -83,7 +83,7 @@ bool UnwindAssemblyInstEmulation::GetNonCallSiteUnwindPlanFromAssembly(
       const uint32_t addr_byte_size = m_arch.GetAddressByteSize();
       const bool show_address = true;
       const bool show_bytes = true;
-      const bool show_control_flow_kind = true;
+      const bool show_control_flow_kind = false;
       m_cfa_reg_info = *m_inst_emulator_up->GetRegisterInfo(
           unwind_plan.GetRegisterKind(), unwind_plan.GetInitialCFARegister());
       m_fp_is_cfa = false;

From 36a2752923a76f0b747bc35b7cd1bd1d1bf5bf05 Mon Sep 17 00:00:00 2001
From: Benjamin Kramer <benny.kra@googlemail.com>
Date: Mon, 11 Mar 2024 18:21:19 +0100
Subject: [PATCH 14/95] [bazel] Grab correct version info after
 81e20472a0c5a4a8edc5ec38dc345d580681af81

This is a bit awkward.
---
 utils/bazel/configure.bzl | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/utils/bazel/configure.bzl b/utils/bazel/configure.bzl
index 88a576548e169..d6cd6aa0813e4 100644
--- a/utils/bazel/configure.bzl
+++ b/utils/bazel/configure.bzl
@@ -149,6 +149,14 @@ def _llvm_configure_impl(repository_ctx):
         llvm_cmake,
     )
 
+    # Grab version info and merge it with the other vars
+    version = _extract_cmake_settings(
+        repository_ctx,
+        "cmake/Modules/LLVMVersion.cmake",
+    )
+    version = {k: v for k, v in version.items() if v != None}
+    vars.update(version)
+
     _write_dict_to_file(
         repository_ctx,
         filepath = "vars.bzl",

From 866ac9a165d65606910987c119ebee6a85480192 Mon Sep 17 00:00:00 2001
From: annamthomas <anna@azul.com>
Date: Mon, 11 Mar 2024 13:23:00 -0400
Subject: [PATCH 15/95] [LV] Address postcommit review for PR84782 (#84797)

This testcase was added to show miscompile in
https://github.com/llvm/llvm-project/issues/81872
---
 .../Transforms/LoopVectorize/X86/pr81872.ll   | 34 +++++++++----------
 1 file changed, 16 insertions(+), 18 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll b/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll
index c6b1944b20090..14acb6f57aa0c 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll
@@ -3,15 +3,13 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-@global = external global ptr addrspace(1), align 8
-
 ; PR 81872 explains the issue.
 
 ; If we vectorize, we have a miscompile where array IV and thereby value stored in (arr[99],
 ; arr[98]) is calculated incorrectly since disjoint or was only disjoint because
 ; of dominating conditions. Dropping the disjoint to avoid poison still changes
 ; the behaviour since now the or is no longer equivalent to the add.
-; Function Attrs: uwtable
+;
 define void @test(ptr noundef align 8 dereferenceable_or_null(16) %arr) #0 {
 ; CHECK-LABEL: define void @test(
 ; CHECK-SAME: ptr noundef align 8 dereferenceable_or_null(16) [[ARR:%.*]]) #[[ATTR0:[0-9]+]] {
@@ -45,43 +43,43 @@ define void @test(ptr noundef align 8 dereferenceable_or_null(16) %arr) #0 {
 ; CHECK-NEXT:    br i1 true, label [[BB6:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 87, [[MIDDLE_BLOCK]] ], [ 99, [[BB5:%.*]] ]
-; CHECK-NEXT:    br label [[BB15:%.*]]
-; CHECK:       bb15:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[BB20:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
+; CHECK:       loop.header:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
 ; CHECK-NEXT:    [[AND:%.*]] = and i64 [[IV]], 1
 ; CHECK-NEXT:    [[ICMP17:%.*]] = icmp eq i64 [[AND]], 0
-; CHECK-NEXT:    br i1 [[ICMP17]], label [[BB18:%.*]], label [[BB20]], !prof [[PROF5:![0-9]+]]
+; CHECK-NEXT:    br i1 [[ICMP17]], label [[BB18:%.*]], label [[LOOP_LATCH]], !prof [[PROF5:![0-9]+]]
 ; CHECK:       bb18:
 ; CHECK-NEXT:    [[OR:%.*]] = or disjoint i64 [[IV]], 1
 ; CHECK-NEXT:    [[GETELEMENTPTR19:%.*]] = getelementptr inbounds i64, ptr [[ARR]], i64 [[OR]]
 ; CHECK-NEXT:    store i64 1, ptr [[GETELEMENTPTR19]], align 8
-; CHECK-NEXT:    br label [[BB20]]
-; CHECK:       bb20:
+; CHECK-NEXT:    br label [[LOOP_LATCH]]
+; CHECK:       loop.latch:
 ; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], -1
 ; CHECK-NEXT:    [[ICMP22:%.*]] = icmp eq i64 [[IV_NEXT]], 90
-; CHECK-NEXT:    br i1 [[ICMP22]], label [[BB6]], label [[BB15]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT:    br i1 [[ICMP22]], label [[BB6]], label [[LOOP_HEADER]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       bb6:
 ; CHECK-NEXT:    ret void
 ;
 bb5:
-  br label %bb15
+  br label %loop.header
 
-bb15:                                             ; preds = %bb20, %bb8
-  %iv = phi i64 [ 99, %bb5 ], [ %iv.next, %bb20 ]
+loop.header:                                             ; preds = %loop.latch, %bb5
+  %iv = phi i64 [ 99, %bb5 ], [ %iv.next, %loop.latch ]
   %and = and i64 %iv, 1
   %icmp17 = icmp eq i64 %and, 0
-  br i1 %icmp17, label %bb18, label %bb20, !prof !21
+  br i1 %icmp17, label %bb18, label %loop.latch, !prof !21
 
-bb18:                                             ; preds = %bb15
+bb18:                                             ; preds = %loop.header
   %or = or disjoint i64 %iv, 1
   %getelementptr19 = getelementptr inbounds i64, ptr %arr, i64 %or
   store i64 1, ptr %getelementptr19, align 8
-  br label %bb20
+  br label %loop.latch
 
-bb20:                                             ; preds = %bb18, %bb15
+loop.latch:                                             ; preds = %bb18, %loop.header
   %iv.next = add nsw i64 %iv, -1
   %icmp22 = icmp eq i64 %iv.next, 90
-  br i1 %icmp22, label %bb6, label %bb15, !prof !22
+  br i1 %icmp22, label %bb6, label %loop.header, !prof !22
 
 bb6:
   ret void

From 8467457afc61d70e881c9817ace26356ef757733 Mon Sep 17 00:00:00 2001
From: Bhuminjay Soni <Soni5Happy@gmail.com>
Date: Mon, 11 Mar 2024 22:55:32 +0530
Subject: [PATCH 16/95] Add new flag -Wreturn-mismatch (#82872)

This pull request fixes #72116 where a new flag is introduced for
compatibility with GCC 14, the functionality of -Wreturn-type is
modified to split some of its behaviors into -Wreturn-mismatch

Fixes #72116
---
 clang/docs/ReleaseNotes.rst                   |  3 ++
 clang/include/clang/Basic/DiagnosticGroups.td |  4 ++-
 .../clang/Basic/DiagnosticSemaKinds.td        |  6 ++--
 clang/test/Misc/warning-wall.c                |  1 +
 clang/test/Sema/return-type-mismatch.c        | 36 +++++++++++++++++++
 5 files changed, 46 insertions(+), 4 deletions(-)
 create mode 100644 clang/test/Sema/return-type-mismatch.c

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index bce27dc8c4a99..88e552d5c4611 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -183,6 +183,9 @@ Deprecated Compiler Flags
 
 Modified Compiler Flags
 -----------------------
+- Added a new diagnostic flag ``-Wreturn-mismatch`` which is grouped under
+  ``-Wreturn-type``, and moved some of the diagnostics previously controlled by
+  ``-Wreturn-type`` under this new flag. Fixes #GH72116.
 
 Removed Compiler Flags
 -------------------------
diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td
index ba1d4b2352e3d..3f14167d6b846 100644
--- a/clang/include/clang/Basic/DiagnosticGroups.td
+++ b/clang/include/clang/Basic/DiagnosticGroups.td
@@ -617,7 +617,9 @@ def GNURedeclaredEnum : DiagGroup<"gnu-redeclared-enum">;
 def RedundantMove : DiagGroup<"redundant-move">;
 def Register : DiagGroup<"register", [DeprecatedRegister]>;
 def ReturnTypeCLinkage : DiagGroup<"return-type-c-linkage">;
-def ReturnType : DiagGroup<"return-type", [ReturnTypeCLinkage]>;
+def ReturnMismatch : DiagGroup<"return-mismatch">;
+def ReturnType : DiagGroup<"return-type", [ReturnTypeCLinkage, ReturnMismatch]>;
+
 def BindToTemporaryCopy : DiagGroup<"bind-to-temporary-copy",
                                     [CXX98CompatBindToTemporaryCopy]>;
 def SelfAssignmentField : DiagGroup<"self-assign-field">;
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 9b5245695153e..c54105507753e 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -10248,14 +10248,14 @@ def warn_second_parameter_to_va_arg_never_compatible : Warning<
 
 def warn_return_missing_expr : Warning<
   "non-void %select{function|method}1 %0 should return a value">, DefaultError,
-  InGroup<ReturnType>;
+  InGroup<ReturnMismatch>;
 def ext_return_missing_expr : ExtWarn<
   "non-void %select{function|method}1 %0 should return a value">, DefaultError,
-  InGroup<ReturnType>;
+  InGroup<ReturnMismatch>;
 def ext_return_has_expr : ExtWarn<
   "%select{void function|void method|constructor|destructor}1 %0 "
   "should not return a value">,
-  DefaultError, InGroup<ReturnType>;
+  DefaultError, InGroup<ReturnMismatch>;
 def ext_return_has_void_expr : Extension<
   "void %select{function|method|block}1 %0 should not return void expression">;
 def err_return_init_list : Error<
diff --git a/clang/test/Misc/warning-wall.c b/clang/test/Misc/warning-wall.c
index 05a82770e26de..4909ab034ef30 100644
--- a/clang/test/Misc/warning-wall.c
+++ b/clang/test/Misc/warning-wall.c
@@ -44,6 +44,7 @@ CHECK-NEXT:      -Wreorder-ctor
 CHECK-NEXT:      -Wreorder-init-list
 CHECK-NEXT:    -Wreturn-type
 CHECK-NEXT:      -Wreturn-type-c-linkage
+CHECK-NEXT:      -Wreturn-mismatch
 CHECK-NEXT:    -Wself-assign
 CHECK-NEXT:      -Wself-assign-overloaded
 CHECK-NEXT:      -Wself-assign-field
diff --git a/clang/test/Sema/return-type-mismatch.c b/clang/test/Sema/return-type-mismatch.c
new file mode 100644
index 0000000000000..79a625d7df1f5
--- /dev/null
+++ b/clang/test/Sema/return-type-mismatch.c
@@ -0,0 +1,36 @@
+// RUN: %clang_cc1 -Wreturn-type -Wno-return-mismatch -fsyntax-only -verify=return-type %s
+// RUN: %clang_cc1 -Wno-return-type -Wreturn-mismatch -fsyntax-only -verify=return-mismatch %s
+
+int foo(void) __attribute__((noreturn));
+int bar(void);
+
+void test1(void) {
+  return 1; // return-mismatch-warning{{void function 'test1' should not return a value}}
+}
+
+int test2(void) { 
+    return; // return-mismatch-warning{{non-void function 'test2' should return a value}}
+} 
+
+int test3(void) { 
+    // return-type-warning@+1 {{non-void function does not return a value}}
+} 
+
+int test4(void) {
+    (void)(bar() || foo()); // return-type-warning@+1 {{non-void function does not return a value in all control paths}}
+} 
+
+void test5(void) {
+} // no-warning
+
+int test6(void) {
+  return 0; // no-warning
+}
+
+int test7(void) {
+  foo(); // no warning
+}
+
+int test8(void) {
+  bar(); // return-type-warning@+1 {{non-void function does not return a value}}
+}

From 034cc2f5d0abcf7a465665246f16a1b75fbde93a Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland@gmail.com>
Date: Mon, 11 Mar 2024 13:47:30 -0400
Subject: [PATCH 17/95] [GISEL] Add G_INSERT_SUBVECTOR and G_EXTRACT_SUBVECTOR
 (#84538)

G_INSERT and G_EXTRACT are not sufficient to use to represent both
INSERT/EXTRACT on a subregister and INSERT/EXTRACT on a vector.

We would like to be able to INSERT/EXTRACT on vectors in cases that
INSERT/EXTRACT on vector subregisters are not sufficient, so we add
these opcodes.

I tried to do a patch where we treated G_EXTRACT as both
G_EXTRACT_SUBVECTOR and G_EXTRACT_SUBREG, but ran into an infinite loop
at this
[point](https://github.com/llvm/llvm-project/blob/8b5b294ec2cf876bc5eb5bd5fcb56ef487e36d60/llvm/lib/Target/RISCV/RISCVISelLowering.cpp#L9932)
in the SDAG equivalent code.
---
 llvm/docs/GlobalISel/GenericOpcode.rst        | 35 +++++++
 .../CodeGen/GlobalISel/MachineIRBuilder.h     | 19 ++++
 llvm/include/llvm/Support/TargetOpcodes.def   |  6 ++
 llvm/include/llvm/Target/GenericOpcodes.td    | 14 +++
 .../CodeGen/GlobalISel/MachineIRBuilder.cpp   | 15 +++
 llvm/lib/CodeGen/MachineVerifier.cpp          | 98 +++++++++++++++++++
 .../GlobalISel/legalizer-info-validation.mir  |  6 ++
 .../test_g_extract_subvector.mir              | 31 ++++++
 .../test_g_insert_subvector.mir               | 43 ++++++++
 9 files changed, 267 insertions(+)
 create mode 100644 llvm/test/MachineVerifier/test_g_extract_subvector.mir
 create mode 100644 llvm/test/MachineVerifier/test_g_insert_subvector.mir

diff --git a/llvm/docs/GlobalISel/GenericOpcode.rst b/llvm/docs/GlobalISel/GenericOpcode.rst
index dda367607d043..f9f9e1186460e 100644
--- a/llvm/docs/GlobalISel/GenericOpcode.rst
+++ b/llvm/docs/GlobalISel/GenericOpcode.rst
@@ -607,6 +607,41 @@ See the LLVM LangRef entry on '``llvm.lround.*'`` for details on behaviour.
 Vector Specific Operations
 --------------------------
 
+G_INSERT_SUBVECTOR
+^^^^^^^^^^^^^^^^^^
+
+Insert the second source vector into the first source vector. The index operand
+represents the starting index in the first source vector at which the second
+source vector should be inserted into.
+
+The index must be a constant multiple of the second source vector's minimum
+vector length. If the vectors are scalable, then the index is first scaled by
+the runtime scaling factor. The indices inserted in the source vector must be
+valid indices of that vector. If this condition cannot be determined statically
+but is false at runtime, then the result vector is undefined.
+
+.. code-block:: none
+
+  %2:_(<vscale x 4 x i64>) = G_INSERT_SUBVECTOR %0:_(<vscale x 4 x i64>), %1:_(<vscale x 2 x i64>), 0
+
+G_EXTRACT_SUBVECTOR
+^^^^^^^^^^^^^^^^^^^
+
+Extract a vector of destination type from the source vector. The index operand
+represents the starting index from which a subvector is extracted from
+the source vector.
+
+The index must be a constant multiple of the source vector's minimum vector
+length. If the source vector is a scalable vector, then the index is first
+scaled by the runtime scaling factor. The indices extracted from the source
+vector must be valid indices of that vector. If this condition cannot be
+determined statically but is false at runtime, then the result vector is
+undefined.
+
+.. code-block:: none
+
+  %3:_(<vscale x 4 x i64>) = G_EXTRACT_SUBVECTOR %2:_(<vscale x 8 x i64>), 2
+
 G_CONCAT_VECTORS
 ^^^^^^^^^^^^^^^^
 
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
index 6762b1b360d5e..4732eaf4ee27c 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
@@ -1121,6 +1121,25 @@ class MachineIRBuilder {
   MachineInstrBuilder buildConcatVectors(const DstOp &Res,
                                          ArrayRef<Register> Ops);
 
+  /// Build and insert `Res = G_INSERT_SUBVECTOR Src0, Src1, Idx`.
+  ///
+  /// \pre setBasicBlock or setMI must have been called.
+  /// \pre \p Res, \p Src0, and \p Src1 must be generic virtual registers with
+  /// vector type.
+  ///
+  /// \return a MachineInstrBuilder for the newly created instruction.
+  MachineInstrBuilder buildInsertSubvector(const DstOp &Res, const SrcOp &Src0,
+                                           const SrcOp &Src1, unsigned Index);
+
+  /// Build and insert `Res = G_EXTRACT_SUBVECTOR Src, Idx`.
+  ///
+  /// \pre setBasicBlock or setMI must have been called.
+  /// \pre \p Res and \p Src must be generic virtual registers with vector type.
+  ///
+  /// \return a MachineInstrBuilder for the newly created instruction.
+  MachineInstrBuilder buildExtractSubvector(const DstOp &Res, const SrcOp &Src,
+                                            unsigned Index);
+
   MachineInstrBuilder buildInsert(const DstOp &Res, const SrcOp &Src,
                                   const SrcOp &Op, unsigned Index);
 
diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def
index 94fba491148b2..3dade14f043b6 100644
--- a/llvm/include/llvm/Support/TargetOpcodes.def
+++ b/llvm/include/llvm/Support/TargetOpcodes.def
@@ -727,6 +727,12 @@ HANDLE_TARGET_OPCODE(G_BR)
 /// Generic branch to jump table entry.
 HANDLE_TARGET_OPCODE(G_BRJT)
 
+/// Generic insert subvector.
+HANDLE_TARGET_OPCODE(G_INSERT_SUBVECTOR)
+
+/// Generic extract subvector.
+HANDLE_TARGET_OPCODE(G_EXTRACT_SUBVECTOR)
+
 /// Generic insertelement.
 HANDLE_TARGET_OPCODE(G_INSERT_VECTOR_ELT)
 
diff --git a/llvm/include/llvm/Target/GenericOpcodes.td b/llvm/include/llvm/Target/GenericOpcodes.td
index d967885aa2d75..8dc84fb0ba052 100644
--- a/llvm/include/llvm/Target/GenericOpcodes.td
+++ b/llvm/include/llvm/Target/GenericOpcodes.td
@@ -1426,6 +1426,20 @@ def G_WRITE_REGISTER : GenericInstruction {
 // Vector ops
 //------------------------------------------------------------------------------
 
+// Generic insert subvector.
+def G_INSERT_SUBVECTOR : GenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src0, type1:$src1, untyped_imm_0:$idx);
+  let hasSideEffects = false;
+}
+
+// Generic extract subvector. NOTE(review): $dst and $src share type0; confirm.
+def G_EXTRACT_SUBVECTOR : GenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src, untyped_imm_0:$idx);
+  let hasSideEffects = false;
+}
+
 // Generic insertelement.
 def G_INSERT_VECTOR_ELT : GenericInstruction {
   let OutOperandList = (outs type0:$dst);
diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
index 28e5bf85ca9ce..9b12d443c96e9 100644
--- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
@@ -877,6 +877,21 @@ MachineIRBuilder::buildSelect(const DstOp &Res, const SrcOp &Tst,
   return buildInstr(TargetOpcode::G_SELECT, {Res}, {Tst, Op0, Op1}, Flags);
 }
 
+MachineInstrBuilder MachineIRBuilder::buildInsertSubvector(const DstOp &Res,
+                                                           const SrcOp &Src0,
+                                                           const SrcOp &Src1,
+                                                           unsigned Idx) {
+  return buildInstr(TargetOpcode::G_INSERT_SUBVECTOR, Res,
+                    {Src0, Src1, uint64_t(Idx)});
+}
+
+MachineInstrBuilder MachineIRBuilder::buildExtractSubvector(const DstOp &Res,
+                                                            const SrcOp &Src,
+                                                            unsigned Idx) {
+  return buildInstr(TargetOpcode::G_EXTRACT_SUBVECTOR, Res,
+                    {Src, uint64_t(Idx)});
+}
+
 MachineInstrBuilder
 MachineIRBuilder::buildInsertVectorElement(const DstOp &Res, const SrcOp &Val,
                                            const SrcOp &Elt, const SrcOp &Idx) {
diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp
index 9003f1dded87a..90cbf097370de 100644
--- a/llvm/lib/CodeGen/MachineVerifier.cpp
+++ b/llvm/lib/CodeGen/MachineVerifier.cpp
@@ -1613,6 +1613,104 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) {
       report("G_BSWAP size must be a multiple of 16 bits", MI);
     break;
   }
+  case TargetOpcode::G_INSERT_SUBVECTOR: {
+    const MachineOperand &Src0Op = MI->getOperand(1);
+    if (!Src0Op.isReg()) {
+      report("G_INSERT_SUBVECTOR first source must be a register", MI);
+      break;
+    }
+
+    const MachineOperand &Src1Op = MI->getOperand(2);
+    if (!Src1Op.isReg()) {
+      report("G_INSERT_SUBVECTOR second source must be a register", MI);
+      break;
+    }
+
+    const MachineOperand &IndexOp = MI->getOperand(3);
+    if (!IndexOp.isImm()) {
+      report("G_INSERT_SUBVECTOR index must be an immediate", MI);
+      break;
+    }
+
+    LLT DstTy = MRI->getType(MI->getOperand(0).getReg());
+    LLT Src0Ty = MRI->getType(Src0Op.getReg());
+    LLT Src1Ty = MRI->getType(Src1Op.getReg());
+
+    if (!DstTy.isVector()) {
+      report("Destination type must be a vector", MI);
+      break;
+    }
+
+    if (!Src0Ty.isVector()) {
+      report("First source must be a vector", MI);
+      break;
+    }
+
+    if (!Src1Ty.isVector()) {
+      report("Second source must be a vector", MI);
+      break;
+    }
+
+    if (DstTy != Src0Ty) {
+      report("Destination type must match the first source vector type", MI);
+      break;
+    }
+
+    if (Src0Ty.getElementType() != Src1Ty.getElementType()) {
+      report("Element type of source vectors must be the same", MI);
+      break;
+    }
+
+    if (IndexOp.getImm() != 0 &&
+        IndexOp.getImm() % Src1Ty.getElementCount().getKnownMinValue() != 0) {
+      report("Index must be a multiple of the second source vector's "
+             "minimum vector length",
+             MI);
+      break;
+    }
+    break;
+  }
+  case TargetOpcode::G_EXTRACT_SUBVECTOR: {
+    const MachineOperand &SrcOp = MI->getOperand(1);
+    if (!SrcOp.isReg()) {
+      report("G_EXTRACT_SUBVECTOR first source must be a register", MI);
+      break;
+    }
+
+    const MachineOperand &IndexOp = MI->getOperand(2);
+    if (!IndexOp.isImm()) {
+      report("G_EXTRACT_SUBVECTOR index must be an immediate", MI);
+      break;
+    }
+
+    LLT DstTy = MRI->getType(MI->getOperand(0).getReg());
+    LLT SrcTy = MRI->getType(SrcOp.getReg());
+
+    if (!DstTy.isVector()) {
+      report("Destination type must be a vector", MI);
+      break;
+    }
+
+    if (!SrcTy.isVector()) {
+      report("First source must be a vector", MI);
+      break;
+    }
+
+    if (DstTy.getElementType() != SrcTy.getElementType()) {
+      report("Element type of vectors must be the same", MI);
+      break;
+    }
+    // NOTE(review): tests MinLen % Idx; the message says Idx % MinLen.
+    if (IndexOp.getImm() != 0 &&
+        SrcTy.getElementCount().getKnownMinValue() % IndexOp.getImm() != 0) {
+      report("Index must be a multiple of the source vector's minimum vector "
+             "length",
+             MI);
+      break;
+    }
+
+    break;
+  }
   case TargetOpcode::G_SHUFFLE_VECTOR: {
     const MachineOperand &MaskOp = MI->getOperand(3);
     if (!MaskOp.isShuffleMask()) {
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
index ecad3f1151348..ac330918b430a 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
@@ -616,6 +616,12 @@
 # DEBUG-NEXT: G_BRJT (opcode {{[0-9]+}}): 2 type indices
 # DEBUG-NEXT: .. the first uncovered type index: 2, OK
 # DEBUG-NEXT: .. the first uncovered imm index: 0, OK
+# DEBUG-NEXT: G_INSERT_SUBVECTOR (opcode {{[0-9]+}}): 2 type indices, 1 imm index
+# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
+# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
+# DEBUG-NEXT: G_EXTRACT_SUBVECTOR (opcode {{[0-9]+}}): 1 type index, 1 imm index
+# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
+# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
 # DEBUG-NEXT: G_INSERT_VECTOR_ELT (opcode {{[0-9]+}}): 3 type indices, 0 imm indices
 # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
diff --git a/llvm/test/MachineVerifier/test_g_extract_subvector.mir b/llvm/test/MachineVerifier/test_g_extract_subvector.mir
new file mode 100644
index 0000000000000..bc167d2eb7bcd
--- /dev/null
+++ b/llvm/test/MachineVerifier/test_g_extract_subvector.mir
@@ -0,0 +1,31 @@
+# RUN: not --crash llc -o - -run-pass=none -verify-machineinstrs %s 2>&1 | FileCheck %s
+---
+name:            g_extract_subvector
+tracksRegLiveness: true
+liveins:
+body:             |
+  bb.0:
+    %0:_(s32) = G_CONSTANT i32 0
+    %1:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+    %2:_(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+
+    ; CHECK: G_EXTRACT_SUBVECTOR first source must be a register
+    %3:_(<vscale x 2 x s32>) = G_EXTRACT_SUBVECTOR 1, 0
+
+    ; CHECK: G_EXTRACT_SUBVECTOR index must be an immediate
+    %4:_(<vscale x 1 x s32>) = G_EXTRACT_SUBVECTOR %2, %0
+
+    ; CHECK: Destination type must be a vector
+    %5:_(s32) = G_EXTRACT_SUBVECTOR %2, 0
+
+    ; CHECK: First source must be a vector
+    %6:_(<vscale x 2 x s32>) = G_EXTRACT_SUBVECTOR %0, 0
+
+    %7:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+
+    ; CHECK: Element type of vectors must be the same
+    %8:_(<vscale x 2 x s32>) = G_EXTRACT_SUBVECTOR %7, 0
+
+    ; CHECK: Index must be a multiple of the source vector's minimum vector length
+    %9:_(<vscale x 4 x s32>) = G_EXTRACT_SUBVECTOR  %1, 3
+...
diff --git a/llvm/test/MachineVerifier/test_g_insert_subvector.mir b/llvm/test/MachineVerifier/test_g_insert_subvector.mir
new file mode 100644
index 0000000000000..dce30cdb6b1e5
--- /dev/null
+++ b/llvm/test/MachineVerifier/test_g_insert_subvector.mir
@@ -0,0 +1,43 @@
+# RUN: not --crash llc -o - -run-pass=none -verify-machineinstrs %s 2>&1 | FileCheck %s
+
+---
+name:            g_insert_subvector
+tracksRegLiveness: true
+liveins:
+body:             |
+  bb.0:
+    %0:_(s32) = G_CONSTANT i32 0
+    %1:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+    %2:_(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+
+    ; CHECK: G_INSERT_SUBVECTOR first source must be a register
+    %3:_(<vscale x 2 x s32>) = G_INSERT_SUBVECTOR 1, %2, 0
+
+    ; CHECK: G_INSERT_SUBVECTOR second source must be a register
+    %4:_(<vscale x 2 x s32>) = G_INSERT_SUBVECTOR %1, 1, 0
+
+    ; CHECK: G_INSERT_SUBVECTOR index must be an immediate
+    %5:_(<vscale x 2 x s32>) = G_INSERT_SUBVECTOR %1, %2, %0
+
+    ; CHECK: Destination type must be a vector
+    %6:_(s32) = G_INSERT_SUBVECTOR %1, %2, 0
+
+    ; CHECK: First source must be a vector
+    %7:_(<vscale x 2 x s32>) = G_INSERT_SUBVECTOR %0, %2, 0
+
+    ; CHECK: Second source must be a vector
+    %8:_(<vscale x 2 x s32>) = G_INSERT_SUBVECTOR %1, %0, 0
+
+    ; CHECK: Destination type must match the first source vector type
+    %9:_(<vscale x 2 x s32>) = G_INSERT_SUBVECTOR %2, %1, 0
+
+    %10:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+
+    ; CHECK: Element type of source vectors must be the same
+    %11:_(<vscale x 2 x s32>) = G_INSERT_SUBVECTOR %1, %10, 0
+
+    %12:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF
+
+    ; CHECK: Index must be a multiple of the second source vector's minimum vector length
+    %13:_(<vscale x 4 x s32>) = G_INSERT_SUBVECTOR %12, %1, 3
+...

From 2a3f27cce8983e5d6871b9ebb8f5e9dd91884f0c Mon Sep 17 00:00:00 2001
From: Joe Nash <Sisyph@users.noreply.github.com>
Date: Mon, 11 Mar 2024 13:58:45 -0400
Subject: [PATCH 18/95] [AMDGPU][True16] Make NotHasTrue16BitInsts a
 True16Predicate (#84771)

NFC.
Test coverage on VOPC shows NotHasTrue16BitInsts on the pre-gfx11
instructions is necessary (we cannot use the default NoTrue16Predicate).
Update the VOP2 instructions in the same manner.
---
 llvm/lib/Target/AMDGPU/AMDGPU.td           |  2 +-
 llvm/lib/Target/AMDGPU/VOP2Instructions.td | 12 ++++++------
 llvm/lib/Target/AMDGPU/VOPCInstructions.td | 12 ++++++------
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 7183148e13103..c877658cd38e2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1903,7 +1903,7 @@ def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">,
 
 def HasTrue16BitInsts : Predicate<"Subtarget->hasTrue16BitInsts()">,
   AssemblerPredicate<(all_of FeatureTrue16BitInsts)>;
-def NotHasTrue16BitInsts : Predicate<"!Subtarget->hasTrue16BitInsts()">;
+def NotHasTrue16BitInsts : True16PredicateClass<"!Subtarget->hasTrue16BitInsts()">;
 
 // Control use of True16 instructions. The real True16 instructions are
 // True16 instructions as they are defined in the ISA. Fake True16
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 8a92aa8228f12..f136a434971c8 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -199,7 +199,7 @@ multiclass VOP2Inst_t16<string opName,
                         SDPatternOperator node = null_frag,
                         string revOp = opName,
                         bit GFX9Renamed = 0> {
-  let SubtargetPredicate = NotHasTrue16BitInsts, OtherPredicates = [Has16BitInsts]  in {
+  let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in {
     defm NAME : VOP2Inst<opName, P, node, revOp, GFX9Renamed>;
   }
   let SubtargetPredicate = UseRealTrue16Insts in {
@@ -219,7 +219,7 @@ multiclass VOP2Inst_e64_t16<string opName,
                         SDPatternOperator node = null_frag,
                         string revOp = opName,
                         bit GFX9Renamed = 0> {
-  let SubtargetPredicate = NotHasTrue16BitInsts, OtherPredicates = [Has16BitInsts]  in {
+  let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in {
     defm NAME : VOP2Inst<opName, P, node, revOp, GFX9Renamed>;
   }
   let SubtargetPredicate = HasTrue16BitInsts in {
@@ -900,7 +900,7 @@ def LDEXP_F16_VOPProfile_True16 : VOPProfile_Fake16<VOP_F16_F16_F16> {
 
 let isReMaterializable = 1 in {
 let FPDPRounding = 1 in {
-  let SubtargetPredicate = NotHasTrue16BitInsts, OtherPredicates = [Has16BitInsts]  in
+  let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in
     defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", LDEXP_F16_VOPProfile>;
   let SubtargetPredicate = HasTrue16BitInsts in
     defm V_LDEXP_F16_t16 : VOP2Inst <"v_ldexp_f16_t16", LDEXP_F16_VOPProfile_True16>;
@@ -950,7 +950,7 @@ let SubtargetPredicate = isGFX11Plus in {
 } // End SubtargetPredicate = isGFX11Plus
 
 let FPDPRounding = 1, isReMaterializable = 1, FixedSize = 1 in {
-let SubtargetPredicate = isGFX10Plus, OtherPredicates = [NotHasTrue16BitInsts] in {
+let SubtargetPredicate = isGFX10Plus, True16Predicate = NotHasTrue16BitInsts in {
 def V_FMAMK_F16 : VOP2_Pseudo <"v_fmamk_f16", VOP_MADMK_F16, [], "">;
 }
 let SubtargetPredicate = HasTrue16BitInsts in {
@@ -958,7 +958,7 @@ def V_FMAMK_F16_t16 : VOP2_Pseudo <"v_fmamk_f16_t16", VOP_MADMK_F16_t16, [], "">
 }
 
 let isCommutable = 1 in {
-let SubtargetPredicate = isGFX10Plus, OtherPredicates = [NotHasTrue16BitInsts] in {
+let SubtargetPredicate = isGFX10Plus, True16Predicate = NotHasTrue16BitInsts in {
 def V_FMAAK_F16 : VOP2_Pseudo <"v_fmaak_f16", VOP_MADAK_F16, [], "">;
 }
 let SubtargetPredicate = HasTrue16BitInsts in {
@@ -971,7 +971,7 @@ let Constraints = "$vdst = $src2",
     DisableEncoding="$src2",
     isConvertibleToThreeAddress = 1,
     isCommutable = 1 in {
-let SubtargetPredicate = isGFX10Plus, OtherPredicates = [NotHasTrue16BitInsts] in {
+let SubtargetPredicate = isGFX10Plus, True16Predicate = NotHasTrue16BitInsts in {
 defm V_FMAC_F16 : VOP2Inst <"v_fmac_f16", VOP_MAC_F16>;
 }
 let SubtargetPredicate = HasTrue16BitInsts in {
diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index e5e82447d55fb..022fb7cb67754 100644
--- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -408,7 +408,7 @@ def VOPC_I64_I64 : VOPC_NoSdst_Profile<[Write64Bit], i64>;
 
 multiclass VOPC_F16 <string opName, SDPatternOperator cond = COND_NULL,
                      string revOp = opName> {
-  let OtherPredicates = [NotHasTrue16BitInsts, Has16BitInsts]  in {
+  let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in {
     defm NAME : VOPC_Pseudos <opName, VOPC_I1_F16_F16, cond, revOp, 0>;
   }
   let OtherPredicates = [HasTrue16BitInsts] in {
@@ -424,7 +424,7 @@ multiclass VOPC_F64 <string opName, SDPatternOperator cond = COND_NULL, string r
 
 multiclass VOPC_I16 <string opName, SDPatternOperator cond = COND_NULL,
                      string revOp = opName> {
-  let OtherPredicates = [NotHasTrue16BitInsts, Has16BitInsts]  in {
+  let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in {
     defm NAME : VOPC_Pseudos <opName, VOPC_I1_I16_I16, cond, revOp, 0>;
   }
   let OtherPredicates = [HasTrue16BitInsts] in {
@@ -439,7 +439,7 @@ multiclass VOPC_I64 <string opName, SDPatternOperator cond = COND_NULL, string r
   VOPC_Pseudos <opName, VOPC_I1_I64_I64, cond, revOp, 0>;
 
 multiclass VOPCX_F16<string opName, string revOp = opName> {
-  let OtherPredicates = [NotHasTrue16BitInsts, Has16BitInsts]  in {
+  let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in {
     defm NAME : VOPCX_Pseudos <opName, VOPC_I1_F16_F16, VOPC_F16_F16, COND_NULL, revOp>;
   }
   let OtherPredicates = [HasTrue16BitInsts] in {
@@ -454,7 +454,7 @@ multiclass VOPCX_F64 <string opName, string revOp = opName> :
   VOPCX_Pseudos <opName, VOPC_I1_F64_F64, VOPC_F64_F64, COND_NULL, revOp>;
 
 multiclass VOPCX_I16<string opName, string revOp = opName> {
-  let OtherPredicates = [NotHasTrue16BitInsts, Has16BitInsts]  in {
+  let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in {
     defm NAME : VOPCX_Pseudos <opName, VOPC_I1_I16_I16, VOPC_I16_I16, COND_NULL, revOp>;
   }
   let OtherPredicates = [HasTrue16BitInsts] in {
@@ -940,7 +940,7 @@ def VOPC_F32_I32 : VOPC_Class_NoSdst_Profile<[Write32Bit], f32>;
 def VOPC_F64_I32 : VOPC_Class_NoSdst_Profile<[Write64Bit], f64>;
 
 multiclass VOPC_CLASS_F16 <string opName> {
-  let OtherPredicates = [NotHasTrue16BitInsts, Has16BitInsts]  in {
+  let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in {
     defm NAME : VOPC_Class_Pseudos <opName, VOPC_I1_F16_I16, 0>;
   }
   let OtherPredicates = [HasTrue16BitInsts] in {
@@ -949,7 +949,7 @@ multiclass VOPC_CLASS_F16 <string opName> {
 }
 
 multiclass VOPCX_CLASS_F16 <string opName> {
-  let OtherPredicates = [NotHasTrue16BitInsts, Has16BitInsts]  in {
+  let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in {
     defm NAME : VOPCX_Class_Pseudos <opName, VOPC_I1_F16_I16, VOPC_F16_I16>;
   }
   let OtherPredicates = [HasTrue16BitInsts] in {

From 725a0523a18ef1a75a6d4a010dc3debe1b08c9d1 Mon Sep 17 00:00:00 2001
From: Paul T Robinson <paul.robinson@sony.com>
Date: Mon, 11 Mar 2024 11:14:17 -0700
Subject: [PATCH 19/95] [Headers][X86] Add specific results to comparisons
 (#83316)

Some comparison intrinsics were described as returning the "result" without
specifying how. The "cmp" intrinsics return zero or all 1's in the
corresponding elements of a returned vector; the "com" intrinsics return
an integer 0 or 1.

Also removed some redundant information.
---
 clang/lib/Headers/emmintrin.h | 114 +++++++++----------
 clang/lib/Headers/smmintrin.h |   4 +
 clang/lib/Headers/xmmintrin.h | 202 ++++++++++++++++++++--------------
 3 files changed, 177 insertions(+), 143 deletions(-)

diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h
index ebe295f160b2a..984f0cf917e99 100644
--- a/clang/lib/Headers/emmintrin.h
+++ b/clang/lib/Headers/emmintrin.h
@@ -410,8 +410,9 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_xor_pd(__m128d __a,
 }
 
 /// Compares each of the corresponding double-precision values of the
-///    128-bit vectors of [2 x double] for equality. Each comparison yields 0x0
-///    for false, 0xFFFFFFFFFFFFFFFF for true.
+///    128-bit vectors of [2 x double] for equality.
+///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -429,8 +430,9 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_pd(__m128d __a,
 
 /// Compares each of the corresponding double-precision values of the
 ///    128-bit vectors of [2 x double] to determine if the values in the first
-///    operand are less than those in the second operand. Each comparison
-///    yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///    operand are less than those in the second operand.
+///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -949,8 +951,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a,
 /// Compares the lower double-precision floating-point values in each of
 ///    the two 128-bit floating-point vectors of [2 x double] for equality.
 ///
-///    The comparison yields 0 for false, 1 for true. If either of the two
-///    lower double-precision values is NaN, 0 is returned.
+///    The comparison returns 0 for false, 1 for true. If either of the two
+///    lower double-precision values is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -962,8 +964,7 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a,
 /// \param __b
 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
 ///    compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison results. If either of the two
-///    lower double-precision values is NaN, 0 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a,
                                                        __m128d __b) {
   return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
@@ -974,8 +975,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a,
 ///    the value in the first parameter is less than the corresponding value in
 ///    the second parameter.
 ///
-///    The comparison yields 0 for false, 1 for true. If either of the two
-///    lower double-precision values is NaN, 0 is returned.
+///    The comparison returns 0 for false, 1 for true. If either of the two
+///    lower double-precision values is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -987,8 +988,7 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a,
 /// \param __b
 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
 ///    compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison results. If either of the two
-///     lower double-precision values is NaN, 0 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a,
                                                        __m128d __b) {
   return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
@@ -999,8 +999,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a,
 ///    the value in the first parameter is less than or equal to the
 ///    corresponding value in the second parameter.
 ///
-///    The comparison yields 0 for false, 1 for true. If either of the two
-///    lower double-precision values is NaN, 0 is returned.
+///    The comparison returns 0 for false, 1 for true. If either of the two
+///    lower double-precision values is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1012,8 +1012,7 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a,
 /// \param __b
 ///     A 128-bit vector of [2 x double]. The lower double-precision value is
 ///     compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison results. If either of the two
-///     lower double-precision values is NaN, 0 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a,
                                                        __m128d __b) {
   return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
@@ -1024,8 +1023,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a,
 ///    the value in the first parameter is greater than the corresponding value
 ///    in the second parameter.
 ///
-///    The comparison yields 0 for false, 1 for true. If either of the two
-///    lower double-precision values is NaN, 0 is returned.
+///    The comparison returns 0 for false, 1 for true. If either of the two
+///    lower double-precision values is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1037,8 +1036,7 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a,
 /// \param __b
 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
 ///    compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison results. If either of the two
-///     lower double-precision values is NaN, 0 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a,
                                                        __m128d __b) {
   return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
@@ -1049,8 +1047,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a,
 ///    the value in the first parameter is greater than or equal to the
 ///    corresponding value in the second parameter.
 ///
-///    The comparison yields 0 for false, 1 for true. If either of the two
-///    lower double-precision values is NaN, 0 is returned.
+///    The comparison returns 0 for false, 1 for true. If either of the two
+///    lower double-precision values is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1062,8 +1060,7 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a,
 /// \param __b
 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
 ///    compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison results. If either of the two
-///    lower double-precision values is NaN, 0 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a,
                                                        __m128d __b) {
   return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
@@ -1074,7 +1071,7 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a,
 ///    the value in the first parameter is unequal to the corresponding value in
 ///    the second parameter.
 ///
-///    The comparison yields 0 for false, 1 for true. If either of the two
+///    The comparison returns 0 for false, 1 for true. If either of the two
 ///    lower double-precision values is NaN, 1 is returned.
 ///
 /// \headerfile <x86intrin.h>
@@ -1087,18 +1084,17 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a,
 /// \param __b
 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
 ///    compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison results. If either of the two
-///     lower double-precision values is NaN, 1 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a,
                                                         __m128d __b) {
   return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
 }
 
 /// Compares the lower double-precision floating-point values in each of
-///    the two 128-bit floating-point vectors of [2 x double] for equality. The
-///    comparison yields 0 for false, 1 for true.
+///    the two 128-bit floating-point vectors of [2 x double] for equality.
 ///
-///    If either of the two lower double-precision values is NaN, 0 is returned.
+///    The comparison returns 0 for false, 1 for true. If either of the two
+///    lower double-precision values is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1110,8 +1106,7 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a,
 /// \param __b
 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
 ///    compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison results. If either of the two
-///    lower double-precision values is NaN, 0 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a,
                                                         __m128d __b) {
   return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
@@ -1122,8 +1117,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a,
 ///    the value in the first parameter is less than the corresponding value in
 ///    the second parameter.
 ///
-///    The comparison yields 0 for false, 1 for true. If either of the two lower
-///    double-precision values is NaN, 0 is returned.
+///    The comparison returns 0 for false, 1 for true. If either of the two
+///    lower double-precision values is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1135,8 +1130,7 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a,
 /// \param __b
 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
 ///    compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison results. If either of the two
-///    lower double-precision values is NaN, 0 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a,
                                                         __m128d __b) {
   return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
@@ -1147,8 +1141,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a,
 ///    the value in the first parameter is less than or equal to the
 ///    corresponding value in the second parameter.
 ///
-///    The comparison yields 0 for false, 1 for true. If either of the two lower
-///    double-precision values is NaN, 0 is returned.
+///    The comparison returns 0 for false, 1 for true. If either of the two
+///    lower double-precision values is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1160,8 +1154,7 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a,
 /// \param __b
 ///     A 128-bit vector of [2 x double]. The lower double-precision value is
 ///     compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison results. If either of the two
-///     lower double-precision values is NaN, 0 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a,
                                                         __m128d __b) {
   return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
@@ -1172,8 +1165,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a,
 ///    the value in the first parameter is greater than the corresponding value
 ///    in the second parameter.
 ///
-///    The comparison yields 0 for false, 1 for true. If either of the two lower
-///    double-precision values is NaN, 0 is returned.
+///    The comparison returns 0 for false, 1 for true. If either of the two
+///    lower double-precision values is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1185,8 +1178,7 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a,
 /// \param __b
 ///     A 128-bit vector of [2 x double]. The lower double-precision value is
 ///     compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison results. If either of the two
-///     lower double-precision values is NaN, 0 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a,
                                                         __m128d __b) {
   return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
@@ -1197,8 +1189,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a,
 ///    the value in the first parameter is greater than or equal to the
 ///    corresponding value in the second parameter.
 ///
-///    The comparison yields 0 for false, 1 for true.  If either of the two
-///    lower double-precision values is NaN, 0 is returned.
+///    The comparison returns 0 for false, 1 for true. If either of the two
+///    lower double-precision values is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1210,8 +1202,7 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a,
 /// \param __b
 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
 ///    compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison results. If either of the two
-///    lower double-precision values is NaN, 0 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a,
                                                         __m128d __b) {
   return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
@@ -1222,8 +1213,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a,
 ///    the value in the first parameter is unequal to the corresponding value in
 ///    the second parameter.
 ///
-///    The comparison yields 0 for false, 1 for true. If either of the two lower
-///    double-precision values is NaN, 1 is returned.
+///    The comparison returns 0 for false, 1 for true. If either of the two
+///    lower double-precision values is NaN, 1 is returned.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1235,8 +1226,7 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a,
 /// \param __b
 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
 ///    compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison result. If either of the two
-///    lower double-precision values is NaN, 1 is returned.
+/// \returns An integer containing the comparison result.
 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a,
                                                          __m128d __b) {
   return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
@@ -3023,8 +3013,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a,
 }
 
 /// Compares each of the corresponding 8-bit values of the 128-bit
-///    integer vectors for equality. Each comparison yields 0x0 for false, 0xFF
-///    for true.
+///    integer vectors for equality.
+///
+///    Each comparison yields 0x0 for false, 0xFF for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -3041,8 +3032,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a,
 }
 
 /// Compares each of the corresponding 16-bit values of the 128-bit
-///    integer vectors for equality. Each comparison yields 0x0 for false,
-///    0xFFFF for true.
+///    integer vectors for equality.
+///
+///    Each comparison yields 0x0 for false, 0xFFFF for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -3059,8 +3051,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a,
 }
 
 /// Compares each of the corresponding 32-bit values of the 128-bit
-///    integer vectors for equality. Each comparison yields 0x0 for false,
-///    0xFFFFFFFF for true.
+///    integer vectors for equality.
+///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -3078,8 +3071,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a,
 
 /// Compares each of the corresponding signed 8-bit values of the 128-bit
 ///    integer vectors to determine if the values in the first operand are
-///    greater than those in the second operand. Each comparison yields 0x0 for
-///    false, 0xFF for true.
+///    greater than those in the second operand.
+///
+///    Each comparison yields 0x0 for false, 0xFF for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
diff --git a/clang/lib/Headers/smmintrin.h b/clang/lib/Headers/smmintrin.h
index c52ffb77e33d5..9fb9cc9b01348 100644
--- a/clang/lib/Headers/smmintrin.h
+++ b/clang/lib/Headers/smmintrin.h
@@ -1188,6 +1188,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_testnzc_si128(__m128i __M,
 /// Compares each of the corresponding 64-bit values of the 128-bit
 ///    integer vectors for equality.
 ///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VPCMPEQQ / PCMPEQQ </c> instruction.
@@ -2301,6 +2303,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_minpos_epu16(__m128i __V) {
 ///    integer vectors to determine if the values in the first operand are
 ///    greater than those in the second operand.
 ///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VPCMPGTQ / PCMPGTQ </c> instruction.
diff --git a/clang/lib/Headers/xmmintrin.h b/clang/lib/Headers/xmmintrin.h
index 1f5993e0c368d..8e386a72cde78 100644
--- a/clang/lib/Headers/xmmintrin.h
+++ b/clang/lib/Headers/xmmintrin.h
@@ -474,7 +474,9 @@ _mm_xor_ps(__m128 __a, __m128 __b)
 }
 
 /// Compares two 32-bit float values in the low-order bits of both
-///    operands for equality and returns the result of the comparison in the
+///    operands for equality.
+///
+///    The comparison yields 0x0 for false, 0xFFFFFFFF for true, in the
 ///    low-order bits of a vector [4 x float].
 ///
 /// \headerfile <x86intrin.h>
@@ -498,6 +500,8 @@ _mm_cmpeq_ss(__m128 __a, __m128 __b)
 /// Compares each of the corresponding 32-bit float values of the
 ///    128-bit vectors of [4 x float] for equality.
 ///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions.
@@ -515,8 +519,10 @@ _mm_cmpeq_ps(__m128 __a, __m128 __b)
 
 /// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the value in the first operand is less than the
-///    corresponding value in the second operand and returns the result of the
-///    comparison in the low-order bits of a vector of [4 x float].
+///    corresponding value in the second operand.
+///
+///    The comparison yields 0x0 for false, 0xFFFFFFFF for true, in the
+///    low-order bits of a vector of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -540,6 +546,8 @@ _mm_cmplt_ss(__m128 __a, __m128 __b)
 ///    128-bit vectors of [4 x float] to determine if the values in the first
 ///    operand are less than those in the second operand.
 ///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
@@ -557,9 +565,10 @@ _mm_cmplt_ps(__m128 __a, __m128 __b)
 
 /// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the value in the first operand is less than or
-///    equal to the corresponding value in the second operand and returns the
-///    result of the comparison in the low-order bits of a vector of
-///    [4 x float].
+///    equal to the corresponding value in the second operand.
+///
+///    The comparison yields 0x0 for false, 0xFFFFFFFF for true, in the
+///    low-order bits of a vector of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -583,6 +592,8 @@ _mm_cmple_ss(__m128 __a, __m128 __b)
 ///    128-bit vectors of [4 x float] to determine if the values in the first
 ///    operand are less than or equal to those in the second operand.
 ///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
@@ -600,8 +611,10 @@ _mm_cmple_ps(__m128 __a, __m128 __b)
 
 /// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the value in the first operand is greater than
-///    the corresponding value in the second operand and returns the result of
-///    the comparison in the low-order bits of a vector of [4 x float].
+///    the corresponding value in the second operand.
+///
+///    The comparison yields 0x0 for false, 0xFFFFFFFF for true, in the
+///    low-order bits of a vector of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -627,6 +640,8 @@ _mm_cmpgt_ss(__m128 __a, __m128 __b)
 ///    128-bit vectors of [4 x float] to determine if the values in the first
 ///    operand are greater than those in the second operand.
 ///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
@@ -644,9 +659,10 @@ _mm_cmpgt_ps(__m128 __a, __m128 __b)
 
 /// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the value in the first operand is greater than
-///    or equal to the corresponding value in the second operand and returns
-///    the result of the comparison in the low-order bits of a vector of
-///    [4 x float].
+///    or equal to the corresponding value in the second operand.
+///
+///    The comparison yields 0x0 for false, 0xFFFFFFFF for true, in the
+///    low-order bits of a vector of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -672,6 +688,8 @@ _mm_cmpge_ss(__m128 __a, __m128 __b)
 ///    128-bit vectors of [4 x float] to determine if the values in the first
 ///    operand are greater than or equal to those in the second operand.
 ///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
@@ -687,8 +705,10 @@ _mm_cmpge_ps(__m128 __a, __m128 __b)
   return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
 }
 
-/// Compares two 32-bit float values in the low-order bits of both
-///    operands for inequality and returns the result of the comparison in the
+/// Compares two 32-bit float values in the low-order bits of both operands
+///    for inequality.
+///
+///    The comparison yields 0x0 for false, 0xFFFFFFFF for true, in the
 ///    low-order bits of a vector of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
@@ -713,6 +733,8 @@ _mm_cmpneq_ss(__m128 __a, __m128 __b)
 /// Compares each of the corresponding 32-bit float values of the
 ///    128-bit vectors of [4 x float] for inequality.
 ///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c>
@@ -731,8 +753,10 @@ _mm_cmpneq_ps(__m128 __a, __m128 __b)
 
 /// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the value in the first operand is not less than
-///    the corresponding value in the second operand and returns the result of
-///    the comparison in the low-order bits of a vector of [4 x float].
+///    the corresponding value in the second operand.
+///
+///    The comparison yields 0x0 for false, 0xFFFFFFFF for true, in the
+///    low-order bits of a vector of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -757,6 +781,8 @@ _mm_cmpnlt_ss(__m128 __a, __m128 __b)
 ///    128-bit vectors of [4 x float] to determine if the values in the first
 ///    operand are not less than those in the second operand.
 ///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
@@ -775,9 +801,10 @@ _mm_cmpnlt_ps(__m128 __a, __m128 __b)
 
 /// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the value in the first operand is not less than
-///    or equal to the corresponding value in the second operand and returns
-///    the result of the comparison in the low-order bits of a vector of
-///    [4 x float].
+///    or equal to the corresponding value in the second operand.
+///
+///    The comparison yields 0x0 for false, 0xFFFFFFFF for true, in the
+///    low-order bits of a vector of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -802,6 +829,8 @@ _mm_cmpnle_ss(__m128 __a, __m128 __b)
 ///    128-bit vectors of [4 x float] to determine if the values in the first
 ///    operand are not less than or equal to those in the second operand.
 ///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
@@ -820,9 +849,10 @@ _mm_cmpnle_ps(__m128 __a, __m128 __b)
 
 /// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the value in the first operand is not greater
-///    than the corresponding value in the second operand and returns the
-///    result of the comparison in the low-order bits of a vector of
-///    [4 x float].
+///    than the corresponding value in the second operand.
+///
+///    The comparison yields 0x0 for false, 0xFFFFFFFF for true, in the
+///    low-order bits of a vector of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -849,6 +879,8 @@ _mm_cmpngt_ss(__m128 __a, __m128 __b)
 ///    128-bit vectors of [4 x float] to determine if the values in the first
 ///    operand are not greater than those in the second operand.
 ///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
@@ -867,9 +899,10 @@ _mm_cmpngt_ps(__m128 __a, __m128 __b)
 
 /// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the value in the first operand is not greater
-///    than or equal to the corresponding value in the second operand and
-///    returns the result of the comparison in the low-order bits of a vector
-///    of [4 x float].
+///    than or equal to the corresponding value in the second operand.
+///
+///    The comparison yields 0x0 for false, 0xFFFFFFFF for true, in the
+///    low-order bits of a vector of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -896,6 +929,8 @@ _mm_cmpnge_ss(__m128 __a, __m128 __b)
 ///    128-bit vectors of [4 x float] to determine if the values in the first
 ///    operand are not greater than or equal to those in the second operand.
 ///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
@@ -914,9 +949,10 @@ _mm_cmpnge_ps(__m128 __a, __m128 __b)
 
 /// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the value in the first operand is ordered with
-///    respect to the corresponding value in the second operand and returns the
-///    result of the comparison in the low-order bits of a vector of
-///    [4 x float].
+///    respect to the corresponding value in the second operand.
+///
+///    The comparison yields 0x0 for false, 0xFFFFFFFF for true, in the
+///    low-order bits of a vector of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -941,6 +977,8 @@ _mm_cmpord_ss(__m128 __a, __m128 __b)
 ///    128-bit vectors of [4 x float] to determine if the values in the first
 ///    operand are ordered with respect to those in the second operand.
 ///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c>
@@ -959,9 +997,10 @@ _mm_cmpord_ps(__m128 __a, __m128 __b)
 
 /// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the value in the first operand is unordered
-///    with respect to the corresponding value in the second operand and
-///    returns the result of the comparison in the low-order bits of a vector
-///    of [4 x float].
+///    with respect to the corresponding value in the second operand.
+///
+///    The comparison yields 0x0 for false, 0xFFFFFFFF for true, in the
+///    low-order bits of a vector of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -986,6 +1025,8 @@ _mm_cmpunord_ss(__m128 __a, __m128 __b)
 ///    128-bit vectors of [4 x float] to determine if the values in the first
 ///    operand are unordered with respect to those in the second operand.
 ///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c>
@@ -1003,9 +1044,10 @@ _mm_cmpunord_ps(__m128 __a, __m128 __b)
 }
 
 /// Compares two 32-bit float values in the low-order bits of both
-///    operands for equality and returns the result of the comparison.
+///    operands for equality.
 ///
-///    If either of the two lower 32-bit values is NaN, 0 is returned.
+///    The comparison returns 0 for false, 1 for true. If either of the two
+///    lower floating-point values is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1018,8 +1060,7 @@ _mm_cmpunord_ps(__m128 __a, __m128 __b)
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results. If either of the
-///    two lower 32-bit values is NaN, 0 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comieq_ss(__m128 __a, __m128 __b)
 {
@@ -1028,9 +1069,10 @@ _mm_comieq_ss(__m128 __a, __m128 __b)
 
 /// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the first operand is less than the second
-///    operand and returns the result of the comparison.
+///    operand.
 ///
-///    If either of the two lower 32-bit values is NaN, 0 is returned.
+///    The comparison returns 0 for false, 1 for true. If either of the two
+///    lower floating-point values is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1043,8 +1085,7 @@ _mm_comieq_ss(__m128 __a, __m128 __b)
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results. If either of the two
-///     lower 32-bit values is NaN, 0 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comilt_ss(__m128 __a, __m128 __b)
 {
@@ -1053,9 +1094,10 @@ _mm_comilt_ss(__m128 __a, __m128 __b)
 
 /// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the first operand is less than or equal to the
-///    second operand and returns the result of the comparison.
+///    second operand.
 ///
-///    If either of the two lower 32-bit values is NaN, 0 is returned.
+///    The comparison returns 0 for false, 1 for true. If either of the two
+///    lower floating-point values is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1067,8 +1109,7 @@ _mm_comilt_ss(__m128 __a, __m128 __b)
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results. If either of the two
-///     lower 32-bit values is NaN, 0 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comile_ss(__m128 __a, __m128 __b)
 {
@@ -1077,9 +1118,10 @@ _mm_comile_ss(__m128 __a, __m128 __b)
 
 /// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the first operand is greater than the second
-///    operand and returns the result of the comparison.
+///    operand.
 ///
-///    If either of the two lower 32-bit values is NaN, 0 is returned.
+///    The comparison returns 0 for false, 1 for true. If either of the two
+///    lower floating-point values is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1091,8 +1133,7 @@ _mm_comile_ss(__m128 __a, __m128 __b)
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results. If either of the
-///     two lower 32-bit values is NaN, 0 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comigt_ss(__m128 __a, __m128 __b)
 {
@@ -1101,9 +1142,10 @@ _mm_comigt_ss(__m128 __a, __m128 __b)
 
 /// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the first operand is greater than or equal to
-///    the second operand and returns the result of the comparison.
+///    the second operand.
 ///
-///    If either of the two lower 32-bit values is NaN, 0 is returned.
+///    The comparison returns 0 for false, 1 for true. If either of the two
+///    lower floating-point values is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1115,8 +1157,7 @@ _mm_comigt_ss(__m128 __a, __m128 __b)
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results. If either of the two
-///    lower 32-bit values is NaN, 0 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comige_ss(__m128 __a, __m128 __b)
 {
@@ -1125,9 +1166,10 @@ _mm_comige_ss(__m128 __a, __m128 __b)
 
 /// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the first operand is not equal to the second
-///    operand and returns the result of the comparison.
+///    operand.
 ///
-///    If either of the two lower 32-bit values is NaN, 1 is returned.
+///    The comparison returns 0 for false, 1 for true. If either of the two
+///    lower floating-point values is NaN, returns 1.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1139,8 +1181,7 @@ _mm_comige_ss(__m128 __a, __m128 __b)
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results. If either of the
-///     two lower 32-bit values is NaN, 1 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comineq_ss(__m128 __a, __m128 __b)
 {
@@ -1148,10 +1189,10 @@ _mm_comineq_ss(__m128 __a, __m128 __b)
 }
 
 /// Performs an unordered comparison of two 32-bit float values using
-///    the low-order bits of both operands to determine equality and returns
-///    the result of the comparison.
+///    the low-order bits of both operands to determine equality.
 ///
-///    If either of the two lower 32-bit values is NaN, 0 is returned.
+///    The comparison returns 0 for false, 1 for true.  If either of the two
+///    lower floating-point values is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1163,8 +1204,7 @@ _mm_comineq_ss(__m128 __a, __m128 __b)
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results. If either of the two
-///     lower 32-bit values is NaN, 0 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomieq_ss(__m128 __a, __m128 __b)
 {
@@ -1173,9 +1213,10 @@ _mm_ucomieq_ss(__m128 __a, __m128 __b)
 
 /// Performs an unordered comparison of two 32-bit float values using
 ///    the low-order bits of both operands to determine if the first operand is
-///    less than the second operand and returns the result of the comparison.
+///    less than the second operand.
 ///
-///    If either of the two lower 32-bit values is NaN, 0 is returned.
+///    The comparison returns 0 for false, 1 for true.  If either of the two
+///    lower floating-point values is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1187,8 +1228,7 @@ _mm_ucomieq_ss(__m128 __a, __m128 __b)
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results. If either of the two
-///    lower 32-bit values is NaN, 0 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomilt_ss(__m128 __a, __m128 __b)
 {
@@ -1197,10 +1237,10 @@ _mm_ucomilt_ss(__m128 __a, __m128 __b)
 
 /// Performs an unordered comparison of two 32-bit float values using
 ///    the low-order bits of both operands to determine if the first operand is
-///    less than or equal to the second operand and returns the result of the
-///    comparison.
+///    less than or equal to the second operand.
 ///
-///    If either of the two lower 32-bit values is NaN, 0 is returned.
+///    The comparison returns 0 for false, 1 for true.  If either of the two
+///    lower floating-point values is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1212,8 +1252,7 @@ _mm_ucomilt_ss(__m128 __a, __m128 __b)
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results. If either of the two
-///     lower 32-bit values is NaN, 0 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomile_ss(__m128 __a, __m128 __b)
 {
@@ -1222,10 +1261,10 @@ _mm_ucomile_ss(__m128 __a, __m128 __b)
 
 /// Performs an unordered comparison of two 32-bit float values using
 ///    the low-order bits of both operands to determine if the first operand is
-///    greater than the second operand and returns the result of the
-///    comparison.
+///    greater than the second operand.
 ///
-///    If either of the two lower 32-bit values is NaN, 0 is returned.
+///    The comparison returns 0 for false, 1 for true.  If either of the two
+///    lower floating-point values is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1237,8 +1276,7 @@ _mm_ucomile_ss(__m128 __a, __m128 __b)
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results. If either of the two
-///     lower 32-bit values is NaN, 0 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomigt_ss(__m128 __a, __m128 __b)
 {
@@ -1247,10 +1285,10 @@ _mm_ucomigt_ss(__m128 __a, __m128 __b)
 
 /// Performs an unordered comparison of two 32-bit float values using
 ///    the low-order bits of both operands to determine if the first operand is
-///    greater than or equal to the second operand and returns the result of
-///    the comparison.
+///    greater than or equal to the second operand.
 ///
-///    If either of the two lower 32-bit values is NaN, 0 is returned.
+///    The comparison returns 0 for false, 1 for true.  If either of the two
+///    lower floating-point values is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1262,8 +1300,7 @@ _mm_ucomigt_ss(__m128 __a, __m128 __b)
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results. If either of the two
-///     lower 32-bit values is NaN, 0 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomige_ss(__m128 __a, __m128 __b)
 {
@@ -1271,10 +1308,10 @@ _mm_ucomige_ss(__m128 __a, __m128 __b)
 }
 
 /// Performs an unordered comparison of two 32-bit float values using
-///    the low-order bits of both operands to determine inequality and returns
-///    the result of the comparison.
+///    the low-order bits of both operands to determine inequality.
 ///
-///    If either of the two lower 32-bit values is NaN, 1 is returned.
+///    The comparison returns 0 for false, 1 for true.  If either of the two
+///    lower floating-point values is NaN, returns 1.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1286,8 +1323,7 @@ _mm_ucomige_ss(__m128 __a, __m128 __b)
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results. If either of the two
-///    lower 32-bit values is NaN, 1 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomineq_ss(__m128 __a, __m128 __b)
 {

From 212604698c0f265702ec9c9486fe5b74a6fc2ff7 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad@amd.com>
Date: Mon, 11 Mar 2024 18:25:49 +0000
Subject: [PATCH 20/95] [AMDGPU] Add missing tests for GFX10 (t)buffer format
 d16 instructions (#84789)

---
 llvm/test/MC/AMDGPU/gfx10_asm_mubuf.s         | 24 +++++++++++++++++++
 llvm/test/MC/AMDGPU/mtbuf-gfx10.s             |  6 +++++
 .../MC/Disassembler/AMDGPU/gfx10_mtbuf.txt    |  6 +++++
 .../MC/Disassembler/AMDGPU/gfx10_mubuf.txt    | 24 +++++++++++++++++++
 4 files changed, 60 insertions(+)

diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_mubuf.s b/llvm/test/MC/AMDGPU/gfx10_asm_mubuf.s
index 99c9c4aee4a76..aacdfcb4e871e 100644
--- a/llvm/test/MC/AMDGPU/gfx10_asm_mubuf.s
+++ b/llvm/test/MC/AMDGPU/gfx10_asm_mubuf.s
@@ -5,6 +5,18 @@
 // ENC_MUBUF.
 //===----------------------------------------------------------------------===//
 
+buffer_load_format_d16_x v1, off, s[4:7], s1
+// GFX10: encoding: [0x00,0x00,0x00,0xe2,0x00,0x01,0x01,0x01]
+
+buffer_load_format_d16_xy v1, off, s[4:7], s1
+// GFX10: encoding: [0x00,0x00,0x04,0xe2,0x00,0x01,0x01,0x01]
+
+buffer_load_format_d16_xyz v[1:2], off, s[4:7], s1
+// GFX10: encoding: [0x00,0x00,0x08,0xe2,0x00,0x01,0x01,0x01]
+
+buffer_load_format_d16_xyzw v[1:2], off, s[4:7], s1
+// GFX10: encoding: [0x00,0x00,0x0c,0xe2,0x00,0x01,0x01,0x01]
+
 buffer_load_format_x v5, off, s[8:11], s3 offset:4095
 // GFX10: encoding: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x02,0x03]
 
@@ -221,6 +233,18 @@ buffer_load_format_xyzw v[5:8], off, s[8:11], s3 offset:4095 dlc
 buffer_load_format_xyzw v[5:8], off, s[8:11], s3 offset:4095 glc slc dlc
 // GFX10: encoding: [0xff,0xcf,0x0c,0xe0,0x00,0x05,0x42,0x03]
 
+buffer_store_format_d16_x v1, off, s[4:7], s1
+// GFX10: encoding: [0x00,0x00,0x10,0xe2,0x00,0x01,0x01,0x01]
+
+buffer_store_format_d16_xy v1, off, s[4:7], s1
+// GFX10: encoding: [0x00,0x00,0x14,0xe2,0x00,0x01,0x01,0x01]
+
+buffer_store_format_d16_xyz v[1:2], off, s[4:7], s1
+// GFX10: encoding: [0x00,0x00,0x18,0xe2,0x00,0x01,0x01,0x01]
+
+buffer_store_format_d16_xyzw v[1:2], off, s[4:7], s1
+// GFX10: encoding: [0x00,0x00,0x1c,0xe2,0x00,0x01,0x01,0x01]
+
 buffer_store_format_x v1, off, s[12:15], s4 offset:4095
 // GFX10: encoding: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x03,0x04]
 
diff --git a/llvm/test/MC/AMDGPU/mtbuf-gfx10.s b/llvm/test/MC/AMDGPU/mtbuf-gfx10.s
index f235280874c4a..56add346bd21f 100644
--- a/llvm/test/MC/AMDGPU/mtbuf-gfx10.s
+++ b/llvm/test/MC/AMDGPU/mtbuf-gfx10.s
@@ -11,6 +11,9 @@ tbuffer_load_format_d16_x v0, off, s[0:3], format:22, 0
 // GFX10: tbuffer_load_format_d16_xy v0, off, s[0:3], 0 format:[BUF_FMT_32_FLOAT] ; encoding: [0x00,0x00,0xb1,0xe8,0x00,0x00,0x20,0x80]
 tbuffer_load_format_d16_xy v0, off, s[0:3], format:22, 0
 
+// GFX10: tbuffer_load_format_d16_xyz v[0:1], off, s[0:3], 0 format:[BUF_FMT_32_FLOAT] ; encoding: [0x00,0x00,0xb2,0xe8,0x00,0x00,0x20,0x80]
+tbuffer_load_format_d16_xyz v[0:1], off, s[0:3], format:22, 0
+
 // GFX10: tbuffer_load_format_d16_xyzw v[0:1], off, s[0:3], 0 format:[BUF_FMT_32_FLOAT] ; encoding: [0x00,0x00,0xb3,0xe8,0x00,0x00,0x20,0x80]
 tbuffer_load_format_d16_xyzw v[0:1], off, s[0:3], format:22, 0
 
@@ -62,6 +65,9 @@ tbuffer_store_format_d16_x v0, v1, s[4:7], format:33, 0 idxen
 // GFX10: tbuffer_store_format_d16_xy v0, v1, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen ; encoding: [0x00,0x20,0x0d,0xe9,0x01,0x00,0x21,0x80]
 tbuffer_store_format_d16_xy v0, v1, s[4:7], format:33, 0 idxen
 
+// GFX10: tbuffer_store_format_d16_xyz v[0:1], v2, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen ; encoding: [0x00,0x20,0x0e,0xe9,0x02,0x00,0x21,0x80]
+tbuffer_store_format_d16_xyz v[0:1], v2, s[4:7], format:33, 0 idxen
+
 // GFX10: tbuffer_store_format_d16_xyzw v[0:1], v2, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen ; encoding: [0x00,0x20,0x0f,0xe9,0x02,0x00,0x21,0x80]
 tbuffer_store_format_d16_xyzw v[0:1], v2, s[4:7], format:33, 0 idxen
 
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_mtbuf.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_mtbuf.txt
index 950ce783baba2..b6232e84549ba 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_mtbuf.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_mtbuf.txt
@@ -6,6 +6,9 @@
 # GFX10: tbuffer_load_format_d16_xy v0, off, s[0:3], 0 format:[BUF_FMT_32_FLOAT]
 0x00,0x00,0xb1,0xe8,0x00,0x00,0x20,0x80
 
+# GFX10: tbuffer_load_format_d16_xyz v[0:1], off, s[0:3], 0 format:[BUF_FMT_32_FLOAT]
+0x00,0x00,0xb2,0xe8,0x00,0x00,0x20,0x80
+
 # GFX10: tbuffer_load_format_d16_xyzw v[0:1], off, s[0:3], 0 format:[BUF_FMT_32_FLOAT]
 0x00,0x00,0xb3,0xe8,0x00,0x00,0x20,0x80
 
@@ -57,6 +60,9 @@
 # GFX10: tbuffer_store_format_d16_xy v0, v1, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen
 0x00,0x20,0x0d,0xe9,0x01,0x00,0x21,0x80
 
+# GFX10: tbuffer_store_format_d16_xyz v[0:1], v2, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen
+0x00,0x20,0x0e,0xe9,0x02,0x00,0x21,0x80
+
 # GFX10: tbuffer_store_format_d16_xyzw v[0:1], v2, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen
 0x00,0x20,0x0f,0xe9,0x02,0x00,0x21,0x80
 
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_mubuf.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_mubuf.txt
index 6fbe77e43ad42..b0731be4484c7 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_mubuf.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_mubuf.txt
@@ -1316,6 +1316,18 @@
 # GFX10: buffer_load_dwordx4 v[5:8], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x38,0xe0,0x00,0x05,0x02,0x03]
 0xff,0x1f,0x38,0xe0,0x00,0x05,0x02,0x03
 
+# GFX10: buffer_load_format_d16_x v1, off, s[4:7], s1 ; encoding: [0x00,0x00,0x00,0xe2,0x00,0x01,0x01,0x01]
+0x00,0x00,0x00,0xe2,0x00,0x01,0x01,0x01
+
+# GFX10: buffer_load_format_d16_xy v1, off, s[4:7], s1 ; encoding: [0x00,0x00,0x04,0xe2,0x00,0x01,0x01,0x01]
+0x00,0x00,0x04,0xe2,0x00,0x01,0x01,0x01
+
+# GFX10: buffer_load_format_d16_xyz v[1:2], off, s[4:7], s1 ; encoding: [0x00,0x00,0x08,0xe2,0x00,0x01,0x01,0x01]
+0x00,0x00,0x08,0xe2,0x00,0x01,0x01,0x01
+
+# GFX10: buffer_load_format_d16_xyzw v[1:2], off, s[4:7], s1 ; encoding: [0x00,0x00,0x0c,0xe2,0x00,0x01,0x01,0x01]
+0x00,0x00,0x0c,0xe2,0x00,0x01,0x01,0x01
+
 # GFX10: buffer_load_format_x v255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe0,0x00,0xff,0x02,0x03]
 0xff,0x0f,0x00,0xe0,0x00,0xff,0x02,0x03
 
@@ -2015,6 +2027,18 @@
 # GFX10: buffer_store_dwordx4 v[252:255], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x78,0xe0,0x00,0xfc,0x03,0x04]
 0xff,0x0f,0x78,0xe0,0x00,0xfc,0x03,0x04
 
+# GFX10: buffer_store_format_d16_x v1, off, s[4:7], s1 ; encoding: [0x00,0x00,0x10,0xe2,0x00,0x01,0x01,0x01]
+0x00,0x00,0x10,0xe2,0x00,0x01,0x01,0x01
+
+# GFX10: buffer_store_format_d16_xy v1, off, s[4:7], s1 ; encoding: [0x00,0x00,0x14,0xe2,0x00,0x01,0x01,0x01]
+0x00,0x00,0x14,0xe2,0x00,0x01,0x01,0x01
+
+# GFX10: buffer_store_format_d16_xyz v[1:2], off, s[4:7], s1 ; encoding: [0x00,0x00,0x18,0xe2,0x00,0x01,0x01,0x01]
+0x00,0x00,0x18,0xe2,0x00,0x01,0x01,0x01
+
+# GFX10: buffer_store_format_d16_xyzw v[1:2], off, s[4:7], s1 ; encoding: [0x00,0x00,0x1c,0xe2,0x00,0x01,0x01,0x01]
+0x00,0x00,0x1c,0xe2,0x00,0x01,0x01,0x01
+
 # GFX10: buffer_store_format_x v1, off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x03,0xc1]
 0xff,0x0f,0x10,0xe0,0x00,0x01,0x03,0xc1
 

From 23be73208d63898611b81d4b93a0c254a40c879c Mon Sep 17 00:00:00 2001
From: Changpeng Fang <changpeng.fang@amd.com>
Date: Mon, 11 Mar 2024 11:34:58 -0700
Subject: [PATCH 21/95] AMDGPU: Add an argument to DS_Real_gfx12 to disable
 alias, NFC (#84717)

This is for cases where we simply want to rename from ps.Mnemonic, but
ps.Mnemonic itself is not supported as an alias.
---
 llvm/lib/Target/AMDGPU/DSInstructions.td | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index a84227ebf506f..cc763df5a4760 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -1210,13 +1210,13 @@ class Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<bits<8> op, DS_Pseudo ps, int ef,
 // GFX12.
 //===----------------------------------------------------------------------===//
 
-multiclass DS_Real_gfx12<bits<8> op, string name = !tolower(NAME)> {
+multiclass DS_Real_gfx12<bits<8> op, string name = !tolower(NAME), bit needAlias = true> {
   defvar ps = !cast<DS_Pseudo>(NAME);
   let AssemblerPredicate = isGFX12Plus, DecoderNamespace = "GFX12" in
     def _gfx12 :
       Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<op, ps, SIEncodingFamily.GFX12,
                                                name, /*hasGDS=*/false>;
-  if !ne(ps.Mnemonic, name) then
+  if !and(needAlias, !ne(ps.Mnemonic, name)) then
     def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX12Plus]>;
 }
 

From 5b4c35064760816e4c29921df8f7ff4f2621d4f9 Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek <Krzysztof.Parzyszek@amd.com>
Date: Mon, 11 Mar 2024 13:39:47 -0500
Subject: [PATCH 22/95] [flang][unittests] Fix buffer underrun in
 LengthWithoutTrailingSpaces (#84382)

Account for the descriptor containing a zero-length string. Also, avoid
iterating backwards too far.

This was detected by address sanitizer.
---
 flang/runtime/command.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/flang/runtime/command.cpp b/flang/runtime/command.cpp
index 7c44890545bd3..fabfe601688bb 100644
--- a/flang/runtime/command.cpp
+++ b/flang/runtime/command.cpp
@@ -196,11 +196,11 @@ std::int32_t RTNAME(GetCommand)(const Descriptor *value,
 }
 
 static std::size_t LengthWithoutTrailingSpaces(const Descriptor &d) {
-  std::size_t s{d.ElementBytes() - 1};
-  while (*d.OffsetElement(s) == ' ') {
+  std::size_t s{d.ElementBytes()}; // This can be 0.
+  while (s != 0 && *d.OffsetElement(s - 1) == ' ') {
     --s;
   }
-  return s + 1;
+  return s;
 }
 
 std::int32_t RTNAME(GetEnvVariable)(const Descriptor &name,

From 8846b91e15d4c8d280ee727c0f69b958f9b1440b Mon Sep 17 00:00:00 2001
From: Jeff Niu <jeff@modular.com>
Date: Mon, 11 Mar 2024 11:44:11 -0700
Subject: [PATCH 23/95] Revert "[CMake][LIT] Add option to run lit testsuites
 in parallel" (#84813)

Reverts llvm/llvm-project#82899

Per the discussion on the PR, this needs more design and justification.
---
 llvm/CMakeLists.txt              |  2 --
 llvm/cmake/modules/AddLLVM.cmake | 17 +++++------------
 llvm/docs/CMake.rst              |  6 ------
 3 files changed, 5 insertions(+), 20 deletions(-)

diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index d9a17a869acfa..bd141619d03fd 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -712,8 +712,6 @@ if(LLVM_INDIVIDUAL_TEST_COVERAGE)
 endif()
 set(LLVM_LIT_ARGS "${LIT_ARGS_DEFAULT}" CACHE STRING "Default options for lit")
 
-option(LLVM_PARALLEL_LIT "Enable multiple lit suites to run in parallel" OFF)
-
 # On Win32 hosts, provide an option to specify the path to the GnuWin32 tools.
 if( WIN32 AND NOT CYGWIN )
   set(LLVM_LIT_TOOLS_DIR "" CACHE PATH "Path to GnuWin32 tools")
diff --git a/llvm/cmake/modules/AddLLVM.cmake b/llvm/cmake/modules/AddLLVM.cmake
index 828de4bd9940d..374f5e085d911 100644
--- a/llvm/cmake/modules/AddLLVM.cmake
+++ b/llvm/cmake/modules/AddLLVM.cmake
@@ -1947,18 +1947,11 @@ function(add_lit_target target comment)
     list(APPEND LIT_COMMAND --param ${param})
   endforeach()
   if (ARG_UNPARSED_ARGUMENTS)
-    if (LLVM_PARALLEL_LIT)
-     add_custom_target(${target}
-       COMMAND ${LIT_COMMAND} ${ARG_UNPARSED_ARGUMENTS}
-       COMMENT "${comment}"
-       )
-    else()
-     add_custom_target(${target}
-       COMMAND ${LIT_COMMAND} ${ARG_UNPARSED_ARGUMENTS}
-       COMMENT "${comment}"
-       USES_TERMINAL
-       )
-    endif()
+    add_custom_target(${target}
+      COMMAND ${LIT_COMMAND} ${ARG_UNPARSED_ARGUMENTS}
+      COMMENT "${comment}"
+      USES_TERMINAL
+      )
   else()
     add_custom_target(${target}
       COMMAND ${CMAKE_COMMAND} -E echo "${target} does nothing, no tools built.")
diff --git a/llvm/docs/CMake.rst b/llvm/docs/CMake.rst
index be5da5652e31e..1490b38feb1eb 100644
--- a/llvm/docs/CMake.rst
+++ b/llvm/docs/CMake.rst
@@ -762,12 +762,6 @@ enabled sub-projects. Nearly all of these variable names begin with
 **LLVM_PARALLEL_LINK_JOBS**:STRING
   Define the maximum number of concurrent link jobs.
 
-**LLVM_PARALLEL_LIT**:BOOL
-  Defaults to ``OFF``. If set to ``OFF``, lit testsuites will be configured
-  with CMake's ``USES_TERMINAL`` flag to give direct access to the terminal. If
-  set to ``ON``, that flag will be removed allowing Ninja to schedule multiple
-  lit testsuites in parallel.
-
 **LLVM_RAM_PER_COMPILE_JOB**:STRING
   Calculates the amount of Ninja compile jobs according to available resources.
   Value has to be in MB, overwrites LLVM_PARALLEL_COMPILE_JOBS. Compile jobs 

From b4e0890458043ef486fdecba9aad65799ec0ab35 Mon Sep 17 00:00:00 2001
From: Florian Mayer <fmayer@google.com>
Date: Mon, 11 Mar 2024 11:46:45 -0700
Subject: [PATCH 24/95] [NFC] [scudo] move static_assert closer to class it
 relates to (#84257)

delete other static_assert
---
 compiler-rt/lib/scudo/standalone/combined.h    | 10 ----------
 compiler-rt/lib/scudo/standalone/stack_depot.h |  4 ++++
 2 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h
index 069b5f64475db..4dacfac707926 100644
--- a/compiler-rt/lib/scudo/standalone/combined.h
+++ b/compiler-rt/lib/scudo/standalone/combined.h
@@ -1553,16 +1553,6 @@ class Allocator {
     constexpr u32 kFramesPerStack = 16;
     static_assert(isPowerOfTwo(kFramesPerStack));
 
-    // We need StackDepot to be aligned to 8-bytes so the ring we store after
-    // is correctly assigned.
-    static_assert(sizeof(StackDepot) % alignof(atomic_u64) == 0);
-
-    // Make sure the maximum sized StackDepot fits withint a uintptr_t to
-    // simplify the overflow checking.
-    static_assert(sizeof(StackDepot) + UINT32_MAX * sizeof(atomic_u64) *
-                                           UINT32_MAX * sizeof(atomic_u32) <
-                  UINTPTR_MAX);
-
     if (AllocationRingBufferSize > kMaxU32Pow2 / kStacksPerRingBufferEntry)
       return;
     u32 TabSize = static_cast<u32>(roundUpPowerOfTwo(kStacksPerRingBufferEntry *
diff --git a/compiler-rt/lib/scudo/standalone/stack_depot.h b/compiler-rt/lib/scudo/standalone/stack_depot.h
index 620137e44f372..cf3cabf7085b6 100644
--- a/compiler-rt/lib/scudo/standalone/stack_depot.h
+++ b/compiler-rt/lib/scudo/standalone/stack_depot.h
@@ -199,6 +199,10 @@ class alignas(atomic_u64) StackDepot {
   void enable() NO_THREAD_SAFETY_ANALYSIS { RingEndMu.unlock(); }
 };
 
+// We need StackDepot to be aligned to 8-bytes so the ring we store after
+// is correctly assigned.
+static_assert(sizeof(StackDepot) % alignof(atomic_u64) == 0);
+
 } // namespace scudo
 
 #endif // SCUDO_STACK_DEPOT_H_

From a8eb2f0dabacb334cbfc78eaffde9a75b1ba64a4 Mon Sep 17 00:00:00 2001
From: Egor Zhdan <e_zhdan@apple.com>
Date: Mon, 11 Mar 2024 18:47:30 +0000
Subject: [PATCH 25/95] [Clang][AST] Print attributes of Obj-C interfaces

When pretty printing an Objective-C interface declaration, Clang
previously didn't print any attributes that are applied to the
declaration.
---
 clang/lib/AST/DeclPrinter.cpp         | 5 +++++
 clang/test/AST/ast-print-objectivec.m | 8 ++++++++
 2 files changed, 13 insertions(+)

diff --git a/clang/lib/AST/DeclPrinter.cpp b/clang/lib/AST/DeclPrinter.cpp
index 43d221968ea3f..b701581b2474a 100644
--- a/clang/lib/AST/DeclPrinter.cpp
+++ b/clang/lib/AST/DeclPrinter.cpp
@@ -1517,6 +1517,11 @@ void DeclPrinter::VisitObjCInterfaceDecl(ObjCInterfaceDecl *OID) {
     return;
   }
   bool eolnOut = false;
+  if (OID->hasAttrs()) {
+    prettyPrintAttributes(OID);
+    Out << "\n";
+  }
+
   Out << "@interface " << I;
 
   if (auto TypeParams = OID->getTypeParamListAsWritten()) {
diff --git a/clang/test/AST/ast-print-objectivec.m b/clang/test/AST/ast-print-objectivec.m
index 05a0a5d4aa74c..a0652f38e713f 100644
--- a/clang/test/AST/ast-print-objectivec.m
+++ b/clang/test/AST/ast-print-objectivec.m
@@ -21,6 +21,10 @@ - (void)MethI __attribute__((availability(macosx,introduced=10.1.0,deprecated=10
 - (void)methodWithArg:(int)x andAnotherOne:(int)y { }
 @end
 
+__attribute__((availability(macosx,introduced=10.1.0,deprecated=10.2)))
+@interface InterfaceWithAttribute
+@end
+
 // CHECK: @protocol P
 // CHECK: - (void)MethP __attribute__((availability(macos, introduced=10.1.0, deprecated=10.2)));
 // CHECK: @end
@@ -45,6 +49,10 @@ - (void)methodWithArg:(int)x andAnotherOne:(int)y { }
 
 // CHECK: @end
 
+// CHECK: __attribute__((availability(macos, introduced=10.1.0, deprecated=10.2)))
+// CHECK: @interface InterfaceWithAttribute
+// CHECK: @end
+
 @class C1;
 struct __attribute__((objc_bridge_related(C1,,))) S1;
 

From 337a20071518d647a0d453f93055817131aa15e9 Mon Sep 17 00:00:00 2001
From: Florian Mayer <fmayer@google.com>
Date: Mon, 11 Mar 2024 11:47:59 -0700
Subject: [PATCH 26/95] [NFC] [scudo] Move static_assert to class it concerns
 (#84245)

---
 compiler-rt/lib/scudo/standalone/combined.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h
index 4dacfac707926..9e1fd6d6dca3c 100644
--- a/compiler-rt/lib/scudo/standalone/combined.h
+++ b/compiler-rt/lib/scudo/standalone/combined.h
@@ -1081,6 +1081,11 @@ class Allocator {
     // An array of Size (at least one) elements of type Entry is immediately
     // following to this struct.
   };
+  static_assert(sizeof(AllocationRingBuffer) %
+                        alignof(typename AllocationRingBuffer::Entry) ==
+                    0,
+                "invalid alignment");
+
   // Pointer to memory mapped area starting with AllocationRingBuffer struct,
   // and immediately followed by Size elements of type Entry.
   atomic_uptr RingBufferAddress = {};
@@ -1585,10 +1590,6 @@ class Allocator {
 
     atomic_store(&RingBufferAddress, reinterpret_cast<uptr>(RB),
                  memory_order_release);
-    static_assert(sizeof(AllocationRingBuffer) %
-                          alignof(typename AllocationRingBuffer::Entry) ==
-                      0,
-                  "invalid alignment");
   }
 
   void unmapRingBuffer() {

From 08a9207f947b8b022d70f8ee7eeeda7acc6aac76 Mon Sep 17 00:00:00 2001
From: Usama Hameed <u_hameed@apple.com>
Date: Mon, 11 Mar 2024 11:57:53 -0700
Subject: [PATCH 27/95] [LLDB] ASanLibsanitizers Use
 `sanitizers_address_on_report` breakpoint (#84583)

symbol

This patch puts the default breakpoint on the
sanitizers_address_on_report symbol, and uses the old symbol as a backup
if the default symbol is not found.

rdar://123911522
---
 .../InstrumentationRuntimeASanLibsanitizers.cpp       | 11 +++++++++--
 .../Utility/ReportRetriever.cpp                       |  1 +
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/lldb/source/Plugins/InstrumentationRuntime/ASanLibsanitizers/InstrumentationRuntimeASanLibsanitizers.cpp b/lldb/source/Plugins/InstrumentationRuntime/ASanLibsanitizers/InstrumentationRuntimeASanLibsanitizers.cpp
index d84cd36d7ce17..cd91f4a6ff1bc 100644
--- a/lldb/source/Plugins/InstrumentationRuntime/ASanLibsanitizers/InstrumentationRuntimeASanLibsanitizers.cpp
+++ b/lldb/source/Plugins/InstrumentationRuntime/ASanLibsanitizers/InstrumentationRuntimeASanLibsanitizers.cpp
@@ -90,9 +90,16 @@ void InstrumentationRuntimeASanLibsanitizers::Activate() {
   if (!process_sp)
     return;
 
+  lldb::ModuleSP module_sp = GetRuntimeModuleSP();
+
   Breakpoint *breakpoint = ReportRetriever::SetupBreakpoint(
-      GetRuntimeModuleSP(), process_sp,
-      ConstString("_Z22raise_sanitizers_error23sanitizer_error_context"));
+      module_sp, process_sp, ConstString("sanitizers_address_on_report"));
+
+  if (!breakpoint) {
+    breakpoint = ReportRetriever::SetupBreakpoint(
+        module_sp, process_sp,
+        ConstString("_Z22raise_sanitizers_error23sanitizer_error_context"));
+  }
 
   if (!breakpoint)
     return;
diff --git a/lldb/source/Plugins/InstrumentationRuntime/Utility/ReportRetriever.cpp b/lldb/source/Plugins/InstrumentationRuntime/Utility/ReportRetriever.cpp
index ff58c4cababae..298b63bc716fc 100644
--- a/lldb/source/Plugins/InstrumentationRuntime/Utility/ReportRetriever.cpp
+++ b/lldb/source/Plugins/InstrumentationRuntime/Utility/ReportRetriever.cpp
@@ -219,6 +219,7 @@ bool ReportRetriever::NotifyBreakpointHit(ProcessSP process_sp,
   return true; // Return true to stop the target
 }
 
+// FIXME: Setup the breakpoint using a less fragile SPI. rdar://124399066
 Breakpoint *ReportRetriever::SetupBreakpoint(ModuleSP module_sp,
                                              ProcessSP process_sp,
                                              ConstString symbol_name) {

From eaa71a97f9155ea9df33141ef2fb369dc8fc464f Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka@google.com>
Date: Mon, 11 Mar 2024 12:07:28 -0700
Subject: [PATCH 28/95] [clang] Add optional pass to remove UBSAN traps using
 PGO (#84214)

With #83471 it reduces UBSAN overhead from 44% to 6%.
Measured as "Geomean difference" on "test-suite/MultiSource/Benchmarks"
with PGO build.

On a real large server binary we see that 95% of the code is still
instrumented, with UBSAN overhead improving from 10% to 1.5%. We can pass this
test only with a subset of UBSAN, so the base overhead is smaller.

We have followup patches to improve it even further.
---
 clang/lib/CodeGen/BackendUtil.cpp | 21 +++++++++++++++++++++
 clang/test/CodeGen/remote-traps.c | 15 +++++++++++++++
 2 files changed, 36 insertions(+)
 create mode 100644 clang/test/CodeGen/remote-traps.c

diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index 7310e3817c79a..82b30b8d81562 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -76,6 +76,7 @@
 #include "llvm/Transforms/Instrumentation/MemProfiler.h"
 #include "llvm/Transforms/Instrumentation/MemorySanitizer.h"
 #include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
+#include "llvm/Transforms/Instrumentation/RemoveTrapsPass.h"
 #include "llvm/Transforms/Instrumentation/SanitizerBinaryMetadata.h"
 #include "llvm/Transforms/Instrumentation/SanitizerCoverage.h"
 #include "llvm/Transforms/Instrumentation/ThreadSanitizer.h"
@@ -83,6 +84,7 @@
 #include "llvm/Transforms/Scalar/EarlyCSE.h"
 #include "llvm/Transforms/Scalar/GVN.h"
 #include "llvm/Transforms/Scalar/JumpThreading.h"
+#include "llvm/Transforms/Scalar/SimplifyCFG.h"
 #include "llvm/Transforms/Utils/Debugify.h"
 #include "llvm/Transforms/Utils/EntryExitInstrumenter.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
@@ -98,6 +100,10 @@ using namespace llvm;
 namespace llvm {
 extern cl::opt<bool> PrintPipelinePasses;
 
+cl::opt<bool> ClRemoveTraps("clang-remove-traps", cl::Optional,
+                            cl::desc("Insert remove-traps pass."),
+                            cl::init(false));
+
 // Experiment to move sanitizers earlier.
 static cl::opt<bool> ClSanitizeOnOptimizerEarlyEP(
     "sanitizer-early-opt-ep", cl::Optional,
@@ -744,6 +750,21 @@ static void addSanitizers(const Triple &TargetTriple,
     // LastEP does not need GlobalsAA.
     PB.registerOptimizerLastEPCallback(SanitizersCallback);
   }
+
+  if (ClRemoveTraps) {
+    // We can optimize after inliner, and PGO profile matching. The hook below
+    // is called at the end `buildFunctionSimplificationPipeline`, which called
+    // from `buildInlinerPipeline`, which called after profile matching.
+    PB.registerScalarOptimizerLateEPCallback(
+        [](FunctionPassManager &FPM, OptimizationLevel Level) {
+          // RemoveTrapsPass expects trap blocks preceded by conditional
+          // branches, which usually is not the case without SimplifyCFG.
+          // TODO: Remove `SimplifyCFGPass` after switching to dedicated
+          // intrinsic.
+          FPM.addPass(SimplifyCFGPass());
+          FPM.addPass(RemoveTrapsPass());
+        });
+  }
 }
 
 void EmitAssemblyHelper::RunOptimizationPipeline(
diff --git a/clang/test/CodeGen/remote-traps.c b/clang/test/CodeGen/remote-traps.c
new file mode 100644
index 0000000000000..f053d1bd157f8
--- /dev/null
+++ b/clang/test/CodeGen/remote-traps.c
@@ -0,0 +1,15 @@
+// RUN: %clang_cc1 -O1 -emit-llvm -fsanitize=signed-integer-overflow -fsanitize-trap=signed-integer-overflow %s -o - | FileCheck %s 
+// RUN: %clang_cc1 -O1 -emit-llvm -fsanitize=signed-integer-overflow -fsanitize-trap=signed-integer-overflow -mllvm -clang-remove-traps -mllvm -remove-traps-random-rate=1 %s -o - | FileCheck %s --implicit-check-not="call void @llvm.ubsantrap" --check-prefixes=REMOVE
+
+int f(int x) {
+  return x + 123;
+}
+
+// CHECK-LABEL: define dso_local noundef i32 @f(
+// CHECK: call { i32, i1 } @llvm.sadd.with.overflow.i32(
+// CHECK: trap:
+// CHECK-NEXT: call void @llvm.ubsantrap(i8 0)
+// CHECK-NEXT: unreachable
+
+// REMOVE-LABEL: define dso_local noundef i32 @f(
+// REMOVE: call { i32, i1 } @llvm.sadd.with.overflow.i32(

From d1d80cc3197faa4194cddcc79ff704b7d4c5b9e4 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn@outlook.com>
Date: Mon, 11 Mar 2024 14:09:59 -0500
Subject: [PATCH 29/95] [HIP] Make the new driver bundle outputs for
 device-only (#84534)

Summary:
The current behavior of HIP is that when --offload-device-only is set it
still bundles the outputs into a fat binary. Even though this is
different from how all the other targets handle this, it seems to be
depended on by some tooling, so just make it backwards compatible for
the `-fno-gpu-rdc` case.
---
 clang/lib/Driver/Driver.cpp       | 11 ++++++++++-
 clang/test/Driver/hip-binding.hip | 13 +++++++++++--
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index fce43430a9137..190782a79a245 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -4638,7 +4638,12 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
     }
   }
 
-  if (offloadDeviceOnly())
+  // All kinds exit now in device-only mode except for non-RDC mode HIP.
+  if (offloadDeviceOnly() &&
+      (!C.isOffloadingHostKind(Action::OFK_HIP) ||
+       !Args.hasFlag(options::OPT_gpu_bundle_output,
+                     options::OPT_no_gpu_bundle_output, true) ||
+       Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false)))
     return C.MakeAction<OffloadAction>(DDeps, types::TY_Nothing);
 
   if (OffloadActions.empty())
@@ -4671,6 +4676,10 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
              nullptr, C.getActiveOffloadKinds());
   }
 
+  // HIP wants '--offload-device-only' to create a fatbinary by default.
+  if (offloadDeviceOnly())
+    return C.MakeAction<OffloadAction>(DDep, types::TY_Nothing);
+
   // If we are unable to embed a single device output into the host, we need to
   // add each device output as a host dependency to ensure they are still built.
   bool SingleDeviceOutput = !llvm::any_of(OffloadActions, [](Action *A) {
diff --git a/clang/test/Driver/hip-binding.hip b/clang/test/Driver/hip-binding.hip
index 79ec2039edb74..c116ad80a8ad8 100644
--- a/clang/test/Driver/hip-binding.hip
+++ b/clang/test/Driver/hip-binding.hip
@@ -65,9 +65,18 @@
 // MULTI-D-ONLY-NEXT: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[GFX90a]]"], output: "[[GFX90a_OUT:.+]]"
 //
 // RUN: not %clang -### --target=x86_64-linux-gnu --offload-new-driver -ccc-print-bindings -nogpulib -nogpuinc \
-// RUN:        --offload-arch=gfx90a --offload-arch=gfx908 --offload-device-only -c -o %t %s 2>&1 \
+// RUN:        --no-gpu-bundle-output --offload-arch=gfx90a --offload-arch=gfx908 --offload-device-only -c -o %t %s 2>&1 \
+// RUN: | FileCheck -check-prefix=MULTI-D-ONLY-NO-BUNDLE-O %s
+// MULTI-D-ONLY-NO-BUNDLE-O: error: cannot specify -o when generating multiple output files
+
+// RUN: %clang -### --target=x86_64-linux-gnu --offload-new-driver -ccc-print-bindings -nogpulib -nogpuinc \
+// RUN:        --gpu-bundle-output --offload-arch=gfx90a --offload-arch=gfx908 --offload-device-only -c -o a.out %s 2>&1 \
 // RUN: | FileCheck -check-prefix=MULTI-D-ONLY-O %s
-// MULTI-D-ONLY-O: error: cannot specify -o when generating multiple output files
+//      MULTI-D-ONLY-O: "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[GFX908_OBJ:.+]]"
+// MULTI-D-ONLY-O-NEXT: "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[GFX908_OBJ]]"], output: "[[GFX908:.+]]"
+// MULTI-D-ONLY-O-NEXT: "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT]]"], output: "[[GFX90A_OBJ:.+]]"
+// MULTI-D-ONLY-O-NEXT: "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[GFX90A_OBJ]]"], output: "[[GFX90A:.+]]"
+// MULTI-D-ONLY-O-NEXT: "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[GFX908]]", "[[GFX90A]]"], output: "a.out"
 
 //
 // Check to ensure that we can use '-fsyntax-only' for HIP output with the new

From 6d4aa9d70e4808498584cc61a295c0b93310196d Mon Sep 17 00:00:00 2001
From: Alexander Yermolovich <43973793+ayermolo@users.noreply.github.com>
Date: Mon, 11 Mar 2024 12:20:25 -0700
Subject: [PATCH 30/95] [BOLT][DWARF] Fix foreign TU index with local TUs
 (#84594)

The foreign TU list immediately follows the local TU list and they both
use the same index, so that if there are N local TU entries, the index
for the first foreign TU is N.

Changed so that the size of local TU is accounted for when setting
foreign TU index.
---
 bolt/lib/Core/DebugNames.cpp                  |   7 +-
 .../dwarf5-debug-names-ftu-ltu-mix-helper.s   | 314 +++++++++++
 .../dwarf5-debug-names-ftu-ltu-mix-helper1.s  | 315 +++++++++++
 .../dwarf5-df-debug-names-ftu-ltu-mix-main.s  | 505 ++++++++++++++++++
 ...warf5-df-main-debug-names-ftu-ltu-mix.test |  56 ++
 5 files changed, 1196 insertions(+), 1 deletion(-)
 create mode 100644 bolt/test/X86/Inputs/dwarf5-debug-names-ftu-ltu-mix-helper.s
 create mode 100644 bolt/test/X86/Inputs/dwarf5-debug-names-ftu-ltu-mix-helper1.s
 create mode 100644 bolt/test/X86/Inputs/dwarf5-df-debug-names-ftu-ltu-mix-main.s
 create mode 100644 bolt/test/X86/dwarf5-df-main-debug-names-ftu-ltu-mix.test

diff --git a/bolt/lib/Core/DebugNames.cpp b/bolt/lib/Core/DebugNames.cpp
index 1a7792afbbd94..384e63695dfdc 100644
--- a/bolt/lib/Core/DebugNames.cpp
+++ b/bolt/lib/Core/DebugNames.cpp
@@ -345,8 +345,13 @@ void DWARF5AcceleratorTable::finalize() {
 std::optional<DWARF5AccelTable::UnitIndexAndEncoding>
 DWARF5AcceleratorTable::getIndexForEntry(
     const BOLTDWARF5AccelTableData &Value) const {
+  // The foreign TU list immediately follows the local TU list and they both
+  // use the same index, so that if there are N local TU entries, the index for
+  // the first foreign TU is N.
   if (Value.isTU())
-    return {{Value.getUnitID(), {dwarf::DW_IDX_type_unit, TUIndexForm}}};
+    return {{(Value.getSecondUnitID() ? (unsigned)LocalTUList.size() : 0) +
+                 Value.getUnitID(),
+             {dwarf::DW_IDX_type_unit, TUIndexForm}}};
   if (CUList.size() > 1)
     return {{Value.getUnitID(), {dwarf::DW_IDX_compile_unit, CUIndexForm}}};
   return std::nullopt;
diff --git a/bolt/test/X86/Inputs/dwarf5-debug-names-ftu-ltu-mix-helper.s b/bolt/test/X86/Inputs/dwarf5-debug-names-ftu-ltu-mix-helper.s
new file mode 100644
index 0000000000000..68eee45ec9833
--- /dev/null
+++ b/bolt/test/X86/Inputs/dwarf5-debug-names-ftu-ltu-mix-helper.s
@@ -0,0 +1,314 @@
+# struct AMono {
+#   int x;
+# };
+#
+# AMono globalMono;
+# clang++ -g2 -gdwarf-5 -gpubnames -S -fdebug-types-section -o
+
+	.text
+	.file	"helper.cpp"
+	.file	0 "/home" "helper.cpp" md5 0x3c0ac73d7b074961c6e8202230a76228
+	.section	.debug_info,"G",@progbits,6412503741467814911,comdat
+.Ltu_begin0:
+	.long	.Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
+.Ldebug_info_start0:
+	.short	5                               # DWARF version number
+	.byte	2                               # DWARF Unit Type
+	.byte	8                               # Address Size (in bytes)
+	.long	.debug_abbrev                   # Offset Into Abbrev. Section
+	.quad	6412503741467814911             # Type Signature
+	.long	35                              # Type DIE Offset
+	.byte	1                               # Abbrev [1] 0x18:0x20 DW_TAG_type_unit
+	.short	33                              # DW_AT_language
+	.long	.Lline_table_start0             # DW_AT_stmt_list
+	.long	.Lstr_offsets_base0             # DW_AT_str_offsets_base
+	.byte	2                               # Abbrev [2] 0x23:0x10 DW_TAG_structure_type
+	.byte	5                               # DW_AT_calling_convention
+	.byte	6                               # DW_AT_name
+	.byte	4                               # DW_AT_byte_size
+	.byte	0                               # DW_AT_decl_file
+	.byte	1                               # DW_AT_decl_line
+	.byte	3                               # Abbrev [3] 0x29:0x9 DW_TAG_member
+	.byte	4                               # DW_AT_name
+	.long	51                              # DW_AT_type
+	.byte	0                               # DW_AT_decl_file
+	.byte	2                               # DW_AT_decl_line
+	.byte	0                               # DW_AT_data_member_location
+	.byte	0                               # End Of Children Mark
+	.byte	4                               # Abbrev [4] 0x33:0x4 DW_TAG_base_type
+	.byte	5                               # DW_AT_name
+	.byte	5                               # DW_AT_encoding
+	.byte	4                               # DW_AT_byte_size
+	.byte	0                               # End Of Children Mark
+.Ldebug_info_end0:
+	.type	globalMono,@object              # @globalMono
+	.bss
+	.globl	globalMono
+	.p2align	2, 0x0
+globalMono:
+	.zero	4
+	.size	globalMono, 4
+
+	.section	.debug_abbrev,"",@progbits
+	.byte	1                               # Abbreviation Code
+	.byte	65                              # DW_TAG_type_unit
+	.byte	1                               # DW_CHILDREN_yes
+	.byte	19                              # DW_AT_language
+	.byte	5                               # DW_FORM_data2
+	.byte	16                              # DW_AT_stmt_list
+	.byte	23                              # DW_FORM_sec_offset
+	.byte	114                             # DW_AT_str_offsets_base
+	.byte	23                              # DW_FORM_sec_offset
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	2                               # Abbreviation Code
+	.byte	19                              # DW_TAG_structure_type
+	.byte	1                               # DW_CHILDREN_yes
+	.byte	54                              # DW_AT_calling_convention
+	.byte	11                              # DW_FORM_data1
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	11                              # DW_AT_byte_size
+	.byte	11                              # DW_FORM_data1
+	.byte	58                              # DW_AT_decl_file
+	.byte	11                              # DW_FORM_data1
+	.byte	59                              # DW_AT_decl_line
+	.byte	11                              # DW_FORM_data1
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	3                               # Abbreviation Code
+	.byte	13                              # DW_TAG_member
+	.byte	0                               # DW_CHILDREN_no
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	73                              # DW_AT_type
+	.byte	19                              # DW_FORM_ref4
+	.byte	58                              # DW_AT_decl_file
+	.byte	11                              # DW_FORM_data1
+	.byte	59                              # DW_AT_decl_line
+	.byte	11                              # DW_FORM_data1
+	.byte	56                              # DW_AT_data_member_location
+	.byte	11                              # DW_FORM_data1
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	4                               # Abbreviation Code
+	.byte	36                              # DW_TAG_base_type
+	.byte	0                               # DW_CHILDREN_no
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	62                              # DW_AT_encoding
+	.byte	11                              # DW_FORM_data1
+	.byte	11                              # DW_AT_byte_size
+	.byte	11                              # DW_FORM_data1
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	5                               # Abbreviation Code
+	.byte	17                              # DW_TAG_compile_unit
+	.byte	1                               # DW_CHILDREN_yes
+	.byte	37                              # DW_AT_producer
+	.byte	37                              # DW_FORM_strx1
+	.byte	19                              # DW_AT_language
+	.byte	5                               # DW_FORM_data2
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	114                             # DW_AT_str_offsets_base
+	.byte	23                              # DW_FORM_sec_offset
+	.byte	16                              # DW_AT_stmt_list
+	.byte	23                              # DW_FORM_sec_offset
+	.byte	27                              # DW_AT_comp_dir
+	.byte	37                              # DW_FORM_strx1
+	.byte	115                             # DW_AT_addr_base
+	.byte	23                              # DW_FORM_sec_offset
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	6                               # Abbreviation Code
+	.byte	52                              # DW_TAG_variable
+	.byte	0                               # DW_CHILDREN_no
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	73                              # DW_AT_type
+	.byte	19                              # DW_FORM_ref4
+	.byte	63                              # DW_AT_external
+	.byte	25                              # DW_FORM_flag_present
+	.byte	58                              # DW_AT_decl_file
+	.byte	11                              # DW_FORM_data1
+	.byte	59                              # DW_AT_decl_line
+	.byte	11                              # DW_FORM_data1
+	.byte	2                               # DW_AT_location
+	.byte	24                              # DW_FORM_exprloc
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	7                               # Abbreviation Code
+	.byte	19                              # DW_TAG_structure_type
+	.byte	0                               # DW_CHILDREN_no
+	.byte	60                              # DW_AT_declaration
+	.byte	25                              # DW_FORM_flag_present
+	.byte	105                             # DW_AT_signature
+	.byte	32                              # DW_FORM_ref_sig8
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	0                               # EOM(3)
+	.section	.debug_info,"",@progbits
+.Lcu_begin0:
+	.long	.Ldebug_info_end1-.Ldebug_info_start1 # Length of Unit
+.Ldebug_info_start1:
+	.short	5                               # DWARF version number
+	.byte	1                               # DWARF Unit Type
+	.byte	8                               # Address Size (in bytes)
+	.long	.debug_abbrev                   # Offset Into Abbrev. Section
+	.byte	5                               # Abbrev [5] 0xc:0x27 DW_TAG_compile_unit
+	.byte	0                               # DW_AT_producer
+	.short	33                              # DW_AT_language
+	.byte	1                               # DW_AT_name
+	.long	.Lstr_offsets_base0             # DW_AT_str_offsets_base
+	.long	.Lline_table_start0             # DW_AT_stmt_list
+	.byte	2                               # DW_AT_comp_dir
+	.long	.Laddr_table_base0              # DW_AT_addr_base
+	.byte	6                               # Abbrev [6] 0x1e:0xb DW_TAG_variable
+	.byte	3                               # DW_AT_name
+	.long	41                              # DW_AT_type
+                                        # DW_AT_external
+	.byte	0                               # DW_AT_decl_file
+	.byte	5                               # DW_AT_decl_line
+	.byte	2                               # DW_AT_location
+	.byte	161
+	.byte	0
+	.byte	7                               # Abbrev [7] 0x29:0x9 DW_TAG_structure_type
+                                        # DW_AT_declaration
+	.quad	6412503741467814911             # DW_AT_signature
+	.byte	0                               # End Of Children Mark
+.Ldebug_info_end1:
+	.section	.debug_str_offsets,"",@progbits
+	.long	32                              # Length of String Offsets Set
+	.short	5
+	.short	0
+.Lstr_offsets_base0:
+	.section	.debug_str,"MS",@progbits,1
+.Linfo_string0:
+	.asciz	"clang version 19.0.0git (git@github.com:llvm/llvm-project.git ced1fac8a32e35b63733bda27c7f5b9a2b635403)" # string offset=0
+.Linfo_string1:
+	.asciz	"helper.cpp"                    # string offset=104
+.Linfo_string2:
+	.asciz	"/home" # string offset=115
+.Linfo_string3:
+	.asciz	"globalMono"                    # string offset=153
+.Linfo_string4:
+	.asciz	"AMono"                         # string offset=164
+.Linfo_string5:
+	.asciz	"x"                             # string offset=170
+.Linfo_string6:
+	.asciz	"int"                           # string offset=172
+	.section	.debug_str_offsets,"",@progbits
+	.long	.Linfo_string0
+	.long	.Linfo_string1
+	.long	.Linfo_string2
+	.long	.Linfo_string3
+	.long	.Linfo_string5
+	.long	.Linfo_string6
+	.long	.Linfo_string4
+	.section	.debug_addr,"",@progbits
+	.long	.Ldebug_addr_end0-.Ldebug_addr_start0 # Length of contribution
+.Ldebug_addr_start0:
+	.short	5                               # DWARF version number
+	.byte	8                               # Address size
+	.byte	0                               # Segment selector size
+.Laddr_table_base0:
+	.quad	globalMono
+.Ldebug_addr_end0:
+	.section	.debug_names,"",@progbits
+	.long	.Lnames_end0-.Lnames_start0     # Header: unit length
+.Lnames_start0:
+	.short	5                               # Header: version
+	.short	0                               # Header: padding
+	.long	1                               # Header: compilation unit count
+	.long	1                               # Header: local type unit count
+	.long	0                               # Header: foreign type unit count
+	.long	3                               # Header: bucket count
+	.long	3                               # Header: name count
+	.long	.Lnames_abbrev_end0-.Lnames_abbrev_start0 # Header: abbreviation table size
+	.long	8                               # Header: augmentation string size
+	.ascii	"LLVM0700"                      # Header: augmentation string
+	.long	.Lcu_begin0                     # Compilation unit 0
+	.long	.Ltu_begin0                     # Type unit 0
+	.long	0                               # Bucket 0
+	.long	0                               # Bucket 1
+	.long	1                               # Bucket 2
+	.long	193495088                       # Hash in Bucket 2
+	.long	253228319                       # Hash in Bucket 2
+	.long	-857151761                      # Hash in Bucket 2
+	.long	.Linfo_string6                  # String in Bucket 2: int
+	.long	.Linfo_string4                  # String in Bucket 2: AMono
+	.long	.Linfo_string3                  # String in Bucket 2: globalMono
+	.long	.Lnames1-.Lnames_entries0       # Offset in Bucket 2
+	.long	.Lnames0-.Lnames_entries0       # Offset in Bucket 2
+	.long	.Lnames2-.Lnames_entries0       # Offset in Bucket 2
+.Lnames_abbrev_start0:
+	.byte	1                               # Abbrev code
+	.byte	36                              # DW_TAG_base_type
+	.byte	2                               # DW_IDX_type_unit
+	.byte	11                              # DW_FORM_data1
+	.byte	3                               # DW_IDX_die_offset
+	.byte	19                              # DW_FORM_ref4
+	.byte	4                               # DW_IDX_parent
+	.byte	25                              # DW_FORM_flag_present
+	.byte	0                               # End of abbrev
+	.byte	0                               # End of abbrev
+	.byte	2                               # Abbrev code
+	.byte	19                              # DW_TAG_structure_type
+	.byte	2                               # DW_IDX_type_unit
+	.byte	11                              # DW_FORM_data1
+	.byte	3                               # DW_IDX_die_offset
+	.byte	19                              # DW_FORM_ref4
+	.byte	4                               # DW_IDX_parent
+	.byte	25                              # DW_FORM_flag_present
+	.byte	0                               # End of abbrev
+	.byte	0                               # End of abbrev
+	.byte	3                               # Abbrev code
+	.byte	19                              # DW_TAG_structure_type
+	.byte	3                               # DW_IDX_die_offset
+	.byte	19                              # DW_FORM_ref4
+	.byte	4                               # DW_IDX_parent
+	.byte	25                              # DW_FORM_flag_present
+	.byte	0                               # End of abbrev
+	.byte	0                               # End of abbrev
+	.byte	4                               # Abbrev code
+	.byte	52                              # DW_TAG_variable
+	.byte	3                               # DW_IDX_die_offset
+	.byte	19                              # DW_FORM_ref4
+	.byte	4                               # DW_IDX_parent
+	.byte	25                              # DW_FORM_flag_present
+	.byte	0                               # End of abbrev
+	.byte	0                               # End of abbrev
+	.byte	0                               # End of abbrev list
+.Lnames_abbrev_end0:
+.Lnames_entries0:
+.Lnames1:
+.L1:
+	.byte	1                               # Abbreviation code
+	.byte	0                               # DW_IDX_type_unit
+	.long	51                              # DW_IDX_die_offset
+	.byte	0                               # DW_IDX_parent
+                                        # End of list: int
+.Lnames0:
+.L2:
+	.byte	2                               # Abbreviation code
+	.byte	0                               # DW_IDX_type_unit
+	.long	35                              # DW_IDX_die_offset
+.L3:                                    # DW_IDX_parent
+	.byte	3                               # Abbreviation code
+	.long	41                              # DW_IDX_die_offset
+	.byte	0                               # DW_IDX_parent
+                                        # End of list: AMono
+.Lnames2:
+.L0:
+	.byte	4                               # Abbreviation code
+	.long	30                              # DW_IDX_die_offset
+	.byte	0                               # DW_IDX_parent
+                                        # End of list: globalMono
+	.p2align	2, 0x0
+.Lnames_end0:
+	.ident	"clang version 19.0.0git (git@github.com:llvm/llvm-project.git ced1fac8a32e35b63733bda27c7f5b9a2b635403)"
+	.section	".note.GNU-stack","",@progbits
+	.addrsig
+	.section	.debug_line,"",@progbits
+.Lline_table_start0:
diff --git a/bolt/test/X86/Inputs/dwarf5-debug-names-ftu-ltu-mix-helper1.s b/bolt/test/X86/Inputs/dwarf5-debug-names-ftu-ltu-mix-helper1.s
new file mode 100644
index 0000000000000..8b28c19dc87de
--- /dev/null
+++ b/bolt/test/X86/Inputs/dwarf5-debug-names-ftu-ltu-mix-helper1.s
@@ -0,0 +1,315 @@
+# struct BMono {
+#   int x;
+# };
+#
+# BMono globalMono1;
+# clang++ -g2 -gdwarf-5 -gpubnames -S -fdebug-types-section -o
+
+
+	.text
+	.file	"helper1.cpp"
+	.file	0 "/home" "helper1.cpp" md5 0x1fdaf911330b73495aed962bc02cfb3a
+	.section	.debug_info,"G",@progbits,5884764266900841573,comdat
+.Ltu_begin0:
+	.long	.Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
+.Ldebug_info_start0:
+	.short	5                               # DWARF version number
+	.byte	2                               # DWARF Unit Type
+	.byte	8                               # Address Size (in bytes)
+	.long	.debug_abbrev                   # Offset Into Abbrev. Section
+	.quad	5884764266900841573             # Type Signature
+	.long	35                              # Type DIE Offset
+	.byte	1                               # Abbrev [1] 0x18:0x20 DW_TAG_type_unit
+	.short	33                              # DW_AT_language
+	.long	.Lline_table_start0             # DW_AT_stmt_list
+	.long	.Lstr_offsets_base0             # DW_AT_str_offsets_base
+	.byte	2                               # Abbrev [2] 0x23:0x10 DW_TAG_structure_type
+	.byte	5                               # DW_AT_calling_convention
+	.byte	6                               # DW_AT_name
+	.byte	4                               # DW_AT_byte_size
+	.byte	0                               # DW_AT_decl_file
+	.byte	1                               # DW_AT_decl_line
+	.byte	3                               # Abbrev [3] 0x29:0x9 DW_TAG_member
+	.byte	4                               # DW_AT_name
+	.long	51                              # DW_AT_type
+	.byte	0                               # DW_AT_decl_file
+	.byte	2                               # DW_AT_decl_line
+	.byte	0                               # DW_AT_data_member_location
+	.byte	0                               # End Of Children Mark
+	.byte	4                               # Abbrev [4] 0x33:0x4 DW_TAG_base_type
+	.byte	5                               # DW_AT_name
+	.byte	5                               # DW_AT_encoding
+	.byte	4                               # DW_AT_byte_size
+	.byte	0                               # End Of Children Mark
+.Ldebug_info_end0:
+	.type	globalMono1,@object             # @globalMono1
+	.bss
+	.globl	globalMono1
+	.p2align	2, 0x0
+globalMono1:
+	.zero	4
+	.size	globalMono1, 4
+
+	.section	.debug_abbrev,"",@progbits
+	.byte	1                               # Abbreviation Code
+	.byte	65                              # DW_TAG_type_unit
+	.byte	1                               # DW_CHILDREN_yes
+	.byte	19                              # DW_AT_language
+	.byte	5                               # DW_FORM_data2
+	.byte	16                              # DW_AT_stmt_list
+	.byte	23                              # DW_FORM_sec_offset
+	.byte	114                             # DW_AT_str_offsets_base
+	.byte	23                              # DW_FORM_sec_offset
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	2                               # Abbreviation Code
+	.byte	19                              # DW_TAG_structure_type
+	.byte	1                               # DW_CHILDREN_yes
+	.byte	54                              # DW_AT_calling_convention
+	.byte	11                              # DW_FORM_data1
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	11                              # DW_AT_byte_size
+	.byte	11                              # DW_FORM_data1
+	.byte	58                              # DW_AT_decl_file
+	.byte	11                              # DW_FORM_data1
+	.byte	59                              # DW_AT_decl_line
+	.byte	11                              # DW_FORM_data1
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	3                               # Abbreviation Code
+	.byte	13                              # DW_TAG_member
+	.byte	0                               # DW_CHILDREN_no
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	73                              # DW_AT_type
+	.byte	19                              # DW_FORM_ref4
+	.byte	58                              # DW_AT_decl_file
+	.byte	11                              # DW_FORM_data1
+	.byte	59                              # DW_AT_decl_line
+	.byte	11                              # DW_FORM_data1
+	.byte	56                              # DW_AT_data_member_location
+	.byte	11                              # DW_FORM_data1
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	4                               # Abbreviation Code
+	.byte	36                              # DW_TAG_base_type
+	.byte	0                               # DW_CHILDREN_no
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	62                              # DW_AT_encoding
+	.byte	11                              # DW_FORM_data1
+	.byte	11                              # DW_AT_byte_size
+	.byte	11                              # DW_FORM_data1
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	5                               # Abbreviation Code
+	.byte	17                              # DW_TAG_compile_unit
+	.byte	1                               # DW_CHILDREN_yes
+	.byte	37                              # DW_AT_producer
+	.byte	37                              # DW_FORM_strx1
+	.byte	19                              # DW_AT_language
+	.byte	5                               # DW_FORM_data2
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	114                             # DW_AT_str_offsets_base
+	.byte	23                              # DW_FORM_sec_offset
+	.byte	16                              # DW_AT_stmt_list
+	.byte	23                              # DW_FORM_sec_offset
+	.byte	27                              # DW_AT_comp_dir
+	.byte	37                              # DW_FORM_strx1
+	.byte	115                             # DW_AT_addr_base
+	.byte	23                              # DW_FORM_sec_offset
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	6                               # Abbreviation Code
+	.byte	52                              # DW_TAG_variable
+	.byte	0                               # DW_CHILDREN_no
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	73                              # DW_AT_type
+	.byte	19                              # DW_FORM_ref4
+	.byte	63                              # DW_AT_external
+	.byte	25                              # DW_FORM_flag_present
+	.byte	58                              # DW_AT_decl_file
+	.byte	11                              # DW_FORM_data1
+	.byte	59                              # DW_AT_decl_line
+	.byte	11                              # DW_FORM_data1
+	.byte	2                               # DW_AT_location
+	.byte	24                              # DW_FORM_exprloc
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	7                               # Abbreviation Code
+	.byte	19                              # DW_TAG_structure_type
+	.byte	0                               # DW_CHILDREN_no
+	.byte	60                              # DW_AT_declaration
+	.byte	25                              # DW_FORM_flag_present
+	.byte	105                             # DW_AT_signature
+	.byte	32                              # DW_FORM_ref_sig8
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	0                               # EOM(3)
+	.section	.debug_info,"",@progbits
+.Lcu_begin0:
+	.long	.Ldebug_info_end1-.Ldebug_info_start1 # Length of Unit
+.Ldebug_info_start1:
+	.short	5                               # DWARF version number
+	.byte	1                               # DWARF Unit Type
+	.byte	8                               # Address Size (in bytes)
+	.long	.debug_abbrev                   # Offset Into Abbrev. Section
+	.byte	5                               # Abbrev [5] 0xc:0x27 DW_TAG_compile_unit
+	.byte	0                               # DW_AT_producer
+	.short	33                              # DW_AT_language
+	.byte	1                               # DW_AT_name
+	.long	.Lstr_offsets_base0             # DW_AT_str_offsets_base
+	.long	.Lline_table_start0             # DW_AT_stmt_list
+	.byte	2                               # DW_AT_comp_dir
+	.long	.Laddr_table_base0              # DW_AT_addr_base
+	.byte	6                               # Abbrev [6] 0x1e:0xb DW_TAG_variable
+	.byte	3                               # DW_AT_name
+	.long	41                              # DW_AT_type
+                                        # DW_AT_external
+	.byte	0                               # DW_AT_decl_file
+	.byte	5                               # DW_AT_decl_line
+	.byte	2                               # DW_AT_location
+	.byte	161
+	.byte	0
+	.byte	7                               # Abbrev [7] 0x29:0x9 DW_TAG_structure_type
+                                        # DW_AT_declaration
+	.quad	5884764266900841573             # DW_AT_signature
+	.byte	0                               # End Of Children Mark
+.Ldebug_info_end1:
+	.section	.debug_str_offsets,"",@progbits
+	.long	32                              # Length of String Offsets Set
+	.short	5
+	.short	0
+.Lstr_offsets_base0:
+	.section	.debug_str,"MS",@progbits,1
+.Linfo_string0:
+	.asciz	"clang version 19.0.0git (git@github.com:llvm/llvm-project.git ced1fac8a32e35b63733bda27c7f5b9a2b635403)" # string offset=0
+.Linfo_string1:
+	.asciz	"helper1.cpp"                   # string offset=104
+.Linfo_string2:
+	.asciz	"/home" # string offset=116
+.Linfo_string3:
+	.asciz	"globalMono1"                   # string offset=154
+.Linfo_string4:
+	.asciz	"BMono"                         # string offset=166
+.Linfo_string5:
+	.asciz	"x"                             # string offset=172
+.Linfo_string6:
+	.asciz	"int"                           # string offset=174
+	.section	.debug_str_offsets,"",@progbits
+	.long	.Linfo_string0
+	.long	.Linfo_string1
+	.long	.Linfo_string2
+	.long	.Linfo_string3
+	.long	.Linfo_string5
+	.long	.Linfo_string6
+	.long	.Linfo_string4
+	.section	.debug_addr,"",@progbits
+	.long	.Ldebug_addr_end0-.Ldebug_addr_start0 # Length of contribution
+.Ldebug_addr_start0:
+	.short	5                               # DWARF version number
+	.byte	8                               # Address size
+	.byte	0                               # Segment selector size
+.Laddr_table_base0:
+	.quad	globalMono1
+.Ldebug_addr_end0:
+	.section	.debug_names,"",@progbits
+	.long	.Lnames_end0-.Lnames_start0     # Header: unit length
+.Lnames_start0:
+	.short	5                               # Header: version
+	.short	0                               # Header: padding
+	.long	1                               # Header: compilation unit count
+	.long	1                               # Header: local type unit count
+	.long	0                               # Header: foreign type unit count
+	.long	3                               # Header: bucket count
+	.long	3                               # Header: name count
+	.long	.Lnames_abbrev_end0-.Lnames_abbrev_start0 # Header: abbreviation table size
+	.long	8                               # Header: augmentation string size
+	.ascii	"LLVM0700"                      # Header: augmentation string
+	.long	.Lcu_begin0                     # Compilation unit 0
+	.long	.Ltu_begin0                     # Type unit 0
+	.long	0                               # Bucket 0
+	.long	0                               # Bucket 1
+	.long	1                               # Bucket 2
+	.long	193495088                       # Hash in Bucket 2
+	.long	254414240                       # Hash in Bucket 2
+	.long	1778763008                      # Hash in Bucket 2
+	.long	.Linfo_string6                  # String in Bucket 2: int
+	.long	.Linfo_string4                  # String in Bucket 2: BMono
+	.long	.Linfo_string3                  # String in Bucket 2: globalMono1
+	.long	.Lnames1-.Lnames_entries0       # Offset in Bucket 2
+	.long	.Lnames0-.Lnames_entries0       # Offset in Bucket 2
+	.long	.Lnames2-.Lnames_entries0       # Offset in Bucket 2
+.Lnames_abbrev_start0:
+	.byte	1                               # Abbrev code
+	.byte	36                              # DW_TAG_base_type
+	.byte	2                               # DW_IDX_type_unit
+	.byte	11                              # DW_FORM_data1
+	.byte	3                               # DW_IDX_die_offset
+	.byte	19                              # DW_FORM_ref4
+	.byte	4                               # DW_IDX_parent
+	.byte	25                              # DW_FORM_flag_present
+	.byte	0                               # End of abbrev
+	.byte	0                               # End of abbrev
+	.byte	2                               # Abbrev code
+	.byte	19                              # DW_TAG_structure_type
+	.byte	2                               # DW_IDX_type_unit
+	.byte	11                              # DW_FORM_data1
+	.byte	3                               # DW_IDX_die_offset
+	.byte	19                              # DW_FORM_ref4
+	.byte	4                               # DW_IDX_parent
+	.byte	25                              # DW_FORM_flag_present
+	.byte	0                               # End of abbrev
+	.byte	0                               # End of abbrev
+	.byte	3                               # Abbrev code
+	.byte	19                              # DW_TAG_structure_type
+	.byte	3                               # DW_IDX_die_offset
+	.byte	19                              # DW_FORM_ref4
+	.byte	4                               # DW_IDX_parent
+	.byte	25                              # DW_FORM_flag_present
+	.byte	0                               # End of abbrev
+	.byte	0                               # End of abbrev
+	.byte	4                               # Abbrev code
+	.byte	52                              # DW_TAG_variable
+	.byte	3                               # DW_IDX_die_offset
+	.byte	19                              # DW_FORM_ref4
+	.byte	4                               # DW_IDX_parent
+	.byte	25                              # DW_FORM_flag_present
+	.byte	0                               # End of abbrev
+	.byte	0                               # End of abbrev
+	.byte	0                               # End of abbrev list
+.Lnames_abbrev_end0:
+.Lnames_entries0:
+.Lnames1:
+.L1:
+	.byte	1                               # Abbreviation code
+	.byte	0                               # DW_IDX_type_unit
+	.long	51                              # DW_IDX_die_offset
+	.byte	0                               # DW_IDX_parent
+                                        # End of list: int
+.Lnames0:
+.L2:
+	.byte	2                               # Abbreviation code
+	.byte	0                               # DW_IDX_type_unit
+	.long	35                              # DW_IDX_die_offset
+.L3:                                    # DW_IDX_parent
+	.byte	3                               # Abbreviation code
+	.long	41                              # DW_IDX_die_offset
+	.byte	0                               # DW_IDX_parent
+                                        # End of list: BMono
+.Lnames2:
+.L0:
+	.byte	4                               # Abbreviation code
+	.long	30                              # DW_IDX_die_offset
+	.byte	0                               # DW_IDX_parent
+                                        # End of list: globalMono1
+	.p2align	2, 0x0
+.Lnames_end0:
+	.ident	"clang version 19.0.0git (git@github.com:llvm/llvm-project.git ced1fac8a32e35b63733bda27c7f5b9a2b635403)"
+	.section	".note.GNU-stack","",@progbits
+	.addrsig
+	.section	.debug_line,"",@progbits
+.Lline_table_start0:
diff --git a/bolt/test/X86/Inputs/dwarf5-df-debug-names-ftu-ltu-mix-main.s b/bolt/test/X86/Inputs/dwarf5-df-debug-names-ftu-ltu-mix-main.s
new file mode 100644
index 0000000000000..69f6c5a5376a0
--- /dev/null
+++ b/bolt/test/X86/Inputs/dwarf5-df-debug-names-ftu-ltu-mix-main.s
@@ -0,0 +1,505 @@
+# struct ASplit {
+#   int x;
+# };
+#
+# ASplit globalSplit;
+# int main() {
+#   return 0;
+# }
+# clang++ -g2 -gdwarf-5 -gpubnames -S -fdebug-types-section -gsplit-dwarf -fdebug-compilation-dir='.'
+
+	.text
+	.file	"main.cpp"
+	.file	0 "." "main.cpp" md5 0xbb74a3c2960dafa324547ebbd87d13ea
+	.section	.debug_info.dwo,"e",@progbits
+	.long	.Ldebug_info_dwo_end0-.Ldebug_info_dwo_start0 # Length of Unit
+.Ldebug_info_dwo_start0:
+	.short	5                               # DWARF version number
+	.byte	6                               # DWARF Unit Type
+	.byte	8                               # Address Size (in bytes)
+	.long	0                               # Offset Into Abbrev. Section
+	.quad	-8602855756067469281            # Type Signature
+	.long	33                              # Type DIE Offset
+	.byte	1                               # Abbrev [1] 0x18:0x1e DW_TAG_type_unit
+	.short	33                              # DW_AT_language
+	.byte	1                               # DW_AT_comp_dir
+	.byte	2                               # DW_AT_dwo_name
+	.long	0                               # DW_AT_stmt_list
+	.byte	2                               # Abbrev [2] 0x21:0x10 DW_TAG_structure_type
+	.byte	5                               # DW_AT_calling_convention
+	.byte	5                               # DW_AT_name
+	.byte	4                               # DW_AT_byte_size
+	.byte	0                               # DW_AT_decl_file
+	.byte	1                               # DW_AT_decl_line
+	.byte	3                               # Abbrev [3] 0x27:0x9 DW_TAG_member
+	.byte	3                               # DW_AT_name
+	.long	49                              # DW_AT_type
+	.byte	0                               # DW_AT_decl_file
+	.byte	2                               # DW_AT_decl_line
+	.byte	0                               # DW_AT_data_member_location
+	.byte	0                               # End Of Children Mark
+	.byte	4                               # Abbrev [4] 0x31:0x4 DW_TAG_base_type
+	.byte	4                               # DW_AT_name
+	.byte	5                               # DW_AT_encoding
+	.byte	4                               # DW_AT_byte_size
+	.byte	0                               # End Of Children Mark
+.Ldebug_info_dwo_end0:
+	.text
+	.globl	main                            # -- Begin function main
+	.p2align	4, 0x90
+	.type	main,@function
+main:                                   # @main
+.Lfunc_begin0:
+	.loc	0 6 0                           # main.cpp:6:0
+	.cfi_startproc
+# %bb.0:                                # %entry
+	pushq	%rbp
+	.cfi_def_cfa_offset 16
+	.cfi_offset %rbp, -16
+	movq	%rsp, %rbp
+	.cfi_def_cfa_register %rbp
+	movl	$0, -4(%rbp)
+.Ltmp0:
+	.loc	0 7 3 prologue_end              # main.cpp:7:3
+	xorl	%eax, %eax
+	.loc	0 7 3 epilogue_begin is_stmt 0  # main.cpp:7:3
+	popq	%rbp
+	.cfi_def_cfa %rsp, 8
+	retq
+.Ltmp1:
+.Lfunc_end0:
+	.size	main, .Lfunc_end0-main
+	.cfi_endproc
+                                        # -- End function
+	.type	globalSplit,@object             # @globalSplit
+	.bss
+	.globl	globalSplit
+	.p2align	2, 0x0
+globalSplit:
+	.zero	4
+	.size	globalSplit, 4
+
+	.section	.debug_abbrev,"",@progbits
+	.byte	1                               # Abbreviation Code
+	.byte	74                              # DW_TAG_skeleton_unit
+	.byte	0                               # DW_CHILDREN_no
+	.byte	16                              # DW_AT_stmt_list
+	.byte	23                              # DW_FORM_sec_offset
+	.byte	114                             # DW_AT_str_offsets_base
+	.byte	23                              # DW_FORM_sec_offset
+	.byte	27                              # DW_AT_comp_dir
+	.byte	37                              # DW_FORM_strx1
+	.byte	118                             # DW_AT_dwo_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	17                              # DW_AT_low_pc
+	.byte	27                              # DW_FORM_addrx
+	.byte	18                              # DW_AT_high_pc
+	.byte	6                               # DW_FORM_data4
+	.byte	115                             # DW_AT_addr_base
+	.byte	23                              # DW_FORM_sec_offset
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	0                               # EOM(3)
+	.section	.debug_info,"",@progbits
+.Lcu_begin0:
+	.long	.Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
+.Ldebug_info_start0:
+	.short	5                               # DWARF version number
+	.byte	4                               # DWARF Unit Type
+	.byte	8                               # Address Size (in bytes)
+	.long	.debug_abbrev                   # Offset Into Abbrev. Section
+	.quad	5806847994123082226
+	.byte	1                               # Abbrev [1] 0x14:0x14 DW_TAG_skeleton_unit
+	.long	.Lline_table_start0             # DW_AT_stmt_list
+	.long	.Lstr_offsets_base0             # DW_AT_str_offsets_base
+	.byte	0                               # DW_AT_comp_dir
+	.byte	1                               # DW_AT_dwo_name
+	.byte	1                               # DW_AT_low_pc
+	.long	.Lfunc_end0-.Lfunc_begin0       # DW_AT_high_pc
+	.long	.Laddr_table_base0              # DW_AT_addr_base
+.Ldebug_info_end0:
+	.section	.debug_str_offsets,"",@progbits
+	.long	12                              # Length of String Offsets Set
+	.short	5
+	.short	0
+.Lstr_offsets_base0:
+	.section	.debug_str,"MS",@progbits,1
+.Lskel_string0:
+	.asciz	"."                             # string offset=0
+.Lskel_string1:
+	.asciz	"ASplit"                        # string offset=2
+.Lskel_string2:
+	.asciz	"int"                           # string offset=9
+.Lskel_string3:
+	.asciz	"globalSplit"                   # string offset=13
+.Lskel_string4:
+	.asciz	"main"                          # string offset=25
+.Lskel_string5:
+	.asciz	"main.dwo"                      # string offset=30
+	.section	.debug_str_offsets,"",@progbits
+	.long	.Lskel_string0
+	.long	.Lskel_string5
+	.section	.debug_str_offsets.dwo,"e",@progbits
+	.long	40                              # Length of String Offsets Set
+	.short	5
+	.short	0
+	.section	.debug_str.dwo,"eMS",@progbits,1
+.Linfo_string0:
+	.asciz	"globalSplit"                   # string offset=0
+.Linfo_string1:
+	.asciz	"."                             # string offset=12
+.Linfo_string2:
+	.asciz	"main.dwo"                      # string offset=14
+.Linfo_string3:
+	.asciz	"x"                             # string offset=23
+.Linfo_string4:
+	.asciz	"int"                           # string offset=25
+.Linfo_string5:
+	.asciz	"ASplit"                        # string offset=29
+.Linfo_string6:
+	.asciz	"main"                          # string offset=36
+.Linfo_string7:
+	.asciz	"clang version 19.0.0git (git@github.com:llvm/llvm-project.git ced1fac8a32e35b63733bda27c7f5b9a2b635403)" # string offset=41
+.Linfo_string8:
+	.asciz	"main.cpp"                      # string offset=145
+	.section	.debug_str_offsets.dwo,"e",@progbits
+	.long	0
+	.long	12
+	.long	14
+	.long	23
+	.long	25
+	.long	29
+	.long	36
+	.long	41
+	.long	145
+	.section	.debug_info.dwo,"e",@progbits
+	.long	.Ldebug_info_dwo_end1-.Ldebug_info_dwo_start1 # Length of Unit
+.Ldebug_info_dwo_start1:
+	.short	5                               # DWARF version number
+	.byte	5                               # DWARF Unit Type
+	.byte	8                               # Address Size (in bytes)
+	.long	0                               # Offset Into Abbrev. Section
+	.quad	5806847994123082226
+	.byte	5                               # Abbrev [5] 0x14:0x2e DW_TAG_compile_unit
+	.byte	7                               # DW_AT_producer
+	.short	33                              # DW_AT_language
+	.byte	8                               # DW_AT_name
+	.byte	2                               # DW_AT_dwo_name
+	.byte	6                               # Abbrev [6] 0x1a:0xb DW_TAG_variable
+	.byte	0                               # DW_AT_name
+	.long	37                              # DW_AT_type
+                                        # DW_AT_external
+	.byte	0                               # DW_AT_decl_file
+	.byte	5                               # DW_AT_decl_line
+	.byte	2                               # DW_AT_location
+	.byte	161
+	.byte	0
+	.byte	7                               # Abbrev [7] 0x25:0x9 DW_TAG_structure_type
+                                        # DW_AT_declaration
+	.quad	-8602855756067469281            # DW_AT_signature
+	.byte	8                               # Abbrev [8] 0x2e:0xf DW_TAG_subprogram
+	.byte	1                               # DW_AT_low_pc
+	.long	.Lfunc_end0-.Lfunc_begin0       # DW_AT_high_pc
+	.byte	1                               # DW_AT_frame_base
+	.byte	86
+	.byte	6                               # DW_AT_name
+	.byte	0                               # DW_AT_decl_file
+	.byte	6                               # DW_AT_decl_line
+	.long	61                              # DW_AT_type
+                                        # DW_AT_external
+	.byte	4                               # Abbrev [4] 0x3d:0x4 DW_TAG_base_type
+	.byte	4                               # DW_AT_name
+	.byte	5                               # DW_AT_encoding
+	.byte	4                               # DW_AT_byte_size
+	.byte	0                               # End Of Children Mark
+.Ldebug_info_dwo_end1:
+	.section	.debug_abbrev.dwo,"e",@progbits
+	.byte	1                               # Abbreviation Code
+	.byte	65                              # DW_TAG_type_unit
+	.byte	1                               # DW_CHILDREN_yes
+	.byte	19                              # DW_AT_language
+	.byte	5                               # DW_FORM_data2
+	.byte	27                              # DW_AT_comp_dir
+	.byte	37                              # DW_FORM_strx1
+	.byte	118                             # DW_AT_dwo_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	16                              # DW_AT_stmt_list
+	.byte	23                              # DW_FORM_sec_offset
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	2                               # Abbreviation Code
+	.byte	19                              # DW_TAG_structure_type
+	.byte	1                               # DW_CHILDREN_yes
+	.byte	54                              # DW_AT_calling_convention
+	.byte	11                              # DW_FORM_data1
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	11                              # DW_AT_byte_size
+	.byte	11                              # DW_FORM_data1
+	.byte	58                              # DW_AT_decl_file
+	.byte	11                              # DW_FORM_data1
+	.byte	59                              # DW_AT_decl_line
+	.byte	11                              # DW_FORM_data1
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	3                               # Abbreviation Code
+	.byte	13                              # DW_TAG_member
+	.byte	0                               # DW_CHILDREN_no
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	73                              # DW_AT_type
+	.byte	19                              # DW_FORM_ref4
+	.byte	58                              # DW_AT_decl_file
+	.byte	11                              # DW_FORM_data1
+	.byte	59                              # DW_AT_decl_line
+	.byte	11                              # DW_FORM_data1
+	.byte	56                              # DW_AT_data_member_location
+	.byte	11                              # DW_FORM_data1
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	4                               # Abbreviation Code
+	.byte	36                              # DW_TAG_base_type
+	.byte	0                               # DW_CHILDREN_no
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	62                              # DW_AT_encoding
+	.byte	11                              # DW_FORM_data1
+	.byte	11                              # DW_AT_byte_size
+	.byte	11                              # DW_FORM_data1
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	5                               # Abbreviation Code
+	.byte	17                              # DW_TAG_compile_unit
+	.byte	1                               # DW_CHILDREN_yes
+	.byte	37                              # DW_AT_producer
+	.byte	37                              # DW_FORM_strx1
+	.byte	19                              # DW_AT_language
+	.byte	5                               # DW_FORM_data2
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	118                             # DW_AT_dwo_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	6                               # Abbreviation Code
+	.byte	52                              # DW_TAG_variable
+	.byte	0                               # DW_CHILDREN_no
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	73                              # DW_AT_type
+	.byte	19                              # DW_FORM_ref4
+	.byte	63                              # DW_AT_external
+	.byte	25                              # DW_FORM_flag_present
+	.byte	58                              # DW_AT_decl_file
+	.byte	11                              # DW_FORM_data1
+	.byte	59                              # DW_AT_decl_line
+	.byte	11                              # DW_FORM_data1
+	.byte	2                               # DW_AT_location
+	.byte	24                              # DW_FORM_exprloc
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	7                               # Abbreviation Code
+	.byte	19                              # DW_TAG_structure_type
+	.byte	0                               # DW_CHILDREN_no
+	.byte	60                              # DW_AT_declaration
+	.byte	25                              # DW_FORM_flag_present
+	.byte	105                             # DW_AT_signature
+	.byte	32                              # DW_FORM_ref_sig8
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	8                               # Abbreviation Code
+	.byte	46                              # DW_TAG_subprogram
+	.byte	0                               # DW_CHILDREN_no
+	.byte	17                              # DW_AT_low_pc
+	.byte	27                              # DW_FORM_addrx
+	.byte	18                              # DW_AT_high_pc
+	.byte	6                               # DW_FORM_data4
+	.byte	64                              # DW_AT_frame_base
+	.byte	24                              # DW_FORM_exprloc
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	58                              # DW_AT_decl_file
+	.byte	11                              # DW_FORM_data1
+	.byte	59                              # DW_AT_decl_line
+	.byte	11                              # DW_FORM_data1
+	.byte	73                              # DW_AT_type
+	.byte	19                              # DW_FORM_ref4
+	.byte	63                              # DW_AT_external
+	.byte	25                              # DW_FORM_flag_present
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	0                               # EOM(3)
+	.section	.debug_line.dwo,"e",@progbits
+.Ltmp2:
+	.long	.Ldebug_line_end0-.Ldebug_line_start0 # unit length
+.Ldebug_line_start0:
+	.short	5
+	.byte	8
+	.byte	0
+	.long	.Lprologue_end0-.Lprologue_start0
+.Lprologue_start0:
+	.byte	1
+	.byte	1
+	.byte	1
+	.byte	-5
+	.byte	14
+	.byte	1
+	.byte	1
+	.byte	1
+	.byte	8
+	.byte	1
+	.byte	46
+	.byte	0
+	.byte	3
+	.byte	1
+	.byte	8
+	.byte	2
+	.byte	15
+	.byte	5
+	.byte	30
+	.byte	1
+	.ascii	"main.cpp"
+	.byte	0
+	.byte	0
+	.byte	0xbb, 0x74, 0xa3, 0xc2
+	.byte	0x96, 0x0d, 0xaf, 0xa3
+	.byte	0x24, 0x54, 0x7e, 0xbb
+	.byte	0xd8, 0x7d, 0x13, 0xea
+.Lprologue_end0:
+.Ldebug_line_end0:
+	.section	.debug_addr,"",@progbits
+	.long	.Ldebug_addr_end0-.Ldebug_addr_start0 # Length of contribution
+.Ldebug_addr_start0:
+	.short	5                               # DWARF version number
+	.byte	8                               # Address size
+	.byte	0                               # Segment selector size
+.Laddr_table_base0:
+	.quad	globalSplit
+	.quad	.Lfunc_begin0
+.Ldebug_addr_end0:
+	.section	.debug_names,"",@progbits
+	.long	.Lnames_end0-.Lnames_start0     # Header: unit length
+.Lnames_start0:
+	.short	5                               # Header: version
+	.short	0                               # Header: padding
+	.long	1                               # Header: compilation unit count
+	.long	0                               # Header: local type unit count
+	.long	1                               # Header: foreign type unit count
+	.long	4                               # Header: bucket count
+	.long	4                               # Header: name count
+	.long	.Lnames_abbrev_end0-.Lnames_abbrev_start0 # Header: abbreviation table size
+	.long	8                               # Header: augmentation string size
+	.ascii	"LLVM0700"                      # Header: augmentation string
+	.long	.Lcu_begin0                     # Compilation unit 0
+	.quad	-8602855756067469281            # Type unit 0
+	.long	1                               # Bucket 0
+	.long	0                               # Bucket 1
+	.long	2                               # Bucket 2
+	.long	0                               # Bucket 3
+	.long	193495088                       # Hash in Bucket 0
+	.long	1785912162                      # Hash in Bucket 2
+	.long	2090499946                      # Hash in Bucket 2
+	.long	-226250862                      # Hash in Bucket 2
+	.long	.Lskel_string2                  # String in Bucket 0: int
+	.long	.Lskel_string3                  # String in Bucket 2: globalSplit
+	.long	.Lskel_string4                  # String in Bucket 2: main
+	.long	.Lskel_string1                  # String in Bucket 2: ASplit
+	.long	.Lnames1-.Lnames_entries0       # Offset in Bucket 0
+	.long	.Lnames2-.Lnames_entries0       # Offset in Bucket 2
+	.long	.Lnames3-.Lnames_entries0       # Offset in Bucket 2
+	.long	.Lnames0-.Lnames_entries0       # Offset in Bucket 2
+.Lnames_abbrev_start0:
+	.byte	1                               # Abbrev code
+	.byte	36                              # DW_TAG_base_type
+	.byte	2                               # DW_IDX_type_unit
+	.byte	11                              # DW_FORM_data1
+	.byte	3                               # DW_IDX_die_offset
+	.byte	19                              # DW_FORM_ref4
+	.byte	4                               # DW_IDX_parent
+	.byte	25                              # DW_FORM_flag_present
+	.byte	0                               # End of abbrev
+	.byte	0                               # End of abbrev
+	.byte	2                               # Abbrev code
+	.byte	36                              # DW_TAG_base_type
+	.byte	3                               # DW_IDX_die_offset
+	.byte	19                              # DW_FORM_ref4
+	.byte	4                               # DW_IDX_parent
+	.byte	25                              # DW_FORM_flag_present
+	.byte	0                               # End of abbrev
+	.byte	0                               # End of abbrev
+	.byte	3                               # Abbrev code
+	.byte	52                              # DW_TAG_variable
+	.byte	3                               # DW_IDX_die_offset
+	.byte	19                              # DW_FORM_ref4
+	.byte	4                               # DW_IDX_parent
+	.byte	25                              # DW_FORM_flag_present
+	.byte	0                               # End of abbrev
+	.byte	0                               # End of abbrev
+	.byte	4                               # Abbrev code
+	.byte	46                              # DW_TAG_subprogram
+	.byte	3                               # DW_IDX_die_offset
+	.byte	19                              # DW_FORM_ref4
+	.byte	4                               # DW_IDX_parent
+	.byte	25                              # DW_FORM_flag_present
+	.byte	0                               # End of abbrev
+	.byte	0                               # End of abbrev
+	.byte	5                               # Abbrev code
+	.byte	19                              # DW_TAG_structure_type
+	.byte	2                               # DW_IDX_type_unit
+	.byte	11                              # DW_FORM_data1
+	.byte	3                               # DW_IDX_die_offset
+	.byte	19                              # DW_FORM_ref4
+	.byte	4                               # DW_IDX_parent
+	.byte	25                              # DW_FORM_flag_present
+	.byte	0                               # End of abbrev
+	.byte	0                               # End of abbrev
+	.byte	6                               # Abbrev code
+	.byte	19                              # DW_TAG_structure_type
+	.byte	3                               # DW_IDX_die_offset
+	.byte	19                              # DW_FORM_ref4
+	.byte	4                               # DW_IDX_parent
+	.byte	25                              # DW_FORM_flag_present
+	.byte	0                               # End of abbrev
+	.byte	0                               # End of abbrev
+	.byte	0                               # End of abbrev list
+.Lnames_abbrev_end0:
+.Lnames_entries0:
+.Lnames1:
+.L5:
+	.byte	1                               # Abbreviation code
+	.byte	0                               # DW_IDX_type_unit
+	.long	49                              # DW_IDX_die_offset
+.L1:                                    # DW_IDX_parent
+	.byte	2                               # Abbreviation code
+	.long	61                              # DW_IDX_die_offset
+	.byte	0                               # DW_IDX_parent
+                                        # End of list: int
+.Lnames2:
+.L0:
+	.byte	3                               # Abbreviation code
+	.long	26                              # DW_IDX_die_offset
+	.byte	0                               # DW_IDX_parent
+                                        # End of list: globalSplit
+.Lnames3:
+.L2:
+	.byte	4                               # Abbreviation code
+	.long	46                              # DW_IDX_die_offset
+	.byte	0                               # DW_IDX_parent
+                                        # End of list: main
+.Lnames0:
+.L4:
+	.byte	5                               # Abbreviation code
+	.byte	0                               # DW_IDX_type_unit
+	.long	33                              # DW_IDX_die_offset
+.L3:                                    # DW_IDX_parent
+	.byte	6                               # Abbreviation code
+	.long	37                              # DW_IDX_die_offset
+	.byte	0                               # DW_IDX_parent
+                                        # End of list: ASplit
+	.p2align	2, 0x0
+.Lnames_end0:
+	.ident	"clang version 19.0.0git (git@github.com:llvm/llvm-project.git ced1fac8a32e35b63733bda27c7f5b9a2b635403)"
+	.section	".note.GNU-stack","",@progbits
+	.addrsig
+	.section	.debug_line,"",@progbits
+.Lline_table_start0:
diff --git a/bolt/test/X86/dwarf5-df-main-debug-names-ftu-ltu-mix.test b/bolt/test/X86/dwarf5-df-main-debug-names-ftu-ltu-mix.test
new file mode 100644
index 0000000000000..8a8a4b118b8c0
--- /dev/null
+++ b/bolt/test/X86/dwarf5-df-main-debug-names-ftu-ltu-mix.test
@@ -0,0 +1,56 @@
+; RUN: rm -rf %t
+; RUN: mkdir %t
+; RUN: cd %t
+; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-df-debug-names-ftu-ltu-mix-main.s \
+; RUN: -split-dwarf-file=main.dwo -o main.o
+; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-debug-names-ftu-ltu-mix-helper.s -o helper.o
+; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-debug-names-ftu-ltu-mix-helper1.s -o helper1.o
+; RUN: %clang %cflags -gdwarf-5 -gsplit-dwarf=split main.o helper.o helper1.o -o main.exe -fno-pic -no-pie
+; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections --create-debug-names-section=true
+; RUN: llvm-dwarfdump --debug-names main.exe.bolt | FileCheck -check-prefix=BOLT %s
+
+;; Tests that BOLT correctly sets the foreign TU index when there are local TUs.
+
+; BOLT:        Compilation Unit offsets [
+; BOLT-NEXT:     CU[0]: {{.+}}
+; BOLT-NEXT:     CU[1]: {{.+}}
+; BOLT-NEXT:     CU[2]: {{.+}}
+; BOLT-NEXT:   ]
+; BOLT-NEXT:   Local Type Unit offsets [
+; BOLT-NEXT:     LocalTU[0]: {{.+}}
+; BOLT-NEXT:     LocalTU[1]: {{.+}}
+; BOLT-NEXT:   ]
+; BOLT-NEXT:   Foreign Type Unit signatures [
+; BOLT-NEXT:     ForeignTU[0]: 0x889c84450dac881f
+; BOLT-NEXT:   ]
+; BOLT:        Name 3 {
+; BOLT-NEXT:     Hash: 0x6A05C500
+; BOLT-NEXT:     String: {{.+}} "globalMono1"
+; BOLT-NEXT:     Entry @ {{.+}} {
+; BOLT-NEXT:       Abbrev: 0x5
+; BOLT-NEXT:       Tag: DW_TAG_variable
+; BOLT-NEXT:       DW_IDX_compile_unit: 0x02
+; BOLT-NEXT:       DW_IDX_die_offset: 0x0000001e
+; BOLT-NEXT:     }
+; BOLT-NEXT:   }
+; BOLT:        Name 6 {
+; BOLT-NEXT:     Hash: 0xF283AF92
+; BOLT-NEXT:     String: {{.+}} "ASplit"
+; BOLT-NEXT:     Entry @ {{.+}} {
+; BOLT-NEXT:       Abbrev: 0x7
+; BOLT-NEXT:       Tag: DW_TAG_structure_type
+; BOLT-NEXT:       DW_IDX_type_unit: 0x02
+; BOLT-NEXT:       DW_IDX_compile_unit: 0x00
+; BOLT-NEXT:       DW_IDX_die_offset: 0x00000021
+; BOLT-NEXT:     }
+; BOLT-NEXT:   }
+; BOLT:        Name 7 {
+; BOLT-NEXT:     Hash: 0xF17F51F
+; BOLT-NEXT:     String: {{.+}} "AMono"
+; BOLT-NEXT:     Entry @ {{.+}} {
+; BOLT-NEXT:       Abbrev: 0x4
+; BOLT-NEXT:       Tag: DW_TAG_structure_type
+; BOLT-NEXT:       DW_IDX_type_unit: 0x00
+; BOLT-NEXT:       DW_IDX_die_offset: 0x00000023
+; BOLT-NEXT:     }
+; BOLT-NEXT:   }

From 6aef8dfe440c8234ce491dabb111a55b89754b4e Mon Sep 17 00:00:00 2001
From: amilendra <amilendra.kodithuwakku@arm.com>
Date: Mon, 11 Mar 2024 19:20:47 +0000
Subject: [PATCH 31/95] [libcxx] Update 128-bit-atomics feature test (#83841)

The `128-bit-atomics` libcxx feature is incorrectly named because the tests
that are XFAILed with it are really using `int[128]`. Additionally,
because toolchain support for that feature is determined based on a much
smaller size (`char[16]`), tests would execute incorrectly without the
required toolchain support.

So, rename `128-bit-atomics` as `1024-bit-atomics`, and use an
appropriate type to check for the presence of the feature.
---
 libcxx/test/libcxx/atomics/atomics.align/align.pass.cpp       | 2 +-
 .../atomic_compare_exchange_strong.pass.cpp                   | 2 +-
 .../atomic_compare_exchange_strong_explicit.pass.cpp          | 2 +-
 .../atomic_compare_exchange_weak.pass.cpp                     | 2 +-
 .../atomic_compare_exchange_weak_explicit.pass.cpp            | 2 +-
 .../atomics.types.operations.req/atomic_exchange.pass.cpp     | 2 +-
 .../atomic_exchange_explicit.pass.cpp                         | 2 +-
 .../atomics.types.operations.req/atomic_init.pass.cpp         | 2 +-
 .../atomics.types.operations.req/atomic_is_lock_free.pass.cpp | 2 +-
 .../atomics.types.operations.req/atomic_load.pass.cpp         | 2 +-
 .../atomic_load_explicit.pass.cpp                             | 2 +-
 .../atomics.types.operations.req/atomic_store.pass.cpp        | 2 +-
 .../atomic_store_explicit.pass.cpp                            | 2 +-
 .../atomics.types.operations.wait/atomic_notify_all.pass.cpp  | 2 +-
 .../atomics.types.operations.wait/atomic_notify_one.pass.cpp  | 2 +-
 .../atomics.types.operations.wait/atomic_wait.pass.cpp        | 2 +-
 .../atomic_wait_explicit.pass.cpp                             | 2 +-
 libcxx/utils/libcxx/test/features.py                          | 4 ++--
 18 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/libcxx/test/libcxx/atomics/atomics.align/align.pass.cpp b/libcxx/test/libcxx/atomics/atomics.align/align.pass.cpp
index e5cafde467603..5990fc411e504 100644
--- a/libcxx/test/libcxx/atomics/atomics.align/align.pass.cpp
+++ b/libcxx/test/libcxx/atomics/atomics.align/align.pass.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 //
 // UNSUPPORTED: c++03
-// REQUIRES: has-128-bit-atomics
+// REQUIRES: has-1024-bit-atomics
 // ADDITIONAL_COMPILE_FLAGS: -Wno-psabi
 // ... since C++20 std::__atomic_base initializes, so we get a warning about an
 // ABI change for vector variants since the constructor code for that is
diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_compare_exchange_strong.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_compare_exchange_strong.pass.cpp
index 1f0f61ed3e6ea..73c74fc6589f9 100644
--- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_compare_exchange_strong.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_compare_exchange_strong.pass.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-// XFAIL: !has-128-bit-atomics
+// XFAIL: !has-1024-bit-atomics
 
 // <atomic>
 
diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_compare_exchange_strong_explicit.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_compare_exchange_strong_explicit.pass.cpp
index 0b6fcacb3d66d..8d7803a15cd55 100644
--- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_compare_exchange_strong_explicit.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_compare_exchange_strong_explicit.pass.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-// XFAIL: !has-128-bit-atomics
+// XFAIL: !has-1024-bit-atomics
 
 // <atomic>
 
diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_compare_exchange_weak.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_compare_exchange_weak.pass.cpp
index 5de2f519ea435..6c1aa06bd2619 100644
--- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_compare_exchange_weak.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_compare_exchange_weak.pass.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-// XFAIL: !has-128-bit-atomics
+// XFAIL: !has-1024-bit-atomics
 
 // <atomic>
 
diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_compare_exchange_weak_explicit.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_compare_exchange_weak_explicit.pass.cpp
index fc0ad8a10acd1..b00940684907a 100644
--- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_compare_exchange_weak_explicit.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_compare_exchange_weak_explicit.pass.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-// XFAIL: !has-128-bit-atomics
+// XFAIL: !has-1024-bit-atomics
 
 // <atomic>
 
diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_exchange.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_exchange.pass.cpp
index 31cd316e023a3..6ebe64087fa0a 100644
--- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_exchange.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_exchange.pass.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-// XFAIL: !has-128-bit-atomics
+// XFAIL: !has-1024-bit-atomics
 
 // <atomic>
 
diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_exchange_explicit.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_exchange_explicit.pass.cpp
index 834a811c64342..3a505c8aa1569 100644
--- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_exchange_explicit.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_exchange_explicit.pass.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-// XFAIL: !has-128-bit-atomics
+// XFAIL: !has-1024-bit-atomics
 
 // <atomic>
 
diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_init.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_init.pass.cpp
index 4eced1d2b7f37..88775cf32d56f 100644
--- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_init.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_init.pass.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-// XFAIL: !has-128-bit-atomics
+// XFAIL: !has-1024-bit-atomics
 // ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
 
 // <atomic>
diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_is_lock_free.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_is_lock_free.pass.cpp
index 1a3b8393d8f9f..11a6b002a7865 100644
--- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_is_lock_free.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_is_lock_free.pass.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-// XFAIL: !has-128-bit-atomics
+// XFAIL: !has-1024-bit-atomics
 
 // <atomic>
 
diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_load.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_load.pass.cpp
index 5bb2bb2b614f9..36f1ee3ba370f 100644
--- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_load.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_load.pass.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-// XFAIL: !has-128-bit-atomics
+// XFAIL: !has-1024-bit-atomics
 
 // <atomic>
 
diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_load_explicit.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_load_explicit.pass.cpp
index ecb27a261eb65..476e268c971c1 100644
--- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_load_explicit.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_load_explicit.pass.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-// XFAIL: !has-128-bit-atomics
+// XFAIL: !has-1024-bit-atomics
 
 // <atomic>
 
diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_store.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_store.pass.cpp
index 25a845e9e1f8f..94c570fb5d962 100644
--- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_store.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_store.pass.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-// XFAIL: !has-128-bit-atomics
+// XFAIL: !has-1024-bit-atomics
 
 // <atomic>
 
diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_store_explicit.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_store_explicit.pass.cpp
index d22657237327f..6d1acebe615f5 100644
--- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_store_explicit.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_store_explicit.pass.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-// XFAIL: !has-128-bit-atomics
+// XFAIL: !has-1024-bit-atomics
 
 // <atomic>
 
diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_notify_all.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_notify_all.pass.cpp
index 93ed607d413b2..2b9f34b731f87 100644
--- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_notify_all.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_notify_all.pass.cpp
@@ -8,7 +8,7 @@
 //
 // UNSUPPORTED: no-threads
 // XFAIL: c++03
-// XFAIL: !has-128-bit-atomics
+// XFAIL: !has-1024-bit-atomics
 
 // XFAIL: availability-synchronization_library-missing
 
diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_notify_one.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_notify_one.pass.cpp
index ad48ef1441f47..dfa781c566009 100644
--- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_notify_one.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_notify_one.pass.cpp
@@ -8,7 +8,7 @@
 //
 // UNSUPPORTED: no-threads
 // XFAIL: c++03
-// XFAIL: !has-128-bit-atomics
+// XFAIL: !has-1024-bit-atomics
 
 // XFAIL: availability-synchronization_library-missing
 
diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait.pass.cpp
index 449e50fa12b5f..38142b336e72c 100644
--- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait.pass.cpp
@@ -8,7 +8,7 @@
 //
 // UNSUPPORTED: no-threads
 // XFAIL: c++03
-// XFAIL: !has-128-bit-atomics
+// XFAIL: !has-1024-bit-atomics
 
 // XFAIL: availability-synchronization_library-missing
 
diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait_explicit.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait_explicit.pass.cpp
index a6ee4fc632797..2db95a0b67a7f 100644
--- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait_explicit.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait_explicit.pass.cpp
@@ -8,7 +8,7 @@
 //
 // UNSUPPORTED: no-threads
 // XFAIL: c++03
-// XFAIL: !has-128-bit-atomics
+// XFAIL: !has-1024-bit-atomics
 
 // XFAIL: availability-synchronization_library-missing
 
diff --git a/libcxx/utils/libcxx/test/features.py b/libcxx/utils/libcxx/test/features.py
index 3f0dc0c50a0d0..4fd8798b794a1 100644
--- a/libcxx/utils/libcxx/test/features.py
+++ b/libcxx/utils/libcxx/test/features.py
@@ -171,12 +171,12 @@ def _getAndroidDeviceApi(cfg):
         ),
     ),
     Feature(
-        name="has-128-bit-atomics",
+        name="has-1024-bit-atomics",
         when=lambda cfg: sourceBuilds(
             cfg,
             """
             #include <atomic>
-            struct Large { char storage[128/8]; };
+            struct Large { char storage[1024/8]; };
             std::atomic<Large> x;
             int main(int, char**) { (void)x.load(); (void)x.is_lock_free(); return 0; }
           """,

From 18f49cf2e69676497cccc81ad5f5296fedcde338 Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2@gmail.com>
Date: Mon, 11 Mar 2024 15:21:38 -0400
Subject: [PATCH 32/95] [libc++] Remove XFAIL for SIMD in optimized build
 (#84767)

It seems that updating the compiler in the CI resolved the issue, which
causes the test to be XPASSing now.

Fixes #74327
---
 .../simd/simd.class/simd_ctor_conversion.pass.cpp             | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/libcxx/test/std/experimental/simd/simd.class/simd_ctor_conversion.pass.cpp b/libcxx/test/std/experimental/simd/simd.class/simd_ctor_conversion.pass.cpp
index 5920d62e0e5a6..7ce4bed9c7db8 100644
--- a/libcxx/test/std/experimental/simd/simd.class/simd_ctor_conversion.pass.cpp
+++ b/libcxx/test/std/experimental/simd/simd.class/simd_ctor_conversion.pass.cpp
@@ -9,10 +9,6 @@
 // UNSUPPORTED: c++03, c++11, c++14
 // XFAIL: target=powerpc{{.*}}le-unknown-linux-gnu
 
-// TODO: This test makes incorrect assumptions about floating point conversions.
-//       See https://github.com/llvm/llvm-project/issues/74327.
-// XFAIL: optimization=speed
-
 // <experimental/simd>
 //
 // [simd.class]

From d2e57c5c36d9b084f804cfd96a47472e23d05cac Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2@gmail.com>
Date: Mon, 11 Mar 2024 15:22:51 -0400
Subject: [PATCH 33/95] [libc++] Re-enable the clang_modules_include test for
 Objective-C++ (#66801)

This reverts commit aa60b2687, which was a temporary workaround.
The underlying issue was fixed in Clang via c2c840bd92cf.

This was originally https://reviews.llvm.org/D158694.
---
 libcxx/test/libcxx/clang_modules_include.gen.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/libcxx/test/libcxx/clang_modules_include.gen.py b/libcxx/test/libcxx/clang_modules_include.gen.py
index e3593eefad2fe..61a9258237640 100644
--- a/libcxx/test/libcxx/clang_modules_include.gen.py
+++ b/libcxx/test/libcxx/clang_modules_include.gen.py
@@ -47,11 +47,8 @@
 #include <{header}>
 """)
 
-# TODO: Remove the UNSUPPORTED{BLOCKLIT}: clang-modules-build once issues with this test have been figured out.
 print(f"""\
 //--- __std_clang_module.compile.pass.mm
-// UNSUPPORTED{BLOCKLIT}: clang-modules-build
-
 // RUN{BLOCKLIT}: %{{cxx}} %s %{{flags}} %{{compile_flags}} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only
 
 // REQUIRES{BLOCKLIT}: clang-modules-build

From 42ee286e51260286c59fa1186d0e56ad0f446054 Mon Sep 17 00:00:00 2001
From: Aaron Ballman <aaron@aaronballman.com>
Date: Mon, 11 Mar 2024 15:25:28 -0400
Subject: [PATCH 34/95] Fixing test from
 8467457afc61d70e881c9817ace26356ef757733

The clangd test was testing the previous diagnostic logic and now it's
testing with the new warning flag.
---
 clang-tools-extra/clangd/unittests/DiagnosticsTests.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang-tools-extra/clangd/unittests/DiagnosticsTests.cpp b/clang-tools-extra/clangd/unittests/DiagnosticsTests.cpp
index 2f6dd0611b662..25d2f03e0b366 100644
--- a/clang-tools-extra/clangd/unittests/DiagnosticsTests.cpp
+++ b/clang-tools-extra/clangd/unittests/DiagnosticsTests.cpp
@@ -544,7 +544,7 @@ TEST(DiagnosticTest, RespectsDiagnosticConfig) {
                   Diag(Main.range("ret"),
                        "void function 'x' should not return a value")));
   Config Cfg;
-  Cfg.Diagnostics.Suppress.insert("return-type");
+  Cfg.Diagnostics.Suppress.insert("return-mismatch");
   WithContextValue WithCfg(Config::Key, std::move(Cfg));
   EXPECT_THAT(TU.build().getDiagnostics(),
               ElementsAre(Diag(Main.range(),

From a70d7298818aae94ee62cd50c3ba195aaa10acb1 Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek <Krzysztof.Parzyszek@amd.com>
Date: Mon, 11 Mar 2024 14:30:30 -0500
Subject: [PATCH 35/95] [flang] Avoid left shifts of negative signed values
 (#84786)

Shifting left a signed, negative value is an undefined behavior in C++.

This was detected by the undefined behavior sanitizer.
---
 flang/include/flang/Evaluate/integer.h | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/flang/include/flang/Evaluate/integer.h b/flang/include/flang/Evaluate/integer.h
index 977d35c7eecf4..31768c21daae6 100644
--- a/flang/include/flang/Evaluate/integer.h
+++ b/flang/include/flang/Evaluate/integer.h
@@ -150,7 +150,10 @@ class Integer {
           }
         }
       } else {
-        INT signExtension{-(n < 0)};
+        // Avoid left shifts of negative signed values (that's an undefined
+        // behavior in C++).
+        auto signExtension{std::make_unsigned_t<INT>(n < 0)};
+        signExtension = ~signExtension + 1;
         static_assert(nBits >= partBits);
         if constexpr (nBits > partBits) {
           signExtension <<= nBits - partBits;
@@ -474,7 +477,12 @@ class Integer {
     SINT n = ToUInt<UINT>();
     constexpr std::size_t maxBits{CHAR_BIT * sizeof n};
     if constexpr (bits < maxBits) {
-      n |= -(n >> (bits - 1)) << bits;
+      // Avoid left shifts of negative signed values (that's an undefined
+      // behavior in C++).
+      auto u{std::make_unsigned_t<SINT>(ToUInt())};
+      u = (u >> (bits - 1)) << (bits - 1); // Get the sign bit only.
+      u = ~u + 1; // Negate top bits if not 0.
+      n |= static_cast<SINT>(u);
     }
     return n;
   }

From 1def98d9f2eb2ae39e774369693e6f2f74551b7f Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek <Krzysztof.Parzyszek@amd.com>
Date: Mon, 11 Mar 2024 14:35:31 -0500
Subject: [PATCH 36/95] [flang] Avoid forming a reference from null pointer
 (#84787)

Doing so is an undefined behavior.

This was detected by the undefined behavior sanitizer.
---
 flang/lib/Parser/token-sequence.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/flang/lib/Parser/token-sequence.cpp b/flang/lib/Parser/token-sequence.cpp
index c5a630c471d16..799d13a423660 100644
--- a/flang/lib/Parser/token-sequence.cpp
+++ b/flang/lib/Parser/token-sequence.cpp
@@ -136,7 +136,10 @@ void TokenSequence::Put(
 }
 
 void TokenSequence::Put(const CharBlock &t, Provenance provenance) {
-  Put(&t[0], t.size(), provenance);
+  // Avoid t[0] if t is empty: it would create a reference to nullptr,
+  // which is UB.
+  const char *addr{t.size() ? &t[0] : nullptr};
+  Put(addr, t.size(), provenance);
 }
 
 void TokenSequence::Put(const std::string &s, Provenance provenance) {

From a25fa92d870a5cbb3eeccdc7458d1bc6834b695a Mon Sep 17 00:00:00 2001
From: lntue <35648136+lntue@users.noreply.github.com>
Date: Mon, 11 Mar 2024 15:39:05 -0400
Subject: [PATCH 37/95] [libc][stdbit] Add C tests for stdbit generic macros.
 (#84670)

Currently there are no tests for the generic macros of the generated
`stdbit.h` header in C, and it is easy to make typos such as the one in
https://github.com/llvm/llvm-project/issues/84658. In this patch, we
add a simple test for them in C.
---
 libc/test/include/CMakeLists.txt  | 25 +++++++++
 libc/test/include/stdbit_stub.h   | 73 ++++++++++++++++++++++++++
 libc/test/include/stdbit_test.c   | 61 ++++++++++++++++++++++
 libc/test/include/stdbit_test.cpp | 85 +------------------------------
 4 files changed, 160 insertions(+), 84 deletions(-)
 create mode 100644 libc/test/include/stdbit_stub.h
 create mode 100644 libc/test/include/stdbit_test.c

diff --git a/libc/test/include/CMakeLists.txt b/libc/test/include/CMakeLists.txt
index bf845c94170f9..d76ad442d36ce 100644
--- a/libc/test/include/CMakeLists.txt
+++ b/libc/test/include/CMakeLists.txt
@@ -22,16 +22,41 @@ if(LLVM_LIBC_FULL_BUILD AND libc.include.stdbit IN_LIST TARGET_PUBLIC_HEADERS)
     stdbit_test
     SUITE
       libc_include_tests
+    HDRS
+      stdbit_stub.h
     SRCS
       stdbit_test.cpp
     DEPENDS
       libc.include.llvm-libc-macros.stdbit_macros
+      libc.include.llvm_libc_common_h
       libc.include.stdbit
       # Intentionally do not depend on libc.src.stdbit.*. The include test is
       # simply testing the macros provided by stdbit.h, not the implementation
       # of the underlying functions which the type generic macros may dispatch
       # to.
   )
+  add_libc_test(
+    stdbit_c_test
+    UNIT_TEST_ONLY
+    SUITE
+      libc_include_tests
+    HDRS
+      stdbit_stub.h
+    SRCS
+      stdbit_test.c
+    COMPILE_OPTIONS
+      -Wall
+      -Werror
+    DEPENDS
+      libc.include.llvm-libc-macros.stdbit_macros
+      libc.include.llvm_libc_common_h
+      libc.include.stdbit
+      libc.src.assert.__assert_fail
+      # Intentionally do not depend on libc.src.stdbit.*. The include test is
+      # simply testing the macros provided by stdbit.h, not the implementation
+      # of the underlying functions which the type generic macros may dispatch
+      # to.
+  )
 endif()
 
 add_libc_test(
diff --git a/libc/test/include/stdbit_stub.h b/libc/test/include/stdbit_stub.h
new file mode 100644
index 0000000000000..65b1ca3b2c297
--- /dev/null
+++ b/libc/test/include/stdbit_stub.h
@@ -0,0 +1,73 @@
+//===-- Utilities for testing stdbit --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+/*
+ * Declare these BEFORE including stdbit-macros.h so that this test may still be
+ * run even if a given target doesn't yet have these individual entrypoints
+ * enabled.
+ */
+
+#include "include/__llvm-libc-common.h"
+
+#include <stdbool.h> // bool in C
+
+#define STDBIT_STUB_FUNCTION(FUNC_NAME, LEADING_VAL)                           \
+  unsigned FUNC_NAME##_uc(unsigned char x) __NOEXCEPT {                        \
+    return LEADING_VAL##AU;                                                    \
+  }                                                                            \
+  unsigned FUNC_NAME##_us(unsigned short x) __NOEXCEPT {                       \
+    return LEADING_VAL##BU;                                                    \
+  }                                                                            \
+  unsigned FUNC_NAME##_ui(unsigned int x) __NOEXCEPT {                         \
+    return LEADING_VAL##CU;                                                    \
+  }                                                                            \
+  unsigned FUNC_NAME##_ul(unsigned long x) __NOEXCEPT {                        \
+    return LEADING_VAL##DU;                                                    \
+  }                                                                            \
+  unsigned FUNC_NAME##_ull(unsigned long long x) __NOEXCEPT {                  \
+    return LEADING_VAL##EU;                                                    \
+  }
+
+__BEGIN_C_DECLS
+
+STDBIT_STUB_FUNCTION(stdc_leading_zeros, 0xA)
+STDBIT_STUB_FUNCTION(stdc_leading_ones, 0xB)
+STDBIT_STUB_FUNCTION(stdc_trailing_zeros, 0xC)
+STDBIT_STUB_FUNCTION(stdc_trailing_ones, 0xD)
+STDBIT_STUB_FUNCTION(stdc_first_leading_zero, 0xE)
+STDBIT_STUB_FUNCTION(stdc_first_leading_one, 0xF)
+STDBIT_STUB_FUNCTION(stdc_first_trailing_zero, 0x0)
+STDBIT_STUB_FUNCTION(stdc_first_trailing_one, 0x1)
+STDBIT_STUB_FUNCTION(stdc_count_zeros, 0x2)
+STDBIT_STUB_FUNCTION(stdc_count_ones, 0x3)
+
+bool stdc_has_single_bit_uc(unsigned char x) __NOEXCEPT { return false; }
+bool stdc_has_single_bit_us(unsigned short x) __NOEXCEPT { return false; }
+bool stdc_has_single_bit_ui(unsigned x) __NOEXCEPT { return false; }
+bool stdc_has_single_bit_ul(unsigned long x) __NOEXCEPT { return false; }
+bool stdc_has_single_bit_ull(unsigned long long x) __NOEXCEPT { return false; }
+
+STDBIT_STUB_FUNCTION(stdc_bit_width, 0x4)
+
+unsigned char stdc_bit_floor_uc(unsigned char x) __NOEXCEPT { return 0x5AU; }
+unsigned short stdc_bit_floor_us(unsigned short x) __NOEXCEPT { return 0x5BU; }
+unsigned stdc_bit_floor_ui(unsigned x) __NOEXCEPT { return 0x5CU; }
+unsigned long stdc_bit_floor_ul(unsigned long x) __NOEXCEPT { return 0x5DUL; }
+unsigned long long stdc_bit_floor_ull(unsigned long long x) __NOEXCEPT {
+  return 0x5EULL;
+}
+
+unsigned char stdc_bit_ceil_uc(unsigned char x) __NOEXCEPT { return 0x6AU; }
+unsigned short stdc_bit_ceil_us(unsigned short x) __NOEXCEPT { return 0x6BU; }
+unsigned stdc_bit_ceil_ui(unsigned x) __NOEXCEPT { return 0x6CU; }
+unsigned long stdc_bit_ceil_ul(unsigned long x) __NOEXCEPT { return 0x6DUL; }
+unsigned long long stdc_bit_ceil_ull(unsigned long long x) __NOEXCEPT {
+  return 0x6EULL;
+}
+
+__END_C_DECLS
diff --git a/libc/test/include/stdbit_test.c b/libc/test/include/stdbit_test.c
new file mode 100644
index 0000000000000..e278e9a7374e0
--- /dev/null
+++ b/libc/test/include/stdbit_test.c
@@ -0,0 +1,61 @@
+//===-- Unittests for stdbit ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+/*
+ * The intent of this test is to validate that:
+ * 1. We provide the definition of the various type generic macros of stdbit.h
+ * (the macros are transitively included from stdbit-macros.h by stdbit.h).
+ * 2. It dispatches to the correct underlying function.
+ * Because unit tests build without public packaging, the object files produced
+ * do not contain non-namespaced symbols.
+ */
+
+/*
+ * Declare these BEFORE including stdbit-macros.h so that this test may still be
+ * run even if a given target doesn't yet have these individual entrypoints
+ * enabled.
+ */
+#include "stdbit_stub.h"
+
+#include "include/llvm-libc-macros/stdbit-macros.h"
+
+#include <assert.h>
+
+#define CHECK_FUNCTION(FUNC_NAME, VAL)                                         \
+  do {                                                                         \
+    assert(FUNC_NAME((unsigned char)0U) == VAL##AU);                           \
+    assert(FUNC_NAME((unsigned short)0U) == VAL##BU);                          \
+    assert(FUNC_NAME(0U) == VAL##CU);                                          \
+    assert(FUNC_NAME(0UL) == VAL##DU);                                         \
+    assert(FUNC_NAME(0ULL) == VAL##EU);                                        \
+  } while (0)
+
+int main(void) {
+  CHECK_FUNCTION(stdc_leading_zeros, 0xA);
+  CHECK_FUNCTION(stdc_leading_ones, 0xB);
+  CHECK_FUNCTION(stdc_trailing_zeros, 0xC);
+  CHECK_FUNCTION(stdc_trailing_ones, 0xD);
+  CHECK_FUNCTION(stdc_first_leading_zero, 0xE);
+  CHECK_FUNCTION(stdc_first_leading_one, 0xF);
+  CHECK_FUNCTION(stdc_first_trailing_zero, 0x0);
+  CHECK_FUNCTION(stdc_first_trailing_one, 0x1);
+  CHECK_FUNCTION(stdc_count_zeros, 0x2);
+  CHECK_FUNCTION(stdc_count_ones, 0x3);
+
+  assert(!stdc_has_single_bit((unsigned char)1U));
+  assert(!stdc_has_single_bit((unsigned short)1U));
+  assert(!stdc_has_single_bit(1U));
+  assert(!stdc_has_single_bit(1UL));
+  assert(!stdc_has_single_bit(1ULL));
+
+  CHECK_FUNCTION(stdc_bit_width, 0x4);
+  CHECK_FUNCTION(stdc_bit_floor, 0x5);
+  CHECK_FUNCTION(stdc_bit_ceil, 0x6);
+
+  return 0;
+}
diff --git a/libc/test/include/stdbit_test.cpp b/libc/test/include/stdbit_test.cpp
index 6c12665c4454d..f3227eb86959e 100644
--- a/libc/test/include/stdbit_test.cpp
+++ b/libc/test/include/stdbit_test.cpp
@@ -22,90 +22,7 @@
  * run even if a given target doesn't yet have these individual entrypoints
  * enabled.
  */
-extern "C" {
-unsigned stdc_leading_zeros_uc(unsigned char) noexcept { return 0xAAU; }
-unsigned stdc_leading_zeros_us(unsigned short) noexcept { return 0xABU; }
-unsigned stdc_leading_zeros_ui(unsigned) noexcept { return 0xACU; }
-unsigned stdc_leading_zeros_ul(unsigned long) noexcept { return 0xADU; }
-unsigned stdc_leading_zeros_ull(unsigned long long) noexcept { return 0xAEU; }
-unsigned stdc_leading_ones_uc(unsigned char) noexcept { return 0xBAU; }
-unsigned stdc_leading_ones_us(unsigned short) noexcept { return 0xBBU; }
-unsigned stdc_leading_ones_ui(unsigned) noexcept { return 0xBCU; }
-unsigned stdc_leading_ones_ul(unsigned long) noexcept { return 0xBDU; }
-unsigned stdc_leading_ones_ull(unsigned long long) noexcept { return 0xBEU; }
-unsigned stdc_trailing_zeros_uc(unsigned char) noexcept { return 0xCAU; }
-unsigned stdc_trailing_zeros_us(unsigned short) noexcept { return 0xCBU; }
-unsigned stdc_trailing_zeros_ui(unsigned) noexcept { return 0xCCU; }
-unsigned stdc_trailing_zeros_ul(unsigned long) noexcept { return 0xCDU; }
-unsigned stdc_trailing_zeros_ull(unsigned long long) noexcept { return 0xCEU; }
-unsigned stdc_trailing_ones_uc(unsigned char) noexcept { return 0xDAU; }
-unsigned stdc_trailing_ones_us(unsigned short) noexcept { return 0xDBU; }
-unsigned stdc_trailing_ones_ui(unsigned) noexcept { return 0xDCU; }
-unsigned stdc_trailing_ones_ul(unsigned long) noexcept { return 0xDDU; }
-unsigned stdc_trailing_ones_ull(unsigned long long) noexcept { return 0xDEU; }
-unsigned stdc_first_leading_zero_uc(unsigned char) noexcept { return 0xEAU; }
-unsigned stdc_first_leading_zero_us(unsigned short) noexcept { return 0xEBU; }
-unsigned stdc_first_leading_zero_ui(unsigned) noexcept { return 0xECU; }
-unsigned stdc_first_leading_zero_ul(unsigned long) noexcept { return 0xEDU; }
-unsigned stdc_first_leading_zero_ull(unsigned long long) noexcept {
-  return 0xEEU;
-}
-unsigned stdc_first_leading_one_uc(unsigned char) noexcept { return 0xFAU; }
-unsigned stdc_first_leading_one_us(unsigned short) noexcept { return 0xFBU; }
-unsigned stdc_first_leading_one_ui(unsigned) noexcept { return 0xFCU; }
-unsigned stdc_first_leading_one_ul(unsigned long) noexcept { return 0xFDU; }
-unsigned stdc_first_leading_one_ull(unsigned long long) noexcept {
-  return 0xFEU;
-}
-unsigned stdc_first_trailing_zero_uc(unsigned char) noexcept { return 0x0AU; }
-unsigned stdc_first_trailing_zero_us(unsigned short) noexcept { return 0x0BU; }
-unsigned stdc_first_trailing_zero_ui(unsigned) noexcept { return 0x0CU; }
-unsigned stdc_first_trailing_zero_ul(unsigned long) noexcept { return 0x0DU; }
-unsigned stdc_first_trailing_zero_ull(unsigned long long) noexcept {
-  return 0x0EU;
-}
-unsigned stdc_first_trailing_one_uc(unsigned char) noexcept { return 0x1AU; }
-unsigned stdc_first_trailing_one_us(unsigned short) noexcept { return 0x1BU; }
-unsigned stdc_first_trailing_one_ui(unsigned) noexcept { return 0x1CU; }
-unsigned stdc_first_trailing_one_ul(unsigned long) noexcept { return 0x1DU; }
-unsigned stdc_first_trailing_one_ull(unsigned long long) noexcept {
-  return 0x1EU;
-}
-unsigned stdc_count_zeros_uc(unsigned char) noexcept { return 0x2AU; }
-unsigned stdc_count_zeros_us(unsigned short) noexcept { return 0x2BU; }
-unsigned stdc_count_zeros_ui(unsigned) noexcept { return 0x2CU; }
-unsigned stdc_count_zeros_ul(unsigned long) noexcept { return 0x2DU; }
-unsigned stdc_count_zeros_ull(unsigned long long) noexcept { return 0x2EU; }
-unsigned stdc_count_ones_uc(unsigned char) noexcept { return 0x3AU; }
-unsigned stdc_count_ones_us(unsigned short) noexcept { return 0x3BU; }
-unsigned stdc_count_ones_ui(unsigned) noexcept { return 0x3CU; }
-unsigned stdc_count_ones_ul(unsigned long) noexcept { return 0x3DU; }
-unsigned stdc_count_ones_ull(unsigned long long) noexcept { return 0x3EU; }
-bool stdc_has_single_bit_uc(unsigned char) noexcept { return false; }
-bool stdc_has_single_bit_us(unsigned short) noexcept { return false; }
-bool stdc_has_single_bit_ui(unsigned) noexcept { return false; }
-bool stdc_has_single_bit_ul(unsigned long) noexcept { return false; }
-bool stdc_has_single_bit_ull(unsigned long long) noexcept { return false; }
-unsigned stdc_bit_width_uc(unsigned char) noexcept { return 0x4AU; }
-unsigned stdc_bit_width_us(unsigned short) noexcept { return 0x4BU; }
-unsigned stdc_bit_width_ui(unsigned) noexcept { return 0x4CU; }
-unsigned stdc_bit_width_ul(unsigned long) noexcept { return 0x4DU; }
-unsigned stdc_bit_width_ull(unsigned long long) noexcept { return 0x4EU; }
-unsigned char stdc_bit_floor_uc(unsigned char) noexcept { return 0x5AU; }
-unsigned short stdc_bit_floor_us(unsigned short) noexcept { return 0x5BU; }
-unsigned stdc_bit_floor_ui(unsigned) noexcept { return 0x5CU; }
-unsigned long stdc_bit_floor_ul(unsigned long) noexcept { return 0x5DU; }
-unsigned long long stdc_bit_floor_ull(unsigned long long) noexcept {
-  return 0x5EU;
-}
-unsigned char stdc_bit_ceil_uc(unsigned char) noexcept { return 0x6AU; }
-unsigned short stdc_bit_ceil_us(unsigned short) noexcept { return 0x6BU; }
-unsigned stdc_bit_ceil_ui(unsigned) noexcept { return 0x6CU; }
-unsigned long stdc_bit_ceil_ul(unsigned long) noexcept { return 0x6DU; }
-unsigned long long stdc_bit_ceil_ull(unsigned long long) noexcept {
-  return 0x6EU;
-}
-}
+#include "stdbit_stub.h"
 
 #include "include/llvm-libc-macros/stdbit-macros.h"
 

From 884b051a42896e94dc6032013e10483d84910f27 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Mon, 11 Mar 2024 09:55:44 -0700
Subject: [PATCH 38/95] Recommit "[TypePromotion] Support positive addition
 amounts in isSafeWrap. (#81690)"

With a special case for when the Add constant is 0.

Original message:
We can support these by changing the sext promotion to -zext(-C) and
replacing a sgt check with ugt. Reframing the logic in terms of how the
unsigned range are affected. More comments in the patch.

The new cases check isLegalAddImmediate to avoid some
regressions in lit tests.
---
 llvm/lib/CodeGen/TypePromotion.cpp            | 129 ++++++++++--------
 llvm/test/CodeGen/AArch64/and-mask-removal.ll |  18 +--
 .../AArch64/signed-truncation-check.ll        |   2 +-
 .../CodeGen/AArch64/typepromotion-overflow.ll |   5 +-
 .../CodeGen/RISCV/typepromotion-overflow.ll   |   5 +-
 .../Transforms/TypePromotion/ARM/icmps.ll     |   5 +-
 .../Transforms/TypePromotion/ARM/wrapping.ll  |  10 +-
 7 files changed, 92 insertions(+), 82 deletions(-)

diff --git a/llvm/lib/CodeGen/TypePromotion.cpp b/llvm/lib/CodeGen/TypePromotion.cpp
index 48ad8de778010..b0830308908d6 100644
--- a/llvm/lib/CodeGen/TypePromotion.cpp
+++ b/llvm/lib/CodeGen/TypePromotion.cpp
@@ -136,6 +136,7 @@ class IRPromoter {
 
 class TypePromotionImpl {
   unsigned TypeSize = 0;
+  const TargetLowering *TLI = nullptr;
   LLVMContext *Ctx = nullptr;
   unsigned RegisterBitWidth = 0;
   SmallPtrSet<Value *, 16> AllVisited;
@@ -272,64 +273,58 @@ bool TypePromotionImpl::isSink(Value *V) {
 
 /// Return whether this instruction can safely wrap.
 bool TypePromotionImpl::isSafeWrap(Instruction *I) {
-  // We can support a potentially wrapping instruction (I) if:
+  // We can support a potentially wrapping Add/Sub instruction (I) if:
   // - It is only used by an unsigned icmp.
   // - The icmp uses a constant.
-  // - The wrapping value (I) is decreasing, i.e would underflow - wrapping
-  //   around zero to become a larger number than before.
   // - The wrapping instruction (I) also uses a constant.
   //
-  // We can then use the two constants to calculate whether the result would
-  // wrap in respect to itself in the original bitwidth. If it doesn't wrap,
-  // just underflows the range, the icmp would give the same result whether the
-  // result has been truncated or not. We calculate this by:
-  // - Zero extending both constants, if needed, to RegisterBitWidth.
-  // - Take the absolute value of I's constant, adding this to the icmp const.
-  // - Check that this value is not out of range for small type. If it is, it
-  //   means that it has underflowed enough to wrap around the icmp constant.
+  // This is a common pattern emitted to check if a value is within a range.
   //
   // For example:
   //
-  // %sub = sub i8 %a, 2
-  // %cmp = icmp ule i8 %sub, 254
+  // %sub = sub i8 %a, C1
+  // %cmp = icmp ule i8 %sub, C2
+  //
+  // or
+  //
+  // %add = add i8 %a, C1
+  // %cmp = icmp ule i8 %add, C2.
   //
-  // If %a = 0, %sub = -2 == FE == 254
-  // But if this is evalulated as a i32
-  // %sub = -2 == FF FF FF FE == 4294967294
-  // So the unsigned compares (i8 and i32) would not yield the same result.
+  // We will treat an add as though it were a subtract by -C1. To promote
+  // the Add/Sub we will zero extend the LHS and the subtracted amount. For Add,
+  // this means we need to negate the constant, zero extend to RegisterBitWidth,
+  // and negate in the larger type.
   //
-  // Another way to look at it is:
-  // %a - 2 <= 254
-  // %a + 2 <= 254 + 2
-  // %a <= 256
-  // And we can't represent 256 in the i8 format, so we don't support it.
+  // This will produce a value in the range [-zext(C1), zext(X)-zext(C1)] where
+  // C1 is the subtracted amount. This is either a small unsigned number or a
+  // large unsigned number in the promoted type.
   //
-  // Whereas:
+  // Now we need to correct the compare constant C2. Values >= C1 in the
+  // original add result range have been remapped to large values in the
+  // promoted range. If the compare constant fell into this range we need to
+  // remap it as well. We can do this as -(zext(-C2)).
   //
-  // %sub i8 %a, 1
+  // For example:
+  //
+  // %sub = sub i8 %a, 2
   // %cmp = icmp ule i8 %sub, 254
   //
-  // If %a = 0, %sub = -1 == FF == 255
-  // As i32:
-  // %sub = -1 == FF FF FF FF == 4294967295
+  // becomes
   //
-  // In this case, the unsigned compare results would be the same and this
-  // would also be true for ult, uge and ugt:
-  // - (255 < 254) == (0xFFFFFFFF < 254) == false
-  // - (255 <= 254) == (0xFFFFFFFF <= 254) == false
-  // - (255 > 254) == (0xFFFFFFFF > 254) == true
-  // - (255 >= 254) == (0xFFFFFFFF >= 254) == true
+  // %zext = zext %a to i32
+  // %sub = sub i32 %zext, 2
+  // %cmp = icmp ule i32 %sub, 4294967294
   //
-  // To demonstrate why we can't handle increasing values:
+  // Another example:
   //
-  // %add = add i8 %a, 2
-  // %cmp = icmp ult i8 %add, 127
+  // %sub = sub i8 %a, 1
+  // %cmp = icmp ule i8 %sub, 254
   //
-  // If %a = 254, %add = 256 == (i8 1)
-  // As i32:
-  // %add = 256
+  // becomes
   //
-  // (1 < 127) != (256 < 127)
+  // %zext = zext %a to i32
+  // %sub = sub i32 %zext, 1
+  // %cmp = icmp ule i32 %sub, 254
 
   unsigned Opc = I->getOpcode();
   if (Opc != Instruction::Add && Opc != Instruction::Sub)
@@ -356,21 +351,29 @@ bool TypePromotionImpl::isSafeWrap(Instruction *I) {
   APInt OverflowConst = cast<ConstantInt>(I->getOperand(1))->getValue();
   if (Opc == Instruction::Sub)
     OverflowConst = -OverflowConst;
-  if (!OverflowConst.isNonPositive())
-    return false;
+
+  // If the constant is positive, we will end up filling the promoted bits with
+  // all 1s. Make sure that results in a cheap add constant.
+  if (!OverflowConst.isNonPositive()) {
+    // We don't have the true promoted width, just use 64 so we can create an
+    // int64_t for the isLegalAddImmediate call.
+    if (OverflowConst.getBitWidth() >= 64)
+      return false;
+
+    APInt NewConst = -((-OverflowConst).zext(64));
+    if (!TLI->isLegalAddImmediate(NewConst.getSExtValue()))
+      return false;
+  }
 
   SafeWrap.insert(I);
 
-  // Using C1 = OverflowConst and C2 = ICmpConst, we can either prove that:
-  //   zext(x) + sext(C1) <u zext(C2)  if C1 < 0 and C1 >s C2
-  //   zext(x) + sext(C1) <u sext(C2)  if C1 < 0 and C1 <=s C2
-  if (OverflowConst.sgt(ICmpConst)) {
-    LLVM_DEBUG(dbgs() << "IR Promotion: Allowing safe overflow for sext "
+  if (OverflowConst == 0 || OverflowConst.ugt(ICmpConst)) {
+    LLVM_DEBUG(dbgs() << "IR Promotion: Allowing safe overflow for "
                       << "const of " << *I << "\n");
     return true;
   }
 
-  LLVM_DEBUG(dbgs() << "IR Promotion: Allowing safe overflow for sext "
+  LLVM_DEBUG(dbgs() << "IR Promotion: Allowing safe overflow for "
                     << "const of " << *I << " and " << *CI << "\n");
   SafeWrap.insert(CI);
   return true;
@@ -487,18 +490,24 @@ void IRPromoter::PromoteTree() {
         continue;
 
       if (auto *Const = dyn_cast<ConstantInt>(Op)) {
-        // For subtract, we don't need to sext the constant. We only put it in
+        // For subtract, we only need to zext the constant. We only put it in
         // SafeWrap because SafeWrap.size() is used elsewhere.
-        // For cmp, we need to sign extend a constant appearing in either
-        // operand. For add, we should only sign extend the RHS.
-        Constant *NewConst =
-            ConstantInt::get(Const->getContext(),
-                             (SafeWrap.contains(I) &&
-                              (I->getOpcode() == Instruction::ICmp || i == 1) &&
-                              I->getOpcode() != Instruction::Sub)
-                                 ? Const->getValue().sext(PromotedWidth)
-                                 : Const->getValue().zext(PromotedWidth));
-        I->setOperand(i, NewConst);
+        // For Add and ICmp we need to find how far the constant is from the
+        // top of its original unsigned range and place it the same distance
+        // from the top of its new unsigned range. We can do this by negating
+        // the constant, zero extending it, then negating in the new type.
+        APInt NewConst;
+        if (SafeWrap.contains(I)) {
+          if (I->getOpcode() == Instruction::ICmp)
+            NewConst = -((-Const->getValue()).zext(PromotedWidth));
+          else if (I->getOpcode() == Instruction::Add && i == 1)
+            NewConst = -((-Const->getValue()).zext(PromotedWidth));
+          else
+            NewConst = Const->getValue().zext(PromotedWidth);
+        } else
+          NewConst = Const->getValue().zext(PromotedWidth);
+
+        I->setOperand(i, ConstantInt::get(Const->getContext(), NewConst));
       } else if (isa<UndefValue>(Op))
         I->setOperand(i, ConstantInt::get(ExtTy, 0));
     }
@@ -917,7 +926,7 @@ bool TypePromotionImpl::run(Function &F, const TargetMachine *TM,
   bool MadeChange = false;
   const DataLayout &DL = F.getParent()->getDataLayout();
   const TargetSubtargetInfo *SubtargetInfo = TM->getSubtargetImpl(F);
-  const TargetLowering *TLI = SubtargetInfo->getTargetLowering();
+  TLI = SubtargetInfo->getTargetLowering();
   RegisterBitWidth =
       TTI.getRegisterBitWidth(TargetTransformInfo::RGK_Scalar).getFixedValue();
   Ctx = &F.getParent()->getContext();
diff --git a/llvm/test/CodeGen/AArch64/and-mask-removal.ll b/llvm/test/CodeGen/AArch64/and-mask-removal.ll
index 17ff015970168..a8a59f1591268 100644
--- a/llvm/test/CodeGen/AArch64/and-mask-removal.ll
+++ b/llvm/test/CodeGen/AArch64/and-mask-removal.ll
@@ -65,9 +65,8 @@ if.end:                                           ; preds = %if.then, %entry
 define zeroext i1 @test8_0(i8 zeroext %x)  align 2 {
 ; CHECK-LABEL: test8_0:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    add w8, w0, #74
-; CHECK-NEXT:    and w8, w8, #0xff
-; CHECK-NEXT:    cmp w8, #236
+; CHECK-NEXT:    sub w8, w0, #182
+; CHECK-NEXT:    cmn w8, #20
 ; CHECK-NEXT:    cset w0, lo
 ; CHECK-NEXT:    ret
 entry:
@@ -508,16 +507,17 @@ define i64 @pr58109(i8 signext %0) {
 define i64 @pr58109b(i8 signext %0, i64 %a, i64 %b) {
 ; CHECK-SD-LABEL: pr58109b:
 ; CHECK-SD:       ; %bb.0:
-; CHECK-SD-NEXT:    add w8, w0, #1
-; CHECK-SD-NEXT:    tst w8, #0xfe
-; CHECK-SD-NEXT:    csel x0, x1, x2, eq
+; CHECK-SD-NEXT:    and w8, w0, #0xff
+; CHECK-SD-NEXT:    sub w8, w8, #255
+; CHECK-SD-NEXT:    cmn w8, #254
+; CHECK-SD-NEXT:    csel x0, x1, x2, lo
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: pr58109b:
 ; CHECK-GI:       ; %bb.0:
-; CHECK-GI-NEXT:    add w8, w0, #1
-; CHECK-GI-NEXT:    and w8, w8, #0xff
-; CHECK-GI-NEXT:    cmp w8, #2
+; CHECK-GI-NEXT:    mov w8, #-255 ; =0xffffff01
+; CHECK-GI-NEXT:    add w8, w8, w0, uxtb
+; CHECK-GI-NEXT:    cmn w8, #254
 ; CHECK-GI-NEXT:    csel x0, x1, x2, lo
 ; CHECK-GI-NEXT:    ret
   %2 = add i8 %0, 1
diff --git a/llvm/test/CodeGen/AArch64/signed-truncation-check.ll b/llvm/test/CodeGen/AArch64/signed-truncation-check.ll
index ab42e6463feee..bb4df6d8935b1 100644
--- a/llvm/test/CodeGen/AArch64/signed-truncation-check.ll
+++ b/llvm/test/CodeGen/AArch64/signed-truncation-check.ll
@@ -396,7 +396,7 @@ define i1 @add_ultcmp_bad_i24_i8(i24 %x) nounwind {
 define i1 @add_ulecmp_bad_i16_i8(i16 %x) nounwind {
 ; CHECK-LABEL: add_ulecmp_bad_i16_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w0, #1
+; CHECK-NEXT:    mov w0, #1 // =0x1
 ; CHECK-NEXT:    ret
   %tmp0 = add i16 %x, 128 ; 1U << (8-1)
   %tmp1 = icmp ule i16 %tmp0, -1 ; when we +1 it, it will wrap to 0
diff --git a/llvm/test/CodeGen/AArch64/typepromotion-overflow.ll b/llvm/test/CodeGen/AArch64/typepromotion-overflow.ll
index ccfbf456693d7..39edc03ced442 100644
--- a/llvm/test/CodeGen/AArch64/typepromotion-overflow.ll
+++ b/llvm/test/CodeGen/AArch64/typepromotion-overflow.ll
@@ -246,9 +246,8 @@ define i32 @safe_sub_var_imm(ptr nocapture readonly %b) local_unnamed_addr #1 {
 ; CHECK-LABEL: safe_sub_var_imm:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ldrb w8, [x0]
-; CHECK-NEXT:    add w8, w8, #8
-; CHECK-NEXT:    and w8, w8, #0xff
-; CHECK-NEXT:    cmp w8, #252
+; CHECK-NEXT:    sub w8, w8, #248
+; CHECK-NEXT:    cmn w8, #4
 ; CHECK-NEXT:    cset w0, hi
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/RISCV/typepromotion-overflow.ll b/llvm/test/CodeGen/RISCV/typepromotion-overflow.ll
index 3740dc675949f..ec7e0ecce80ca 100644
--- a/llvm/test/CodeGen/RISCV/typepromotion-overflow.ll
+++ b/llvm/test/CodeGen/RISCV/typepromotion-overflow.ll
@@ -283,9 +283,8 @@ define i32 @safe_sub_var_imm(ptr nocapture readonly %b) local_unnamed_addr #1 {
 ; CHECK-LABEL: safe_sub_var_imm:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    lbu a0, 0(a0)
-; CHECK-NEXT:    addi a0, a0, 8
-; CHECK-NEXT:    andi a0, a0, 255
-; CHECK-NEXT:    sltiu a0, a0, 253
+; CHECK-NEXT:    addi a0, a0, -248
+; CHECK-NEXT:    sltiu a0, a0, -3
 ; CHECK-NEXT:    xori a0, a0, 1
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/Transforms/TypePromotion/ARM/icmps.ll b/llvm/test/Transforms/TypePromotion/ARM/icmps.ll
index 842aab121b96f..7e03d689fdc9e 100644
--- a/llvm/test/Transforms/TypePromotion/ARM/icmps.ll
+++ b/llvm/test/Transforms/TypePromotion/ARM/icmps.ll
@@ -4,8 +4,9 @@
 define i32 @test_ult_254_inc_imm(i8 zeroext %x) {
 ; CHECK-LABEL: @test_ult_254_inc_imm(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[X:%.*]], 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i8 [[ADD]], -2
+; CHECK-NEXT:    [[TMP0:%.*]] = zext i8 [[X:%.*]] to i32
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[TMP0]], -255
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[ADD]], -2
 ; CHECK-NEXT:    [[RES:%.*]] = select i1 [[CMP]], i32 35, i32 47
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
diff --git a/llvm/test/Transforms/TypePromotion/ARM/wrapping.ll b/llvm/test/Transforms/TypePromotion/ARM/wrapping.ll
index 377708cf71134..78c5e7323ceab 100644
--- a/llvm/test/Transforms/TypePromotion/ARM/wrapping.ll
+++ b/llvm/test/Transforms/TypePromotion/ARM/wrapping.ll
@@ -89,8 +89,9 @@ define i32 @overflow_add_const_limit(i8 zeroext %a, i8 zeroext %b) {
 
 define i32 @overflow_add_positive_const_limit(i8 zeroext %a) {
 ; CHECK-LABEL: @overflow_add_positive_const_limit(
-; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[A:%.*]], 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i8 [[ADD]], -128
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[TMP1]], -255
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i32 [[ADD]], -128
 ; CHECK-NEXT:    [[RES:%.*]] = select i1 [[CMP]], i32 8, i32 16
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
@@ -144,8 +145,9 @@ define i32 @safe_add_underflow_neg(i8 zeroext %a) {
 
 define i32 @overflow_sub_negative_const_limit(i8 zeroext %a) {
 ; CHECK-LABEL: @overflow_sub_negative_const_limit(
-; CHECK-NEXT:    [[SUB:%.*]] = sub i8 [[A:%.*]], -1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i8 [[SUB]], -128
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
+; CHECK-NEXT:    [[SUB:%.*]] = sub i32 [[TMP1]], 255
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i32 [[SUB]], -128
 ; CHECK-NEXT:    [[RES:%.*]] = select i1 [[CMP]], i32 8, i32 16
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;

From 5feaef63c08b6fefb6b0eaff2270ccb14740cca2 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Mon, 11 Mar 2024 19:40:43 +0000
Subject: [PATCH 39/95] [TBAA] Generate tbaa.struct single field with char tag
 for unions. (#84370)

At the moment, distinct fields for each union member are generated. When
copying a union, we don't know which union member is active, so there's
no benefit from recording the different fields. It can result in
converting tbaa.struct fields to incorrect tbaa nodes when extracting
fields.

PR: https://github.com/llvm/llvm-project/pull/84370
---
 clang/lib/CodeGen/CodeGenTBAA.cpp  | 8 ++++++++
 clang/test/CodeGen/tbaa-struct.cpp | 8 +++-----
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/clang/lib/CodeGen/CodeGenTBAA.cpp b/clang/lib/CodeGen/CodeGenTBAA.cpp
index 1f07205a5af22..a1e14c5f0a8c7 100644
--- a/clang/lib/CodeGen/CodeGenTBAA.cpp
+++ b/clang/lib/CodeGen/CodeGenTBAA.cpp
@@ -286,6 +286,14 @@ CodeGenTBAA::CollectFields(uint64_t BaseOffset,
   /* Things not handled yet include: C++ base classes, bitfields, */
 
   if (const RecordType *TTy = QTy->getAs<RecordType>()) {
+    if (TTy->isUnionType()) {
+      uint64_t Size = Context.getTypeSizeInChars(QTy).getQuantity();
+      llvm::MDNode *TBAAType = getChar();
+      llvm::MDNode *TBAATag = getAccessTagInfo(TBAAAccessInfo(TBAAType, Size));
+      Fields.push_back(
+          llvm::MDBuilder::TBAAStructField(BaseOffset, Size, TBAATag));
+      return true;
+    }
     const RecordDecl *RD = TTy->getDecl()->getDefinition();
     if (RD->hasFlexibleArrayMember())
       return false;
diff --git a/clang/test/CodeGen/tbaa-struct.cpp b/clang/test/CodeGen/tbaa-struct.cpp
index 63e4097946448..9b4b7415142d9 100644
--- a/clang/test/CodeGen/tbaa-struct.cpp
+++ b/clang/test/CodeGen/tbaa-struct.cpp
@@ -191,7 +191,7 @@ void copy12(UnionMember2 *a1, UnionMember2 *a2) {
 // (offset, size) = (0,1) char; (4,2) short; (8,4) int; (12,1) char; (16,4) int; (20,4) int
 // CHECK-OLD: [[TS2]] = !{i64 0, i64 1, !{{.*}}, i64 4, i64 2, !{{.*}}, i64 8, i64 4, !{{.*}}, i64 12, i64 1, !{{.*}}, i64 16, i64 4, {{.*}}, i64 20, i64 4, {{.*}}}
 // (offset, size) = (0,8) char; (0,2) char; (4,8) char
-// CHECK-OLD: [[TS3]] = !{i64 0, i64 8, !{{.*}}, i64 0, i64 2, !{{.*}}, i64 4, i64 8, !{{.*}}}
+// CHECK-OLD: [[TS3]] = !{i64 0, i64 12, [[TAG_CHAR]]}
 // CHECK-OLD: [[TS4]] = !{i64 0, i64 1, [[TAG_CHAR]], i64 1, i64 1, [[TAG_CHAR]], i64 2, i64 1, [[TAG_CHAR]]}
 // CHECK-OLD: [[TS5]] = !{i64 0, i64 1, [[TAG_CHAR]], i64 4, i64 1, [[TAG_CHAR]], i64 5, i64 1, [[TAG_CHAR]]}
 // CHECK-OLD: [[TS6]] = !{i64 0, i64 2, [[TAG_CHAR]], i64 2, i64 1, [[TAG_CHAR]], i64 8, i64 8, [[TAG_DOUBLE:!.+]]}
@@ -199,10 +199,8 @@ void copy12(UnionMember2 *a1, UnionMember2 *a2) {
 // CHECK-OLD  [[DOUBLE]] = !{!"double", [[CHAR]], i64 0}
 // CHECK-OLD: [[TS7]] = !{i64 0, i64 1, [[TAG_CHAR]], i64 1, i64 1, [[TAG_CHAR]], i64 2, i64 1, [[TAG_CHAR]], i64 3, i64 1, [[TAG_CHAR]], i64 4, i64 1, [[TAG_CHAR]], i64 8, i64 8, [[TAG_DOUBLE]], i64 16, i64 1, [[TAG_CHAR]]}
 // CHECK-OLD: [[TS8]] = !{i64 0, i64 4, [[TAG_CHAR]], i64 8, i64 8, [[TAG_DOUBLE]]}
-// CHECK-OLD: [[TS9]] = !{i64 0, i64 8, [[TAG_DOUBLE]], i64 0, i64 4, [[TAG_FLOAT:!.+]], i64 8, i64 4, [[TAG_INT]]}
-// CHECK-OLD: [[TAG_FLOAT]]  = !{[[FLOAT:!.+]], [[FLOAT]], i64 0}
-// CHECK-OLD: [[FLOAT]] = !{!"float", [[CHAR]], i64 0}
-// CHECK-OLD: [[TS10]] = !{i64 0, i64 4, [[TAG_INT]], i64 8, i64 8, [[TAG_DOUBLE]], i64 8, i64 4, [[TAG_FLOAT:!.+]]}
+// CHECK-OLD: [[TS9]] = !{i64 0, i64 8, [[TAG_CHAR]], i64 8, i64 4, [[TAG_INT]]}
+// CHECK-OLD: [[TS10]] = !{i64 0, i64 4, [[TAG_INT]], i64 8, i64 8, [[TAG_CHAR]]}
 
 // CHECK-NEW-DAG: [[TYPE_char:!.*]] = !{{{.*}}, i64 1, !"omnipotent char"}
 // CHECK-NEW-DAG: [[TAG_char]] = !{[[TYPE_char]], [[TYPE_char]], i64 0, i64 0}

From 94c988bcfdea596e5c9078be8ec28688eb0d96a3 Mon Sep 17 00:00:00 2001
From: Arthur Eubanks <aeubanks@google.com>
Date: Mon, 11 Mar 2024 19:47:48 +0000
Subject: [PATCH 40/95] [NFC] Remove unused parameter from
 shouldAssumeDSOLocal()

---
 llvm/include/llvm/Target/TargetMachine.h               |  2 +-
 llvm/lib/CodeGen/GlobalMerge.cpp                       |  2 +-
 llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp       |  2 +-
 llvm/lib/Target/AArch64/AArch64Subtarget.cpp           |  5 ++---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp              |  3 +--
 llvm/lib/Target/ARM/ARMISelLowering.cpp                |  8 +++-----
 llvm/lib/Target/ARM/ARMSubtarget.cpp                   |  2 +-
 llvm/lib/Target/CSKY/CSKYISelLowering.cpp              |  8 +++-----
 llvm/lib/Target/Hexagon/HexagonISelLowering.cpp        |  2 +-
 llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp    | 10 ++++------
 llvm/lib/Target/M68k/M68kSubtarget.cpp                 |  6 +++---
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp            |  7 +++----
 llvm/lib/Target/PowerPC/PPCSubtarget.cpp               |  2 +-
 llvm/lib/Target/SystemZ/SystemZSubtarget.cpp           |  2 +-
 llvm/lib/Target/TargetMachine.cpp                      |  5 ++---
 llvm/lib/Target/VE/VEISelLowering.cpp                  |  2 +-
 .../lib/Target/WebAssembly/WebAssemblyISelLowering.cpp |  4 ++--
 llvm/lib/Target/X86/X86Subtarget.cpp                   |  4 ++--
 18 files changed, 33 insertions(+), 43 deletions(-)

diff --git a/llvm/include/llvm/Target/TargetMachine.h b/llvm/include/llvm/Target/TargetMachine.h
index d7ce088cad49f..37df9589e30d6 100644
--- a/llvm/include/llvm/Target/TargetMachine.h
+++ b/llvm/include/llvm/Target/TargetMachine.h
@@ -241,7 +241,7 @@ class TargetMachine {
 
   bool isPositionIndependent() const;
 
-  bool shouldAssumeDSOLocal(const Module &M, const GlobalValue *GV) const;
+  bool shouldAssumeDSOLocal(const GlobalValue *GV) const;
 
   /// Returns true if this target uses emulated TLS.
   bool useEmulatedTLS() const;
diff --git a/llvm/lib/CodeGen/GlobalMerge.cpp b/llvm/lib/CodeGen/GlobalMerge.cpp
index a2b5cbf7bad9f..4941d5b01ae0f 100644
--- a/llvm/lib/CodeGen/GlobalMerge.cpp
+++ b/llvm/lib/CodeGen/GlobalMerge.cpp
@@ -641,7 +641,7 @@ bool GlobalMergeImpl::run(Module &M) {
       continue;
 
     // It's not safe to merge globals that may be preempted
-    if (TM && !TM->shouldAssumeDSOLocal(M, &GV))
+    if (TM && !TM->shouldAssumeDSOLocal(&GV))
       continue;
 
     if (!(Opt.MergeExternal && GV.hasExternalLinkage()) &&
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index a639cba5e35a8..b3dc9de713731 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -491,7 +491,7 @@ TargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
 
   // If the address is not even local to this DSO we will have to load it from
   // a got and then add the offset.
-  if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
+  if (!TM.shouldAssumeDSOLocal(GV))
     return false;
 
   // If the code is position independent we will have to add a base register.
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 23b1deb3697f6..bb268b2ba926c 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -398,7 +398,7 @@ AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
   if (GV->isTagged())
     return AArch64II::MO_GOT;
 
-  if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) {
+  if (!TM.shouldAssumeDSOLocal(GV)) {
     if (GV->hasDLLImportStorageClass()) {
       return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT;
     }
@@ -435,8 +435,7 @@ unsigned AArch64Subtarget::classifyGlobalFunctionReference(
   // NonLazyBind goes via GOT unless we know it's available locally.
   auto *F = dyn_cast<Function>(GV);
   if ((!isTargetMachO() || MachOUseNonLazyBind) && F &&
-      F->hasFnAttribute(Attribute::NonLazyBind) &&
-      !TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
+      F->hasFnAttribute(Attribute::NonLazyBind) && !TM.shouldAssumeDSOLocal(GV))
     return AArch64II::MO_GOT;
 
   if (getTargetTriple().isOSWindows()) {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 1889ab0072880..9bc1b8eb598f3 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6219,8 +6219,7 @@ bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
   // address space for functions to avoid the explicit check.
   return (GV->getValueType()->isFunctionTy() ||
           !isNonGlobalAddrSpace(GV->getAddressSpace())) &&
-         !shouldEmitFixup(GV) &&
-         !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
+         !shouldEmitFixup(GV) && !getTargetMachine().shouldAssumeDSOLocal(GV);
 }
 
 bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index dc81178311b6d..7ac49782ea846 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -2655,12 +2655,10 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   bool isDirect = false;
 
   const TargetMachine &TM = getTargetMachine();
-  const Module *Mod = MF.getFunction().getParent();
   const GlobalValue *GVal = nullptr;
   if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
     GVal = G->getGlobal();
-  bool isStub =
-      !TM.shouldAssumeDSOLocal(*Mod, GVal) && Subtarget->isTargetMachO();
+  bool isStub = !TM.shouldAssumeDSOLocal(GVal) && Subtarget->isTargetMachO();
 
   bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
   bool isLocalARMFunc = false;
@@ -2737,7 +2735,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
         unsigned TargetFlags = ARMII::MO_NO_FLAG;
         if (GVal->hasDLLImportStorageClass())
           TargetFlags = ARMII::MO_DLLIMPORT;
-        else if (!TM.shouldAssumeDSOLocal(*GVal->getParent(), GVal))
+        else if (!TM.shouldAssumeDSOLocal(GVal))
           TargetFlags = ARMII::MO_COFFSTUB;
         Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, /*offset=*/0,
                                             TargetFlags);
@@ -4021,7 +4019,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
   ARMII::TOF TargetFlags = ARMII::MO_NO_FLAG;
   if (GV->hasDLLImportStorageClass())
     TargetFlags = ARMII::MO_DLLIMPORT;
-  else if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
+  else if (!TM.shouldAssumeDSOLocal(GV))
     TargetFlags = ARMII::MO_COFFSTUB;
   EVT PtrVT = getPointerTy(DAG.getDataLayout());
   SDValue Result;
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.cpp b/llvm/lib/Target/ARM/ARMSubtarget.cpp
index 717e61518c6ee..04ba20a17187b 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.cpp
+++ b/llvm/lib/Target/ARM/ARMSubtarget.cpp
@@ -353,7 +353,7 @@ bool ARMSubtarget::isRWPI() const {
 }
 
 bool ARMSubtarget::isGVIndirectSymbol(const GlobalValue *GV) const {
-  if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
+  if (!TM.shouldAssumeDSOLocal(GV))
     return true;
 
   // 32 bit macho has no relocation for a-b if a is undefined, even if b is in
diff --git a/llvm/lib/Target/CSKY/CSKYISelLowering.cpp b/llvm/lib/Target/CSKY/CSKYISelLowering.cpp
index 90f70b83a02d3..869277a391a56 100644
--- a/llvm/lib/Target/CSKY/CSKYISelLowering.cpp
+++ b/llvm/lib/Target/CSKY/CSKYISelLowering.cpp
@@ -649,8 +649,7 @@ SDValue CSKYTargetLowering::LowerCall(CallLoweringInfo &CLI,
 
   if (GlobalAddressSDNode *S = dyn_cast<GlobalAddressSDNode>(Callee)) {
     const GlobalValue *GV = S->getGlobal();
-    bool IsLocal =
-        getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
+    bool IsLocal = getTargetMachine().shouldAssumeDSOLocal(GV);
 
     if (isPositionIndependent() || !Subtarget.has2E3()) {
       IsRegCall = true;
@@ -662,8 +661,7 @@ SDValue CSKYTargetLowering::LowerCall(CallLoweringInfo &CLI,
           cast<GlobalAddressSDNode>(Callee), Ty, DAG, CSKYII::MO_None));
     }
   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
-    bool IsLocal = getTargetMachine().shouldAssumeDSOLocal(
-        *MF.getFunction().getParent(), nullptr);
+    bool IsLocal = getTargetMachine().shouldAssumeDSOLocal(nullptr);
 
     if (isPositionIndependent() || !Subtarget.has2E3()) {
       IsRegCall = true;
@@ -1153,7 +1151,7 @@ SDValue CSKYTargetLowering::LowerGlobalAddress(SDValue Op,
   int64_t Offset = N->getOffset();
 
   const GlobalValue *GV = N->getGlobal();
-  bool IsLocal = getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
+  bool IsLocal = getTargetMachine().shouldAssumeDSOLocal(GV);
   SDValue Addr = getAddr<GlobalAddressSDNode, false>(N, DAG, IsLocal);
 
   // In order to maximise the opportunity for common subexpression elimination,
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
index eda1150835a1f..41462cceef51d 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -1238,7 +1238,7 @@ HexagonTargetLowering::LowerGLOBALADDRESS(SDValue Op, SelectionDAG &DAG) const {
     return DAG.getNode(HexagonISD::CONST32, dl, PtrVT, GA);
   }
 
-  bool UsePCRel = getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
+  bool UsePCRel = getTargetMachine().shouldAssumeDSOLocal(GV);
   if (UsePCRel) {
     SDValue GA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset,
                                             HexagonII::MO_PCREL);
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 2d71423d6dd59..c87f5341d7fea 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -4251,14 +4251,12 @@ LoongArchTargetLowering::LowerCall(CallLoweringInfo &CLI,
   // split it and then direct call can be matched by PseudoCALL.
   if (GlobalAddressSDNode *S = dyn_cast<GlobalAddressSDNode>(Callee)) {
     const GlobalValue *GV = S->getGlobal();
-    unsigned OpFlags =
-        getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV)
-            ? LoongArchII::MO_CALL
-            : LoongArchII::MO_CALL_PLT;
+    unsigned OpFlags = getTargetMachine().shouldAssumeDSOLocal(GV)
+                           ? LoongArchII::MO_CALL
+                           : LoongArchII::MO_CALL_PLT;
     Callee = DAG.getTargetGlobalAddress(S->getGlobal(), DL, PtrVT, 0, OpFlags);
   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
-    unsigned OpFlags = getTargetMachine().shouldAssumeDSOLocal(
-                           *MF.getFunction().getParent(), nullptr)
+    unsigned OpFlags = getTargetMachine().shouldAssumeDSOLocal(nullptr)
                            ? LoongArchII::MO_CALL
                            : LoongArchII::MO_CALL_PLT;
     Callee = DAG.getTargetExternalSymbol(S->getSymbol(), PtrVT, OpFlags);
diff --git a/llvm/lib/Target/M68k/M68kSubtarget.cpp b/llvm/lib/Target/M68k/M68kSubtarget.cpp
index 86e81cd08ea26..3af1e994c01cd 100644
--- a/llvm/lib/Target/M68k/M68kSubtarget.cpp
+++ b/llvm/lib/Target/M68k/M68kSubtarget.cpp
@@ -175,7 +175,7 @@ M68kSubtarget::classifyLocalReference(const GlobalValue *GV) const {
 }
 
 unsigned char M68kSubtarget::classifyExternalReference(const Module &M) const {
-  if (TM.shouldAssumeDSOLocal(M, nullptr))
+  if (TM.shouldAssumeDSOLocal(nullptr))
     return classifyLocalReference(nullptr);
 
   if (isPositionIndependent())
@@ -191,7 +191,7 @@ M68kSubtarget::classifyGlobalReference(const GlobalValue *GV) const {
 
 unsigned char M68kSubtarget::classifyGlobalReference(const GlobalValue *GV,
                                                      const Module &M) const {
-  if (TM.shouldAssumeDSOLocal(M, GV))
+  if (TM.shouldAssumeDSOLocal(GV))
     return classifyLocalReference(GV);
 
   switch (TM.getCodeModel()) {
@@ -240,7 +240,7 @@ unsigned char
 M68kSubtarget::classifyGlobalFunctionReference(const GlobalValue *GV,
                                                const Module &M) const {
   // local always use pc-rel referencing
-  if (TM.shouldAssumeDSOLocal(M, GV))
+  if (TM.shouldAssumeDSOLocal(GV))
     return M68kII::MO_NO_FLAG;
 
   // If the function is marked as non-lazy, generate an indirect call
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 68c80dd9aa5c7..aef2d483c6df1 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -4818,7 +4818,7 @@ static bool callsShareTOCBase(const Function *Caller,
   // If the callee is preemptable, then the static linker will use a plt-stub
   // which saves the toc to the stack, and needs a nop after the call
   // instruction to convert to a toc-restore.
-  if (!TM.shouldAssumeDSOLocal(*Caller->getParent(), CalleeGV))
+  if (!TM.shouldAssumeDSOLocal(CalleeGV))
     return false;
 
   // Functions with PC Relative enabled may clobber the TOC in the same DSO.
@@ -5420,10 +5420,9 @@ static SDValue transformCallee(const SDValue &Callee, SelectionDAG &DAG,
   // Returns true if the callee is local, and false otherwise.
   auto isLocalCallee = [&]() {
     const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
-    const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
     const GlobalValue *GV = G ? G->getGlobal() : nullptr;
 
-    return DAG.getTarget().shouldAssumeDSOLocal(*Mod, GV) &&
+    return DAG.getTarget().shouldAssumeDSOLocal(GV) &&
            !isa_and_nonnull<GlobalIFunc>(GV);
   };
 
@@ -18045,7 +18044,7 @@ bool PPCTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
       return false;
 
   // If the function is local then we have a good chance at tail-calling it
-  return getTargetMachine().shouldAssumeDSOLocal(*Caller->getParent(), Callee);
+  return getTargetMachine().shouldAssumeDSOLocal(Callee);
 }
 
 bool PPCTargetLowering::
diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
index 2735bdee3bcfc..5380ec1c4c0d9 100644
--- a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -189,7 +189,7 @@ bool PPCSubtarget::isGVIndirectSymbol(const GlobalValue *GV) const {
   // Large code model always uses the TOC even for local symbols.
   if (TM.getCodeModel() == CodeModel::Large)
     return true;
-  if (TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
+  if (TM.shouldAssumeDSOLocal(GV))
     return false;
   return true;
 }
diff --git a/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp b/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp
index 491bff7f3c30f..d0badd3692e40 100644
--- a/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp
@@ -122,7 +122,7 @@ bool SystemZSubtarget::isPC32DBLSymbol(const GlobalValue *GV,
 
   // For the small model, all locally-binding symbols are in range.
   if (CM == CodeModel::Small)
-    return TLInfo.getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
+    return TLInfo.getTargetMachine().shouldAssumeDSOLocal(GV);
 
   // For Medium and above, assume that the symbol is not within the 4GB range.
   // Taking the address of locally-defined text would be OK, but that
diff --git a/llvm/lib/Target/TargetMachine.cpp b/llvm/lib/Target/TargetMachine.cpp
index 4258a76b54b92..8b177a89c9192 100644
--- a/llvm/lib/Target/TargetMachine.cpp
+++ b/llvm/lib/Target/TargetMachine.cpp
@@ -160,8 +160,7 @@ static TLSModel::Model getSelectedTLSModel(const GlobalValue *GV) {
   llvm_unreachable("invalid TLS model");
 }
 
-bool TargetMachine::shouldAssumeDSOLocal(const Module &M,
-                                         const GlobalValue *GV) const {
+bool TargetMachine::shouldAssumeDSOLocal(const GlobalValue *GV) const {
   const Triple &TT = getTargetTriple();
   Reloc::Model RM = getRelocationModel();
 
@@ -225,7 +224,7 @@ TLSModel::Model TargetMachine::getTLSModel(const GlobalValue *GV) const {
   bool IsPIE = GV->getParent()->getPIELevel() != PIELevel::Default;
   Reloc::Model RM = getRelocationModel();
   bool IsSharedLibrary = RM == Reloc::PIC_ && !IsPIE;
-  bool IsLocal = shouldAssumeDSOLocal(*GV->getParent(), GV);
+  bool IsLocal = shouldAssumeDSOLocal(GV);
 
   TLSModel::Model Model;
   if (IsSharedLibrary) {
diff --git a/llvm/lib/Target/VE/VEISelLowering.cpp b/llvm/lib/Target/VE/VEISelLowering.cpp
index 0e41a2d7aa03e..6e31c8b7c9a02 100644
--- a/llvm/lib/Target/VE/VEISelLowering.cpp
+++ b/llvm/lib/Target/VE/VEISelLowering.cpp
@@ -653,7 +653,7 @@ SDValue VETargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   auto *CalleeG = dyn_cast<GlobalAddressSDNode>(Callee);
   if (CalleeG)
     GV = CalleeG->getGlobal();
-  bool Local = TM.shouldAssumeDSOLocal(*Mod, GV);
+  bool Local = TM.shouldAssumeDSOLocal(GV);
   bool UsePlt = !Local;
   MachineFunction &MF = DAG.getMachineFunction();
 
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 7c47790d1e351..905ff3b901842 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -1683,7 +1683,7 @@ WebAssemblyTargetLowering::LowerGlobalTLSAddress(SDValue Op,
   if (model == GlobalValue::LocalExecTLSModel ||
       model == GlobalValue::LocalDynamicTLSModel ||
       (model == GlobalValue::GeneralDynamicTLSModel &&
-       getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV))) {
+       getTargetMachine().shouldAssumeDSOLocal(GV))) {
     // For DSO-local TLS variables we use offset from __tls_base
 
     MVT PtrVT = getPointerTy(DAG.getDataLayout());
@@ -1729,7 +1729,7 @@ SDValue WebAssemblyTargetLowering::LowerGlobalAddress(SDValue Op,
   // need special treatment for tables in PIC mode.
   if (isPositionIndependent() &&
       !WebAssembly::isWebAssemblyTableType(GV->getValueType())) {
-    if (getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV)) {
+    if (getTargetMachine().shouldAssumeDSOLocal(GV)) {
       MachineFunction &MF = DAG.getMachineFunction();
       MVT PtrVT = getPointerTy(MF.getDataLayout());
       const char *BaseName;
diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp
index 07f535685e8f9..c2e6ddd7e7fa2 100644
--- a/llvm/lib/Target/X86/X86Subtarget.cpp
+++ b/llvm/lib/Target/X86/X86Subtarget.cpp
@@ -140,7 +140,7 @@ unsigned char X86Subtarget::classifyGlobalReference(const GlobalValue *GV,
     }
   }
 
-  if (TM.shouldAssumeDSOLocal(M, GV))
+  if (TM.shouldAssumeDSOLocal(GV))
     return classifyLocalReference(GV);
 
   if (isTargetCOFF()) {
@@ -190,7 +190,7 @@ X86Subtarget::classifyGlobalFunctionReference(const GlobalValue *GV) const {
 unsigned char
 X86Subtarget::classifyGlobalFunctionReference(const GlobalValue *GV,
                                               const Module &M) const {
-  if (TM.shouldAssumeDSOLocal(M, GV))
+  if (TM.shouldAssumeDSOLocal(GV))
     return X86II::MO_NO_FLAG;
 
   // Functions on COFF can be non-DSO local for three reasons:

From 6462eadbd316aed1b1074ed73bcaf1698886bba1 Mon Sep 17 00:00:00 2001
From: Adrian Prantl <adrian-prantl@users.noreply.github.com>
Date: Mon, 11 Mar 2024 13:04:56 -0700
Subject: [PATCH 41/95] Report back errors in GetNumChildren() (#84265)

This is a proof-of-concept patch that illustrates how to use
`llvm::Expected` return values to surface rich error messages all the way
up to the ValueObjectPrinter.

This is the final patch in the series that includes
https://github.com/llvm/llvm-project/pull/83501 and
https://github.com/llvm/llvm-project/pull/84219
---
 .../lldb/DataFormatters/ValueObjectPrinter.h  |  2 +-
 lldb/source/Core/ValueObjectVariable.cpp      |  3 ++-
 .../DataFormatters/ValueObjectPrinter.cpp     | 22 +++++++++++++++----
 .../TypeSystem/Clang/TypeSystemClang.cpp      |  9 +++++---
 lldb/source/Symbol/CompilerType.cpp           |  3 ++-
 .../functionalities/valobj_errors/Makefile    |  9 ++++++++
 .../valobj_errors/TestValueObjectErrors.py    | 14 ++++++++++++
 .../functionalities/valobj_errors/hidden.c    |  4 ++++
 .../API/functionalities/valobj_errors/main.c  |  9 ++++++++
 .../x86/DW_AT_declaration-with-children.s     |  2 +-
 .../x86/debug-types-missing-signature.test    |  2 +-
 11 files changed, 67 insertions(+), 12 deletions(-)
 create mode 100644 lldb/test/API/functionalities/valobj_errors/Makefile
 create mode 100644 lldb/test/API/functionalities/valobj_errors/TestValueObjectErrors.py
 create mode 100644 lldb/test/API/functionalities/valobj_errors/hidden.c
 create mode 100644 lldb/test/API/functionalities/valobj_errors/main.c

diff --git a/lldb/include/lldb/DataFormatters/ValueObjectPrinter.h b/lldb/include/lldb/DataFormatters/ValueObjectPrinter.h
index fe46321c3186c..32b101a2f9843 100644
--- a/lldb/include/lldb/DataFormatters/ValueObjectPrinter.h
+++ b/lldb/include/lldb/DataFormatters/ValueObjectPrinter.h
@@ -127,7 +127,7 @@ class ValueObjectPrinter {
   void PrintChild(lldb::ValueObjectSP child_sp,
                   const DumpValueObjectOptions::PointerDepth &curr_ptr_depth);
 
-  uint32_t GetMaxNumChildrenToPrint(bool &print_dotdotdot);
+  llvm::Expected<uint32_t> GetMaxNumChildrenToPrint(bool &print_dotdotdot);
 
   void
   PrintChildren(bool value_printed, bool summary_printed,
diff --git a/lldb/source/Core/ValueObjectVariable.cpp b/lldb/source/Core/ValueObjectVariable.cpp
index fb29c22c0ab5a..67d71c90a959d 100644
--- a/lldb/source/Core/ValueObjectVariable.cpp
+++ b/lldb/source/Core/ValueObjectVariable.cpp
@@ -99,7 +99,8 @@ ValueObjectVariable::CalculateNumChildren(uint32_t max) {
   CompilerType type(GetCompilerType());
 
   if (!type.IsValid())
-    return 0;
+    return llvm::make_error<llvm::StringError>("invalid type",
+                                               llvm::inconvertibleErrorCode());
 
   ExecutionContext exe_ctx(GetExecutionContextRef());
   const bool omit_empty_base_classes = true;
diff --git a/lldb/source/DataFormatters/ValueObjectPrinter.cpp b/lldb/source/DataFormatters/ValueObjectPrinter.cpp
index b853199e878c9..bbdc2a9981570 100644
--- a/lldb/source/DataFormatters/ValueObjectPrinter.cpp
+++ b/lldb/source/DataFormatters/ValueObjectPrinter.cpp
@@ -621,13 +621,17 @@ void ValueObjectPrinter::PrintChild(
   }
 }
 
-uint32_t ValueObjectPrinter::GetMaxNumChildrenToPrint(bool &print_dotdotdot) {
+llvm::Expected<uint32_t>
+ValueObjectPrinter::GetMaxNumChildrenToPrint(bool &print_dotdotdot) {
   ValueObject &synth_valobj = GetValueObjectForChildrenGeneration();
 
   if (m_options.m_pointer_as_array)
     return m_options.m_pointer_as_array.m_element_count;
 
-  uint32_t num_children = synth_valobj.GetNumChildrenIgnoringErrors();
+  auto num_children_or_err = synth_valobj.GetNumChildren();
+  if (!num_children_or_err)
+    return num_children_or_err;
+  uint32_t num_children = *num_children_or_err;
   print_dotdotdot = false;
   if (num_children) {
     const size_t max_num_children = GetMostSpecializedValue()
@@ -704,7 +708,12 @@ void ValueObjectPrinter::PrintChildren(
   ValueObject &synth_valobj = GetValueObjectForChildrenGeneration();
 
   bool print_dotdotdot = false;
-  size_t num_children = GetMaxNumChildrenToPrint(print_dotdotdot);
+  auto num_children_or_err = GetMaxNumChildrenToPrint(print_dotdotdot);
+  if (!num_children_or_err) {
+    *m_stream << " <" << llvm::toString(num_children_or_err.takeError()) << '>';
+    return;
+  }
+  uint32_t num_children = *num_children_or_err;
   if (num_children) {
     bool any_children_printed = false;
 
@@ -753,7 +762,12 @@ bool ValueObjectPrinter::PrintChildrenOneLiner(bool hide_names) {
   ValueObject &synth_valobj = GetValueObjectForChildrenGeneration();
 
   bool print_dotdotdot = false;
-  size_t num_children = GetMaxNumChildrenToPrint(print_dotdotdot);
+  auto num_children_or_err = GetMaxNumChildrenToPrint(print_dotdotdot);
+  if (!num_children_or_err) {
+    *m_stream << '<' << llvm::toString(num_children_or_err.takeError()) << '>';
+    return true;
+  }
+  uint32_t num_children = *num_children_or_err;
 
   if (num_children) {
     m_stream->PutChar('(');
diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
index c02b08cb47828..68d9165b90a47 100644
--- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
+++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
@@ -5268,7 +5268,8 @@ TypeSystemClang::GetNumChildren(lldb::opaque_compiler_type_t type,
                                 bool omit_empty_base_classes,
                                 const ExecutionContext *exe_ctx) {
   if (!type)
-    return 0;
+    return llvm::make_error<llvm::StringError>("invalid clang type",
+                                               llvm::inconvertibleErrorCode());
 
   uint32_t num_children = 0;
   clang::QualType qual_type(RemoveWrappingTypes(GetQualType(type)));
@@ -5325,9 +5326,11 @@ TypeSystemClang::GetNumChildren(lldb::opaque_compiler_type_t type,
       }
       num_children += std::distance(record_decl->field_begin(),
                                record_decl->field_end());
-    }
+    } else
+      return llvm::make_error<llvm::StringError>(
+          "incomplete type \"" + GetDisplayTypeName(type).GetString() + "\"",
+          llvm::inconvertibleErrorCode());
     break;
-
   case clang::Type::ObjCObject:
   case clang::Type::ObjCInterface:
     if (GetCompleteQualType(&getASTContext(), qual_type)) {
diff --git a/lldb/source/Symbol/CompilerType.cpp b/lldb/source/Symbol/CompilerType.cpp
index 85dd2d841a5a0..8e4c3c761f784 100644
--- a/lldb/source/Symbol/CompilerType.cpp
+++ b/lldb/source/Symbol/CompilerType.cpp
@@ -777,7 +777,8 @@ CompilerType::GetNumChildren(bool omit_empty_base_classes,
     if (auto type_system_sp = GetTypeSystem())
       return type_system_sp->GetNumChildren(m_type, omit_empty_base_classes,
                                        exe_ctx);
-  return 0;
+  return llvm::make_error<llvm::StringError>("invalid type",
+                                             llvm::inconvertibleErrorCode());
 }
 
 lldb::BasicType CompilerType::GetBasicTypeEnumeration() const {
diff --git a/lldb/test/API/functionalities/valobj_errors/Makefile b/lldb/test/API/functionalities/valobj_errors/Makefile
new file mode 100644
index 0000000000000..d2c966a71411b
--- /dev/null
+++ b/lldb/test/API/functionalities/valobj_errors/Makefile
@@ -0,0 +1,9 @@
+C_SOURCES := main.c
+LD_EXTRAS = hidden.o
+
+a.out: hidden.o
+
+hidden.o: hidden.c
+	$(CC) -g0 -c -o $@ $<
+
+include Makefile.rules
diff --git a/lldb/test/API/functionalities/valobj_errors/TestValueObjectErrors.py b/lldb/test/API/functionalities/valobj_errors/TestValueObjectErrors.py
new file mode 100644
index 0000000000000..8a114005c493b
--- /dev/null
+++ b/lldb/test/API/functionalities/valobj_errors/TestValueObjectErrors.py
@@ -0,0 +1,14 @@
+import lldb
+from lldbsuite.test.decorators import *
+from lldbsuite.test.lldbtest import *
+from lldbsuite.test import lldbutil
+
+
+class ValueObjectErrorsTestCase(TestBase):
+    def test(self):
+        """Test that the error message for a missing type
+        is visible when printing an object"""
+        self.build()
+        lldbutil.run_to_source_breakpoint(self, "break here",
+                                          lldb.SBFileSpec('main.c'))
+        self.expect('v -ptr-depth 1 x', substrs=['<incomplete type "Opaque">'])
diff --git a/lldb/test/API/functionalities/valobj_errors/hidden.c b/lldb/test/API/functionalities/valobj_errors/hidden.c
new file mode 100644
index 0000000000000..d3b93ce1ab9cf
--- /dev/null
+++ b/lldb/test/API/functionalities/valobj_errors/hidden.c
@@ -0,0 +1,4 @@
+struct Opaque {
+  int i, j, k;
+} *global;
+struct Opaque *getOpaque() { return &global; }
diff --git a/lldb/test/API/functionalities/valobj_errors/main.c b/lldb/test/API/functionalities/valobj_errors/main.c
new file mode 100644
index 0000000000000..fabdca9d3a2ec
--- /dev/null
+++ b/lldb/test/API/functionalities/valobj_errors/main.c
@@ -0,0 +1,9 @@
+struct Opaque;
+struct Opaque *getOpaque();
+void puts(const char *);
+
+int main() {
+  struct Opaque *x = getOpaque();
+  puts("break here\n");
+  return (int)x;
+}
diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/DW_AT_declaration-with-children.s b/lldb/test/Shell/SymbolFile/DWARF/x86/DW_AT_declaration-with-children.s
index bc462ca32e9ce..8633d02f492e6 100644
--- a/lldb/test/Shell/SymbolFile/DWARF/x86/DW_AT_declaration-with-children.s
+++ b/lldb/test/Shell/SymbolFile/DWARF/x86/DW_AT_declaration-with-children.s
@@ -12,7 +12,7 @@
 target var a
 # CHECK-LABEL: target var a
 # FIXME: This should also produce some kind of an error.
-# CHECK: (A) a = {}
+# CHECK: (A) a = <incomplete type "A">
 expr a
 # CHECK-LABEL: expr a
 # CHECK: incomplete type 'A' where a complete type is required
diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/debug-types-missing-signature.test b/lldb/test/Shell/SymbolFile/DWARF/x86/debug-types-missing-signature.test
index e94b10a68d4e9..548dd6cdbc275 100644
--- a/lldb/test/Shell/SymbolFile/DWARF/x86/debug-types-missing-signature.test
+++ b/lldb/test/Shell/SymbolFile/DWARF/x86/debug-types-missing-signature.test
@@ -21,6 +21,6 @@ RUN: not %lldb %t -b -o "expression (EC) 1" 2>&1 | FileCheck --check-prefix=PRIN
 PRINTEC: use of undeclared identifier 'EC'
 
 RUN: %lldb %t -b -o "target variable a e ec" | FileCheck --check-prefix=VARS %s
-VARS: (const (unnamed struct)) a = {}
+VARS: (const (unnamed struct)) a = <incomplete type "const (unnamed struct)">
 VARS: (const (unnamed enum)) e = 0x1
 VARS: (const (unnamed enum)) ec = 0x1

From 4628e33a7762384180a72cc9074a7ec49fbbdb95 Mon Sep 17 00:00:00 2001
From: Bill Wendling <morbo@google.com>
Date: Mon, 11 Mar 2024 13:21:00 -0700
Subject: [PATCH 42/95] [NFC][docs] Rename duplicate label to something unique

---
 llvm/docs/LangRef.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index b70220dec9261..77ec72f176d6e 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -143,7 +143,7 @@ It also shows a convention that we follow in this document. When
 demonstrating instructions, we will follow an instruction with a comment
 that defines the type and name of value produced.
 
-.. _strings:
+.. _string_constants:
 
 String constants
 ----------------

From 4d21e75210d936d4f05e8aa9ea33beb552cd19b1 Mon Sep 17 00:00:00 2001
From: lntue <35648136+lntue@users.noreply.github.com>
Date: Mon, 11 Mar 2024 16:27:42 -0400
Subject: [PATCH 43/95] [libc][math][c23] Add fmodl and fmodf128 math
 functions. (#84600)

- Allow the `FMod` template to use different computational types and make
it work for 80-bit long double.
- Switch to using `uint64_t` as the intermediate computational type for
`float`, significantly reducing the latency of `fmodf` when the exponent
difference is large.
---
 libc/config/linux/aarch64/entrypoints.txt     |   2 +
 libc/config/linux/riscv/entrypoints.txt       |   2 +
 libc/config/linux/x86_64/entrypoints.txt      |   2 +
 libc/config/windows/entrypoints.txt           |   1 +
 libc/docs/math/index.rst                      |   4 +-
 libc/spec/stdc.td                             |   3 +-
 libc/src/__support/FPUtil/FPBits.h            |  17 ++-
 libc/src/__support/FPUtil/generic/FMod.h      | 143 +++++++-----------
 libc/src/math/CMakeLists.txt                  |   2 +
 libc/src/math/fmodf128.h                      |  20 +++
 libc/src/math/fmodl.h                         |  18 +++
 libc/src/math/generic/CMakeLists.txt          |  27 +++-
 libc/src/math/generic/fmodf.cpp               |   2 +-
 libc/src/math/generic/fmodf128.cpp            |  19 +++
 libc/src/math/generic/fmodl.cpp               |  19 +++
 .../exhaustive/fmod_generic_impl_test.cpp     |   9 +-
 .../BinaryOpSingleOutputPerf.h                |   2 +-
 .../math/performance_testing/CMakeLists.txt   |  22 +++
 .../performance_testing/fmodf128_perf.cpp     |  16 ++
 .../math/performance_testing/fmodl_perf.cpp   |  16 ++
 libc/test/src/math/smoke/CMakeLists.txt       |  36 +++++
 libc/test/src/math/smoke/fmodf128_test.cpp    |  13 ++
 libc/test/src/math/smoke/fmodl_test.cpp       |  13 ++
 23 files changed, 303 insertions(+), 105 deletions(-)
 create mode 100644 libc/src/math/fmodf128.h
 create mode 100644 libc/src/math/fmodl.h
 create mode 100644 libc/src/math/generic/fmodf128.cpp
 create mode 100644 libc/src/math/generic/fmodl.cpp
 create mode 100644 libc/test/src/math/performance_testing/fmodf128_perf.cpp
 create mode 100644 libc/test/src/math/performance_testing/fmodl_perf.cpp
 create mode 100644 libc/test/src/math/smoke/fmodf128_test.cpp
 create mode 100644 libc/test/src/math/smoke/fmodl_test.cpp

diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt
index b447b5dfe0989..1656973cb27c8 100644
--- a/libc/config/linux/aarch64/entrypoints.txt
+++ b/libc/config/linux/aarch64/entrypoints.txt
@@ -335,6 +335,7 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.fminl
     libc.src.math.fmod
     libc.src.math.fmodf
+    libc.src.math.fmodl
     libc.src.math.frexp
     libc.src.math.frexpf
     libc.src.math.frexpl
@@ -426,6 +427,7 @@ if(LIBC_TYPES_HAS_FLOAT128)
     libc.src.math.floorf128
     libc.src.math.fmaxf128
     libc.src.math.fminf128
+    libc.src.math.fmodf128
     libc.src.math.frexpf128
     libc.src.math.ilogbf128
     libc.src.math.ldexpf128
diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt
index 5175b14adf2e7..07d1acfcfe079 100644
--- a/libc/config/linux/riscv/entrypoints.txt
+++ b/libc/config/linux/riscv/entrypoints.txt
@@ -343,6 +343,7 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.fmaxl
     libc.src.math.fmod
     libc.src.math.fmodf
+    libc.src.math.fmodl
     libc.src.math.frexp
     libc.src.math.frexpf
     libc.src.math.frexpl
@@ -434,6 +435,7 @@ if(LIBC_TYPES_HAS_FLOAT128)
     libc.src.math.floorf128
     libc.src.math.fmaxf128
     libc.src.math.fminf128
+    libc.src.math.fmodf128
     libc.src.math.frexpf128
     libc.src.math.ilogbf128
     libc.src.math.ldexpf128
diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index b8bec14a3d2a6..e0324061a9c78 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -376,6 +376,7 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.fmaxl
     libc.src.math.fmod
     libc.src.math.fmodf
+    libc.src.math.fmodl
     libc.src.math.frexp
     libc.src.math.frexpf
     libc.src.math.frexpl
@@ -469,6 +470,7 @@ if(LIBC_TYPES_HAS_FLOAT128)
     libc.src.math.floorf128
     libc.src.math.fmaxf128
     libc.src.math.fminf128
+    libc.src.math.fmodf128
     libc.src.math.frexpf128
     libc.src.math.ilogbf128
     libc.src.math.ldexpf128
diff --git a/libc/config/windows/entrypoints.txt b/libc/config/windows/entrypoints.txt
index 1c9ed7bbcfed6..d6227a427afe2 100644
--- a/libc/config/windows/entrypoints.txt
+++ b/libc/config/windows/entrypoints.txt
@@ -155,6 +155,7 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.fmaxl
     libc.src.math.fmod
     libc.src.math.fmodf
+    libc.src.math.fmodl
     libc.src.math.frexp
     libc.src.math.frexpf
     libc.src.math.frexpl
diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst
index b22ed5127c179..6984b785125f1 100644
--- a/libc/docs/math/index.rst
+++ b/libc/docs/math/index.rst
@@ -169,7 +169,9 @@ Basic Operations
 +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
 | fmodf        | |check| | |check| | |check| | |check| | |check| |         |         | |check| | |check| | |check| |         |         |
 +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| fmodl        |         |         |         |         |         |         |         |         |         |         |         |         |
+| fmodl        | |check| | |check| |         | |check| | |check| |         |         | |check| |         |         |         |         |
++--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
+| fmodf128     | |check| | |check| |         | |check| |         |         |         |         |         |         |         |         |
 +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
 | frexp        | |check| | |check| | |check| | |check| | |check| |         |         | |check| | |check| | |check| |         |         |
 +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td
index d91f5c1f72334..1f14fe758130e 100644
--- a/libc/spec/stdc.td
+++ b/libc/spec/stdc.td
@@ -405,8 +405,9 @@ def StdC : StandardSpec<"stdc"> {
           FunctionSpec<"fmaf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatType>, ArgSpec<FloatType>]>,
 
           FunctionSpec<"fmod", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>]>,
-
           FunctionSpec<"fmodf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatType>]>,
+          FunctionSpec<"fmodl", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>, ArgSpec<LongDoubleType>]>,
+          GuardedFunctionSpec<"fmodf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>, ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT128">,
 
           FunctionSpec<"frexp", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<IntPtr>]>,
           FunctionSpec<"frexpf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<IntPtr>]>,
diff --git a/libc/src/__support/FPUtil/FPBits.h b/libc/src/__support/FPUtil/FPBits.h
index 7b3882dde1b72..b06b3f7b73959 100644
--- a/libc/src/__support/FPUtil/FPBits.h
+++ b/libc/src/__support/FPUtil/FPBits.h
@@ -640,6 +640,7 @@ struct FPRepImpl : public FPRepSem<fp_type, RetT> {
   using UP::EXP_MASK;
   using UP::FRACTION_MASK;
   using UP::SIG_LEN;
+  using UP::SIG_MASK;
   using UP::SIGN_MASK;
   LIBC_INLINE_VAR static constexpr int MAX_BIASED_EXPONENT =
       (1 << UP::EXP_LEN) - 1;
@@ -729,6 +730,9 @@ struct FPRepImpl : public FPRepSem<fp_type, RetT> {
     bits = UP::merge(bits, mantVal, FRACTION_MASK);
   }
 
+  LIBC_INLINE constexpr void set_significand(StorageType sigVal) {
+    bits = UP::merge(bits, sigVal, SIG_MASK);
+  }
   // Unsafe function to create a floating point representation.
   // It simply packs the sign, biased exponent and mantissa values without
   // checking bound nor normalization.
@@ -755,20 +759,19 @@ struct FPRepImpl : public FPRepSem<fp_type, RetT> {
   //   4) "number" zero value is not processed correctly.
   //   5) Number is unsigned, so the result can be only positive.
   LIBC_INLINE static constexpr RetT make_value(StorageType number, int ep) {
-    static_assert(fp_type != FPType::X86_Binary80,
-                  "This function is not tested for X86 Extended Precision");
-    FPRepImpl result;
-    // offset: +1 for sign, but -1 for implicit first bit
-    int lz = cpp::countl_zero(number) - UP::EXP_LEN;
+    FPRepImpl result(0);
+    int lz =
+        UP::FRACTION_LEN + 1 - (UP::STORAGE_LEN - cpp::countl_zero(number));
+
     number <<= lz;
     ep -= lz;
 
     if (LIBC_LIKELY(ep >= 0)) {
       // Implicit number bit will be removed by mask
-      result.set_mantissa(number);
+      result.set_significand(number);
       result.set_biased_exponent(ep + 1);
     } else {
-      result.set_mantissa(number >> -ep);
+      result.set_significand(number >> -ep);
     }
     return RetT(result.uintval());
   }
diff --git a/libc/src/__support/FPUtil/generic/FMod.h b/libc/src/__support/FPUtil/generic/FMod.h
index 2d31290bc4bc2..24fb264b779b7 100644
--- a/libc/src/__support/FPUtil/generic/FMod.h
+++ b/libc/src/__support/FPUtil/generic/FMod.h
@@ -117,63 +117,9 @@ namespace generic {
 // be implemented in another handler.
 // Signaling NaN converted to quiet NaN with FE_INVALID exception.
 //    https://www.open-std.org/JTC1/SC22/WG14/www/docs/n1011.htm
-template <typename T> struct FModExceptionalInputHandler {
-
-  static_assert(cpp::is_floating_point_v<T>,
-                "FModCStandardWrapper instantiated with invalid type.");
-
-  LIBC_INLINE static bool pre_check(T x, T y, T &out) {
-    using FPB = fputil::FPBits<T>;
-    const T quiet_nan = FPB::quiet_nan().get_val();
-    FPB sx(x), sy(y);
-    if (LIBC_LIKELY(!sy.is_zero() && !sy.is_inf_or_nan() &&
-                    !sx.is_inf_or_nan())) {
-      return false;
-    }
-
-    if (sx.is_nan() || sy.is_nan()) {
-      if ((sx.is_nan() && !sx.is_quiet_nan()) ||
-          (sy.is_nan() && !sy.is_quiet_nan()))
-        fputil::raise_except_if_required(FE_INVALID);
-      out = quiet_nan;
-      return true;
-    }
-
-    if (sx.is_inf() || sy.is_zero()) {
-      fputil::raise_except_if_required(FE_INVALID);
-      fputil::set_errno_if_required(EDOM);
-      out = quiet_nan;
-      return true;
-    }
-
-    if (sy.is_inf()) {
-      out = x;
-      return true;
-    }
-
-    // case where x == 0
-    out = x;
-    return true;
-  }
-};
-
-template <typename T> struct FModFastMathWrapper {
-
-  static_assert(cpp::is_floating_point_v<T>,
-                "FModFastMathWrapper instantiated with invalid type.");
-
-  static bool pre_check(T, T, T &) { return false; }
-};
-
-template <typename T> class FModDivisionSimpleHelper {
-private:
-  using StorageType = typename FPBits<T>::StorageType;
-
-public:
-  LIBC_INLINE constexpr static StorageType execute(int exp_diff,
-                                                   int sides_zeroes_count,
-                                                   StorageType m_x,
-                                                   StorageType m_y) {
+template <typename T> struct FModDivisionSimpleHelper {
+  LIBC_INLINE constexpr static T execute(int exp_diff, int sides_zeroes_count,
+                                         T m_x, T m_y) {
     while (exp_diff > sides_zeroes_count) {
       exp_diff -= sides_zeroes_count;
       m_x <<= sides_zeroes_count;
@@ -185,28 +131,21 @@ template <typename T> class FModDivisionSimpleHelper {
   }
 };
 
-template <typename T> class FModDivisionInvMultHelper {
-private:
-  using FPB = FPBits<T>;
-  using StorageType = typename FPB::StorageType;
-
-public:
-  LIBC_INLINE constexpr static StorageType execute(int exp_diff,
-                                                   int sides_zeroes_count,
-                                                   StorageType m_x,
-                                                   StorageType m_y) {
+template <typename T> struct FModDivisionInvMultHelper {
+  LIBC_INLINE constexpr static T execute(int exp_diff, int sides_zeroes_count,
+                                         T m_x, T m_y) {
+    constexpr int LENGTH = sizeof(T) * CHAR_BIT;
     if (exp_diff > sides_zeroes_count) {
-      StorageType inv_hy = (cpp::numeric_limits<StorageType>::max() / m_y);
+      T inv_hy = (cpp::numeric_limits<T>::max() / m_y);
       while (exp_diff > sides_zeroes_count) {
         exp_diff -= sides_zeroes_count;
-        StorageType hd =
-            (m_x * inv_hy) >> (FPB::TOTAL_LEN - sides_zeroes_count);
+        T hd = (m_x * inv_hy) >> (LENGTH - sides_zeroes_count);
         m_x <<= sides_zeroes_count;
         m_x -= hd * m_y;
         while (LIBC_UNLIKELY(m_x > m_y))
           m_x -= m_y;
       }
-      StorageType hd = (m_x * inv_hy) >> (FPB::TOTAL_LEN - exp_diff);
+      T hd = (m_x * inv_hy) >> (LENGTH - exp_diff);
       m_x <<= exp_diff;
       m_x -= hd * m_y;
       while (LIBC_UNLIKELY(m_x > m_y))
@@ -219,22 +158,49 @@ template <typename T> class FModDivisionInvMultHelper {
   }
 };
 
-template <typename T, class Wrapper = FModExceptionalInputHandler<T>,
-          class DivisionHelper = FModDivisionSimpleHelper<T>>
+template <typename T, typename U = typename FPBits<T>::StorageType,
+          typename DivisionHelper = FModDivisionSimpleHelper<U>>
 class FMod {
-  static_assert(cpp::is_floating_point_v<T>,
+  static_assert(cpp::is_floating_point_v<T> && cpp::is_unsigned_v<U> &&
+                    (sizeof(U) * CHAR_BIT > FPBits<T>::FRACTION_LEN),
                 "FMod instantiated with invalid type.");
 
 private:
   using FPB = FPBits<T>;
   using StorageType = typename FPB::StorageType;
 
+  LIBC_INLINE static bool pre_check(T x, T y, T &out) {
+    using FPB = fputil::FPBits<T>;
+    const T quiet_nan = FPB::quiet_nan().get_val();
+    FPB sx(x), sy(y);
+    if (LIBC_LIKELY(!sy.is_zero() && !sy.is_inf_or_nan() &&
+                    !sx.is_inf_or_nan()))
+      return false;
+
+    if (sx.is_nan() || sy.is_nan()) {
+      if (sx.is_signaling_nan() || sy.is_signaling_nan())
+        fputil::raise_except_if_required(FE_INVALID);
+      out = quiet_nan;
+      return true;
+    }
+
+    if (sx.is_inf() || sy.is_zero()) {
+      fputil::raise_except_if_required(FE_INVALID);
+      fputil::set_errno_if_required(EDOM);
+      out = quiet_nan;
+      return true;
+    }
+
+    out = x;
+    return true;
+  }
+
   LIBC_INLINE static constexpr FPB eval_internal(FPB sx, FPB sy) {
 
     if (LIBC_LIKELY(sx.uintval() <= sy.uintval())) {
       if (sx.uintval() < sy.uintval())
         return sx;             // |x|<|y| return x
-      return FPB(FPB::zero()); // |x|=|y| return 0.0
+      return FPB::zero();      // |x|=|y| return 0.0
     }
 
     int e_x = sx.get_biased_exponent();
@@ -247,11 +213,11 @@ class FMod {
       StorageType m_y = sy.get_explicit_mantissa();
       StorageType d = (e_x == e_y) ? (m_x - m_y) : (m_x << (e_x - e_y)) % m_y;
       if (d == 0)
-        return FPB(FPB::zero());
+        return FPB::zero();
       // iy - 1 because of "zero power" for number with power 1
       return FPB::make_value(d, e_y - 1);
     }
-    /* Both subnormal special case. */
+    // Both subnormal special case.
     if (LIBC_UNLIKELY(e_x == 0 && e_y == 0)) {
       FPB d;
       d.set_mantissa(sx.uintval() % sy.uintval());
@@ -259,15 +225,17 @@ class FMod {
     }
 
     // Note that hx is not subnormal by conditions above.
-    StorageType m_x = sx.get_explicit_mantissa();
+    U m_x = static_cast<U>(sx.get_explicit_mantissa());
     e_x--;
 
-    StorageType m_y = sy.get_explicit_mantissa();
-    int lead_zeros_m_y = FPB::EXP_LEN;
+    U m_y = static_cast<U>(sy.get_explicit_mantissa());
+    constexpr int DEFAULT_LEAD_ZEROS =
+        sizeof(U) * CHAR_BIT - FPB::FRACTION_LEN - 1;
+    int lead_zeros_m_y = DEFAULT_LEAD_ZEROS;
     if (LIBC_LIKELY(e_y > 0)) {
       e_y--;
     } else {
-      m_y = sy.get_mantissa();
+      m_y = static_cast<U>(sy.get_mantissa());
       lead_zeros_m_y = cpp::countl_zero(m_y);
     }
 
@@ -286,26 +254,27 @@ class FMod {
 
     {
       // Shift hx left until the end or n = 0
-      int left_shift = exp_diff < int(FPB::EXP_LEN) ? exp_diff : FPB::EXP_LEN;
+      int left_shift =
+          exp_diff < DEFAULT_LEAD_ZEROS ? exp_diff : DEFAULT_LEAD_ZEROS;
       m_x <<= left_shift;
       exp_diff -= left_shift;
     }
 
     m_x %= m_y;
     if (LIBC_UNLIKELY(m_x == 0))
-      return FPB(FPB::zero());
+      return FPB::zero();
 
     if (exp_diff == 0)
-      return FPB::make_value(m_x, e_y);
+      return FPB::make_value(static_cast<StorageType>(m_x), e_y);
 
-    /* hx next can't be 0, because hx < hy, hy % 2 == 1 hx * 2^i % hy != 0 */
+    // hx next can't be 0, because hx < hy, hy % 2 == 1 hx * 2^i % hy != 0
     m_x = DivisionHelper::execute(exp_diff, sides_zeroes_count, m_x, m_y);
-    return FPB::make_value(m_x, e_y);
+    return FPB::make_value(static_cast<StorageType>(m_x), e_y);
   }
 
 public:
   LIBC_INLINE static T eval(T x, T y) {
-    if (T out; Wrapper::pre_check(x, y, out))
+    if (T out; LIBC_UNLIKELY(pre_check(x, y, out)))
       return out;
     FPB sx(x), sy(y);
     Sign sign = sx.sign();
diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt
index 6c06d383ec2b0..bba02aa78a231 100644
--- a/libc/src/math/CMakeLists.txt
+++ b/libc/src/math/CMakeLists.txt
@@ -119,6 +119,8 @@ add_math_entrypoint_object(fminf128)
 
 add_math_entrypoint_object(fmod)
 add_math_entrypoint_object(fmodf)
+add_math_entrypoint_object(fmodl)
+add_math_entrypoint_object(fmodf128)
 
 add_math_entrypoint_object(frexp)
 add_math_entrypoint_object(frexpf)
diff --git a/libc/src/math/fmodf128.h b/libc/src/math/fmodf128.h
new file mode 100644
index 0000000000000..b3242705f025e
--- /dev/null
+++ b/libc/src/math/fmodf128.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for fmodf128 ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_FMODF128_H
+#define LLVM_LIBC_SRC_MATH_FMODF128_H
+
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE {
+
+float128 fmodf128(float128 x, float128 y);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_MATH_FMODF128_H
diff --git a/libc/src/math/fmodl.h b/libc/src/math/fmodl.h
new file mode 100644
index 0000000000000..f259ddb238a8e
--- /dev/null
+++ b/libc/src/math/fmodl.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for fmodl -------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_FMODL_H
+#define LLVM_LIBC_SRC_MATH_FMODL_H
+
+namespace LIBC_NAMESPACE {
+
+long double fmodl(long double x, long double y);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_MATH_FMODL_H
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index 933a05dad157c..bc4e9b34cfc2f 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -1859,7 +1859,6 @@ add_entrypoint_object(
   HDRS
     ../fmod.h
   DEPENDS
-    libc.include.math
     libc.src.__support.FPUtil.generic.fmod
   COMPILE_OPTIONS
     -O3
@@ -1872,7 +1871,31 @@ add_entrypoint_object(
   HDRS
     ../fmodf.h
   DEPENDS
-    libc.include.math
+    libc.src.__support.FPUtil.generic.fmod
+  COMPILE_OPTIONS
+    -O3
+)
+
+add_entrypoint_object(
+  fmodl
+  SRCS
+    fmodl.cpp
+  HDRS
+    ../fmodl.h
+  DEPENDS
+    libc.src.__support.FPUtil.generic.fmod
+  COMPILE_OPTIONS
+    -O3
+)
+
+add_entrypoint_object(
+  fmodf128
+  SRCS
+    fmodf128.cpp
+  HDRS
+    ../fmodf128.h
+  DEPENDS
+    libc.src.__support.macros.properties.types
     libc.src.__support.FPUtil.generic.fmod
   COMPILE_OPTIONS
     -O3
diff --git a/libc/src/math/generic/fmodf.cpp b/libc/src/math/generic/fmodf.cpp
index 7a29ff1f18d31..9a9e46e29b466 100644
--- a/libc/src/math/generic/fmodf.cpp
+++ b/libc/src/math/generic/fmodf.cpp
@@ -13,7 +13,7 @@
 namespace LIBC_NAMESPACE {
 
 LLVM_LIBC_FUNCTION(float, fmodf, (float x, float y)) {
-  return fputil::generic::FMod<float>::eval(x, y);
+  return fputil::generic::FMod<float, uint64_t>::eval(x, y);
 }
 
 } // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/fmodf128.cpp b/libc/src/math/generic/fmodf128.cpp
new file mode 100644
index 0000000000000..08a379702d889
--- /dev/null
+++ b/libc/src/math/generic/fmodf128.cpp
@@ -0,0 +1,19 @@
+//===-- Implementation of fmodf128 function -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/fmodf128.h"
+#include "src/__support/FPUtil/generic/FMod.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float128, fmodf128, (float128 x, float128 y)) {
+  return fputil::generic::FMod<float128>::eval(x, y);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/fmodl.cpp b/libc/src/math/generic/fmodl.cpp
new file mode 100644
index 0000000000000..23a3702890557
--- /dev/null
+++ b/libc/src/math/generic/fmodl.cpp
@@ -0,0 +1,19 @@
+//===-- Implementation of fmodl function ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/fmodl.h"
+#include "src/__support/FPUtil/generic/FMod.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(long double, fmodl, (long double x, long double y)) {
+  return fputil::generic::FMod<long double>::eval(x, y);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/test/src/math/exhaustive/fmod_generic_impl_test.cpp b/libc/test/src/math/exhaustive/fmod_generic_impl_test.cpp
index b47d24c54869b..25a5e3898599a 100644
--- a/libc/test/src/math/exhaustive/fmod_generic_impl_test.cpp
+++ b/libc/test/src/math/exhaustive/fmod_generic_impl_test.cpp
@@ -19,10 +19,11 @@ namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 template <typename T, bool InverseMultiplication>
 class LlvmLibcFModTest : public LIBC_NAMESPACE::testing::Test {
 
+  using U = typename LIBC_NAMESPACE::fputil::FPBits<T>::StorageType;
   using DivisionHelper = LIBC_NAMESPACE::cpp::conditional_t<
       InverseMultiplication,
-      LIBC_NAMESPACE::fputil::generic::FModDivisionInvMultHelper<T>,
-      LIBC_NAMESPACE::fputil::generic::FModDivisionSimpleHelper<T>>;
+      LIBC_NAMESPACE::fputil::generic::FModDivisionInvMultHelper<U>,
+      LIBC_NAMESPACE::fputil::generic::FModDivisionSimpleHelper<U>>;
 
   static constexpr std::array<T, 11> test_bases = {
       T(0.0),
@@ -39,9 +40,7 @@ class LlvmLibcFModTest : public LIBC_NAMESPACE::testing::Test {
 
 public:
   void testExtensive() {
-    using FMod = LIBC_NAMESPACE::fputil::generic::FMod<
-        T, LIBC_NAMESPACE::fputil::generic::FModFastMathWrapper<T>,
-        DivisionHelper>;
+    using FMod = LIBC_NAMESPACE::fputil::generic::FMod<T, U, DivisionHelper>;
     using nl = std::numeric_limits<T>;
     int min2 = nl::min_exponent - nl::digits - 5;
     int max2 = nl::max_exponent + 3;
diff --git a/libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h b/libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h
index 68d37b46b77c7..504d1be94b891 100644
--- a/libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h
+++ b/libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h
@@ -86,7 +86,7 @@ template <typename T> class BinaryOpSingleOutputPerf {
            "close to each other:\n";
     run_perf_in_range(
         myFunc, otherFunc, /* startingBit= */ FPBits(T(0x1.0p-10)).uintval(),
-        /* endingBit= */ FPBits(T(0x1.0p+10)).uintval(), 10'000'001, log);
+        /* endingBit= */ FPBits(T(0x1.0p+10)).uintval(), 1'001'001, log);
   }
 
   static void run_diff(Func myFunc, Func otherFunc, const char *logFile) {
diff --git a/libc/test/src/math/performance_testing/CMakeLists.txt b/libc/test/src/math/performance_testing/CMakeLists.txt
index d20c2eb303a7c..d1fb24e37f728 100644
--- a/libc/test/src/math/performance_testing/CMakeLists.txt
+++ b/libc/test/src/math/performance_testing/CMakeLists.txt
@@ -331,3 +331,25 @@ add_perf_binary(
   COMPILE_OPTIONS
     -fno-builtin
 )
+
+add_perf_binary(
+  fmodl_perf
+  SRCS
+    fmodl_perf.cpp
+  DEPENDS
+    .single_input_single_output_diff
+    libc.src.math.fmodl
+  COMPILE_OPTIONS
+    -fno-builtin
+)
+
+add_perf_binary(
+  fmodf128_perf
+  SRCS
+    fmodf128_perf.cpp
+  DEPENDS
+    .single_input_single_output_diff
+    libc.src.math.fmodf128
+  COMPILE_OPTIONS
+    -fno-builtin
+)
diff --git a/libc/test/src/math/performance_testing/fmodf128_perf.cpp b/libc/test/src/math/performance_testing/fmodf128_perf.cpp
new file mode 100644
index 0000000000000..8165e9254dd56
--- /dev/null
+++ b/libc/test/src/math/performance_testing/fmodf128_perf.cpp
@@ -0,0 +1,16 @@
+//===-- Performance test for fmodf128 -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "BinaryOpSingleOutputDiff.h"
+
+#include "src/math/fmodf128.h"
+
+#include <math.h>
+
+BINARY_OP_SINGLE_OUTPUT_PERF(float128, LIBC_NAMESPACE::fmodf128, ::fmodf128,
+                             "fmodf128_perf.log")
diff --git a/libc/test/src/math/performance_testing/fmodl_perf.cpp b/libc/test/src/math/performance_testing/fmodl_perf.cpp
new file mode 100644
index 0000000000000..aefdf2d6b42fc
--- /dev/null
+++ b/libc/test/src/math/performance_testing/fmodl_perf.cpp
@@ -0,0 +1,16 @@
+//===-- Performance test for fmodl ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "BinaryOpSingleOutputDiff.h"
+
+#include "src/math/fmodl.h"
+
+#include <math.h>
+
+BINARY_OP_SINGLE_OUTPUT_PERF(long double, LIBC_NAMESPACE::fmodl, ::fmodl,
+                             "fmodl_perf.log")
diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt
index 8d3871dd427aa..d9be172056a83 100644
--- a/libc/test/src/math/smoke/CMakeLists.txt
+++ b/libc/test/src/math/smoke/CMakeLists.txt
@@ -1793,6 +1793,42 @@ add_fp_unittest(
   UNIT_TEST_ONLY
 )
 
+add_fp_unittest(
+  fmodl_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    fmodl_test.cpp
+  HDRS
+    FModTest.h
+  DEPENDS
+    libc.include.math
+    libc.src.errno.errno
+    libc.src.math.fmodl
+    libc.src.__support.FPUtil.basic_operations
+    libc.src.__support.FPUtil.nearest_integer_operations
+  # FIXME: Currently fails on the GPU build.
+  UNIT_TEST_ONLY
+)
+
+add_fp_unittest(
+  fmodf128_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    fmodf128_test.cpp
+  HDRS
+    FModTest.h
+  DEPENDS
+    libc.include.math
+    libc.src.errno.errno
+    libc.src.math.fmodf128
+    libc.src.__support.FPUtil.basic_operations
+    libc.src.__support.FPUtil.nearest_integer_operations
+  # FIXME: Currently fails on the GPU build.
+  UNIT_TEST_ONLY
+)
+
 add_fp_unittest(
   coshf_test
   SUITE
diff --git a/libc/test/src/math/smoke/fmodf128_test.cpp b/libc/test/src/math/smoke/fmodf128_test.cpp
new file mode 100644
index 0000000000000..f75aadac84386
--- /dev/null
+++ b/libc/test/src/math/smoke/fmodf128_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for fmodf128 --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "FModTest.h"
+
+#include "src/math/fmodf128.h"
+
+LIST_FMOD_TESTS(float128, LIBC_NAMESPACE::fmodf128)
diff --git a/libc/test/src/math/smoke/fmodl_test.cpp b/libc/test/src/math/smoke/fmodl_test.cpp
new file mode 100644
index 0000000000000..b69ed8ec85c84
--- /dev/null
+++ b/libc/test/src/math/smoke/fmodl_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for fmodl -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "FModTest.h"
+
+#include "src/math/fmodl.h"
+
+LIST_FMOD_TESTS(long double, LIBC_NAMESPACE::fmodl)

From f4c1e8747b33815969e60a53cab3dac4d0f55f6c Mon Sep 17 00:00:00 2001
From: David Benjamin <davidben@google.com>
Date: Mon, 11 Mar 2024 16:28:05 -0400
Subject: [PATCH 44/95] [libc++][hardening] Reclassify string_view(ptr, len)'s
 size assertion (#79297)

The comment makes this error condition sound less problematic than it
is. If the length does not match the pointer's bounds, all
bounds-checking in string_view goes wrong. A length over PTRDIFF_MAX
cannot possibly be a correct bounds and was mostly an underflowed
negative number cast to a size_t.

The documentation for _LIBCPP_ASSERT_VALID_INPUT_RANGE discusses ranges
being valid, including an iterator and a count, which seemed appropriate
here.
---
 libcxx/include/string_view | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/libcxx/include/string_view b/libcxx/include/string_view
index e0dd5c5b19ace..e8584a69c1e1b 100644
--- a/libcxx/include/string_view
+++ b/libcxx/include/string_view
@@ -310,9 +310,10 @@ public:
       : __data_(__s),
         __size_(__len) {
 #if _LIBCPP_STD_VER >= 14
-    // This will result in creating an invalid `string_view` object -- some calculations involving `size` would
-    // overflow, making it effectively truncated.
-    _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(
+    // Allocations must fit in `ptrdiff_t` for pointer arithmetic to work. If `__len` exceeds it, the input
+    // range could not have been valid. Most likely the caller underflowed some arithmetic and inadvertently
+    // passed in a negative length.
+    _LIBCPP_ASSERT_VALID_INPUT_RANGE(
         __len <= static_cast<size_type>(numeric_limits<difference_type>::max()),
         "string_view::string_view(_CharT *, size_t): length does not fit in difference_type");
     _LIBCPP_ASSERT_NON_NULL(

From f832beebda6d31fef01a8cb680b82df33c666eef Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?=
 =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?=
 =?UTF-8?q?=E3=83=B3=29?= <clementval@gmail.com>
Date: Mon, 11 Mar 2024 13:41:27 -0700
Subject: [PATCH 45/95] [flang][NFC] Use the tablegen definition for FIR
 dialect (#84822)

FIROpsDialect has been declared manually with a class inheriting from
the MLIR Dialect class. Another declaration is done using tablegen here
`flang/include/flang/Optimizer/Dialect/FIRDialect.td`. This patch merges
the two declarations so we can use the tablegen-generated class for all
the FIROpsDialect needs.

This is part of a series of patches to bring FIR up to date with the
current MLIR infra.
---
 .../flang/Optimizer/Dialect/CMakeLists.txt    |  4 +++
 .../flang/Optimizer/Dialect/FIRAttr.td        |  6 ++--
 .../flang/Optimizer/Dialect/FIRDialect.h      | 33 ++-----------------
 .../flang/Optimizer/Dialect/FIRDialect.td     | 29 ++++++++++++++--
 .../include/flang/Optimizer/Dialect/FIROps.td |  2 +-
 .../flang/Optimizer/Dialect/FIRTypes.td       |  2 +-
 flang/lib/Optimizer/Dialect/FIRDialect.cpp    | 11 ++-----
 7 files changed, 40 insertions(+), 47 deletions(-)

diff --git a/flang/include/flang/Optimizer/Dialect/CMakeLists.txt b/flang/include/flang/Optimizer/Dialect/CMakeLists.txt
index fe9864a26295d..f00993d4d3778 100644
--- a/flang/include/flang/Optimizer/Dialect/CMakeLists.txt
+++ b/flang/include/flang/Optimizer/Dialect/CMakeLists.txt
@@ -1,6 +1,10 @@
 # This replicates part of the add_mlir_dialect cmake function from MLIR that
 # cannot be used her because it expects to be run inside MLIR directory which
 # is not the case for FIR.
+set(LLVM_TARGET_DEFINITIONS FIRDialect.td)
+mlir_tablegen(FIRDialect.h.inc -gen-dialect-decls -dialect=fir)
+mlir_tablegen(FIRDialect.cpp.inc -gen-dialect-defs -dialect=fir)
+
 set(LLVM_TARGET_DEFINITIONS FIRAttr.td)
 mlir_tablegen(FIREnumAttr.h.inc -gen-enum-decls)
 mlir_tablegen(FIREnumAttr.cpp.inc -gen-enum-defs)
diff --git a/flang/include/flang/Optimizer/Dialect/FIRAttr.td b/flang/include/flang/Optimizer/Dialect/FIRAttr.td
index 66d6cd471116b..2ac4af9e66aa8 100644
--- a/flang/include/flang/Optimizer/Dialect/FIRAttr.td
+++ b/flang/include/flang/Optimizer/Dialect/FIRAttr.td
@@ -16,7 +16,7 @@
 include "flang/Optimizer/Dialect/FIRDialect.td"
 include "mlir/IR/EnumAttr.td"
 
-class fir_Attr<string name> : AttrDef<fir_Dialect, name>;
+class fir_Attr<string name> : AttrDef<FIROpsDialect, name>;
 
 def FIRnoAttributes  : I32BitEnumAttrCaseNone<"None">;
 def FIRallocatable  : I32BitEnumAttrCaseBit<"allocatable", 0>;
@@ -91,7 +91,7 @@ def fir_CUDADataAttribute : I32EnumAttr<
 }
 
 def fir_CUDADataAttributeAttr :
-    EnumAttr<fir_Dialect, fir_CUDADataAttribute, "cuda"> {
+    EnumAttr<FIROpsDialect, fir_CUDADataAttribute, "cuda"> {
   let assemblyFormat = [{ ```<` $value `>` }];
 }
 
@@ -109,7 +109,7 @@ def fir_CUDAProcAttribute : I32EnumAttr<
 }
 
 def fir_CUDAProcAttributeAttr :
-    EnumAttr<fir_Dialect, fir_CUDAProcAttribute, "cuda_proc"> {
+    EnumAttr<FIROpsDialect, fir_CUDAProcAttribute, "cuda_proc"> {
   let assemblyFormat = [{ ```<` $value `>` }];
 }
 
diff --git a/flang/include/flang/Optimizer/Dialect/FIRDialect.h b/flang/include/flang/Optimizer/Dialect/FIRDialect.h
index 238385505dbff..ed7c98ec82e2d 100644
--- a/flang/include/flang/Optimizer/Dialect/FIRDialect.h
+++ b/flang/include/flang/Optimizer/Dialect/FIRDialect.h
@@ -15,43 +15,14 @@
 
 #include "mlir/IR/Dialect.h"
 
+#include "flang/Optimizer/Dialect/FIRDialect.h.inc"
+
 namespace mlir {
 class IRMapping;
 } // namespace mlir
 
 namespace fir {
 
-/// FIR dialect
-class FIROpsDialect final : public mlir::Dialect {
-public:
-  explicit FIROpsDialect(mlir::MLIRContext *ctx);
-  virtual ~FIROpsDialect();
-
-  static llvm::StringRef getDialectNamespace() { return "fir"; }
-
-  mlir::Type parseType(mlir::DialectAsmParser &parser) const override;
-  void printType(mlir::Type ty, mlir::DialectAsmPrinter &p) const override;
-
-  mlir::Attribute parseAttribute(mlir::DialectAsmParser &parser,
-                                 mlir::Type type) const override;
-  void printAttribute(mlir::Attribute attr,
-                      mlir::DialectAsmPrinter &p) const override;
-
-  /// Return string name of fir.runtime attribute.
-  static constexpr llvm::StringRef getFirRuntimeAttrName() {
-    return "fir.runtime";
-  }
-
-private:
-  // Register the Attributes of this dialect.
-  void registerAttributes();
-  // Register the Types of this dialect.
-  void registerTypes();
-  // Register external interfaces on operations of
-  // this dialect.
-  void registerOpExternalInterfaces();
-};
-
 /// The FIR codegen dialect is a dialect containing a small set of transient
 /// operations used exclusively during code generation.
 class FIRCodeGenDialect final : public mlir::Dialect {
diff --git a/flang/include/flang/Optimizer/Dialect/FIRDialect.td b/flang/include/flang/Optimizer/Dialect/FIRDialect.td
index b366b6d40e4e2..0dfb3eda585ce 100644
--- a/flang/include/flang/Optimizer/Dialect/FIRDialect.td
+++ b/flang/include/flang/Optimizer/Dialect/FIRDialect.td
@@ -21,7 +21,7 @@ include "mlir/Interfaces/InferTypeOpInterface.td"
 include "mlir/Interfaces/LoopLikeInterface.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
 
-def fir_Dialect : Dialect {
+def FIROpsDialect : Dialect {
   let name = "fir";
   let cppNamespace = "::fir";
   let useDefaultTypePrinterParser = 0;
@@ -30,10 +30,33 @@ def fir_Dialect : Dialect {
   let dependentDialects = [
     // Arith dialect provides FastMathFlagsAttr
     // supported by some FIR operations.
-    "arith::ArithDialect",
+    "mlir::arith::ArithDialect",
     // TBAA Tag types
-    "LLVM::LLVMDialect"
+    "mlir::LLVM::LLVMDialect"
   ];
+  let extraClassDeclaration = [{
+  private:
+    // Register the builtin Attributes.
+    void registerAttributes();
+    // Register the builtin Types.
+    void registerTypes();
+    // Register external interfaces on operations of
+    // this dialect.
+    void registerOpExternalInterfaces();
+  public:
+    mlir::Type parseType(mlir::DialectAsmParser &parser) const override;
+    void printType(mlir::Type ty, mlir::DialectAsmPrinter &p) const override;
+ 
+    mlir::Attribute parseAttribute(mlir::DialectAsmParser &parser,
+                                   mlir::Type type) const override;
+    void printAttribute(mlir::Attribute attr,
+                        mlir::DialectAsmPrinter &p) const override;
+
+    // Return string name of fir.runtime attribute.
+    static constexpr llvm::StringRef getFirRuntimeAttrName() {
+      return "fir.runtime";
+    }
+  }];
 }
 
 #endif // FORTRAN_DIALECT_FIR_DIALECT
diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td
index db5e5f4bc682e..65a86d25333b5 100644
--- a/flang/include/flang/Optimizer/Dialect/FIROps.td
+++ b/flang/include/flang/Optimizer/Dialect/FIROps.td
@@ -27,7 +27,7 @@ include "mlir/IR/BuiltinAttributes.td"
 // Base class for FIR operations.
 // All operations automatically get a prefix of "fir.".
 class fir_Op<string mnemonic, list<Trait> traits>
-  : Op<fir_Dialect, mnemonic, traits>;
+  : Op<FIROpsDialect, mnemonic, traits>;
 
 // Base class for FIR operations that take a single argument
 class fir_SimpleOp<string mnemonic, list<Trait> traits>
diff --git a/flang/include/flang/Optimizer/Dialect/FIRTypes.td b/flang/include/flang/Optimizer/Dialect/FIRTypes.td
index 2a2f50720859e..4c6a8064991ab 100644
--- a/flang/include/flang/Optimizer/Dialect/FIRTypes.td
+++ b/flang/include/flang/Optimizer/Dialect/FIRTypes.td
@@ -22,7 +22,7 @@ include "flang/Optimizer/Dialect/FIRDialect.td"
 
 class FIR_Type<string name, string typeMnemonic, list<Trait> traits = [],
                string baseCppClass = "::mlir::Type">
-    : TypeDef<fir_Dialect, name, traits, baseCppClass> {
+    : TypeDef<FIROpsDialect, name, traits, baseCppClass> {
   let mnemonic = typeMnemonic;
 }
 
diff --git a/flang/lib/Optimizer/Dialect/FIRDialect.cpp b/flang/lib/Optimizer/Dialect/FIRDialect.cpp
index 850b6120b2a00..4d1e8cd1405af 100644
--- a/flang/lib/Optimizer/Dialect/FIRDialect.cpp
+++ b/flang/lib/Optimizer/Dialect/FIRDialect.cpp
@@ -18,6 +18,8 @@
 #include "mlir/Target/LLVMIR/ModuleTranslation.h"
 #include "mlir/Transforms/InliningUtils.h"
 
+#include "flang/Optimizer/Dialect/FIRDialect.cpp.inc"
+
 using namespace fir;
 
 namespace {
@@ -58,9 +60,7 @@ struct FIRInlinerInterface : public mlir::DialectInlinerInterface {
 };
 } // namespace
 
-fir::FIROpsDialect::FIROpsDialect(mlir::MLIRContext *ctx)
-    : mlir::Dialect("fir", ctx, mlir::TypeID::get<FIROpsDialect>()) {
-  getContext()->loadDialect<mlir::LLVM::LLVMDialect>();
+void fir::FIROpsDialect::initialize() {
   registerTypes();
   registerAttributes();
   addOperations<
@@ -94,11 +94,6 @@ void fir::addFIRToLLVMIRExtension(mlir::DialectRegistry &registry) {
       });
 }
 
-// anchor the class vtable to this compilation unit
-fir::FIROpsDialect::~FIROpsDialect() {
-  // do nothing
-}
-
 mlir::Type fir::FIROpsDialect::parseType(mlir::DialectAsmParser &parser) const {
   return parseFirType(const_cast<FIROpsDialect *>(this), parser);
 }

From f19d9e1617292a95b665171574630b8674d3ae1e Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Thu, 7 Mar 2024 11:28:28 -0600
Subject: [PATCH 46/95] [KnownBits] Add test for computing more information for
 `lshr`/`ashr` with `exact` flag; NFC

---
 .../Analysis/ValueTracking/knownbits-shift.ll | 24 +++++++++++++++++++
 1 file changed, 24 insertions(+)
 create mode 100644 llvm/test/Analysis/ValueTracking/knownbits-shift.ll

diff --git a/llvm/test/Analysis/ValueTracking/knownbits-shift.ll b/llvm/test/Analysis/ValueTracking/knownbits-shift.ll
new file mode 100644
index 0000000000000..3235f69b5221a
--- /dev/null
+++ b/llvm/test/Analysis/ValueTracking/knownbits-shift.ll
@@ -0,0 +1,24 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=instcombine -S < %s | FileCheck %s
+
+define i8 @simplify_lshr_with_exact(i8 %x) {
+; CHECK-LABEL: @simplify_lshr_with_exact(
+; CHECK-NEXT:    [[SHR:%.*]] = lshr exact i8 6, [[X:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = and i8 [[SHR]], 2
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %shr = lshr exact i8 6, %x
+  %r = and i8 %shr, 2
+  ret i8 %r
+}
+
+define i8 @simplify_ashr_with_exact(i8 %x) {
+; CHECK-LABEL: @simplify_ashr_with_exact(
+; CHECK-NEXT:    [[SHR:%.*]] = ashr exact i8 -122, [[X:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = and i8 [[SHR]], 2
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %shr = ashr exact i8 -122, %x
+  %r = and i8 %shr, 2
+  ret i8 %r
+}

From a9d913ebcd567ad14ffdc8c8684c4f0611e1e2da Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Tue, 5 Mar 2024 21:56:27 -0600
Subject: [PATCH 47/95] [KnownBits] Add API support for `exact` in
 `lshr`/`ashr`; NFC

---
 llvm/include/llvm/Support/KnownBits.h          |  4 ++--
 llvm/lib/Analysis/ValueTracking.cpp            | 14 ++++++++------
 llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp |  6 ++++--
 llvm/lib/Support/KnownBits.cpp                 |  4 ++--
 4 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/llvm/include/llvm/Support/KnownBits.h b/llvm/include/llvm/Support/KnownBits.h
index 46dbf0c2baa5f..06d2c90f7b0f6 100644
--- a/llvm/include/llvm/Support/KnownBits.h
+++ b/llvm/include/llvm/Support/KnownBits.h
@@ -402,12 +402,12 @@ struct KnownBits {
   /// Compute known bits for lshr(LHS, RHS).
   /// NOTE: RHS (shift amount) bitwidth doesn't need to be the same as LHS.
   static KnownBits lshr(const KnownBits &LHS, const KnownBits &RHS,
-                        bool ShAmtNonZero = false);
+                        bool ShAmtNonZero = false, bool Exact = false);
 
   /// Compute known bits for ashr(LHS, RHS).
   /// NOTE: RHS (shift amount) bitwidth doesn't need to be the same as LHS.
   static KnownBits ashr(const KnownBits &LHS, const KnownBits &RHS,
-                        bool ShAmtNonZero = false);
+                        bool ShAmtNonZero = false, bool Exact = false);
 
   /// Determine if these known bits always give the same ICMP_EQ result.
   static std::optional<bool> eq(const KnownBits &LHS, const KnownBits &RHS);
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 6d0e79e11eed4..d7f60d85b4523 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -1142,9 +1142,10 @@ static void computeKnownBitsFromOperator(const Operator *I,
     break;
   }
   case Instruction::LShr: {
-    auto KF = [](const KnownBits &KnownVal, const KnownBits &KnownAmt,
-                 bool ShAmtNonZero) {
-      return KnownBits::lshr(KnownVal, KnownAmt, ShAmtNonZero);
+    bool Exact = Q.IIQ.isExact(cast<BinaryOperator>(I));
+    auto KF = [Exact](const KnownBits &KnownVal, const KnownBits &KnownAmt,
+                      bool ShAmtNonZero) {
+      return KnownBits::lshr(KnownVal, KnownAmt, ShAmtNonZero, Exact);
     };
     computeKnownBitsFromShiftOperator(I, DemandedElts, Known, Known2, Depth, Q,
                                       KF);
@@ -1155,9 +1156,10 @@ static void computeKnownBitsFromOperator(const Operator *I,
     break;
   }
   case Instruction::AShr: {
-    auto KF = [](const KnownBits &KnownVal, const KnownBits &KnownAmt,
-                 bool ShAmtNonZero) {
-      return KnownBits::ashr(KnownVal, KnownAmt, ShAmtNonZero);
+    bool Exact = Q.IIQ.isExact(cast<BinaryOperator>(I));
+    auto KF = [Exact](const KnownBits &KnownVal, const KnownBits &KnownAmt,
+                      bool ShAmtNonZero) {
+      return KnownBits::ashr(KnownVal, KnownAmt, ShAmtNonZero, Exact);
     };
     computeKnownBitsFromShiftOperator(I, DemandedElts, Known, Known2, Depth, Q,
                                       KF);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 06fe716a22db0..7a0c1c328df1f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -3485,7 +3485,8 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
   case ISD::SRL:
     Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
     Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
-    Known = KnownBits::lshr(Known, Known2);
+    Known = KnownBits::lshr(Known, Known2, /*ShAmtNonZero=*/false,
+                            Op->getFlags().hasExact());
 
     // Minimum shift high bits are known zero.
     if (const APInt *ShMinAmt =
@@ -3495,7 +3496,8 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
   case ISD::SRA:
     Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
     Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
-    Known = KnownBits::ashr(Known, Known2);
+    Known = KnownBits::ashr(Known, Known2, /*ShAmtNonZero=*/false,
+                            Op->getFlags().hasExact());
     break;
   case ISD::FSHL:
   case ISD::FSHR:
diff --git a/llvm/lib/Support/KnownBits.cpp b/llvm/lib/Support/KnownBits.cpp
index 74d857457aec1..ed25e52b9ace6 100644
--- a/llvm/lib/Support/KnownBits.cpp
+++ b/llvm/lib/Support/KnownBits.cpp
@@ -343,7 +343,7 @@ KnownBits KnownBits::shl(const KnownBits &LHS, const KnownBits &RHS, bool NUW,
 }
 
 KnownBits KnownBits::lshr(const KnownBits &LHS, const KnownBits &RHS,
-                          bool ShAmtNonZero) {
+                          bool ShAmtNonZero, bool /*Exact*/) {
   unsigned BitWidth = LHS.getBitWidth();
   auto ShiftByConst = [&](const KnownBits &LHS, unsigned ShiftAmt) {
     KnownBits Known = LHS;
@@ -389,7 +389,7 @@ KnownBits KnownBits::lshr(const KnownBits &LHS, const KnownBits &RHS,
 }
 
 KnownBits KnownBits::ashr(const KnownBits &LHS, const KnownBits &RHS,
-                          bool ShAmtNonZero) {
+                          bool ShAmtNonZero, bool /*Exact*/) {
   unsigned BitWidth = LHS.getBitWidth();
   auto ShiftByConst = [&](const KnownBits &LHS, unsigned ShiftAmt) {
     KnownBits Known = LHS;

From d81db0e5f5b1404ff4813af3050d671528ad45cc Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Tue, 5 Mar 2024 22:03:44 -0600
Subject: [PATCH 48/95] [KnownBits] Implement knownbits `lshr`/`ashr` with
 exact flag

The exact flag basically allows us to set an upper bound on shift
amount when we have a known 1 in `LHS`.

Typically we deduce exact using knownbits (on non-exact incoming
shifts), so this isn't particularly impactful, but may be useful in some
circumstances.

Closes #84254
---
 llvm/lib/Support/KnownBits.cpp                | 28 +++++++++++++++++--
 .../Analysis/ValueTracking/knownbits-shift.ll |  8 ++----
 llvm/unittests/Support/KnownBitsTest.cpp      | 26 +++++++++++++++++
 3 files changed, 54 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Support/KnownBits.cpp b/llvm/lib/Support/KnownBits.cpp
index ed25e52b9ace6..c33c3680825a1 100644
--- a/llvm/lib/Support/KnownBits.cpp
+++ b/llvm/lib/Support/KnownBits.cpp
@@ -343,7 +343,7 @@ KnownBits KnownBits::shl(const KnownBits &LHS, const KnownBits &RHS, bool NUW,
 }
 
 KnownBits KnownBits::lshr(const KnownBits &LHS, const KnownBits &RHS,
-                          bool ShAmtNonZero, bool /*Exact*/) {
+                          bool ShAmtNonZero, bool Exact) {
   unsigned BitWidth = LHS.getBitWidth();
   auto ShiftByConst = [&](const KnownBits &LHS, unsigned ShiftAmt) {
     KnownBits Known = LHS;
@@ -367,6 +367,18 @@ KnownBits KnownBits::lshr(const KnownBits &LHS, const KnownBits &RHS,
   // Find the common bits from all possible shifts.
   APInt MaxValue = RHS.getMaxValue();
   unsigned MaxShiftAmount = getMaxShiftAmount(MaxValue, BitWidth);
+
+  // If exact, bound MaxShiftAmount to first known 1 in LHS.
+  if (Exact) {
+    unsigned FirstOne = LHS.countMaxTrailingZeros();
+    if (FirstOne < MinShiftAmount) {
+      // Always poison. Return zero because we don't like returning conflict.
+      Known.setAllZero();
+      return Known;
+    }
+    MaxShiftAmount = std::min(MaxShiftAmount, FirstOne);
+  }
+
   unsigned ShiftAmtZeroMask = RHS.Zero.zextOrTrunc(32).getZExtValue();
   unsigned ShiftAmtOneMask = RHS.One.zextOrTrunc(32).getZExtValue();
   Known.Zero.setAllBits();
@@ -389,7 +401,7 @@ KnownBits KnownBits::lshr(const KnownBits &LHS, const KnownBits &RHS,
 }
 
 KnownBits KnownBits::ashr(const KnownBits &LHS, const KnownBits &RHS,
-                          bool ShAmtNonZero, bool /*Exact*/) {
+                          bool ShAmtNonZero, bool Exact) {
   unsigned BitWidth = LHS.getBitWidth();
   auto ShiftByConst = [&](const KnownBits &LHS, unsigned ShiftAmt) {
     KnownBits Known = LHS;
@@ -415,6 +427,18 @@ KnownBits KnownBits::ashr(const KnownBits &LHS, const KnownBits &RHS,
   // Find the common bits from all possible shifts.
   APInt MaxValue = RHS.getMaxValue();
   unsigned MaxShiftAmount = getMaxShiftAmount(MaxValue, BitWidth);
+
+  // If exact, bound MaxShiftAmount to first known 1 in LHS.
+  if (Exact) {
+    unsigned FirstOne = LHS.countMaxTrailingZeros();
+    if (FirstOne < MinShiftAmount) {
+      // Always poison. Return zero because we don't like returning conflict.
+      Known.setAllZero();
+      return Known;
+    }
+    MaxShiftAmount = std::min(MaxShiftAmount, FirstOne);
+  }
+
   unsigned ShiftAmtZeroMask = RHS.Zero.zextOrTrunc(32).getZExtValue();
   unsigned ShiftAmtOneMask = RHS.One.zextOrTrunc(32).getZExtValue();
   Known.Zero.setAllBits();
diff --git a/llvm/test/Analysis/ValueTracking/knownbits-shift.ll b/llvm/test/Analysis/ValueTracking/knownbits-shift.ll
index 3235f69b5221a..5cb355eff5a69 100644
--- a/llvm/test/Analysis/ValueTracking/knownbits-shift.ll
+++ b/llvm/test/Analysis/ValueTracking/knownbits-shift.ll
@@ -3,9 +3,7 @@
 
 define i8 @simplify_lshr_with_exact(i8 %x) {
 ; CHECK-LABEL: @simplify_lshr_with_exact(
-; CHECK-NEXT:    [[SHR:%.*]] = lshr exact i8 6, [[X:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = and i8 [[SHR]], 2
-; CHECK-NEXT:    ret i8 [[R]]
+; CHECK-NEXT:    ret i8 2
 ;
   %shr = lshr exact i8 6, %x
   %r = and i8 %shr, 2
@@ -14,9 +12,7 @@ define i8 @simplify_lshr_with_exact(i8 %x) {
 
 define i8 @simplify_ashr_with_exact(i8 %x) {
 ; CHECK-LABEL: @simplify_ashr_with_exact(
-; CHECK-NEXT:    [[SHR:%.*]] = ashr exact i8 -122, [[X:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = and i8 [[SHR]], 2
-; CHECK-NEXT:    ret i8 [[R]]
+; CHECK-NEXT:    ret i8 2
 ;
   %shr = ashr exact i8 -122, %x
   %r = and i8 %shr, 2
diff --git a/llvm/unittests/Support/KnownBitsTest.cpp b/llvm/unittests/Support/KnownBitsTest.cpp
index 658f3796721c4..7c183e9626f98 100644
--- a/llvm/unittests/Support/KnownBitsTest.cpp
+++ b/llvm/unittests/Support/KnownBitsTest.cpp
@@ -516,6 +516,19 @@ TEST(KnownBitsTest, BinaryExhaustive) {
         return N1.lshr(N2);
       },
       checkOptimalityBinary, /* RefinePoisonToZero */ true);
+  testBinaryOpExhaustive(
+      [](const KnownBits &Known1, const KnownBits &Known2) {
+        return KnownBits::lshr(Known1, Known2, /*ShAmtNonZero=*/false,
+                               /*Exact=*/true);
+      },
+      [](const APInt &N1, const APInt &N2) -> std::optional<APInt> {
+        if (N2.uge(N2.getBitWidth()))
+          return std::nullopt;
+        if (!N1.extractBits(N2.getZExtValue(), 0).isZero())
+          return std::nullopt;
+        return N1.lshr(N2);
+      },
+      checkOptimalityBinary, /* RefinePoisonToZero */ true);
   testBinaryOpExhaustive(
       [](const KnownBits &Known1, const KnownBits &Known2) {
         return KnownBits::ashr(Known1, Known2);
@@ -526,6 +539,19 @@ TEST(KnownBitsTest, BinaryExhaustive) {
         return N1.ashr(N2);
       },
       checkOptimalityBinary, /* RefinePoisonToZero */ true);
+  testBinaryOpExhaustive(
+      [](const KnownBits &Known1, const KnownBits &Known2) {
+        return KnownBits::ashr(Known1, Known2, /*ShAmtNonZero=*/false,
+                               /*Exact=*/true);
+      },
+      [](const APInt &N1, const APInt &N2) -> std::optional<APInt> {
+        if (N2.uge(N2.getBitWidth()))
+          return std::nullopt;
+        if (!N1.extractBits(N2.getZExtValue(), 0).isZero())
+          return std::nullopt;
+        return N1.ashr(N2);
+      },
+      checkOptimalityBinary, /* RefinePoisonToZero */ true);
 
   testBinaryOpExhaustive(
       [](const KnownBits &Known1, const KnownBits &Known2) {

From 65fd664daf4fb283d9a09e01f19709b38b99173a Mon Sep 17 00:00:00 2001
From: Mehdi Amini <joker.eph@gmail.com>
Date: Mon, 11 Mar 2024 14:00:03 -0700
Subject: [PATCH 49/95] Run pre-merge build with -k 0 to ensure all tests runs
 (#84828)

The -k option lets the build continue as far as possible after
failures. This is useful here because when we run

> ninja check-llvm check-clang

we would like the clang tests to run even if there is a failure in the
llvm tests.

The downside is that a build failure in one file, which would prevent any
test from running, no longer stops ninja from building more targets,
potentially wasting build resources.

Fixes #83371
---
 .ci/monolithic-linux.sh   | 2 +-
 .ci/monolithic-windows.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.ci/monolithic-linux.sh b/.ci/monolithic-linux.sh
index 1e7b2d2a36c24..fe1a9e57ff4aa 100755
--- a/.ci/monolithic-linux.sh
+++ b/.ci/monolithic-linux.sh
@@ -54,4 +54,4 @@ cmake -S ${MONOREPO_ROOT}/llvm -B ${BUILD_DIR} \
 
 echo "--- ninja"
 # Targets are not escaped as they are passed as separate arguments.
-ninja -C "${BUILD_DIR}" ${targets}
+ninja -C "${BUILD_DIR}" -k 0 ${targets}
diff --git a/.ci/monolithic-windows.sh b/.ci/monolithic-windows.sh
index 9561bf668a90c..c12e5544c1a18 100755
--- a/.ci/monolithic-windows.sh
+++ b/.ci/monolithic-windows.sh
@@ -62,4 +62,4 @@ cmake -S ${MONOREPO_ROOT}/llvm -B ${BUILD_DIR} \
 
 echo "--- ninja"
 # Targets are not escaped as they are passed as separate arguments.
-ninja -C "${BUILD_DIR}" ${targets}
+ninja -C "${BUILD_DIR}" -k 0 ${targets}

From 31ffdb56b4df9b772d763dccabbfde542545d695 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Mon, 11 Mar 2024 21:06:03 +0000
Subject: [PATCH 50/95] [ArgPromotion] Add test case for #84807.

Test case for https://github.com/llvm/llvm-project/issues/84807,
showing a mis-compile in ArgPromotion.
---
 ...ing-and-non-aliasing-loads-with-clobber.ll | 100 ++++++++++++++++++
 1 file changed, 100 insertions(+)
 create mode 100644 llvm/test/Transforms/ArgumentPromotion/aliasing-and-non-aliasing-loads-with-clobber.ll

diff --git a/llvm/test/Transforms/ArgumentPromotion/aliasing-and-non-aliasing-loads-with-clobber.ll b/llvm/test/Transforms/ArgumentPromotion/aliasing-and-non-aliasing-loads-with-clobber.ll
new file mode 100644
index 0000000000000..69385a7ea51a7
--- /dev/null
+++ b/llvm/test/Transforms/ArgumentPromotion/aliasing-and-non-aliasing-loads-with-clobber.ll
@@ -0,0 +1,100 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -p argpromotion -S %s | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+
+@f = dso_local global { i16, i64 } { i16 1, i64 0 }, align 8
+
+; Test case for https://github.com/llvm/llvm-project/issues/84807.
+
+; FIXME: Currently the loads from @callee1 are moved to @caller1, even though
+;        the store in %then may alias the load from %q.
+
+define i32 @caller1(i1 %c) {
+; CHECK-LABEL: define i32 @caller1(
+; CHECK-SAME: i1 [[C:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[F_VAL:%.*]] = load i16, ptr @f, align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr @f, i64 8
+; CHECK-NEXT:    [[F_VAL1:%.*]] = load i64, ptr [[TMP0]], align 8
+; CHECK-NEXT:    call void @callee1(i16 [[F_VAL]], i64 [[F_VAL1]], i1 [[C]])
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  call void @callee1(ptr noundef nonnull @f, i1 %c)
+  ret i32 0
+}
+
+define internal void @callee1(ptr nocapture noundef readonly %q, i1 %c) {
+; CHECK-LABEL: define internal void @callee1(
+; CHECK-SAME: i16 [[Q_0_VAL:%.*]], i64 [[Q_8_VAL:%.*]], i1 [[C:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[C]], label [[THEN:%.*]], label [[EXIT:%.*]]
+; CHECK:       then:
+; CHECK-NEXT:    store i16 123, ptr @f, align 8
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    call void @use(i16 [[Q_0_VAL]], i64 [[Q_8_VAL]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  br i1 %c, label %then, label %exit
+
+then:
+  store i16 123, ptr @f, align 8
+  br label %exit
+
+exit:
+  %l.0 = load i16, ptr %q, align 8
+  %gep.8  = getelementptr inbounds i8, ptr %q, i64 8
+  %l.1 = load i64, ptr %gep.8, align 8
+  call void @use(i16 %l.0, i64 %l.1)
+  ret void
+
+  uselistorder ptr %q, { 1, 0 }
+}
+
+; Same as @caller1/@callee1, but with default uselist order.
+define i32 @caller2(i1 %c) {
+; CHECK-LABEL: define i32 @caller2(
+; CHECK-SAME: i1 [[C:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @callee2(ptr noundef nonnull @f, i1 [[C]])
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  call void @callee2(ptr noundef nonnull @f, i1 %c)
+  ret i32 0
+}
+
+define internal void @callee2(ptr nocapture noundef readonly %q, i1 %c) {
+; CHECK-LABEL: define internal void @callee2(
+; CHECK-SAME: ptr nocapture noundef readonly [[Q:%.*]], i1 [[C:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[C]], label [[THEN:%.*]], label [[EXIT:%.*]]
+; CHECK:       then:
+; CHECK-NEXT:    store i16 123, ptr @f, align 8
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[Q_0_VAL:%.*]] = load i16, ptr [[Q]], align 8
+; CHECK-NEXT:    [[GEP_8:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 8
+; CHECK-NEXT:    [[Q_8_VAL:%.*]] = load i64, ptr [[GEP_8]], align 8
+; CHECK-NEXT:    call void @use(i16 [[Q_0_VAL]], i64 [[Q_8_VAL]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  br i1 %c, label %then, label %exit
+
+then:
+  store i16 123, ptr @f, align 8
+  br label %exit
+
+exit:
+  %l.0 = load i16, ptr %q, align 8
+  %gep.8  = getelementptr inbounds i8, ptr %q, i64 8
+  %l.1 = load i64, ptr %gep.8, align 8
+  call void @use(i16 %l.0, i64 %l.1)
+  ret void
+}
+
+declare void @use(i16, i64)

From 0f0f0ffc750b5d1364d20b8ecd3f070e9e816ecf Mon Sep 17 00:00:00 2001
From: Arthur Eubanks <aeubanks@google.com>
Date: Mon, 11 Mar 2024 21:08:26 +0000
Subject: [PATCH 51/95] [NFC] Remove unused variable after 94c988bc

---
 llvm/lib/Target/VE/VEISelLowering.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/lib/Target/VE/VEISelLowering.cpp b/llvm/lib/Target/VE/VEISelLowering.cpp
index 6e31c8b7c9a02..96340f603a87e 100644
--- a/llvm/lib/Target/VE/VEISelLowering.cpp
+++ b/llvm/lib/Target/VE/VEISelLowering.cpp
@@ -648,7 +648,6 @@ SDValue VETargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   // PC-relative references to external symbols should go through $stub.
   // If so, we need to prepare GlobalBaseReg first.
   const TargetMachine &TM = DAG.getTarget();
-  const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
   const GlobalValue *GV = nullptr;
   auto *CalleeG = dyn_cast<GlobalAddressSDNode>(Callee);
   if (CalleeG)

From 3707c540d23a5684a1c37b0f7e41c1d8ed7f1f8a Mon Sep 17 00:00:00 2001
From: jimingham <jingham@apple.com>
Date: Mon, 11 Mar 2024 14:13:37 -0700
Subject: [PATCH 52/95] Make ValueObject::Cast work for casts from smaller to
 larger structs in the cases where this currently can work. (#84588)

The ValueObjectConstResult classes that back expression result variables
play a complicated game with where the data for their values is stored.
They try to make it appear as though they are still tied to the memory
in the target into which their value was written when the expression is
run, but they also keep a copy in the Host which they use after the
value is made (expression results are "history values" so that's how we
make sure they have "the value at the time of the expression".)

However, that means that if you ask them to cast themselves to a value
bigger than their original size, they don't have a way to get more
memory for that purpose. The same thing is true of ValueObjects backed
by DataExtractors: in general, the data extractors don't know how to get
more data than they were made with.

The only place where we actually ask ValueObjects to sample outside
their captured bounds is when you do ValueObject::Cast from one
structure type to a bigger structure type. In
https://reviews.llvm.org/D153657 I handled this by just disallowing
casts from one structure value to a larger one. My reasoning at the time
was that the use case for this was to support discriminator based C
inheritance schemes, and you can't directly cast values in C, only
pointers, so this was not a natural way to handle those types. It seemed
logical that since you would have had to start with pointers in the
implementation, that's how you would write your lldb introspection code
as well.

Famous last words...

Turns out there are some heavy users of the SB APIs who were relying on
this working, and this is a behavior change, so this patch makes this
work in the cases where it used to work before, while still disallowing
the cases we don't know how to support.

Note that if you had done this Cast operation before with either
expression results or value objects from data extractors, lldb would not
have returned the correct results, so the cases this patch outlaws are
ones that actually produce invalid results. So nobody should be using
Cast in these cases, or if they were, this patch will point out the bug
they hadn't yet noticed.
---
 lldb/source/Core/ValueObject.cpp              | 20 ++++--
 .../test/API/python_api/value/TestValueAPI.py | 68 ++++++++++++++++---
 lldb/test/API/python_api/value/main.c         | 15 +++-
 3 files changed, 89 insertions(+), 14 deletions(-)

diff --git a/lldb/source/Core/ValueObject.cpp b/lldb/source/Core/ValueObject.cpp
index d813044d02ff5..f39bd07a25536 100644
--- a/lldb/source/Core/ValueObject.cpp
+++ b/lldb/source/Core/ValueObject.cpp
@@ -2744,8 +2744,19 @@ ValueObjectSP ValueObject::DoCast(const CompilerType &compiler_type) {
 
 ValueObjectSP ValueObject::Cast(const CompilerType &compiler_type) {
   // Only allow casts if the original type is equal or larger than the cast
-  // type.  We don't know how to fetch more data for all the ConstResult types,
-  // so we can't guarantee this will work:
+  // type, unless we know this is a load address.  Getting the size wrong for
+  // a host side storage could leak lldb memory, so we absolutely want to
+  // prevent that.  We may not always get the right value; for instance, an
+  // expression result value that's copied into a storage location in the
+  // target may not have copied enough memory.  I'm not trying to fix that
+  // here, I'm just making Cast from a smaller to a larger possible in all the
+  // cases where that doesn't risk making a Value out of random lldb memory.
+  // You have to check the ValueObject's Value for the address types, since
+  // ValueObjects that use live addresses will tell you they fetch data from the
+  // live address, but once they are made, they actually don't.
+  // FIXME: Can we make ValueObjects with a live address fetch "more data" from
+  // the live address if it is still valid?
+
   Status error;
   CompilerType my_type = GetCompilerType();
 
@@ -2753,9 +2764,10 @@ ValueObjectSP ValueObject::Cast(const CompilerType &compiler_type) {
       = ExecutionContext(GetExecutionContextRef())
           .GetBestExecutionContextScope();
   if (compiler_type.GetByteSize(exe_scope)
-      <= GetCompilerType().GetByteSize(exe_scope)) {
+      <= GetCompilerType().GetByteSize(exe_scope) 
+      || m_value.GetValueType() == Value::ValueType::LoadAddress)
         return DoCast(compiler_type);
-  }
+
   error.SetErrorString("Can only cast to a type that is equal to or smaller "
                        "than the orignal type.");
 
diff --git a/lldb/test/API/python_api/value/TestValueAPI.py b/lldb/test/API/python_api/value/TestValueAPI.py
index 18376f76e3c85..512100912d6fe 100644
--- a/lldb/test/API/python_api/value/TestValueAPI.py
+++ b/lldb/test/API/python_api/value/TestValueAPI.py
@@ -148,14 +148,66 @@ def test(self):
 
         # Test some other cases of the Cast API.  We allow casts from one struct type
         # to another, which is a little weird, but we don't support casting from a
-        # smaller type to a larger as we often wouldn't know how to get the extra data:
-        val_f = target.EvaluateExpression("f")
-        bad_cast = val_s.Cast(val_f.GetType())
-        self.assertFailure(
-            bad_cast.GetError(),
-            "Can only cast to a type that is equal to or smaller than the orignal type.",
-        )
-        weird_cast = val_f.Cast(val_s.GetType())
+        # smaller type to a larger when the underlying data is not in the inferior,
+        # since then we have no way to fetch the out-of-bounds values.
+        # For an expression that references a variable, or a FindVariable result,
+        # or an SBValue made from an address and a type, we can get back to the target,
+        # so those will work.  Make sure they do and get the right extra values as well.
+
+        # We're casting everything to the type of "f", so get that first:
+        f_var = frame0.FindVariable("f")
+        self.assertSuccess(f_var.error, "Got f")
+        bigger_type = f_var.GetType()
+
+        # First try a value that we got from FindVariable
+        container = frame0.FindVariable("my_container")
+        self.assertSuccess(container.error, "Found my_container")
+        fv_small = container.GetValueForExpressionPath(".data.small")
+        self.assertSuccess(fv_small.error, "Found small in my_container")
+        fv_cast = fv_small.Cast(bigger_type)
+        self.assertSuccess(fv_cast.error, "Can cast up from FindVariable")
+        child_checks = [
+            ValueCheck(name="a", value="33", type="int"),
+            ValueCheck(name="b", value="44", type="int"),
+            ValueCheck(name="c", value="55", type="int"),
+        ]
+        cast_check = ValueCheck(type=bigger_type.name, children=child_checks)
+
+        # Now try one we made with expr.  This one should fail, because expr
+        # stores the "canonical value" in host memory, and doesn't know how
+        # to augment that from the live address.
+        expr_cont = frame0.EvaluateExpression("my_container")
+        self.assertSuccess(expr_cont.error, "Got my_container by expr")
+        expr_small = expr_cont.GetValueForExpressionPath(".data.small")
+        self.assertSuccess(expr_small.error, "Got small by expr")
+        expr_cast = expr_small.Cast(bigger_type)
+        self.assertFailure(expr_cast.error, msg="Cannot cast expr result")
+
+        # Now try one we made with CreateValueFromAddress.  That will succeed
+        # because this directly tracks the inferior memory.
+        small_addr = fv_small.addr
+        self.assertTrue(small_addr.IsValid())
+        small_type = fv_small.GetType()
+        vfa_small = target.CreateValueFromAddress(
+            "small_from_addr", small_addr, small_type
+        )
+        self.assertSuccess(vfa_small.error, "Made small from address")
+        vfa_cast = vfa_small.Cast(bigger_type)
+        self.assertSuccess(vfa_cast.error, "Made a cast from vfa_small")
+        cast_check.check_value(self, vfa_cast, "Cast of ValueFromAddress succeeds")
+
+        # Next try ValueObject created from data.  They should fail as there's no
+        # way to grow the data:
+        data_small = target.CreateValueFromData(
+            "small_from_data", fv_small.data, fv_small.type
+        )
+        self.assertSuccess(data_small.error, "Made a valid object from data")
+        data_cast = data_small.Cast(bigger_type)
+        self.assertFailure(data_cast.error, msg="Cannot cast data backed SBValue")
+
+        # Now check casting from a larger type to a smaller, we can always do this,
+        # so just test one case:
+        weird_cast = f_var.Cast(val_s.GetType())
         self.assertSuccess(weird_cast.GetError(), "Can cast from a larger to a smaller")
         self.assertEqual(
             weird_cast.GetChildMemberWithName("a").GetValueAsSigned(0),
diff --git a/lldb/test/API/python_api/value/main.c b/lldb/test/API/python_api/value/main.c
index 672b0df376dc5..cdb2aa2f6147b 100644
--- a/lldb/test/API/python_api/value/main.c
+++ b/lldb/test/API/python_api/value/main.c
@@ -22,7 +22,7 @@ const char *weekdays[5] = { "Monday",
 const char **g_table[2] = { days_of_week, weekdays };
 
 typedef int MyInt;
-
+  
 struct MyStruct
 {
   int a;
@@ -36,6 +36,15 @@ struct MyBiggerStruct
   int c;
 };
 
+struct Container
+{
+  int discriminator;
+  union Data {
+    struct MyStruct small;
+    struct MyBiggerStruct big;
+  } data;
+};
+  
 int main (int argc, char const *argv[])
 {
     uint32_t uinthex = 0xE0A35F10;
@@ -43,8 +52,10 @@ int main (int argc, char const *argv[])
 
     int i;
     MyInt a = 12345;
-    struct MyStruct s = { 11, 22 };
+    struct MyStruct s = {11, 22};
     struct MyBiggerStruct f = { 33, 44, 55 };
+    struct Container my_container;
+    my_container.data.big = f;
     int *my_int_ptr = &g_my_int;
     printf("my_int_ptr points to location %p\n", my_int_ptr);
     int *fixed_int_ptr = (int*)(void*)0xAA;

From 762cbd82da4debf8b026a4eb4ade66720acf3182 Mon Sep 17 00:00:00 2001
From: lntue <35648136+lntue@users.noreply.github.com>
Date: Mon, 11 Mar 2024 17:43:50 -0400
Subject: [PATCH 53/95] [libc][NFC] Do not add libc test framework and
 -fno-rtti to C tests. (#84837)

---
 .../modules/LLVMLibCCompileOptionRules.cmake      |  6 ++++--
 libc/cmake/modules/LLVMLibCTestRules.cmake        | 15 ++++++++++-----
 libc/test/include/CMakeLists.txt                  |  1 +
 3 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
index 893a807b5b61c..5bc0898298ce3 100644
--- a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
+++ b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
@@ -108,7 +108,7 @@ function(_get_common_compile_options output_var flags)
   set(${output_var} ${compile_options} PARENT_SCOPE)
 endfunction()
 
-function(_get_common_test_compile_options output_var flags)
+function(_get_common_test_compile_options output_var c_test flags)
   _get_compile_options_from_flags(compile_flags ${flags})
 
   set(compile_options ${LIBC_COMPILE_OPTIONS_DEFAULT} ${compile_flags})
@@ -122,7 +122,9 @@ function(_get_common_test_compile_options output_var flags)
       list(APPEND compile_options "-fno-exceptions")
       list(APPEND compile_options "-fno-unwind-tables")
       list(APPEND compile_options "-fno-asynchronous-unwind-tables")
-      list(APPEND compile_options "-fno-rtti")
+      if(NOT ${c_test})
+        list(APPEND compile_options "-fno-rtti")
+      endif()
     endif()
 
     if(LIBC_COMPILER_HAS_FIXED_POINT)
diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake
index 0bdd72091fe85..eb6be91b55e26 100644
--- a/libc/cmake/modules/LLVMLibCTestRules.cmake
+++ b/libc/cmake/modules/LLVMLibCTestRules.cmake
@@ -111,7 +111,7 @@ function(create_libc_unittest fq_target_name)
 
   cmake_parse_arguments(
     "LIBC_UNITTEST"
-    "NO_RUN_POSTBUILD" # Optional arguments
+    "NO_RUN_POSTBUILD;C_TEST" # Optional arguments
     "SUITE;CXX_STANDARD" # Single value arguments
     "SRCS;HDRS;DEPENDS;COMPILE_OPTIONS;LINK_LIBRARIES;FLAGS" # Multi-value arguments
     ${ARGN}
@@ -126,11 +126,14 @@ function(create_libc_unittest fq_target_name)
   endif()
 
   get_fq_deps_list(fq_deps_list ${LIBC_UNITTEST_DEPENDS})
-  list(APPEND fq_deps_list libc.src.__support.StringUtil.error_to_string
-                           libc.test.UnitTest.ErrnoSetterMatcher)
+  if(NOT LIBC_UNITTEST_C_TEST)
+    list(APPEND fq_deps_list libc.src.__support.StringUtil.error_to_string
+                             libc.test.UnitTest.ErrnoSetterMatcher)
+  endif()
   list(REMOVE_DUPLICATES fq_deps_list)
 
-  _get_common_test_compile_options(compile_options "${LIBC_UNITTEST_FLAGS}")
+  _get_common_test_compile_options(compile_options "${LIBC_UNITTEST_C_TEST}"
+                                   "${LIBC_UNITTEST_FLAGS}")
   list(APPEND compile_options ${LIBC_UNITTEST_COMPILE_OPTIONS})
 
   if(SHOW_INTERMEDIATE_OBJECTS)
@@ -214,7 +217,9 @@ function(create_libc_unittest fq_target_name)
   )
 
   # LibcUnitTest should not depend on anything in LINK_LIBRARIES.
-  list(APPEND link_libraries LibcDeathTestExecutors.unit LibcTest.unit)
+  if(NOT LIBC_UNITTEST_C_TEST)
+    list(APPEND link_libraries LibcDeathTestExecutors.unit LibcTest.unit)
+  endif()
 
   target_link_libraries(${fq_build_target_name} PRIVATE ${link_libraries})
 
diff --git a/libc/test/include/CMakeLists.txt b/libc/test/include/CMakeLists.txt
index d76ad442d36ce..8d8dff53169f6 100644
--- a/libc/test/include/CMakeLists.txt
+++ b/libc/test/include/CMakeLists.txt
@@ -37,6 +37,7 @@ if(LLVM_LIBC_FULL_BUILD AND libc.include.stdbit IN_LIST TARGET_PUBLIC_HEADERS)
   )
   add_libc_test(
     stdbit_c_test
+    C_TEST
     UNIT_TEST_ONLY
     SUITE
       libc_include_tests

From c93c76b562784926b22a69d3f82a5032dcb4a274 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Tue, 12 Mar 2024 00:03:26 +0200
Subject: [PATCH 54/95] [LLD] [COFF] Set the right alignment for
 DelayDirectoryChunk (#84697)

This makes a difference when linking executables with delay loaded
libraries for arm32; the delay loader implementation can load data from
the registry with instructions that assume alignment.

This issue does not show up when linking in MinGW mode, because a
PseudoRelocTableChunk gets injected, which also sets alignment, even if
the chunk itself is empty.
---
 lld/COFF/DLL.cpp                      |  2 +-
 lld/test/COFF/delayimports-armnt.yaml | 25 +++++++++++++++++++------
 2 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/lld/COFF/DLL.cpp b/lld/COFF/DLL.cpp
index d0b74ac445499..5f00eaded76d3 100644
--- a/lld/COFF/DLL.cpp
+++ b/lld/COFF/DLL.cpp
@@ -172,7 +172,7 @@ binImports(COFFLinkerContext &ctx,
 // A chunk for the delay import descriptor table etnry.
 class DelayDirectoryChunk : public NonSectionChunk {
 public:
-  explicit DelayDirectoryChunk(Chunk *n) : dllName(n) {}
+  explicit DelayDirectoryChunk(Chunk *n) : dllName(n) { setAlignment(4); }
 
   size_t getSize() const override {
     return sizeof(delay_import_directory_table_entry);
diff --git a/lld/test/COFF/delayimports-armnt.yaml b/lld/test/COFF/delayimports-armnt.yaml
index 7d9bc38c5c360..ea96d864ef53d 100644
--- a/lld/test/COFF/delayimports-armnt.yaml
+++ b/lld/test/COFF/delayimports-armnt.yaml
@@ -6,6 +6,7 @@
 # RUN: llvm-readobj --coff-imports %t.exe | FileCheck -check-prefix=IMPORT %s
 # RUN: llvm-readobj --coff-basereloc %t.exe | FileCheck -check-prefix=BASEREL %s
 # RUN: llvm-objdump --no-print-imm-hex -d %t.exe | FileCheck --check-prefix=DISASM %s
+# RUN: llvm-readobj --file-headers %t.exe | FileCheck -check-prefix=DIR %s
 
 # IMPORT:      Format: COFF-ARM
 # IMPORT-NEXT: Arch: thumb
@@ -13,9 +14,9 @@
 # IMPORT-NEXT: DelayImport {
 # IMPORT-NEXT:   Name: library.dll
 # IMPORT-NEXT:   Attributes: 0x1
-# IMPORT-NEXT:   ModuleHandle: 0x3000
-# IMPORT-NEXT:   ImportAddressTable: 0x3008
-# IMPORT-NEXT:   ImportNameTable: 0x2040
+# IMPORT-NEXT:   ModuleHandle: 0x3008
+# IMPORT-NEXT:   ImportAddressTable: 0x3010
+# IMPORT-NEXT:   ImportNameTable: 0x2044
 # IMPORT-NEXT:   BoundDelayImportTable: 0x0
 # IMPORT-NEXT:   UnloadDelayImportTable: 0x0
 # IMPORT-NEXT:   Import {
@@ -43,7 +44,7 @@
 # BASEREL-NEXT:   }
 # BASEREL-NEXT:   Entry {
 # BASEREL-NEXT:     Type: HIGHLOW
-# BASEREL-NEXT:     Address: 0x3008
+# BASEREL-NEXT:     Address: 0x3010
 # BASEREL-NEXT:   }
 # BASEREL-NEXT:   Entry {
 # BASEREL-NEXT:     Type: ABSOLUTE
@@ -52,20 +53,24 @@
 # BASEREL-NEXT: ]
 #
 # DISASM:    00401000 <.text>:
-# DISASM:      40100c:       f243 0c08       movw r12, #12296
+# DISASM:      40100c:       f243 0c10       movw r12, #12304
 # DISASM-NEXT:               f2c0 0c40       movt    r12, #64
 # DISASM-NEXT:               f000 b800       b.w     {{.+}} @ imm = #0
 # DISASM-NEXT:               e92d 480f       push.w  {r0, r1, r2, r3, r11, lr}
 # DISASM-NEXT:               f20d 0b10       addw    r11, sp, #16
 # DISASM-NEXT:               ed2d 0b10       vpush   {d0, d1, d2, d3, d4, d5, d6, d7}
 # DISASM-NEXT:               4661            mov     r1, r12
-# DISASM-NEXT:               f242 0000       movw r0, #8192
+# DISASM-NEXT:               f242 0004       movw r0, #8196
 # DISASM-NEXT:               f2c0 0040       movt    r0, #64
 # DISASM-NEXT:               f7ff ffe7       bl      0x401000 <.text>
 # DISASM-NEXT:               4684            mov     r12, r0
 # DISASM-NEXT:               ecbd 0b10       vpop    {d0, d1, d2, d3, d4, d5, d6, d7}
 # DISASM-NEXT:               e8bd 480f       pop.w   {r0, r1, r2, r3, r11, lr}
 # DISASM-NEXT:               4760            bx      r12
+#
+# DIR:         DelayImportDescriptorRVA: 0x2004
+# DIR-NEXT:    DelayImportDescriptorSize: 0x40
+
 
 --- !COFF
 header:
@@ -80,6 +85,14 @@ sections:
       - VirtualAddress:  0
         SymbolName:      __imp_function
         Type:            IMAGE_REL_ARM_MOV32T
+  - Name:            .rdata
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ ]
+    Alignment:       1
+    SectionData:     01
+  - Name:            .data
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ]
+    Alignment:       1
+    SectionData:     02
 symbols:
   - Name:            .text
     Value:           0

From 60e562d11aeca8020de8d50ded7f0ba9e10e8843 Mon Sep 17 00:00:00 2001
From: Quinn Dawkins <quinn.dawkins@gmail.com>
Date: Mon, 11 Mar 2024 18:24:23 -0400
Subject: [PATCH 55/95] [mlir][linalg] Add unit dim folding pattern for
 tensor.pad (#84684)

Unit extent dims that are not padded by a tensor.pad can be folded away.
When folding unit extent dims of surrounding linalg ops, this increases
the chance that the iteration space of the linalg op will align with
nearby pad ops, improving fusion opportunities.
---
 .../Dialect/Linalg/Transforms/Transforms.h    |   4 +
 .../Linalg/Transforms/DropUnitDims.cpp        | 122 ++++++++++++++++++
 .../Dialect/Linalg/drop-unit-extent-dims.mlir |  87 +++++++++++++
 3 files changed, 213 insertions(+)

diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
index 65cf19e7a4fcd..c64ecb79c5ca5 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -481,6 +481,10 @@ struct ControlDropUnitDims {
     if (auto genericOp = dyn_cast_or_null<GenericOp>(op)) {
       return llvm::to_vector(llvm::seq<unsigned>(0, genericOp.getNumLoops()));
     }
+    if (auto padOp = dyn_cast_or_null<tensor::PadOp>(op)) {
+      return llvm::to_vector(
+          llvm::seq<unsigned>(0, padOp.getSourceType().getRank()));
+    }
     return SmallVector<unsigned>{};
   };
 };
diff --git a/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp b/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp
index 45cab81be4f5f..023ea277bcf49 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp
@@ -561,6 +561,126 @@ struct DropUnitDims : public OpRewritePattern<GenericOp> {
 };
 } // namespace
 
+//===---------------------------------------------------------------------===//
+// Drop dimensions that are unit-extents within tensor operations.
+//===---------------------------------------------------------------------===//
+
+namespace {
+struct DropPadUnitDims : public OpRewritePattern<tensor::PadOp> {
+  DropPadUnitDims(MLIRContext *context, ControlDropUnitDims options = {},
+                  PatternBenefit benefit = 1)
+      : OpRewritePattern(context, benefit), options(std::move(options)) {}
+  /// Drops eligible unit dims of `padOp`'s source, pads, then restores rank.
+  LogicalResult matchAndRewrite(tensor::PadOp padOp,
+                                PatternRewriter &rewriter) const override {
+    // Ask the control function which dimensions of this pad may be dropped.
+    SmallVector<unsigned> allowedUnitDims = options.controlFn(padOp);
+    if (allowedUnitDims.empty()) {
+      return rewriter.notifyMatchFailure(
+          padOp, "control function returns no allowed unit dims to prune");
+    }
+
+    if (padOp.getSourceType().getEncoding()) {
+      return rewriter.notifyMatchFailure(
+          padOp, "cannot collapse dims of tensor with encoding");
+    }
+
+    // Fail for non-constant padding values. The body of the pad could
+    // depend on the padding indices and/or properties of the padded
+    // tensor so for now we fail.
+    // TODO: Support non-constant padding values.
+    Value paddingVal = padOp.getConstantPaddingValue();
+    if (!paddingVal) {
+      return rewriter.notifyMatchFailure(
+          padOp, "unimplemented: non-constant padding value");
+    }
+
+    ArrayRef<int64_t> sourceShape = padOp.getSourceType().getShape();
+    int64_t padRank = sourceShape.size();
+    // True iff `f` is statically known to be the integer 0.
+    auto isStaticZero = [](OpFoldResult f) {
+      std::optional<int64_t> maybeInt = getConstantIntValue(f);
+      return maybeInt && *maybeInt == 0;
+    };
+    // A dim is dropped only if it is allowed, has size 1, and is unpadded.
+    llvm::SmallDenseSet<unsigned> unitDimsFilter(allowedUnitDims.begin(),
+                                                 allowedUnitDims.end());
+    llvm::SmallDenseSet<unsigned> unitDims;
+    SmallVector<int64_t> newShape;
+    SmallVector<OpFoldResult> newLowPad;
+    SmallVector<OpFoldResult> newHighPad;
+    for (const auto [dim, size, low, high] :
+         zip_equal(llvm::seq(static_cast<int64_t>(0), padRank), sourceShape,
+                   padOp.getMixedLowPad(), padOp.getMixedHighPad())) {
+      if (unitDimsFilter.contains(dim) && size == 1 && isStaticZero(low) &&
+          isStaticZero(high)) {
+        unitDims.insert(dim);
+      } else {
+        newShape.push_back(size);
+        newLowPad.push_back(low);
+        newHighPad.push_back(high);
+      }
+    }
+
+    if (unitDims.empty()) {
+      return rewriter.notifyMatchFailure(padOp, "no unit dims to collapse");
+    }
+    // Each group is one kept dim plus the dropped dims adjacent to it.
+    ReassociationIndices reassociationGroup;
+    SmallVector<ReassociationIndices> reassociationMap;
+    int64_t dim = 0;
+    while (dim < padRank && unitDims.contains(dim))
+      reassociationGroup.push_back(dim++);
+    while (dim < padRank) {
+      assert(!unitDims.contains(dim) && "expected non unit-extent");
+      reassociationGroup.push_back(dim);
+      dim++;
+      // Absorb the dropped unit dims that directly follow into this group.
+      while (dim < padRank && unitDims.contains(dim))
+        reassociationGroup.push_back(dim++);
+      reassociationMap.push_back(reassociationGroup);
+      reassociationGroup.clear();
+    }
+
+    Value collapsedSource =
+        collapseValue(rewriter, padOp.getLoc(), padOp.getSource(), newShape,
+                      reassociationMap, options.rankReductionStrategy);
+
+    auto newPadOp = rewriter.create<tensor::PadOp>(
+        padOp.getLoc(), /*result=*/Type(), collapsedSource, newLowPad,
+        newHighPad, paddingVal, padOp.getNofold());
+    // The slice strategy expands into a fresh tensor.empty, not padOp's result.
+    Value dest = padOp.getResult();
+    if (options.rankReductionStrategy ==
+        ControlDropUnitDims::RankReductionStrategy::ExtractInsertSlice) {
+      SmallVector<OpFoldResult> expandedSizes;
+      int64_t numUnitDims = 0;
+      for (auto dim : llvm::seq(static_cast<int64_t>(0), padRank)) {
+        if (unitDims.contains(dim)) {
+          expandedSizes.push_back(rewriter.getIndexAttr(1));
+          numUnitDims++;
+          continue;
+        }
+        expandedSizes.push_back(tensor::getMixedSize(
+            rewriter, padOp.getLoc(), newPadOp, dim - numUnitDims));
+      }
+      dest = rewriter.create<tensor::EmptyOp>(
+          padOp.getLoc(), expandedSizes,
+          padOp.getResultType().getElementType());
+    }
+
+    Value expandedValue =
+        expandValue(rewriter, padOp.getLoc(), newPadOp.getResult(), dest,
+                    reassociationMap, options.rankReductionStrategy);
+    rewriter.replaceOp(padOp, expandedValue);
+    return success();
+  }
+
+private:
+  ControlDropUnitDims options;
+};
+} // namespace
+
 namespace {
 /// Convert `extract_slice` operations to rank-reduced versions.
 struct RankReducedExtractSliceOp
@@ -640,6 +760,7 @@ populateFoldUnitExtentDimsViaReshapesPatterns(RewritePatternSet &patterns,
                                               ControlDropUnitDims &options) {
   auto *context = patterns.getContext();
   patterns.add<DropUnitDims>(context, options);
+  patterns.add<DropPadUnitDims>(context, options);
   // TODO: Patterns unrelated to unit dim folding should be factored out.
   patterns.add<RankReducedExtractSliceOp,
                RankReducedInsertSliceOp<tensor::InsertSliceOp>,
@@ -661,6 +782,7 @@ populateFoldUnitExtentDimsViaSlicesPatterns(RewritePatternSet &patterns,
   options.rankReductionStrategy =
       ControlDropUnitDims::RankReductionStrategy::ExtractInsertSlice;
   patterns.add<DropUnitDims>(context, options);
+  patterns.add<DropPadUnitDims>(context, options);
   // TODO: Patterns unrelated to unit dim folding should be factored out.
   linalg::FillOp::getCanonicalizationPatterns(patterns, context);
   tensor::EmptyOp::getCanonicalizationPatterns(patterns, context);
diff --git a/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir b/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir
index 0c51a032df901..f2c490b832076 100644
--- a/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir
+++ b/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir
@@ -946,3 +946,90 @@ func.func @drop_all_loops(%arg0 : memref<1x1xf32, 3>) -> memref<1x1xf32, 3>
 // CHECK-SLICES-LABEL: func @drop_all_loops
 //       CHECK-SLICES:   memref.subview %{{.*}}[0, 0] [1, 1] [1, 1] : memref<1x1xf32, 3> to memref<f32, strided<[]>, 3>
 //       CHECK-SLICES:   linalg.generic{{.*}}memref<f32, strided<[]>, 3>
+
+// -----
+
+func.func @drop_unit_pad_dims(%arg0: tensor<1x1x3x1x1xf32>) -> tensor<1x2x3x1x3xf32>
+{
+  %c0 = arith.constant 0 : index
+  %cst0 = arith.constant 0.0 : f32
+  %0 = tensor.pad %arg0 low[0, 1, 0, %c0, 0] high[0, 0, 0, %c0, 2] {
+    ^bb0(%arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index):
+      tensor.yield %cst0 : f32
+  } : tensor<1x1x3x1x1xf32> to tensor<1x2x3x1x3xf32>
+  return %0 : tensor<1x2x3x1x3xf32>
+}
+
+// CHECK-LABEL: func @drop_unit_pad_dims
+//       CHECK:   %[[COLLAPSE:.+]] = tensor.collapse_shape
+//  CHECK-SAME:     {{\[}}[0, 1], [2, 3], [4]{{\]}} : tensor<1x1x3x1x1xf32> into tensor<1x3x1xf32>
+//       CHECK:   %[[PADDED:.+]] = tensor.pad %[[COLLAPSE]] low[1, 0, 0] high[0, 0, 2]
+//       CHECK:   } : tensor<1x3x1xf32> to tensor<2x3x3xf32>
+//       CHECK:   tensor.expand_shape %[[PADDED]]
+//  CHECK-SAME:     {{\[}}[0, 1], [2, 3], [4]{{\]}} : tensor<2x3x3xf32> into tensor<1x2x3x1x3xf32>
+
+// CHECK-SLICES-LABEL: func @drop_unit_pad_dims
+//       CHECK-SLICES:   %[[EXTRACT:.+]] = tensor.extract_slice
+//  CHECK-SLICES-SAME:     [0, 0, 0, 0, 0] [1, 1, 3, 1, 1] [1, 1, 1, 1, 1] : tensor<1x1x3x1x1xf32> to tensor<1x3x1xf32>
+//       CHECK-SLICES:   %[[PADDED:.+]] = tensor.pad %[[EXTRACT]] low[1, 0, 0] high[0, 0, 2]
+//       CHECK-SLICES:   } : tensor<1x3x1xf32> to tensor<2x3x3xf32>
+//       CHECK-SLICES:   tensor.insert_slice %[[PADDED]]
+//  CHECK-SLICES-SAME:     [0, 0, 0, 0, 0] [1, 2, 3, 1, 3] [1, 1, 1, 1, 1] : tensor<2x3x3xf32> into tensor<1x2x3x1x3xf32>
+
+// -----
+
+func.func @drop_unit_pad_dynamic_dims(%arg0: tensor<1x?xf32>) -> tensor<1x?xf32>
+{
+  %c0 = arith.constant 0 : index
+  %cst0 = arith.constant 0.0 : f32
+  %0 = tensor.pad %arg0 low[0, 5] high[0, 6] {
+    ^bb0(%arg1: index, %arg2: index):
+      tensor.yield %cst0 : f32
+  } : tensor<1x?xf32> to tensor<1x?xf32>
+  return %0 : tensor<1x?xf32>
+}
+
+// CHECK-LABEL: func @drop_unit_pad_dynamic_dims
+//       CHECK:   %[[COLLAPSE:.+]] = tensor.collapse_shape
+//  CHECK-SAME:     {{\[}}[0, 1]{{\]}} : tensor<1x?xf32> into tensor<?xf32>
+//       CHECK:   %[[PADDED:.+]] = tensor.pad %[[COLLAPSE]] low[5] high[6]
+//       CHECK:   } : tensor<?xf32> to tensor<?xf32>
+//       CHECK:   tensor.expand_shape %[[PADDED]]
+//  CHECK-SAME:     {{\[}}[0, 1]{{\]}} : tensor<?xf32> into tensor<1x?xf32>
+
+// CHECK-SLICES: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 + 11)>
+
+// CHECK-SLICES-LABEL: func @drop_unit_pad_dynamic_dims
+//  CHECK-SLICES-SAME:   %[[ARG0:[A-Za-z0-9]+]]: tensor<1x?xf32>
+//       CHECK-SLICES:   %[[DIM:.+]] = tensor.dim %[[ARG0]], %c1
+//       CHECK-SLICES:   %[[EXTRACT:.+]] = tensor.extract_slice
+//  CHECK-SLICES-SAME:     [0, 0] [1, %[[DIM]]] [1, 1] : tensor<1x?xf32> to tensor<?xf32>
+//       CHECK-SLICES:   %[[PADDED:.+]] = tensor.pad %[[EXTRACT]] low[5] high[6]
+//       CHECK-SLICES:   } : tensor<?xf32> to tensor<?xf32>
+//       CHECK-SLICES:   %[[PADDED_DIM:.+]] = affine.apply #[[$MAP]]()[%[[DIM]]]
+//       CHECK-SLICES:   %[[EMPTY:.+]] = tensor.empty(%[[PADDED_DIM]]) : tensor<1x?xf32>
+//       CHECK-SLICES:   tensor.insert_slice %[[PADDED]] into %[[EMPTY]]
+//  CHECK-SLICES-SAME:     [0, 0] [1, %[[PADDED_DIM]]] [1, 1] : tensor<?xf32> into tensor<1x?xf32>
+
+// -----
+
+func.func @do_not_drop_non_constant_padding(%arg0: tensor<1x1x3x1x1xf32>, %pad: f32) -> tensor<1x2x3x1x3xf32>
+{
+  %c0 = arith.constant 0 : index
+  %0 = tensor.pad %arg0 low[0, 1, 0, %c0, 0] high[0, 0, 0, %c0, 2] {
+    ^bb0(%arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index):
+      %0 = arith.index_cast %arg3 : index to i64
+      %1 = arith.sitofp %0 : i64 to f32
+      %add = arith.addf %pad, %1 : f32
+      tensor.yield %add : f32
+  } : tensor<1x1x3x1x1xf32> to tensor<1x2x3x1x3xf32>
+  return %0 : tensor<1x2x3x1x3xf32>
+}
+
+// CHECK-LABEL: func @do_not_drop_non_constant_padding
+//       CHECK:   tensor.pad %{{.*}} low[0, 1, 0, %c0, 0] high[0, 0, 0, %c0, 2]
+//       CHECK:   } : tensor<1x1x3x1x1xf32> to tensor<1x2x3x1x3xf32>
+
+// CHECK-SLICES-LABEL: func @do_not_drop_non_constant_padding
+//       CHECK-SLICES:   tensor.pad %{{.*}} low[0, 1, 0, %c0, 0] high[0, 0, 0, %c0, 2]
+//       CHECK-SLICES:   } : tensor<1x1x3x1x1xf32> to tensor<1x2x3x1x3xf32>

From 6397f223c456ce5a0cc246cd81673794a4860fd1 Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka@google.com>
Date: Mon, 11 Mar 2024 15:32:41 -0700
Subject: [PATCH 56/95] [clang] Fix test after #84214

---
 clang/test/CodeGen/remote-traps.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/clang/test/CodeGen/remote-traps.c b/clang/test/CodeGen/remote-traps.c
index f053d1bd157f8..6751afb96d25f 100644
--- a/clang/test/CodeGen/remote-traps.c
+++ b/clang/test/CodeGen/remote-traps.c
@@ -1,15 +1,15 @@
 // RUN: %clang_cc1 -O1 -emit-llvm -fsanitize=signed-integer-overflow -fsanitize-trap=signed-integer-overflow %s -o - | FileCheck %s 
 // RUN: %clang_cc1 -O1 -emit-llvm -fsanitize=signed-integer-overflow -fsanitize-trap=signed-integer-overflow -mllvm -clang-remove-traps -mllvm -remove-traps-random-rate=1 %s -o - | FileCheck %s --implicit-check-not="call void @llvm.ubsantrap" --check-prefixes=REMOVE
 
-int f(int x) {
+int test(int x) {
   return x + 123;
 }
 
-// CHECK-LABEL: define dso_local noundef i32 @f(
+// CHECK-LABEL: define {{.*}}i32 @test(
 // CHECK: call { i32, i1 } @llvm.sadd.with.overflow.i32(
 // CHECK: trap:
 // CHECK-NEXT: call void @llvm.ubsantrap(i8 0)
 // CHECK-NEXT: unreachable
 
-// REMOVE-LABEL: define dso_local noundef i32 @f(
+// REMOVE-LABEL: define {{.*}}i32 @test(
 // REMOVE: call { i32, i1 } @llvm.sadd.with.overflow.i32(

From a950c06d9864ec34d401702f398dc09fbec87891 Mon Sep 17 00:00:00 2001
From: Connor Sughrue <55301806+cpsughrue@users.noreply.github.com>
Date: Mon, 11 Mar 2024 15:41:50 -0700
Subject: [PATCH 57/95] [CI] Run pre-merge build with -k 0 placed after
 "${BUILD_DIR}" (#84846)

#84828 added `-k 0` to pre-merge CI so that if one job fails the others
would continue building. This pull request fixes the location of `-k 0`
in the ninja command line.

Resolves #84842 and #83371
---
 .ci/monolithic-linux.sh   | 2 +-
 .ci/monolithic-windows.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.ci/monolithic-linux.sh b/.ci/monolithic-linux.sh
index fe1a9e57ff4aa..9e670c447fbad 100755
--- a/.ci/monolithic-linux.sh
+++ b/.ci/monolithic-linux.sh
@@ -54,4 +54,4 @@ cmake -S ${MONOREPO_ROOT}/llvm -B ${BUILD_DIR} \
 
 echo "--- ninja"
 # Targets are not escaped as they are passed as separate arguments.
-ninja -C -k 0 "${BUILD_DIR}" ${targets}
+ninja -C "${BUILD_DIR}" -k 0 ${targets}
diff --git a/.ci/monolithic-windows.sh b/.ci/monolithic-windows.sh
index c12e5544c1a18..52ba13036f915 100755
--- a/.ci/monolithic-windows.sh
+++ b/.ci/monolithic-windows.sh
@@ -62,4 +62,4 @@ cmake -S ${MONOREPO_ROOT}/llvm -B ${BUILD_DIR} \
 
 echo "--- ninja"
 # Targets are not escaped as they are passed as separate arguments.
-ninja -C -k 0 "${BUILD_DIR}" ${targets}
+ninja -C "${BUILD_DIR}" -k 0 ${targets}

From 83c9244ae4bee8a494a7abe313a6e9f22ac4be55 Mon Sep 17 00:00:00 2001
From: Yinying Li <yinyingli@google.com>
Date: Mon, 11 Mar 2024 18:44:32 -0400
Subject: [PATCH 58/95] [mlir][sparse] Migrate more tests to use
 sparse_tensor.print (#84833)

Continued effort following #84249.
---
 .../CPU/concatenate_dim_0_permute.mlir        |  78 ++--
 .../SparseTensor/CPU/concatenate_dim_1.mlir   |  58 ++-
 .../CPU/concatenate_dim_1_permute.mlir        |  72 ++--
 .../SparseTensor/CPU/dual_sparse_conv_2d.mlir |  99 +++--
 .../Dialect/SparseTensor/CPU/reshape_dot.mlir |   4 +-
 .../SparseTensor/CPU/sparse_block3d.mlir      |  38 +-
 .../Dialect/SparseTensor/CPU/sparse_cast.mlir |   4 +-
 .../Dialect/SparseTensor/CPU/sparse_cmp.mlir  |  49 ++-
 .../SparseTensor/CPU/sparse_codegen_dim.mlir  |   4 +-
 .../CPU/sparse_codegen_foreach.mlir           |  11 +-
 .../CPU/sparse_collapse_shape.mlir            | 128 ++++---
 .../CPU/sparse_constant_to_sparse_tensor.mlir |  31 +-
 .../CPU/sparse_conv_1d_nwc_wcf.mlir           |  53 +--
 .../SparseTensor/CPU/sparse_conv_2d.mlir      | 125 ++++---
 .../SparseTensor/CPU/sparse_conv_2d_55.mlir   |   4 +-
 .../CPU/sparse_conv_2d_nchw_fchw.mlir         |   4 +-
 .../CPU/sparse_conv_2d_nhwc_hwcf.mlir         | 166 +++++----
 .../SparseTensor/CPU/sparse_conv_3d.mlir      | 344 +++++++++---------
 .../CPU/sparse_conv_3d_ndhwc_dhwcf.mlir       | 212 ++++++-----
 19 files changed, 777 insertions(+), 707 deletions(-)

diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_0_permute.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_0_permute.mlir
index 11edd854ec08a..9c9b0e3330c9c 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_0_permute.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_0_permute.mlir
@@ -10,7 +10,7 @@
 // DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
 // DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
 // DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
-// DEFINE: %{run_opts} = -e entry -entry-point-result=void
+// DEFINE: %{run_opts} = -e main -entry-point-result=void
 // DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
 // DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
 
@@ -99,20 +99,6 @@ module {
     return
   }
 
-  func.func @dump_mat_perm_9x4(%A: tensor<9x4xf64, #MAT_C_C_P>) {
-    %c = sparse_tensor.convert %A : tensor<9x4xf64, #MAT_C_C_P> to tensor<9x4xf64>
-    %cu = tensor.cast %c : tensor<9x4xf64> to tensor<*xf64>
-    call @printMemrefF64(%cu) : (tensor<*xf64>) -> ()
-
-    %n = sparse_tensor.number_of_entries %A : tensor<9x4xf64, #MAT_C_C_P>
-    vector.print %n : index
-
-    %1 = sparse_tensor.values %A : tensor<9x4xf64, #MAT_C_C_P> to memref<?xf64>
-    call @printMemref1dF64(%1) : (memref<?xf64>) -> ()
-
-    return
-  }
-
   func.func @dump_mat_dense_9x4(%A: tensor<9x4xf64>) {
     %u = tensor.cast %A : tensor<9x4xf64> to tensor<*xf64>
     call @printMemrefF64(%u) : (tensor<*xf64>) -> ()
@@ -120,18 +106,8 @@ module {
     return
   }
 
-  func.func @dump_mat_annotated_dense_9x4(%A: tensor<9x4xf64, #MAT_D_D>) {
-    %n = sparse_tensor.number_of_entries %A : tensor<9x4xf64, #MAT_D_D>
-    vector.print %n : index
-
-    %1 = sparse_tensor.values %A : tensor<9x4xf64, #MAT_D_D> to memref<?xf64>
-    call @printMemref1dF64(%1) : (memref<?xf64>) -> ()
-
-    return
-  }
-
   // Driver method to call and verify kernels.
-  func.func @entry() {
+  func.func @main() {
     %m42 = arith.constant dense<
       [ [ 1.0, 0.0 ],
         [ 3.1, 0.0 ],
@@ -163,20 +139,21 @@ module {
     %sm34cdp = sparse_tensor.convert %m34 : tensor<3x4xf64> to tensor<3x4xf64, #MAT_C_D_P>
     %sm44dcp = sparse_tensor.convert %m44 : tensor<4x4xf64> to tensor<4x4xf64, #MAT_D_C_P>
 
-    // CHECK:      {{\[}}[1,   0,   3,   0],
-    // CHECK-NEXT:  [0,   2,   0,   0],
-    // CHECK-NEXT:  [1,   0,   1,   1],
-    // CHECK-NEXT:  [0,   0.5,   0,   0],
-    // CHECK-NEXT:  [1,   5,   2,   0],
-    // CHECK-NEXT:  [0,   0,   1.5,   1],
-    // CHECK-NEXT:  [0,   3.5,   0,   0],
-    // CHECK-NEXT:  [1,   5,   2,   0],
-    // CHECK-NEXT:  [1,   0.5,   0,   0]]
-    // CHECK-NEXT: 18
-    // CHECK:      [1,  1,  1,  1,  1,  2,  0.5,  5,  3.5,  5,  0.5,  3,  1,  2,  1.5,  2,  1,  1
+    //
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 18
+    // CHECK-NEXT: dim = ( 9, 4 )
+    // CHECK-NEXT: lvl = ( 4, 9 )
+    // CHECK-NEXT: pos[0] : ( 0, 4
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3
+    // CHECK-NEXT: pos[1] : ( 0, 5, 11, 16, 18
+    // CHECK-NEXT: crd[1] : ( 0, 2, 4, 7, 8, 1, 3, 4, 6, 7, 8, 0, 2, 4, 5, 7, 2, 5
+    // CHECK-NEXT: values : ( 1, 1, 1, 1, 1, 2, 0.5, 5, 3.5, 5, 0.5, 3, 1, 2, 1.5, 2, 1, 1
+    // CHECK-NEXT: ----
+    //
     %4 = call @concat_sparse_sparse_perm(%sm24ccp, %sm34cd, %sm44dc)
                : (tensor<2x4xf64, #MAT_C_C_P>, tensor<3x4xf64, #MAT_C_D>, tensor<4x4xf64, #MAT_D_C>) -> tensor<9x4xf64, #MAT_C_C_P>
-    call @dump_mat_perm_9x4(%4) : (tensor<9x4xf64, #MAT_C_C_P>) -> ()
+    sparse_tensor.print %4 : tensor<9x4xf64, #MAT_C_C_P>
 
     // CHECK:      {{\[}}[1,   0,   3,   0],
     // CHECK-NEXT:  [0,   2,   0,   0],
@@ -191,20 +168,21 @@ module {
                : (tensor<2x4xf64, #MAT_C_C_P>, tensor<3x4xf64, #MAT_C_D_P>, tensor<4x4xf64, #MAT_D_C>) -> tensor<9x4xf64>
     call @dump_mat_dense_9x4(%5) : (tensor<9x4xf64>) -> ()
 
-    // CHECK:      {{\[}}[1,   0,   3,   0],
-    // CHECK-NEXT:  [0,   2,   0,   0],
-    // CHECK-NEXT:  [1,   0,   1,   1],
-    // CHECK-NEXT:  [0,   0.5,   0,   0],
-    // CHECK-NEXT:  [1,   5,   2,   0],
-    // CHECK-NEXT:  [0,   0,   1.5,   1],
-    // CHECK-NEXT:  [0,   3.5,   0,   0],
-    // CHECK-NEXT:  [1,   5,   2,   0],
-    // CHECK-NEXT:  [1,   0.5,   0,   0]]
-    // CHECK-NEXT: 18
-    // CHECK:      [1,  3,  2,  1,  1,  1,  0.5,  1,  5,  2,  1.5,  1,  3.5,  1,  5,  2,  1,  0.5
+    //
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 18
+    // CHECK-NEXT: dim = ( 9, 4 )
+    // CHECK-NEXT: lvl = ( 9, 4 )
+    // CHECK-NEXT: pos[0] : ( 0, 9
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5, 6, 7, 8
+    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 6, 7, 10, 12, 13, 16, 18
+    // CHECK-NEXT: crd[1] : ( 0, 2, 1, 0, 2, 3, 1, 0, 1, 2, 2, 3, 1, 0, 1, 2, 0, 1
+    // CHECK-NEXT: values : ( 1, 3, 2, 1, 1, 1, 0.5, 1, 5, 2, 1.5, 1, 3.5, 1, 5, 2, 1, 0.5
+    // CHECK-NEXT: ----
+    //
     %6 = call @concat_mix_sparse_perm(%m24, %sm34cdp, %sm44dc)
                : (tensor<2x4xf64>, tensor<3x4xf64, #MAT_C_D_P>, tensor<4x4xf64, #MAT_D_C>) -> tensor<9x4xf64, #MAT_C_C>
-    call @dump_mat_9x4(%6) : (tensor<9x4xf64, #MAT_C_C>) -> ()
+    sparse_tensor.print %6 : tensor<9x4xf64, #MAT_C_C>
 
     // CHECK:      {{\[}}[1,   0,   3,   0],
     // CHECK-NEXT:  [0,   2,   0,   0],
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_1.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_1.mlir
index 48d3825700920..ae067bf18527b 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_1.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_1.mlir
@@ -10,7 +10,7 @@
 // DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
 // DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
 // DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
-// DEFINE: %{run_opts} = -e entry -entry-point-result=void
+// DEFINE: %{run_opts} = -e main -entry-point-result=void
 // DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
 // DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
 
@@ -82,20 +82,6 @@ module {
     return %0 : tensor<4x9xf64>
   }
 
-  func.func @dump_mat_4x9(%A: tensor<4x9xf64, #MAT_C_C>) {
-    %c = sparse_tensor.convert %A : tensor<4x9xf64, #MAT_C_C> to tensor<4x9xf64>
-    %cu = tensor.cast %c : tensor<4x9xf64> to tensor<*xf64>
-    call @printMemrefF64(%cu) : (tensor<*xf64>) -> ()
-
-    %n = sparse_tensor.number_of_entries %A : tensor<4x9xf64, #MAT_C_C>
-    vector.print %n : index
-
-    %1 = sparse_tensor.values %A : tensor<4x9xf64, #MAT_C_C> to memref<?xf64>
-    call @printMemref1dF64(%1) : (memref<?xf64>) -> ()
-
-    return
-  }
-
   func.func @dump_mat_dense_4x9(%A: tensor<4x9xf64>) {
     %1 = tensor.cast %A : tensor<4x9xf64> to tensor<*xf64>
     call @printMemrefF64(%1) : (tensor<*xf64>) -> ()
@@ -104,7 +90,7 @@ module {
   }
 
   // Driver method to call and verify kernels.
-  func.func @entry() {
+  func.func @main() {
     %m42 = arith.constant dense<
       [ [ 1.0, 0.0 ],
         [ 3.1, 0.0 ],
@@ -125,15 +111,21 @@ module {
     %sm43cd = sparse_tensor.convert %m43 : tensor<4x3xf64> to tensor<4x3xf64, #MAT_C_D>
     %sm44dc = sparse_tensor.convert %m44 : tensor<4x4xf64> to tensor<4x4xf64, #MAT_D_C>
 
-    // CHECK:      {{\[}}[1,   0,   1,   0,   1,   0,   0,   1.5,   1],
-    // CHECK-NEXT:  [3.1,   0,   1,   0,   0.5,   0,   3.5,   0,   0],
-    // CHECK-NEXT:  [0,   2,   0,   0,   1,   1,   5,   2,   0],
-    // CHECK-NEXT:  [0,   0,   5,   2,   0,   1,   0.5,   0,   0]]
-    // CHECK-NEXT: 18
-    // CHECK:      [1,  1,  1,  1.5,  1,  3.1,  1,  0.5,  3.5,  2,  1,  1,  5,  2,  5,  2,  1,  0.5
+    //
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 18
+    // CHECK-NEXT: dim = ( 4, 9 )
+    // CHECK-NEXT: lvl = ( 4, 9 )
+    // CHECK-NEXT: pos[0] : ( 0, 4
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3
+    // CHECK-NEXT: pos[1] : ( 0, 5, 9, 14, 18
+    // CHECK-NEXT: crd[1] : ( 0, 2, 4, 7, 8, 0, 2, 4, 6, 1, 4, 5, 6, 7, 2, 3, 5, 6
+    // CHECK-NEXT: values : ( 1, 1, 1, 1.5, 1, 3.1, 1, 0.5, 3.5, 2, 1, 1, 5, 2, 5, 2, 1, 0.5
+    // CHECK-NEXT: ----
+    //
     %8 = call @concat_sparse_sparse_dim1(%sm42cc, %sm43cd, %sm44dc)
                : (tensor<4x2xf64, #MAT_C_C>, tensor<4x3xf64, #MAT_C_D>, tensor<4x4xf64, #MAT_D_C>) -> tensor<4x9xf64, #MAT_C_C>
-    call @dump_mat_4x9(%8) : (tensor<4x9xf64, #MAT_C_C>) -> ()
+    sparse_tensor.print %8 : tensor<4x9xf64, #MAT_C_C>
 
     // CHECK:      {{\[}}[1,   0,   1,   0,   1,   0,   0,   1.5,   1],
     // CHECK-NEXT:  [3.1,   0,   1,   0,   0.5,   0,   3.5,   0,   0],
@@ -143,15 +135,21 @@ module {
                : (tensor<4x2xf64, #MAT_C_C>, tensor<4x3xf64, #MAT_C_D>, tensor<4x4xf64, #MAT_D_C>) -> tensor<4x9xf64>
     call @dump_mat_dense_4x9(%9) : (tensor<4x9xf64>) -> ()
 
-    // CHECK:      {{\[}}[1,   0,   1,   0,   1,   0,   0,   1.5,   1],
-    // CHECK-NEXT:  [3.1,   0,   1,   0,   0.5,   0,   3.5,   0,   0],
-    // CHECK-NEXT:  [0,   2,   0,   0,   1,   1,   5,   2,   0],
-    // CHECK-NEXT:  [0,   0,   5,   2,   0,   1,   0.5,   0,   0]]
-    // CHECK-NEXT: 18
-    // CHECK:      [1,  1,  1,  1.5,  1,  3.1,  1,  0.5,  3.5,  2,  1,  1,  5,  2,  5,  2,  1,  0.5
+    //
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 18
+    // CHECK-NEXT: dim = ( 4, 9 )
+    // CHECK-NEXT: lvl = ( 4, 9 )
+    // CHECK-NEXT: pos[0] : ( 0, 4
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3
+    // CHECK-NEXT: pos[1] : ( 0, 5, 9, 14, 18
+    // CHECK-NEXT: crd[1] : ( 0, 2, 4, 7, 8, 0, 2, 4, 6, 1, 4, 5, 6, 7, 2, 3, 5, 6
+    // CHECK-NEXT: values : ( 1, 1, 1, 1.5, 1, 3.1, 1, 0.5, 3.5, 2, 1, 1, 5, 2, 5, 2, 1, 0.5
+    // CHECK-NEXT: ----
+    //
     %10 = call @concat_mix_sparse_dim1(%m42, %sm43cd, %sm44dc)
                : (tensor<4x2xf64>, tensor<4x3xf64, #MAT_C_D>, tensor<4x4xf64, #MAT_D_C>) -> tensor<4x9xf64, #MAT_C_C>
-    call @dump_mat_4x9(%10) : (tensor<4x9xf64, #MAT_C_C>) -> ()
+    sparse_tensor.print %10 : tensor<4x9xf64, #MAT_C_C>
 
     // CHECK:      {{\[}}[1,   0,   1,   0,   1,   0,   0,   1.5,   1],
     // CHECK-NEXT:  [3.1,   0,   1,   0,   0.5,   0,   3.5,   0,   0],
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_1_permute.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_1_permute.mlir
index dcdaa072c02fd..ce746f27c4d88 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_1_permute.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_1_permute.mlir
@@ -10,7 +10,7 @@
 // DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
 // DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
 // DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
-// DEFINE: %{run_opts} = -e entry -entry-point-result=void
+// DEFINE: %{run_opts} = -e main -entry-point-result=void
 // DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
 // DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
 
@@ -85,34 +85,6 @@ module {
     return %0 : tensor<4x9xf64>
   }
 
-  func.func @dump_mat_4x9(%A: tensor<4x9xf64, #MAT_C_C>) {
-    %c = sparse_tensor.convert %A : tensor<4x9xf64, #MAT_C_C> to tensor<4x9xf64>
-    %cu = tensor.cast %c : tensor<4x9xf64> to tensor<*xf64>
-    call @printMemrefF64(%cu) : (tensor<*xf64>) -> ()
-
-    %n = sparse_tensor.number_of_entries %A : tensor<4x9xf64, #MAT_C_C>
-    vector.print %n : index
-
-    %1 = sparse_tensor.values %A : tensor<4x9xf64, #MAT_C_C> to memref<?xf64>
-    call @printMemref1dF64(%1) : (memref<?xf64>) -> ()
-
-    return
-  }
-
-  func.func @dump_mat_perm_4x9(%A: tensor<4x9xf64, #MAT_C_C_P>) {
-    %c = sparse_tensor.convert %A : tensor<4x9xf64, #MAT_C_C_P> to tensor<4x9xf64>
-    %cu = tensor.cast %c : tensor<4x9xf64> to tensor<*xf64>
-    call @printMemrefF64(%cu) : (tensor<*xf64>) -> ()
-
-    %n = sparse_tensor.number_of_entries %A : tensor<4x9xf64, #MAT_C_C_P>
-    vector.print %n : index
-
-    %1 = sparse_tensor.values %A : tensor<4x9xf64, #MAT_C_C_P> to memref<?xf64>
-    call @printMemref1dF64(%1) : (memref<?xf64>) -> ()
-
-    return
-  }
-
   func.func @dump_mat_dense_4x9(%A: tensor<4x9xf64>) {
     %1 = tensor.cast %A : tensor<4x9xf64> to tensor<*xf64>
     call @printMemrefF64(%1) : (tensor<*xf64>) -> ()
@@ -121,7 +93,7 @@ module {
   }
 
   // Driver method to call and verify kernels.
-  func.func @entry() {
+  func.func @main() {
     %m42 = arith.constant dense<
       [ [ 1.0, 0.0 ],
         [ 3.1, 0.0 ],
@@ -153,15 +125,21 @@ module {
     %sm43cdp = sparse_tensor.convert %m43 : tensor<4x3xf64> to tensor<4x3xf64, #MAT_C_D_P>
     %sm44dcp = sparse_tensor.convert %m44 : tensor<4x4xf64> to tensor<4x4xf64, #MAT_D_C_P>
 
-    // CHECK:      {{\[}}[1,   0,   1,   0,   1,   0,   0,   1.5,   1],
-    // CHECK-NEXT:  [3.1,   0,   1,   0,   0.5,   0,   3.5,   0,   0],
-    // CHECK-NEXT:  [0,   2,   0,   0,   1,   1,   5,   2,   0],
-    // CHECK-NEXT:  [0,   0,   5,   2,   0,   1,   0.5,   0,   0]]
-    // CHECK-NEXT: 18
-    // CHECK:      [1,  3.1,  2,  1,  1,  5,  2,  1,  0.5,  1,  1,  1,  3.5,  5,  0.5,  1.5,  2,  1
+    //
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 18
+    // CHECK-NEXT: dim = ( 4, 9 )
+    // CHECK-NEXT: lvl = ( 9, 4 )
+    // CHECK-NEXT: pos[0] : ( 0, 9
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5, 6, 7, 8
+    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 6, 7, 10, 12, 15, 17, 18
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 0, 1, 3, 3, 0, 1, 2, 2, 3, 1, 2, 3, 0, 2, 0
+    // CHECK-NEXT: values : ( 1, 3.1, 2, 1, 1, 5, 2, 1, 0.5, 1, 1, 1, 3.5, 5, 0.5, 1.5, 2, 1
+    // CHECK-NEXT: ----
+    //
     %12 = call @concat_sparse_sparse_perm_dim1(%sm42ccp, %sm43cd, %sm44dc)
                : (tensor<4x2xf64, #MAT_C_C_P>, tensor<4x3xf64, #MAT_C_D>, tensor<4x4xf64, #MAT_D_C>) -> tensor<4x9xf64, #MAT_C_C_P>
-    call @dump_mat_perm_4x9(%12) : (tensor<4x9xf64, #MAT_C_C_P>) -> ()
+    sparse_tensor.print %12 : tensor<4x9xf64, #MAT_C_C_P>
 
     // CHECK:      {{\[}}[1,   0,   1,   0,   1,   0,   0,   1.5,   1],
     // CHECK-NEXT:  [3.1,   0,   1,   0,   0.5,   0,   3.5,   0,   0],
@@ -171,15 +149,21 @@ module {
                : (tensor<4x2xf64, #MAT_C_C_P>, tensor<4x3xf64, #MAT_C_D_P>, tensor<4x4xf64, #MAT_D_C>) -> tensor<4x9xf64>
     call @dump_mat_dense_4x9(%13) : (tensor<4x9xf64>) -> ()
 
-    // CHECK:      {{\[}}[1,   0,   1,   0,   1,   0,   0,   1.5,   1],
-    // CHECK-NEXT:  [3.1,   0,   1,   0,   0.5,   0,   3.5,   0,   0],
-    // CHECK-NEXT:  [0,   2,   0,   0,   1,   1,   5,   2,   0],
-    // CHECK-NEXT:  [0,   0,   5,   2,   0,   1,   0.5,   0,   0]]
-    // CHECK-NEXT: 18
-    // CHECK:      [1,  1,  1,  1.5,  1,  3.1,  1,  0.5,  3.5,  2,  1,  1,  5,  2,  5,  2,  1,  0.5
+    //
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 18
+    // CHECK-NEXT: dim = ( 4, 9 )
+    // CHECK-NEXT: lvl = ( 4, 9 )
+    // CHECK-NEXT: pos[0] : ( 0, 4
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3
+    // CHECK-NEXT: pos[1] : ( 0, 5, 9, 14, 18
+    // CHECK-NEXT: crd[1] : ( 0, 2, 4, 7, 8, 0, 2, 4, 6, 1, 4, 5, 6, 7, 2, 3, 5, 6
+    // CHECK-NEXT: values : ( 1, 1, 1, 1.5, 1, 3.1, 1, 0.5, 3.5, 2, 1, 1, 5, 2, 5, 2, 1, 0.5
+    // CHECK-NEXT: ----
+    //
     %14 = call @concat_mix_sparse_perm_dim1(%m42, %sm43cdp, %sm44dc)
                : (tensor<4x2xf64>, tensor<4x3xf64, #MAT_C_D_P>, tensor<4x4xf64, #MAT_D_C>) -> tensor<4x9xf64, #MAT_C_C>
-    call @dump_mat_4x9(%14) : (tensor<4x9xf64, #MAT_C_C>) -> ()
+    sparse_tensor.print %14 : tensor<4x9xf64, #MAT_C_C>
 
     // CHECK:      {{\[}}[1,   0,   1,   0,   1,   0,   0,   1.5,   1],
     // CHECK-NEXT:  [3.1,   0,   1,   0,   0.5,   0,   3.5,   0,   0],
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/dual_sparse_conv_2d.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/dual_sparse_conv_2d.mlir
index 6c35e2b51ed8f..350b5b41dafc0 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/dual_sparse_conv_2d.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/dual_sparse_conv_2d.mlir
@@ -10,7 +10,7 @@
 // DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
 // DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
 // DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
-// DEFINE: %{run_opts} = -e entry -entry-point-result=void
+// DEFINE: %{run_opts} = -e main -entry-point-result=void
 // DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
 // DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
 //
@@ -85,7 +85,7 @@ module {
     return %0 : tensor<6x6xi32, #CSC>
   }
 
-  func.func @entry() {
+  func.func @main() {
     %c0 = arith.constant 0 : index
     %i0 = arith.constant 0 : i32
 
@@ -141,7 +141,6 @@ module {
        : (tensor<8x8xi32, #CSC>,
           tensor<3x3xi32, #CSC>) -> tensor<6x6xi32, #CSC>
 
-
     // Verify the output.
     //
     // CHECK:    ( ( 0, 0, -1, -6, -1, 6 ),
@@ -156,64 +155,62 @@ module {
     vector.print %v : vector<6x6xi32>
 
     //
-    // Should be the same as dense output
-    // CHECK:    ( ( 0, 0, -1, -6, -1, 6 ),
-    // CHECK-SAME: ( -1, 0, 1, 0, 1, 0 ),
-    // CHECK-SAME: ( 0, -1, 1, 0, 0, 0 ),
-    // CHECK-SAME: ( -1, 0, 0, 0, 0, 0 ),
-    // CHECK-SAME: ( 0, 0, 3, 6, -3, -6 ),
-    // CHECK-SAME: ( 2, -1, 3, 0, -3, 0 ) )
+    // Should be the same as dense output.
     //
-    %all_sparse_DCSR = sparse_tensor.convert %2
-      : tensor<6x6xi32, #DCSR> to tensor<6x6xi32>
-    %v2 = vector.transfer_read %all_sparse_DCSR[%c0, %c0], %i0
-      : tensor<6x6xi32>, vector<6x6xi32>
-    vector.print %v2 : vector<6x6xi32>
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 36
+    // CHECK-NEXT: dim = ( 6, 6 )
+    // CHECK-NEXT: lvl = ( 6, 6 )
+    // CHECK-NEXT: pos[0] : ( 0, 6
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5
+    // CHECK-NEXT: pos[1] : ( 0, 6, 12, 18, 24, 30, 36
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
+    // CHECK-NEXT: values : ( 0, 0, -1, -6, -1, 6, -1, 0, 1, 0, 1, 0, 0, -1, 1, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 3, 6, -3, -6, 2, -1, 3, 0, -3, 0
+    // CHECK-NEXT: ----
+    //
+    sparse_tensor.print %2 : tensor<6x6xi32, #DCSR>
 
     //
-    // Should be the same as dense output
-    // CHECK:    ( ( 0, 0, -1, -6, -1, 6 ),
-    // CHECK-SAME: ( -1, 0, 1, 0, 1, 0 ),
-    // CHECK-SAME: ( 0, -1, 1, 0, 0, 0 ),
-    // CHECK-SAME: ( -1, 0, 0, 0, 0, 0 ),
-    // CHECK-SAME: ( 0, 0, 3, 6, -3, -6 ),
-    // CHECK-SAME: ( 2, -1, 3, 0, -3, 0 ) )
+    // Should be the same as dense output.
     //
-    %all_sparse_CD = sparse_tensor.convert %4
-      : tensor<6x6xi32, #CDR> to tensor<6x6xi32>
-    %v4 = vector.transfer_read %all_sparse_CD[%c0, %c0], %i0
-      : tensor<6x6xi32>, vector<6x6xi32>
-    vector.print %v4 : vector<6x6xi32>
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 36
+    // CHECK-NEXT: dim = ( 6, 6 )
+    // CHECK-NEXT: lvl = ( 6, 6 )
+    // CHECK-NEXT: pos[1] : ( 0, 6, 12, 18, 24, 30, 36
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
+    // CHECK-NEXT: values : ( 0, 0, -1, -6, -1, 6, -1, 0, 1, 0, 1, 0, 0, -1, 1, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 3, 6, -3, -6, 2, -1, 3, 0, -3, 0
+    // CHECK-NEXT: ----
+    //
+    sparse_tensor.print %3 : tensor<6x6xi32, #CSR>
 
     //
-    // Should be the same as dense output
-    // CHECK:    ( ( 0, 0, -1, -6, -1, 6 ),
-    // CHECK-SAME: ( -1, 0, 1, 0, 1, 0 ),
-    // CHECK-SAME: ( 0, -1, 1, 0, 0, 0 ),
-    // CHECK-SAME: ( -1, 0, 0, 0, 0, 0 ),
-    // CHECK-SAME: ( 0, 0, 3, 6, -3, -6 ),
-    // CHECK-SAME: ( 2, -1, 3, 0, -3, 0 ) )
+    // Should be the same as dense output.
     //
-    %all_sparse_CSR = sparse_tensor.convert %3
-      : tensor<6x6xi32, #CSR> to tensor<6x6xi32>
-    %v3 = vector.transfer_read %all_sparse_CSR[%c0, %c0], %i0
-      : tensor<6x6xi32>, vector<6x6xi32>
-    vector.print %v3 : vector<6x6xi32>
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 36
+    // CHECK-NEXT: dim = ( 6, 6 )
+    // CHECK-NEXT: lvl = ( 6, 6 )
+    // CHECK-NEXT: pos[0] : ( 0, 6
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5
+    // CHECK-NEXT: values : ( 0, 0, -1, -6, -1, 6, -1, 0, 1, 0, 1, 0, 0, -1, 1, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 3, 6, -3, -6, 2, -1, 3, 0, -3, 0
+    // CHECK-NEXT: ----
+    //
+    sparse_tensor.print %4 : tensor<6x6xi32, #CDR>
 
     //
-    // Should be the same as dense output
-    // CHECK:    ( ( 0, 0, -1, -6, -1, 6 ),
-    // CHECK-SAME: ( -1, 0, 1, 0, 1, 0 ),
-    // CHECK-SAME: ( 0, -1, 1, 0, 0, 0 ),
-    // CHECK-SAME: ( -1, 0, 0, 0, 0, 0 ),
-    // CHECK-SAME: ( 0, 0, 3, 6, -3, -6 ),
-    // CHECK-SAME: ( 2, -1, 3, 0, -3, 0 ) )
+    // Should be the same as dense output.
     //
-    %all_sparse_CSC = sparse_tensor.convert %5
-      : tensor<6x6xi32, #CSC> to tensor<6x6xi32>
-    %v5 = vector.transfer_read %all_sparse_CSC[%c0, %c0], %i0
-      : tensor<6x6xi32>, vector<6x6xi32>
-    vector.print %v5 : vector<6x6xi32>
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 36
+    // CHECK-NEXT: dim = ( 6, 6 )
+    // CHECK-NEXT: lvl = ( 6, 6 )
+    // CHECK-NEXT: pos[1] : ( 0, 6, 12, 18, 24, 30, 36
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
+    // CHECK-NEXT: values : ( 0, -1, 0, -1, 0, 2, 0, 0, -1, 0, 0, -1, -1, 1, 1, 0, 3, 3, -6, 0, 0, 0, 6, 0, -1, 1, 0, 0, -3, -3, 6, 0, 0, 0, -6, 0
+    // CHECK-NEXT: ----
+    //
+    sparse_tensor.print %5 : tensor<6x6xi32, #CSC>
 
     // Release the resources.
     bufferization.dealloc_tensor %sparse_input_DCSR : tensor<8x8xi32, #DCSR>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/reshape_dot.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/reshape_dot.mlir
index 689428c23f7d7..ebf9f4392d859 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/reshape_dot.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/reshape_dot.mlir
@@ -10,7 +10,7 @@
 // DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
 // DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
 // DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
-// DEFINE: %{run_opts} = -e entry -entry-point-result=void
+// DEFINE: %{run_opts} = -e main -entry-point-result=void
 // DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
 // DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
 //
@@ -84,7 +84,7 @@ module {
   }
 
 
-  func.func @entry() {
+  func.func @main() {
     // Setup two sparse vectors.
     %d1 = arith.constant sparse<
         [ [0, 0], [1, 1], [2, 2], [2, 3], [4, 5] ],
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block3d.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block3d.mlir
index 024e86b4f165b..2ff73923c8327 100755
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block3d.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block3d.mlir
@@ -90,28 +90,38 @@ module {
     // ending at index (3,3,2)) with a “DCSR-flavored” along (j,k) with
     // dense “fibers” in the i-dim, we end up with 8 stored entries.
     //
-    // CHECK: 8
-    // CHECK-NEXT: ( 1, 2, 3, 4, 5, 6, 7, 8 )
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 8
+    // CHECK-NEXT: dim = ( 4, 4, 4 )
+    // CHECK-NEXT: lvl = ( 4, 4, 4 )
+    // CHECK-NEXT: pos[0] : ( 0, 2
+    // CHECK-NEXT: crd[0] : ( 0, 3
+    // CHECK-NEXT: pos[1] : ( 0, 1, 2
+    // CHECK-NEXT: crd[1] : ( 0, 2
+    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8
+    // CHECK-NEXT: ----
     //
-    %na = sparse_tensor.number_of_entries %a : tensor<4x4x4xi32, #Sparse1>
-    vector.print %na : index
-    %ma = sparse_tensor.values %a: tensor<4x4x4xi32, #Sparse1> to memref<?xi32>
-    %va = vector.transfer_read %ma[%c0], %i0: memref<?xi32>, vector<8xi32>
-    vector.print %va : vector<8xi32>
+    sparse_tensor.print %a : tensor<4x4x4xi32, #Sparse1>
 
     //
     // If we store full 2x2x2 3-D blocks in the original index order
     // in a compressed fashion, we end up with 4 blocks to incorporate
     // all the nonzeros, and thus 32 stored entries.
     //
-    // CHECK: 32
-    // CHECK-NEXT: ( 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 5, 0, 0, 0, 6, 0, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 7, 0, 0, 0, 8, 0 )
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 32
+    // CHECK-NEXT: dim = ( 4, 4, 4 )
+    // CHECK-NEXT: lvl = ( 2, 2, 2, 2, 2, 2 )
+    // CHECK-NEXT: pos[0] : ( 0, 2
+    // CHECK-NEXT: crd[0] : ( 0, 1
+    // CHECK-NEXT: pos[1] : ( 0, 2, 4
+    // CHECK-NEXT: crd[1] : ( 0, 1, 0, 1
+    // CHECK-NEXT: pos[2] : ( 0, 1, 2, 3, 4
+    // CHECK-NEXT: crd[2] : ( 0, 1, 0, 1
+    // CHECK-NEXT: values : ( 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 5, 0, 0, 0, 6, 0, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 7, 0, 0, 0, 8, 0
+    // CHECK-NEXT: ----
     //
-    %nb = sparse_tensor.number_of_entries %b : tensor<4x4x4xi32, #Sparse2>
-    vector.print %nb : index
-    %mb = sparse_tensor.values %b: tensor<4x4x4xi32, #Sparse2> to memref<?xi32>
-    %vb = vector.transfer_read %mb[%c0], %i0: memref<?xi32>, vector<32xi32>
-    vector.print %vb : vector<32xi32>
+    sparse_tensor.print %b : tensor<4x4x4xi32, #Sparse2>
 
     // Release the resources.
     bufferization.dealloc_tensor %a : tensor<4x4x4xi32, #Sparse1>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_cast.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_cast.mlir
index 6efe7b334b984..3b5168db23c58 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_cast.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_cast.mlir
@@ -10,7 +10,7 @@
 // DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
 // DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
 // DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
-// DEFINE: %{run_opts} = -e entry -entry-point-result=void
+// DEFINE: %{run_opts} = -e main -entry-point-result=void
 // DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
 // DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
 //
@@ -178,7 +178,7 @@ module {
   // Main driver that converts a dense tensor into a sparse tensor
   // and then calls the sparse casting kernel.
   //
-  func.func @entry() {
+  func.func @main() {
     %z = arith.constant 0 : index
     %b = arith.constant 0 : i8
     %i = arith.constant 0 : i32
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_cmp.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_cmp.mlir
index 035db33fb4b31..732bde55be91f 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_cmp.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_cmp.mlir
@@ -10,7 +10,7 @@
 // DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
 // DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
 // DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
-// DEFINE: %{run_opts} = -e entry -entry-point-result=void
+// DEFINE: %{run_opts} = -e main -entry-point-result=void
 // DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
 // DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
 //
@@ -96,7 +96,7 @@ module {
   // Main driver that constructs matrix and calls the sparse kernel to perform
   // element-wise comparison.
   //
-  func.func @entry() {
+  func.func @main() {
     %d0 = arith.constant 0 : i8
     %c0 = arith.constant 0 : index
 
@@ -124,33 +124,44 @@ module {
             : (tensor<4x4xf64, #DCSR>, tensor<4x4xf64, #DCSR>) -> tensor<4x4xi8, #DCSR>
 
     //
-    // All should have the same result.
+    // All should have the same boolean values.
+    //
+    // CHECK: ( ( 0, 1, 0, 1 ), ( 1, 0, 0, 0 ), ( 1, 0, 0, 1 ), ( 0, 0, 0, 0 ) )
+    //
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 16
+    // CHECK-NEXT: dim = ( 4, 4 )
+    // CHECK-NEXT: lvl = ( 4, 4 )
+    // CHECK-NEXT: pos[0] : ( 0, 4
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3
+    // CHECK-NEXT: pos[1] : ( 0, 4, 8, 12, 16
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+    // CHECK-NEXT: values : ( 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0
+    // CHECK-NEXT: ----
+    //
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 11
+    // CHECK-NEXT: dim = ( 4, 4 )
+    // CHECK-NEXT: lvl = ( 4, 4 )
+    // CHECK-NEXT: pos[0] : ( 0, 4
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3
+    // CHECK-NEXT: pos[1] : ( 0, 3, 5, 9, 11
+    // CHECK-NEXT: crd[1] : ( 1, 2, 3, 0, 1, 0, 1, 2, 3, 0, 1
+    // CHECK-NEXT: values : ( 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0
+    // CHECK-NEXT: ----
     //
-    // CHECK-COUNT-3: ( ( 0, 1, 0, 1 ), ( 1, 0, 0, 0 ), ( 1, 0, 0, 1 ), ( 0, 0, 0, 0 ) )
     %v = vector.transfer_read %all_dn_out[%c0, %c0], %d0
        : tensor<4x4xi8>, vector<4x4xi8>
     vector.print %v : vector<4x4xi8>
-
-    %lhs_sp_ret = sparse_tensor.convert %lhs_sp_out
-      : tensor<4x4xi8, #DCSR> to tensor<4x4xi8>
-    %v1 = vector.transfer_read %lhs_sp_ret[%c0, %c0], %d0
-      : tensor<4x4xi8>, vector<4x4xi8>
-    vector.print %v1 : vector<4x4xi8>
-
-    %rhs_sp_ret = sparse_tensor.convert %all_sp_out
-      : tensor<4x4xi8, #DCSR> to tensor<4x4xi8>
-    %v2 = vector.transfer_read %rhs_sp_ret[%c0, %c0], %d0
-      : tensor<4x4xi8>, vector<4x4xi8>
-    vector.print %v2 : vector<4x4xi8>
-
+    sparse_tensor.print %lhs_sp_out : tensor<4x4xi8, #DCSR>
+    sparse_tensor.print %all_sp_out : tensor<4x4xi8, #DCSR>
 
     bufferization.dealloc_tensor %lhs_sp : tensor<4x4xf64, #DCSR>
     bufferization.dealloc_tensor %rhs_sp : tensor<4x4xf64, #DCSR>
     bufferization.dealloc_tensor %all_dn_out : tensor<4x4xi8>
     bufferization.dealloc_tensor %lhs_sp_out : tensor<4x4xi8, #DCSR>
     bufferization.dealloc_tensor %all_sp_out : tensor<4x4xi8, #DCSR>
-    bufferization.dealloc_tensor %lhs_sp_ret : tensor<4x4xi8>
-    bufferization.dealloc_tensor %rhs_sp_ret : tensor<4x4xi8>
+
     return
   }
 }
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_codegen_dim.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_codegen_dim.mlir
index 7925759714edd..c5d002aa16391 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_codegen_dim.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_codegen_dim.mlir
@@ -10,7 +10,7 @@
 // DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
 // DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
 // DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
-// DEFINE: %{run_opts} = -e entry -entry-point-result=void
+// DEFINE: %{run_opts} = -e main -entry-point-result=void
 // DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
 // DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
 //
@@ -38,7 +38,7 @@ module {
   //
   // Main driver.
   //
-  func.func @entry() {
+  func.func @main() {
     %c0 = arith.constant 0 : index
     %c1 = arith.constant 1 : index
     %c2 = arith.constant 2 : index
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_codegen_foreach.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_codegen_foreach.mlir
index 002a79055ce55..9deb5cd05fa3b 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_codegen_foreach.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_codegen_foreach.mlir
@@ -10,7 +10,7 @@
 // DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
 // DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
 // DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
-// DEFINE: %{run_opts} = -e entry -entry-point-result=void
+// DEFINE: %{run_opts} = -e main -entry-point-result=void
 // DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
 // DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
 //
@@ -144,7 +144,7 @@ module {
   //
   // Main driver.
   //
-  func.func @entry() {
+  func.func @main() {
     //
     // Initialize a 3-dim dense tensor.
     //
@@ -166,6 +166,7 @@ module {
     %s4 = sparse_tensor.convert %src : tensor<2x2xf64> to tensor<2x2xf64, #SortedCOO>
     %s5 = sparse_tensor.convert %src : tensor<2x2xf64> to tensor<2x2xf64, #SortedCOOPerm>
     %s6 = sparse_tensor.convert %src3d : tensor<7x8x9xf64>  to tensor<7x8x9xf64, #CCCPerm>
+
     // CHECK: 0
     // CHECK-NEXT: 0
     // CHECK-NEXT: 1
@@ -173,6 +174,7 @@ module {
     // CHECK-NEXT: 6
     // CHECK-NEXT: 5
     call @foreach_print_const() : () -> ()
+
     // CHECK-NEXT: 0
     // CHECK-NEXT: 0
     // CHECK-NEXT: 1
@@ -186,6 +188,7 @@ module {
     // CHECK-NEXT: 1
     // CHECK-NEXT: 6
     call @foreach_print_dense(%src) : (tensor<2x2xf64>) -> ()
+
     // CHECK-NEXT: 0
     // CHECK-NEXT: 0
     // CHECK-NEXT: 1
@@ -199,6 +202,7 @@ module {
     // CHECK-NEXT: 1
     // CHECK-NEXT: 6
     call @foreach_print_1(%s1) : (tensor<2x2xf64, #Row>) -> ()
+
     // CHECK-NEXT: 0
     // CHECK-NEXT: 0
     // CHECK-NEXT: 1
@@ -212,6 +216,7 @@ module {
     // CHECK-NEXT: 1
     // CHECK-NEXT: 6
     call @foreach_print_2(%s2) : (tensor<2x2xf64, #CSR>) -> ()
+
     // CHECK-NEXT: 0
     // CHECK-NEXT: 0
     // CHECK-NEXT: 1
@@ -225,6 +230,7 @@ module {
     // CHECK-NEXT: 1
     // CHECK-NEXT: 6
     call @foreach_print_3(%s3) : (tensor<2x2xf64, #DCSC>) -> ()
+
     // CHECK-NEXT: 0
     // CHECK-NEXT: 0
     // CHECK-NEXT: 1
@@ -238,6 +244,7 @@ module {
     // CHECK-NEXT: 1
     // CHECK-NEXT: 6
     call @foreach_print_4(%s4) : (tensor<2x2xf64, #SortedCOO>) -> ()
+
     // CHECK-NEXT: 0
     // CHECK-NEXT: 0
     // CHECK-NEXT: 1
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_collapse_shape.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_collapse_shape.mlir
index 2b5155464f0ee..cae599fa30ae2 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_collapse_shape.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_collapse_shape.mlir
@@ -10,7 +10,7 @@
 // DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
 // DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
 // DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
-// DEFINE: %{run_opts} = -e entry -entry-point-result=void
+// DEFINE: %{run_opts} = -e main -entry-point-result=void
 // DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
 // DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
 //
@@ -115,7 +115,7 @@ module {
   //
   // Main driver.
   //
-  func.func @entry() {
+  func.func @main() {
     %c0 = arith.constant 0 : index
     %df = arith.constant -1.0 : f64
 
@@ -157,69 +157,95 @@ module {
     //
     // CHECK:      ( 1.1, 0, 1.3, 0, 2.1, 0, 2.3, 0, 3.1, 0, 3.3, 0 )
     // CHECK-NEXT: ( 1.1, 0, 1.3, 0, 2.1, 0, 2.3, 0, 3.1, 0, 3.3, 0 )
-    // CHECK-NEXT: ( 1.1, 1.3, 2.1, 2.3, 3.1, 3.3
-    // CHECK-NEXT: ( 1.1, 1.3, 2.1, 2.3, 3.1, 3.3
-    // CHECK-NEXT: ( ( 1, 0, 3, 0, 5, 0, 7, 0, 9, 0 ),
-    // CHECK-SAME:   ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ),
-    // CHECK-SAME:   ( 21, 0, 23, 0, 25, 0, 27, 0, 29, 0 ),
-    // CHECK-SAME:   ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ),
-    // CHECK-SAME:   ( 41, 0, 43, 0, 45, 0, 47, 0, 49, 0 ),
-    // CHECK-SAME:   ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ) )
-    // CHECK-NEXT: ( ( 1, 0, 3, 0, 5, 0, 7, 0, 9, 0 ),
-    // CHECK-SAME:   ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ),
-    // CHECK-SAME:   ( 21, 0, 23, 0, 25, 0, 27, 0, 29, 0 ),
-    // CHECK-SAME:   ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ),
-    // CHECK-SAME:   ( 41, 0, 43, 0, 45, 0, 47, 0, 49, 0 ),
-    // CHECK-SAME:   ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ) )
-    // CHECK-NEXT: ( 1, 3, 5, 7, 9, 21, 23, 25, 27, 29, 41, 43, 45, 47
-    // CHECK-NEXT: ( 1, 3, 5, 7, 9, 21, 23, 25, 27, 29, 41, 43, 45, 47
-    // CHECK-NEXT: ( ( 1, 0, 3, 0, 5, 0, 7, 0, 9, 0 ),
-    // CHECK-SAME:   ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ),
-    // CHECK-SAME:   ( 21, 0, 23, 0, 25, 0, 27, 0, 29, 0 ),
-    // CHECK-SAME:   ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ),
-    // CHECK-SAME:   ( 41, 0, 43, 0, 45, 0, 47, 0, 49, 0 ),
-    // CHECK-SAME:   ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ) )
-    // CHECK-NEXT: ( ( 1, 0, 3, 0, 5, 0, 7, 0, 9, 0 ),
-    // CHECK-SAME:   ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ),
-    // CHECK-SAME:   ( 21, 0, 23, 0, 25, 0, 27, 0, 29, 0 ),
-    // CHECK-SAME:   ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ),
-    // CHECK-SAME:   ( 41, 0, 43, 0, 45, 0, 47, 0, 49, 0 ),
-    // CHECK-SAME:   ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ) )
-    // CHECK-NEXT: ( 1, 3, 5, 7, 9, 21, 23, 25, 27, 29, 41, 43, 45, 47, 49
-    // CHECK-NEXT: ( 1, 3, 5, 7, 9, 21, 23, 25, 27, 29, 41, 43, 45, 47, 49
-
+    //
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 6
+    // CHECK-NEXT: dim = ( 12 )
+    // CHECK-NEXT: lvl = ( 12 )
+    // CHECK-NEXT: pos[0] : ( 0, 6
+    // CHECK-NEXT: crd[0] : ( 0, 2, 4, 6, 8, 10
+    // CHECK-NEXT: values : ( 1.1, 1.3, 2.1, 2.3, 3.1, 3.3
+    // CHECK-NEXT: ----
+    //
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 6
+    // CHECK-NEXT: dim = ( 12 )
+    // CHECK-NEXT: lvl = ( 12 )
+    // CHECK-NEXT: pos[0] : ( 0, 6
+    // CHECK-NEXT: crd[0] : ( 0, 2, 4, 6, 8, 10
+    // CHECK-NEXT: values : ( 1.1, 1.3, 2.1, 2.3, 3.1, 3.3
+    // CHECK-NEXT: ----
+    //
+    // CHECK:      ( ( 1, 0, 3, 0, 5, 0, 7, 0, 9, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ), ( 21, 0, 23, 0, 25, 0, 27, 0, 29, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ), ( 41, 0, 43, 0, 45, 0, 47, 0, 49, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ) )
+    // CHECK-NEXT: ( ( 1, 0, 3, 0, 5, 0, 7, 0, 9, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ), ( 21, 0, 23, 0, 25, 0, 27, 0, 29, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ), ( 41, 0, 43, 0, 45, 0, 47, 0, 49, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ) )
+    //
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 15
+    // CHECK-NEXT: dim = ( 6, 10 )
+    // CHECK-NEXT: lvl = ( 6, 10 )
+    // CHECK-NEXT: pos[0] : ( 0, 3
+    // CHECK-NEXT: crd[0] : ( 0, 2, 4
+    // CHECK-NEXT: pos[1] : ( 0, 5, 10, 15
+    // CHECK-NEXT: crd[1] : ( 0, 2, 4, 6, 8, 0, 2, 4, 6, 8, 0, 2, 4, 6, 8
+    // CHECK-NEXT: values : ( 1, 3, 5, 7, 9, 21, 23, 25, 27, 29, 41, 43, 45, 47, 49
+    // CHECK-NEXT: ----
+    //
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 15
+    // CHECK-NEXT: dim = ( 6, 10 )
+    // CHECK-NEXT: lvl = ( 6, 10 )
+    // CHECK-NEXT: pos[0] : ( 0, 3
+    // CHECK-NEXT: crd[0] : ( 0, 2, 4
+    // CHECK-NEXT: pos[1] : ( 0, 5, 10, 15
+    // CHECK-NEXT: crd[1] : ( 0, 2, 4, 6, 8, 0, 2, 4, 6, 8, 0, 2, 4, 6, 8
+    // CHECK-NEXT: values : ( 1, 3, 5, 7, 9, 21, 23, 25, 27, 29, 41, 43, 45, 47, 49
+    // CHECK-NEXT: ----
+    //
+    // CHECK:      ( ( 1, 0, 3, 0, 5, 0, 7, 0, 9, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ), ( 21, 0, 23, 0, 25, 0, 27, 0, 29, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ), ( 41, 0, 43, 0, 45, 0, 47, 0, 49, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ) )
+    // CHECK-NEXT: ( ( 1, 0, 3, 0, 5, 0, 7, 0, 9, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ), ( 21, 0, 23, 0, 25, 0, 27, 0, 29, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ), ( 41, 0, 43, 0, 45, 0, 47, 0, 49, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ) )
+    //
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 15
+    // CHECK-NEXT: dim = ( 6, 10 )
+    // CHECK-NEXT: lvl = ( 6, 10 )
+    // CHECK-NEXT: pos[0] : ( 0, 3
+    // CHECK-NEXT: crd[0] : ( 0, 2, 4
+    // CHECK-NEXT: pos[1] : ( 0, 5, 10, 15
+    // CHECK-NEXT: crd[1] : ( 0, 2, 4, 6, 8, 0, 2, 4, 6, 8, 0, 2, 4, 6, 8
+    // CHECK-NEXT: values : ( 1, 3, 5, 7, 9, 21, 23, 25, 27, 29, 41, 43, 45, 47, 49
+    // CHECK-NEXT: ----
+    //
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 15
+    // CHECK-NEXT: dim = ( 6, 10 )
+    // CHECK-NEXT: lvl = ( 6, 10 )
+    // CHECK-NEXT: pos[0] : ( 0, 3
+    // CHECK-NEXT: crd[0] : ( 0, 2, 4
+    // CHECK-NEXT: pos[1] : ( 0, 5, 10, 15
+    // CHECK-NEXT: crd[1] : ( 0, 2, 4, 6, 8, 0, 2, 4, 6, 8, 0, 2, 4, 6, 8
+    // CHECK-NEXT: values : ( 1, 3, 5, 7, 9, 21, 23, 25, 27, 29, 41, 43, 45, 47, 49
+    // CHECK-NEXT: ----
+    //
     %v0 = vector.transfer_read %collapse0[%c0], %df: tensor<12xf64>, vector<12xf64>
     vector.print %v0 : vector<12xf64>
     %v1 = vector.transfer_read %collapse1[%c0], %df: tensor<12xf64>, vector<12xf64>
     vector.print %v1 : vector<12xf64>
-    %b2 = sparse_tensor.values %collapse2 : tensor<12xf64, #SparseVector> to memref<?xf64>
-    %v2 = vector.transfer_read %b2[%c0], %df: memref<?xf64>, vector<12xf64>
-    vector.print %v2 : vector<12xf64>
-    %b3 = sparse_tensor.values %collapse3 : tensor<12xf64, #SparseVector> to memref<?xf64>
-    %v3 = vector.transfer_read %b3[%c0], %df: memref<?xf64>, vector<12xf64>
-    vector.print %v3 : vector<12xf64>
+    sparse_tensor.print %collapse2 : tensor<12xf64, #SparseVector>
+    sparse_tensor.print %collapse3 : tensor<12xf64, #SparseVector>
 
     %v4 = vector.transfer_read %collapse4[%c0, %c0], %df: tensor<6x10xf64>, vector<6x10xf64>
     vector.print %v4 : vector<6x10xf64>
     %v5 = vector.transfer_read %collapse5[%c0, %c0], %df: tensor<6x10xf64>, vector<6x10xf64>
     vector.print %v5 : vector<6x10xf64>
-    %b6 = sparse_tensor.values %collapse6 : tensor<6x10xf64, #SparseMatrix> to memref<?xf64>
-    %v6 = vector.transfer_read %b6[%c0], %df: memref<?xf64>, vector<60xf64>
-    vector.print %v6 : vector<60xf64>
-    %b7 = sparse_tensor.values %collapse7 : tensor<6x10xf64, #SparseMatrix> to memref<?xf64>
-    %v7 = vector.transfer_read %b7[%c0], %df: memref<?xf64>, vector<60xf64>
-    vector.print %v7 : vector<60xf64>
+    sparse_tensor.print %collapse6 : tensor<6x10xf64, #SparseMatrix>
+    sparse_tensor.print %collapse7 : tensor<6x10xf64, #SparseMatrix>
 
     %v8 = vector.transfer_read %collapse8[%c0, %c0], %df: tensor<?x?xf64>, vector<6x10xf64>
     vector.print %v8 : vector<6x10xf64>
     %v9 = vector.transfer_read %collapse9[%c0, %c0], %df: tensor<?x?xf64>, vector<6x10xf64>
     vector.print %v9 : vector<6x10xf64>
-    %b10 = sparse_tensor.values %collapse10 : tensor<?x?xf64, #SparseMatrix> to memref<?xf64>
-    %v10 = vector.transfer_read %b10[%c0], %df: memref<?xf64>, vector<60xf64>
-    vector.print %v10 : vector<60xf64>
-    %b11 = sparse_tensor.values %collapse11 : tensor<?x?xf64, #SparseMatrix> to memref<?xf64>
-    %v11 = vector.transfer_read %b11[%c0], %df: memref<?xf64>, vector<60xf64>
-    vector.print %v11 : vector<60xf64>
+    sparse_tensor.print %collapse10 : tensor<?x?xf64, #SparseMatrix>
+    sparse_tensor.print %collapse11 : tensor<?x?xf64, #SparseMatrix>
 
     // Release sparse resources.
     bufferization.dealloc_tensor %sm : tensor<3x4xf64, #SparseMatrix>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_constant_to_sparse_tensor.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_constant_to_sparse_tensor.mlir
index b5efdcc09a390..abdbf80d0bc41 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_constant_to_sparse_tensor.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_constant_to_sparse_tensor.mlir
@@ -10,7 +10,7 @@
 // DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
 // DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
 // DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
-// DEFINE: %{run_opts} = -e entry -entry-point-result=void
+// DEFINE: %{run_opts} = -e main -entry-point-result=void
 // DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
 // DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
 //
@@ -38,7 +38,7 @@
 // Integration tests for conversions from sparse constants to sparse tensors.
 //
 module {
-  func.func @entry() {
+  func.func @main() {
     %c0 = arith.constant 0 : index
     %c1 = arith.constant 1 : index
     %c2 = arith.constant 2 : index
@@ -51,20 +51,19 @@ module {
     // Convert the tensor in COO format to a sparse tensor with annotation #Tensor1.
     %ts = sparse_tensor.convert %ti : tensor<10x8xf64> to tensor<10x8xf64, #Tensor1>
 
-    // CHECK: ( 0, 1, 4, 5, 6, 9 )
-    %i0 = sparse_tensor.coordinates %ts { level = 0 : index } : tensor<10x8xf64, #Tensor1> to memref<?xindex>
-    %i0r = vector.transfer_read %i0[%c0], %c0: memref<?xindex>, vector<6xindex>
-    vector.print %i0r : vector<6xindex>
-
-    // CHECK: ( 0, 7, 2, 2, 3, 4, 6, 7 )
-    %i1 = sparse_tensor.coordinates %ts { level = 1 : index } : tensor<10x8xf64, #Tensor1> to memref<?xindex>
-    %i1r = vector.transfer_read %i1[%c0], %c0: memref<?xindex>, vector<8xindex>
-    vector.print %i1r : vector<8xindex>
-
-    // CHECK: ( 1, 2, 3, 4, 5, 6, 7, 8 )
-    %v = sparse_tensor.values %ts : tensor<10x8xf64, #Tensor1> to memref<?xf64>
-    %vr = vector.transfer_read %v[%c0], %d0: memref<?xf64>, vector<8xf64>
-    vector.print %vr : vector<8xf64>
+    //
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 8
+    // CHECK-NEXT: dim = ( 10, 8 )
+    // CHECK-NEXT: lvl = ( 10, 8 )
+    // CHECK-NEXT: pos[0] : ( 0, 6
+    // CHECK-NEXT: crd[0] : ( 0, 1, 4, 5, 6, 9
+    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 4, 5, 7, 8
+    // CHECK-NEXT: crd[1] : ( 0, 7, 2, 2, 3, 4, 6, 7
+    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8
+    // CHECK-NEXT: ----
+    //
+    sparse_tensor.print %ts : tensor<10x8xf64, #Tensor1>
 
     // Release the resources.
     bufferization.dealloc_tensor %ts : tensor<10x8xf64, #Tensor1>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_1d_nwc_wcf.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_1d_nwc_wcf.mlir
index 16a67a1458369..612e62bd34d28 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_1d_nwc_wcf.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_1d_nwc_wcf.mlir
@@ -10,7 +10,7 @@
 // DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
 // DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
 // DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
-// DEFINE: %{run_opts} = -e entry -entry-point-result=void
+// DEFINE: %{run_opts} = -e main -entry-point-result=void
 // DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
 // DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
 //
@@ -79,7 +79,7 @@ func.func @conv_1d_nwc_wcf_CDC(%arg0: tensor<?x?x?xf32, #CDC>, %arg1: tensor<?x?
   return %ret : tensor<?x?x?xf32, #CDC>
 }
 
-func.func @entry() {
+func.func @main() {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c3 = arith.constant 3 : index
@@ -111,23 +111,35 @@ func.func @entry() {
       : tensor<?x?x?xf32>, vector<3x6x1xf32>
   vector.print %dense_v : vector<3x6x1xf32>
 
-  //      CHECK: ( ( ( 12 ), ( 28 ), ( 28 ), ( 28 ), ( 12 ), ( 12 ) ),
-  // CHECK-SAME:   ( ( 12 ), ( 12 ), ( 12 ), ( 12 ), ( 12 ), ( 12 ) ),
-  // CHECK-SAME:   ( ( 12 ), ( 12 ), ( 12 ), ( 12 ), ( 12 ), ( 12 ) ) )
-  %1 = sparse_tensor.convert %CCC_ret
-    : tensor<?x?x?xf32, #CCC> to tensor<?x?x?xf32>
-  %v1 = vector.transfer_read %1[%c0, %c0, %c0], %zero
-      : tensor<?x?x?xf32>, vector<3x6x1xf32>
-  vector.print %v1 : vector<3x6x1xf32>
-
-  //      CHECK: ( ( ( 12 ), ( 28 ), ( 28 ), ( 28 ), ( 12 ), ( 12 ) ),
-  // CHECK-SAME:   ( ( 12 ), ( 12 ), ( 12 ), ( 12 ), ( 12 ), ( 12 ) ),
-  // CHECK-SAME:   ( ( 12 ), ( 12 ), ( 12 ), ( 12 ), ( 12 ), ( 12 ) ) )
-  %2 = sparse_tensor.convert %CDC_ret
-    : tensor<?x?x?xf32, #CDC> to tensor<?x?x?xf32>
-  %v2 = vector.transfer_read %2[%c0, %c0, %c0], %zero
-      : tensor<?x?x?xf32>, vector<3x6x1xf32>
-  vector.print %v2 : vector<3x6x1xf32>
+  //
+  // CHECK:      ---- Sparse Tensor ----
+  // CHECK-NEXT: nse = 18
+  // CHECK-NEXT: dim = ( 3, 6, 1 )
+  // CHECK-NEXT: lvl = ( 3, 6, 1 )
+  // CHECK-NEXT: pos[0] : ( 0, 3
+  // CHECK-NEXT: crd[0] : ( 0, 1, 2
+  // CHECK-NEXT: pos[1] : ( 0, 6, 12, 18
+  // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
+  // CHECK-NEXT: pos[2] : ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18
+  // CHECK-NEXT: crd[2] : ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+  // CHECK-NEXT: values : ( 12, 28, 28, 28, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12
+  // CHECK-NEXT: ----
+  //
+  sparse_tensor.print %CCC_ret : tensor<?x?x?xf32, #CCC>
+
+  //
+  // CHECK:      ---- Sparse Tensor ----
+  // CHECK-NEXT: nse = 18
+  // CHECK-NEXT: dim = ( 3, 6, 1 )
+  // CHECK-NEXT: lvl = ( 3, 6, 1 )
+  // CHECK-NEXT: pos[0] : ( 0, 3
+  // CHECK-NEXT: crd[0] : ( 0, 1, 2
+  // CHECK-NEXT: pos[2] : ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18
+  // CHECK-NEXT: crd[2] : ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+  // CHECK-NEXT: values : ( 12, 28, 28, 28, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12
+  // CHECK-NEXT: ----
+  //
+  sparse_tensor.print %CDC_ret : tensor<?x?x?xf32, #CDC>
 
   // Free the resources
   bufferization.dealloc_tensor %in1D_nwc : tensor<?x?x?xf32>
@@ -140,8 +152,5 @@ func.func @entry() {
   bufferization.dealloc_tensor %CCC_ret : tensor<?x?x?xf32, #CCC>
   bufferization.dealloc_tensor %CDC_ret : tensor<?x?x?xf32, #CDC>
 
-  bufferization.dealloc_tensor %1 : tensor<?x?x?xf32>
-  bufferization.dealloc_tensor %2 : tensor<?x?x?xf32>
-
   return
 }
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d.mlir
index 41071ea700fb6..f8fb8fdf53e35 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d.mlir
@@ -10,7 +10,7 @@
 // DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
 // DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
 // DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
-// DEFINE: %{run_opts} = -e entry -entry-point-result=void
+// DEFINE: %{run_opts} = -e main -entry-point-result=void
 // DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
 // DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
 //
@@ -113,7 +113,7 @@ module {
     return %0 : tensor<6x6xi32, #CSC>
   }
 
-  func.func @entry() {
+  func.func @main() {
     %c0 = arith.constant 0 : index
     %i0 = arith.constant 0 : i32
 
@@ -181,82 +181,81 @@ module {
     vector.print %v : vector<6x6xi32>
 
     //
-    // Should be the same as dense output
-    // CHECK:    ( ( 0, 0, -1, -6, -1, 6 ),
-    // CHECK-SAME: ( -1, 0, 1, 0, 1, 0 ),
-    // CHECK-SAME: ( 0, -1, 1, 0, 0, 0 ),
-    // CHECK-SAME: ( -1, 0, 0, 0, 0, 0 ),
-    // CHECK-SAME: ( 0, 0, 3, 6, -3, -6 ),
-    // CHECK-SAME: ( 2, -1, 3, 0, -3, 0 ) )
+    // Should be the same as dense output.
     //
-    %sparse_ret = sparse_tensor.convert %1
-      : tensor<6x6xi32, #DCSR> to tensor<6x6xi32>
-    %v1 = vector.transfer_read %sparse_ret[%c0, %c0], %i0
-      : tensor<6x6xi32>, vector<6x6xi32>
-    vector.print %v1 : vector<6x6xi32>
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 36
+    // CHECK-NEXT: dim = ( 6, 6 )
+    // CHECK-NEXT: lvl = ( 6, 6 )
+    // CHECK-NEXT: pos[0] : ( 0, 6
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5
+    // CHECK-NEXT: pos[1] : ( 0, 6, 12, 18, 24, 30, 36
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
+    // CHECK-NEXT: values : ( 0, 0, -1, -6, -1, 6, -1, 0, 1, 0, 1, 0, 0, -1, 1, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 3, 6, -3, -6, 2, -1, 3, 0, -3, 0
+    // CHECK-NEXT: ----
+    //
+    sparse_tensor.print %1 : tensor<6x6xi32, #DCSR>
 
     //
-    // Should be the same as dense output
-    // CHECK:    ( ( 0, 0, -1, -6, -1, 6 ),
-    // CHECK-SAME: ( -1, 0, 1, 0, 1, 0 ),
-    // CHECK-SAME: ( 0, -1, 1, 0, 0, 0 ),
-    // CHECK-SAME: ( -1, 0, 0, 0, 0, 0 ),
-    // CHECK-SAME: ( 0, 0, 3, 6, -3, -6 ),
-    // CHECK-SAME: ( 2, -1, 3, 0, -3, 0 ) )
+    // Should be the same as dense output.
     //
-    %all_sparse_DCSR = sparse_tensor.convert %2
-      : tensor<6x6xi32, #DCSR> to tensor<6x6xi32>
-    %v2 = vector.transfer_read %all_sparse_DCSR[%c0, %c0], %i0
-      : tensor<6x6xi32>, vector<6x6xi32>
-    vector.print %v2 : vector<6x6xi32>
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 36
+    // CHECK-NEXT: dim = ( 6, 6 )
+    // CHECK-NEXT: lvl = ( 6, 6 )
+    // CHECK-NEXT: pos[0] : ( 0, 6
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5
+    // CHECK-NEXT: pos[1] : ( 0, 6, 12, 18, 24, 30, 36
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
+    // CHECK-NEXT: values : ( 0, 0, -1, -6, -1, 6, -1, 0, 1, 0, 1, 0, 0, -1, 1, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 3, 6, -3, -6, 2, -1, 3, 0, -3, 0
+    // CHECK-NEXT: ----
+    //
+    sparse_tensor.print %2 : tensor<6x6xi32, #DCSR>
 
     //
-    // Should be the same as dense output
-    // CHECK:    ( ( 0, 0, -1, -6, -1, 6 ),
-    // CHECK-SAME: ( -1, 0, 1, 0, 1, 0 ),
-    // CHECK-SAME: ( 0, -1, 1, 0, 0, 0 ),
-    // CHECK-SAME: ( -1, 0, 0, 0, 0, 0 ),
-    // CHECK-SAME: ( 0, 0, 3, 6, -3, -6 ),
-    // CHECK-SAME: ( 2, -1, 3, 0, -3, 0 ) )
+    // Should be the same as dense output.
     //
-    %all_sparse_CD = sparse_tensor.convert %4
-      : tensor<6x6xi32, #CDR> to tensor<6x6xi32>
-    %v4 = vector.transfer_read %all_sparse_CD[%c0, %c0], %i0
-      : tensor<6x6xi32>, vector<6x6xi32>
-    vector.print %v4 : vector<6x6xi32>
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 36
+    // CHECK-NEXT: dim = ( 6, 6 )
+    // CHECK-NEXT: lvl = ( 6, 6 )
+    // CHECK-NEXT: pos[1] : ( 0, 6, 12, 18, 24, 30, 36
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
+    // CHECK-NEXT: values : ( 0, 0, -1, -6, -1, 6, -1, 0, 1, 0, 1, 0, 0, -1, 1, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 3, 6, -3, -6, 2, -1, 3, 0, -3, 0
+    // CHECK-NEXT: ----
+    //
+    sparse_tensor.print %3 : tensor<6x6xi32, #CSR>
 
     //
-    // Should be the same as dense output
-    // CHECK:    ( ( 0, 0, -1, -6, -1, 6 ),
-    // CHECK-SAME: ( -1, 0, 1, 0, 1, 0 ),
-    // CHECK-SAME: ( 0, -1, 1, 0, 0, 0 ),
-    // CHECK-SAME: ( -1, 0, 0, 0, 0, 0 ),
-    // CHECK-SAME: ( 0, 0, 3, 6, -3, -6 ),
-    // CHECK-SAME: ( 2, -1, 3, 0, -3, 0 ) )
+    // Should be the same as dense output.
     //
-    %all_sparse_CSR = sparse_tensor.convert %3
-      : tensor<6x6xi32, #CSR> to tensor<6x6xi32>
-    %v3 = vector.transfer_read %all_sparse_CSR[%c0, %c0], %i0
-      : tensor<6x6xi32>, vector<6x6xi32>
-    vector.print %v3 : vector<6x6xi32>
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 36
+    // CHECK-NEXT: dim = ( 6, 6 )
+    // CHECK-NEXT: lvl = ( 6, 6 )
+    // CHECK-NEXT: pos[0] : ( 0, 6
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5
+    // CHECK-NEXT: values : ( 0, 0, -1, -6, -1, 6, -1, 0, 1, 0, 1, 0, 0, -1, 1, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 3, 6, -3, -6, 2, -1, 3, 0, -3, 0
+    // CHECK-NEXT: ----
+    //
+    sparse_tensor.print %4 : tensor<6x6xi32, #CDR>
 
     //
-    // Should be the same as dense output
-    // CHECK:    ( ( 0, 0, -1, -6, -1, 6 ),
-    // CHECK-SAME: ( -1, 0, 1, 0, 1, 0 ),
-    // CHECK-SAME: ( 0, -1, 1, 0, 0, 0 ),
-    // CHECK-SAME: ( -1, 0, 0, 0, 0, 0 ),
-    // CHECK-SAME: ( 0, 0, 3, 6, -3, -6 ),
-    // CHECK-SAME: ( 2, -1, 3, 0, -3, 0 ) )
+    // Should be the same as dense output, with values listed in column-major order (CSC).
     //
-    %all_sparse_CSC = sparse_tensor.convert %5
-      : tensor<6x6xi32, #CSC> to tensor<6x6xi32>
-    %v5 = vector.transfer_read %all_sparse_CSC[%c0, %c0], %i0
-      : tensor<6x6xi32>, vector<6x6xi32>
-    vector.print %v5 : vector<6x6xi32>
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 36
+    // CHECK-NEXT: dim = ( 6, 6 )
+    // CHECK-NEXT: lvl = ( 6, 6 )
+    // CHECK-NEXT: pos[1] : ( 0, 6, 12, 18, 24, 30, 36
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
+    // CHECK-NEXT: values : ( 0, -1, 0, -1, 0, 2, 0, 0, -1, 0, 0, -1, -1, 1, 1, 0, 3, 3, -6, 0, 0, 0, 6, 0, -1, 1, 0, 0, -3, -3, 6, 0, 0, 0, -6, 0
+    // CHECK-NEXT: ----
+    //
+    sparse_tensor.print %5 : tensor<6x6xi32, #CSC>
 
     //
-    // Should be the same as dense output
+    // Should be the same as dense output.
     // CHECK:    ( ( 0, 0, -1, -6, -1, 6 ),
     // CHECK-SAME: ( -1, 0, 1, 0, 1, 0 ),
     // CHECK-SAME: ( 0, -1, 1, 0, 0, 0 ),
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_55.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_55.mlir
index a7d7d1c5ed3c3..00805d198013d 100755
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_55.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_55.mlir
@@ -10,7 +10,7 @@
 // DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
 // DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
 // DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
-// DEFINE: %{run_opts} = -e entry -entry-point-result=void
+// DEFINE: %{run_opts} = -e main -entry-point-result=void
 // DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
 // DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
 //
@@ -68,7 +68,7 @@ module {
     return %0 : tensor<6x6xi32>
   }
 
-  func.func @entry() {
+  func.func @main() {
     %c0 = arith.constant 0 : index
     %i0 = arith.constant 0 : i32
 
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nchw_fchw.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nchw_fchw.mlir
index 95ce4f1bf48d5..9150e97e72481 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nchw_fchw.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nchw_fchw.mlir
@@ -10,7 +10,7 @@
 // DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
 // DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
 // DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
-// DEFINE: %{run_opts} = -e entry -entry-point-result=void
+// DEFINE: %{run_opts} = -e main -entry-point-result=void
 // DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
 // DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
 //
@@ -82,7 +82,7 @@ func.func @conv_2d_nchw_fchw_CCCC_CCCC(%arg0: tensor<?x?x?x?xf32, #CCCC>, %arg1:
   return %ret : tensor<?x?x?x?xf32>
 }
 
-func.func @entry() {
+func.func @main() {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c3 = arith.constant 3 : index
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nhwc_hwcf.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nhwc_hwcf.mlir
index d0fbce7146fe5..d04311e59bafa 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nhwc_hwcf.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nhwc_hwcf.mlir
@@ -10,7 +10,7 @@
 // DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
 // DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
 // DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
-// DEFINE: %{run_opts} = -e entry -entry-point-result=void
+// DEFINE: %{run_opts} = -e main -entry-point-result=void
 // DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
 // DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
 //
@@ -93,7 +93,7 @@ func.func @conv_2d_nhwc_hwcf_DCCD(%arg0: tensor<?x?x?x?xf32, #DCCD>, %arg1: tens
   return %ret : tensor<?x?x?x?xf32, #DCCD>
 }
 
-func.func @entry() {
+func.func @main() {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c3 = arith.constant 3 : index
@@ -142,77 +142,93 @@ func.func @entry() {
       : tensor<?x?x?x?xf32>, vector<3x6x6x1xf32>
   vector.print %dense_v : vector<3x6x6x1xf32>
 
-  // CHECK:     ( ( ( ( 108 ), ( 124 ), ( 124 ), ( 124 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ),
-  // CHECK-SAME:  ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ),
-  // CHECK-SAME:  ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ) )
-  %1 = sparse_tensor.convert %CCCC_ret
-    : tensor<?x?x?x?xf32, #CCCC> to tensor<?x?x?x?xf32>
-  %v1 = vector.transfer_read %1[%c0, %c0, %c0, %c0], %zero
-      : tensor<?x?x?x?xf32>, vector<3x6x6x1xf32>
-  vector.print %v1 : vector<3x6x6x1xf32>
-
-  // CHECK:     ( ( ( ( 108 ), ( 124 ), ( 124 ), ( 124 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ),
-  // CHECK-SAME:  ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ),
-  // CHECK-SAME:  ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ) )
-  %2 = sparse_tensor.convert %CDCD_ret
-    : tensor<?x?x?x?xf32, #CDCD> to tensor<?x?x?x?xf32>
-  %v2 = vector.transfer_read %2[%c0, %c0, %c0, %c0], %zero
-      : tensor<?x?x?x?xf32>, vector<3x6x6x1xf32>
-  vector.print %v2 : vector<3x6x6x1xf32>
-
-  // CHECK:     ( ( ( ( 108 ), ( 124 ), ( 124 ), ( 124 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ),
-  // CHECK-SAME:  ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ),
-  // CHECK-SAME:  ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ) )
-  %3 = sparse_tensor.convert %DCCD_ret
-    : tensor<?x?x?x?xf32, #DCCD> to tensor<?x?x?x?xf32>
-  %v3 = vector.transfer_read %3[%c0, %c0, %c0, %c0], %zero
-      : tensor<?x?x?x?xf32>, vector<3x6x6x1xf32>
-  vector.print %v3 : vector<3x6x6x1xf32>
+  //
+  // CHECK:      ---- Sparse Tensor ----
+  // CHECK-NEXT: nse = 108
+  // CHECK-NEXT: dim = ( 3, 6, 6, 1 )
+  // CHECK-NEXT: lvl = ( 3, 6, 6, 1 )
+  // CHECK-NEXT: pos[0] : ( 0, 3
+  // CHECK-NEXT: crd[0] : ( 0, 1, 2
+  // CHECK-NEXT: pos[1] : ( 0, 6, 12, 18
+  // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
+  // CHECK-NEXT: pos[2] : ( 0, 6, 12, 18, 24, 30, 36, 42, 48, 54, 60, 66, 72, 78, 84, 90, 96, 102, 108
+  // CHECK-NEXT: crd[2] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0,
+  // CHECK-SAME:            1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
+  // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2,
+  // CHECK-SAME:            3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
+  // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5
+  // CHECK-NEXT: pos[3] : ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
+  // CHECK-SAME:            21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
+  // CHECK-SAME:            40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58,
+  // CHECK-SAME:            59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
+  // CHECK-SAME:            78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96,
+  // CHECK-SAME:            97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108
+  // CHECK-NEXT: crd[3] : ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0
+  // CHECK-NEXT: values : ( 108, 124, 124, 124, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108
+  // CHECK-NEXT: ----
+  //
+  sparse_tensor.print %CCCC_ret : tensor<?x?x?x?xf32, #CCCC>
+
+  //
+  // CHECK:      ---- Sparse Tensor ----
+  // CHECK-NEXT: nse = 108
+  // CHECK-NEXT: dim = ( 3, 6, 6, 1 )
+  // CHECK-NEXT: lvl = ( 3, 6, 6, 1 )
+  // CHECK-NEXT: pos[0] : ( 0, 3
+  // CHECK-NEXT: crd[0] : ( 0, 1, 2
+  // CHECK-NEXT: pos[2] : ( 0, 6, 12, 18, 24, 30, 36, 42, 48, 54, 60, 66, 72, 78, 84, 90, 96, 102, 108
+  // CHECK-NEXT: crd[2] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0,
+  // CHECK-SAME:            1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
+  // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2,
+  // CHECK-SAME:            3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
+  // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5
+  // CHECK-NEXT: values : ( 108, 124, 124, 124, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108
+  // CHECK-NEXT: ----
+  //
+  sparse_tensor.print %CDCD_ret : tensor<?x?x?x?xf32, #CDCD>
+
+  //
+  // CHECK:      ---- Sparse Tensor ----
+  // CHECK-NEXT: nse = 108
+  // CHECK-NEXT: dim = ( 3, 6, 6, 1 )
+  // CHECK-NEXT: lvl = ( 3, 6, 6, 1 )
+  // CHECK-NEXT: pos[1] : ( 0, 6, 12, 18
+  // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
+  // CHECK-NEXT: pos[2] : ( 0, 6, 12, 18, 24, 30, 36, 42, 48, 54, 60, 66, 72, 78, 84, 90, 96, 102, 108
+  // CHECK-NEXT: crd[2] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0,
+  // CHECK-SAME:            1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
+  // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2,
+  // CHECK-SAME:            3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
+  // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5
+  // CHECK-NEXT: values : ( 108, 124, 124, 124, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108
+  // CHECK-NEXT: ----
+  //
+  sparse_tensor.print %DCCD_ret : tensor<?x?x?x?xf32, #DCCD>
 
   // Free the resources
   bufferization.dealloc_tensor %in2D_nhwc : tensor<?x?x?x?xf32>
@@ -227,9 +243,5 @@ func.func @entry() {
   bufferization.dealloc_tensor %CDCD_ret : tensor<?x?x?x?xf32, #CDCD>
   bufferization.dealloc_tensor %DCCD_ret : tensor<?x?x?x?xf32, #DCCD>
 
-  bufferization.dealloc_tensor %1 : tensor<?x?x?x?xf32>
-  bufferization.dealloc_tensor %2 : tensor<?x?x?x?xf32>
-  bufferization.dealloc_tensor %3 : tensor<?x?x?x?xf32>
-
   return
 }
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d.mlir
index f0a26dc46b056..5e2d1707a2495 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d.mlir
@@ -10,7 +10,7 @@
 // DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
 // DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
 // DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
-// DEFINE: %{run_opts} = -e entry -entry-point-result=void
+// DEFINE: %{run_opts} = -e main -entry-point-result=void
 // DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
 // DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
 //
@@ -96,7 +96,7 @@ func.func @conv_3d_DDC(%arg0: tensor<?x?x?xf32, #DDC>, %arg1: tensor<?x?x?xf32>)
   return %ret : tensor<?x?x?xf32, #DDC>
 }
 
-func.func @entry() {
+func.func @main() {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c3 = arith.constant 3 : index
@@ -166,173 +166,180 @@ func.func @entry() {
       : tensor<?x?x?xf32>, vector<6x6x6xf32>
   vector.print %dense_v : vector<6x6x6xf32>
 
-  // CHECK-NEXT:( ( ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 124, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 124, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 124, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ),
-  // CHECK-SAME:  ( ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ),
-  // CHECK-SAME:  ( ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ),
-  // CHECK-SAME:  ( ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ),
-  // CHECK-SAME:  ( ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ),
-  // CHECK-SAME:  ( ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ) )
-  %1 = sparse_tensor.convert %CCC_ret
-    : tensor<?x?x?xf32, #CCC> to tensor<?x?x?xf32>
-  %v1 = vector.transfer_read %1[%c0, %c0, %c0], %zero
-      : tensor<?x?x?xf32>, vector<6x6x6xf32>
-  vector.print %v1 : vector<6x6x6xf32>
+  //
+  // CHECK:      ---- Sparse Tensor ----
+  // CHECK-NEXT: nse = 216
+  // CHECK-NEXT: dim = ( 6, 6, 6 )
+  // CHECK-NEXT: lvl = ( 6, 6, 6 )
+  // CHECK-NEXT: pos[0] : ( 0, 6
+  // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5
+  // CHECK-NEXT: pos[1] : ( 0, 6, 12, 18, 24, 30, 36
+  // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
+  // CHECK-SAME:            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
+  // CHECK-NEXT: pos[2] : ( 0, 6, 12, 18, 24, 30, 36, 42, 48, 54, 60, 66, 72, 78,
+  // CHECK-SAME:            84, 90, 96, 102, 108, 114, 120, 126, 132, 138, 144, 150,
+  // CHECK-SAME:            156, 162, 168, 174, 180, 186, 192, 198, 204, 210, 216
+  // CHECK-NEXT: crd[2] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0,
+  // CHECK-SAME:            1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
+  // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2,
+  // CHECK-SAME:            3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
+  // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4,
+  // CHECK-SAME:            5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
+  // CHECK-SAME:            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0,
+  // CHECK-SAME:            1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
+  // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2,
+  // CHECK-SAME:            3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
+  // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4,
+  // CHECK-SAME:            5, 0, 1, 2, 3, 4, 5
+  // CHECK-NEXT: values : ( 108, 108, 108, 108, 108, 108, 124, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            124, 108, 108, 108, 108, 108, 124, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108
+  // CHECK-NEXT: ----
+  //
+  sparse_tensor.print %CCC_ret : tensor<?x?x?xf32, #CCC>
 
-  // CHECK-NEXT:( ( ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 124, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 124, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 124, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ),
-  // CHECK-SAME:  ( ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ),
-  // CHECK-SAME:  ( ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ),
-  // CHECK-SAME:  ( ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ),
-  // CHECK-SAME:  ( ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ),
-  // CHECK-SAME:  ( ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ) )
-  %2 = sparse_tensor.convert %CCC_ret
-    : tensor<?x?x?xf32, #CCC> to tensor<?x?x?xf32>
-  %v2 = vector.transfer_read %2[%c0, %c0, %c0], %zero
-      : tensor<?x?x?xf32>, vector<6x6x6xf32>
-  vector.print %v2 : vector<6x6x6xf32>
+  //
+  // CHECK:      ---- Sparse Tensor ----
+  // CHECK-NEXT: nse = 216
+  // CHECK-NEXT: dim = ( 6, 6, 6 )
+  // CHECK-NEXT: lvl = ( 6, 6, 6 )
+  // CHECK-NEXT: pos[0] : ( 0, 6
+  // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5
+  // CHECK-NEXT: pos[2] : ( 0, 6, 12, 18, 24, 30, 36, 42, 48, 54, 60, 66, 72, 78, 84,
+  // CHECK-SAME:            90, 96, 102, 108, 114, 120, 126, 132, 138, 144, 150, 156,
+  // CHECK-SAME:            162, 168, 174, 180, 186, 192, 198, 204, 210, 216
+  // CHECK-NEXT: crd[2] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
+  // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
+  // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
+  // CHECK-SAME:            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
+  // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
+  // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
+  // CHECK-SAME:            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
+  // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
+  // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
+  // CHECK-SAME:            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
+  // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
+  // CHECK-NEXT: values : ( 108, 108, 108, 108, 108, 108, 124, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            124, 108, 108, 108, 108, 108, 124, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108
+  // CHECK-NEXT: ----
+  //
+  sparse_tensor.print %CDC_ret : tensor<?x?x?xf32, #CDC>
 
-  // CHECK-NEXT:( ( ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 124, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 124, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 124, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ),
-  // CHECK-SAME:  ( ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ),
-  // CHECK-SAME:  ( ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ),
-  // CHECK-SAME:  ( ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ),
-  // CHECK-SAME:  ( ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ),
-  // CHECK-SAME:  ( ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ) )
-  %3 = sparse_tensor.convert %DDC_ret
-    : tensor<?x?x?xf32, #DDC> to tensor<?x?x?xf32>
-  %v3 = vector.transfer_read %3[%c0, %c0, %c0], %zero
-      : tensor<?x?x?xf32>, vector<6x6x6xf32>
-  vector.print %v2 : vector<6x6x6xf32>
+  //
+  // CHECK:      ---- Sparse Tensor ----
+  // CHECK-NEXT: nse = 216
+  // CHECK-NEXT: dim = ( 6, 6, 6 )
+  // CHECK-NEXT: lvl = ( 6, 6, 6 )
+  // CHECK-NEXT: pos[2] : ( 0, 6, 12, 18, 24, 30, 36, 42, 48, 54, 60, 66, 72, 78, 84, 90,
+  // CHECK-SAME:            96, 102, 108, 114, 120, 126, 132, 138, 144, 150, 156, 162,
+  // CHECK-SAME:            168, 174, 180, 186, 192, 198, 204, 210, 216
+  // CHECK-NEXT: crd[2] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
+  // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
+  // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
+  // CHECK-SAME:            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
+  // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
+  // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
+  // CHECK-SAME:            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
+  // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
+  // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
+  // CHECK-SAME:            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
+  // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
+  // CHECK-NEXT: values : ( 108, 108, 108, 108, 108, 108, 124, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            124, 108, 108, 108, 108, 108, 124, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108
+  // CHECK-NEXT: ----
+  //
+  sparse_tensor.print %DDC_ret : tensor<?x?x?xf32, #DDC>
 
-  // CHECK-NEXT:( ( ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 124, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 124, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 124, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ),
-  // CHECK-SAME:  ( ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ),
-  // CHECK-SAME:  ( ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ),
-  // CHECK-SAME:  ( ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ),
-  // CHECK-SAME:  ( ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ),
-  // CHECK-SAME:  ( ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ) )
-  %4 = sparse_tensor.convert %DCC_ret
-    : tensor<?x?x?xf32, #DCC> to tensor<?x?x?xf32>
-  %v4 = vector.transfer_read %3[%c0, %c0, %c0], %zero
-      : tensor<?x?x?xf32>, vector<6x6x6xf32>
-  vector.print %v2 : vector<6x6x6xf32>
+  //
+  // CHECK:      ---- Sparse Tensor ----
+  // CHECK-NEXT: nse = 216
+  // CHECK-NEXT: dim = ( 6, 6, 6 )
+  // CHECK-NEXT: lvl = ( 6, 6, 6 )
+  // CHECK-NEXT: pos[1] : ( 0, 6, 12, 18, 24, 30, 36
+  // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
+  // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
+  // CHECK-NEXT: pos[2] : ( 0, 6, 12, 18, 24, 30, 36, 42, 48, 54, 60, 66, 72, 78, 84, 90,
+  // CHECK-SAME:            96, 102, 108, 114, 120, 126, 132, 138, 144, 150, 156, 162,
+  // CHECK-SAME:            168, 174, 180, 186, 192, 198, 204, 210, 216
+  // CHECK-NEXT: crd[2] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
+  // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
+  // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
+  // CHECK-SAME:            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
+  // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
+  // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
+  // CHECK-SAME:            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
+  // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
+  // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
+  // CHECK-SAME:            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
+  // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
+  // CHECK-NEXT: values : ( 108, 108, 108, 108, 108, 108, 124, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            124, 108, 108, 108, 108, 108, 124, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108
+  // CHECK-NEXT: ----
+  //
+  sparse_tensor.print %DCC_ret : tensor<?x?x?xf32, #DCC>
 
   // Free the resources
   bufferization.dealloc_tensor %in3D : tensor<?x?x?xf32>
@@ -349,10 +356,5 @@ func.func @entry() {
   bufferization.dealloc_tensor %DDC_ret : tensor<?x?x?xf32, #DDC>
   bufferization.dealloc_tensor %DCC_ret : tensor<?x?x?xf32, #DCC>
 
-  bufferization.dealloc_tensor %1 : tensor<?x?x?xf32>
-  bufferization.dealloc_tensor %2 : tensor<?x?x?xf32>
-  bufferization.dealloc_tensor %3 : tensor<?x?x?xf32>
-  bufferization.dealloc_tensor %4 : tensor<?x?x?xf32>
-
   return
 }
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d_ndhwc_dhwcf.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d_ndhwc_dhwcf.mlir
index 346a143692897..f68e429a3c821 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d_ndhwc_dhwcf.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d_ndhwc_dhwcf.mlir
@@ -10,7 +10,7 @@
 // DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
 // DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
 // DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
-// DEFINE: %{run_opts} = -e entry -entry-point-result=void
+// DEFINE: %{run_opts} = -e main -entry-point-result=void
 // DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
 // DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
 //
@@ -83,7 +83,7 @@ func.func @conv_3d_ndhwc_dhwcf_CDCDC(%arg0: tensor<?x?x?x?x?xf32, #CDCDC>,
   return %ret : tensor<?x?x?x?x?xf32, #CDCDC>
 }
 
-func.func @entry() {
+func.func @main() {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c3 = arith.constant 3 : index
@@ -150,93 +150,134 @@ func.func @entry() {
       : (tensor<?x?x?x?x?xf32, #CCCCC>,
          tensor<?x?x?x?x?xf32>) -> (tensor<?x?x?x?x?xf32, #CCCCC>)
 
-  // CHECK-NEXT:( ( ( ( ( 108 ), ( 124 ), ( 124 ), ( 124 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ),
-  // CHECK-SAME:    ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ),
-  // CHECK-SAME:    ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ),
-  // CHECK-SAME:    ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ),
-  // CHECK-SAME:    ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ),
-  // CHECK-SAME:    ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ) ) )
-  %1 = sparse_tensor.convert %CCCCC_ret
-    : tensor<?x?x?x?x?xf32, #CCCCC> to tensor<?x?x?x?x?xf32>
-  %v1 = vector.transfer_read %1[%c0, %c0, %c0, %c0, %c0], %zero
-      : tensor<?x?x?x?x?xf32>, vector<1x6x6x6x1xf32>
-  vector.print %v1 : vector<1x6x6x6x1xf32>
+  //
+  // CHECK:      ---- Sparse Tensor ----
+  // CHECK-NEXT: nse = 216
+  // CHECK-NEXT: dim = ( 1, 6, 6, 6, 1 )
+  // CHECK-NEXT: lvl = ( 1, 6, 6, 6, 1 )
+  // CHECK-NEXT: pos[0] : ( 0, 1
+  // CHECK-NEXT: crd[0] : ( 0
+  // CHECK-NEXT: pos[1] : ( 0, 6
+  // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5
+  // CHECK-NEXT: pos[2] : ( 0, 6, 12, 18, 24, 30, 36
+  // CHECK-NEXT: crd[2] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
+  // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
+  // CHECK-NEXT: pos[3] : ( 0, 6, 12, 18, 24, 30, 36, 42, 48, 54, 60, 66, 72, 78, 84, 90, 96,
+  // CHECK-SAME:            102, 108, 114, 120, 126, 132, 138, 144, 150, 156, 162, 168, 174,
+  // CHECK-SAME:            180, 186, 192, 198, 204, 210, 216
+  // CHECK-NEXT: crd[3] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
+  // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
+  // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
+  // CHECK-SAME:            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
+  // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
+  // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
+  // CHECK-SAME:            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
+  // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
+  // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
+  // CHECK-SAME:            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
+  // CHECK-NEXT: pos[4] : ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
+  // CHECK-SAME:            19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
+  // CHECK-SAME:            36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,
+  // CHECK-SAME:            53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
+  // CHECK-SAME:            70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86,
+  // CHECK-SAME:            87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102,
+  // CHECK-SAME:            103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
+  // CHECK-SAME:            117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
+  // CHECK-SAME:            131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144,
+  // CHECK-SAME:            145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158,
+  // CHECK-SAME:            159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172,
+  // CHECK-SAME:            173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186,
+  // CHECK-SAME:            187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200,
+  // CHECK-SAME:            201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214,
+  // CHECK-SAME:            215, 216
+  // CHECK-NEXT: crd[4] : ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0
+  // CHECK-NEXT: values : ( 108, 124, 124, 124, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108
+  // CHECK-NEXT: ----
+  //
+  sparse_tensor.print %CCCCC_ret : tensor<?x?x?x?x?xf32, #CCCCC>
 
   %CDCDC_ret = call @conv_3d_ndhwc_dhwcf_CDCDC(%in3D_ndhwc_CDCDC, %filter3D_ndhwc)
       : (tensor<?x?x?x?x?xf32, #CDCDC>,
          tensor<?x?x?x?x?xf32>) -> (tensor<?x?x?x?x?xf32, #CDCDC>)
 
-  // CHECK-NEXT:( ( ( ( ( 108 ), ( 124 ), ( 124 ), ( 124 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ),
-  // CHECK-SAME:    ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ),
-  // CHECK-SAME:    ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ),
-  // CHECK-SAME:    ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ),
-  // CHECK-SAME:    ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ),
-  // CHECK-SAME:    ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ) ) )
-  %2 = sparse_tensor.convert %CDCDC_ret
-    : tensor<?x?x?x?x?xf32, #CDCDC> to tensor<?x?x?x?x?xf32>
-  %v2 = vector.transfer_read %dense_ret[%c0, %c0, %c0, %c0, %c0], %zero
-      : tensor<?x?x?x?x?xf32>, vector<1x6x6x6x1xf32>
-  vector.print %v2 : vector<1x6x6x6x1xf32>
+  //
+  // CHECK:      ---- Sparse Tensor ----
+  // CHECK-NEXT: nse = 216
+  // CHECK-NEXT: dim = ( 1, 6, 6, 6, 1 )
+  // CHECK-NEXT: lvl = ( 1, 6, 6, 6, 1 )
+  // CHECK-NEXT: pos[0] : ( 0, 1
+  // CHECK-NEXT: crd[0] : ( 0
+  // CHECK-NEXT: pos[2] : ( 0, 6, 12, 18, 24, 30, 36
+  // CHECK-NEXT: crd[2] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
+  // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
+  // CHECK-NEXT: pos[4] : ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
+  // CHECK-SAME:            19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
+  // CHECK-SAME:            36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,
+  // CHECK-SAME:            53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
+  // CHECK-SAME:            70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86,
+  // CHECK-SAME:            87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102,
+  // CHECK-SAME:            103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
+  // CHECK-SAME:            117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
+  // CHECK-SAME:            131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144,
+  // CHECK-SAME:            145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158,
+  // CHECK-SAME:            159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172,
+  // CHECK-SAME:            173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186,
+  // CHECK-SAME:            187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200,
+  // CHECK-SAME:            201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214,
+  // CHECK-SAME:            215, 216
+  // CHECK-NEXT: crd[4] : ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0
+  // CHECK-NEXT: values : ( 108, 124, 124, 124, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108
+  // CHECK-NEXT: ----
+  //
+  sparse_tensor.print %CDCDC_ret : tensor<?x?x?x?x?xf32, #CDCDC>
 
   // Free the resources
   bufferization.dealloc_tensor %in3D_ndhwc : tensor<?x?x?x?x?xf32>
@@ -249,8 +290,5 @@ func.func @entry() {
   bufferization.dealloc_tensor %CCCCC_ret : tensor<?x?x?x?x?xf32, #CCCCC>
   bufferization.dealloc_tensor %CDCDC_ret : tensor<?x?x?x?x?xf32, #CDCDC>
 
-  bufferization.dealloc_tensor %1 : tensor<?x?x?x?x?xf32>
-  bufferization.dealloc_tensor %2 : tensor<?x?x?x?x?xf32>
-
   return
 }

From ad23127222fe23e28ac3deaa16f3ae64d13b7b6f Mon Sep 17 00:00:00 2001
From: Congcong Cai <congcongcai0907@163.com>
Date: Tue, 12 Mar 2024 06:49:09 +0800
Subject: [PATCH 59/95] [mlir][inline] avoid inline self-recursive function
 (#83092)

---
 mlir/lib/Transforms/Utils/Inliner.cpp         |  8 +++++++
 .../Transforms/inlining-recursive-self.mlir   | 22 +++++++++++++++++++
 2 files changed, 30 insertions(+)
 create mode 100644 mlir/test/Transforms/inlining-recursive-self.mlir

diff --git a/mlir/lib/Transforms/Utils/Inliner.cpp b/mlir/lib/Transforms/Utils/Inliner.cpp
index 74776a73db9aa..f227cedb269d8 100644
--- a/mlir/lib/Transforms/Utils/Inliner.cpp
+++ b/mlir/lib/Transforms/Utils/Inliner.cpp
@@ -21,6 +21,7 @@
 #include "mlir/Support/DebugStringHelper.h"
 #include "mlir/Transforms/InliningUtils.h"
 #include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/Support/Debug.h"
 
@@ -711,6 +712,13 @@ bool Inliner::Impl::shouldInline(ResolvedCall &resolvedCall) {
   if (resolvedCall.call->hasTrait<OpTrait::IsTerminator>())
     return false;
 
+  // Don't allow inlining if the target is a self-recursive function.
+  if (llvm::count_if(*resolvedCall.targetNode,
+                     [&](CallGraphNode::Edge const &edge) -> bool {
+                       return edge.getTarget() == resolvedCall.targetNode;
+                     }) > 0)
+    return false;
+
   // Don't allow inlining if the target is an ancestor of the call. This
   // prevents inlining recursively.
   Region *callableRegion = resolvedCall.targetNode->getCallableRegion();
diff --git a/mlir/test/Transforms/inlining-recursive-self.mlir b/mlir/test/Transforms/inlining-recursive-self.mlir
new file mode 100644
index 0000000000000..5cc922db8e978
--- /dev/null
+++ b/mlir/test/Transforms/inlining-recursive-self.mlir
@@ -0,0 +1,22 @@
+// RUN: mlir-opt %s -inline='default-pipeline=''' | FileCheck %s
+// RUN: mlir-opt %s --mlir-disable-threading -inline='default-pipeline=''' | FileCheck %s
+
+// CHECK-LABEL: func.func @b0
+func.func @b0() {
+  // CHECK:         call @b0
+  // CHECK-NEXT:    call @b1
+  // CHECK-NEXT:    call @b0
+  // CHECK-NEXT:    call @b1
+  // CHECK-NEXT:    call @b0
+  func.call @b0() : () -> ()
+  func.call @b1() : () -> ()
+  func.call @b0() : () -> ()
+  func.call @b1() : () -> ()
+  func.call @b0() : () -> ()
+  return
+}
+func.func @b1() {
+  func.call @b1() : () -> ()
+  func.call @b1() : () -> ()
+  return
+}

From 8d220d109d28dac352c563ab062fb72132b7eca1 Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Mon, 11 Mar 2024 16:03:32 -0700
Subject: [PATCH 60/95] workflows: Fix incorrect input name in
 release-binaries.yml (#84604)

In aa02002491333c42060373bc84f1ff5d2c76b4ce the input name was changed
from tag to release-version, but the code was never updated.
---
 .github/workflows/release-binaries.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml
index 1dba91746dae5..131ad3004f457 100644
--- a/.github/workflows/release-binaries.yml
+++ b/.github/workflows/release-binaries.yml
@@ -71,8 +71,8 @@ jobs:
       # | X.Y.Z     | -final
       run: |
         tag="${{ github.ref_name }}"
-        trimmed=$(echo ${{ inputs.tag }} | xargs)
-        [[ "$trimmed" != "" ]] && tag="$trimmed"
+        trimmed=$(echo ${{ inputs.release-version }} | xargs)
+        [[ "$trimmed" != "" ]] && tag="llvmorg-$trimmed"
         if [ "$tag" = "main" ]; then
           # If tag is main, then we've been triggered by a scheduled so pass so
           # use the head commit as the tag.

From d125d5576ec85eb2517ced0fe4b68da7b8209d0c Mon Sep 17 00:00:00 2001
From: Tom Stellard <tstellar@redhat.com>
Date: Mon, 11 Mar 2024 16:04:44 -0700
Subject: [PATCH 61/95] github-automation.py: Set maintainer_can_modify=True
 for backport PRs (#84819)

This makes it possible to rebase the branch using the Web UI, which
makes it easier to manually merge the PRs. Manual merge is required when
squash merge won't preserve author information of the backport.
---
 llvm/utils/git/github-automation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/utils/git/github-automation.py b/llvm/utils/git/github-automation.py
index b2e6843eb9af1..b21f14eca4450 100755
--- a/llvm/utils/git/github-automation.py
+++ b/llvm/utils/git/github-automation.py
@@ -586,7 +586,7 @@ def create_pull_request(
                 body=body,
                 base=release_branch_for_issue,
                 head=head,
-                maintainer_can_modify=False,
+                maintainer_can_modify=True,
             )
 
             pull.as_issue().edit(milestone=self.issue.milestone)

From 75790dd2d0cff5b0c3e543e256f6c8f0fb5d0689 Mon Sep 17 00:00:00 2001
From: Daniel Sanders <daniel_l_sanders@apple.com>
Date: Mon, 11 Mar 2024 16:05:29 -0700
Subject: [PATCH 62/95] [RemoveDIs] Fix nullptr dereference in
 getFirstNonPHIIt() (#84595)

getFirstNonPHI() returns nullptr for blocks that lack a non-phi
(including a terminator) but getFirstNonPHIIt() may dereference its
result unconditionally. Return end() instead.

This came up for us downstream while changing our getFirstNonPHI() calls
(those intended to return the position after the phis but before the
debug info) to use getFirstNonPHIIt(). The pass in question is populating
new BB's and hasn't added terminators yet.
---
 llvm/lib/IR/BasicBlock.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp
index 25aa326116451..c188d2f912d16 100644
--- a/llvm/lib/IR/BasicBlock.cpp
+++ b/llvm/lib/IR/BasicBlock.cpp
@@ -348,6 +348,8 @@ const Instruction* BasicBlock::getFirstNonPHI() const {
 
 BasicBlock::const_iterator BasicBlock::getFirstNonPHIIt() const {
   const Instruction *I = getFirstNonPHI();
+  if (!I)
+    return end();
   BasicBlock::const_iterator It = I->getIterator();
   // Set the head-inclusive bit to indicate that this iterator includes
   // any debug-info at the start of the block. This is a no-op unless the

From 9688a6dae4de16e79ba677846df32099ec012627 Mon Sep 17 00:00:00 2001
From: Thomas Preud'homme <thomas.preudhomme@arm.com>
Date: Mon, 11 Mar 2024 23:07:49 +0000
Subject: [PATCH 63/95] [MLIR] Add missing MLIRFuncDialect dep to
 MLIRNVVMToLLVM (#84548)

This fixes the following failure when doing a clean build (in particular
no .ninja* lying around) of lib/libMLIRNVVMToLLVM.a only:
```
In file included from mlir/lib/Conversion/NVVMToLLVM/NVVMToLLVM.cpp:18:
mlir/include/mlir/Dialect/Func/IR/FuncOps.h:29:10: fatal error: mlir/Dialect/Func/IR/FuncOps.h.inc: No such file or directory
```
---
 mlir/lib/Conversion/NVVMToLLVM/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mlir/lib/Conversion/NVVMToLLVM/CMakeLists.txt b/mlir/lib/Conversion/NVVMToLLVM/CMakeLists.txt
index 2afff1a4e5f16..23174d1128719 100644
--- a/mlir/lib/Conversion/NVVMToLLVM/CMakeLists.txt
+++ b/mlir/lib/Conversion/NVVMToLLVM/CMakeLists.txt
@@ -11,6 +11,7 @@ add_mlir_conversion_library(MLIRNVVMToLLVM
   Core
 
   LINK_LIBS PUBLIC
+  MLIRFuncDialect
   MLIRGPUDialect
   MLIRLLVMCommonConversion
   MLIRLLVMDialect

From 36cf982d6cddaa65da24fcb853295a99a9154a53 Mon Sep 17 00:00:00 2001
From: Thomas Preud'homme <thomas.preudhomme@arm.com>
Date: Mon, 11 Mar 2024 23:08:56 +0000
Subject: [PATCH 64/95] [MLIR] Add missing MLIRFuncDialect dep to
 MLIRAMDGPUTransforms (#84550)

This fixes the following failure when doing a clean build (in particular
no .ninja* lying around) of lib/libMLIRAMDGPUTransforms.a only:
```
In file included from mlir/lib/Dialect/AMDGPU/Transforms/OptimizeSharedMemory.cpp:21:
mlir/include/mlir/Dialect/Func/IR/FuncOps.h:29:10: fatal error: mlir/Dialect/Func/IR/FuncOps.h.inc: No such file or directory
```
---
 mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt b/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt
index 2274656e84a5c..a955d585b9a1d 100644
--- a/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt
@@ -14,6 +14,7 @@ add_mlir_dialect_library(MLIRAMDGPUTransforms
   MLIRAMDGPUUtils
   MLIRArithDialect
   MLIRControlFlowDialect
+  MLIRFuncDialect
   MLIRIR
   MLIRPass
   MLIRTransforms

From b2ea04673b782f95ac9841f87df8bb5f7b561067 Mon Sep 17 00:00:00 2001
From: Thomas Preud'homme <thomas.preudhomme@arm.com>
Date: Mon, 11 Mar 2024 23:10:26 +0000
Subject: [PATCH 65/95] [MLIR] Add missing omp_gen dep to MLIROpenMPDialect
 (#84552)

This fixes the following failure when doing a clean build (in particular
no .ninja* lying around) of lib/libMLIROpenMPDialect.a only:
```
In file included from mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp:29:
llvm/include/llvm/Frontend/OpenMP/OMPConstants.h:20:10: fatal error: llvm/Frontend/OpenMP/OMP.h.inc: No such file or directory
```
---
 mlir/lib/Dialect/OpenMP/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mlir/lib/Dialect/OpenMP/CMakeLists.txt b/mlir/lib/Dialect/OpenMP/CMakeLists.txt
index 40b4837484a13..57a6d3445c151 100644
--- a/mlir/lib/Dialect/OpenMP/CMakeLists.txt
+++ b/mlir/lib/Dialect/OpenMP/CMakeLists.txt
@@ -5,6 +5,7 @@ add_mlir_dialect_library(MLIROpenMPDialect
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/OpenMP
 
   DEPENDS
+  omp_gen
   MLIROpenMPOpsIncGen
   MLIROpenMPOpsInterfacesIncGen
   MLIROpenMPTypeInterfacesIncGen

From 8d61f82bd3676bc541edfad1014e3ed599cc1390 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Mon, 11 Mar 2024 17:31:38 -0700
Subject: [PATCH 66/95] [lld][RISCV] Avoid second map lookup in mergeArch. NFC
 (#84687)

Instead of using find and then inserting into the map, we can use
insert and fix up the version using the iterator if the insert fails.
---
 lld/ELF/Arch/RISCV.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lld/ELF/Arch/RISCV.cpp b/lld/ELF/Arch/RISCV.cpp
index 4798c86f7d1b6..20de1b9b7bde9 100644
--- a/lld/ELF/Arch/RISCV.cpp
+++ b/lld/ELF/Arch/RISCV.cpp
@@ -1074,12 +1074,12 @@ static void mergeArch(RISCVISAInfo::OrderedExtensionMap &mergedExts,
     mergedXlen = info.getXLen();
   } else {
     for (const auto &ext : info.getExtensions()) {
-      if (auto it = mergedExts.find(ext.first); it != mergedExts.end()) {
-        if (std::tie(it->second.Major, it->second.Minor) >=
+      auto p = mergedExts.insert(ext);
+      if (!p.second) {
+        if (std::tie(p.first->second.Major, p.first->second.Minor) <
             std::tie(ext.second.Major, ext.second.Minor))
-          continue;
+          p.first->second = ext.second;
       }
-      mergedExts[ext.first] = ext.second;
     }
   }
 }

From 67ef4ae2c3cc4e2700e873aa6f251b70a09c3fea Mon Sep 17 00:00:00 2001
From: James Newling <james.newling@gmail.com>
Date: Mon, 11 Mar 2024 18:11:58 -0700
Subject: [PATCH 67/95] [MLIR][Tensor,MemRef] Fold expand_shape and
 collapse_shape if identity (#80658)

Before: op verifiers failed if the input and output ranks were the same
(i.e. no expansion or collapse). This behavior requires users of these
shape ops to verify manually that they are not creating identity
versions of these ops every time they build them -- problematic. This PR
removes this strict verification, and introduces folders for the
identity cases.

The PR also removes the special case handling of rank-0 tensors for
expand_shape and collapse_shape; there doesn't seem to be any reason to
treat them differently.
---
 .../mlir/Dialect/MemRef/IR/MemRefOps.td       |  2 +-
 .../mlir/Dialect/Tensor/IR/TensorOps.td       | 39 +++++------
 .../mlir/Dialect/Utils/ReshapeOpsUtils.h      | 64 ++++++++-----------
 mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp      | 20 ++++--
 mlir/lib/Dialect/Tensor/IR/TensorOps.cpp      | 12 ----
 mlir/test/Dialect/MemRef/canonicalize.mlir    | 29 +++++++++
 mlir/test/Dialect/MemRef/invalid.mlir         | 42 ++++++++----
 mlir/test/Dialect/Tensor/canonicalize.mlir    | 39 ++++++++++-
 mlir/test/Dialect/Tensor/invalid.mlir         | 14 ----
 9 files changed, 152 insertions(+), 109 deletions(-)

diff --git a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td
index c71517666b609..39e66cd9e6e5a 100644
--- a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td
+++ b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td
@@ -641,7 +641,7 @@ def MemRef_DmaStartOp : MemRef_Op<"dma_start"> {
   let summary = "non-blocking DMA operation that starts a transfer";
   let description = [{
     Syntax:
-    
+
     ```
     operation ::= `memref.dma_start` ssa-use`[`ssa-use-list`]` `,`
                    ssa-use`[`ssa-use-list`]` `,` ssa-use `,`
diff --git a/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td b/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td
index 1c61ece2676a9..670202fe4372e 100644
--- a/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td
+++ b/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td
@@ -1098,21 +1098,18 @@ class Tensor_ReassociativeReshapeOp<string mnemonic, list<Trait> traits = []> :
 def Tensor_ExpandShapeOp : Tensor_ReassociativeReshapeOp<"expand_shape"> {
   let summary = "operation to produce a tensor with a higher rank";
   let description = [{
-    The `tensor.expand_shape` op produces a new tensor with a higher
-    rank whose sizes are a reassociation of the original `src`.
+    The `tensor.expand_shape` op produces a tensor of higher (or equal)
+    rank than the operand `src` whose dimension sizes are a reassociation of
+    `src`.
 
-    A reassociation is defined as a continuous grouping of dimensions and is
-    represented with an array of DenseI64ArrayAttr attribute.
-
-    The verification rule is that the reassociation maps are applied to the
-    result tensor with the higher rank to obtain the operand tensor with the
-    smaller rank.
+    A reassociation is defined as a continuous grouping of dimensions. It is
+    represented with an array of DenseI64ArrayAttr attribute. Entries in the
+    array are referred to as reassociation maps.
 
-    The operand tensor type of a reshape can be zero-ranked if the result
-    tensor type is statically shaped with all dimensions being unit extent. In
-    such cases the reassociation map is empty.
+    The reassociation maps are applied to the result shape to obtain the operand
+    shape.
 
-    Examples:
+    Example:
 
     ```mlir
     // Dimension expansion i -> (i', j') and (k) -> (k')
@@ -1150,21 +1147,15 @@ def Tensor_ExpandShapeOp : Tensor_ReassociativeReshapeOp<"expand_shape"> {
 def Tensor_CollapseShapeOp : Tensor_ReassociativeReshapeOp<"collapse_shape"> {
   let summary = "operation to produce a tensor with a smaller rank";
   let description = [{
-    The `tensor.collapse_shape` op produces a new tensor with a smaller
-    rank whose sizes are a reassociation of the original `src`.
+    The `tensor.collapse_shape` op produces a new tensor of lower (or equal)
+    rank whose dimension sizes are a reassociation of the original `src` dimensions.
 
     A reassociation is defined as a continuous grouping of dimensions and is
-    represented with an array of DenseI64ArrayAttr attribute.
+    represented by an array of DenseI64ArrayAttr attribute. The reassociation
+    maps are applied to the operand shape to obtain the result shape.
 
-    The verification rule is that the reassociation maps are applied to the
-    operand tensor with the higher rank to obtain the result tensor with the
-    smaller rank.
 
-    The result tensor type of a reshape can be zero-ranked if the operand
-    tensor type is statically shaped with all dimensions being unit extent. In
-    such case the reassociation map is empty.
-
-    Examples:
+    Example:
 
     ```mlir
     // Dimension collapse (i, j) -> i' and k -> k'
@@ -1841,7 +1832,7 @@ def Tensor_PackOp : Tensor_RelayoutOp<"pack", [
     and optionally transposes the tiled source tensor dimensions.
 
     `inner_dims_pos` (mandatory) specifies `k` source tensor dimensions that are
-    being tiled, where `0 < k <= n`. The order of the dimensions matters: 
+    being tiled, where `0 < k <= n`. The order of the dimensions matters:
      - The tiled dimensions (of size `inner_tiles`) are added to the end of the result
     tensor in the order in which they appear in `inner_dims_pos`.
      - `inner_dims_pos[i]` specifies the source tensor dimension tiled by
diff --git a/mlir/include/mlir/Dialect/Utils/ReshapeOpsUtils.h b/mlir/include/mlir/Dialect/Utils/ReshapeOpsUtils.h
index 61c929dee0f27..ae9824f728da4 100644
--- a/mlir/include/mlir/Dialect/Utils/ReshapeOpsUtils.h
+++ b/mlir/include/mlir/Dialect/Utils/ReshapeOpsUtils.h
@@ -85,16 +85,21 @@ bool isReassociationValid(ArrayRef<AffineMap> reassociation,
 template <typename ReshapeOpTy, typename InverseReshapeOpTy>
 static OpFoldResult foldReshapeOp(ReshapeOpTy reshapeOp,
                                   ArrayRef<Attribute> operands) {
-  // Fold producer-consumer reshape ops that where the operand type of the
+
+  if (reshapeOp.getSrcType() == reshapeOp.getType())
+    return reshapeOp.getSrc();
+
+  // Fold producer-consumer reshape ops where the operand type of the
   // producer is same as the return type of the consumer.
   auto reshapeSrcOp =
       reshapeOp.getSrc().template getDefiningOp<InverseReshapeOpTy>();
   if (reshapeSrcOp && reshapeSrcOp.getSrcType() == reshapeOp.getResultType())
     return reshapeSrcOp.getSrc();
+
   // Reshape of a constant can be replaced with a new constant.
-  if (auto elements = dyn_cast_or_null<DenseElementsAttr>(operands.front())) {
+  if (auto elements = dyn_cast_or_null<DenseElementsAttr>(operands.front()))
     return elements.reshape(cast<ShapedType>(reshapeOp.getResult().getType()));
-  }
+
   return nullptr;
 }
 
@@ -103,41 +108,36 @@ static OpFoldResult foldReshapeOp(ReshapeOpTy reshapeOp,
 template <typename Op, typename T>
 static LogicalResult verifyReshapeLikeTypes(Op op, T expandedType,
                                             T collapsedType, bool isExpansion) {
+
   unsigned expandedRank = expandedType.getRank();
   unsigned collapsedRank = collapsedType.getRank();
   if (expandedRank < collapsedRank)
-    return op.emitOpError("expected the type ")
-           << expandedType
-           << " to have higher rank than the type = " << collapsedType;
-  if (expandedRank == 0)
-    return op.emitOpError("expected non-zero memref ranks");
-  if (expandedRank == collapsedRank)
-    return op.emitOpError("expected to collapse or expand dims");
-
-  if (collapsedRank == 0) {
-    // If collapsed rank is 0, then expanded type must be static shaped and of
-    // sizes 1.
-    if (llvm::any_of(expandedType.getShape(),
-                     [](int64_t dim) -> bool { return dim != 1; }))
-      return op.emitOpError("invalid to reshape tensor/memref with non-unit "
-                            "extent dimensions to zero-rank tensor/memref");
-    return success();
-  }
+    return op.emitOpError("expected the expanded type, ")
+           << expandedType << " to have a higher (or same) rank "
+           << "than the collapsed type, " << collapsedType << '.';
+
   if (collapsedRank != op.getReassociation().size())
-    return op.emitOpError("expected rank of the collapsed type(")
-           << collapsedRank << ") to be the number of reassociation maps("
-           << op.getReassociation().size() << ")";
+    return op.emitOpError("expected collapsed rank (")
+           << collapsedRank << ") to equal the number of reassociation maps ("
+           << op.getReassociation().size() << ").";
+
   auto maps = op.getReassociationMaps();
   for (auto it : llvm::enumerate(maps))
     if (it.value().getNumDims() != expandedRank)
       return op.emitOpError("expected reassociation map #")
-             << it.index() << " of same rank as expanded memref("
-             << expandedRank << "), but got " << it.value().getNumDims();
+             << it.index() << " to have size equal to the expanded rank ("
+             << expandedRank << "), but it is  " << it.value().getNumDims()
+             << '.';
+
   int invalidIdx = 0;
   if (!isReassociationValid(maps, &invalidIdx))
     return op.emitOpError("expected reassociation map #")
-           << invalidIdx << " to be valid and contiguous";
-  return verifyReshapeLikeShapes(op, collapsedType, expandedType, isExpansion);
+           << invalidIdx << " to be valid and contiguous.";
+
+  return reshapeLikeShapesAreCompatible(
+      [&](const Twine &msg) { return op->emitOpError(msg); },
+      collapsedType.getShape(), expandedType.getShape(),
+      op.getReassociationIndices(), isExpansion);
 }
 
 /// Verify that shapes of the reshaped types using following rules
@@ -153,16 +153,6 @@ LogicalResult reshapeLikeShapesAreCompatible(
     ArrayRef<int64_t> collapsedShape, ArrayRef<int64_t> expandedShape,
     ArrayRef<ReassociationIndices> reassociationMaps, bool isExpandingReshape);
 
-template <typename OpTy>
-static LogicalResult verifyReshapeLikeShapes(OpTy op, ShapedType collapsedType,
-                                             ShapedType expandedType,
-                                             bool isExpandingReshape) {
-  return reshapeLikeShapesAreCompatible(
-      [&](const Twine &msg) { return op->emitOpError(msg); },
-      collapsedType.getShape(), expandedType.getShape(),
-      op.getReassociationIndices(), isExpandingReshape);
-}
-
 /// Returns true iff the type is a MemRefType and has a non-identity layout.
 bool hasNonIdentityLayout(Type type);
 
diff --git a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
index 248193481acfc..94e0ed319cae8 100644
--- a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
+++ b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
@@ -2224,9 +2224,13 @@ LogicalResult ExpandShapeOp::verify() {
   MemRefType srcType = getSrcType();
   MemRefType resultType = getResultType();
 
-  if (srcType.getRank() >= resultType.getRank())
-    return emitOpError("expected rank expansion, but found source rank ")
-           << srcType.getRank() << " >= result rank " << resultType.getRank();
+  if (srcType.getRank() > resultType.getRank()) {
+    auto r0 = srcType.getRank();
+    auto r1 = resultType.getRank();
+    return emitOpError("has source rank ")
+           << r0 << " and result rank " << r1 << ". This is not an expansion ("
+           << r0 << " > " << r1 << ").";
+  }
 
   // Verify result shape.
   if (failed(verifyCollapsedShape(getOperation(), srcType.getShape(),
@@ -2378,9 +2382,13 @@ LogicalResult CollapseShapeOp::verify() {
   MemRefType srcType = getSrcType();
   MemRefType resultType = getResultType();
 
-  if (srcType.getRank() <= resultType.getRank())
-    return emitOpError("expected rank reduction, but found source rank ")
-           << srcType.getRank() << " <= result rank " << resultType.getRank();
+  if (srcType.getRank() < resultType.getRank()) {
+    auto r0 = srcType.getRank();
+    auto r1 = resultType.getRank();
+    return emitOpError("has source rank ")
+           << r0 << " and result rank " << r1 << ". This is not a collapse ("
+           << r0 << " < " << r1 << ").";
+  }
 
   // Verify result shape.
   if (failed(verifyCollapsedShape(getOperation(), resultType.getShape(),
diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
index fe2f250e6b929..a854da466c313 100644
--- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
+++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
@@ -1656,22 +1656,10 @@ static LogicalResult verifyTensorReshapeOp(TensorReshapeOp op,
 }
 
 LogicalResult ExpandShapeOp::verify() {
-  auto srcType = getSrcType();
-  auto resultType = getResultType();
-  if (srcType.getRank() >= resultType.getRank())
-    return emitOpError("expected rank expansion, but found source rank ")
-           << srcType.getRank() << " >= result rank " << resultType.getRank();
-
   return verifyTensorReshapeOp(*this, getResultType(), getSrcType());
 }
 
 LogicalResult CollapseShapeOp::verify() {
-  auto srcType = getSrcType();
-  auto resultType = getResultType();
-  if (srcType.getRank() <= resultType.getRank())
-    return emitOpError("expected rank reduction, but found source rank ")
-           << srcType.getRank() << " <= result rank " << resultType.getRank();
-
   return verifyTensorReshapeOp(*this, getSrcType(), getResultType());
 }
 
diff --git a/mlir/test/Dialect/MemRef/canonicalize.mlir b/mlir/test/Dialect/MemRef/canonicalize.mlir
index a772a25da5738..b1e92e54d561d 100644
--- a/mlir/test/Dialect/MemRef/canonicalize.mlir
+++ b/mlir/test/Dialect/MemRef/canonicalize.mlir
@@ -1,5 +1,34 @@
 // RUN: mlir-opt %s -canonicalize="test-convergence" --split-input-file -allow-unregistered-dialect | FileCheck %s
 
+
+// CHECK-LABEL: collapse_shape_identity_fold
+// CHECK-NEXT: return
+func.func @collapse_shape_identity_fold(%arg0 : memref<5xi8>) -> memref<5xi8> {
+  %0 = memref.collapse_shape %arg0 [[0]] : memref<5xi8> into memref<5xi8>
+  return %0 : memref<5xi8>
+}
+
+// -----
+
+// CHECK-LABEL: expand_shape_identity_fold
+// CHECK-NEXT: return
+func.func @expand_shape_identity_fold(%arg0 : memref<5x4xi8>) -> memref<5x4xi8> {
+  %0 = memref.expand_shape %arg0 [[0], [1]] : memref<5x4xi8> into memref<5x4xi8>
+  return %0 : memref<5x4xi8>
+}
+
+// -----
+
+// CHECK-LABEL: collapse_expand_rank0_cancel
+// CHECK-NEXT: return
+func.func @collapse_expand_rank0_cancel(%arg0 : memref<1x1xi8>) -> memref<1x1xi8> {
+  %0 = memref.collapse_shape %arg0 [] : memref<1x1xi8> into memref<i8>
+  %1 = memref.expand_shape %0 [] : memref<i8> into memref<1x1xi8>
+  return %1 : memref<1x1xi8>
+}
+
+// -----
+
 // CHECK-LABEL: func @subview_of_size_memcast
 //  CHECK-SAME:   %[[ARG0:.[a-z0-9A-Z_]+]]: memref<4x6x16x32xi8>
 //       CHECK:   %[[S:.+]] = memref.subview %[[ARG0]][0, 1, 0, 0] [1, 1, 16, 32] [1, 1, 1, 1] : memref<4x6x16x32xi8> to memref<16x32xi8, strided{{.*}}>
diff --git a/mlir/test/Dialect/MemRef/invalid.mlir b/mlir/test/Dialect/MemRef/invalid.mlir
index 8f5ba5ea8fc78..1aef417549d9a 100644
--- a/mlir/test/Dialect/MemRef/invalid.mlir
+++ b/mlir/test/Dialect/MemRef/invalid.mlir
@@ -415,20 +415,6 @@ func.func @collapse_shape_out_of_bounds(%arg0: memref<?x?xf32>) {
 
 // -----
 
-func.func @expand_shape_invalid_ranks(%arg0: memref<?x?xf32>) {
-  // expected-error @+1 {{op expected rank expansion, but found source rank 2 >= result rank 2}}
-  %0 = memref.expand_shape %arg0 [[0], [1]] : memref<?x?xf32> into memref<?x?xf32>
-}
-
-// -----
-
-func.func @collapse_shape_invalid_ranks(%arg0: memref<?x?xf32>) {
-  // expected-error @+1 {{op expected rank reduction, but found source rank 2 <= result rank 2}}
-  %0 = memref.collapse_shape %arg0 [[0], [1]] : memref<?x?xf32> into memref<?x?xf32>
-}
-
-// -----
-
 func.func @expand_shape_out_of_bounds(%arg0: memref<?xf32>) {
   // expected-error @+1 {{op reassociation index 2 is out of bounds}}
   %0 = memref.expand_shape %arg0 [[0, 1, 2]] : memref<?xf32> into memref<4x?xf32>
@@ -462,6 +448,34 @@ func.func @collapse_shape_invalid_reassociation(%arg0: memref<?x?x?xf32>) {
 
 // -----
 
+// An (invalid) attempt at using collapse_shape to increase the rank might look
+// like this. Verify that a sensible error is emitted in this case.
+func.func @collapse_shape_invalid_reassociation_expansion(%arg0: memref<?xf32>) {
+  // expected-error @+1 {{'memref.collapse_shape' op has source rank 1 and result rank 2. This is not a collapse (1 < 2)}}
+  %0 = memref.collapse_shape %arg0 [[0], [0]] :
+    memref<?xf32> into memref<?x?xf32>
+}
+
+// -----
+
+// An (invalid) attempt at using expand_shape to reduce the rank might look
+// like this. Verify that a sensible error is emitted in this case.
+func.func @expand_shape_invalid_reassociation(%arg0: memref<2x3x1xf32>) {
+  // expected-error @+1 {{'memref.expand_shape' op has source rank 3 and result rank 2. This is not an expansion (3 > 2)}}
+  %0 = memref.expand_shape %arg0 [[0], [1], [1]] :
+    memref<2x3x1xf32> into memref<2x3xf32>
+}
+
+// -----
+
+func.func @collapse_shape_invalid_reassociation_expansion(%arg0: memref<?x?xf32>) {
+  // expected-error @+1 {{reassociation indices must be contiguous}}
+  %0 = memref.collapse_shape %arg0 [[1], [0]] :
+    memref<?x?xf32> into memref<?x?xf32>
+}
+
+// -----
+
 func.func @collapse_shape_reshaping_non_contiguous(
     %arg0: memref<3x4x5xf32, strided<[270, 50, 10], offset: 0>>) {
   // expected-error @+1 {{invalid source layout map or collapsing non-contiguous dims}}
diff --git a/mlir/test/Dialect/Tensor/canonicalize.mlir b/mlir/test/Dialect/Tensor/canonicalize.mlir
index d17c23adfb14d..70f5d61bd802f 100644
--- a/mlir/test/Dialect/Tensor/canonicalize.mlir
+++ b/mlir/test/Dialect/Tensor/canonicalize.mlir
@@ -1,5 +1,42 @@
 // RUN: mlir-opt %s -split-input-file -canonicalize="test-convergence" | FileCheck %s
 
+
+// CHECK-LABEL: expand_shape_identity_fold
+// CHECK-NEXT: return
+func.func @expand_shape_identity_fold(%arg0 : tensor<5xf32>) -> tensor<5xf32> {
+  %0 = tensor.expand_shape %arg0 [[0]] : tensor<5xf32> into tensor<5xf32>
+  return %0 : tensor<5xf32>
+}
+
+// -----
+
+// CHECK-LABEL: expand_shape_rank0_identity_fold
+// CHECK-NEXT: return
+func.func @expand_shape_rank0_identity_fold(%arg0 : tensor<f32>) -> tensor<f32> {
+  %0 = tensor.expand_shape %arg0 [] : tensor<f32> into tensor<f32>
+  return %0 : tensor<f32>
+}
+
+// -----
+
+// CHECK-LABEL: collapse_shape_identity_fold
+// CHECK-NEXT: return
+func.func @collapse_shape_identity_fold(%arg0 : tensor<5x4xf32>) -> tensor<5x4xf32> {
+  %0 = tensor.collapse_shape %arg0 [[0], [1]] : tensor<5x4xf32> into tensor<5x4xf32>
+  return %0 : tensor<5x4xf32>
+}
+
+// -----
+
+// CHECK-LABEL: collapse_shape_rank0_identity_fold
+// CHECK-NEXT: return
+func.func @collapse_shape_rank0_identity_fold(%arg0 : tensor<f32>) -> tensor<f32> {
+  %0 = tensor.collapse_shape %arg0 [] : tensor<f32> into tensor<f32>
+  return %0 : tensor<f32>
+}
+
+// -----
+
 // CHECK-LABEL: @tensor_bitcast_chain_ok
 // CHECK-SAME: %[[IN:.*]]: tensor<2xi32>
 func.func @tensor_bitcast_chain_ok(%input: tensor<2xi32>) -> tensor<2xf32> {
@@ -2092,7 +2129,7 @@ func.func @unpack_pack(%t: tensor<128x128xf32>) -> tensor<128x128xf32> {
 
 // Chain: NC -> NCnc -> NCnc -> NC
 // CHECK: func.func @unpack_pack(
-// CHECK-SAME: %[[T:.+]]: tensor<128x128xf32>, 
+// CHECK-SAME: %[[T:.+]]: tensor<128x128xf32>,
 // CHECK: return %[[T]] : tensor<128x128xf32>
 func.func @unpack_pack(%t: tensor<128x128xf32>, %tile1: index, %tile2: index) -> tensor<128x128xf32> {
   %tensor_empty = tensor.empty(%tile1, %tile2) : tensor<16x16x?x?xf32>
diff --git a/mlir/test/Dialect/Tensor/invalid.mlir b/mlir/test/Dialect/Tensor/invalid.mlir
index 4c534fe936e3d..79ca0de68a1e9 100644
--- a/mlir/test/Dialect/Tensor/invalid.mlir
+++ b/mlir/test/Dialect/Tensor/invalid.mlir
@@ -343,20 +343,6 @@ func.func @illegal_collapsing_reshape_mixed_tensor_2(%arg0 : tensor<?x4x5xf32>)
 
 // -----
 
-func.func @expand_shape_invalid_ranks(%arg0: tensor<?x?xf32>) {
-  // expected-error @+1 {{op expected rank expansion, but found source rank 2 >= result rank 2}}
-  %0 = tensor.expand_shape %arg0 [[0], [1]] : tensor<?x?xf32> into tensor<?x?xf32>
-}
-
-// -----
-
-func.func @collapse_shape_invalid_ranks(%arg0: tensor<?x?xf32>) {
-  // expected-error @+1 {{op expected rank reduction, but found source rank 2 <= result rank 2}}
-  %0 = tensor.collapse_shape %arg0 [[0], [1]] : tensor<?x?xf32> into tensor<?x?xf32>
-}
-
-// -----
-
 func.func @rank(%0: f32) {
   // expected-error@+1 {{'tensor.rank' op operand #0 must be tensor of any type values}}
   "tensor.rank"(%0): (f32)->index

From 672fc89347b831f2845e7825affc30c865758270 Mon Sep 17 00:00:00 2001
From: Florian Mayer <fmayer@google.com>
Date: Mon, 11 Mar 2024 18:18:49 -0700
Subject: [PATCH 68/95] [NFC] [hwasan] factor out selective instrumentation
 logic (#84408)

sanitizeFunction is long enough already.
---
 .../Instrumentation/HWAddressSanitizer.cpp    | 53 +++++++++++--------
 1 file changed, 31 insertions(+), 22 deletions(-)

diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index 422406e46bdb0..11a5c29c35f70 100644
--- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -317,6 +317,8 @@ class HWAddressSanitizer {
     Value *MemTag = nullptr;
   };
 
+  bool selectiveInstrumentationShouldSkip(Function &F,
+                                          FunctionAnalysisManager &FAM);
   void initializeModule();
   void createHwasanCtorComdat();
 
@@ -1523,6 +1525,31 @@ bool HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo,
   return true;
 }
 
+bool HWAddressSanitizer::selectiveInstrumentationShouldSkip(
+    Function &F, FunctionAnalysisManager &FAM) {
+  if (ClRandomSkipRate.getNumOccurrences()) {
+    std::bernoulli_distribution D(ClRandomSkipRate);
+    if (D(*Rng))
+      return true;
+  } else {
+    auto &MAMProxy = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
+    ProfileSummaryInfo *PSI =
+        MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
+    if (PSI && PSI->hasProfileSummary()) {
+      auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
+      if ((ClHotPercentileCutoff.getNumOccurrences() &&
+           ClHotPercentileCutoff >= 0)
+              ? PSI->isFunctionHotInCallGraphNthPercentile(
+                    ClHotPercentileCutoff, &F, BFI)
+              : PSI->isFunctionHotInCallGraph(&F, BFI))
+        return true;
+    } else {
+      ++NumNoProfileSummaryFuncs;
+    }
+  }
+  return false;
+}
+
 void HWAddressSanitizer::sanitizeFunction(Function &F,
                                           FunctionAnalysisManager &FAM) {
   if (&F == HwasanCtorFunction)
@@ -1535,28 +1562,10 @@ void HWAddressSanitizer::sanitizeFunction(Function &F,
     return;
 
   NumTotalFuncs++;
-  if (CSelectiveInstrumentation) {
-    if (ClRandomSkipRate.getNumOccurrences()) {
-      std::bernoulli_distribution D(ClRandomSkipRate);
-      if (D(*Rng))
-        return;
-    } else {
-      auto &MAMProxy = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
-      ProfileSummaryInfo *PSI =
-          MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
-      if (PSI && PSI->hasProfileSummary()) {
-        auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
-        if ((ClHotPercentileCutoff.getNumOccurrences() &&
-             ClHotPercentileCutoff >= 0)
-                ? PSI->isFunctionHotInCallGraphNthPercentile(
-                      ClHotPercentileCutoff, &F, BFI)
-                : PSI->isFunctionHotInCallGraph(&F, BFI))
-          return;
-      } else {
-        ++NumNoProfileSummaryFuncs;
-      }
-    }
-  }
+
+  if (CSelectiveInstrumentation && selectiveInstrumentationShouldSkip(F, FAM))
+    return;
+
   NumInstrumentedFuncs++;
 
   LLVM_DEBUG(dbgs() << "Function: " << F.getName() << "\n");

From 41658bafb70680d0aafb7e79c7f694b8c2a5217d Mon Sep 17 00:00:00 2001
From: David Benjamin <davidben@google.com>
Date: Mon, 11 Mar 2024 21:39:21 -0400
Subject: [PATCH 69/95] [libc++][hardening] Add iterator validity checks on
 unordered containers (#80230)

These are simply null checks, so use `_LIBCPP_ASSERT_NON_NULL`. This
allows us to restore a bunch of the old debug tests. I've extended them
to also cover the const iterators, as those run through different
codepaths than the const ones.

This does the easier (and less important) half of #80212.
---
 libcxx/include/__hash_table                   | 40 +++++++++--
 .../assert.iterator.dereference.pass.cpp      | 52 ++++++++++++++
 .../assert.iterator.increment.pass.cpp        | 59 ++++++++++++++++
 ...assert.local_iterator.dereference.pass.cpp | 50 ++++++++++++++
 .../assert.local_iterator.increment.pass.cpp  | 66 ++++++++++++++++++
 .../debug.iterator.dereference.pass.cpp       | 41 ------------
 .../debug.iterator.increment.pass.cpp         | 46 -------------
 .../debug.local_iterator.dereference.pass.cpp | 39 -----------
 .../debug.local_iterator.increment.pass.cpp   | 49 --------------
 .../assert.iterator.dereference.pass.cpp      | 52 ++++++++++++++
 .../assert.iterator.increment.pass.cpp        | 59 ++++++++++++++++
 ...assert.local_iterator.dereference.pass.cpp | 50 ++++++++++++++
 .../assert.local_iterator.increment.pass.cpp  | 67 +++++++++++++++++++
 .../debug.iterator.dereference.pass.cpp       | 41 ------------
 .../debug.iterator.increment.pass.cpp         | 46 -------------
 .../debug.local_iterator.dereference.pass.cpp | 39 -----------
 .../debug.local_iterator.increment.pass.cpp   | 50 --------------
 .../assert.iterator.dereference.pass.cpp      | 46 +++++++++++++
 .../assert.iterator.increment.pass.cpp        | 58 ++++++++++++++++
 ...assert.local_iterator.dereference.pass.cpp | 48 +++++++++++++
 .../assert.local_iterator.increment.pass.cpp  | 64 ++++++++++++++++++
 .../debug.iterator.dereference.pass.cpp       | 39 -----------
 .../debug.iterator.increment.pass.cpp         | 47 -------------
 .../debug.local_iterator.dereference.pass.cpp | 41 ------------
 .../debug.local_iterator.increment.pass.cpp   | 51 --------------
 .../assert.iterator.dereference.pass.cpp      | 46 +++++++++++++
 .../assert.iterator.increment.pass.cpp        | 58 ++++++++++++++++
 ...assert.local_iterator.dereference.pass.cpp | 48 +++++++++++++
 .../assert.local_iterator.increment.pass.cpp  | 64 ++++++++++++++++++
 .../debug.iterator.dereference.pass.cpp       | 39 -----------
 .../debug.iterator.increment.pass.cpp         | 47 -------------
 .../debug.local_iterator.dereference.pass.cpp | 41 ------------
 .../debug.local_iterator.increment.pass.cpp   | 49 --------------
 33 files changed, 923 insertions(+), 709 deletions(-)
 create mode 100644 libcxx/test/libcxx/containers/unord/unord.map/assert.iterator.dereference.pass.cpp
 create mode 100644 libcxx/test/libcxx/containers/unord/unord.map/assert.iterator.increment.pass.cpp
 create mode 100644 libcxx/test/libcxx/containers/unord/unord.map/assert.local_iterator.dereference.pass.cpp
 create mode 100644 libcxx/test/libcxx/containers/unord/unord.map/assert.local_iterator.increment.pass.cpp
 delete mode 100644 libcxx/test/libcxx/containers/unord/unord.map/debug.iterator.dereference.pass.cpp
 delete mode 100644 libcxx/test/libcxx/containers/unord/unord.map/debug.iterator.increment.pass.cpp
 delete mode 100644 libcxx/test/libcxx/containers/unord/unord.map/debug.local_iterator.dereference.pass.cpp
 delete mode 100644 libcxx/test/libcxx/containers/unord/unord.map/debug.local_iterator.increment.pass.cpp
 create mode 100644 libcxx/test/libcxx/containers/unord/unord.multimap/assert.iterator.dereference.pass.cpp
 create mode 100644 libcxx/test/libcxx/containers/unord/unord.multimap/assert.iterator.increment.pass.cpp
 create mode 100644 libcxx/test/libcxx/containers/unord/unord.multimap/assert.local_iterator.dereference.pass.cpp
 create mode 100644 libcxx/test/libcxx/containers/unord/unord.multimap/assert.local_iterator.increment.pass.cpp
 delete mode 100644 libcxx/test/libcxx/containers/unord/unord.multimap/debug.iterator.dereference.pass.cpp
 delete mode 100644 libcxx/test/libcxx/containers/unord/unord.multimap/debug.iterator.increment.pass.cpp
 delete mode 100644 libcxx/test/libcxx/containers/unord/unord.multimap/debug.local_iterator.dereference.pass.cpp
 delete mode 100644 libcxx/test/libcxx/containers/unord/unord.multimap/debug.local_iterator.increment.pass.cpp
 create mode 100644 libcxx/test/libcxx/containers/unord/unord.multiset/assert.iterator.dereference.pass.cpp
 create mode 100644 libcxx/test/libcxx/containers/unord/unord.multiset/assert.iterator.increment.pass.cpp
 create mode 100644 libcxx/test/libcxx/containers/unord/unord.multiset/assert.local_iterator.dereference.pass.cpp
 create mode 100644 libcxx/test/libcxx/containers/unord/unord.multiset/assert.local_iterator.increment.pass.cpp
 delete mode 100644 libcxx/test/libcxx/containers/unord/unord.multiset/debug.iterator.dereference.pass.cpp
 delete mode 100644 libcxx/test/libcxx/containers/unord/unord.multiset/debug.iterator.increment.pass.cpp
 delete mode 100644 libcxx/test/libcxx/containers/unord/unord.multiset/debug.local_iterator.dereference.pass.cpp
 delete mode 100644 libcxx/test/libcxx/containers/unord/unord.multiset/debug.local_iterator.increment.pass.cpp
 create mode 100644 libcxx/test/libcxx/containers/unord/unord.set/assert.iterator.dereference.pass.cpp
 create mode 100644 libcxx/test/libcxx/containers/unord/unord.set/assert.iterator.increment.pass.cpp
 create mode 100644 libcxx/test/libcxx/containers/unord/unord.set/assert.local_iterator.dereference.pass.cpp
 create mode 100644 libcxx/test/libcxx/containers/unord/unord.set/assert.local_iterator.increment.pass.cpp
 delete mode 100644 libcxx/test/libcxx/containers/unord/unord.set/debug.iterator.dereference.pass.cpp
 delete mode 100644 libcxx/test/libcxx/containers/unord/unord.set/debug.iterator.increment.pass.cpp
 delete mode 100644 libcxx/test/libcxx/containers/unord/unord.set/debug.local_iterator.dereference.pass.cpp
 delete mode 100644 libcxx/test/libcxx/containers/unord/unord.set/debug.local_iterator.increment.pass.cpp

diff --git a/libcxx/include/__hash_table b/libcxx/include/__hash_table
index ec7d694c4a55f..e6691e78a267f 100644
--- a/libcxx/include/__hash_table
+++ b/libcxx/include/__hash_table
@@ -284,13 +284,21 @@ public:
 
   _LIBCPP_HIDE_FROM_ABI __hash_iterator() _NOEXCEPT : __node_(nullptr) {}
 
-  _LIBCPP_HIDE_FROM_ABI reference operator*() const { return __node_->__upcast()->__get_value(); }
+  _LIBCPP_HIDE_FROM_ABI reference operator*() const {
+    _LIBCPP_ASSERT_NON_NULL(
+        __node_ != nullptr, "Attempted to dereference a non-dereferenceable unordered container iterator");
+    return __node_->__upcast()->__get_value();
+  }
 
   _LIBCPP_HIDE_FROM_ABI pointer operator->() const {
+    _LIBCPP_ASSERT_NON_NULL(
+        __node_ != nullptr, "Attempted to dereference a non-dereferenceable unordered container iterator");
     return pointer_traits<pointer>::pointer_to(__node_->__upcast()->__get_value());
   }
 
   _LIBCPP_HIDE_FROM_ABI __hash_iterator& operator++() {
+    _LIBCPP_ASSERT_NON_NULL(
+        __node_ != nullptr, "Attempted to increment a non-incrementable unordered container iterator");
     __node_ = __node_->__next_;
     return *this;
   }
@@ -345,12 +353,20 @@ public:
 
   _LIBCPP_HIDE_FROM_ABI __hash_const_iterator(const __non_const_iterator& __x) _NOEXCEPT : __node_(__x.__node_) {}
 
-  _LIBCPP_HIDE_FROM_ABI reference operator*() const { return __node_->__upcast()->__get_value(); }
+  _LIBCPP_HIDE_FROM_ABI reference operator*() const {
+    _LIBCPP_ASSERT_NON_NULL(
+        __node_ != nullptr, "Attempted to dereference a non-dereferenceable unordered container const_iterator");
+    return __node_->__upcast()->__get_value();
+  }
   _LIBCPP_HIDE_FROM_ABI pointer operator->() const {
+    _LIBCPP_ASSERT_NON_NULL(
+        __node_ != nullptr, "Attempted to dereference a non-dereferenceable unordered container const_iterator");
     return pointer_traits<pointer>::pointer_to(__node_->__upcast()->__get_value());
   }
 
   _LIBCPP_HIDE_FROM_ABI __hash_const_iterator& operator++() {
+    _LIBCPP_ASSERT_NON_NULL(
+        __node_ != nullptr, "Attempted to increment a non-incrementable unordered container const_iterator");
     __node_ = __node_->__next_;
     return *this;
   }
@@ -400,13 +416,21 @@ public:
 
   _LIBCPP_HIDE_FROM_ABI __hash_local_iterator() _NOEXCEPT : __node_(nullptr) {}
 
-  _LIBCPP_HIDE_FROM_ABI reference operator*() const { return __node_->__upcast()->__get_value(); }
+  _LIBCPP_HIDE_FROM_ABI reference operator*() const {
+    _LIBCPP_ASSERT_NON_NULL(
+        __node_ != nullptr, "Attempted to dereference a non-dereferenceable unordered container local_iterator");
+    return __node_->__upcast()->__get_value();
+  }
 
   _LIBCPP_HIDE_FROM_ABI pointer operator->() const {
+    _LIBCPP_ASSERT_NON_NULL(
+        __node_ != nullptr, "Attempted to dereference a non-dereferenceable unordered container local_iterator");
     return pointer_traits<pointer>::pointer_to(__node_->__upcast()->__get_value());
   }
 
   _LIBCPP_HIDE_FROM_ABI __hash_local_iterator& operator++() {
+    _LIBCPP_ASSERT_NON_NULL(
+        __node_ != nullptr, "Attempted to increment a non-incrementable unordered container local_iterator");
     __node_ = __node_->__next_;
     if (__node_ != nullptr && std::__constrain_hash(__node_->__hash(), __bucket_count_) != __bucket_)
       __node_ = nullptr;
@@ -475,13 +499,21 @@ public:
         __bucket_(__x.__bucket_),
         __bucket_count_(__x.__bucket_count_) {}
 
-  _LIBCPP_HIDE_FROM_ABI reference operator*() const { return __node_->__upcast()->__get_value(); }
+  _LIBCPP_HIDE_FROM_ABI reference operator*() const {
+    _LIBCPP_ASSERT_NON_NULL(
+        __node_ != nullptr, "Attempted to dereference a non-dereferenceable unordered container const_local_iterator");
+    return __node_->__upcast()->__get_value();
+  }
 
   _LIBCPP_HIDE_FROM_ABI pointer operator->() const {
+    _LIBCPP_ASSERT_NON_NULL(
+        __node_ != nullptr, "Attempted to dereference a non-dereferenceable unordered container const_local_iterator");
     return pointer_traits<pointer>::pointer_to(__node_->__upcast()->__get_value());
   }
 
   _LIBCPP_HIDE_FROM_ABI __hash_const_local_iterator& operator++() {
+    _LIBCPP_ASSERT_NON_NULL(
+        __node_ != nullptr, "Attempted to increment a non-incrementable unordered container const_local_iterator");
     __node_ = __node_->__next_;
     if (__node_ != nullptr && std::__constrain_hash(__node_->__hash(), __bucket_count_) != __bucket_)
       __node_ = nullptr;
diff --git a/libcxx/test/libcxx/containers/unord/unord.map/assert.iterator.dereference.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.map/assert.iterator.dereference.pass.cpp
new file mode 100644
index 0000000000000..f57341d64ff39
--- /dev/null
+++ b/libcxx/test/libcxx/containers/unord/unord.map/assert.iterator.dereference.pass.cpp
@@ -0,0 +1,52 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <unordered_map>
+
+// Dereference non-dereferenceable iterator.
+
+// REQUIRES: has-unix-headers, libcpp-hardening-mode={{extensive|debug}}
+// UNSUPPORTED: c++03
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+#include <unordered_map>
+#include <string>
+
+#include "check_assertion.h"
+#include "min_allocator.h"
+
+int main(int, char**) {
+  {
+    typedef std::unordered_map<int, std::string> C;
+    C c;
+    c.insert(std::make_pair(1, "one"));
+    C::iterator i = c.end();
+    TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable unordered container iterator");
+    C::const_iterator i2 = c.cend();
+    TEST_LIBCPP_ASSERT_FAILURE(
+        *i2, "Attempted to dereference a non-dereferenceable unordered container const_iterator");
+  }
+
+  {
+    typedef std::unordered_map<int,
+                               std::string,
+                               std::hash<int>,
+                               std::equal_to<int>,
+                               min_allocator<std::pair<const int, std::string>>>
+        C;
+    C c;
+    c.insert(std::make_pair(1, "one"));
+    C::iterator i = c.end();
+    TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable unordered container iterator");
+    C::const_iterator i2 = c.cend();
+    TEST_LIBCPP_ASSERT_FAILURE(
+        *i2, "Attempted to dereference a non-dereferenceable unordered container const_iterator");
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/containers/unord/unord.map/assert.iterator.increment.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.map/assert.iterator.increment.pass.cpp
new file mode 100644
index 0000000000000..3f4d1c2d3bdbb
--- /dev/null
+++ b/libcxx/test/libcxx/containers/unord/unord.map/assert.iterator.increment.pass.cpp
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <unordered_map>
+
+// Increment iterator past end.
+
+// REQUIRES: has-unix-headers, libcpp-hardening-mode={{extensive|debug}}
+// UNSUPPORTED: c++03
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+#include <unordered_map>
+#include <cassert>
+#include <string>
+
+#include "check_assertion.h"
+#include "min_allocator.h"
+
+int main(int, char**) {
+  {
+    typedef std::unordered_map<int, std::string> C;
+    C c;
+    c.insert(std::make_pair(1, "one"));
+    C::iterator i = c.begin();
+    ++i;
+    assert(i == c.end());
+    TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container iterator");
+    C::const_iterator i2 = c.cbegin();
+    ++i2;
+    assert(i2 == c.cend());
+    TEST_LIBCPP_ASSERT_FAILURE(++i2, "Attempted to increment a non-incrementable unordered container const_iterator");
+  }
+
+  {
+    typedef std::unordered_map<int,
+                               std::string,
+                               std::hash<int>,
+                               std::equal_to<int>,
+                               min_allocator<std::pair<const int, std::string>>>
+        C;
+    C c;
+    c.insert(std::make_pair(1, "one"));
+    C::iterator i = c.begin();
+    ++i;
+    assert(i == c.end());
+    TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container iterator");
+    C::const_iterator i2 = c.cbegin();
+    ++i2;
+    assert(i2 == c.cend());
+    TEST_LIBCPP_ASSERT_FAILURE(++i2, "Attempted to increment a non-incrementable unordered container const_iterator");
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/containers/unord/unord.map/assert.local_iterator.dereference.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.map/assert.local_iterator.dereference.pass.cpp
new file mode 100644
index 0000000000000..8b47f54895560
--- /dev/null
+++ b/libcxx/test/libcxx/containers/unord/unord.map/assert.local_iterator.dereference.pass.cpp
@@ -0,0 +1,50 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <unordered_map>
+
+// Dereference non-dereferenceable local_iterator.
+
+// REQUIRES: has-unix-headers, libcpp-hardening-mode={{extensive|debug}}
+// UNSUPPORTED: c++03
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+#include <unordered_map>
+#include <string>
+
+#include "check_assertion.h"
+#include "min_allocator.h"
+
+int main(int, char**) {
+  {
+    typedef std::unordered_map<int, std::string> C;
+    C c(1);
+    C::local_iterator i = c.end(0);
+    TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable unordered container local_iterator");
+    C::const_local_iterator i2 = c.cend(0);
+    TEST_LIBCPP_ASSERT_FAILURE(
+        *i2, "Attempted to dereference a non-dereferenceable unordered container const_local_iterator");
+  }
+
+  {
+    typedef std::unordered_map<int,
+                               std::string,
+                               std::hash<int>,
+                               std::equal_to<int>,
+                               min_allocator<std::pair<const int, std::string>>>
+        C;
+    C c(1);
+    C::local_iterator i = c.end(0);
+    TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable unordered container local_iterator");
+    C::const_local_iterator i2 = c.cend(0);
+    TEST_LIBCPP_ASSERT_FAILURE(
+        *i2, "Attempted to dereference a non-dereferenceable unordered container const_local_iterator");
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/containers/unord/unord.map/assert.local_iterator.increment.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.map/assert.local_iterator.increment.pass.cpp
new file mode 100644
index 0000000000000..8f8305833e077
--- /dev/null
+++ b/libcxx/test/libcxx/containers/unord/unord.map/assert.local_iterator.increment.pass.cpp
@@ -0,0 +1,66 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <unordered_map>
+
+// Increment local_iterator past end.
+
+// REQUIRES: has-unix-headers, libcpp-hardening-mode={{extensive|debug}}
+// UNSUPPORTED: c++03
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+#include <unordered_map>
+#include <string>
+#include <cassert>
+
+#include "check_assertion.h"
+#include "min_allocator.h"
+
+int main(int, char**) {
+  {
+    typedef std::unordered_map<int, std::string> C;
+    C c;
+    c.insert(std::make_pair(42, std::string()));
+    C::size_type b      = c.bucket(42);
+    C::local_iterator i = c.begin(b);
+    assert(i != c.end(b));
+    ++i;
+    assert(i == c.end(b));
+    TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container local_iterator");
+    C::const_local_iterator i2 = c.cbegin(b);
+    assert(i2 != c.cend(b));
+    ++i2;
+    assert(i2 == c.cend(b));
+    TEST_LIBCPP_ASSERT_FAILURE(
+        ++i2, "Attempted to increment a non-incrementable unordered container const_local_iterator");
+  }
+
+  {
+    typedef std::unordered_map<int,
+                               std::string,
+                               std::hash<int>,
+                               std::equal_to<int>,
+                               min_allocator<std::pair<const int, std::string>>>
+        C;
+    C c({{42, std::string()}});
+    C::size_type b      = c.bucket(42);
+    C::local_iterator i = c.begin(b);
+    assert(i != c.end(b));
+    ++i;
+    assert(i == c.end(b));
+    TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container local_iterator");
+    C::const_local_iterator i2 = c.cbegin(b);
+    assert(i2 != c.cend(b));
+    ++i2;
+    assert(i2 == c.cend(b));
+    TEST_LIBCPP_ASSERT_FAILURE(
+        ++i2, "Attempted to increment a non-incrementable unordered container const_local_iterator");
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/containers/unord/unord.map/debug.iterator.dereference.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.map/debug.iterator.dereference.pass.cpp
deleted file mode 100644
index 5ea7f4d97fcc1..0000000000000
--- a/libcxx/test/libcxx/containers/unord/unord.map/debug.iterator.dereference.pass.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <unordered_map>
-
-// Dereference non-dereferenceable iterator.
-
-// REQUIRES: has-unix-headers
-// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
-
-#include <unordered_map>
-#include <string>
-
-#include "check_assertion.h"
-#include "min_allocator.h"
-
-int main(int, char**) {
-    {
-        typedef std::unordered_map<int, std::string> C;
-        C c;
-        c.insert(std::make_pair(1, "one"));
-        C::iterator i = c.end();
-        TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable unordered container iterator");
-    }
-
-    {
-        typedef std::unordered_map<int, std::string, std::hash<int>, std::equal_to<int>,
-                                   min_allocator<std::pair<const int, std::string>>> C;
-        C c;
-        c.insert(std::make_pair(1, "one"));
-        C::iterator i = c.end();
-        TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable unordered container iterator");
-    }
-
-    return 0;
-}
diff --git a/libcxx/test/libcxx/containers/unord/unord.map/debug.iterator.increment.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.map/debug.iterator.increment.pass.cpp
deleted file mode 100644
index 2ed09bc81aaa9..0000000000000
--- a/libcxx/test/libcxx/containers/unord/unord.map/debug.iterator.increment.pass.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <unordered_map>
-
-// Increment iterator past end.
-
-// REQUIRES: has-unix-headers
-// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
-
-#include <unordered_map>
-#include <cassert>
-#include <string>
-
-#include "check_assertion.h"
-#include "min_allocator.h"
-
-int main(int, char**) {
-    {
-        typedef std::unordered_map<int, std::string> C;
-        C c;
-        c.insert(std::make_pair(1, "one"));
-        C::iterator i = c.begin();
-        ++i;
-        assert(i == c.end());
-        TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container iterator");
-    }
-
-    {
-        typedef std::unordered_map<int, std::string, std::hash<int>, std::equal_to<int>,
-                                   min_allocator<std::pair<const int, std::string>>> C;
-        C c;
-        c.insert(std::make_pair(1, "one"));
-        C::iterator i = c.begin();
-        ++i;
-        assert(i == c.end());
-        TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container iterator");
-    }
-
-    return 0;
-}
diff --git a/libcxx/test/libcxx/containers/unord/unord.map/debug.local_iterator.dereference.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.map/debug.local_iterator.dereference.pass.cpp
deleted file mode 100644
index 2e4e62dbb41f4..0000000000000
--- a/libcxx/test/libcxx/containers/unord/unord.map/debug.local_iterator.dereference.pass.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <unordered_map>
-
-// Dereference non-dereferenceable iterator.
-
-// REQUIRES: has-unix-headers
-// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
-
-#include <unordered_map>
-#include <string>
-
-#include "check_assertion.h"
-#include "min_allocator.h"
-
-int main(int, char**) {
-    {
-        typedef std::unordered_map<int, std::string> C;
-        C c(1);
-        C::local_iterator i = c.end(0);
-        TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable unordered container local_iterator");
-    }
-
-    {
-        typedef std::unordered_map<int, std::string, std::hash<int>, std::equal_to<int>,
-                                   min_allocator<std::pair<const int, std::string>>> C;
-        C c(1);
-        C::local_iterator i = c.end(0);
-        TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable unordered container local_iterator");
-    }
-
-    return 0;
-}
diff --git a/libcxx/test/libcxx/containers/unord/unord.map/debug.local_iterator.increment.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.map/debug.local_iterator.increment.pass.cpp
deleted file mode 100644
index 28599263447a5..0000000000000
--- a/libcxx/test/libcxx/containers/unord/unord.map/debug.local_iterator.increment.pass.cpp
+++ /dev/null
@@ -1,49 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <unordered_map>
-
-// Increment local_iterator past end.
-
-// REQUIRES: has-unix-headers
-// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
-
-#include <unordered_map>
-#include <string>
-#include <cassert>
-
-#include "check_assertion.h"
-#include "min_allocator.h"
-
-int main(int, char**) {
-    {
-        typedef std::unordered_map<int, std::string> C;
-        C c;
-        c.insert(std::make_pair(42, std::string()));
-        C::size_type b = c.bucket(42);
-        C::local_iterator i = c.begin(b);
-        assert(i != c.end(b));
-        ++i;
-        assert(i == c.end(b));
-        TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container local_iterator");
-    }
-
-    {
-        typedef std::unordered_map<int, std::string, std::hash<int>, std::equal_to<int>,
-                                   min_allocator<std::pair<const int, std::string>>> C;
-        C c({{42, std::string()}});
-        C::size_type b = c.bucket(42);
-        C::local_iterator i = c.begin(b);
-        assert(i != c.end(b));
-        ++i;
-        assert(i == c.end(b));
-        TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container local_iterator");
-    }
-
-    return 0;
-}
diff --git a/libcxx/test/libcxx/containers/unord/unord.multimap/assert.iterator.dereference.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.multimap/assert.iterator.dereference.pass.cpp
new file mode 100644
index 0000000000000..d295a82a8a1f5
--- /dev/null
+++ b/libcxx/test/libcxx/containers/unord/unord.multimap/assert.iterator.dereference.pass.cpp
@@ -0,0 +1,52 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <unordered_map>
+
+// Dereference non-dereferenceable iterator.
+
+// REQUIRES: has-unix-headers, libcpp-hardening-mode={{extensive|debug}}
+// UNSUPPORTED: c++03
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+#include <string>
+#include <unordered_map>
+
+#include "check_assertion.h"
+#include "min_allocator.h"
+
+int main(int, char**) {
+  {
+    typedef std::unordered_multimap<int, std::string> C;
+    C c;
+    c.insert(std::make_pair(1, "one"));
+    C::iterator i = c.end();
+    TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable unordered container iterator");
+    C::const_iterator i2 = c.cend();
+    TEST_LIBCPP_ASSERT_FAILURE(
+        *i2, "Attempted to dereference a non-dereferenceable unordered container const_iterator");
+  }
+
+  {
+    typedef std::unordered_multimap<int,
+                                    std::string,
+                                    std::hash<int>,
+                                    std::equal_to<int>,
+                                    min_allocator<std::pair<const int, std::string>>>
+        C;
+    C c;
+    c.insert(std::make_pair(1, "one"));
+    C::iterator i = c.end();
+    TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable unordered container iterator");
+    C::const_iterator i2 = c.cend();
+    TEST_LIBCPP_ASSERT_FAILURE(
+        *i2, "Attempted to dereference a non-dereferenceable unordered container const_iterator");
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/containers/unord/unord.multimap/assert.iterator.increment.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.multimap/assert.iterator.increment.pass.cpp
new file mode 100644
index 0000000000000..4247edc8def97
--- /dev/null
+++ b/libcxx/test/libcxx/containers/unord/unord.multimap/assert.iterator.increment.pass.cpp
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <unordered_map>
+
+// Increment iterator past end.
+
+// REQUIRES: has-unix-headers, libcpp-hardening-mode={{extensive|debug}}
+// UNSUPPORTED: c++03
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+#include <unordered_map>
+#include <cassert>
+#include <string>
+
+#include "check_assertion.h"
+#include "min_allocator.h"
+
+int main(int, char**) {
+  {
+    typedef std::unordered_multimap<int, std::string> C;
+    C c;
+    c.insert(std::make_pair(1, "one"));
+    C::iterator i = c.begin();
+    ++i;
+    assert(i == c.end());
+    TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container iterator");
+    C::const_iterator i2 = c.cbegin();
+    ++i2;
+    assert(i2 == c.cend());
+    TEST_LIBCPP_ASSERT_FAILURE(++i2, "Attempted to increment a non-incrementable unordered container const_iterator");
+  }
+
+  {
+    typedef std::unordered_multimap<int,
+                                    std::string,
+                                    std::hash<int>,
+                                    std::equal_to<int>,
+                                    min_allocator<std::pair<const int, std::string>>>
+        C;
+    C c;
+    c.insert(std::make_pair(1, "one"));
+    C::iterator i = c.begin();
+    ++i;
+    assert(i == c.end());
+    TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container iterator");
+    C::const_iterator i2 = c.cbegin();
+    ++i2;
+    assert(i2 == c.cend());
+    TEST_LIBCPP_ASSERT_FAILURE(++i2, "Attempted to increment a non-incrementable unordered container const_iterator");
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/containers/unord/unord.multimap/assert.local_iterator.dereference.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.multimap/assert.local_iterator.dereference.pass.cpp
new file mode 100644
index 0000000000000..7ea87964e05f0
--- /dev/null
+++ b/libcxx/test/libcxx/containers/unord/unord.multimap/assert.local_iterator.dereference.pass.cpp
@@ -0,0 +1,50 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <unordered_map>
+
+// Dereference non-dereferenceable local_iterator.
+
+// REQUIRES: has-unix-headers, libcpp-hardening-mode={{extensive|debug}}
+// UNSUPPORTED: c++03
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+#include <unordered_map>
+#include <string>
+
+#include "check_assertion.h"
+#include "min_allocator.h"
+
+int main(int, char**) {
+  {
+    typedef std::unordered_multimap<int, std::string> C;
+    C c(1);
+    C::local_iterator i = c.end(0);
+    TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable unordered container local_iterator");
+    C::const_local_iterator i2 = c.cend(0);
+    TEST_LIBCPP_ASSERT_FAILURE(
+        *i2, "Attempted to dereference a non-dereferenceable unordered container const_local_iterator");
+  }
+
+  {
+    typedef std::unordered_multimap<int,
+                                    std::string,
+                                    std::hash<int>,
+                                    std::equal_to<int>,
+                                    min_allocator<std::pair<const int, std::string>>>
+        C;
+    C c(1);
+    C::local_iterator i = c.end(0);
+    TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable unordered container local_iterator");
+    C::const_local_iterator i2 = c.cend(0);
+    TEST_LIBCPP_ASSERT_FAILURE(
+        *i2, "Attempted to dereference a non-dereferenceable unordered container const_local_iterator");
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/containers/unord/unord.multimap/assert.local_iterator.increment.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.multimap/assert.local_iterator.increment.pass.cpp
new file mode 100644
index 0000000000000..ffa3fec0ca1f1
--- /dev/null
+++ b/libcxx/test/libcxx/containers/unord/unord.multimap/assert.local_iterator.increment.pass.cpp
@@ -0,0 +1,67 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <unordered_map>
+
+// Increment local_iterator past end.
+
+// REQUIRES: has-unix-headers, libcpp-hardening-mode={{extensive|debug}}
+// UNSUPPORTED: c++03
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+#include <unordered_map>
+#include <cassert>
+#include <string>
+
+#include "check_assertion.h"
+#include "min_allocator.h"
+
+int main(int, char**) {
+  {
+    typedef std::unordered_multimap<int, std::string> C;
+    C c;
+    c.insert(std::make_pair(42, std::string()));
+    C::size_type b      = c.bucket(42);
+    C::local_iterator i = c.begin(b);
+    assert(i != c.end(b));
+    ++i;
+    assert(i == c.end(b));
+    TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container local_iterator");
+    C::const_local_iterator i2 = c.cbegin(b);
+    assert(i2 != c.cend(b));
+    ++i2;
+    assert(i2 == c.cend(b));
+    TEST_LIBCPP_ASSERT_FAILURE(
+        ++i2, "Attempted to increment a non-incrementable unordered container const_local_iterator");
+  }
+
+  {
+    typedef std::unordered_multimap<int,
+                                    std::string,
+                                    std::hash<int>,
+                                    std::equal_to<int>,
+                                    min_allocator<std::pair<const int, std::string>>>
+        C;
+    C c({{1, std::string()}});
+    c.insert(std::make_pair(42, std::string()));
+    C::size_type b      = c.bucket(42);
+    C::local_iterator i = c.begin(b);
+    assert(i != c.end(b));
+    ++i;
+    assert(i == c.end(b));
+    TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container local_iterator");
+    C::const_local_iterator i2 = c.cbegin(b);
+    assert(i2 != c.cend(b));
+    ++i2;
+    assert(i2 == c.cend(b));
+    TEST_LIBCPP_ASSERT_FAILURE(
+        ++i2, "Attempted to increment a non-incrementable unordered container const_local_iterator");
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/containers/unord/unord.multimap/debug.iterator.dereference.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.multimap/debug.iterator.dereference.pass.cpp
deleted file mode 100644
index 3dad48b3925d1..0000000000000
--- a/libcxx/test/libcxx/containers/unord/unord.multimap/debug.iterator.dereference.pass.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <unordered_map>
-
-// Dereference non-dereferenceable iterator.
-
-// REQUIRES: has-unix-headers
-// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
-
-#include <string>
-#include <unordered_map>
-
-#include "check_assertion.h"
-#include "min_allocator.h"
-
-int main(int, char**) {
-    {
-        typedef std::unordered_multimap<int, std::string> C;
-        C c;
-        c.insert(std::make_pair(1, "one"));
-        C::iterator i = c.end();
-        TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable unordered container iterator");
-    }
-
-    {
-        typedef std::unordered_multimap<int, std::string, std::hash<int>, std::equal_to<int>,
-                                        min_allocator<std::pair<const int, std::string>>> C;
-        C c;
-        c.insert(std::make_pair(1, "one"));
-        C::iterator i = c.end();
-        TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable unordered container iterator");
-    }
-
-    return 0;
-}
diff --git a/libcxx/test/libcxx/containers/unord/unord.multimap/debug.iterator.increment.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.multimap/debug.iterator.increment.pass.cpp
deleted file mode 100644
index b02bac6022f7c..0000000000000
--- a/libcxx/test/libcxx/containers/unord/unord.multimap/debug.iterator.increment.pass.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <unordered_map>
-
-// Increment iterator past end.
-
-// REQUIRES: has-unix-headers
-// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
-
-#include <unordered_map>
-#include <cassert>
-#include <string>
-
-#include "check_assertion.h"
-#include "min_allocator.h"
-
-int main(int, char**) {
-    {
-        typedef std::unordered_multimap<int, std::string> C;
-        C c;
-        c.insert(std::make_pair(1, "one"));
-        C::iterator i = c.begin();
-        ++i;
-        assert(i == c.end());
-        TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container iterator");
-    }
-
-    {
-        typedef std::unordered_multimap<int, std::string, std::hash<int>, std::equal_to<int>,
-                            min_allocator<std::pair<const int, std::string>>> C;
-        C c;
-        c.insert(std::make_pair(1, "one"));
-        C::iterator i = c.begin();
-        ++i;
-        assert(i == c.end());
-        TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container iterator");
-    }
-
-    return 0;
-}
diff --git a/libcxx/test/libcxx/containers/unord/unord.multimap/debug.local_iterator.dereference.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.multimap/debug.local_iterator.dereference.pass.cpp
deleted file mode 100644
index 9719ba5889759..0000000000000
--- a/libcxx/test/libcxx/containers/unord/unord.multimap/debug.local_iterator.dereference.pass.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <unordered_map>
-
-// Dereference non-dereferenceable iterator.
-
-// REQUIRES: has-unix-headers
-// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
-
-#include <unordered_map>
-#include <string>
-
-#include "check_assertion.h"
-#include "min_allocator.h"
-
-int main(int, char**) {
-    {
-        typedef std::unordered_multimap<int, std::string> C;
-        C c(1);
-        C::local_iterator i = c.end(0);
-        TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable unordered container local_iterator");
-    }
-
-    {
-        typedef std::unordered_multimap<int, std::string, std::hash<int>, std::equal_to<int>,
-                                        min_allocator<std::pair<const int, std::string>>> C;
-        C c(1);
-        C::local_iterator i = c.end(0);
-        TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable unordered container local_iterator");
-    }
-
-    return 0;
-}
diff --git a/libcxx/test/libcxx/containers/unord/unord.multimap/debug.local_iterator.increment.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.multimap/debug.local_iterator.increment.pass.cpp
deleted file mode 100644
index 2f74a191e8acd..0000000000000
--- a/libcxx/test/libcxx/containers/unord/unord.multimap/debug.local_iterator.increment.pass.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <unordered_map>
-
-// Increment local_iterator past end.
-
-// REQUIRES: has-unix-headers
-// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
-
-#include <unordered_map>
-#include <cassert>
-#include <string>
-
-#include "check_assertion.h"
-#include "min_allocator.h"
-
-int main(int, char**) {
-    {
-        typedef std::unordered_multimap<int, std::string> C;
-        C c;
-        c.insert(std::make_pair(42, std::string()));
-        C::size_type b = c.bucket(42);
-        C::local_iterator i = c.begin(b);
-        assert(i != c.end(b));
-        ++i;
-        assert(i == c.end(b));
-        TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container local_iterator");
-    }
-
-    {
-        typedef std::unordered_multimap<int, std::string, std::hash<int>, std::equal_to<int>,
-                                        min_allocator<std::pair<const int, std::string>>> C;
-        C c({{1, std::string()}});
-        c.insert(std::make_pair(42, std::string()));
-        C::size_type b = c.bucket(42);
-        C::local_iterator i = c.begin(b);
-        assert(i != c.end(b));
-        ++i;
-        assert(i == c.end(b));
-        TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container local_iterator");
-    }
-
-    return 0;
-}
diff --git a/libcxx/test/libcxx/containers/unord/unord.multiset/assert.iterator.dereference.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.multiset/assert.iterator.dereference.pass.cpp
new file mode 100644
index 0000000000000..31edd6099c965
--- /dev/null
+++ b/libcxx/test/libcxx/containers/unord/unord.multiset/assert.iterator.dereference.pass.cpp
@@ -0,0 +1,46 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <unordered_set>
+
+// Dereference non-dereferenceable iterator.
+
+// REQUIRES: has-unix-headers, libcpp-hardening-mode={{extensive|debug}}
+// UNSUPPORTED: c++03
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+#include <unordered_set>
+
+#include "check_assertion.h"
+#include "min_allocator.h"
+
+int main(int, char**) {
+  {
+    typedef int T;
+    typedef std::unordered_multiset<T> C;
+    C c(1);
+    C::iterator i = c.end();
+    TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable unordered container const_iterator");
+    C::const_iterator i2 = c.cend();
+    TEST_LIBCPP_ASSERT_FAILURE(
+        *i2, "Attempted to dereference a non-dereferenceable unordered container const_iterator");
+  }
+
+  {
+    typedef int T;
+    typedef std::unordered_multiset<T, std::hash<T>, std::equal_to<T>, min_allocator<T>> C;
+    C c(1);
+    C::iterator i = c.end();
+    TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable unordered container const_iterator");
+    C::const_iterator i2 = c.cend();
+    TEST_LIBCPP_ASSERT_FAILURE(
+        *i2, "Attempted to dereference a non-dereferenceable unordered container const_iterator");
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/containers/unord/unord.multiset/assert.iterator.increment.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.multiset/assert.iterator.increment.pass.cpp
new file mode 100644
index 0000000000000..0e0e4aab303cd
--- /dev/null
+++ b/libcxx/test/libcxx/containers/unord/unord.multiset/assert.iterator.increment.pass.cpp
@@ -0,0 +1,58 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <unordered_set>
+
+// Increment iterator past end.
+
+// REQUIRES: has-unix-headers, libcpp-hardening-mode={{extensive|debug}}
+// UNSUPPORTED: c++03
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+#include <unordered_set>
+#include <cassert>
+
+#include "check_assertion.h"
+#include "min_allocator.h"
+
+int main(int, char**) {
+  {
+    typedef int T;
+    typedef std::unordered_multiset<T> C;
+    C c;
+    c.insert(42);
+    C::iterator i = c.begin();
+    assert(i != c.end());
+    ++i;
+    assert(i == c.end());
+    TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container const_iterator");
+    C::const_iterator i2 = c.cbegin();
+    assert(i2 != c.cend());
+    ++i2;
+    assert(i2 == c.cend());
+    TEST_LIBCPP_ASSERT_FAILURE(++i2, "Attempted to increment a non-incrementable unordered container const_iterator");
+  }
+
+  {
+    typedef int T;
+    typedef std::unordered_multiset<T, std::hash<T>, std::equal_to<T>, min_allocator<T>> C;
+    C c({42});
+    C::iterator i = c.begin();
+    assert(i != c.end());
+    ++i;
+    assert(i == c.end());
+    TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container const_iterator");
+    C::const_iterator i2 = c.cbegin();
+    assert(i2 != c.cend());
+    ++i2;
+    assert(i2 == c.cend());
+    TEST_LIBCPP_ASSERT_FAILURE(++i2, "Attempted to increment a non-incrementable unordered container const_iterator");
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/containers/unord/unord.multiset/assert.local_iterator.dereference.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.multiset/assert.local_iterator.dereference.pass.cpp
new file mode 100644
index 0000000000000..fe833c40bc351
--- /dev/null
+++ b/libcxx/test/libcxx/containers/unord/unord.multiset/assert.local_iterator.dereference.pass.cpp
@@ -0,0 +1,48 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <unordered_set>
+
+// Dereference non-dereferenceable iterator.
+
+// REQUIRES: has-unix-headers, libcpp-hardening-mode={{extensive|debug}}
+// UNSUPPORTED: c++03
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+#include <unordered_set>
+
+#include "check_assertion.h"
+#include "min_allocator.h"
+
+int main(int, char**) {
+  {
+    typedef int T;
+    typedef std::unordered_multiset<T> C;
+    C c(1);
+    C::local_iterator i = c.end(0);
+    TEST_LIBCPP_ASSERT_FAILURE(
+        *i, "Attempted to dereference a non-dereferenceable unordered container const_local_iterator");
+    C::const_local_iterator i2 = c.cend(0);
+    TEST_LIBCPP_ASSERT_FAILURE(
+        *i2, "Attempted to dereference a non-dereferenceable unordered container const_local_iterator");
+  }
+
+  {
+    typedef int T;
+    typedef std::unordered_multiset<T, std::hash<T>, std::equal_to<T>, min_allocator<T>> C;
+    C c(1);
+    C::local_iterator i = c.end(0);
+    TEST_LIBCPP_ASSERT_FAILURE(
+        *i, "Attempted to dereference a non-dereferenceable unordered container const_local_iterator");
+    C::const_local_iterator i2 = c.cend(0);
+    TEST_LIBCPP_ASSERT_FAILURE(
+        *i2, "Attempted to dereference a non-dereferenceable unordered container const_local_iterator");
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/containers/unord/unord.multiset/assert.local_iterator.increment.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.multiset/assert.local_iterator.increment.pass.cpp
new file mode 100644
index 0000000000000..142c07f83c066
--- /dev/null
+++ b/libcxx/test/libcxx/containers/unord/unord.multiset/assert.local_iterator.increment.pass.cpp
@@ -0,0 +1,64 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <unordered_set>
+
+// Increment local_iterator past end.
+
+// REQUIRES: has-unix-headers, libcpp-hardening-mode={{extensive|debug}}
+// UNSUPPORTED: c++03
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+#include <unordered_set>
+#include <cassert>
+
+#include "check_assertion.h"
+#include "min_allocator.h"
+
+int main(int, char**) {
+  {
+    typedef int T;
+    typedef std::unordered_multiset<T> C;
+    C c;
+    c.insert(42);
+    C::size_type b      = c.bucket(42);
+    C::local_iterator i = c.begin(b);
+    assert(i != c.end(b));
+    ++i;
+    assert(i == c.end(b));
+    TEST_LIBCPP_ASSERT_FAILURE(
+        ++i, "Attempted to increment a non-incrementable unordered container const_local_iterator");
+    C::const_local_iterator i2 = c.cbegin(b);
+    assert(i2 != c.cend(b));
+    ++i2;
+    assert(i2 == c.cend(b));
+    TEST_LIBCPP_ASSERT_FAILURE(
+        ++i2, "Attempted to increment a non-incrementable unordered container const_local_iterator");
+  }
+
+  {
+    typedef int T;
+    typedef std::unordered_multiset<T, std::hash<T>, std::equal_to<T>, min_allocator<T>> C;
+    C c({42});
+    C::size_type b      = c.bucket(42);
+    C::local_iterator i = c.begin(b);
+    assert(i != c.end(b));
+    ++i;
+    assert(i == c.end(b));
+    TEST_LIBCPP_ASSERT_FAILURE(
+        ++i, "Attempted to increment a non-incrementable unordered container const_local_iterator");
+    C::const_local_iterator i2 = c.cbegin(b);
+    assert(i2 != c.cend(b));
+    ++i2;
+    assert(i2 == c.cend(b));
+    TEST_LIBCPP_ASSERT_FAILURE(
+        ++i2, "Attempted to increment a non-incrementable unordered container const_local_iterator");
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/containers/unord/unord.multiset/debug.iterator.dereference.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.multiset/debug.iterator.dereference.pass.cpp
deleted file mode 100644
index 51cb9a6bff643..0000000000000
--- a/libcxx/test/libcxx/containers/unord/unord.multiset/debug.iterator.dereference.pass.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <unordered_set>
-
-// Dereference non-dereferenceable iterator.
-
-// REQUIRES: has-unix-headers
-// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
-
-#include <unordered_set>
-
-#include "check_assertion.h"
-#include "min_allocator.h"
-
-int main(int, char**) {
-    {
-        typedef int T;
-        typedef std::unordered_multiset<T> C;
-        C c(1);
-        C::iterator i = c.end();
-        TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable unordered container const_iterator");
-    }
-
-    {
-        typedef int T;
-        typedef std::unordered_multiset<T, std::hash<T>, std::equal_to<T>, min_allocator<T>> C;
-        C c(1);
-        C::iterator i = c.end();
-        TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable unordered container const_iterator");
-    }
-
-    return 0;
-}
diff --git a/libcxx/test/libcxx/containers/unord/unord.multiset/debug.iterator.increment.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.multiset/debug.iterator.increment.pass.cpp
deleted file mode 100644
index 17b8c77aadd1d..0000000000000
--- a/libcxx/test/libcxx/containers/unord/unord.multiset/debug.iterator.increment.pass.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <unordered_set>
-
-// Increment iterator past end.
-
-// REQUIRES: has-unix-headers
-// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
-
-#include <unordered_set>
-#include <cassert>
-
-#include "check_assertion.h"
-#include "min_allocator.h"
-
-int main(int, char**) {
-    {
-        typedef int T;
-        typedef std::unordered_multiset<T> C;
-        C c;
-        c.insert(42);
-        C::iterator i = c.begin();
-        assert(i != c.end());
-        ++i;
-        assert(i == c.end());
-        TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container const_iterator");
-    }
-
-    {
-        typedef int T;
-        typedef std::unordered_multiset<T, std::hash<T>, std::equal_to<T>, min_allocator<T>> C;
-        C c({42});
-        C::iterator i = c.begin();
-        assert(i != c.end());
-        ++i;
-        assert(i == c.end());
-        TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container const_iterator");
-    }
-
-    return 0;
-}
diff --git a/libcxx/test/libcxx/containers/unord/unord.multiset/debug.local_iterator.dereference.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.multiset/debug.local_iterator.dereference.pass.cpp
deleted file mode 100644
index 24102a47802fd..0000000000000
--- a/libcxx/test/libcxx/containers/unord/unord.multiset/debug.local_iterator.dereference.pass.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <unordered_set>
-
-// Dereference non-dereferenceable iterator.
-
-// REQUIRES: has-unix-headers
-// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
-
-#include <unordered_set>
-
-#include "check_assertion.h"
-#include "min_allocator.h"
-
-int main(int, char**) {
-    {
-        typedef int T;
-        typedef std::unordered_multiset<T> C;
-        C c(1);
-        C::local_iterator i = c.end(0);
-        TEST_LIBCPP_ASSERT_FAILURE(
-            *i, "Attempted to dereference a non-dereferenceable unordered container const_local_iterator");
-    }
-
-    {
-        typedef int T;
-        typedef std::unordered_multiset<T, std::hash<T>, std::equal_to<T>, min_allocator<T>> C;
-        C c(1);
-        C::local_iterator i = c.end(0);
-        TEST_LIBCPP_ASSERT_FAILURE(
-            *i, "Attempted to dereference a non-dereferenceable unordered container const_local_iterator");
-    }
-
-    return 0;
-}
diff --git a/libcxx/test/libcxx/containers/unord/unord.multiset/debug.local_iterator.increment.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.multiset/debug.local_iterator.increment.pass.cpp
deleted file mode 100644
index 3f70ba2971581..0000000000000
--- a/libcxx/test/libcxx/containers/unord/unord.multiset/debug.local_iterator.increment.pass.cpp
+++ /dev/null
@@ -1,51 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <unordered_set>
-
-// Increment local_iterator past end.
-
-// REQUIRES: has-unix-headers
-// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
-
-#include <unordered_set>
-#include <cassert>
-
-#include "check_assertion.h"
-#include "min_allocator.h"
-
-int main(int, char**) {
-    {
-        typedef int T;
-        typedef std::unordered_multiset<T> C;
-        C c;
-        c.insert(42);
-        C::size_type b = c.bucket(42);
-        C::local_iterator i = c.begin(b);
-        assert(i != c.end(b));
-        ++i;
-        assert(i == c.end(b));
-        TEST_LIBCPP_ASSERT_FAILURE(++i,
-                                "Attempted to increment a non-incrementable unordered container const_local_iterator");
-    }
-
-    {
-        typedef int T;
-        typedef std::unordered_multiset<T, std::hash<T>, std::equal_to<T>, min_allocator<T>> C;
-        C c({42});
-        C::size_type b = c.bucket(42);
-        C::local_iterator i = c.begin(b);
-        assert(i != c.end(b));
-        ++i;
-        assert(i == c.end(b));
-        TEST_LIBCPP_ASSERT_FAILURE(++i,
-                                "Attempted to increment a non-incrementable unordered container const_local_iterator");
-    }
-
-    return 0;
-}
diff --git a/libcxx/test/libcxx/containers/unord/unord.set/assert.iterator.dereference.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.set/assert.iterator.dereference.pass.cpp
new file mode 100644
index 0000000000000..8464601f61046
--- /dev/null
+++ b/libcxx/test/libcxx/containers/unord/unord.set/assert.iterator.dereference.pass.cpp
@@ -0,0 +1,46 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <unordered_set>
+
+// Dereference non-dereferenceable iterator.
+
+// REQUIRES: has-unix-headers, libcpp-hardening-mode={{extensive|debug}}
+// UNSUPPORTED: c++03
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+#include <unordered_set>
+
+#include "check_assertion.h"
+#include "min_allocator.h"
+
+int main(int, char**) {
+  {
+    typedef int T;
+    typedef std::unordered_set<T> C;
+    C c(1);
+    C::iterator i = c.end();
+    TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable unordered container const_iterator");
+    C::const_iterator i2 = c.cend();
+    TEST_LIBCPP_ASSERT_FAILURE(
+        *i2, "Attempted to dereference a non-dereferenceable unordered container const_iterator");
+  }
+
+  {
+    typedef int T;
+    typedef std::unordered_set<T, std::hash<T>, std::equal_to<T>, min_allocator<T>> C;
+    C c(1);
+    C::iterator i = c.end();
+    TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable unordered container const_iterator");
+    C::const_iterator i2 = c.cend();
+    TEST_LIBCPP_ASSERT_FAILURE(
+        *i2, "Attempted to dereference a non-dereferenceable unordered container const_iterator");
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/containers/unord/unord.set/assert.iterator.increment.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.set/assert.iterator.increment.pass.cpp
new file mode 100644
index 0000000000000..29446880900bc
--- /dev/null
+++ b/libcxx/test/libcxx/containers/unord/unord.set/assert.iterator.increment.pass.cpp
@@ -0,0 +1,58 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <unordered_set>
+
+// Increment iterator past end.
+
+// REQUIRES: has-unix-headers, libcpp-hardening-mode={{extensive|debug}}
+// UNSUPPORTED: c++03
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+#include <unordered_set>
+#include <cassert>
+
+#include "check_assertion.h"
+#include "min_allocator.h"
+
+int main(int, char**) {
+  {
+    typedef int T;
+    typedef std::unordered_set<T> C;
+    C c;
+    c.insert(42);
+    C::iterator i = c.begin();
+    assert(i != c.end());
+    ++i;
+    assert(i == c.end());
+    TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container const_iterator");
+    C::const_iterator i2 = c.cbegin();
+    assert(i2 != c.cend());
+    ++i2;
+    assert(i2 == c.cend());
+    TEST_LIBCPP_ASSERT_FAILURE(++i2, "Attempted to increment a non-incrementable unordered container const_iterator");
+  }
+
+  {
+    typedef int T;
+    typedef std::unordered_set<T, std::hash<T>, std::equal_to<T>, min_allocator<T>> C;
+    C c({42});
+    C::iterator i = c.begin();
+    assert(i != c.end());
+    ++i;
+    assert(i == c.end());
+    TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container const_iterator");
+    C::const_iterator i2 = c.cbegin();
+    assert(i2 != c.cend());
+    ++i2;
+    assert(i2 == c.cend());
+    TEST_LIBCPP_ASSERT_FAILURE(++i2, "Attempted to increment a non-incrementable unordered container const_iterator");
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/containers/unord/unord.set/assert.local_iterator.dereference.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.set/assert.local_iterator.dereference.pass.cpp
new file mode 100644
index 0000000000000..7163e3735cee0
--- /dev/null
+++ b/libcxx/test/libcxx/containers/unord/unord.set/assert.local_iterator.dereference.pass.cpp
@@ -0,0 +1,48 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <unordered_set>
+
+// Dereference non-dereferenceable iterator.
+
+// REQUIRES: has-unix-headers, libcpp-hardening-mode={{extensive|debug}}
+// UNSUPPORTED: c++03
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+#include <unordered_set>
+
+#include "check_assertion.h"
+#include "min_allocator.h"
+
+int main(int, char**) {
+  {
+    typedef int T;
+    typedef std::unordered_set<T> C;
+    C c(1);
+    C::local_iterator i = c.end(0);
+    TEST_LIBCPP_ASSERT_FAILURE(
+        *i, "Attempted to dereference a non-dereferenceable unordered container const_local_iterator");
+    C::const_local_iterator i2 = c.cend(0);
+    TEST_LIBCPP_ASSERT_FAILURE(
+        *i2, "Attempted to dereference a non-dereferenceable unordered container const_local_iterator");
+  }
+
+  {
+    typedef int T;
+    typedef std::unordered_set<T, std::hash<T>, std::equal_to<T>, min_allocator<T>> C;
+    C c(1);
+    C::local_iterator i = c.end(0);
+    TEST_LIBCPP_ASSERT_FAILURE(
+        *i, "Attempted to dereference a non-dereferenceable unordered container const_local_iterator");
+    C::const_local_iterator i2 = c.cend(0);
+    TEST_LIBCPP_ASSERT_FAILURE(
+        *i2, "Attempted to dereference a non-dereferenceable unordered container const_local_iterator");
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/containers/unord/unord.set/assert.local_iterator.increment.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.set/assert.local_iterator.increment.pass.cpp
new file mode 100644
index 0000000000000..c9fe5afd09702
--- /dev/null
+++ b/libcxx/test/libcxx/containers/unord/unord.set/assert.local_iterator.increment.pass.cpp
@@ -0,0 +1,64 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <unordered_set>
+
+// Increment local_iterator past end.
+
+// REQUIRES: has-unix-headers, libcpp-hardening-mode={{extensive|debug}}
+// UNSUPPORTED: c++03
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+#include <unordered_set>
+#include <cassert>
+
+#include "check_assertion.h"
+#include "min_allocator.h"
+
+int main(int, char**) {
+  {
+    typedef int T;
+    typedef std::unordered_set<T> C;
+    C c;
+    c.insert(42);
+    C::size_type b      = c.bucket(42);
+    C::local_iterator i = c.begin(b);
+    assert(i != c.end(b));
+    ++i;
+    assert(i == c.end(b));
+    TEST_LIBCPP_ASSERT_FAILURE(
+        ++i, "Attempted to increment a non-incrementable unordered container const_local_iterator");
+    C::const_local_iterator i2 = c.cbegin(b);
+    assert(i2 != c.cend(b));
+    ++i2;
+    assert(i2 == c.cend(b));
+    TEST_LIBCPP_ASSERT_FAILURE(
+        ++i2, "Attempted to increment a non-incrementable unordered container const_local_iterator");
+  }
+
+  {
+    typedef int T;
+    typedef std::unordered_set<T, std::hash<T>, std::equal_to<T>, min_allocator<T>> C;
+    C c({42});
+    C::size_type b      = c.bucket(42);
+    C::local_iterator i = c.begin(b);
+    assert(i != c.end(b));
+    ++i;
+    assert(i == c.end(b));
+    TEST_LIBCPP_ASSERT_FAILURE(
+        ++i, "Attempted to increment a non-incrementable unordered container const_local_iterator");
+    C::const_local_iterator i2 = c.cbegin(b);
+    assert(i2 != c.cend(b));
+    ++i2;
+    assert(i2 == c.cend(b));
+    TEST_LIBCPP_ASSERT_FAILURE(
+        ++i2, "Attempted to increment a non-incrementable unordered container const_local_iterator");
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/containers/unord/unord.set/debug.iterator.dereference.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.set/debug.iterator.dereference.pass.cpp
deleted file mode 100644
index 49663b4f824ae..0000000000000
--- a/libcxx/test/libcxx/containers/unord/unord.set/debug.iterator.dereference.pass.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <unordered_set>
-
-// Dereference non-dereferenceable iterator.
-
-// REQUIRES: has-unix-headers
-// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
-
-#include <unordered_set>
-
-#include "check_assertion.h"
-#include "min_allocator.h"
-
-int main(int, char**) {
-    {
-        typedef int T;
-        typedef std::unordered_set<T> C;
-        C c(1);
-        C::iterator i = c.end();
-        TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable unordered container const_iterator");
-    }
-
-    {
-        typedef int T;
-        typedef std::unordered_set<T, std::hash<T>, std::equal_to<T>, min_allocator<T>> C;
-        C c(1);
-        C::iterator i = c.end();
-        TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable unordered container const_iterator");
-    }
-
-    return 0;
-}
diff --git a/libcxx/test/libcxx/containers/unord/unord.set/debug.iterator.increment.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.set/debug.iterator.increment.pass.cpp
deleted file mode 100644
index da3fbdc5a6e8d..0000000000000
--- a/libcxx/test/libcxx/containers/unord/unord.set/debug.iterator.increment.pass.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <unordered_set>
-
-// Increment iterator past end.
-
-// REQUIRES: has-unix-headers
-// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
-
-#include <unordered_set>
-#include <cassert>
-
-#include "check_assertion.h"
-#include "min_allocator.h"
-
-int main(int, char**) {
-    {
-        typedef int T;
-        typedef std::unordered_set<T> C;
-        C c;
-        c.insert(42);
-        C::iterator i = c.begin();
-        assert(i != c.end());
-        ++i;
-        assert(i == c.end());
-        TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container const_iterator");
-    }
-
-    {
-        typedef int T;
-        typedef std::unordered_set<T, std::hash<T>, std::equal_to<T>, min_allocator<T>> C;
-        C c({42});
-        C::iterator i = c.begin();
-        assert(i != c.end());
-        ++i;
-        assert(i == c.end());
-        TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container const_iterator");
-    }
-
-    return 0;
-}
diff --git a/libcxx/test/libcxx/containers/unord/unord.set/debug.local_iterator.dereference.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.set/debug.local_iterator.dereference.pass.cpp
deleted file mode 100644
index 912edc2e4bf47..0000000000000
--- a/libcxx/test/libcxx/containers/unord/unord.set/debug.local_iterator.dereference.pass.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <unordered_set>
-
-// Dereference non-dereferenceable iterator.
-
-// REQUIRES: has-unix-headers
-// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
-
-#include <unordered_set>
-
-#include "check_assertion.h"
-#include "min_allocator.h"
-
-int main(int, char**) {
-    {
-        typedef int T;
-        typedef std::unordered_set<T> C;
-        C c(1);
-        C::local_iterator i = c.end(0);
-        TEST_LIBCPP_ASSERT_FAILURE(
-            *i, "Attempted to dereference a non-dereferenceable unordered container const_local_iterator");
-    }
-
-    {
-        typedef int T;
-        typedef std::unordered_set<T, std::hash<T>, std::equal_to<T>, min_allocator<T>> C;
-        C c(1);
-        C::local_iterator i = c.end(0);
-        TEST_LIBCPP_ASSERT_FAILURE(
-            *i, "Attempted to dereference a non-dereferenceable unordered container const_local_iterator");
-    }
-
-    return 0;
-}
diff --git a/libcxx/test/libcxx/containers/unord/unord.set/debug.local_iterator.increment.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.set/debug.local_iterator.increment.pass.cpp
deleted file mode 100644
index 42a62aed472ca..0000000000000
--- a/libcxx/test/libcxx/containers/unord/unord.set/debug.local_iterator.increment.pass.cpp
+++ /dev/null
@@ -1,49 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <unordered_set>
-
-// Increment local_iterator past end.
-
-// REQUIRES: has-unix-headers
-// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
-
-#include <unordered_set>
-#include <cassert>
-
-#include "check_assertion.h"
-#include "min_allocator.h"
-
-int main(int, char**) {
-    {
-        typedef int T;
-        typedef std::unordered_set<T> C;
-        C c;
-        c.insert(42);
-        C::size_type b = c.bucket(42);
-        C::local_iterator i = c.begin(b);
-        assert(i != c.end(b));
-        ++i;
-        assert(i == c.end(b));
-        TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container const_local_iterator");
-    }
-
-    {
-        typedef int T;
-        typedef std::unordered_set<T, std::hash<T>, std::equal_to<T>, min_allocator<T>> C;
-        C c({42});
-        C::size_type b = c.bucket(42);
-        C::local_iterator i = c.begin(b);
-        assert(i != c.end(b));
-        ++i;
-        assert(i == c.end(b));
-        TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container const_local_iterator");
-    }
-
-    return 0;
-}

From 2a3068455716a1a37da9155d4d96107901bba4a8 Mon Sep 17 00:00:00 2001
From: Matthias Springer <me@m-sp.org>
Date: Tue, 12 Mar 2024 10:51:11 +0900
Subject: [PATCH 70/95] [mlir][Transforms] Use correct listener in dialect
 conversion (#84861)

There was a typo in the dialect conversion: `RewriterBase::Listener`
should be used instead of `ForwardingListener`.
---
 mlir/lib/Transforms/Utils/DialectConversion.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp
index cd49bd121a62e..2ec0b964b304f 100644
--- a/mlir/lib/Transforms/Utils/DialectConversion.cpp
+++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp
@@ -1020,8 +1020,8 @@ void BlockTypeConversionRewrite::commit(RewriterBase &rewriter) {
   // Inform the listener about all IR modifications that have already taken
   // place: References to the original block have been replaced with the new
   // block.
-  if (auto *listener = dyn_cast_or_null<RewriterBase::ForwardingListener>(
-          rewriter.getListener()))
+  if (auto *listener =
+          dyn_cast_or_null<RewriterBase::Listener>(rewriter.getListener()))
     for (Operation *op : block->getUsers())
       listener->notifyOperationModified(op);
 
@@ -1123,8 +1123,8 @@ void ReplaceBlockArgRewrite::commit(RewriterBase &rewriter) {
 void ReplaceBlockArgRewrite::rollback() { rewriterImpl.mapping.erase(arg); }
 
 void ReplaceOperationRewrite::commit(RewriterBase &rewriter) {
-  auto *listener = dyn_cast_or_null<RewriterBase::ForwardingListener>(
-      rewriter.getListener());
+  auto *listener =
+      dyn_cast_or_null<RewriterBase::Listener>(rewriter.getListener());
 
   // Compute replacement values.
   SmallVector<Value> replacements =

From 26722f5b61575fb0e58ff2933e7bea03353ff441 Mon Sep 17 00:00:00 2001
From: Sayan Saha <sayan.jubiee@gmail.com>
Date: Mon, 11 Mar 2024 22:37:33 -0400
Subject: [PATCH 71/95] [MLIR] Fix incorrect memref::DimOp canonicalization,
 add tensor::DimOp canonicalization (#84225)

The current canonicalization of `memref.dim` operating on the result of
`memref.reshape` into `memref.load` is incorrect as it doesn't check
whether the `index` operand of `memref.dim` dominates the source
`memref.reshape` op. It always introduces `memref.load` right after
`memref.reshape` to ensure the `memref` is not mutated before the
`memref.load` call. As a result, the following error is observed:

```
$> mlir-opt --canonicalize input.mlir

func.func @reshape_dim(%arg0: memref<*xf32>, %arg1: memref<?xindex>, %arg2: index) -> index {
    %c4 = arith.constant 4 : index
    %reshape = memref.reshape %arg0(%arg1) : (memref<*xf32>, memref<?xindex>) -> memref<*xf32>
    %0 = arith.muli %arg2, %c4 : index
    %dim = memref.dim %reshape, %0 : memref<*xf32>
    return %dim : index
  }
```

results in:

```
dominator.mlir:22:12: error: operand #1 does not dominate this use
    %dim = memref.dim %reshape, %0 : memref<*xf32>
           ^
dominator.mlir:22:12: note: see current operation: %1 = "memref.load"(%arg1, %2) <{nontemporal = false}> : (memref<?xindex>, index) -> index
dominator.mlir:21:10: note: operand defined here (op in the same block)
    %0 = arith.muli %arg2, %c4 : index
```

Properly fixing this issue requires a dominator analysis which is
expensive to run within a canonicalization pattern. So, this patch fixes
the canonicalization pattern by being more strict/conservative about the
legality condition in which we perform this canonicalization.
The more general pattern is also added to `tensor.dim`. Since tensors are
immutable we don't need to worry about where to introduce the
`tensor.extract` call after canonicalization.
---
 mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp   | 32 ++++++++-
 mlir/lib/Dialect/Tensor/IR/TensorOps.cpp   | 28 +++++++-
 mlir/test/Dialect/MemRef/canonicalize.mlir | 53 ++++++++++++++
 mlir/test/Dialect/Tensor/canonicalize.mlir | 80 ++++++++++++++++++++++
 4 files changed, 191 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
index 94e0ed319cae8..836dcb8f329e7 100644
--- a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
+++ b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
@@ -1080,7 +1080,37 @@ struct DimOfMemRefReshape : public OpRewritePattern<DimOp> {
     auto reshape = dim.getSource().getDefiningOp<ReshapeOp>();
 
     if (!reshape)
-      return failure();
+      return rewriter.notifyMatchFailure(
+          dim, "Dim op is not defined by a reshape op.");
+
+    // dim of a memref reshape can be folded if dim.getIndex() dominates the
+    // reshape. Instead of using `DominanceInfo` (which is usually costly) we
+    // cheaply check that either of the following conditions hold:
+    //      1. dim.getIndex() is defined in the same block as reshape but before
+    //      reshape.
+    //      2. dim.getIndex() is defined in a parent block of
+    //      reshape.
+
+    // Check condition 1
+    if (dim.getIndex().getParentBlock() == reshape->getBlock()) {
+      if (auto *definingOp = dim.getIndex().getDefiningOp()) {
+        if (reshape->isBeforeInBlock(definingOp)) {
+          return rewriter.notifyMatchFailure(
+              dim,
+              "dim.getIndex is not defined before reshape in the same block.");
+        }
+      } // else dim.getIndex is a block argument to reshape->getBlock and
+        // dominates reshape
+    }   // Check condition 2
+    else if (dim->getBlock() != reshape->getBlock() &&
+             !dim.getIndex().getParentRegion()->isProperAncestor(
+                 reshape->getParentRegion())) {
+      // If dim and reshape are in the same block but dim.getIndex() isn't, we
+      // already know dim.getIndex() dominates reshape without calling
+      // `isProperAncestor`
+      return rewriter.notifyMatchFailure(
+          dim, "dim.getIndex does not dominate reshape.");
+    }
 
     // Place the load directly after the reshape to ensure that the shape memref
     // was not mutated.
diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
index a854da466c313..dc8843aa4e1e1 100644
--- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
+++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
@@ -824,11 +824,37 @@ struct DimOfDestStyleOp : public OpRewritePattern<DimOp> {
     return success();
   }
 };
+
+/// Fold dim of a tensor reshape operation to an extract into the reshape's
+/// shape operand.
+struct DimOfReshapeOp : public OpRewritePattern<DimOp> {
+  using OpRewritePattern<DimOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(DimOp dim,
+                                PatternRewriter &rewriter) const override {
+    auto reshape = dim.getSource().getDefiningOp<ReshapeOp>();
+
+    if (!reshape)
+      return failure();
+
+    // Since tensors are immutable we don't need to worry about where to place
+    // the extract call
+    rewriter.setInsertionPointAfter(dim);
+    Location loc = dim.getLoc();
+    Value extract =
+        rewriter.create<ExtractOp>(loc, reshape.getShape(), dim.getIndex());
+    if (extract.getType() != dim.getType())
+      extract =
+          rewriter.create<arith::IndexCastOp>(loc, dim.getType(), extract);
+    rewriter.replaceOp(dim, extract);
+    return success();
+  }
+};
 } // namespace
 
 void DimOp::getCanonicalizationPatterns(RewritePatternSet &results,
                                         MLIRContext *context) {
-  results.add<DimOfCastOp, DimOfDestStyleOp>(context);
+  results.add<DimOfCastOp, DimOfDestStyleOp, DimOfReshapeOp>(context);
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/MemRef/canonicalize.mlir b/mlir/test/Dialect/MemRef/canonicalize.mlir
index b1e92e54d561d..506ed1f1c10b1 100644
--- a/mlir/test/Dialect/MemRef/canonicalize.mlir
+++ b/mlir/test/Dialect/MemRef/canonicalize.mlir
@@ -313,6 +313,59 @@ func.func @dim_of_memref_reshape_i32(%arg0: memref<*xf32>, %arg1: memref<?xi32>)
 
 // -----
 
+// Test case: memref.dim(memref.reshape %v %shp, %idx) -> memref.load %shp[%idx]
+// CHECK-LABEL: func @dim_of_memref_reshape_block_arg_index(
+//  CHECK-SAME:   %[[MEM:[0-9a-z]+]]: memref<*xf32>,
+//  CHECK-SAME:   %[[SHP:[0-9a-z]+]]: memref<?xindex>,
+//  CHECK-SAME:   %[[IDX:[0-9a-z]+]]: index
+//  CHECK-NEXT:   %[[DIM:.*]] = memref.load %[[SHP]][%[[IDX]]]
+//   CHECK-NOT:   memref.dim
+//       CHECK:   return %[[DIM]] : index
+func.func @dim_of_memref_reshape_block_arg_index(%arg0: memref<*xf32>, %arg1: memref<?xindex>, %arg2: index) -> index {
+  %reshape = memref.reshape %arg0(%arg1) : (memref<*xf32>, memref<?xindex>) -> memref<*xf32>
+  %dim = memref.dim %reshape, %arg2 : memref<*xf32>
+  return %dim : index
+}
+
+// -----
+
+// Test case: memref.dim(memref.reshape %v %shp, %idx) is not folded into memref.load %shp[%idx]
+// CHECK-LABEL: func @dim_of_memref_reshape_for(
+//       CHECK: memref.reshape
+//       CHECK: memref.dim
+//   CHECK-NOT: memref.load
+func.func @dim_of_memref_reshape_for( %arg0: memref<*xf32>, %arg1: memref<?xindex>) -> index {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c4 = arith.constant 4 : index
+
+    %0 = memref.reshape %arg0(%arg1) : (memref<*xf32>, memref<?xindex>) -> memref<*xf32>
+
+    %1 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %c1) -> (index) {
+      %2 = memref.dim %0, %arg2 : memref<*xf32>
+      %3 = arith.muli %arg3, %2 : index
+      scf.yield %3 : index
+    }
+    return %1 : index
+}
+
+// -----
+
+// Test case: memref.dim(memref.reshape %v %shp, %idx) is not folded into memref.load %shp[%idx]
+// CHECK-LABEL: func @dim_of_memref_reshape_undominated(
+//       CHECK: memref.reshape
+//       CHECK: memref.dim
+//   CHECK-NOT: memref.load
+func.func @dim_of_memref_reshape_undominated(%arg0: memref<*xf32>, %arg1: memref<?xindex>, %arg2: index) -> index {
+    %c4 = arith.constant 4 : index
+    %reshape = memref.reshape %arg0(%arg1) : (memref<*xf32>, memref<?xindex>) -> memref<*xf32>
+    %0 = arith.muli %arg2, %c4 : index
+    %dim = memref.dim %reshape, %0 : memref<*xf32>
+    return %dim : index
+  }
+
+// -----
+
 // CHECK-LABEL: func @alloc_const_fold
 func.func @alloc_const_fold() -> memref<?xf32> {
   // CHECK-NEXT: memref.alloc() : memref<4xf32>
diff --git a/mlir/test/Dialect/Tensor/canonicalize.mlir b/mlir/test/Dialect/Tensor/canonicalize.mlir
index 70f5d61bd802f..e5374f031be55 100644
--- a/mlir/test/Dialect/Tensor/canonicalize.mlir
+++ b/mlir/test/Dialect/Tensor/canonicalize.mlir
@@ -2287,3 +2287,83 @@ func.func @infer_and_fold_pack_unpack_same_tiles(%t: tensor<10x20x4x4xf32>) -> t
 // CHECK-LABEL: func.func @infer_and_fold_pack_unpack_same_tiles
 // CHECK-SAME:    %[[SRC:[0-9a-zA-Z]+]]
 // CHECK:         return %[[SRC]]
+
+// -----
+
+// Test case: Folding of tensor.dim(tensor.reshape %v %shp, %idx) -> tensor.extract %shp[%idx]
+// CHECK-LABEL: func @dim_of_reshape(
+//  CHECK-SAME:     %[[MEM:[0-9a-z]+]]: tensor<*xf32>,
+//  CHECK-SAME:     %[[SHP:[0-9a-z]+]]: tensor<?xindex>
+//  CHECK-NEXT:   %[[IDX:.*]] = arith.constant 3
+//  CHECK-NEXT:   %[[DIM:.*]] = tensor.extract %[[SHP]][%[[IDX]]]
+//   CHECK-NOT:   tensor.store
+//   CHECK-NOT:   tensor.dim
+//   CHECK-NOT: tensor.reshape
+//       CHECK:   return %[[DIM]] : index
+func.func @dim_of_reshape(%arg0: tensor<*xf32>, %arg1: tensor<?xindex>)
+    -> index {
+  %c3 = arith.constant 3 : index
+  %0 = tensor.reshape %arg0(%arg1)
+      : (tensor<*xf32>, tensor<?xindex>) -> tensor<*xf32>
+  // Update the shape to test that the load ends up in the right place.
+  tensor.insert %c3 into %arg1[%c3] : tensor<?xindex>
+  %1 = tensor.dim %0, %c3 : tensor<*xf32>
+  return %1 : index
+}
+
+// -----
+
+// Test case: Folding of tensor.dim(tensor.reshape %v %shp, %idx) -> tensor.extract %shp[%idx]
+// CHECK-LABEL: func @dim_of_reshape_i32(
+//       CHECK:  tensor.extract
+//  CHECK-NEXT:  %[[CAST:.*]] = arith.index_cast
+//   CHECK-NOT:  tensor.dim
+//   CHECK-NOT:  tensor.reshape
+//       CHECK:  return %[[CAST]] : index
+func.func @dim_of_reshape_i32(%arg0: tensor<*xf32>, %arg1: tensor<?xi32>)
+    -> index {
+    %c3 = arith.constant 3 : index
+    %0 = tensor.reshape %arg0(%arg1)
+        : (tensor<*xf32>, tensor<?xi32>) -> tensor<*xf32>
+    %1 = tensor.dim %0, %c3 : tensor<*xf32>
+    return %1 : index
+}
+
+// -----
+
+// Test case: tensor.dim(tensor.reshape %v %shp, %idx) is folded into tensor.extract %shp[%idx]
+// CHECK-LABEL: func @dim_of_reshape_for(
+//       CHECK: scf.for
+//  CHECK-NEXT: tensor.extract
+//   CHECK-NOT: tensor.dim
+//   CHECK-NOT: tensor.reshape
+func.func @dim_of_reshape_for( %arg0: tensor<*xf32>, %arg1: tensor<?xindex>) -> index {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c4 = arith.constant 4 : index
+
+    %0 = tensor.reshape %arg0(%arg1) : (tensor<*xf32>, tensor<?xindex>) -> tensor<*xf32>
+
+    %1 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %c1) -> (index) {
+      %2 = tensor.dim %0, %arg2 : tensor<*xf32>
+      %3 = arith.muli %arg3, %2 : index
+      scf.yield %3 : index
+    }
+    return %1 : index
+}
+
+// -----
+
+// Test case: tensor.dim(tensor.reshape %v %shp, %idx) is folded into tensor.extract %shp[%idx]
+// CHECK-LABEL: func @dim_of_reshape_undominated(
+//       CHECK: arith.muli
+//  CHECK-NEXT: tensor.extract
+//   CHECK-NOT: tensor.dim
+//   CHECK-NOT: tensor.reshape
+func.func @dim_of_reshape_undominated(%arg0: tensor<*xf32>, %arg1: tensor<?xindex>, %arg2: index) -> index {
+    %c4 = arith.constant 4 : index
+    %reshape = tensor.reshape %arg0(%arg1) : (tensor<*xf32>, tensor<?xindex>) -> tensor<*xf32>
+    %0 = arith.muli %arg2, %c4 : index
+    %dim = tensor.dim %reshape, %0 : tensor<*xf32>
+    return %dim : index
+  }

From e40cabfea48c617fe6efaace588e80474bc80fe8 Mon Sep 17 00:00:00 2001
From: lifengxiang1025 <lifengxiang.1025@bytedance.com>
Date: Tue, 12 Mar 2024 11:00:02 +0800
Subject: [PATCH 72/95] [MemProf] Match function's summary and definition
 strictly (#83665)

Problem description:
https://github.com/llvm/llvm-project/pull/81008#issuecomment-1933468520
Solution:
https://github.com/llvm/llvm-project/pull/81008#issuecomment-1934192548
(choose plan2)
---
 llvm/lib/Passes/PassBuilderPipelines.cpp      |   4 +-
 llvm/lib/Transforms/IPO/FunctionImport.cpp    |  10 +-
 .../IPO/MemProfContextDisambiguation.cpp      |  24 +-
 llvm/test/ThinLTO/X86/summary-matching.ll     | 387 ++++++++++++++++++
 4 files changed, 416 insertions(+), 9 deletions(-)
 create mode 100644 llvm/test/ThinLTO/X86/summary-matching.ll

diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index cbbbec0ccc8c4..cb892e30c4a0b 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -299,9 +299,7 @@ static cl::opt<bool> UseLoopVersioningLICM(
     cl::desc("Enable the experimental Loop Versioning LICM pass"));
 
 namespace llvm {
-cl::opt<bool> EnableMemProfContextDisambiguation(
-    "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,
-    cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));
+extern cl::opt<bool> EnableMemProfContextDisambiguation;
 
 extern cl::opt<bool> EnableInferAlignmentPass;
 } // namespace llvm
diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp
index 5c7a74dadb46a..68f9799616ae6 100644
--- a/llvm/lib/Transforms/IPO/FunctionImport.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp
@@ -163,6 +163,10 @@ static cl::opt<std::string> WorkloadDefinitions(
              "}"),
     cl::Hidden);
 
+namespace llvm {
+extern cl::opt<bool> EnableMemProfContextDisambiguation;
+}
+
 // Load lazily a module from \p FileName in \p Context.
 static std::unique_ptr<Module> loadFile(const std::string &FileName,
                                         LLVMContext &Context) {
@@ -1643,7 +1647,9 @@ Expected<bool> FunctionImporter::importFunctions(
       if (Import) {
         if (Error Err = F.materialize())
           return std::move(Err);
-        if (EnableImportMetadata) {
+        // MemProf must match each function's definition with its summary,
+        // which requires the 'thinlto_src_module' metadata.
+        if (EnableImportMetadata || EnableMemProfContextDisambiguation) {
           // Add 'thinlto_src_module' and 'thinlto_src_file' metadata for
           // statistics and debugging.
           F.setMetadata(
@@ -1693,7 +1699,7 @@ Expected<bool> FunctionImporter::importFunctions(
         LLVM_DEBUG(dbgs() << "Is importing aliasee fn " << GO->getGUID() << " "
                           << GO->getName() << " from "
                           << SrcModule->getSourceFileName() << "\n");
-        if (EnableImportMetadata) {
+        if (EnableImportMetadata || EnableMemProfContextDisambiguation) {
           // Add 'thinlto_src_module' and 'thinlto_src_file' metadata for
           // statistics and debugging.
           Fn->setMetadata(
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index 271d3ed40030b..ba5e3b637db75 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -122,6 +122,10 @@ static cl::opt<unsigned>
                                  "frames through tail calls."));
 
 namespace llvm {
+cl::opt<bool> EnableMemProfContextDisambiguation(
+    "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,
+    cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));
+
 // Indicate we are linking with an allocator that supports hot/cold operator
 // new interfaces.
 cl::opt<bool> SupportsHotColdNew(
@@ -3375,10 +3379,22 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
 
     auto *GVSummary =
         ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
-    if (!GVSummary)
-      // Must have been imported, use the first summary (might be multiple if
-      // this was a linkonce_odr).
-      GVSummary = TheFnVI.getSummaryList().front().get();
+    if (!GVSummary) {
+      // Must have been imported, use the summary which matches the definition
+      // (might be multiple if this was a linkonce_odr).
+      auto SrcModuleMD = F.getMetadata("thinlto_src_module");
+      assert(SrcModuleMD &&
+             "enable-import-metadata is needed to emit thinlto_src_module");
+      StringRef SrcModule =
+          dyn_cast<MDString>(SrcModuleMD->getOperand(0))->getString();
+      for (auto &GVS : TheFnVI.getSummaryList()) {
+        if (GVS->modulePath() == SrcModule) {
+          GVSummary = GVS.get();
+          break;
+        }
+      }
+      assert(GVSummary && GVSummary->modulePath() == SrcModule);
+    }
 
     // If this was an imported alias skip it as we won't have the function
     // summary, and it should be cloned in the original module.
diff --git a/llvm/test/ThinLTO/X86/summary-matching.ll b/llvm/test/ThinLTO/X86/summary-matching.ll
new file mode 100644
index 0000000000000..60dc51b965d5a
--- /dev/null
+++ b/llvm/test/ThinLTO/X86/summary-matching.ll
@@ -0,0 +1,387 @@
+;; Test to make sure that a function's definition and summary match.
+; RUN: split-file %s %t
+; RUN: opt -thinlto-bc %t/main.ll >%t/main.o
+; RUN: opt -thinlto-bc %t/b.ll >%t/b.o
+; RUN: opt -thinlto-bc %t/c.ll >%t/c.o
+
+; RUN: llvm-lto2 run %t/b.o %t/c.o %t/main.o -enable-memprof-context-disambiguation \
+; RUN: -supports-hot-cold-new -o %t/a.out \
+; RUN: -r=%t/main.o,main,plx \
+; RUN: -r=%t/b.o,_Z1bv,plx \
+; RUN: -r=%t/b.o,_Z3fooIiET_S0_S0_,plx \
+; RUN: -r=%t/b.o,_Znwm \
+; RUN: -r=%t/c.o,_Z1cv,plx \
+; RUN: -r=%t/c.o,_Z3fooIiET_S0_S0_ \
+; RUN: -r=%t/c.o,_Z3barIiET_S0_S0_,plx \
+; RUN: -r=%t/c.o,_Znwm \
+; RUN: -r=%t/main.o,_Z1bv \
+; RUN: -r=%t/main.o,_Z1cv \
+; RUN: -r=%t/main.o,_Z3fooIiET_S0_S0_ 
+
+;; foo has two copies:
+;; foo in b.ll is prevailing and inlines bar.
+;; foo in c.ll isn't prevailing and doesn't inline bar.
+;; By default, main imports foo's definition from c.ll but picks up foo's summary from b.ll.
+
+;--- main.ll
+; ModuleID = 'main.cc'
+source_filename = "main.cc"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: mustprogress norecurse uwtable
+define dso_local noundef i32 @main() #0 {
+entry:
+  %retval = alloca i32, align 4
+  store i32 0, ptr %retval, align 4
+  %call = call noundef i32 @_Z1bv(), !callsite !6
+  %call1 = call noundef i32 @_Z1cv(), !callsite !7
+  %add = add nsw i32 %call, %call1
+  %call2 = call noundef i32 @_Z3fooIiET_S0_S0_(i32 noundef 1, i32 noundef 2), !callsite !8
+  %add3 = add nsw i32 %add, %call2
+  ret i32 %add3
+}
+
+declare noundef i32 @_Z1bv() #1
+
+declare noundef i32 @_Z1cv() #1
+
+declare noundef i32 @_Z3fooIiET_S0_S0_(i32 noundef, i32 noundef) #1
+
+attributes #0 = { mustprogress norecurse uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+
+!llvm.module.flags = !{!0, !1, !2, !3, !4}
+!llvm.ident = !{!5}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 8, !"PIC Level", i32 2}
+!2 = !{i32 7, !"PIE Level", i32 2}
+!3 = !{i32 7, !"uwtable", i32 2}
+!4 = !{i32 7, !"frame-pointer", i32 2}
+!5 = !{!"clang version 19.0.0"}
+!6 = !{i64 1}
+!7 = !{i64 5}
+!8 = !{i64 7}
+
+;--- c.ll
+; ModuleID = 'c.cc'
+source_filename = "c.cc"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+$_Z3fooIiET_S0_S0_ = comdat any
+
+$_Z3barIiET_S0_S0_ = comdat any
+
+; Function Attrs: mustprogress noinline optnone uwtable
+define dso_local noundef i32 @_Z1cv() #0 {
+entry:
+  %num1 = alloca i32, align 4
+  %num2 = alloca i32, align 4
+  store i32 1, ptr %num1, align 4
+  store i32 1, ptr %num2, align 4
+  %0 = load i32, ptr %num1, align 4
+  %1 = load i32, ptr %num2, align 4
+  %call = call noundef i32 @_Z3fooIiET_S0_S0_(i32 noundef %0, i32 noundef %1), !callsite !6
+  ret i32 %call
+}
+
+; Function Attrs: mustprogress uwtable
+define linkonce_odr dso_local noundef i32 @_Z3fooIiET_S0_S0_(i32 noundef %a, i32 noundef %b) #3 comdat {
+entry:
+  %a.addr = alloca i32, align 4
+  %b.addr = alloca i32, align 4
+  %rtn = alloca i32, align 4
+  store i32 %a, ptr %a.addr, align 4
+  store i32 %b, ptr %b.addr, align 4
+  %0 = load i32, ptr %a.addr, align 4
+  %1 = load i32, ptr %b.addr, align 4
+  %call = call noundef i32 @_Z3barIiET_S0_S0_(i32 noundef %0, i32 noundef %1), !callsite !7
+  store i32 %call, ptr %rtn, align 4
+  %2 = load i32, ptr %rtn, align 4
+  ret i32 %2
+}
+
+; Function Attrs: mustprogress noinline optnone uwtable
+define linkonce_odr dso_local noundef i32 @_Z3barIiET_S0_S0_(i32 noundef %a, i32 noundef %b) #0 comdat {
+entry:
+  %a.addr = alloca i32, align 4
+  %b.addr = alloca i32, align 4
+  %c = alloca ptr, align 8
+  %d = alloca ptr, align 8
+  store i32 %a, ptr %a.addr, align 4
+  store i32 %b, ptr %b.addr, align 4
+  %0 = load i32, ptr %a.addr, align 4
+  %add = add nsw i32 %0, 1
+  store i32 %add, ptr %a.addr, align 4
+  %1 = load i32, ptr %b.addr, align 4
+  %add1 = add nsw i32 %1, 1
+  store i32 %add1, ptr %b.addr, align 4
+  %2 = load i32, ptr %a.addr, align 4
+  %add2 = add nsw i32 %2, 1
+  store i32 %add2, ptr %a.addr, align 4
+  %3 = load i32, ptr %b.addr, align 4
+  %add3 = add nsw i32 %3, 1
+  store i32 %add3, ptr %b.addr, align 4
+  %4 = load i32, ptr %a.addr, align 4
+  %add4 = add nsw i32 %4, 1
+  store i32 %add4, ptr %a.addr, align 4
+  %5 = load i32, ptr %b.addr, align 4
+  %add5 = add nsw i32 %5, 1
+  store i32 %add5, ptr %b.addr, align 4
+  %6 = load i32, ptr %a.addr, align 4
+  %add6 = add nsw i32 %6, 1
+  store i32 %add6, ptr %a.addr, align 4
+  %7 = load i32, ptr %b.addr, align 4
+  %add7 = add nsw i32 %7, 1
+  store i32 %add7, ptr %b.addr, align 4
+  %8 = load i32, ptr %a.addr, align 4
+  %add8 = add nsw i32 %8, 1
+  store i32 %add8, ptr %a.addr, align 4
+  %9 = load i32, ptr %b.addr, align 4
+  %add9 = add nsw i32 %9, 1
+  store i32 %add9, ptr %b.addr, align 4
+  %10 = load i32, ptr %a.addr, align 4
+  %add10 = add nsw i32 %10, 1
+  store i32 %add10, ptr %a.addr, align 4
+  %11 = load i32, ptr %b.addr, align 4
+  %add11 = add nsw i32 %11, 1
+  store i32 %add11, ptr %b.addr, align 4
+  %12 = load i32, ptr %a.addr, align 4
+  %add12 = add nsw i32 %12, 1
+  store i32 %add12, ptr %a.addr, align 4
+  %13 = load i32, ptr %b.addr, align 4
+  %add13 = add nsw i32 %13, 1
+  store i32 %add13, ptr %b.addr, align 4
+  %14 = load i32, ptr %a.addr, align 4
+  %add14 = add nsw i32 %14, 1
+  store i32 %add14, ptr %a.addr, align 4
+  %15 = load i32, ptr %b.addr, align 4
+  %add15 = add nsw i32 %15, 1
+  store i32 %add15, ptr %b.addr, align 4
+  %16 = load i32, ptr %a.addr, align 4
+  %add16 = add nsw i32 %16, 1
+  store i32 %add16, ptr %a.addr, align 4
+  %17 = load i32, ptr %b.addr, align 4
+  %add17 = add nsw i32 %17, 1
+  store i32 %add17, ptr %b.addr, align 4
+  %18 = load i32, ptr %a.addr, align 4
+  %add18 = add nsw i32 %18, 1
+  store i32 %add18, ptr %a.addr, align 4
+  %19 = load i32, ptr %b.addr, align 4
+  %add19 = add nsw i32 %19, 1
+  store i32 %add19, ptr %b.addr, align 4
+  %20 = load i32, ptr %a.addr, align 4
+  %add20 = add nsw i32 %20, 1
+  store i32 %add20, ptr %a.addr, align 4
+  %21 = load i32, ptr %b.addr, align 4
+  %add21 = add nsw i32 %21, 1
+  store i32 %add21, ptr %b.addr, align 4
+  %22 = load i32, ptr %a.addr, align 4
+  %add22 = add nsw i32 %22, 1
+  store i32 %add22, ptr %a.addr, align 4
+  %23 = load i32, ptr %b.addr, align 4
+  %add23 = add nsw i32 %23, 1
+  store i32 %add23, ptr %b.addr, align 4
+  %call = call noalias noundef nonnull ptr @_Znwm(i64 noundef 4) #2, !callsite !8
+  store i32 1, ptr %call, align 4
+  store ptr %call, ptr %c, align 8
+  %call24 = call noalias noundef nonnull ptr @_Znwm(i64 noundef 4) #2, !callsite !9
+  store i32 1, ptr %call24, align 4
+  store ptr %call24, ptr %d, align 8
+  %24 = load i32, ptr %a.addr, align 4
+  %25 = load i32, ptr %b.addr, align 4
+  %cmp = icmp sgt i32 %24, %25
+  br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:                                        ; preds = %entry
+  %26 = load i32, ptr %a.addr, align 4
+  br label %cond.end
+
+cond.false:                                       ; preds = %entry
+  %27 = load i32, ptr %b.addr, align 4
+  br label %cond.end
+
+cond.end:                                         ; preds = %cond.false, %cond.true
+  %cond = phi i32 [ %26, %cond.true ], [ %27, %cond.false ]
+  ret i32 %cond
+}
+
+; Function Attrs: nobuiltin allocsize(0)
+declare noundef nonnull ptr @_Znwm(i64 noundef) #1
+
+attributes #0 = { mustprogress noinline optnone uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { nobuiltin allocsize(0) "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #2 = { builtin allocsize(0) }
+attributes #3 = { mustprogress uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+
+!llvm.module.flags = !{!0, !1, !2, !3, !4}
+!llvm.ident = !{!5}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 8, !"PIC Level", i32 2}
+!2 = !{i32 7, !"PIE Level", i32 2}
+!3 = !{i32 7, !"uwtable", i32 2}
+!4 = !{i32 7, !"frame-pointer", i32 2}
+!5 = !{!"clang version 19.0.0"}
+!6 = !{i64 6}
+!7 = !{i64 3}
+!8 = !{i64 4}
+!9 = !{i64 9}
+
+;--- b.ll
+; ModuleID = 'b.cc'
+source_filename = "b.cc"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+$_Z3fooIiET_S0_S0_ = comdat any
+
+; Function Attrs: mustprogress noinline optnone uwtable
+define dso_local noundef i32 @_Z1bv() #0 {
+entry:
+  %num1 = alloca i32, align 4
+  %num2 = alloca i32, align 4
+  store i32 0, ptr %num1, align 4
+  store i32 0, ptr %num2, align 4
+  %0 = load i32, ptr %num1, align 4
+  %1 = load i32, ptr %num2, align 4
+  %call = call noundef i32 @_Z3fooIiET_S0_S0_(i32 noundef %0, i32 noundef %1), !callsite !6
+  ret i32 %call
+}
+
+; Function Attrs: mustprogress uwtable
+define linkonce_odr dso_local noundef i32 @_Z3fooIiET_S0_S0_(i32 noundef %a, i32 noundef %b) #3 comdat {
+entry:
+  %a.addr.i = alloca i32, align 4
+  %b.addr.i = alloca i32, align 4
+  %c.i = alloca ptr, align 8
+  %d.i = alloca ptr, align 8
+  %a.addr = alloca i32, align 4
+  %b.addr = alloca i32, align 4
+  %rtn = alloca i32, align 4
+  store i32 %a, ptr %a.addr, align 4
+  store i32 %b, ptr %b.addr, align 4
+  %0 = load i32, ptr %a.addr, align 4
+  %1 = load i32, ptr %b.addr, align 4
+  store i32 %0, ptr %a.addr.i, align 4
+  store i32 %1, ptr %b.addr.i, align 4
+  %2 = load i32, ptr %a.addr.i, align 4
+  %add.i = add nsw i32 %2, 1
+  store i32 %add.i, ptr %a.addr.i, align 4
+  %3 = load i32, ptr %b.addr.i, align 4
+  %add1.i = add nsw i32 %3, 1
+  store i32 %add1.i, ptr %b.addr.i, align 4
+  %4 = load i32, ptr %a.addr.i, align 4
+  %add2.i = add nsw i32 %4, 1
+  store i32 %add2.i, ptr %a.addr.i, align 4
+  %5 = load i32, ptr %b.addr.i, align 4
+  %add3.i = add nsw i32 %5, 1
+  store i32 %add3.i, ptr %b.addr.i, align 4
+  %6 = load i32, ptr %a.addr.i, align 4
+  %add4.i = add nsw i32 %6, 1
+  store i32 %add4.i, ptr %a.addr.i, align 4
+  %7 = load i32, ptr %b.addr.i, align 4
+  %add5.i = add nsw i32 %7, 1
+  store i32 %add5.i, ptr %b.addr.i, align 4
+  %8 = load i32, ptr %a.addr.i, align 4
+  %add6.i = add nsw i32 %8, 1
+  store i32 %add6.i, ptr %a.addr.i, align 4
+  %9 = load i32, ptr %b.addr.i, align 4
+  %add7.i = add nsw i32 %9, 1
+  store i32 %add7.i, ptr %b.addr.i, align 4
+  %10 = load i32, ptr %a.addr.i, align 4
+  %add8.i = add nsw i32 %10, 1
+  store i32 %add8.i, ptr %a.addr.i, align 4
+  %11 = load i32, ptr %b.addr.i, align 4
+  %add9.i = add nsw i32 %11, 1
+  store i32 %add9.i, ptr %b.addr.i, align 4
+  %12 = load i32, ptr %a.addr.i, align 4
+  %add10.i = add nsw i32 %12, 1
+  store i32 %add10.i, ptr %a.addr.i, align 4
+  %13 = load i32, ptr %b.addr.i, align 4
+  %add11.i = add nsw i32 %13, 1
+  store i32 %add11.i, ptr %b.addr.i, align 4
+  %14 = load i32, ptr %a.addr.i, align 4
+  %add12.i = add nsw i32 %14, 1
+  store i32 %add12.i, ptr %a.addr.i, align 4
+  %15 = load i32, ptr %b.addr.i, align 4
+  %add13.i = add nsw i32 %15, 1
+  store i32 %add13.i, ptr %b.addr.i, align 4
+  %16 = load i32, ptr %a.addr.i, align 4
+  %add14.i = add nsw i32 %16, 1
+  store i32 %add14.i, ptr %a.addr.i, align 4
+  %17 = load i32, ptr %b.addr.i, align 4
+  %add15.i = add nsw i32 %17, 1
+  store i32 %add15.i, ptr %b.addr.i, align 4
+  %18 = load i32, ptr %a.addr.i, align 4
+  %add16.i = add nsw i32 %18, 1
+  store i32 %add16.i, ptr %a.addr.i, align 4
+  %19 = load i32, ptr %b.addr.i, align 4
+  %add17.i = add nsw i32 %19, 1
+  store i32 %add17.i, ptr %b.addr.i, align 4
+  %20 = load i32, ptr %a.addr.i, align 4
+  %add18.i = add nsw i32 %20, 1
+  store i32 %add18.i, ptr %a.addr.i, align 4
+  %21 = load i32, ptr %b.addr.i, align 4
+  %add19.i = add nsw i32 %21, 1
+  store i32 %add19.i, ptr %b.addr.i, align 4
+  %22 = load i32, ptr %a.addr.i, align 4
+  %add20.i = add nsw i32 %22, 1
+  store i32 %add20.i, ptr %a.addr.i, align 4
+  %23 = load i32, ptr %b.addr.i, align 4
+  %add21.i = add nsw i32 %23, 1
+  store i32 %add21.i, ptr %b.addr.i, align 4
+  %24 = load i32, ptr %a.addr.i, align 4
+  %add22.i = add nsw i32 %24, 1
+  store i32 %add22.i, ptr %a.addr.i, align 4
+  %25 = load i32, ptr %b.addr.i, align 4
+  %add23.i = add nsw i32 %25, 1
+  store i32 %add23.i, ptr %b.addr.i, align 4
+  %call.i = call noalias noundef nonnull ptr @_Znwm(i64 noundef 4) #2, !callsite !7
+  store i32 1, ptr %call.i, align 4
+  store ptr %call.i, ptr %c.i, align 8
+  %call24.i = call noalias noundef nonnull ptr @_Znwm(i64 noundef 4) #2, !callsite !8
+  store i32 1, ptr %call24.i, align 4
+  store ptr %call24.i, ptr %d.i, align 8
+  %26 = load i32, ptr %a.addr.i, align 4
+  %27 = load i32, ptr %b.addr.i, align 4
+  %cmp.i = icmp sgt i32 %26, %27
+  br i1 %cmp.i, label %cond.true.i, label %cond.false.i
+
+cond.true.i:                                      ; preds = %entry
+  %28 = load i32, ptr %a.addr.i, align 4
+  br label %_Z3barIiET_S0_S0_.exit
+
+cond.false.i:                                     ; preds = %entry
+  %29 = load i32, ptr %b.addr.i, align 4
+  br label %_Z3barIiET_S0_S0_.exit
+
+_Z3barIiET_S0_S0_.exit:                           ; preds = %cond.true.i, %cond.false.i
+  %cond.i = phi i32 [ %28, %cond.true.i ], [ %29, %cond.false.i ]
+  store i32 %cond.i, ptr %rtn, align 4
+  %30 = load i32, ptr %rtn, align 4
+  ret i32 %30
+}
+
+; Function Attrs: nobuiltin allocsize(0)
+declare noundef nonnull ptr @_Znwm(i64 noundef) #1
+
+attributes #0 = { mustprogress noinline optnone uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { nobuiltin allocsize(0) "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #2 = { builtin allocsize(0) }
+attributes #3 = { mustprogress uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+
+!llvm.module.flags = !{!0, !1, !2, !3, !4}
+!llvm.ident = !{!5}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 8, !"PIC Level", i32 2}
+!2 = !{i32 7, !"PIE Level", i32 2}
+!3 = !{i32 7, !"uwtable", i32 2}
+!4 = !{i32 7, !"frame-pointer", i32 2}
+!5 = !{!"clang version 19.0.0"}
+!6 = !{i64 2}
+!7 = !{i64 4, i64 3}
+!8 = !{i64 9, i64 3}

From e4a546756c15a609be2f65d99c8b2be13ca9ddbf Mon Sep 17 00:00:00 2001
From: Walter Erquinigo <a20012251@gmail.com>
Date: Mon, 11 Mar 2024 23:02:32 -0400
Subject: [PATCH 73/95] [MLIR][LSP][NFC] Fix a header guard (#84862)

This header guard is wrong and conflicts with the one from Transport.h
---
 mlir/include/mlir/Tools/lsp-server-support/SourceMgrUtils.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mlir/include/mlir/Tools/lsp-server-support/SourceMgrUtils.h b/mlir/include/mlir/Tools/lsp-server-support/SourceMgrUtils.h
index 969058b022889..9ed8326a602e6 100644
--- a/mlir/include/mlir/Tools/lsp-server-support/SourceMgrUtils.h
+++ b/mlir/include/mlir/Tools/lsp-server-support/SourceMgrUtils.h
@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MLIR_TOOLS_LSPSERVERSUPPORT_TRANSPORT_H
-#define MLIR_TOOLS_LSPSERVERSUPPORT_TRANSPORT_H
+#ifndef MLIR_TOOLS_LSPSERVERSUPPORT_SOURCEMGRUTILS_H
+#define MLIR_TOOLS_LSPSERVERSUPPORT_SOURCEMGRUTILS_H
 
 #include "mlir/Tools/lsp-server-support/Protocol.h"
 #include "llvm/Support/SourceMgr.h"

From a83f8e0314fcdda162e54cbba1c9dcf230dff093 Mon Sep 17 00:00:00 2001
From: David Benjamin <davidben@google.com>
Date: Mon, 11 Mar 2024 23:40:47 -0400
Subject: [PATCH 74/95] [libc++][hardening] Check bounds on arithmetic in
 __bounded_iter (#78876)

Previously, `__bounded_iter` only checked `operator*`. It allowed the
pointer to go out of bounds with `operator++`, etc., and relied on
`operator*` (which checked `begin <= current < end`) to handle
everything. This has several unfortunate consequences:

First, pointer arithmetic is UB if it goes out of bounds. So by the time
`operator*` checks, it may be too late and the optimizer may have done
something bad. Checking both operations is safer.

Second, `std::copy` and friends currently bypass bounded iterator
checks. I think the only hope we have to fix this is to key on `iter +
n` doing a check. See #78771 for further discussion. Note this PR is not
sufficient to fix this. It adds the output bounds check, but ends up
doing it after the `memmove`, which is too late.

Finally, doing these checks is actually *more* optimizable. See #78829,
which is fixed by this PR. Keeping the iterator always in bounds means
`operator*` can rely on some invariants and only needs to check `current
!= end`. This aligns better with common iterator patterns, which use
`!=` instead of `<`, so it's easier to delete checks with local
reasoning.

See https://godbolt.org/z/vEWrWEf8h for how this new `__bounded_iter`
impacts compiler output. The old `__bounded_iter` injected checks inside
the loops for all the `sum()` functions, which not only added a check
inside a loop, but also impeded Clang's vectorization. The new
`__bounded_iter` allows all the checks to be optimized out and we emit
the same code as if it wasn't here.

Not everything is ideal however. `add_and_deref` ends up emitting two
comparisons now instead of one. This is because of a missed optimization in
Clang. I've filed #78875 for that. I suspect (with no data) that this PR
is still a net performance win because impeding ranged-for loops is
particularly egregious. But ideally we'd fix the optimizer and make
`add_and_deref` fine too.

There's also something funny going on with `std::ranges::find` which I
have not figured out yet, but I suspect there are some further
missed optimization opportunities.

Fixes #78829.

(CC @danakj)
---
 libcxx/include/__iterator/bounded_iter.h      |  71 ++++---
 .../assert.iterator-indexing.pass.cpp         | 174 ++++++++++++++++++
 .../debug.iterator-indexing.pass.cpp          |  97 ----------
 .../bounded_iter/dereference.pass.cpp         |  14 +-
 .../assert.iterator-indexing.pass.cpp         | 158 ++++++++++++++++
 .../debug.iterator-indexing.pass.cpp          |  92 ---------
 6 files changed, 385 insertions(+), 221 deletions(-)
 create mode 100644 libcxx/test/libcxx/containers/views/views.span/assert.iterator-indexing.pass.cpp
 delete mode 100644 libcxx/test/libcxx/containers/views/views.span/debug.iterator-indexing.pass.cpp
 create mode 100644 libcxx/test/libcxx/strings/string.view/string.view.iterators/assert.iterator-indexing.pass.cpp
 delete mode 100644 libcxx/test/libcxx/strings/string.view/string.view.iterators/debug.iterator-indexing.pass.cpp

diff --git a/libcxx/include/__iterator/bounded_iter.h b/libcxx/include/__iterator/bounded_iter.h
index 906ba3df0c578..a1a941ffbaaf1 100644
--- a/libcxx/include/__iterator/bounded_iter.h
+++ b/libcxx/include/__iterator/bounded_iter.h
@@ -31,13 +31,20 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 // Iterator wrapper that carries the valid range it is allowed to access.
 //
 // This is a simple iterator wrapper for contiguous iterators that points
-// within a [begin, end) range and carries these bounds with it. The iterator
-// ensures that it is pointing within that [begin, end) range when it is
-// dereferenced.
+// within a [begin, end] range and carries these bounds with it. The iterator
+// ensures that it is pointing within the [begin, end) range when it is
+// dereferenced. It also ensures that it is never iterated outside of
+// [begin, end]. This is important for two reasons:
 //
-// Arithmetic operations are allowed and the bounds of the resulting iterator
-// are not checked. Hence, it is possible to create an iterator pointing outside
-// its range, but it is not possible to dereference it.
+// 1. It allows `operator*` and `operator++` bounds checks to be `iter != end`.
+//    This is both less for the optimizer to prove, and aligns with how callers
+//    typically use iterators.
+//
+// 2. Advancing an iterator out of bounds is undefined behavior (see the table
+//    in [input.iterators]). In particular, when the underlying iterator is a
+//    pointer, it is undefined at the language level (see [expr.add]). If
+//    bounded iterators exhibited this undefined behavior, we risk compiler
+//    optimizations deleting non-redundant bounds checks.
 template <class _Iterator, class = __enable_if_t< __libcpp_is_contiguous_iterator<_Iterator>::value > >
 struct __bounded_iter {
   using value_type        = typename iterator_traits<_Iterator>::value_type;
@@ -51,8 +58,8 @@ struct __bounded_iter {
 
   // Create a singular iterator.
   //
-  // Such an iterator does not point to any object and is conceptually out of bounds, so it is
-  // not dereferenceable. Observing operations like comparison and assignment are valid.
+  // Such an iterator points past the end of an empty span, so it is not dereferenceable.
+  // Observing operations like comparison and assignment are valid.
   _LIBCPP_HIDE_FROM_ABI __bounded_iter() = default;
 
   _LIBCPP_HIDE_FROM_ABI __bounded_iter(__bounded_iter const&) = default;
@@ -70,18 +77,20 @@ struct __bounded_iter {
 
 private:
   // Create an iterator wrapping the given iterator, and whose bounds are described
-  // by the provided [begin, end) range.
+  // by the provided [begin, end] range.
   //
-  // This constructor does not check whether the resulting iterator is within its bounds.
-  // However, it does check that the provided [begin, end) range is a valid range (that
-  // is, begin <= end).
+  // The constructor only checks that the iterator is within its bounds via internal assertions. It
+  // is the responsibility of the container to ensure that the given bounds are valid.
   //
   // Since it is non-standard for iterators to have this constructor, __bounded_iter must
   // be created via `std::__make_bounded_iter`.
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 explicit __bounded_iter(
       _Iterator __current, _Iterator __begin, _Iterator __end)
       : __current_(__current), __begin_(__begin), __end_(__end) {
-    _LIBCPP_ASSERT_INTERNAL(__begin <= __end, "__bounded_iter(current, begin, end): [begin, end) is not a valid range");
+    _LIBCPP_ASSERT_INTERNAL(
+        __begin <= __current, "__bounded_iter(current, begin, end): current and begin are inconsistent");
+    _LIBCPP_ASSERT_INTERNAL(
+        __current <= __end, "__bounded_iter(current, begin, end): current and end are inconsistent");
   }
 
   template <class _It>
@@ -90,30 +99,37 @@ struct __bounded_iter {
 public:
   // Dereference and indexing operations.
   //
-  // These operations check that the iterator is dereferenceable, that is within [begin, end).
+  // These operations check that the iterator is dereferenceable. Since the class invariant is
+  // that the iterator is always within `[begin, end]`, we only need to check it's not pointing to
+  // `end`. This is easier for the optimizer because it aligns with the `iter != container.end()`
+  // checks that typical callers already use (see
+  // https://github.com/llvm/llvm-project/issues/78829).
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 reference operator*() const _NOEXCEPT {
     _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
-        __in_bounds(__current_), "__bounded_iter::operator*: Attempt to dereference an out-of-range iterator");
+        __current_ != __end_, "__bounded_iter::operator*: Attempt to dereference an iterator at the end");
     return *__current_;
   }
 
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pointer operator->() const _NOEXCEPT {
     _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
-        __in_bounds(__current_), "__bounded_iter::operator->: Attempt to dereference an out-of-range iterator");
+        __current_ != __end_, "__bounded_iter::operator->: Attempt to dereference an iterator at the end");
     return std::__to_address(__current_);
   }
 
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 reference operator[](difference_type __n) const _NOEXCEPT {
     _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
-        __in_bounds(__current_ + __n), "__bounded_iter::operator[]: Attempt to index an iterator out-of-range");
+        __n >= __begin_ - __current_, "__bounded_iter::operator[]: Attempt to index an iterator past the start");
+    _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
+        __n < __end_ - __current_, "__bounded_iter::operator[]: Attempt to index an iterator at or past the end");
     return __current_[__n];
   }
 
   // Arithmetic operations.
   //
-  // These operations do not check that the resulting iterator is within the bounds, since that
-  // would make it impossible to create a past-the-end iterator.
+  // These operations check that the iterator remains within `[begin, end]`.
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 __bounded_iter& operator++() _NOEXCEPT {
+    _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
+        __current_ != __end_, "__bounded_iter::operator++: Attempt to advance an iterator past the end");
     ++__current_;
     return *this;
   }
@@ -124,6 +140,8 @@ struct __bounded_iter {
   }
 
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 __bounded_iter& operator--() _NOEXCEPT {
+    _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
+        __current_ != __begin_, "__bounded_iter::operator--: Attempt to rewind an iterator past the start");
     --__current_;
     return *this;
   }
@@ -134,6 +152,10 @@ struct __bounded_iter {
   }
 
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 __bounded_iter& operator+=(difference_type __n) _NOEXCEPT {
+    _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
+        __n >= __begin_ - __current_, "__bounded_iter::operator+=: Attempt to rewind an iterator past the start");
+    _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
+        __n <= __end_ - __current_, "__bounded_iter::operator+=: Attempt to advance an iterator past the end");
     __current_ += __n;
     return *this;
   }
@@ -151,6 +173,10 @@ struct __bounded_iter {
   }
 
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 __bounded_iter& operator-=(difference_type __n) _NOEXCEPT {
+    _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
+        __n <= __current_ - __begin_, "__bounded_iter::operator-=: Attempt to rewind an iterator past the start");
+    _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
+        __n >= __current_ - __end_, "__bounded_iter::operator-=: Attempt to advance an iterator past the end");
     __current_ -= __n;
     return *this;
   }
@@ -197,15 +223,10 @@ struct __bounded_iter {
   }
 
 private:
-  // Return whether the given iterator is in the bounds of this __bounded_iter.
-  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool __in_bounds(_Iterator const& __iter) const {
-    return __iter >= __begin_ && __iter < __end_;
-  }
-
   template <class>
   friend struct pointer_traits;
   _Iterator __current_;       // current iterator
-  _Iterator __begin_, __end_; // valid range represented as [begin, end)
+  _Iterator __begin_, __end_; // valid range represented as [begin, end]
 };
 
 template <class _It>
diff --git a/libcxx/test/libcxx/containers/views/views.span/assert.iterator-indexing.pass.cpp b/libcxx/test/libcxx/containers/views/views.span/assert.iterator-indexing.pass.cpp
new file mode 100644
index 0000000000000..d4dacb1f2f1c7
--- /dev/null
+++ b/libcxx/test/libcxx/containers/views/views.span/assert.iterator-indexing.pass.cpp
@@ -0,0 +1,174 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+// Make sure that std::span's iterators check for OOB accesses when hardening is enabled.
+
+// REQUIRES: has-unix-headers, libcpp-has-abi-bounded-iterators
+// UNSUPPORTED: libcpp-hardening-mode=none
+
+#include <span>
+
+#include "check_assertion.h"
+
+struct Foo {
+  int x;
+};
+
+template <typename Iter>
+void test_iterator(Iter begin, Iter end, bool reverse) {
+  std::ptrdiff_t distance = std::distance(begin, end);
+
+  // Dereferencing an iterator at the end.
+  {
+    TEST_LIBCPP_ASSERT_FAILURE(
+        *end,
+        reverse ? "__bounded_iter::operator--: Attempt to rewind an iterator past the start"
+                : "__bounded_iter::operator*: Attempt to dereference an iterator at the end");
+#if _LIBCPP_STD_VER >= 20
+    // In C++20 mode, std::reverse_iterator implements operator->, but not operator*, with
+    // std::prev instead of operator--. std::prev ultimately calls operator+
+    TEST_LIBCPP_ASSERT_FAILURE(
+        end->x,
+        reverse ? "__bounded_iter::operator+=: Attempt to rewind an iterator past the start"
+                : "__bounded_iter::operator->: Attempt to dereference an iterator at the end");
+#else
+    TEST_LIBCPP_ASSERT_FAILURE(
+        end->x,
+        reverse ? "__bounded_iter::operator--: Attempt to rewind an iterator past the start"
+                : "__bounded_iter::operator->: Attempt to dereference an iterator at the end");
+#endif
+  }
+
+  // Incrementing an iterator past the end.
+  {
+    [[maybe_unused]] const char* msg =
+        reverse ? "__bounded_iter::operator--: Attempt to rewind an iterator past the start"
+                : "__bounded_iter::operator++: Attempt to advance an iterator past the end";
+    auto it = end;
+    TEST_LIBCPP_ASSERT_FAILURE(it++, msg);
+    TEST_LIBCPP_ASSERT_FAILURE(++it, msg);
+  }
+
+  // Decrementing an iterator past the start.
+  {
+    [[maybe_unused]] const char* msg =
+        reverse ? "__bounded_iter::operator++: Attempt to advance an iterator past the end"
+                : "__bounded_iter::operator--: Attempt to rewind an iterator past the start";
+    auto it = begin;
+    TEST_LIBCPP_ASSERT_FAILURE(it--, msg);
+    TEST_LIBCPP_ASSERT_FAILURE(--it, msg);
+  }
+
+  // Advancing past the end with operator+= and operator+.
+  {
+    [[maybe_unused]] const char* msg =
+        reverse ? "__bounded_iter::operator-=: Attempt to rewind an iterator past the start"
+                : "__bounded_iter::operator+=: Attempt to advance an iterator past the end";
+    auto it = end;
+    TEST_LIBCPP_ASSERT_FAILURE(it += 1, msg);
+    TEST_LIBCPP_ASSERT_FAILURE(end + 1, msg);
+    it = begin;
+    TEST_LIBCPP_ASSERT_FAILURE(it += (distance + 1), msg);
+    TEST_LIBCPP_ASSERT_FAILURE(begin + (distance + 1), msg);
+  }
+
+  // Advancing past the end with operator-= and operator-.
+  {
+    [[maybe_unused]] const char* msg =
+        reverse ? "__bounded_iter::operator+=: Attempt to rewind an iterator past the start"
+                : "__bounded_iter::operator-=: Attempt to advance an iterator past the end";
+    auto it = end;
+    TEST_LIBCPP_ASSERT_FAILURE(it -= (-1), msg);
+    TEST_LIBCPP_ASSERT_FAILURE(end - (-1), msg);
+    it = begin;
+    TEST_LIBCPP_ASSERT_FAILURE(it -= (-distance - 1), msg);
+    TEST_LIBCPP_ASSERT_FAILURE(begin - (-distance - 1), msg);
+  }
+
+  // Rewinding past the start with operator+= and operator+.
+  {
+    [[maybe_unused]] const char* msg =
+        reverse ? "__bounded_iter::operator-=: Attempt to advance an iterator past the end"
+                : "__bounded_iter::operator+=: Attempt to rewind an iterator past the start";
+    auto it = begin;
+    TEST_LIBCPP_ASSERT_FAILURE(it += (-1), msg);
+    TEST_LIBCPP_ASSERT_FAILURE(begin + (-1), msg);
+    it = end;
+    TEST_LIBCPP_ASSERT_FAILURE(it += (-distance - 1), msg);
+    TEST_LIBCPP_ASSERT_FAILURE(end + (-distance - 1), msg);
+  }
+
+  // Rewinding past the start with operator-= and operator-.
+  {
+    [[maybe_unused]] const char* msg =
+        reverse ? "__bounded_iter::operator+=: Attempt to advance an iterator past the end"
+                : "__bounded_iter::operator-=: Attempt to rewind an iterator past the start";
+    auto it = begin;
+    TEST_LIBCPP_ASSERT_FAILURE(it -= 1, msg);
+    TEST_LIBCPP_ASSERT_FAILURE(begin - 1, msg);
+    it = end;
+    TEST_LIBCPP_ASSERT_FAILURE(it -= (distance + 1), msg);
+    TEST_LIBCPP_ASSERT_FAILURE(end - (distance + 1), msg);
+  }
+
+  // Out-of-bounds operator[].
+  {
+    [[maybe_unused]] const char* end_msg =
+        reverse ? "__bounded_iter::operator--: Attempt to rewind an iterator past the start"
+                : "__bounded_iter::operator[]: Attempt to index an iterator at or past the end";
+    [[maybe_unused]] const char* past_end_msg =
+        reverse ? "__bounded_iter::operator-=: Attempt to rewind an iterator past the start"
+                : "__bounded_iter::operator[]: Attempt to index an iterator at or past the end";
+    [[maybe_unused]] const char* past_start_msg =
+        reverse ? "__bounded_iter::operator-=: Attempt to advance an iterator past the end"
+                : "__bounded_iter::operator[]: Attempt to index an iterator past the start";
+    TEST_LIBCPP_ASSERT_FAILURE(begin[distance], end_msg);
+    TEST_LIBCPP_ASSERT_FAILURE(begin[distance + 1], past_end_msg);
+    TEST_LIBCPP_ASSERT_FAILURE(begin[-1], past_start_msg);
+    TEST_LIBCPP_ASSERT_FAILURE(begin[-99], past_start_msg);
+
+    auto it = begin + 1;
+    TEST_LIBCPP_ASSERT_FAILURE(it[distance - 1], end_msg);
+    TEST_LIBCPP_ASSERT_FAILURE(it[distance], past_end_msg);
+    TEST_LIBCPP_ASSERT_FAILURE(it[-2], past_start_msg);
+    TEST_LIBCPP_ASSERT_FAILURE(it[-99], past_start_msg);
+  }
+}
+
+int main(int, char**) {
+  // span<T>::iterator
+  {
+    Foo array[] = {{0}, {1}, {2}};
+    std::span<Foo> const span(array, 3);
+    test_iterator(span.begin(), span.end(), /*reverse=*/false);
+  }
+
+  // span<T, N>::iterator
+  {
+    Foo array[] = {{0}, {1}, {2}};
+    std::span<Foo, 3> const span(array, 3);
+    test_iterator(span.begin(), span.end(), /*reverse=*/false);
+  }
+
+  // span<T>::reverse_iterator
+  {
+    Foo array[] = {{0}, {1}, {2}};
+    std::span<Foo> const span(array, 3);
+    test_iterator(span.rbegin(), span.rend(), /*reverse=*/true);
+  }
+
+  // span<T, N>::reverse_iterator
+  {
+    Foo array[] = {{0}, {1}, {2}};
+    std::span<Foo, 3> const span(array, 3);
+    test_iterator(span.rbegin(), span.rend(), /*reverse=*/true);
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/containers/views/views.span/debug.iterator-indexing.pass.cpp b/libcxx/test/libcxx/containers/views/views.span/debug.iterator-indexing.pass.cpp
deleted file mode 100644
index 360e7a981a0df..0000000000000
--- a/libcxx/test/libcxx/containers/views/views.span/debug.iterator-indexing.pass.cpp
+++ /dev/null
@@ -1,97 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// UNSUPPORTED: c++03, c++11, c++14, c++17
-
-// Make sure that std::span's iterators check for OOB accesses when the debug mode is enabled.
-
-// REQUIRES: has-unix-headers, libcpp-has-abi-bounded-iterators
-// UNSUPPORTED: libcpp-hardening-mode=none
-
-#include <span>
-
-#include "check_assertion.h"
-
-struct Foo {
-    int x;
-};
-
-int main(int, char**) {
-    // span<T>::iterator
-    {
-        Foo array[] = {{0}, {1}, {2}};
-        std::span<Foo> const span(array, 3);
-        {
-            auto it = span.end();
-            TEST_LIBCPP_ASSERT_FAILURE(*it, "__bounded_iter::operator*: Attempt to dereference an out-of-range iterator");
-        }
-        {
-            auto it = span.end();
-            TEST_LIBCPP_ASSERT_FAILURE(it->x, "__bounded_iter::operator->: Attempt to dereference an out-of-range iterator");
-        }
-        {
-            auto it = span.begin();
-            TEST_LIBCPP_ASSERT_FAILURE(it[3], "__bounded_iter::operator[]: Attempt to index an iterator out-of-range");
-        }
-    }
-
-    // span<T, N>::iterator
-    {
-        Foo array[] = {{0}, {1}, {2}};
-        std::span<Foo, 3> const span(array, 3);
-        {
-            auto it = span.end();
-            TEST_LIBCPP_ASSERT_FAILURE(*it, "__bounded_iter::operator*: Attempt to dereference an out-of-range iterator");
-        }
-        {
-            auto it = span.end();
-            TEST_LIBCPP_ASSERT_FAILURE(it->x, "__bounded_iter::operator->: Attempt to dereference an out-of-range iterator");
-        }
-        {
-            auto it = span.begin();
-            TEST_LIBCPP_ASSERT_FAILURE(it[3], "__bounded_iter::operator[]: Attempt to index an iterator out-of-range");
-        }
-    }
-
-    // span<T>::reverse_iterator
-    {
-        Foo array[] = {{0}, {1}, {2}};
-        std::span<Foo> const span(array, 3);
-        {
-            auto it = span.rend();
-            TEST_LIBCPP_ASSERT_FAILURE(*it, "__bounded_iter::operator*: Attempt to dereference an out-of-range iterator");
-        }
-        {
-            auto it = span.rend();
-            TEST_LIBCPP_ASSERT_FAILURE(it->x, "__bounded_iter::operator->: Attempt to dereference an out-of-range iterator");
-        }
-        {
-            auto it = span.rbegin();
-            TEST_LIBCPP_ASSERT_FAILURE(it[3], "__bounded_iter::operator*: Attempt to dereference an out-of-range iterator");
-        }
-    }
-
-    // span<T, N>::reverse_iterator
-    {
-        Foo array[] = {{0}, {1}, {2}};
-        std::span<Foo, 3> const span(array, 3);
-        {
-            auto it = span.rend();
-            TEST_LIBCPP_ASSERT_FAILURE(*it, "__bounded_iter::operator*: Attempt to dereference an out-of-range iterator");
-        }
-        {
-            auto it = span.rend();
-            TEST_LIBCPP_ASSERT_FAILURE(it->x, "__bounded_iter::operator->: Attempt to dereference an out-of-range iterator");
-        }
-        {
-            auto it = span.rbegin();
-            TEST_LIBCPP_ASSERT_FAILURE(it[3], "__bounded_iter::operator*: Attempt to dereference an out-of-range iterator");
-        }
-    }
-
-    return 0;
-}
diff --git a/libcxx/test/libcxx/iterators/bounded_iter/dereference.pass.cpp b/libcxx/test/libcxx/iterators/bounded_iter/dereference.pass.cpp
index 8eee4ad2f319a..bf723f14e80a9 100644
--- a/libcxx/test/libcxx/iterators/bounded_iter/dereference.pass.cpp
+++ b/libcxx/test/libcxx/iterators/bounded_iter/dereference.pass.cpp
@@ -58,15 +58,15 @@ void test_death() {
   std::__bounded_iter<Iter> const oob  = std::__make_bounded_iter(Iter(e), Iter(b), Iter(e));
 
   // operator*
-  TEST_LIBCPP_ASSERT_FAILURE(*oob, "__bounded_iter::operator*: Attempt to dereference an out-of-range iterator");
+  TEST_LIBCPP_ASSERT_FAILURE(*oob, "__bounded_iter::operator*: Attempt to dereference an iterator at the end");
   // operator->
-  TEST_LIBCPP_ASSERT_FAILURE(oob->x, "__bounded_iter::operator->: Attempt to dereference an out-of-range iterator");
+  TEST_LIBCPP_ASSERT_FAILURE(oob->x, "__bounded_iter::operator->: Attempt to dereference an iterator at the end");
   // operator[]
-  TEST_LIBCPP_ASSERT_FAILURE(iter[-1], "__bounded_iter::operator[]: Attempt to index an iterator out-of-range");
-  TEST_LIBCPP_ASSERT_FAILURE(iter[5], "__bounded_iter::operator[]: Attempt to index an iterator out-of-range");
-  TEST_LIBCPP_ASSERT_FAILURE(oob[0], "__bounded_iter::operator[]: Attempt to index an iterator out-of-range");
-  TEST_LIBCPP_ASSERT_FAILURE(oob[1], "__bounded_iter::operator[]: Attempt to index an iterator out-of-range");
-  TEST_LIBCPP_ASSERT_FAILURE(oob[-6], "__bounded_iter::operator[]: Attempt to index an iterator out-of-range");
+  TEST_LIBCPP_ASSERT_FAILURE(iter[-1], "__bounded_iter::operator[]: Attempt to index an iterator past the start");
+  TEST_LIBCPP_ASSERT_FAILURE(iter[5], "__bounded_iter::operator[]: Attempt to index an iterator at or past the end");
+  TEST_LIBCPP_ASSERT_FAILURE(oob[0], "__bounded_iter::operator[]: Attempt to index an iterator at or past the end");
+  TEST_LIBCPP_ASSERT_FAILURE(oob[1], "__bounded_iter::operator[]: Attempt to index an iterator at or past the end");
+  TEST_LIBCPP_ASSERT_FAILURE(oob[-6], "__bounded_iter::operator[]: Attempt to index an iterator past the start");
 }
 
 int main(int, char**) {
diff --git a/libcxx/test/libcxx/strings/string.view/string.view.iterators/assert.iterator-indexing.pass.cpp b/libcxx/test/libcxx/strings/string.view/string.view.iterators/assert.iterator-indexing.pass.cpp
new file mode 100644
index 0000000000000..5043a88cbc3da
--- /dev/null
+++ b/libcxx/test/libcxx/strings/string.view/string.view.iterators/assert.iterator-indexing.pass.cpp
@@ -0,0 +1,158 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Make sure that std::string_view's bounded iterators check for OOB accesses when a hardening mode is enabled.
+
+// REQUIRES: has-unix-headers, libcpp-has-abi-bounded-iterators
+// UNSUPPORTED: libcpp-hardening-mode=none
+
+#include <iterator>
+#include <string_view>
+
+#include "check_assertion.h"
+
+template <typename Iter>
+void test_iterator(Iter begin, Iter end, bool reverse) {
+  ptrdiff_t distance = std::distance(begin, end);
+
+  // Dereferencing an iterator at the end.
+  {
+    TEST_LIBCPP_ASSERT_FAILURE(
+        *end,
+        reverse ? "__bounded_iter::operator--: Attempt to rewind an iterator past the start"
+                : "__bounded_iter::operator*: Attempt to dereference an iterator at the end");
+#if _LIBCPP_STD_VER >= 20
+    // In C++20 mode, std::reverse_iterator implements operator->, but not operator*, with
+    // std::prev instead of operator--. std::prev ultimately calls operator+=, hence the different message.
+    TEST_LIBCPP_ASSERT_FAILURE(
+        end.operator->(),
+        reverse ? "__bounded_iter::operator+=: Attempt to rewind an iterator past the start"
+                : "__bounded_iter::operator->: Attempt to dereference an iterator at the end");
+#else
+    TEST_LIBCPP_ASSERT_FAILURE(
+        end.operator->(),
+        reverse ? "__bounded_iter::operator--: Attempt to rewind an iterator past the start"
+                : "__bounded_iter::operator->: Attempt to dereference an iterator at the end");
+#endif
+  }
+
+  // Incrementing an iterator past the end.
+  {
+    [[maybe_unused]] const char* msg =
+        reverse ? "__bounded_iter::operator--: Attempt to rewind an iterator past the start"
+                : "__bounded_iter::operator++: Attempt to advance an iterator past the end";
+    auto it = end;
+    TEST_LIBCPP_ASSERT_FAILURE(it++, msg);
+    it = end;
+    TEST_LIBCPP_ASSERT_FAILURE(++it, msg);
+  }
+
+  // Decrementing an iterator past the start.
+  {
+    [[maybe_unused]] const char* msg =
+        reverse ? "__bounded_iter::operator++: Attempt to advance an iterator past the end"
+                : "__bounded_iter::operator--: Attempt to rewind an iterator past the start";
+    auto it = begin;
+    TEST_LIBCPP_ASSERT_FAILURE(it--, msg);
+    it = begin;
+    TEST_LIBCPP_ASSERT_FAILURE(--it, msg);
+  }
+
+  // Advancing past the end with operator+= and operator+.
+  {
+    [[maybe_unused]] const char* msg =
+        reverse ? "__bounded_iter::operator-=: Attempt to rewind an iterator past the start"
+                : "__bounded_iter::operator+=: Attempt to advance an iterator past the end";
+    auto it = end;
+    TEST_LIBCPP_ASSERT_FAILURE(it += 1, msg);
+    TEST_LIBCPP_ASSERT_FAILURE(end + 1, msg);
+    it = begin;
+    TEST_LIBCPP_ASSERT_FAILURE(it += (distance + 1), msg);
+    TEST_LIBCPP_ASSERT_FAILURE(begin + (distance + 1), msg);
+  }
+
+  // Advancing past the end with operator-= and operator- (by subtracting a negative offset).
+  {
+    [[maybe_unused]] const char* msg =
+        reverse ? "__bounded_iter::operator+=: Attempt to rewind an iterator past the start"
+                : "__bounded_iter::operator-=: Attempt to advance an iterator past the end";
+    auto it = end;
+    TEST_LIBCPP_ASSERT_FAILURE(it -= (-1), msg);
+    TEST_LIBCPP_ASSERT_FAILURE(end - (-1), msg);
+    it = begin;
+    TEST_LIBCPP_ASSERT_FAILURE(it -= (-distance - 1), msg);
+    TEST_LIBCPP_ASSERT_FAILURE(begin - (-distance - 1), msg);
+  }
+
+  // Rewinding past the start with operator+= and operator+ (by adding a negative offset).
+  {
+    [[maybe_unused]] const char* msg =
+        reverse ? "__bounded_iter::operator-=: Attempt to advance an iterator past the end"
+                : "__bounded_iter::operator+=: Attempt to rewind an iterator past the start";
+    auto it = begin;
+    TEST_LIBCPP_ASSERT_FAILURE(it += (-1), msg);
+    TEST_LIBCPP_ASSERT_FAILURE(begin + (-1), msg);
+    it = end;
+    TEST_LIBCPP_ASSERT_FAILURE(it += (-distance - 1), msg);
+    TEST_LIBCPP_ASSERT_FAILURE(end + (-distance - 1), msg);
+  }
+
+  // Rewinding past the start with operator-= and operator-.
+  {
+    [[maybe_unused]] const char* msg =
+        reverse ? "__bounded_iter::operator+=: Attempt to advance an iterator past the end"
+                : "__bounded_iter::operator-=: Attempt to rewind an iterator past the start";
+    auto it = begin;
+    TEST_LIBCPP_ASSERT_FAILURE(it -= 1, msg);
+    TEST_LIBCPP_ASSERT_FAILURE(begin - 1, msg);
+    it = end;
+    TEST_LIBCPP_ASSERT_FAILURE(it -= (distance + 1), msg);
+    TEST_LIBCPP_ASSERT_FAILURE(end - (distance + 1), msg);
+  }
+
+  // Out-of-bounds operator[].
+  {
+    [[maybe_unused]] const char* end_msg =
+        reverse ? "__bounded_iter::operator--: Attempt to rewind an iterator past the start"
+                : "__bounded_iter::operator[]: Attempt to index an iterator at or past the end";
+    [[maybe_unused]] const char* past_end_msg =
+        reverse ? "__bounded_iter::operator-=: Attempt to rewind an iterator past the start"
+                : "__bounded_iter::operator[]: Attempt to index an iterator at or past the end";
+    [[maybe_unused]] const char* past_start_msg =
+        reverse ? "__bounded_iter::operator-=: Attempt to advance an iterator past the end"
+                : "__bounded_iter::operator[]: Attempt to index an iterator past the start";
+    TEST_LIBCPP_ASSERT_FAILURE(begin[distance], end_msg);
+    TEST_LIBCPP_ASSERT_FAILURE(begin[distance + 1], past_end_msg);
+    TEST_LIBCPP_ASSERT_FAILURE(begin[-1], past_start_msg);
+    TEST_LIBCPP_ASSERT_FAILURE(begin[-99], past_start_msg);
+
+    auto it = begin + 1;
+    TEST_LIBCPP_ASSERT_FAILURE(it[distance - 1], end_msg);
+    TEST_LIBCPP_ASSERT_FAILURE(it[distance], past_end_msg);
+    TEST_LIBCPP_ASSERT_FAILURE(it[-2], past_start_msg);
+    TEST_LIBCPP_ASSERT_FAILURE(it[-99], past_start_msg);
+  }
+}
+
+int main(int, char**) {
+  std::string_view const str("hello world");
+
+  // string_view::iterator
+  test_iterator(str.begin(), str.end(), /*reverse=*/false);
+
+  // string_view::const_iterator
+  test_iterator(str.cbegin(), str.cend(), /*reverse=*/false);
+
+  // string_view::reverse_iterator
+  test_iterator(str.rbegin(), str.rend(), /*reverse=*/true);
+
+  // string_view::const_reverse_iterator
+  test_iterator(str.crbegin(), str.crend(), /*reverse=*/true);
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/strings/string.view/string.view.iterators/debug.iterator-indexing.pass.cpp b/libcxx/test/libcxx/strings/string.view/string.view.iterators/debug.iterator-indexing.pass.cpp
deleted file mode 100644
index 5064319a0aee1..0000000000000
--- a/libcxx/test/libcxx/strings/string.view/string.view.iterators/debug.iterator-indexing.pass.cpp
+++ /dev/null
@@ -1,92 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// Make sure that std::string_view's iterators check for OOB accesses when the debug mode is enabled.
-
-// REQUIRES: has-unix-headers, libcpp-has-abi-bounded-iterators
-// UNSUPPORTED: libcpp-hardening-mode=none
-
-#include <string_view>
-
-#include "check_assertion.h"
-
-int main(int, char**) {
-  // string_view::iterator
-  {
-    std::string_view const str("hello world");
-    {
-      auto it = str.end();
-      TEST_LIBCPP_ASSERT_FAILURE(*it, "__bounded_iter::operator*: Attempt to dereference an out-of-range iterator");
-    }
-    {
-      auto it = str.end();
-      TEST_LIBCPP_ASSERT_FAILURE(
-          it.operator->(), "__bounded_iter::operator->: Attempt to dereference an out-of-range iterator");
-    }
-    {
-      auto it = str.begin();
-      TEST_LIBCPP_ASSERT_FAILURE(it[99], "__bounded_iter::operator[]: Attempt to index an iterator out-of-range");
-    }
-  }
-
-  // string_view::const_iterator
-  {
-    std::string_view const str("hello world");
-    {
-      auto it = str.cend();
-      TEST_LIBCPP_ASSERT_FAILURE(*it, "__bounded_iter::operator*: Attempt to dereference an out-of-range iterator");
-    }
-    {
-      auto it = str.cend();
-      TEST_LIBCPP_ASSERT_FAILURE(
-          it.operator->(), "__bounded_iter::operator->: Attempt to dereference an out-of-range iterator");
-    }
-    {
-      auto it = str.cbegin();
-      TEST_LIBCPP_ASSERT_FAILURE(it[99], "__bounded_iter::operator[]: Attempt to index an iterator out-of-range");
-    }
-  }
-
-  // string_view::reverse_iterator
-  {
-    std::string_view const str("hello world");
-    {
-      auto it = str.rend();
-      TEST_LIBCPP_ASSERT_FAILURE(*it, "__bounded_iter::operator*: Attempt to dereference an out-of-range iterator");
-    }
-    {
-      auto it = str.rend();
-      TEST_LIBCPP_ASSERT_FAILURE(
-          it.operator->(), "__bounded_iter::operator->: Attempt to dereference an out-of-range iterator");
-    }
-    {
-      auto it = str.rbegin();
-      TEST_LIBCPP_ASSERT_FAILURE(it[99], "__bounded_iter::operator*: Attempt to dereference an out-of-range iterator");
-    }
-  }
-
-  // string_view::const_reverse_iterator
-  {
-    std::string_view const str("hello world");
-    {
-      auto it = str.crend();
-      TEST_LIBCPP_ASSERT_FAILURE(*it, "__bounded_iter::operator*: Attempt to dereference an out-of-range iterator");
-    }
-    {
-      auto it = str.crend();
-      TEST_LIBCPP_ASSERT_FAILURE(
-          it.operator->(), "__bounded_iter::operator->: Attempt to dereference an out-of-range iterator");
-    }
-    {
-      auto it = str.crbegin();
-      TEST_LIBCPP_ASSERT_FAILURE(it[99], "__bounded_iter::operator*: Attempt to dereference an out-of-range iterator");
-    }
-  }
-
-  return 0;
-}

From d02d8df0cd797342f7042440e07133e99ad5e0a2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= <tbaeder@redhat.com>
Date: Mon, 11 Mar 2024 15:42:02 +0100
Subject: [PATCH 75/95] [clang][Interp] Implement _Complex negation

Factor complex unary operations into their own function.
---
 clang/lib/AST/Interp/ByteCodeExprGen.cpp | 89 ++++++++++++++++++++----
 clang/lib/AST/Interp/ByteCodeExprGen.h   |  1 +
 2 files changed, 78 insertions(+), 12 deletions(-)

diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
index a384e191464fe..0dd645990d1d5 100644
--- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp
+++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
@@ -2959,6 +2959,8 @@ bool ByteCodeExprGen<Emitter>::VisitCXXThisExpr(const CXXThisExpr *E) {
 template <class Emitter>
 bool ByteCodeExprGen<Emitter>::VisitUnaryOperator(const UnaryOperator *E) {
   const Expr *SubExpr = E->getSubExpr();
+  if (SubExpr->getType()->isAnyComplexType())
+    return this->VisitComplexUnaryOperator(E);
   std::optional<PrimType> T = classify(SubExpr->getType());
 
   switch (E->getOpcode()) {
@@ -3109,16 +3111,81 @@ bool ByteCodeExprGen<Emitter>::VisitUnaryOperator(const UnaryOperator *E) {
       return false;
     return DiscardResult ? this->emitPop(*T, E) : this->emitComp(*T, E);
   case UO_Real: // __real x
-    if (T)
-      return this->delegate(SubExpr);
-    return this->emitComplexReal(SubExpr);
+    assert(T);
+    return this->delegate(SubExpr);
   case UO_Imag: { // __imag x
-    if (T) {
-      if (!this->discard(SubExpr))
+    assert(T);
+    if (!this->discard(SubExpr))
+      return false;
+    return this->visitZeroInitializer(*T, SubExpr->getType(), SubExpr);
+  }
+  case UO_Extension:
+    return this->delegate(SubExpr);
+  case UO_Coawait:
+    assert(false && "Unhandled opcode");
+  }
+
+  return false;
+}
+
+template <class Emitter>
+bool ByteCodeExprGen<Emitter>::VisitComplexUnaryOperator(
+    const UnaryOperator *E) {
+  const Expr *SubExpr = E->getSubExpr();
+  assert(SubExpr->getType()->isAnyComplexType());
+
+  if (DiscardResult)
+    return this->discard(SubExpr);
+
+  std::optional<PrimType> ResT = classify(E);
+
+  // Prepare storage for result.
+  if (!ResT && !Initializing) {
+    std::optional<unsigned> LocalIndex =
+        allocateLocal(SubExpr, /*IsExtended=*/false);
+    if (!LocalIndex)
+      return false;
+    if (!this->emitGetPtrLocal(*LocalIndex, E))
+      return false;
+  }
+
+  // The offset of the temporary, if we created one.
+  unsigned SubExprOffset = ~0u;
+  auto createTemp = [=, &SubExprOffset]() -> bool {
+    SubExprOffset = this->allocateLocalPrimitive(SubExpr, PT_Ptr, true, false);
+    if (!this->visit(SubExpr))
+      return false;
+    return this->emitSetLocal(PT_Ptr, SubExprOffset, E);
+  };
+
+  PrimType ElemT = classifyComplexElementType(SubExpr->getType());
+  auto getElem = [=](unsigned Offset, unsigned Index) -> bool {
+    if (!this->emitGetLocal(PT_Ptr, Offset, E))
+      return false;
+    return this->emitArrayElemPop(ElemT, Index, E);
+  };
+
+  switch (E->getOpcode()) {
+  case UO_Minus:
+    if (!createTemp())
+      return false;
+    for (unsigned I = 0; I != 2; ++I) {
+      if (!getElem(SubExprOffset, I))
+        return false;
+      if (!this->emitNeg(ElemT, E))
+        return false;
+      if (!this->emitInitElem(ElemT, I, E))
         return false;
-      return this->visitZeroInitializer(*T, SubExpr->getType(), SubExpr);
     }
+    break;
+
+  case UO_AddrOf:
+    return this->delegate(SubExpr);
 
+  case UO_Real:
+    return this->emitComplexReal(SubExpr);
+
+  case UO_Imag:
     if (!this->visit(SubExpr))
       return false;
 
@@ -3131,14 +3198,12 @@ bool ByteCodeExprGen<Emitter>::VisitUnaryOperator(const UnaryOperator *E) {
     // Since our _Complex implementation does not map to a primitive type,
     // we sometimes have to do the lvalue-to-rvalue conversion here manually.
     return this->emitArrayElemPop(classifyPrim(E->getType()), 1, E);
-  }
-  case UO_Extension:
-    return this->delegate(SubExpr);
-  case UO_Coawait:
-    assert(false && "Unhandled opcode");
+
+  default:
+    return this->emitInvalid(E);
   }
 
-  return false;
+  return true;
 }
 
 template <class Emitter>
diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.h b/clang/lib/AST/Interp/ByteCodeExprGen.h
index 5977bb5e6ff25..5ad2e74d7c269 100644
--- a/clang/lib/AST/Interp/ByteCodeExprGen.h
+++ b/clang/lib/AST/Interp/ByteCodeExprGen.h
@@ -75,6 +75,7 @@ class ByteCodeExprGen : public ConstStmtVisitor<ByteCodeExprGen<Emitter>, bool>,
   bool VisitGNUNullExpr(const GNUNullExpr *E);
   bool VisitCXXThisExpr(const CXXThisExpr *E);
   bool VisitUnaryOperator(const UnaryOperator *E);
+  bool VisitComplexUnaryOperator(const UnaryOperator *E);
   bool VisitDeclRefExpr(const DeclRefExpr *E);
   bool VisitImplicitValueInitExpr(const ImplicitValueInitExpr *E);
   bool VisitSubstNonTypeTemplateParmExpr(const SubstNonTypeTemplateParmExpr *E);

From 71590e7d1ec29c3ba9f6f5b4cfe36345a7ccd25b Mon Sep 17 00:00:00 2001
From: Shengchen Kan <shengchen.kan@intel.com>
Date: Tue, 12 Mar 2024 13:08:44 +0800
Subject: [PATCH 76/95] [X86][test] Add missing enc/dec tests for CTEST

These tests were accidentally missed in #83863
---
 llvm/test/MC/Disassembler/X86/apx/ctest.txt | 1026 +++++++++++++++++++
 llvm/test/MC/X86/apx/ctest-att.s            |  773 ++++++++++++++
 llvm/test/MC/X86/apx/ctest-intel.s          |  770 ++++++++++++++
 3 files changed, 2569 insertions(+)
 create mode 100644 llvm/test/MC/Disassembler/X86/apx/ctest.txt
 create mode 100644 llvm/test/MC/X86/apx/ctest-att.s
 create mode 100644 llvm/test/MC/X86/apx/ctest-intel.s

diff --git a/llvm/test/MC/Disassembler/X86/apx/ctest.txt b/llvm/test/MC/Disassembler/X86/apx/ctest.txt
new file mode 100644
index 0000000000000..9a29a98b5d788
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/apx/ctest.txt
@@ -0,0 +1,1026 @@
+# RUN: llvm-mc -triple x86_64 -disassemble %s | FileCheck %s --check-prefix=ATT
+# RUN: llvm-mc -triple x86_64 -disassemble -output-asm-variant=1 %s | FileCheck %s --check-prefix=INTEL
+
+# ATT:   ctestbb {dfv=of} $123, 123(%r8,%rax,4)
+# INTEL: ctestb {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+0x62,0xd4,0x44,0x02,0xf6,0x44,0x80,0x7b,0x7b
+
+# ATT:   ctestbw {dfv=of} $1234, 123(%r8,%rax,4)
+# INTEL: ctestb {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+0x62,0xd4,0x45,0x02,0xf7,0x44,0x80,0x7b,0xd2,0x04
+
+# ATT:   ctestbl {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctestb {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0x44,0x02,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestbq {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctestb {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0xc4,0x02,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestbb {dfv=of} %bl, 123(%r8,%rax,4)
+# INTEL: ctestb {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+0x62,0xd4,0x44,0x02,0x84,0x5c,0x80,0x7b
+
+# ATT:   ctestbw {dfv=of} %dx, 123(%r8,%rax,4)
+# INTEL: ctestb {dfv=of} word ptr [r8 + 4*rax + 123], dx
+0x62,0xd4,0x45,0x02,0x85,0x54,0x80,0x7b
+
+# ATT:   ctestbl {dfv=of} %ecx, 123(%r8,%rax,4)
+# INTEL: ctestb {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+0x62,0xd4,0x44,0x02,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestbq {dfv=of} %r9, 123(%r8,%rax,4)
+# INTEL: ctestb {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+0x62,0x54,0xc4,0x02,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestbb {dfv=of} $123, %bl
+# INTEL: ctestb {dfv=of} bl, 123
+0x62,0xf4,0x44,0x02,0xf6,0xc3,0x7b
+
+# ATT:   ctestbw {dfv=of} $1234, %dx
+# INTEL: ctestb {dfv=of} dx, 1234
+0x62,0xf4,0x45,0x02,0xf7,0xc2,0xd2,0x04
+
+# ATT:   ctestbl {dfv=of} $123456, %ecx
+# INTEL: ctestb {dfv=of} ecx, 123456
+0x62,0xf4,0x44,0x02,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestbq {dfv=of} $123456, %r9
+# INTEL: ctestb {dfv=of} r9, 123456
+0x62,0xd4,0xc4,0x02,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestbb {dfv=of} %bl, %dl
+# INTEL: ctestb {dfv=of} dl, bl
+0x62,0xf4,0x44,0x02,0x84,0xda
+
+# ATT:   ctestbw {dfv=of} %dx, %ax
+# INTEL: ctestb {dfv=of} ax, dx
+0x62,0xf4,0x45,0x02,0x85,0xd0
+
+# ATT:   ctestbl {dfv=of} %ecx, %edx
+# INTEL: ctestb {dfv=of} edx, ecx
+0x62,0xf4,0x44,0x02,0x85,0xca
+
+# ATT:   ctestbq {dfv=of} %r9, %r15
+# INTEL: ctestb {dfv=of} r15, r9
+0x62,0x54,0xc4,0x02,0x85,0xcf
+
+# ATT:   ctestbeb {dfv=of} $123, 123(%r8,%rax,4)
+# INTEL: ctestbe {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+0x62,0xd4,0x44,0x06,0xf6,0x44,0x80,0x7b,0x7b
+
+# ATT:   ctestbew {dfv=of} $1234, 123(%r8,%rax,4)
+# INTEL: ctestbe {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+0x62,0xd4,0x45,0x06,0xf7,0x44,0x80,0x7b,0xd2,0x04
+
+# ATT:   ctestbel {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctestbe {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0x44,0x06,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestbeq {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctestbe {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0xc4,0x06,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestbeb {dfv=of} %bl, 123(%r8,%rax,4)
+# INTEL: ctestbe {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+0x62,0xd4,0x44,0x06,0x84,0x5c,0x80,0x7b
+
+# ATT:   ctestbew {dfv=of} %dx, 123(%r8,%rax,4)
+# INTEL: ctestbe {dfv=of} word ptr [r8 + 4*rax + 123], dx
+0x62,0xd4,0x45,0x06,0x85,0x54,0x80,0x7b
+
+# ATT:   ctestbel {dfv=of} %ecx, 123(%r8,%rax,4)
+# INTEL: ctestbe {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+0x62,0xd4,0x44,0x06,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestbeq {dfv=of} %r9, 123(%r8,%rax,4)
+# INTEL: ctestbe {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+0x62,0x54,0xc4,0x06,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestbeb {dfv=of} $123, %bl
+# INTEL: ctestbe {dfv=of} bl, 123
+0x62,0xf4,0x44,0x06,0xf6,0xc3,0x7b
+
+# ATT:   ctestbew {dfv=of} $1234, %dx
+# INTEL: ctestbe {dfv=of} dx, 1234
+0x62,0xf4,0x45,0x06,0xf7,0xc2,0xd2,0x04
+
+# ATT:   ctestbel {dfv=of} $123456, %ecx
+# INTEL: ctestbe {dfv=of} ecx, 123456
+0x62,0xf4,0x44,0x06,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestbeq {dfv=of} $123456, %r9
+# INTEL: ctestbe {dfv=of} r9, 123456
+0x62,0xd4,0xc4,0x06,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestbeb {dfv=of} %bl, %dl
+# INTEL: ctestbe {dfv=of} dl, bl
+0x62,0xf4,0x44,0x06,0x84,0xda
+
+# ATT:   ctestbew {dfv=of} %dx, %ax
+# INTEL: ctestbe {dfv=of} ax, dx
+0x62,0xf4,0x45,0x06,0x85,0xd0
+
+# ATT:   ctestbel {dfv=of} %ecx, %edx
+# INTEL: ctestbe {dfv=of} edx, ecx
+0x62,0xf4,0x44,0x06,0x85,0xca
+
+# ATT:   ctestbeq {dfv=of} %r9, %r15
+# INTEL: ctestbe {dfv=of} r15, r9
+0x62,0x54,0xc4,0x06,0x85,0xcf
+
+# ATT:   ctestfb {dfv=of} $123, 123(%r8,%rax,4)
+# INTEL: ctestf {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+0x62,0xd4,0x44,0x0b,0xf6,0x44,0x80,0x7b,0x7b
+
+# ATT:   ctestfw {dfv=of} $1234, 123(%r8,%rax,4)
+# INTEL: ctestf {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+0x62,0xd4,0x45,0x0b,0xf7,0x44,0x80,0x7b,0xd2,0x04
+
+# ATT:   ctestfl {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctestf {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0x44,0x0b,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestfq {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctestf {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0xc4,0x0b,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestfb {dfv=of} %bl, 123(%r8,%rax,4)
+# INTEL: ctestf {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+0x62,0xd4,0x44,0x0b,0x84,0x5c,0x80,0x7b
+
+# ATT:   ctestfw {dfv=of} %dx, 123(%r8,%rax,4)
+# INTEL: ctestf {dfv=of} word ptr [r8 + 4*rax + 123], dx
+0x62,0xd4,0x45,0x0b,0x85,0x54,0x80,0x7b
+
+# ATT:   ctestfl {dfv=of} %ecx, 123(%r8,%rax,4)
+# INTEL: ctestf {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+0x62,0xd4,0x44,0x0b,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestfq {dfv=of} %r9, 123(%r8,%rax,4)
+# INTEL: ctestf {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+0x62,0x54,0xc4,0x0b,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestfb {dfv=of} $123, %bl
+# INTEL: ctestf {dfv=of} bl, 123
+0x62,0xf4,0x44,0x0b,0xf6,0xc3,0x7b
+
+# ATT:   ctestfw {dfv=of} $1234, %dx
+# INTEL: ctestf {dfv=of} dx, 1234
+0x62,0xf4,0x45,0x0b,0xf7,0xc2,0xd2,0x04
+
+# ATT:   ctestfl {dfv=of} $123456, %ecx
+# INTEL: ctestf {dfv=of} ecx, 123456
+0x62,0xf4,0x44,0x0b,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestfq {dfv=of} $123456, %r9
+# INTEL: ctestf {dfv=of} r9, 123456
+0x62,0xd4,0xc4,0x0b,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestfb {dfv=of} %bl, %dl
+# INTEL: ctestf {dfv=of} dl, bl
+0x62,0xf4,0x44,0x0b,0x84,0xda
+
+# ATT:   ctestfw {dfv=of} %dx, %ax
+# INTEL: ctestf {dfv=of} ax, dx
+0x62,0xf4,0x45,0x0b,0x85,0xd0
+
+# ATT:   ctestfl {dfv=of} %ecx, %edx
+# INTEL: ctestf {dfv=of} edx, ecx
+0x62,0xf4,0x44,0x0b,0x85,0xca
+
+# ATT:   ctestfq {dfv=of} %r9, %r15
+# INTEL: ctestf {dfv=of} r15, r9
+0x62,0x54,0xc4,0x0b,0x85,0xcf
+
+# ATT:   ctestlb {dfv=of} $123, 123(%r8,%rax,4)
+# INTEL: ctestl {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+0x62,0xd4,0x44,0x0c,0xf6,0x44,0x80,0x7b,0x7b
+
+# ATT:   ctestlw {dfv=of} $1234, 123(%r8,%rax,4)
+# INTEL: ctestl {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+0x62,0xd4,0x45,0x0c,0xf7,0x44,0x80,0x7b,0xd2,0x04
+
+# ATT:   ctestll {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctestl {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0x44,0x0c,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestlq {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctestl {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0xc4,0x0c,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestlb {dfv=of} %bl, 123(%r8,%rax,4)
+# INTEL: ctestl {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+0x62,0xd4,0x44,0x0c,0x84,0x5c,0x80,0x7b
+
+# ATT:   ctestlw {dfv=of} %dx, 123(%r8,%rax,4)
+# INTEL: ctestl {dfv=of} word ptr [r8 + 4*rax + 123], dx
+0x62,0xd4,0x45,0x0c,0x85,0x54,0x80,0x7b
+
+# ATT:   ctestll {dfv=of} %ecx, 123(%r8,%rax,4)
+# INTEL: ctestl {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+0x62,0xd4,0x44,0x0c,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestlq {dfv=of} %r9, 123(%r8,%rax,4)
+# INTEL: ctestl {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+0x62,0x54,0xc4,0x0c,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestlb {dfv=of} $123, %bl
+# INTEL: ctestl {dfv=of} bl, 123
+0x62,0xf4,0x44,0x0c,0xf6,0xc3,0x7b
+
+# ATT:   ctestlw {dfv=of} $1234, %dx
+# INTEL: ctestl {dfv=of} dx, 1234
+0x62,0xf4,0x45,0x0c,0xf7,0xc2,0xd2,0x04
+
+# ATT:   ctestll {dfv=of} $123456, %ecx
+# INTEL: ctestl {dfv=of} ecx, 123456
+0x62,0xf4,0x44,0x0c,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestlq {dfv=of} $123456, %r9
+# INTEL: ctestl {dfv=of} r9, 123456
+0x62,0xd4,0xc4,0x0c,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestlb {dfv=of} %bl, %dl
+# INTEL: ctestl {dfv=of} dl, bl
+0x62,0xf4,0x44,0x0c,0x84,0xda
+
+# ATT:   ctestlw {dfv=of} %dx, %ax
+# INTEL: ctestl {dfv=of} ax, dx
+0x62,0xf4,0x45,0x0c,0x85,0xd0
+
+# ATT:   ctestll {dfv=of} %ecx, %edx
+# INTEL: ctestl {dfv=of} edx, ecx
+0x62,0xf4,0x44,0x0c,0x85,0xca
+
+# ATT:   ctestlq {dfv=of} %r9, %r15
+# INTEL: ctestl {dfv=of} r15, r9
+0x62,0x54,0xc4,0x0c,0x85,0xcf
+
+# ATT:   ctestleb {dfv=of} $123, 123(%r8,%rax,4)
+# INTEL: ctestle {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+0x62,0xd4,0x44,0x0e,0xf6,0x44,0x80,0x7b,0x7b
+
+# ATT:   ctestlew {dfv=of} $1234, 123(%r8,%rax,4)
+# INTEL: ctestle {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+0x62,0xd4,0x45,0x0e,0xf7,0x44,0x80,0x7b,0xd2,0x04
+
+# ATT:   ctestlel {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctestle {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0x44,0x0e,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestleq {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctestle {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0xc4,0x0e,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestleb {dfv=of} %bl, 123(%r8,%rax,4)
+# INTEL: ctestle {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+0x62,0xd4,0x44,0x0e,0x84,0x5c,0x80,0x7b
+
+# ATT:   ctestlew {dfv=of} %dx, 123(%r8,%rax,4)
+# INTEL: ctestle {dfv=of} word ptr [r8 + 4*rax + 123], dx
+0x62,0xd4,0x45,0x0e,0x85,0x54,0x80,0x7b
+
+# ATT:   ctestlel {dfv=of} %ecx, 123(%r8,%rax,4)
+# INTEL: ctestle {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+0x62,0xd4,0x44,0x0e,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestleq {dfv=of} %r9, 123(%r8,%rax,4)
+# INTEL: ctestle {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+0x62,0x54,0xc4,0x0e,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestleb {dfv=of} $123, %bl
+# INTEL: ctestle {dfv=of} bl, 123
+0x62,0xf4,0x44,0x0e,0xf6,0xc3,0x7b
+
+# ATT:   ctestlew {dfv=of} $1234, %dx
+# INTEL: ctestle {dfv=of} dx, 1234
+0x62,0xf4,0x45,0x0e,0xf7,0xc2,0xd2,0x04
+
+# ATT:   ctestlel {dfv=of} $123456, %ecx
+# INTEL: ctestle {dfv=of} ecx, 123456
+0x62,0xf4,0x44,0x0e,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestleq {dfv=of} $123456, %r9
+# INTEL: ctestle {dfv=of} r9, 123456
+0x62,0xd4,0xc4,0x0e,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestleb {dfv=of} %bl, %dl
+# INTEL: ctestle {dfv=of} dl, bl
+0x62,0xf4,0x44,0x0e,0x84,0xda
+
+# ATT:   ctestlew {dfv=of} %dx, %ax
+# INTEL: ctestle {dfv=of} ax, dx
+0x62,0xf4,0x45,0x0e,0x85,0xd0
+
+# ATT:   ctestlel {dfv=of} %ecx, %edx
+# INTEL: ctestle {dfv=of} edx, ecx
+0x62,0xf4,0x44,0x0e,0x85,0xca
+
+# ATT:   ctestleq {dfv=of} %r9, %r15
+# INTEL: ctestle {dfv=of} r15, r9
+0x62,0x54,0xc4,0x0e,0x85,0xcf
+
+# ATT:   ctestaeb {dfv=of} $123, 123(%r8,%rax,4)
+# INTEL: ctestae {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+0x62,0xd4,0x44,0x03,0xf6,0x44,0x80,0x7b,0x7b
+
+# ATT:   ctestaew {dfv=of} $1234, 123(%r8,%rax,4)
+# INTEL: ctestae {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+0x62,0xd4,0x45,0x03,0xf7,0x44,0x80,0x7b,0xd2,0x04
+
+# ATT:   ctestael {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctestae {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0x44,0x03,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestaeq {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctestae {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0xc4,0x03,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestaeb {dfv=of} %bl, 123(%r8,%rax,4)
+# INTEL: ctestae {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+0x62,0xd4,0x44,0x03,0x84,0x5c,0x80,0x7b
+
+# ATT:   ctestaew {dfv=of} %dx, 123(%r8,%rax,4)
+# INTEL: ctestae {dfv=of} word ptr [r8 + 4*rax + 123], dx
+0x62,0xd4,0x45,0x03,0x85,0x54,0x80,0x7b
+
+# ATT:   ctestael {dfv=of} %ecx, 123(%r8,%rax,4)
+# INTEL: ctestae {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+0x62,0xd4,0x44,0x03,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestaeq {dfv=of} %r9, 123(%r8,%rax,4)
+# INTEL: ctestae {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+0x62,0x54,0xc4,0x03,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestaeb {dfv=of} $123, %bl
+# INTEL: ctestae {dfv=of} bl, 123
+0x62,0xf4,0x44,0x03,0xf6,0xc3,0x7b
+
+# ATT:   ctestaew {dfv=of} $1234, %dx
+# INTEL: ctestae {dfv=of} dx, 1234
+0x62,0xf4,0x45,0x03,0xf7,0xc2,0xd2,0x04
+
+# ATT:   ctestael {dfv=of} $123456, %ecx
+# INTEL: ctestae {dfv=of} ecx, 123456
+0x62,0xf4,0x44,0x03,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestaeq {dfv=of} $123456, %r9
+# INTEL: ctestae {dfv=of} r9, 123456
+0x62,0xd4,0xc4,0x03,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestaeb {dfv=of} %bl, %dl
+# INTEL: ctestae {dfv=of} dl, bl
+0x62,0xf4,0x44,0x03,0x84,0xda
+
+# ATT:   ctestaew {dfv=of} %dx, %ax
+# INTEL: ctestae {dfv=of} ax, dx
+0x62,0xf4,0x45,0x03,0x85,0xd0
+
+# ATT:   ctestael {dfv=of} %ecx, %edx
+# INTEL: ctestae {dfv=of} edx, ecx
+0x62,0xf4,0x44,0x03,0x85,0xca
+
+# ATT:   ctestaeq {dfv=of} %r9, %r15
+# INTEL: ctestae {dfv=of} r15, r9
+0x62,0x54,0xc4,0x03,0x85,0xcf
+
+# ATT:   ctestab {dfv=of} $123, 123(%r8,%rax,4)
+# INTEL: ctesta {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+0x62,0xd4,0x44,0x07,0xf6,0x44,0x80,0x7b,0x7b
+
+# ATT:   ctestaw {dfv=of} $1234, 123(%r8,%rax,4)
+# INTEL: ctesta {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+0x62,0xd4,0x45,0x07,0xf7,0x44,0x80,0x7b,0xd2,0x04
+
+# ATT:   ctestal {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctesta {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0x44,0x07,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestaq {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctesta {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0xc4,0x07,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestab {dfv=of} %bl, 123(%r8,%rax,4)
+# INTEL: ctesta {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+0x62,0xd4,0x44,0x07,0x84,0x5c,0x80,0x7b
+
+# ATT:   ctestaw {dfv=of} %dx, 123(%r8,%rax,4)
+# INTEL: ctesta {dfv=of} word ptr [r8 + 4*rax + 123], dx
+0x62,0xd4,0x45,0x07,0x85,0x54,0x80,0x7b
+
+# ATT:   ctestal {dfv=of} %ecx, 123(%r8,%rax,4)
+# INTEL: ctesta {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+0x62,0xd4,0x44,0x07,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestaq {dfv=of} %r9, 123(%r8,%rax,4)
+# INTEL: ctesta {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+0x62,0x54,0xc4,0x07,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestab {dfv=of} $123, %bl
+# INTEL: ctesta {dfv=of} bl, 123
+0x62,0xf4,0x44,0x07,0xf6,0xc3,0x7b
+
+# ATT:   ctestaw {dfv=of} $1234, %dx
+# INTEL: ctesta {dfv=of} dx, 1234
+0x62,0xf4,0x45,0x07,0xf7,0xc2,0xd2,0x04
+
+# ATT:   ctestal {dfv=of} $123456, %ecx
+# INTEL: ctesta {dfv=of} ecx, 123456
+0x62,0xf4,0x44,0x07,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestaq {dfv=of} $123456, %r9
+# INTEL: ctesta {dfv=of} r9, 123456
+0x62,0xd4,0xc4,0x07,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestab {dfv=of} %bl, %dl
+# INTEL: ctesta {dfv=of} dl, bl
+0x62,0xf4,0x44,0x07,0x84,0xda
+
+# ATT:   ctestaw {dfv=of} %dx, %ax
+# INTEL: ctesta {dfv=of} ax, dx
+0x62,0xf4,0x45,0x07,0x85,0xd0
+
+# ATT:   ctestal {dfv=of} %ecx, %edx
+# INTEL: ctesta {dfv=of} edx, ecx
+0x62,0xf4,0x44,0x07,0x85,0xca
+
+# ATT:   ctestaq {dfv=of} %r9, %r15
+# INTEL: ctesta {dfv=of} r15, r9
+0x62,0x54,0xc4,0x07,0x85,0xcf
+
+# ATT:   ctestgeb {dfv=of} $123, 123(%r8,%rax,4)
+# INTEL: ctestge {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+0x62,0xd4,0x44,0x0d,0xf6,0x44,0x80,0x7b,0x7b
+
+# ATT:   ctestgew {dfv=of} $1234, 123(%r8,%rax,4)
+# INTEL: ctestge {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+0x62,0xd4,0x45,0x0d,0xf7,0x44,0x80,0x7b,0xd2,0x04
+
+# ATT:   ctestgel {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctestge {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0x44,0x0d,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestgeq {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctestge {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0xc4,0x0d,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestgeb {dfv=of} %bl, 123(%r8,%rax,4)
+# INTEL: ctestge {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+0x62,0xd4,0x44,0x0d,0x84,0x5c,0x80,0x7b
+
+# ATT:   ctestgew {dfv=of} %dx, 123(%r8,%rax,4)
+# INTEL: ctestge {dfv=of} word ptr [r8 + 4*rax + 123], dx
+0x62,0xd4,0x45,0x0d,0x85,0x54,0x80,0x7b
+
+# ATT:   ctestgel {dfv=of} %ecx, 123(%r8,%rax,4)
+# INTEL: ctestge {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+0x62,0xd4,0x44,0x0d,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestgeq {dfv=of} %r9, 123(%r8,%rax,4)
+# INTEL: ctestge {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+0x62,0x54,0xc4,0x0d,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestgeb {dfv=of} $123, %bl
+# INTEL: ctestge {dfv=of} bl, 123
+0x62,0xf4,0x44,0x0d,0xf6,0xc3,0x7b
+
+# ATT:   ctestgew {dfv=of} $1234, %dx
+# INTEL: ctestge {dfv=of} dx, 1234
+0x62,0xf4,0x45,0x0d,0xf7,0xc2,0xd2,0x04
+
+# ATT:   ctestgel {dfv=of} $123456, %ecx
+# INTEL: ctestge {dfv=of} ecx, 123456
+0x62,0xf4,0x44,0x0d,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestgeq {dfv=of} $123456, %r9
+# INTEL: ctestge {dfv=of} r9, 123456
+0x62,0xd4,0xc4,0x0d,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestgeb {dfv=of} %bl, %dl
+# INTEL: ctestge {dfv=of} dl, bl
+0x62,0xf4,0x44,0x0d,0x84,0xda
+
+# ATT:   ctestgew {dfv=of} %dx, %ax
+# INTEL: ctestge {dfv=of} ax, dx
+0x62,0xf4,0x45,0x0d,0x85,0xd0
+
+# ATT:   ctestgel {dfv=of} %ecx, %edx
+# INTEL: ctestge {dfv=of} edx, ecx
+0x62,0xf4,0x44,0x0d,0x85,0xca
+
+# ATT:   ctestgeq {dfv=of} %r9, %r15
+# INTEL: ctestge {dfv=of} r15, r9
+0x62,0x54,0xc4,0x0d,0x85,0xcf
+
+# ATT:   ctestgb {dfv=of} $123, 123(%r8,%rax,4)
+# INTEL: ctestg {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+0x62,0xd4,0x44,0x0f,0xf6,0x44,0x80,0x7b,0x7b
+
+# ATT:   ctestgw {dfv=of} $1234, 123(%r8,%rax,4)
+# INTEL: ctestg {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+0x62,0xd4,0x45,0x0f,0xf7,0x44,0x80,0x7b,0xd2,0x04
+
+# ATT:   ctestgl {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctestg {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0x44,0x0f,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestgq {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctestg {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0xc4,0x0f,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestgb {dfv=of} %bl, 123(%r8,%rax,4)
+# INTEL: ctestg {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+0x62,0xd4,0x44,0x0f,0x84,0x5c,0x80,0x7b
+
+# ATT:   ctestgw {dfv=of} %dx, 123(%r8,%rax,4)
+# INTEL: ctestg {dfv=of} word ptr [r8 + 4*rax + 123], dx
+0x62,0xd4,0x45,0x0f,0x85,0x54,0x80,0x7b
+
+# ATT:   ctestgl {dfv=of} %ecx, 123(%r8,%rax,4)
+# INTEL: ctestg {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+0x62,0xd4,0x44,0x0f,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestgq {dfv=of} %r9, 123(%r8,%rax,4)
+# INTEL: ctestg {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+0x62,0x54,0xc4,0x0f,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestgb {dfv=of} $123, %bl
+# INTEL: ctestg {dfv=of} bl, 123
+0x62,0xf4,0x44,0x0f,0xf6,0xc3,0x7b
+
+# ATT:   ctestgw {dfv=of} $1234, %dx
+# INTEL: ctestg {dfv=of} dx, 1234
+0x62,0xf4,0x45,0x0f,0xf7,0xc2,0xd2,0x04
+
+# ATT:   ctestgl {dfv=of} $123456, %ecx
+# INTEL: ctestg {dfv=of} ecx, 123456
+0x62,0xf4,0x44,0x0f,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestgq {dfv=of} $123456, %r9
+# INTEL: ctestg {dfv=of} r9, 123456
+0x62,0xd4,0xc4,0x0f,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestgb {dfv=of} %bl, %dl
+# INTEL: ctestg {dfv=of} dl, bl
+0x62,0xf4,0x44,0x0f,0x84,0xda
+
+# ATT:   ctestgw {dfv=of} %dx, %ax
+# INTEL: ctestg {dfv=of} ax, dx
+0x62,0xf4,0x45,0x0f,0x85,0xd0
+
+# ATT:   ctestgl {dfv=of} %ecx, %edx
+# INTEL: ctestg {dfv=of} edx, ecx
+0x62,0xf4,0x44,0x0f,0x85,0xca
+
+# ATT:   ctestgq {dfv=of} %r9, %r15
+# INTEL: ctestg {dfv=of} r15, r9
+0x62,0x54,0xc4,0x0f,0x85,0xcf
+
+# ATT:   ctestnob {dfv=of} $123, 123(%r8,%rax,4)
+# INTEL: ctestno {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+0x62,0xd4,0x44,0x01,0xf6,0x44,0x80,0x7b,0x7b
+
+# ATT:   ctestnow {dfv=of} $1234, 123(%r8,%rax,4)
+# INTEL: ctestno {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+0x62,0xd4,0x45,0x01,0xf7,0x44,0x80,0x7b,0xd2,0x04
+
+# ATT:   ctestnol {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctestno {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0x44,0x01,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestnoq {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctestno {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0xc4,0x01,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestnob {dfv=of} %bl, 123(%r8,%rax,4)
+# INTEL: ctestno {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+0x62,0xd4,0x44,0x01,0x84,0x5c,0x80,0x7b
+
+# ATT:   ctestnow {dfv=of} %dx, 123(%r8,%rax,4)
+# INTEL: ctestno {dfv=of} word ptr [r8 + 4*rax + 123], dx
+0x62,0xd4,0x45,0x01,0x85,0x54,0x80,0x7b
+
+# ATT:   ctestnol {dfv=of} %ecx, 123(%r8,%rax,4)
+# INTEL: ctestno {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+0x62,0xd4,0x44,0x01,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestnoq {dfv=of} %r9, 123(%r8,%rax,4)
+# INTEL: ctestno {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+0x62,0x54,0xc4,0x01,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestnob {dfv=of} $123, %bl
+# INTEL: ctestno {dfv=of} bl, 123
+0x62,0xf4,0x44,0x01,0xf6,0xc3,0x7b
+
+# ATT:   ctestnow {dfv=of} $1234, %dx
+# INTEL: ctestno {dfv=of} dx, 1234
+0x62,0xf4,0x45,0x01,0xf7,0xc2,0xd2,0x04
+
+# ATT:   ctestnol {dfv=of} $123456, %ecx
+# INTEL: ctestno {dfv=of} ecx, 123456
+0x62,0xf4,0x44,0x01,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestnoq {dfv=of} $123456, %r9
+# INTEL: ctestno {dfv=of} r9, 123456
+0x62,0xd4,0xc4,0x01,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestnob {dfv=of} %bl, %dl
+# INTEL: ctestno {dfv=of} dl, bl
+0x62,0xf4,0x44,0x01,0x84,0xda
+
+# ATT:   ctestnow {dfv=of} %dx, %ax
+# INTEL: ctestno {dfv=of} ax, dx
+0x62,0xf4,0x45,0x01,0x85,0xd0
+
+# ATT:   ctestnol {dfv=of} %ecx, %edx
+# INTEL: ctestno {dfv=of} edx, ecx
+0x62,0xf4,0x44,0x01,0x85,0xca
+
+# ATT:   ctestnoq {dfv=of} %r9, %r15
+# INTEL: ctestno {dfv=of} r15, r9
+0x62,0x54,0xc4,0x01,0x85,0xcf
+
+# ATT:   ctestnsb {dfv=of} $123, 123(%r8,%rax,4)
+# INTEL: ctestns {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+0x62,0xd4,0x44,0x09,0xf6,0x44,0x80,0x7b,0x7b
+
+# ATT:   ctestnsw {dfv=of} $1234, 123(%r8,%rax,4)
+# INTEL: ctestns {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+0x62,0xd4,0x45,0x09,0xf7,0x44,0x80,0x7b,0xd2,0x04
+
+# ATT:   ctestnsl {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctestns {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0x44,0x09,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestnsq {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctestns {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0xc4,0x09,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestnsb {dfv=of} %bl, 123(%r8,%rax,4)
+# INTEL: ctestns {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+0x62,0xd4,0x44,0x09,0x84,0x5c,0x80,0x7b
+
+# ATT:   ctestnsw {dfv=of} %dx, 123(%r8,%rax,4)
+# INTEL: ctestns {dfv=of} word ptr [r8 + 4*rax + 123], dx
+0x62,0xd4,0x45,0x09,0x85,0x54,0x80,0x7b
+
+# ATT:   ctestnsl {dfv=of} %ecx, 123(%r8,%rax,4)
+# INTEL: ctestns {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+0x62,0xd4,0x44,0x09,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestnsq {dfv=of} %r9, 123(%r8,%rax,4)
+# INTEL: ctestns {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+0x62,0x54,0xc4,0x09,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestnsb {dfv=of} $123, %bl
+# INTEL: ctestns {dfv=of} bl, 123
+0x62,0xf4,0x44,0x09,0xf6,0xc3,0x7b
+
+# ATT:   ctestnsw {dfv=of} $1234, %dx
+# INTEL: ctestns {dfv=of} dx, 1234
+0x62,0xf4,0x45,0x09,0xf7,0xc2,0xd2,0x04
+
+# ATT:   ctestnsl {dfv=of} $123456, %ecx
+# INTEL: ctestns {dfv=of} ecx, 123456
+0x62,0xf4,0x44,0x09,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestnsq {dfv=of} $123456, %r9
+# INTEL: ctestns {dfv=of} r9, 123456
+0x62,0xd4,0xc4,0x09,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestnsb {dfv=of} %bl, %dl
+# INTEL: ctestns {dfv=of} dl, bl
+0x62,0xf4,0x44,0x09,0x84,0xda
+
+# ATT:   ctestnsw {dfv=of} %dx, %ax
+# INTEL: ctestns {dfv=of} ax, dx
+0x62,0xf4,0x45,0x09,0x85,0xd0
+
+# ATT:   ctestnsl {dfv=of} %ecx, %edx
+# INTEL: ctestns {dfv=of} edx, ecx
+0x62,0xf4,0x44,0x09,0x85,0xca
+
+# ATT:   ctestnsq {dfv=of} %r9, %r15
+# INTEL: ctestns {dfv=of} r15, r9
+0x62,0x54,0xc4,0x09,0x85,0xcf
+
+# ATT:   ctestneb {dfv=of} $123, 123(%r8,%rax,4)
+# INTEL: ctestne {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+0x62,0xd4,0x44,0x05,0xf6,0x44,0x80,0x7b,0x7b
+
+# ATT:   ctestnew {dfv=of} $1234, 123(%r8,%rax,4)
+# INTEL: ctestne {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+0x62,0xd4,0x45,0x05,0xf7,0x44,0x80,0x7b,0xd2,0x04
+
+# ATT:   ctestnel {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctestne {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0x44,0x05,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestneq {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctestne {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0xc4,0x05,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestneb {dfv=of} %bl, 123(%r8,%rax,4)
+# INTEL: ctestne {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+0x62,0xd4,0x44,0x05,0x84,0x5c,0x80,0x7b
+
+# ATT:   ctestnew {dfv=of} %dx, 123(%r8,%rax,4)
+# INTEL: ctestne {dfv=of} word ptr [r8 + 4*rax + 123], dx
+0x62,0xd4,0x45,0x05,0x85,0x54,0x80,0x7b
+
+# ATT:   ctestnel {dfv=of} %ecx, 123(%r8,%rax,4)
+# INTEL: ctestne {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+0x62,0xd4,0x44,0x05,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestneq {dfv=of} %r9, 123(%r8,%rax,4)
+# INTEL: ctestne {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+0x62,0x54,0xc4,0x05,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestneb {dfv=of} $123, %bl
+# INTEL: ctestne {dfv=of} bl, 123
+0x62,0xf4,0x44,0x05,0xf6,0xc3,0x7b
+
+# ATT:   ctestnew {dfv=of} $1234, %dx
+# INTEL: ctestne {dfv=of} dx, 1234
+0x62,0xf4,0x45,0x05,0xf7,0xc2,0xd2,0x04
+
+# ATT:   ctestnel {dfv=of} $123456, %ecx
+# INTEL: ctestne {dfv=of} ecx, 123456
+0x62,0xf4,0x44,0x05,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestneq {dfv=of} $123456, %r9
+# INTEL: ctestne {dfv=of} r9, 123456
+0x62,0xd4,0xc4,0x05,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestneb {dfv=of} %bl, %dl
+# INTEL: ctestne {dfv=of} dl, bl
+0x62,0xf4,0x44,0x05,0x84,0xda
+
+# ATT:   ctestnew {dfv=of} %dx, %ax
+# INTEL: ctestne {dfv=of} ax, dx
+0x62,0xf4,0x45,0x05,0x85,0xd0
+
+# ATT:   ctestnel {dfv=of} %ecx, %edx
+# INTEL: ctestne {dfv=of} edx, ecx
+0x62,0xf4,0x44,0x05,0x85,0xca
+
+# ATT:   ctestneq {dfv=of} %r9, %r15
+# INTEL: ctestne {dfv=of} r15, r9
+0x62,0x54,0xc4,0x05,0x85,0xcf
+
+# ATT:   ctestob {dfv=of} $123, 123(%r8,%rax,4)
+# INTEL: ctesto {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+0x62,0xd4,0x44,0x00,0xf6,0x44,0x80,0x7b,0x7b
+
+# ATT:   ctestow {dfv=of} $1234, 123(%r8,%rax,4)
+# INTEL: ctesto {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+0x62,0xd4,0x45,0x00,0xf7,0x44,0x80,0x7b,0xd2,0x04
+
+# ATT:   ctestol {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctesto {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0x44,0x00,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestoq {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctesto {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0xc4,0x00,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestob {dfv=of} %bl, 123(%r8,%rax,4)
+# INTEL: ctesto {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+0x62,0xd4,0x44,0x00,0x84,0x5c,0x80,0x7b
+
+# ATT:   ctestow {dfv=of} %dx, 123(%r8,%rax,4)
+# INTEL: ctesto {dfv=of} word ptr [r8 + 4*rax + 123], dx
+0x62,0xd4,0x45,0x00,0x85,0x54,0x80,0x7b
+
+# ATT:   ctestol {dfv=of} %ecx, 123(%r8,%rax,4)
+# INTEL: ctesto {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+0x62,0xd4,0x44,0x00,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestoq {dfv=of} %r9, 123(%r8,%rax,4)
+# INTEL: ctesto {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+0x62,0x54,0xc4,0x00,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestob {dfv=of} $123, %bl
+# INTEL: ctesto {dfv=of} bl, 123
+0x62,0xf4,0x44,0x00,0xf6,0xc3,0x7b
+
+# ATT:   ctestow {dfv=of} $1234, %dx
+# INTEL: ctesto {dfv=of} dx, 1234
+0x62,0xf4,0x45,0x00,0xf7,0xc2,0xd2,0x04
+
+# ATT:   ctestol {dfv=of} $123456, %ecx
+# INTEL: ctesto {dfv=of} ecx, 123456
+0x62,0xf4,0x44,0x00,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestoq {dfv=of} $123456, %r9
+# INTEL: ctesto {dfv=of} r9, 123456
+0x62,0xd4,0xc4,0x00,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestob {dfv=of} %bl, %dl
+# INTEL: ctesto {dfv=of} dl, bl
+0x62,0xf4,0x44,0x00,0x84,0xda
+
+# ATT:   ctestow {dfv=of} %dx, %ax
+# INTEL: ctesto {dfv=of} ax, dx
+0x62,0xf4,0x45,0x00,0x85,0xd0
+
+# ATT:   ctestol {dfv=of} %ecx, %edx
+# INTEL: ctesto {dfv=of} edx, ecx
+0x62,0xf4,0x44,0x00,0x85,0xca
+
+# ATT:   ctestoq {dfv=of} %r9, %r15
+# INTEL: ctesto {dfv=of} r15, r9
+0x62,0x54,0xc4,0x00,0x85,0xcf
+
+# ATT:   ctestsb {dfv=of} $123, 123(%r8,%rax,4)
+# INTEL: ctests {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+0x62,0xd4,0x44,0x08,0xf6,0x44,0x80,0x7b,0x7b
+
+# ATT:   ctestsw {dfv=of} $1234, 123(%r8,%rax,4)
+# INTEL: ctests {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+0x62,0xd4,0x45,0x08,0xf7,0x44,0x80,0x7b,0xd2,0x04
+
+# ATT:   ctestsl {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctests {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0x44,0x08,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestsq {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctests {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0xc4,0x08,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestsb {dfv=of} %bl, 123(%r8,%rax,4)
+# INTEL: ctests {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+0x62,0xd4,0x44,0x08,0x84,0x5c,0x80,0x7b
+
+# ATT:   ctestsw {dfv=of} %dx, 123(%r8,%rax,4)
+# INTEL: ctests {dfv=of} word ptr [r8 + 4*rax + 123], dx
+0x62,0xd4,0x45,0x08,0x85,0x54,0x80,0x7b
+
+# ATT:   ctestsl {dfv=of} %ecx, 123(%r8,%rax,4)
+# INTEL: ctests {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+0x62,0xd4,0x44,0x08,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestsq {dfv=of} %r9, 123(%r8,%rax,4)
+# INTEL: ctests {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+0x62,0x54,0xc4,0x08,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestsb {dfv=of} $123, %bl
+# INTEL: ctests {dfv=of} bl, 123
+0x62,0xf4,0x44,0x08,0xf6,0xc3,0x7b
+
+# ATT:   ctestsw {dfv=of} $1234, %dx
+# INTEL: ctests {dfv=of} dx, 1234
+0x62,0xf4,0x45,0x08,0xf7,0xc2,0xd2,0x04
+
+# ATT:   ctestsl {dfv=of} $123456, %ecx
+# INTEL: ctests {dfv=of} ecx, 123456
+0x62,0xf4,0x44,0x08,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestsq {dfv=of} $123456, %r9
+# INTEL: ctests {dfv=of} r9, 123456
+0x62,0xd4,0xc4,0x08,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestsb {dfv=of} %bl, %dl
+# INTEL: ctests {dfv=of} dl, bl
+0x62,0xf4,0x44,0x08,0x84,0xda
+
+# ATT:   ctestsw {dfv=of} %dx, %ax
+# INTEL: ctests {dfv=of} ax, dx
+0x62,0xf4,0x45,0x08,0x85,0xd0
+
+# ATT:   ctestsl {dfv=of} %ecx, %edx
+# INTEL: ctests {dfv=of} edx, ecx
+0x62,0xf4,0x44,0x08,0x85,0xca
+
+# ATT:   ctestsq {dfv=of} %r9, %r15
+# INTEL: ctests {dfv=of} r15, r9
+0x62,0x54,0xc4,0x08,0x85,0xcf
+
+# ATT:   ctesttb {dfv=of} $123, 123(%r8,%rax,4)
+# INTEL: ctestt {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+0x62,0xd4,0x44,0x0a,0xf6,0x44,0x80,0x7b,0x7b
+
+# ATT:   ctesttw {dfv=of} $1234, 123(%r8,%rax,4)
+# INTEL: ctestt {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+0x62,0xd4,0x45,0x0a,0xf7,0x44,0x80,0x7b,0xd2,0x04
+
+# ATT:   ctesttl {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctestt {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0x44,0x0a,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctesttq {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctestt {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0xc4,0x0a,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctesttb {dfv=of} %bl, 123(%r8,%rax,4)
+# INTEL: ctestt {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+0x62,0xd4,0x44,0x0a,0x84,0x5c,0x80,0x7b
+
+# ATT:   ctesttw {dfv=of} %dx, 123(%r8,%rax,4)
+# INTEL: ctestt {dfv=of} word ptr [r8 + 4*rax + 123], dx
+0x62,0xd4,0x45,0x0a,0x85,0x54,0x80,0x7b
+
+# ATT:   ctesttl {dfv=of} %ecx, 123(%r8,%rax,4)
+# INTEL: ctestt {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+0x62,0xd4,0x44,0x0a,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctesttq {dfv=of} %r9, 123(%r8,%rax,4)
+# INTEL: ctestt {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+0x62,0x54,0xc4,0x0a,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctesttb {dfv=of} $123, %bl
+# INTEL: ctestt {dfv=of} bl, 123
+0x62,0xf4,0x44,0x0a,0xf6,0xc3,0x7b
+
+# ATT:   ctesttw {dfv=of} $1234, %dx
+# INTEL: ctestt {dfv=of} dx, 1234
+0x62,0xf4,0x45,0x0a,0xf7,0xc2,0xd2,0x04
+
+# ATT:   ctesttl {dfv=of} $123456, %ecx
+# INTEL: ctestt {dfv=of} ecx, 123456
+0x62,0xf4,0x44,0x0a,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctesttq {dfv=of} $123456, %r9
+# INTEL: ctestt {dfv=of} r9, 123456
+0x62,0xd4,0xc4,0x0a,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctesttb {dfv=of} %bl, %dl
+# INTEL: ctestt {dfv=of} dl, bl
+0x62,0xf4,0x44,0x0a,0x84,0xda
+
+# ATT:   ctesttw {dfv=of} %dx, %ax
+# INTEL: ctestt {dfv=of} ax, dx
+0x62,0xf4,0x45,0x0a,0x85,0xd0
+
+# ATT:   ctesttl {dfv=of} %ecx, %edx
+# INTEL: ctestt {dfv=of} edx, ecx
+0x62,0xf4,0x44,0x0a,0x85,0xca
+
+# ATT:   ctesttq {dfv=of} %r9, %r15
+# INTEL: ctestt {dfv=of} r15, r9
+0x62,0x54,0xc4,0x0a,0x85,0xcf
+
+# ATT:   ctesteb {dfv=of} $123, 123(%r8,%rax,4)
+# INTEL: cteste {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+0x62,0xd4,0x44,0x04,0xf6,0x44,0x80,0x7b,0x7b
+
+# ATT:   ctestew {dfv=of} $1234, 123(%r8,%rax,4)
+# INTEL: cteste {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+0x62,0xd4,0x45,0x04,0xf7,0x44,0x80,0x7b,0xd2,0x04
+
+# ATT:   ctestel {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: cteste {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0x44,0x04,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctesteq {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: cteste {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0xc4,0x04,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctesteb {dfv=of} %bl, 123(%r8,%rax,4)
+# INTEL: cteste {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+0x62,0xd4,0x44,0x04,0x84,0x5c,0x80,0x7b
+
+# ATT:   ctestew {dfv=of} %dx, 123(%r8,%rax,4)
+# INTEL: cteste {dfv=of} word ptr [r8 + 4*rax + 123], dx
+0x62,0xd4,0x45,0x04,0x85,0x54,0x80,0x7b
+
+# ATT:   ctestel {dfv=of} %ecx, 123(%r8,%rax,4)
+# INTEL: cteste {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+0x62,0xd4,0x44,0x04,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctesteq {dfv=of} %r9, 123(%r8,%rax,4)
+# INTEL: cteste {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+0x62,0x54,0xc4,0x04,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctesteb {dfv=of} $123, %bl
+# INTEL: cteste {dfv=of} bl, 123
+0x62,0xf4,0x44,0x04,0xf6,0xc3,0x7b
+
+# ATT:   ctestew {dfv=of} $1234, %dx
+# INTEL: cteste {dfv=of} dx, 1234
+0x62,0xf4,0x45,0x04,0xf7,0xc2,0xd2,0x04
+
+# ATT:   ctestel {dfv=of} $123456, %ecx
+# INTEL: cteste {dfv=of} ecx, 123456
+0x62,0xf4,0x44,0x04,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctesteq {dfv=of} $123456, %r9
+# INTEL: cteste {dfv=of} r9, 123456
+0x62,0xd4,0xc4,0x04,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctesteb {dfv=of} %bl, %dl
+# INTEL: cteste {dfv=of} dl, bl
+0x62,0xf4,0x44,0x04,0x84,0xda
+
+# ATT:   ctestew {dfv=of} %dx, %ax
+# INTEL: cteste {dfv=of} ax, dx
+0x62,0xf4,0x45,0x04,0x85,0xd0
+
+# ATT:   ctestel {dfv=of} %ecx, %edx
+# INTEL: cteste {dfv=of} edx, ecx
+0x62,0xf4,0x44,0x04,0x85,0xca
+
+# ATT:   ctesteq {dfv=of} %r9, %r15
+# INTEL: cteste {dfv=of} r15, r9
+0x62,0x54,0xc4,0x04,0x85,0xcf
diff --git a/llvm/test/MC/X86/apx/ctest-att.s b/llvm/test/MC/X86/apx/ctest-att.s
new file mode 100644
index 0000000000000..b9e98adc9841b
--- /dev/null
+++ b/llvm/test/MC/X86/apx/ctest-att.s
@@ -0,0 +1,773 @@
+# RUN: llvm-mc -triple x86_64 -show-encoding %s | FileCheck %s
+# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
+
+# ERROR-COUNT-256: error:
+# ERROR-NOT: error:
+# CHECK: ctestbb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x02,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestbb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: ctestbw {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x02,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestbw {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: ctestbl {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x02,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestbl {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestbq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xc4,0x02,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestbq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestbb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x02,0x84,0x5c,0x80,0x7b]
+         ctestbb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: ctestbw {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x02,0x85,0x54,0x80,0x7b]
+         ctestbw {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: ctestbl {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x02,0x85,0x4c,0x80,0x7b]
+         ctestbl {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: ctestbq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0xc4,0x02,0x85,0x4c,0x80,0x7b]
+         ctestbq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: ctestbb {dfv=of} $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x02,0xf6,0xc3,0x7b]
+         ctestbb {dfv=of} $123, %bl
+# CHECK: ctestbw {dfv=of} $1234, %dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x02,0xf7,0xc2,0xd2,0x04]
+         ctestbw {dfv=of} $1234, %dx
+# CHECK: ctestbl {dfv=of} $123456, %ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x02,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestbl {dfv=of} $123456, %ecx
+# CHECK: ctestbq {dfv=of} $123456, %r9
+# CHECK: encoding: [0x62,0xd4,0xc4,0x02,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestbq {dfv=of} $123456, %r9
+# CHECK: ctestbb {dfv=of} %bl, %dl
+# CHECK: encoding: [0x62,0xf4,0x44,0x02,0x84,0xda]
+         ctestbb {dfv=of} %bl, %dl
+# CHECK: ctestbw {dfv=of} %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x45,0x02,0x85,0xd0]
+         ctestbw {dfv=of} %dx, %ax
+# CHECK: ctestbl {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x02,0x85,0xca]
+         ctestbl {dfv=of} %ecx, %edx
+# CHECK: ctestbq {dfv=of} %r9, %r15
+# CHECK: encoding: [0x62,0x54,0xc4,0x02,0x85,0xcf]
+         ctestbq {dfv=of} %r9, %r15
+# CHECK: ctestbeb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x06,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestbeb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: ctestbew {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x06,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestbew {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: ctestbel {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x06,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestbel {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestbeq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xc4,0x06,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestbeq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestbeb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x06,0x84,0x5c,0x80,0x7b]
+         ctestbeb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: ctestbew {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x06,0x85,0x54,0x80,0x7b]
+         ctestbew {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: ctestbel {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x06,0x85,0x4c,0x80,0x7b]
+         ctestbel {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: ctestbeq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0xc4,0x06,0x85,0x4c,0x80,0x7b]
+         ctestbeq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: ctestbeb {dfv=of} $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x06,0xf6,0xc3,0x7b]
+         ctestbeb {dfv=of} $123, %bl
+# CHECK: ctestbew {dfv=of} $1234, %dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x06,0xf7,0xc2,0xd2,0x04]
+         ctestbew {dfv=of} $1234, %dx
+# CHECK: ctestbel {dfv=of} $123456, %ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x06,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestbel {dfv=of} $123456, %ecx
+# CHECK: ctestbeq {dfv=of} $123456, %r9
+# CHECK: encoding: [0x62,0xd4,0xc4,0x06,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestbeq {dfv=of} $123456, %r9
+# CHECK: ctestbeb {dfv=of} %bl, %dl
+# CHECK: encoding: [0x62,0xf4,0x44,0x06,0x84,0xda]
+         ctestbeb {dfv=of} %bl, %dl
+# CHECK: ctestbew {dfv=of} %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x45,0x06,0x85,0xd0]
+         ctestbew {dfv=of} %dx, %ax
+# CHECK: ctestbel {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x06,0x85,0xca]
+         ctestbel {dfv=of} %ecx, %edx
+# CHECK: ctestbeq {dfv=of} %r9, %r15
+# CHECK: encoding: [0x62,0x54,0xc4,0x06,0x85,0xcf]
+         ctestbeq {dfv=of} %r9, %r15
+# CHECK: ctestfb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x0b,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestfb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: ctestfw {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x0b,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestfw {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: ctestfl {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x0b,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestfl {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestfq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xc4,0x0b,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestfq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestfb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x0b,0x84,0x5c,0x80,0x7b]
+         ctestfb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: ctestfw {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x0b,0x85,0x54,0x80,0x7b]
+         ctestfw {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: ctestfl {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x0b,0x85,0x4c,0x80,0x7b]
+         ctestfl {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: ctestfq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0xc4,0x0b,0x85,0x4c,0x80,0x7b]
+         ctestfq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: ctestfb {dfv=of} $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x0b,0xf6,0xc3,0x7b]
+         ctestfb {dfv=of} $123, %bl
+# CHECK: ctestfw {dfv=of} $1234, %dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x0b,0xf7,0xc2,0xd2,0x04]
+         ctestfw {dfv=of} $1234, %dx
+# CHECK: ctestfl {dfv=of} $123456, %ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0b,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestfl {dfv=of} $123456, %ecx
+# CHECK: ctestfq {dfv=of} $123456, %r9
+# CHECK: encoding: [0x62,0xd4,0xc4,0x0b,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestfq {dfv=of} $123456, %r9
+# CHECK: ctestfb {dfv=of} %bl, %dl
+# CHECK: encoding: [0x62,0xf4,0x44,0x0b,0x84,0xda]
+         ctestfb {dfv=of} %bl, %dl
+# CHECK: ctestfw {dfv=of} %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x45,0x0b,0x85,0xd0]
+         ctestfw {dfv=of} %dx, %ax
+# CHECK: ctestfl {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0b,0x85,0xca]
+         ctestfl {dfv=of} %ecx, %edx
+# CHECK: ctestfq {dfv=of} %r9, %r15
+# CHECK: encoding: [0x62,0x54,0xc4,0x0b,0x85,0xcf]
+         ctestfq {dfv=of} %r9, %r15
+# CHECK: ctestlb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x0c,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestlb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: ctestlw {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x0c,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestlw {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: ctestll {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x0c,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestll {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestlq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xc4,0x0c,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestlq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestlb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x0c,0x84,0x5c,0x80,0x7b]
+         ctestlb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: ctestlw {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x0c,0x85,0x54,0x80,0x7b]
+         ctestlw {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: ctestll {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x0c,0x85,0x4c,0x80,0x7b]
+         ctestll {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: ctestlq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0xc4,0x0c,0x85,0x4c,0x80,0x7b]
+         ctestlq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: ctestlb {dfv=of} $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x0c,0xf6,0xc3,0x7b]
+         ctestlb {dfv=of} $123, %bl
+# CHECK: ctestlw {dfv=of} $1234, %dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x0c,0xf7,0xc2,0xd2,0x04]
+         ctestlw {dfv=of} $1234, %dx
+# CHECK: ctestll {dfv=of} $123456, %ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0c,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestll {dfv=of} $123456, %ecx
+# CHECK: ctestlq {dfv=of} $123456, %r9
+# CHECK: encoding: [0x62,0xd4,0xc4,0x0c,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestlq {dfv=of} $123456, %r9
+# CHECK: ctestlb {dfv=of} %bl, %dl
+# CHECK: encoding: [0x62,0xf4,0x44,0x0c,0x84,0xda]
+         ctestlb {dfv=of} %bl, %dl
+# CHECK: ctestlw {dfv=of} %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x45,0x0c,0x85,0xd0]
+         ctestlw {dfv=of} %dx, %ax
+# CHECK: ctestll {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0c,0x85,0xca]
+         ctestll {dfv=of} %ecx, %edx
+# CHECK: ctestlq {dfv=of} %r9, %r15
+# CHECK: encoding: [0x62,0x54,0xc4,0x0c,0x85,0xcf]
+         ctestlq {dfv=of} %r9, %r15
+# CHECK: ctestleb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x0e,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestleb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: ctestlew {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x0e,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestlew {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: ctestlel {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x0e,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestlel {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestleq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xc4,0x0e,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestleq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestleb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x0e,0x84,0x5c,0x80,0x7b]
+         ctestleb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: ctestlew {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x0e,0x85,0x54,0x80,0x7b]
+         ctestlew {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: ctestlel {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x0e,0x85,0x4c,0x80,0x7b]
+         ctestlel {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: ctestleq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0xc4,0x0e,0x85,0x4c,0x80,0x7b]
+         ctestleq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: ctestleb {dfv=of} $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x0e,0xf6,0xc3,0x7b]
+         ctestleb {dfv=of} $123, %bl
+# CHECK: ctestlew {dfv=of} $1234, %dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x0e,0xf7,0xc2,0xd2,0x04]
+         ctestlew {dfv=of} $1234, %dx
+# CHECK: ctestlel {dfv=of} $123456, %ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0e,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestlel {dfv=of} $123456, %ecx
+# CHECK: ctestleq {dfv=of} $123456, %r9
+# CHECK: encoding: [0x62,0xd4,0xc4,0x0e,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestleq {dfv=of} $123456, %r9
+# CHECK: ctestleb {dfv=of} %bl, %dl
+# CHECK: encoding: [0x62,0xf4,0x44,0x0e,0x84,0xda]
+         ctestleb {dfv=of} %bl, %dl
+# CHECK: ctestlew {dfv=of} %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x45,0x0e,0x85,0xd0]
+         ctestlew {dfv=of} %dx, %ax
+# CHECK: ctestlel {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0e,0x85,0xca]
+         ctestlel {dfv=of} %ecx, %edx
+# CHECK: ctestleq {dfv=of} %r9, %r15
+# CHECK: encoding: [0x62,0x54,0xc4,0x0e,0x85,0xcf]
+         ctestleq {dfv=of} %r9, %r15
+# CHECK: ctestaeb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x03,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestaeb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: ctestaew {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x03,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestaew {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: ctestael {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x03,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestael {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestaeq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xc4,0x03,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestaeq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestaeb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x03,0x84,0x5c,0x80,0x7b]
+         ctestaeb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: ctestaew {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x03,0x85,0x54,0x80,0x7b]
+         ctestaew {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: ctestael {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x03,0x85,0x4c,0x80,0x7b]
+         ctestael {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: ctestaeq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0xc4,0x03,0x85,0x4c,0x80,0x7b]
+         ctestaeq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: ctestaeb {dfv=of} $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x03,0xf6,0xc3,0x7b]
+         ctestaeb {dfv=of} $123, %bl
+# CHECK: ctestaew {dfv=of} $1234, %dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x03,0xf7,0xc2,0xd2,0x04]
+         ctestaew {dfv=of} $1234, %dx
+# CHECK: ctestael {dfv=of} $123456, %ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x03,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestael {dfv=of} $123456, %ecx
+# CHECK: ctestaeq {dfv=of} $123456, %r9
+# CHECK: encoding: [0x62,0xd4,0xc4,0x03,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestaeq {dfv=of} $123456, %r9
+# CHECK: ctestaeb {dfv=of} %bl, %dl
+# CHECK: encoding: [0x62,0xf4,0x44,0x03,0x84,0xda]
+         ctestaeb {dfv=of} %bl, %dl
+# CHECK: ctestaew {dfv=of} %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x45,0x03,0x85,0xd0]
+         ctestaew {dfv=of} %dx, %ax
+# CHECK: ctestael {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x03,0x85,0xca]
+         ctestael {dfv=of} %ecx, %edx
+# CHECK: ctestaeq {dfv=of} %r9, %r15
+# CHECK: encoding: [0x62,0x54,0xc4,0x03,0x85,0xcf]
+         ctestaeq {dfv=of} %r9, %r15
+# CHECK: ctestab {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x07,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestab {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: ctestaw {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x07,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestaw {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: ctestal {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x07,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestal {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestaq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xc4,0x07,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestaq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestab {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x07,0x84,0x5c,0x80,0x7b]
+         ctestab {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: ctestaw {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x07,0x85,0x54,0x80,0x7b]
+         ctestaw {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: ctestal {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x07,0x85,0x4c,0x80,0x7b]
+         ctestal {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: ctestaq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0xc4,0x07,0x85,0x4c,0x80,0x7b]
+         ctestaq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: ctestab {dfv=of} $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x07,0xf6,0xc3,0x7b]
+         ctestab {dfv=of} $123, %bl
+# CHECK: ctestaw {dfv=of} $1234, %dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x07,0xf7,0xc2,0xd2,0x04]
+         ctestaw {dfv=of} $1234, %dx
+# CHECK: ctestal {dfv=of} $123456, %ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x07,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestal {dfv=of} $123456, %ecx
+# CHECK: ctestaq {dfv=of} $123456, %r9
+# CHECK: encoding: [0x62,0xd4,0xc4,0x07,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestaq {dfv=of} $123456, %r9
+# CHECK: ctestab {dfv=of} %bl, %dl
+# CHECK: encoding: [0x62,0xf4,0x44,0x07,0x84,0xda]
+         ctestab {dfv=of} %bl, %dl
+# CHECK: ctestaw {dfv=of} %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x45,0x07,0x85,0xd0]
+         ctestaw {dfv=of} %dx, %ax
+# CHECK: ctestal {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x07,0x85,0xca]
+         ctestal {dfv=of} %ecx, %edx
+# CHECK: ctestaq {dfv=of} %r9, %r15
+# CHECK: encoding: [0x62,0x54,0xc4,0x07,0x85,0xcf]
+         ctestaq {dfv=of} %r9, %r15
+# CHECK: ctestgeb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x0d,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestgeb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: ctestgew {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x0d,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestgew {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: ctestgel {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x0d,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestgel {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestgeq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xc4,0x0d,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestgeq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestgeb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x0d,0x84,0x5c,0x80,0x7b]
+         ctestgeb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: ctestgew {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x0d,0x85,0x54,0x80,0x7b]
+         ctestgew {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: ctestgel {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x0d,0x85,0x4c,0x80,0x7b]
+         ctestgel {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: ctestgeq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0xc4,0x0d,0x85,0x4c,0x80,0x7b]
+         ctestgeq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: ctestgeb {dfv=of} $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x0d,0xf6,0xc3,0x7b]
+         ctestgeb {dfv=of} $123, %bl
+# CHECK: ctestgew {dfv=of} $1234, %dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x0d,0xf7,0xc2,0xd2,0x04]
+         ctestgew {dfv=of} $1234, %dx
+# CHECK: ctestgel {dfv=of} $123456, %ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0d,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestgel {dfv=of} $123456, %ecx
+# CHECK: ctestgeq {dfv=of} $123456, %r9
+# CHECK: encoding: [0x62,0xd4,0xc4,0x0d,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestgeq {dfv=of} $123456, %r9
+# CHECK: ctestgeb {dfv=of} %bl, %dl
+# CHECK: encoding: [0x62,0xf4,0x44,0x0d,0x84,0xda]
+         ctestgeb {dfv=of} %bl, %dl
+# CHECK: ctestgew {dfv=of} %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x45,0x0d,0x85,0xd0]
+         ctestgew {dfv=of} %dx, %ax
+# CHECK: ctestgel {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0d,0x85,0xca]
+         ctestgel {dfv=of} %ecx, %edx
+# CHECK: ctestgeq {dfv=of} %r9, %r15
+# CHECK: encoding: [0x62,0x54,0xc4,0x0d,0x85,0xcf]
+         ctestgeq {dfv=of} %r9, %r15
+# CHECK: ctestgb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x0f,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestgb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: ctestgw {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x0f,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestgw {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: ctestgl {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x0f,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestgl {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestgq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xc4,0x0f,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestgq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestgb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x0f,0x84,0x5c,0x80,0x7b]
+         ctestgb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: ctestgw {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x0f,0x85,0x54,0x80,0x7b]
+         ctestgw {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: ctestgl {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x0f,0x85,0x4c,0x80,0x7b]
+         ctestgl {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: ctestgq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0xc4,0x0f,0x85,0x4c,0x80,0x7b]
+         ctestgq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: ctestgb {dfv=of} $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x0f,0xf6,0xc3,0x7b]
+         ctestgb {dfv=of} $123, %bl
+# CHECK: ctestgw {dfv=of} $1234, %dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x0f,0xf7,0xc2,0xd2,0x04]
+         ctestgw {dfv=of} $1234, %dx
+# CHECK: ctestgl {dfv=of} $123456, %ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0f,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestgl {dfv=of} $123456, %ecx
+# CHECK: ctestgq {dfv=of} $123456, %r9
+# CHECK: encoding: [0x62,0xd4,0xc4,0x0f,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestgq {dfv=of} $123456, %r9
+# CHECK: ctestgb {dfv=of} %bl, %dl
+# CHECK: encoding: [0x62,0xf4,0x44,0x0f,0x84,0xda]
+         ctestgb {dfv=of} %bl, %dl
+# CHECK: ctestgw {dfv=of} %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x45,0x0f,0x85,0xd0]
+         ctestgw {dfv=of} %dx, %ax
+# CHECK: ctestgl {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0f,0x85,0xca]
+         ctestgl {dfv=of} %ecx, %edx
+# CHECK: ctestgq {dfv=of} %r9, %r15
+# CHECK: encoding: [0x62,0x54,0xc4,0x0f,0x85,0xcf]
+         ctestgq {dfv=of} %r9, %r15
+# CHECK: ctestnob {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x01,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestnob {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: ctestnow {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x01,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestnow {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: ctestnol {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x01,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestnol {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestnoq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xc4,0x01,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestnoq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestnob {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x01,0x84,0x5c,0x80,0x7b]
+         ctestnob {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: ctestnow {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x01,0x85,0x54,0x80,0x7b]
+         ctestnow {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: ctestnol {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x01,0x85,0x4c,0x80,0x7b]
+         ctestnol {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: ctestnoq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0xc4,0x01,0x85,0x4c,0x80,0x7b]
+         ctestnoq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: ctestnob {dfv=of} $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x01,0xf6,0xc3,0x7b]
+         ctestnob {dfv=of} $123, %bl
+# CHECK: ctestnow {dfv=of} $1234, %dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x01,0xf7,0xc2,0xd2,0x04]
+         ctestnow {dfv=of} $1234, %dx
+# CHECK: ctestnol {dfv=of} $123456, %ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x01,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestnol {dfv=of} $123456, %ecx
+# CHECK: ctestnoq {dfv=of} $123456, %r9
+# CHECK: encoding: [0x62,0xd4,0xc4,0x01,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestnoq {dfv=of} $123456, %r9
+# CHECK: ctestnob {dfv=of} %bl, %dl
+# CHECK: encoding: [0x62,0xf4,0x44,0x01,0x84,0xda]
+         ctestnob {dfv=of} %bl, %dl
+# CHECK: ctestnow {dfv=of} %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x45,0x01,0x85,0xd0]
+         ctestnow {dfv=of} %dx, %ax
+# CHECK: ctestnol {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x01,0x85,0xca]
+         ctestnol {dfv=of} %ecx, %edx
+# CHECK: ctestnoq {dfv=of} %r9, %r15
+# CHECK: encoding: [0x62,0x54,0xc4,0x01,0x85,0xcf]
+         ctestnoq {dfv=of} %r9, %r15
+# CHECK: ctestnsb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x09,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestnsb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: ctestnsw {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x09,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestnsw {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: ctestnsl {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x09,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestnsl {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestnsq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xc4,0x09,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestnsq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestnsb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x09,0x84,0x5c,0x80,0x7b]
+         ctestnsb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: ctestnsw {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x09,0x85,0x54,0x80,0x7b]
+         ctestnsw {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: ctestnsl {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x09,0x85,0x4c,0x80,0x7b]
+         ctestnsl {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: ctestnsq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0xc4,0x09,0x85,0x4c,0x80,0x7b]
+         ctestnsq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: ctestnsb {dfv=of} $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x09,0xf6,0xc3,0x7b]
+         ctestnsb {dfv=of} $123, %bl
+# CHECK: ctestnsw {dfv=of} $1234, %dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x09,0xf7,0xc2,0xd2,0x04]
+         ctestnsw {dfv=of} $1234, %dx
+# CHECK: ctestnsl {dfv=of} $123456, %ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x09,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestnsl {dfv=of} $123456, %ecx
+# CHECK: ctestnsq {dfv=of} $123456, %r9
+# CHECK: encoding: [0x62,0xd4,0xc4,0x09,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestnsq {dfv=of} $123456, %r9
+# CHECK: ctestnsb {dfv=of} %bl, %dl
+# CHECK: encoding: [0x62,0xf4,0x44,0x09,0x84,0xda]
+         ctestnsb {dfv=of} %bl, %dl
+# CHECK: ctestnsw {dfv=of} %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x45,0x09,0x85,0xd0]
+         ctestnsw {dfv=of} %dx, %ax
+# CHECK: ctestnsl {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x09,0x85,0xca]
+         ctestnsl {dfv=of} %ecx, %edx
+# CHECK: ctestnsq {dfv=of} %r9, %r15
+# CHECK: encoding: [0x62,0x54,0xc4,0x09,0x85,0xcf]
+         ctestnsq {dfv=of} %r9, %r15
+# CHECK: ctestneb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x05,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestneb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: ctestnew {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x05,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestnew {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: ctestnel {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x05,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestnel {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestneq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xc4,0x05,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestneq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestneb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x05,0x84,0x5c,0x80,0x7b]
+         ctestneb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: ctestnew {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x05,0x85,0x54,0x80,0x7b]
+         ctestnew {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: ctestnel {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x05,0x85,0x4c,0x80,0x7b]
+         ctestnel {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: ctestneq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0xc4,0x05,0x85,0x4c,0x80,0x7b]
+         ctestneq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: ctestneb {dfv=of} $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x05,0xf6,0xc3,0x7b]
+         ctestneb {dfv=of} $123, %bl
+# CHECK: ctestnew {dfv=of} $1234, %dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x05,0xf7,0xc2,0xd2,0x04]
+         ctestnew {dfv=of} $1234, %dx
+# CHECK: ctestnel {dfv=of} $123456, %ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x05,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestnel {dfv=of} $123456, %ecx
+# CHECK: ctestneq {dfv=of} $123456, %r9
+# CHECK: encoding: [0x62,0xd4,0xc4,0x05,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestneq {dfv=of} $123456, %r9
+# CHECK: ctestneb {dfv=of} %bl, %dl
+# CHECK: encoding: [0x62,0xf4,0x44,0x05,0x84,0xda]
+         ctestneb {dfv=of} %bl, %dl
+# CHECK: ctestnew {dfv=of} %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x45,0x05,0x85,0xd0]
+         ctestnew {dfv=of} %dx, %ax
+# CHECK: ctestnel {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x05,0x85,0xca]
+         ctestnel {dfv=of} %ecx, %edx
+# CHECK: ctestneq {dfv=of} %r9, %r15
+# CHECK: encoding: [0x62,0x54,0xc4,0x05,0x85,0xcf]
+         ctestneq {dfv=of} %r9, %r15
+# CHECK: ctestob {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x00,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestob {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: ctestow {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x00,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestow {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: ctestol {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x00,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestol {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestoq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xc4,0x00,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestoq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestob {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x00,0x84,0x5c,0x80,0x7b]
+         ctestob {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: ctestow {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x00,0x85,0x54,0x80,0x7b]
+         ctestow {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: ctestol {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x00,0x85,0x4c,0x80,0x7b]
+         ctestol {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: ctestoq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0xc4,0x00,0x85,0x4c,0x80,0x7b]
+         ctestoq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: ctestob {dfv=of} $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x00,0xf6,0xc3,0x7b]
+         ctestob {dfv=of} $123, %bl
+# CHECK: ctestow {dfv=of} $1234, %dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x00,0xf7,0xc2,0xd2,0x04]
+         ctestow {dfv=of} $1234, %dx
+# CHECK: ctestol {dfv=of} $123456, %ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x00,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestol {dfv=of} $123456, %ecx
+# CHECK: ctestoq {dfv=of} $123456, %r9
+# CHECK: encoding: [0x62,0xd4,0xc4,0x00,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestoq {dfv=of} $123456, %r9
+# CHECK: ctestob {dfv=of} %bl, %dl
+# CHECK: encoding: [0x62,0xf4,0x44,0x00,0x84,0xda]
+         ctestob {dfv=of} %bl, %dl
+# CHECK: ctestow {dfv=of} %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x45,0x00,0x85,0xd0]
+         ctestow {dfv=of} %dx, %ax
+# CHECK: ctestol {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x00,0x85,0xca]
+         ctestol {dfv=of} %ecx, %edx
+# CHECK: ctestoq {dfv=of} %r9, %r15
+# CHECK: encoding: [0x62,0x54,0xc4,0x00,0x85,0xcf]
+         ctestoq {dfv=of} %r9, %r15
+# CHECK: ctestsb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x08,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestsb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: ctestsw {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x08,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestsw {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: ctestsl {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x08,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestsl {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestsq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xc4,0x08,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestsq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestsb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x08,0x84,0x5c,0x80,0x7b]
+         ctestsb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: ctestsw {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x08,0x85,0x54,0x80,0x7b]
+         ctestsw {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: ctestsl {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x08,0x85,0x4c,0x80,0x7b]
+         ctestsl {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: ctestsq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0xc4,0x08,0x85,0x4c,0x80,0x7b]
+         ctestsq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: ctestsb {dfv=of} $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x08,0xf6,0xc3,0x7b]
+         ctestsb {dfv=of} $123, %bl
+# CHECK: ctestsw {dfv=of} $1234, %dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x08,0xf7,0xc2,0xd2,0x04]
+         ctestsw {dfv=of} $1234, %dx
+# CHECK: ctestsl {dfv=of} $123456, %ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x08,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestsl {dfv=of} $123456, %ecx
+# CHECK: ctestsq {dfv=of} $123456, %r9
+# CHECK: encoding: [0x62,0xd4,0xc4,0x08,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestsq {dfv=of} $123456, %r9
+# CHECK: ctestsb {dfv=of} %bl, %dl
+# CHECK: encoding: [0x62,0xf4,0x44,0x08,0x84,0xda]
+         ctestsb {dfv=of} %bl, %dl
+# CHECK: ctestsw {dfv=of} %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x45,0x08,0x85,0xd0]
+         ctestsw {dfv=of} %dx, %ax
+# CHECK: ctestsl {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x08,0x85,0xca]
+         ctestsl {dfv=of} %ecx, %edx
+# CHECK: ctestsq {dfv=of} %r9, %r15
+# CHECK: encoding: [0x62,0x54,0xc4,0x08,0x85,0xcf]
+         ctestsq {dfv=of} %r9, %r15
+# CHECK: ctesttb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x0a,0xf6,0x44,0x80,0x7b,0x7b]
+         ctesttb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: ctesttw {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x0a,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctesttw {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: ctesttl {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x0a,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctesttl {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctesttq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xc4,0x0a,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctesttq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctesttb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x0a,0x84,0x5c,0x80,0x7b]
+         ctesttb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: ctesttw {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x0a,0x85,0x54,0x80,0x7b]
+         ctesttw {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: ctesttl {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x0a,0x85,0x4c,0x80,0x7b]
+         ctesttl {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: ctesttq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0xc4,0x0a,0x85,0x4c,0x80,0x7b]
+         ctesttq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: ctesttb {dfv=of} $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x0a,0xf6,0xc3,0x7b]
+         ctesttb {dfv=of} $123, %bl
+# CHECK: ctesttw {dfv=of} $1234, %dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x0a,0xf7,0xc2,0xd2,0x04]
+         ctesttw {dfv=of} $1234, %dx
+# CHECK: ctesttl {dfv=of} $123456, %ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0a,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctesttl {dfv=of} $123456, %ecx
+# CHECK: ctesttq {dfv=of} $123456, %r9
+# CHECK: encoding: [0x62,0xd4,0xc4,0x0a,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctesttq {dfv=of} $123456, %r9
+# CHECK: ctesttb {dfv=of} %bl, %dl
+# CHECK: encoding: [0x62,0xf4,0x44,0x0a,0x84,0xda]
+         ctesttb {dfv=of} %bl, %dl
+# CHECK: ctesttw {dfv=of} %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x45,0x0a,0x85,0xd0]
+         ctesttw {dfv=of} %dx, %ax
+# CHECK: ctesttl {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0a,0x85,0xca]
+         ctesttl {dfv=of} %ecx, %edx
+# CHECK: ctesttq {dfv=of} %r9, %r15
+# CHECK: encoding: [0x62,0x54,0xc4,0x0a,0x85,0xcf]
+         ctesttq {dfv=of} %r9, %r15
+# CHECK: ctesteb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x04,0xf6,0x44,0x80,0x7b,0x7b]
+         ctesteb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: ctestew {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x04,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestew {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: ctestel {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x04,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestel {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctesteq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xc4,0x04,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctesteq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctesteb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x04,0x84,0x5c,0x80,0x7b]
+         ctesteb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: ctestew {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x04,0x85,0x54,0x80,0x7b]
+         ctestew {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: ctestel {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x04,0x85,0x4c,0x80,0x7b]
+         ctestel {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: ctesteq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0xc4,0x04,0x85,0x4c,0x80,0x7b]
+         ctesteq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: ctesteb {dfv=of} $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x04,0xf6,0xc3,0x7b]
+         ctesteb {dfv=of} $123, %bl
+# CHECK: ctestew {dfv=of} $1234, %dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x04,0xf7,0xc2,0xd2,0x04]
+         ctestew {dfv=of} $1234, %dx
+# CHECK: ctestel {dfv=of} $123456, %ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x04,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestel {dfv=of} $123456, %ecx
+# CHECK: ctesteq {dfv=of} $123456, %r9
+# CHECK: encoding: [0x62,0xd4,0xc4,0x04,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctesteq {dfv=of} $123456, %r9
+# CHECK: ctesteb {dfv=of} %bl, %dl
+# CHECK: encoding: [0x62,0xf4,0x44,0x04,0x84,0xda]
+         ctesteb {dfv=of} %bl, %dl
+# CHECK: ctestew {dfv=of} %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x45,0x04,0x85,0xd0]
+         ctestew {dfv=of} %dx, %ax
+# CHECK: ctestel {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x04,0x85,0xca]
+         ctestel {dfv=of} %ecx, %edx
+# CHECK: ctesteq {dfv=of} %r9, %r15
+# CHECK: encoding: [0x62,0x54,0xc4,0x04,0x85,0xcf]
+         ctesteq {dfv=of} %r9, %r15
diff --git a/llvm/test/MC/X86/apx/ctest-intel.s b/llvm/test/MC/X86/apx/ctest-intel.s
new file mode 100644
index 0000000000000..17cea489b4765
--- /dev/null
+++ b/llvm/test/MC/X86/apx/ctest-intel.s
@@ -0,0 +1,770 @@
+# RUN: llvm-mc -triple x86_64 -show-encoding -x86-asm-syntax=intel -output-asm-variant=1 %s | FileCheck %s
+
+# CHECK: ctestb {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: encoding: [0x62,0xd4,0x44,0x02,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestb {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: ctestb {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: encoding: [0x62,0xd4,0x45,0x02,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestb {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: ctestb {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x02,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestb {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestb {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0x44,0x02,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestb {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestb {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: encoding: [0x62,0xd4,0x44,0x02,0x84,0x5c,0x80,0x7b]
+         ctestb {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: ctestb {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: encoding: [0x62,0xd4,0x45,0x02,0x85,0x54,0x80,0x7b]
+         ctestb {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: ctestb {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: encoding: [0x62,0xd4,0x44,0x02,0x85,0x4c,0x80,0x7b]
+         ctestb {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: ctestb {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x02,0x85,0x4c,0x80,0x7b]
+         ctestb {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: ctestb {dfv=of} bl, 123
+# CHECK: encoding: [0x62,0xf4,0x44,0x02,0xf6,0xc3,0x7b]
+         ctestb {dfv=of} bl, 123
+# CHECK: ctestb {dfv=of} dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x45,0x02,0xf7,0xc2,0xd2,0x04]
+         ctestb {dfv=of} dx, 1234
+# CHECK: ctestb {dfv=of} ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x44,0x02,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestb {dfv=of} ecx, 123456
+# CHECK: ctestb {dfv=of} r9, 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x02,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestb {dfv=of} r9, 123456
+# CHECK: ctestb {dfv=of} dl, bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x02,0x84,0xda]
+         ctestb {dfv=of} dl, bl
+# CHECK: ctestb {dfv=of} ax, dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x02,0x85,0xd0]
+         ctestb {dfv=of} ax, dx
+# CHECK: ctestb {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x02,0x85,0xca]
+         ctestb {dfv=of} edx, ecx
+# CHECK: ctestb {dfv=of} r15, r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x02,0x85,0xcf]
+         ctestb {dfv=of} r15, r9
+# CHECK: ctestbe {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: encoding: [0x62,0xd4,0x44,0x06,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestbe {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: ctestbe {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: encoding: [0x62,0xd4,0x45,0x06,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestbe {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: ctestbe {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x06,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestbe {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestbe {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0x44,0x06,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestbe {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestbe {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: encoding: [0x62,0xd4,0x44,0x06,0x84,0x5c,0x80,0x7b]
+         ctestbe {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: ctestbe {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: encoding: [0x62,0xd4,0x45,0x06,0x85,0x54,0x80,0x7b]
+         ctestbe {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: ctestbe {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: encoding: [0x62,0xd4,0x44,0x06,0x85,0x4c,0x80,0x7b]
+         ctestbe {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: ctestbe {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x06,0x85,0x4c,0x80,0x7b]
+         ctestbe {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: ctestbe {dfv=of} bl, 123
+# CHECK: encoding: [0x62,0xf4,0x44,0x06,0xf6,0xc3,0x7b]
+         ctestbe {dfv=of} bl, 123
+# CHECK: ctestbe {dfv=of} dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x45,0x06,0xf7,0xc2,0xd2,0x04]
+         ctestbe {dfv=of} dx, 1234
+# CHECK: ctestbe {dfv=of} ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x44,0x06,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestbe {dfv=of} ecx, 123456
+# CHECK: ctestbe {dfv=of} r9, 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x06,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestbe {dfv=of} r9, 123456
+# CHECK: ctestbe {dfv=of} dl, bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x06,0x84,0xda]
+         ctestbe {dfv=of} dl, bl
+# CHECK: ctestbe {dfv=of} ax, dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x06,0x85,0xd0]
+         ctestbe {dfv=of} ax, dx
+# CHECK: ctestbe {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x06,0x85,0xca]
+         ctestbe {dfv=of} edx, ecx
+# CHECK: ctestbe {dfv=of} r15, r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x06,0x85,0xcf]
+         ctestbe {dfv=of} r15, r9
+# CHECK: ctestf {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: encoding: [0x62,0xd4,0x44,0x0b,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestf {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: ctestf {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: encoding: [0x62,0xd4,0x45,0x0b,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestf {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: ctestf {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x0b,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestf {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestf {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0x44,0x0b,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestf {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestf {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: encoding: [0x62,0xd4,0x44,0x0b,0x84,0x5c,0x80,0x7b]
+         ctestf {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: ctestf {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: encoding: [0x62,0xd4,0x45,0x0b,0x85,0x54,0x80,0x7b]
+         ctestf {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: ctestf {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: encoding: [0x62,0xd4,0x44,0x0b,0x85,0x4c,0x80,0x7b]
+         ctestf {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: ctestf {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x0b,0x85,0x4c,0x80,0x7b]
+         ctestf {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: ctestf {dfv=of} bl, 123
+# CHECK: encoding: [0x62,0xf4,0x44,0x0b,0xf6,0xc3,0x7b]
+         ctestf {dfv=of} bl, 123
+# CHECK: ctestf {dfv=of} dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x45,0x0b,0xf7,0xc2,0xd2,0x04]
+         ctestf {dfv=of} dx, 1234
+# CHECK: ctestf {dfv=of} ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x44,0x0b,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestf {dfv=of} ecx, 123456
+# CHECK: ctestf {dfv=of} r9, 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x0b,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestf {dfv=of} r9, 123456
+# CHECK: ctestf {dfv=of} dl, bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x0b,0x84,0xda]
+         ctestf {dfv=of} dl, bl
+# CHECK: ctestf {dfv=of} ax, dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x0b,0x85,0xd0]
+         ctestf {dfv=of} ax, dx
+# CHECK: ctestf {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0b,0x85,0xca]
+         ctestf {dfv=of} edx, ecx
+# CHECK: ctestf {dfv=of} r15, r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x0b,0x85,0xcf]
+         ctestf {dfv=of} r15, r9
+# CHECK: ctestl {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: encoding: [0x62,0xd4,0x44,0x0c,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestl {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: ctestl {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: encoding: [0x62,0xd4,0x45,0x0c,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestl {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: ctestl {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x0c,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestl {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestl {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0x44,0x0c,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestl {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestl {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: encoding: [0x62,0xd4,0x44,0x0c,0x84,0x5c,0x80,0x7b]
+         ctestl {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: ctestl {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: encoding: [0x62,0xd4,0x45,0x0c,0x85,0x54,0x80,0x7b]
+         ctestl {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: ctestl {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: encoding: [0x62,0xd4,0x44,0x0c,0x85,0x4c,0x80,0x7b]
+         ctestl {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: ctestl {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x0c,0x85,0x4c,0x80,0x7b]
+         ctestl {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: ctestl {dfv=of} bl, 123
+# CHECK: encoding: [0x62,0xf4,0x44,0x0c,0xf6,0xc3,0x7b]
+         ctestl {dfv=of} bl, 123
+# CHECK: ctestl {dfv=of} dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x45,0x0c,0xf7,0xc2,0xd2,0x04]
+         ctestl {dfv=of} dx, 1234
+# CHECK: ctestl {dfv=of} ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x44,0x0c,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestl {dfv=of} ecx, 123456
+# CHECK: ctestl {dfv=of} r9, 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x0c,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestl {dfv=of} r9, 123456
+# CHECK: ctestl {dfv=of} dl, bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x0c,0x84,0xda]
+         ctestl {dfv=of} dl, bl
+# CHECK: ctestl {dfv=of} ax, dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x0c,0x85,0xd0]
+         ctestl {dfv=of} ax, dx
+# CHECK: ctestl {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0c,0x85,0xca]
+         ctestl {dfv=of} edx, ecx
+# CHECK: ctestl {dfv=of} r15, r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x0c,0x85,0xcf]
+         ctestl {dfv=of} r15, r9
+# CHECK: ctestle {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: encoding: [0x62,0xd4,0x44,0x0e,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestle {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: ctestle {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: encoding: [0x62,0xd4,0x45,0x0e,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestle {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: ctestle {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x0e,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestle {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestle {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0x44,0x0e,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestle {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestle {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: encoding: [0x62,0xd4,0x44,0x0e,0x84,0x5c,0x80,0x7b]
+         ctestle {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: ctestle {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: encoding: [0x62,0xd4,0x45,0x0e,0x85,0x54,0x80,0x7b]
+         ctestle {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: ctestle {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: encoding: [0x62,0xd4,0x44,0x0e,0x85,0x4c,0x80,0x7b]
+         ctestle {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: ctestle {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x0e,0x85,0x4c,0x80,0x7b]
+         ctestle {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: ctestle {dfv=of} bl, 123
+# CHECK: encoding: [0x62,0xf4,0x44,0x0e,0xf6,0xc3,0x7b]
+         ctestle {dfv=of} bl, 123
+# CHECK: ctestle {dfv=of} dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x45,0x0e,0xf7,0xc2,0xd2,0x04]
+         ctestle {dfv=of} dx, 1234
+# CHECK: ctestle {dfv=of} ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x44,0x0e,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestle {dfv=of} ecx, 123456
+# CHECK: ctestle {dfv=of} r9, 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x0e,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestle {dfv=of} r9, 123456
+# CHECK: ctestle {dfv=of} dl, bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x0e,0x84,0xda]
+         ctestle {dfv=of} dl, bl
+# CHECK: ctestle {dfv=of} ax, dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x0e,0x85,0xd0]
+         ctestle {dfv=of} ax, dx
+# CHECK: ctestle {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0e,0x85,0xca]
+         ctestle {dfv=of} edx, ecx
+# CHECK: ctestle {dfv=of} r15, r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x0e,0x85,0xcf]
+         ctestle {dfv=of} r15, r9
+# CHECK: ctestae {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: encoding: [0x62,0xd4,0x44,0x03,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestae {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: ctestae {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: encoding: [0x62,0xd4,0x45,0x03,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestae {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: ctestae {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x03,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestae {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestae {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0x44,0x03,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestae {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestae {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: encoding: [0x62,0xd4,0x44,0x03,0x84,0x5c,0x80,0x7b]
+         ctestae {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: ctestae {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: encoding: [0x62,0xd4,0x45,0x03,0x85,0x54,0x80,0x7b]
+         ctestae {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: ctestae {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: encoding: [0x62,0xd4,0x44,0x03,0x85,0x4c,0x80,0x7b]
+         ctestae {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: ctestae {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x03,0x85,0x4c,0x80,0x7b]
+         ctestae {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: ctestae {dfv=of} bl, 123
+# CHECK: encoding: [0x62,0xf4,0x44,0x03,0xf6,0xc3,0x7b]
+         ctestae {dfv=of} bl, 123
+# CHECK: ctestae {dfv=of} dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x45,0x03,0xf7,0xc2,0xd2,0x04]
+         ctestae {dfv=of} dx, 1234
+# CHECK: ctestae {dfv=of} ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x44,0x03,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestae {dfv=of} ecx, 123456
+# CHECK: ctestae {dfv=of} r9, 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x03,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestae {dfv=of} r9, 123456
+# CHECK: ctestae {dfv=of} dl, bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x03,0x84,0xda]
+         ctestae {dfv=of} dl, bl
+# CHECK: ctestae {dfv=of} ax, dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x03,0x85,0xd0]
+         ctestae {dfv=of} ax, dx
+# CHECK: ctestae {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x03,0x85,0xca]
+         ctestae {dfv=of} edx, ecx
+# CHECK: ctestae {dfv=of} r15, r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x03,0x85,0xcf]
+         ctestae {dfv=of} r15, r9
+# CHECK: ctesta {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: encoding: [0x62,0xd4,0x44,0x07,0xf6,0x44,0x80,0x7b,0x7b]
+         ctesta {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: ctesta {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: encoding: [0x62,0xd4,0x45,0x07,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctesta {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: ctesta {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x07,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctesta {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctesta {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0x44,0x07,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctesta {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctesta {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: encoding: [0x62,0xd4,0x44,0x07,0x84,0x5c,0x80,0x7b]
+         ctesta {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: ctesta {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: encoding: [0x62,0xd4,0x45,0x07,0x85,0x54,0x80,0x7b]
+         ctesta {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: ctesta {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: encoding: [0x62,0xd4,0x44,0x07,0x85,0x4c,0x80,0x7b]
+         ctesta {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: ctesta {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x07,0x85,0x4c,0x80,0x7b]
+         ctesta {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: ctesta {dfv=of} bl, 123
+# CHECK: encoding: [0x62,0xf4,0x44,0x07,0xf6,0xc3,0x7b]
+         ctesta {dfv=of} bl, 123
+# CHECK: ctesta {dfv=of} dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x45,0x07,0xf7,0xc2,0xd2,0x04]
+         ctesta {dfv=of} dx, 1234
+# CHECK: ctesta {dfv=of} ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x44,0x07,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctesta {dfv=of} ecx, 123456
+# CHECK: ctesta {dfv=of} r9, 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x07,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctesta {dfv=of} r9, 123456
+# CHECK: ctesta {dfv=of} dl, bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x07,0x84,0xda]
+         ctesta {dfv=of} dl, bl
+# CHECK: ctesta {dfv=of} ax, dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x07,0x85,0xd0]
+         ctesta {dfv=of} ax, dx
+# CHECK: ctesta {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x07,0x85,0xca]
+         ctesta {dfv=of} edx, ecx
+# CHECK: ctesta {dfv=of} r15, r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x07,0x85,0xcf]
+         ctesta {dfv=of} r15, r9
+# CHECK: ctestge {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: encoding: [0x62,0xd4,0x44,0x0d,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestge {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: ctestge {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: encoding: [0x62,0xd4,0x45,0x0d,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestge {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: ctestge {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x0d,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestge {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestge {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0x44,0x0d,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestge {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestge {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: encoding: [0x62,0xd4,0x44,0x0d,0x84,0x5c,0x80,0x7b]
+         ctestge {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: ctestge {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: encoding: [0x62,0xd4,0x45,0x0d,0x85,0x54,0x80,0x7b]
+         ctestge {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: ctestge {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: encoding: [0x62,0xd4,0x44,0x0d,0x85,0x4c,0x80,0x7b]
+         ctestge {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: ctestge {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x0d,0x85,0x4c,0x80,0x7b]
+         ctestge {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: ctestge {dfv=of} bl, 123
+# CHECK: encoding: [0x62,0xf4,0x44,0x0d,0xf6,0xc3,0x7b]
+         ctestge {dfv=of} bl, 123
+# CHECK: ctestge {dfv=of} dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x45,0x0d,0xf7,0xc2,0xd2,0x04]
+         ctestge {dfv=of} dx, 1234
+# CHECK: ctestge {dfv=of} ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x44,0x0d,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestge {dfv=of} ecx, 123456
+# CHECK: ctestge {dfv=of} r9, 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x0d,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestge {dfv=of} r9, 123456
+# CHECK: ctestge {dfv=of} dl, bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x0d,0x84,0xda]
+         ctestge {dfv=of} dl, bl
+# CHECK: ctestge {dfv=of} ax, dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x0d,0x85,0xd0]
+         ctestge {dfv=of} ax, dx
+# CHECK: ctestge {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0d,0x85,0xca]
+         ctestge {dfv=of} edx, ecx
+# CHECK: ctestge {dfv=of} r15, r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x0d,0x85,0xcf]
+         ctestge {dfv=of} r15, r9
+# CHECK: ctestg {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: encoding: [0x62,0xd4,0x44,0x0f,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestg {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: ctestg {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: encoding: [0x62,0xd4,0x45,0x0f,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestg {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: ctestg {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x0f,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestg {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestg {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0x44,0x0f,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestg {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestg {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: encoding: [0x62,0xd4,0x44,0x0f,0x84,0x5c,0x80,0x7b]
+         ctestg {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: ctestg {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: encoding: [0x62,0xd4,0x45,0x0f,0x85,0x54,0x80,0x7b]
+         ctestg {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: ctestg {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: encoding: [0x62,0xd4,0x44,0x0f,0x85,0x4c,0x80,0x7b]
+         ctestg {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: ctestg {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x0f,0x85,0x4c,0x80,0x7b]
+         ctestg {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: ctestg {dfv=of} bl, 123
+# CHECK: encoding: [0x62,0xf4,0x44,0x0f,0xf6,0xc3,0x7b]
+         ctestg {dfv=of} bl, 123
+# CHECK: ctestg {dfv=of} dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x45,0x0f,0xf7,0xc2,0xd2,0x04]
+         ctestg {dfv=of} dx, 1234
+# CHECK: ctestg {dfv=of} ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x44,0x0f,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestg {dfv=of} ecx, 123456
+# CHECK: ctestg {dfv=of} r9, 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x0f,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestg {dfv=of} r9, 123456
+# CHECK: ctestg {dfv=of} dl, bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x0f,0x84,0xda]
+         ctestg {dfv=of} dl, bl
+# CHECK: ctestg {dfv=of} ax, dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x0f,0x85,0xd0]
+         ctestg {dfv=of} ax, dx
+# CHECK: ctestg {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0f,0x85,0xca]
+         ctestg {dfv=of} edx, ecx
+# CHECK: ctestg {dfv=of} r15, r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x0f,0x85,0xcf]
+         ctestg {dfv=of} r15, r9
+# CHECK: ctestno {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: encoding: [0x62,0xd4,0x44,0x01,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestno {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: ctestno {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: encoding: [0x62,0xd4,0x45,0x01,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestno {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: ctestno {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x01,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestno {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestno {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0x44,0x01,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestno {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestno {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: encoding: [0x62,0xd4,0x44,0x01,0x84,0x5c,0x80,0x7b]
+         ctestno {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: ctestno {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: encoding: [0x62,0xd4,0x45,0x01,0x85,0x54,0x80,0x7b]
+         ctestno {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: ctestno {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: encoding: [0x62,0xd4,0x44,0x01,0x85,0x4c,0x80,0x7b]
+         ctestno {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: ctestno {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x01,0x85,0x4c,0x80,0x7b]
+         ctestno {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: ctestno {dfv=of} bl, 123
+# CHECK: encoding: [0x62,0xf4,0x44,0x01,0xf6,0xc3,0x7b]
+         ctestno {dfv=of} bl, 123
+# CHECK: ctestno {dfv=of} dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x45,0x01,0xf7,0xc2,0xd2,0x04]
+         ctestno {dfv=of} dx, 1234
+# CHECK: ctestno {dfv=of} ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x44,0x01,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestno {dfv=of} ecx, 123456
+# CHECK: ctestno {dfv=of} r9, 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x01,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestno {dfv=of} r9, 123456
+# CHECK: ctestno {dfv=of} dl, bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x01,0x84,0xda]
+         ctestno {dfv=of} dl, bl
+# CHECK: ctestno {dfv=of} ax, dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x01,0x85,0xd0]
+         ctestno {dfv=of} ax, dx
+# CHECK: ctestno {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x01,0x85,0xca]
+         ctestno {dfv=of} edx, ecx
+# CHECK: ctestno {dfv=of} r15, r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x01,0x85,0xcf]
+         ctestno {dfv=of} r15, r9
+# CHECK: ctestns {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: encoding: [0x62,0xd4,0x44,0x09,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestns {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: ctestns {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: encoding: [0x62,0xd4,0x45,0x09,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestns {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: ctestns {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x09,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestns {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestns {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0x44,0x09,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestns {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestns {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: encoding: [0x62,0xd4,0x44,0x09,0x84,0x5c,0x80,0x7b]
+         ctestns {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: ctestns {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: encoding: [0x62,0xd4,0x45,0x09,0x85,0x54,0x80,0x7b]
+         ctestns {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: ctestns {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: encoding: [0x62,0xd4,0x44,0x09,0x85,0x4c,0x80,0x7b]
+         ctestns {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: ctestns {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x09,0x85,0x4c,0x80,0x7b]
+         ctestns {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: ctestns {dfv=of} bl, 123
+# CHECK: encoding: [0x62,0xf4,0x44,0x09,0xf6,0xc3,0x7b]
+         ctestns {dfv=of} bl, 123
+# CHECK: ctestns {dfv=of} dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x45,0x09,0xf7,0xc2,0xd2,0x04]
+         ctestns {dfv=of} dx, 1234
+# CHECK: ctestns {dfv=of} ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x44,0x09,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestns {dfv=of} ecx, 123456
+# CHECK: ctestns {dfv=of} r9, 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x09,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestns {dfv=of} r9, 123456
+# CHECK: ctestns {dfv=of} dl, bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x09,0x84,0xda]
+         ctestns {dfv=of} dl, bl
+# CHECK: ctestns {dfv=of} ax, dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x09,0x85,0xd0]
+         ctestns {dfv=of} ax, dx
+# CHECK: ctestns {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x09,0x85,0xca]
+         ctestns {dfv=of} edx, ecx
+# CHECK: ctestns {dfv=of} r15, r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x09,0x85,0xcf]
+         ctestns {dfv=of} r15, r9
+# CHECK: ctestne {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: encoding: [0x62,0xd4,0x44,0x05,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestne {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: ctestne {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: encoding: [0x62,0xd4,0x45,0x05,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestne {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: ctestne {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x05,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestne {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestne {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0x44,0x05,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestne {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestne {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: encoding: [0x62,0xd4,0x44,0x05,0x84,0x5c,0x80,0x7b]
+         ctestne {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: ctestne {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: encoding: [0x62,0xd4,0x45,0x05,0x85,0x54,0x80,0x7b]
+         ctestne {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: ctestne {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: encoding: [0x62,0xd4,0x44,0x05,0x85,0x4c,0x80,0x7b]
+         ctestne {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: ctestne {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x05,0x85,0x4c,0x80,0x7b]
+         ctestne {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: ctestne {dfv=of} bl, 123
+# CHECK: encoding: [0x62,0xf4,0x44,0x05,0xf6,0xc3,0x7b]
+         ctestne {dfv=of} bl, 123
+# CHECK: ctestne {dfv=of} dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x45,0x05,0xf7,0xc2,0xd2,0x04]
+         ctestne {dfv=of} dx, 1234
+# CHECK: ctestne {dfv=of} ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x44,0x05,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestne {dfv=of} ecx, 123456
+# CHECK: ctestne {dfv=of} r9, 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x05,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestne {dfv=of} r9, 123456
+# CHECK: ctestne {dfv=of} dl, bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x05,0x84,0xda]
+         ctestne {dfv=of} dl, bl
+# CHECK: ctestne {dfv=of} ax, dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x05,0x85,0xd0]
+         ctestne {dfv=of} ax, dx
+# CHECK: ctestne {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x05,0x85,0xca]
+         ctestne {dfv=of} edx, ecx
+# CHECK: ctestne {dfv=of} r15, r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x05,0x85,0xcf]
+         ctestne {dfv=of} r15, r9
+# CHECK: ctesto {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: encoding: [0x62,0xd4,0x44,0x00,0xf6,0x44,0x80,0x7b,0x7b]
+         ctesto {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: ctesto {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: encoding: [0x62,0xd4,0x45,0x00,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctesto {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: ctesto {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x00,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctesto {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctesto {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0x44,0x00,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctesto {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctesto {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: encoding: [0x62,0xd4,0x44,0x00,0x84,0x5c,0x80,0x7b]
+         ctesto {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: ctesto {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: encoding: [0x62,0xd4,0x45,0x00,0x85,0x54,0x80,0x7b]
+         ctesto {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: ctesto {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: encoding: [0x62,0xd4,0x44,0x00,0x85,0x4c,0x80,0x7b]
+         ctesto {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: ctesto {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x00,0x85,0x4c,0x80,0x7b]
+         ctesto {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: ctesto {dfv=of} bl, 123
+# CHECK: encoding: [0x62,0xf4,0x44,0x00,0xf6,0xc3,0x7b]
+         ctesto {dfv=of} bl, 123
+# CHECK: ctesto {dfv=of} dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x45,0x00,0xf7,0xc2,0xd2,0x04]
+         ctesto {dfv=of} dx, 1234
+# CHECK: ctesto {dfv=of} ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x44,0x00,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctesto {dfv=of} ecx, 123456
+# CHECK: ctesto {dfv=of} r9, 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x00,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctesto {dfv=of} r9, 123456
+# CHECK: ctesto {dfv=of} dl, bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x00,0x84,0xda]
+         ctesto {dfv=of} dl, bl
+# CHECK: ctesto {dfv=of} ax, dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x00,0x85,0xd0]
+         ctesto {dfv=of} ax, dx
+# CHECK: ctesto {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x00,0x85,0xca]
+         ctesto {dfv=of} edx, ecx
+# CHECK: ctesto {dfv=of} r15, r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x00,0x85,0xcf]
+         ctesto {dfv=of} r15, r9
+# CHECK: ctests {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: encoding: [0x62,0xd4,0x44,0x08,0xf6,0x44,0x80,0x7b,0x7b]
+         ctests {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: ctests {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: encoding: [0x62,0xd4,0x45,0x08,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctests {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: ctests {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x08,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctests {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctests {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0x44,0x08,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctests {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctests {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: encoding: [0x62,0xd4,0x44,0x08,0x84,0x5c,0x80,0x7b]
+         ctests {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: ctests {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: encoding: [0x62,0xd4,0x45,0x08,0x85,0x54,0x80,0x7b]
+         ctests {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: ctests {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: encoding: [0x62,0xd4,0x44,0x08,0x85,0x4c,0x80,0x7b]
+         ctests {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: ctests {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x08,0x85,0x4c,0x80,0x7b]
+         ctests {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: ctests {dfv=of} bl, 123
+# CHECK: encoding: [0x62,0xf4,0x44,0x08,0xf6,0xc3,0x7b]
+         ctests {dfv=of} bl, 123
+# CHECK: ctests {dfv=of} dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x45,0x08,0xf7,0xc2,0xd2,0x04]
+         ctests {dfv=of} dx, 1234
+# CHECK: ctests {dfv=of} ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x44,0x08,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctests {dfv=of} ecx, 123456
+# CHECK: ctests {dfv=of} r9, 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x08,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctests {dfv=of} r9, 123456
+# CHECK: ctests {dfv=of} dl, bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x08,0x84,0xda]
+         ctests {dfv=of} dl, bl
+# CHECK: ctests {dfv=of} ax, dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x08,0x85,0xd0]
+         ctests {dfv=of} ax, dx
+# CHECK: ctests {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x08,0x85,0xca]
+         ctests {dfv=of} edx, ecx
+# CHECK: ctests {dfv=of} r15, r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x08,0x85,0xcf]
+         ctests {dfv=of} r15, r9
+# CHECK: ctestt {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: encoding: [0x62,0xd4,0x44,0x0a,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestt {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: ctestt {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: encoding: [0x62,0xd4,0x45,0x0a,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestt {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: ctestt {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x0a,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestt {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestt {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0x44,0x0a,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestt {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestt {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: encoding: [0x62,0xd4,0x44,0x0a,0x84,0x5c,0x80,0x7b]
+         ctestt {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: ctestt {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: encoding: [0x62,0xd4,0x45,0x0a,0x85,0x54,0x80,0x7b]
+         ctestt {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: ctestt {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: encoding: [0x62,0xd4,0x44,0x0a,0x85,0x4c,0x80,0x7b]
+         ctestt {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: ctestt {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x0a,0x85,0x4c,0x80,0x7b]
+         ctestt {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: ctestt {dfv=of} bl, 123
+# CHECK: encoding: [0x62,0xf4,0x44,0x0a,0xf6,0xc3,0x7b]
+         ctestt {dfv=of} bl, 123
+# CHECK: ctestt {dfv=of} dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x45,0x0a,0xf7,0xc2,0xd2,0x04]
+         ctestt {dfv=of} dx, 1234
+# CHECK: ctestt {dfv=of} ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x44,0x0a,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestt {dfv=of} ecx, 123456
+# CHECK: ctestt {dfv=of} r9, 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x0a,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestt {dfv=of} r9, 123456
+# CHECK: ctestt {dfv=of} dl, bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x0a,0x84,0xda]
+         ctestt {dfv=of} dl, bl
+# CHECK: ctestt {dfv=of} ax, dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x0a,0x85,0xd0]
+         ctestt {dfv=of} ax, dx
+# CHECK: ctestt {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0a,0x85,0xca]
+         ctestt {dfv=of} edx, ecx
+# CHECK: ctestt {dfv=of} r15, r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x0a,0x85,0xcf]
+         ctestt {dfv=of} r15, r9
+# CHECK: cteste {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: encoding: [0x62,0xd4,0x44,0x04,0xf6,0x44,0x80,0x7b,0x7b]
+         cteste {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: cteste {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: encoding: [0x62,0xd4,0x45,0x04,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         cteste {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: cteste {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x04,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         cteste {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: cteste {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0x44,0x04,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         cteste {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: cteste {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: encoding: [0x62,0xd4,0x44,0x04,0x84,0x5c,0x80,0x7b]
+         cteste {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: cteste {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: encoding: [0x62,0xd4,0x45,0x04,0x85,0x54,0x80,0x7b]
+         cteste {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: cteste {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: encoding: [0x62,0xd4,0x44,0x04,0x85,0x4c,0x80,0x7b]
+         cteste {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: cteste {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x04,0x85,0x4c,0x80,0x7b]
+         cteste {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: cteste {dfv=of} bl, 123
+# CHECK: encoding: [0x62,0xf4,0x44,0x04,0xf6,0xc3,0x7b]
+         cteste {dfv=of} bl, 123
+# CHECK: cteste {dfv=of} dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x45,0x04,0xf7,0xc2,0xd2,0x04]
+         cteste {dfv=of} dx, 1234
+# CHECK: cteste {dfv=of} ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x44,0x04,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         cteste {dfv=of} ecx, 123456
+# CHECK: cteste {dfv=of} r9, 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x04,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         cteste {dfv=of} r9, 123456
+# CHECK: cteste {dfv=of} dl, bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x04,0x84,0xda]
+         cteste {dfv=of} dl, bl
+# CHECK: cteste {dfv=of} ax, dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x04,0x85,0xd0]
+         cteste {dfv=of} ax, dx
+# CHECK: cteste {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x04,0x85,0xca]
+         cteste {dfv=of} edx, ecx
+# CHECK: cteste {dfv=of} r15, r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x04,0x85,0xcf]
+         cteste {dfv=of} r15, r9

From e89b4bcf32b8f6ddce9d7e95659e9f092a55c021 Mon Sep 17 00:00:00 2001
From: Phoebe Wang <phoebe.wang@intel.com>
Date: Tue, 12 Mar 2024 13:41:49 +0800
Subject: [PATCH 77/95] [X86] Remove SlowDivide tuning from GRTTuning (#84676)

DIV32/64 throughput has improved in the Atom architecture since Goldmont,
and Alder Lake-E shows similar numbers, so we shouldn't add the SlowDivide
tunings to Gracemont and later products.

Checked against Agner Fog's instruction tables and uops.info.
---
 llvm/lib/Target/X86/X86.td                       |  2 --
 .../CodeGen/X86/bypass-slow-division-tune.ll     | 16 ++++++++++++++++
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index a2a65ce75d6b9..8367f938c0ddf 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -1237,8 +1237,6 @@ def ProcessorFeatures {
   // Gracemont
   list<SubtargetFeature> GRTTuning = [TuningMacroFusion,
                                       TuningSlow3OpsLEA,
-                                      TuningSlowDivide32,
-                                      TuningSlowDivide64,
                                       TuningFastScalarFSQRT,
                                       TuningFastVectorFSQRT,
                                       TuningFast15ByteNOP,
diff --git a/llvm/test/CodeGen/X86/bypass-slow-division-tune.ll b/llvm/test/CodeGen/X86/bypass-slow-division-tune.ll
index 8369a44dcbad2..afecf00113a0a 100644
--- a/llvm/test/CodeGen/X86/bypass-slow-division-tune.ll
+++ b/llvm/test/CodeGen/X86/bypass-slow-division-tune.ll
@@ -4,6 +4,8 @@
 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=x86-64     < %s | FileCheck -check-prefixes=CHECK,REST,X64 %s
 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=silvermont < %s | FileCheck -check-prefixes=CHECK,REST,SLM %s
 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake    < %s | FileCheck -check-prefixes=CHECK,REST,SKL %s
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=goldmont   < %s | FileCheck -check-prefixes=CHECK,REST,GMT %s
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=gracemont  < %s | FileCheck -check-prefixes=CHECK,REST,GMT %s
 ; RUN: llc -profile-summary-huge-working-set-size-threshold=1 -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake    < %s | FileCheck -check-prefixes=HUGEWS %s
 
 ; Verify that div32 is bypassed only for Atoms.
@@ -117,6 +119,13 @@ define i64 @div64(i64 %a, i64 %b) {
 ; SKL-NEXT:    # kill: def $eax killed $eax def $rax
 ; SKL-NEXT:    retq
 ;
+; GMT-LABEL: div64:
+; GMT:       # %bb.0: # %entry
+; GMT-NEXT:    movq %rdi, %rax
+; GMT-NEXT:    cqto
+; GMT-NEXT:    idivq %rsi
+; GMT-NEXT:    retq
+;
 ; HUGEWS-LABEL: div64:
 ; HUGEWS:       # %bb.0: # %entry
 ; HUGEWS-NEXT:    movq %rdi, %rax
@@ -240,6 +249,13 @@ define i64 @div64_hugews(i64 %a, i64 %b) {
 ; SKL-NEXT:    # kill: def $eax killed $eax def $rax
 ; SKL-NEXT:    retq
 ;
+; GMT-LABEL: div64_hugews:
+; GMT:       # %bb.0:
+; GMT-NEXT:    movq %rdi, %rax
+; GMT-NEXT:    cqto
+; GMT-NEXT:    idivq %rsi
+; GMT-NEXT:    retq
+;
 ; HUGEWS-LABEL: div64_hugews:
 ; HUGEWS:       # %bb.0:
 ; HUGEWS-NEXT:    movq %rdi, %rax

From f95710c76519c611868c16f92586b6d0baedad54 Mon Sep 17 00:00:00 2001
From: Slava Zakharin <szakharin@nvidia.com>
Date: Mon, 11 Mar 2024 23:09:44 -0700
Subject: [PATCH 78/95] [flang] Fixed compiler build on glibc 2.17 systems
 after 3149c93. (#84873)

---
 flang/include/flang/Evaluate/integer.h   | 4 ++++
 flang/include/flang/Evaluate/real.h      | 4 ++++
 flang/lib/Evaluate/fold-implementation.h | 4 ++++
 3 files changed, 12 insertions(+)

diff --git a/flang/include/flang/Evaluate/integer.h b/flang/include/flang/Evaluate/integer.h
index 31768c21daae6..7395645701265 100644
--- a/flang/include/flang/Evaluate/integer.h
+++ b/flang/include/flang/Evaluate/integer.h
@@ -27,6 +27,10 @@
 #include <string>
 #include <type_traits>
 
+// Some environments, viz. glibc 2.17, allow the macro HUGE
+// to leak out of <math.h>.
+#undef HUGE
+
 namespace Fortran::evaluate::value {
 
 // Implements an integer as an assembly of smaller host integer parts
diff --git a/flang/include/flang/Evaluate/real.h b/flang/include/flang/Evaluate/real.h
index 5266bd0ef64bf..d0da9634651f3 100644
--- a/flang/include/flang/Evaluate/real.h
+++ b/flang/include/flang/Evaluate/real.h
@@ -18,6 +18,10 @@
 #include <limits>
 #include <string>
 
+// Some environments, viz. glibc 2.17, allow the macro HUGE
+// to leak out of <math.h>.
+#undef HUGE
+
 namespace llvm {
 class raw_ostream;
 }
diff --git a/flang/lib/Evaluate/fold-implementation.h b/flang/lib/Evaluate/fold-implementation.h
index 6b3c9416724cb..9dd8c3843465d 100644
--- a/flang/lib/Evaluate/fold-implementation.h
+++ b/flang/lib/Evaluate/fold-implementation.h
@@ -39,6 +39,10 @@
 #include <type_traits>
 #include <variant>
 
+// Some environments, viz. glibc 2.17, allow the macro HUGE
+// to leak out of <math.h>.
+#undef HUGE
+
 namespace Fortran::evaluate {
 
 // Utilities

From 1d900e298449d43547312364751f730b7a0d07d1 Mon Sep 17 00:00:00 2001
From: "Dhruv Chawla (work)" <dhruvc@nvidia.com>
Date: Tue, 12 Mar 2024 11:57:07 +0530
Subject: [PATCH 79/95] [AArch64][GlobalISel] Avoid generating inserts for
 undefs when selecting G_BUILD_VECTOR (#84452)

It is safe to skip undef elements when selecting G_BUILD_VECTOR: the inserts
emitted for them would only copy values from arbitrary registers.
---
 .../GISel/AArch64InstructionSelector.cpp      |  33 +-
 .../GlobalISel/select-build-vector.mir        |   6 +-
 .../select-shufflevec-undef-mask-elt.mir      |  18 +-
 llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll  |   1 -
 llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll  |   1 -
 llvm/test/CodeGen/AArch64/abs.ll              |   6 -
 llvm/test/CodeGen/AArch64/arm64-dup.ll        |   6 +-
 llvm/test/CodeGen/AArch64/arm64-neon-copy.ll  |  45 +--
 llvm/test/CodeGen/AArch64/bitcast.ll          |   2 -
 llvm/test/CodeGen/AArch64/bswap.ll            |   1 -
 llvm/test/CodeGen/AArch64/fabs.ll             |  26 +-
 llvm/test/CodeGen/AArch64/faddsub.ll          |  62 ++--
 llvm/test/CodeGen/AArch64/fcmp.ll             | 286 ++++++++----------
 llvm/test/CodeGen/AArch64/fcopysign.ll        |   4 -
 llvm/test/CodeGen/AArch64/fcvt.ll             | 182 +++++------
 llvm/test/CodeGen/AArch64/fdiv.ll             |  31 +-
 llvm/test/CodeGen/AArch64/fexplog.ll          |  10 -
 llvm/test/CodeGen/AArch64/fminimummaximum.ll  | 118 ++++----
 llvm/test/CodeGen/AArch64/fminmax.ll          | 118 ++++----
 llvm/test/CodeGen/AArch64/fmla.ll             | 164 +++++-----
 llvm/test/CodeGen/AArch64/fmul.ll             |  31 +-
 llvm/test/CodeGen/AArch64/fneg.ll             |  26 +-
 llvm/test/CodeGen/AArch64/fpext.ll            |   2 -
 llvm/test/CodeGen/AArch64/fpow.ll             |   2 -
 llvm/test/CodeGen/AArch64/fpowi.ll            |   2 -
 llvm/test/CodeGen/AArch64/fptoi.ll            |  20 --
 llvm/test/CodeGen/AArch64/fptrunc.ll          |   8 -
 llvm/test/CodeGen/AArch64/frem.ll             |   2 -
 llvm/test/CodeGen/AArch64/fsincos.ll          |   4 -
 llvm/test/CodeGen/AArch64/fsqrt.ll            |  18 +-
 llvm/test/CodeGen/AArch64/icmp.ll             |  14 +-
 llvm/test/CodeGen/AArch64/insertextract.ll    |   4 -
 llvm/test/CodeGen/AArch64/itofp.ll            |  60 ----
 llvm/test/CodeGen/AArch64/llvm.exp10.ll       |  11 +-
 llvm/test/CodeGen/AArch64/load.ll             |   4 -
 llvm/test/CodeGen/AArch64/sext.ll             |   7 -
 llvm/test/CodeGen/AArch64/shift.ll            |  33 --
 llvm/test/CodeGen/AArch64/shufflevector.ll    |  26 +-
 llvm/test/CodeGen/AArch64/xtn.ll              |   3 -
 llvm/test/CodeGen/AArch64/zext.ll             |  13 +-
 40 files changed, 545 insertions(+), 865 deletions(-)

diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index 0f3c3cb96e6ce..7a49422c064b7 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -5934,13 +5934,16 @@ bool AArch64InstructionSelector::selectBuildVector(MachineInstr &I,
 
   // Keep track of the last MI we inserted. Later on, we might be able to save
   // a copy using it.
-  MachineInstr *PrevMI = nullptr;
+  MachineInstr *PrevMI = ScalarToVec;
   for (unsigned i = 2, e = DstSize / EltSize + 1; i < e; ++i) {
     // Note that if we don't do a subregister copy, we can end up making an
     // extra register.
-    PrevMI = &*emitLaneInsert(std::nullopt, DstVec, I.getOperand(i).getReg(),
-                              i - 1, RB, MIB);
-    DstVec = PrevMI->getOperand(0).getReg();
+    Register OpReg = I.getOperand(i).getReg();
+    // Do not emit inserts for undef elements.
+    if (!getOpcodeDef<GImplicitDef>(OpReg, MRI)) {
+      PrevMI = &*emitLaneInsert(std::nullopt, DstVec, OpReg, i - 1, RB, MIB);
+      DstVec = PrevMI->getOperand(0).getReg();
+    }
   }
 
   // If DstTy's size in bits is less than 128, then emit a subregister copy
@@ -5973,11 +5976,27 @@ bool AArch64InstructionSelector::selectBuildVector(MachineInstr &I,
     RegOp.setReg(Reg);
     RBI.constrainGenericRegister(DstReg, *RC, MRI);
   } else {
-    // We don't need a subregister copy. Save a copy by re-using the
-    // destination register on the final insert.
-    assert(PrevMI && "PrevMI was null?");
+    // We either have a vector with all elements (except the first one) undef or
+    // at least one non-undef non-first element. In the first case, we need to
+    // constrain the output register ourselves as we may have generated an
+    // INSERT_SUBREG operation which is a generic operation for which the
+    // output regclass cannot be automatically chosen.
+    //
+    // In the second case, there is no need to do this as it may generate an
+    // instruction like INSvi32gpr where the regclass can be automatically
+    // chosen.
+    //
+    // Also, we save a copy by re-using the destination register on the final
+    // insert.
     PrevMI->getOperand(0).setReg(I.getOperand(0).getReg());
     constrainSelectedInstRegOperands(*PrevMI, TII, TRI, RBI);
+
+    Register DstReg = PrevMI->getOperand(0).getReg();
+    if (PrevMI == ScalarToVec && DstReg.isVirtual()) {
+      const TargetRegisterClass *RC =
+          getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(DstVec, MRI, TRI));
+      RBI.constrainGenericRegister(DstReg, *RC, MRI);
+    }
   }
 
   I.eraseFromParent();
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-build-vector.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-build-vector.mir
index 5de97256fc85a..71a2bd2ddcc6e 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/select-build-vector.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-build-vector.mir
@@ -266,12 +266,8 @@ body:             |
     ; CHECK-LABEL: name: undef_elts_different_regbanks
     ; CHECK: liveins: $w0
     ; CHECK: %val:gpr32all = COPY $w0
-    ; CHECK: %undef:gpr32 = IMPLICIT_DEF
     ; CHECK: [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF
-    ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF]], %val, %subreg.ssub
-    ; CHECK: [[INSvi32gpr:%[0-9]+]]:fpr128 = INSvi32gpr [[INSERT_SUBREG]], 1, %undef
-    ; CHECK: [[INSvi32gpr1:%[0-9]+]]:fpr128 = INSvi32gpr [[INSvi32gpr]], 2, %undef
-    ; CHECK: %bv:fpr128 = INSvi32gpr [[INSvi32gpr1]], 3, %undef
+    ; CHECK: %bv:fpr128 = INSERT_SUBREG [[DEF]], %val, %subreg.ssub
     ; CHECK: $q0 = COPY %bv
     ; CHECK: RET_ReallyLR implicit $q0
     %val:gpr(s32) = COPY $w0
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-shufflevec-undef-mask-elt.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-shufflevec-undef-mask-elt.mir
index 6e01723f49935..5f280ae2e3024 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/select-shufflevec-undef-mask-elt.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-shufflevec-undef-mask-elt.mir
@@ -19,20 +19,18 @@ body:             |
     ; CHECK: liveins: $d0
     ; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d0
     ; CHECK: [[DEF:%[0-9]+]]:gpr32 = IMPLICIT_DEF
-    ; CHECK: [[DEF1:%[0-9]+]]:gpr32 = IMPLICIT_DEF
-    ; CHECK: [[DEF2:%[0-9]+]]:fpr128 = IMPLICIT_DEF
-    ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF2]], [[DEF]], %subreg.ssub
-    ; CHECK: [[INSvi32gpr:%[0-9]+]]:fpr128 = INSvi32gpr [[INSERT_SUBREG]], 1, [[DEF1]]
-    ; CHECK: [[COPY1:%[0-9]+]]:fpr64 = COPY [[INSvi32gpr]].dsub
+    ; CHECK: [[DEF1:%[0-9]+]]:fpr128 = IMPLICIT_DEF
+    ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF1]], [[DEF]], %subreg.ssub
+    ; CHECK: [[COPY1:%[0-9]+]]:fpr64 = COPY [[INSERT_SUBREG]].dsub
     ; CHECK: [[ADRP:%[0-9]+]]:gpr64common = ADRP target-flags(aarch64-page) %const.0
     ; CHECK: [[LDRDui:%[0-9]+]]:fpr64 = LDRDui [[ADRP]], target-flags(aarch64-pageoff, aarch64-nc) %const.0
+    ; CHECK: [[DEF2:%[0-9]+]]:fpr128 = IMPLICIT_DEF
+    ; CHECK: [[INSERT_SUBREG1:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF2]], [[COPY]], %subreg.dsub
     ; CHECK: [[DEF3:%[0-9]+]]:fpr128 = IMPLICIT_DEF
-    ; CHECK: [[INSERT_SUBREG1:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF3]], [[COPY]], %subreg.dsub
-    ; CHECK: [[DEF4:%[0-9]+]]:fpr128 = IMPLICIT_DEF
-    ; CHECK: [[INSERT_SUBREG2:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF4]], [[COPY1]], %subreg.dsub
+    ; CHECK: [[INSERT_SUBREG2:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF3]], [[COPY1]], %subreg.dsub
     ; CHECK: [[INSvi64lane:%[0-9]+]]:fpr128 = INSvi64lane [[INSERT_SUBREG1]], 1, [[INSERT_SUBREG2]], 0
-    ; CHECK: [[DEF5:%[0-9]+]]:fpr128 = IMPLICIT_DEF
-    ; CHECK: [[INSERT_SUBREG3:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF5]], [[LDRDui]], %subreg.dsub
+    ; CHECK: [[DEF4:%[0-9]+]]:fpr128 = IMPLICIT_DEF
+    ; CHECK: [[INSERT_SUBREG3:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF4]], [[LDRDui]], %subreg.dsub
     ; CHECK: [[TBLv16i8One:%[0-9]+]]:fpr128 = TBLv16i8One [[INSvi64lane]], [[INSERT_SUBREG3]]
     ; CHECK: [[COPY2:%[0-9]+]]:fpr64 = COPY [[TBLv16i8One]].dsub
     ; CHECK: $d0 = COPY [[COPY2]]
diff --git a/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll b/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll
index 273bf559554c9..f47da47002fbc 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll
@@ -77,7 +77,6 @@ define <1 x i32> @test_bitf_v1i32(<1 x i32> %A, <1 x i32> %B, <1 x i32> %C) {
 ; CHECK-GI-NEXT:    and w8, w8, w10
 ; CHECK-GI-NEXT:    orr w8, w9, w8
 ; CHECK-GI-NEXT:    fmov s0, w8
-; CHECK-GI-NEXT:    mov v0.s[1], w8
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
   %neg = xor <1 x i32> %C, <i32 -1>
diff --git a/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll b/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
index a92ae39c69724..5c006508d284f 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
@@ -79,7 +79,6 @@ define <1 x i32> @test_bit_v1i32(<1 x i32> %A, <1 x i32> %B, <1 x i32> %C) {
 ; CHECK-GI-NEXT:    bic w8, w10, w8
 ; CHECK-GI-NEXT:    orr w8, w9, w8
 ; CHECK-GI-NEXT:    fmov s0, w8
-; CHECK-GI-NEXT:    mov v0.s[1], w8
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
   %and = and <1 x i32> %C, %B
diff --git a/llvm/test/CodeGen/AArch64/abs.ll b/llvm/test/CodeGen/AArch64/abs.ll
index f2cad6631dc26..e00f70b94e3b4 100644
--- a/llvm/test/CodeGen/AArch64/abs.ll
+++ b/llvm/test/CodeGen/AArch64/abs.ll
@@ -252,7 +252,6 @@ define <1 x i32> @abs_v1i32(<1 x i32> %a){
 ; CHECK-GI-NEXT:    add w8, w8, w9
 ; CHECK-GI-NEXT:    eor w8, w8, w9
 ; CHECK-GI-NEXT:    fmov s0, w8
-; CHECK-GI-NEXT:    mov v0.s[1], w8
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -308,11 +307,6 @@ define <3 x i8> @abs_v3i8(<3 x i8> %a){
 ; CHECK-GI-NEXT:    mov v0.b[1], v1.b[0]
 ; CHECK-GI-NEXT:    fmov s1, w2
 ; CHECK-GI-NEXT:    mov v0.b[2], v1.b[0]
-; CHECK-GI-NEXT:    mov v0.b[3], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[4], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[5], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[6], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[7], v0.b[0]
 ; CHECK-GI-NEXT:    abs v0.8b, v0.8b
 ; CHECK-GI-NEXT:    umov w0, v0.b[0]
 ; CHECK-GI-NEXT:    umov w1, v0.b[1]
diff --git a/llvm/test/CodeGen/AArch64/arm64-dup.ll b/llvm/test/CodeGen/AArch64/arm64-dup.ll
index 2112944cc8479..2bf5419e54830 100644
--- a/llvm/test/CodeGen/AArch64/arm64-dup.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-dup.ll
@@ -373,11 +373,9 @@ define <4 x i16> @test_build_illegal(<4 x i32> %in) {
 ;
 ; CHECK-GI-LABEL: test_build_illegal:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    mov.h v1[1], v0[0]
 ; CHECK-GI-NEXT:    mov s0, v0[3]
-; CHECK-GI-NEXT:    mov.h v1[2], v0[0]
-; CHECK-GI-NEXT:    mov.h v1[3], v0[0]
-; CHECK-GI-NEXT:    fmov d0, d1
+; CHECK-GI-NEXT:    mov.h v0[3], v0[0]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
   %val = extractelement <4 x i32> %in, i32 3
   %smallval = trunc i32 %val to i16
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
index cc3d80008143c..d282bee81827f 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -1346,7 +1346,6 @@ define <2 x i32> @scalar_to_vector.v2i32(i32 %a) {
 ; CHECK-GI-LABEL: scalar_to_vector.v2i32:
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    fmov s0, w0
-; CHECK-GI-NEXT:    mov v0.s[1], w8
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
   %b = insertelement <2 x i32> undef, i32 %a, i32 0
@@ -1354,33 +1353,19 @@ define <2 x i32> @scalar_to_vector.v2i32(i32 %a) {
 }
 
 define <4 x i32> @scalar_to_vector.v4i32(i32 %a) {
-; CHECK-SD-LABEL: scalar_to_vector.v4i32:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    fmov s0, w0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: scalar_to_vector.v4i32:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    fmov s0, w0
-; CHECK-GI-NEXT:    mov v0.s[1], w8
-; CHECK-GI-NEXT:    mov v0.s[2], w8
-; CHECK-GI-NEXT:    mov v0.s[3], w8
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: scalar_to_vector.v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov s0, w0
+; CHECK-NEXT:    ret
   %b = insertelement <4 x i32> undef, i32 %a, i32 0
   ret <4 x i32> %b
 }
 
 define <2 x i64> @scalar_to_vector.v2i64(i64 %a) {
-; CHECK-SD-LABEL: scalar_to_vector.v2i64:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    fmov d0, x0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: scalar_to_vector.v2i64:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    fmov d0, x0
-; CHECK-GI-NEXT:    mov v0.d[1], x8
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: scalar_to_vector.v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov d0, x0
+; CHECK-NEXT:    ret
   %b = insertelement <2 x i64> undef, i64 %a, i32 0
   ret <2 x i64> %b
 }
@@ -1900,14 +1885,6 @@ define <16 x i8> @test_concat_v16i8_v8i8_v16i8(<8 x i8> %x, <16 x i8> %y) #0 {
 ; CHECK-GI-NEXT:    mov v0.b[5], v6.b[0]
 ; CHECK-GI-NEXT:    mov v0.b[6], v7.b[0]
 ; CHECK-GI-NEXT:    mov v0.b[7], v16.b[0]
-; CHECK-GI-NEXT:    mov v0.b[8], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[9], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[10], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[11], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[12], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[13], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[14], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[15], v0.b[0]
 ; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -2123,10 +2100,6 @@ define <8 x i16> @test_concat_v8i16_v4i16_v8i16(<4 x i16> %x, <8 x i16> %y) #0 {
 ; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI131_0]
 ; CHECK-GI-NEXT:    mov v0.h[2], v3.h[0]
 ; CHECK-GI-NEXT:    mov v0.h[3], v4.h[0]
-; CHECK-GI-NEXT:    mov v0.h[4], v0.h[0]
-; CHECK-GI-NEXT:    mov v0.h[5], v0.h[0]
-; CHECK-GI-NEXT:    mov v0.h[6], v0.h[0]
-; CHECK-GI-NEXT:    mov v0.h[7], v0.h[0]
 ; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -2266,8 +2239,6 @@ define <4 x i32> @test_concat_v4i32_v2i32_v4i32(<2 x i32> %x, <4 x i32> %y) #0 {
 ; CHECK-GI-NEXT:    mov s2, v0.s[1]
 ; CHECK-GI-NEXT:    mov v0.s[1], v2.s[0]
 ; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI135_0]
-; CHECK-GI-NEXT:    mov v0.s[2], v0.s[0]
-; CHECK-GI-NEXT:    mov v0.s[3], v0.s[0]
 ; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
 ; CHECK-GI-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/bitcast.ll b/llvm/test/CodeGen/AArch64/bitcast.ll
index a5551285f2788..bccfdb93d786f 100644
--- a/llvm/test/CodeGen/AArch64/bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/bitcast.ll
@@ -21,7 +21,6 @@ define <4 x i16> @foo1(<2 x i32> %a) {
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    mov w8, #58712 // =0xe558
 ; CHECK-GI-NEXT:    fmov s1, w8
-; CHECK-GI-NEXT:    mov v1.s[1], w8
 ; CHECK-GI-NEXT:    zip1 v0.2s, v1.2s, v0.2s
 ; CHECK-GI-NEXT:    rev32 v0.4h, v0.4h
 ; CHECK-GI-NEXT:    ret
@@ -42,7 +41,6 @@ define <4 x i16> @foo2(<2 x i32> %a) {
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    mov w8, #712 // =0x2c8
 ; CHECK-GI-NEXT:    fmov s1, w8
-; CHECK-GI-NEXT:    mov v1.s[1], w8
 ; CHECK-GI-NEXT:    zip1 v0.2s, v1.2s, v0.2s
 ; CHECK-GI-NEXT:    rev32 v0.4h, v0.4h
 ; CHECK-GI-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/bswap.ll b/llvm/test/CodeGen/AArch64/bswap.ll
index 9b065accce914..f4221accfcbc5 100644
--- a/llvm/test/CodeGen/AArch64/bswap.ll
+++ b/llvm/test/CodeGen/AArch64/bswap.ll
@@ -137,7 +137,6 @@ define <1 x i32> @bswap_v1i32(<1 x i32> %a){
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    rev w8, w8
 ; CHECK-GI-NEXT:    fmov s0, w8
-; CHECK-GI-NEXT:    mov v0.s[1], w8
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/fabs.ll b/llvm/test/CodeGen/AArch64/fabs.ll
index 7c13b49246d23..de108b0bc2b7a 100644
--- a/llvm/test/CodeGen/AArch64/fabs.ll
+++ b/llvm/test/CodeGen/AArch64/fabs.ll
@@ -160,21 +160,20 @@ define <7 x half> @fabs_v7f16(<7 x half> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: fabs_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    fabs v2.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    fabs v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v4.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
 ; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    fabs v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    fabs v1.4s, v2.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v4.h[0]
@@ -183,7 +182,6 @@ define <7 x half> @fabs_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[7], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: fabs_v7f16:
diff --git a/llvm/test/CodeGen/AArch64/faddsub.ll b/llvm/test/CodeGen/AArch64/faddsub.ll
index f8970dc9e8d5d..6913a62fb266c 100644
--- a/llvm/test/CodeGen/AArch64/faddsub.ll
+++ b/llvm/test/CodeGen/AArch64/faddsub.ll
@@ -186,26 +186,24 @@ define <7 x half> @fadd_v7f16(<7 x half> %a, <7 x half> %b) {
 ;
 ; CHECK-GI-NOFP16-LABEL: fadd_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h4, v1.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h5, v1.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v6.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v7.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov h6, v1.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h7, v1.h[5]
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    fadd v2.4s, v2.4s, v3.4s
+; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[6]
 ; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v5.h[0]
-; CHECK-GI-NOFP16-NEXT:    fadd v3.4s, v6.4s, v7.4s
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[3], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v6.h[1], v7.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v6.h[2], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v6.4h
 ; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fadd v1.4s, v2.4s, v3.4s
@@ -217,7 +215,6 @@ define <7 x half> @fadd_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[7], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: fadd_v7f16:
@@ -538,26 +535,24 @@ define <7 x half> @fsub_v7f16(<7 x half> %a, <7 x half> %b) {
 ;
 ; CHECK-GI-NOFP16-LABEL: fsub_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h4, v1.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h5, v1.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v6.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v7.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov h6, v1.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h7, v1.h[5]
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    fsub v2.4s, v2.4s, v3.4s
+; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[6]
 ; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v5.h[0]
-; CHECK-GI-NOFP16-NEXT:    fsub v3.4s, v6.4s, v7.4s
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[3], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v6.h[1], v7.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v6.h[2], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v6.4h
 ; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fsub v1.4s, v2.4s, v3.4s
@@ -569,7 +564,6 @@ define <7 x half> @fsub_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[7], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: fsub_v7f16:
diff --git a/llvm/test/CodeGen/AArch64/fcmp.ll b/llvm/test/CodeGen/AArch64/fcmp.ll
index 0f02784aaf32a..2d0b5574cdd7b 100644
--- a/llvm/test/CodeGen/AArch64/fcmp.ll
+++ b/llvm/test/CodeGen/AArch64/fcmp.ll
@@ -262,31 +262,28 @@ define <3 x i32> @v3f64_i32(<3 x double> %a, <3 x double> %b, <3 x i32> %d, <3 x
 ;
 ; CHECK-GI-LABEL: v3f64_i32:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    mov w8, #31 // =0x1f
-; CHECK-GI-NEXT:    fcmp d2, d5
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-GI-NEXT:    // kill: def $d3 killed $d3 def $q3
+; CHECK-GI-NEXT:    mov w8, #31 // =0x1f
 ; CHECK-GI-NEXT:    // kill: def $d4 killed $d4 def $q4
-; CHECK-GI-NEXT:    fmov s16, w8
+; CHECK-GI-NEXT:    fcmp d2, d5
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-GI-NEXT:    mov v3.d[1], v4.d[0]
+; CHECK-GI-NEXT:    fmov s1, w8
 ; CHECK-GI-NEXT:    cset w9, mi
-; CHECK-GI-NEXT:    mov v16.s[1], w8
-; CHECK-GI-NEXT:    fmov d1, x9
+; CHECK-GI-NEXT:    mov v1.s[1], w8
+; CHECK-GI-NEXT:    fmov d2, x9
 ; CHECK-GI-NEXT:    fcmgt v0.2d, v3.2d, v0.2d
-; CHECK-GI-NEXT:    mov v1.d[1], x8
-; CHECK-GI-NEXT:    mov v16.s[2], w8
+; CHECK-GI-NEXT:    mov v1.s[2], w8
 ; CHECK-GI-NEXT:    mov w8, #-1 // =0xffffffff
+; CHECK-GI-NEXT:    uzp1 v0.4s, v0.4s, v2.4s
 ; CHECK-GI-NEXT:    fmov s2, w8
-; CHECK-GI-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
 ; CHECK-GI-NEXT:    mov v2.s[1], w8
-; CHECK-GI-NEXT:    mov v16.s[3], w8
+; CHECK-GI-NEXT:    neg v3.4s, v1.4s
+; CHECK-GI-NEXT:    ushl v0.4s, v0.4s, v1.4s
 ; CHECK-GI-NEXT:    mov v2.s[2], w8
-; CHECK-GI-NEXT:    neg v1.4s, v16.4s
-; CHECK-GI-NEXT:    ushl v0.4s, v0.4s, v16.4s
-; CHECK-GI-NEXT:    mov v2.s[3], w8
-; CHECK-GI-NEXT:    sshl v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    sshl v0.4s, v0.4s, v3.4s
 ; CHECK-GI-NEXT:    eor v1.16b, v0.16b, v2.16b
 ; CHECK-GI-NEXT:    and v0.16b, v6.16b, v0.16b
 ; CHECK-GI-NEXT:    and v1.16b, v7.16b, v1.16b
@@ -349,15 +346,13 @@ define <3 x float> @v3f32_float(<3 x float> %a, <3 x float> %b, <3 x float> %d,
 ; CHECK-GI-NEXT:    mov v4.s[1], w8
 ; CHECK-GI-NEXT:    mov v4.s[2], w8
 ; CHECK-GI-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-GI-NEXT:    fmov s5, w8
-; CHECK-GI-NEXT:    mov v5.s[1], w8
-; CHECK-GI-NEXT:    mov v4.s[3], w8
-; CHECK-GI-NEXT:    mov v5.s[2], w8
-; CHECK-GI-NEXT:    neg v1.4s, v4.4s
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    mov v1.s[1], w8
+; CHECK-GI-NEXT:    neg v5.4s, v4.4s
 ; CHECK-GI-NEXT:    ushl v0.4s, v0.4s, v4.4s
-; CHECK-GI-NEXT:    mov v5.s[3], w8
-; CHECK-GI-NEXT:    sshl v0.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT:    eor v1.16b, v0.16b, v5.16b
+; CHECK-GI-NEXT:    mov v1.s[2], w8
+; CHECK-GI-NEXT:    sshl v0.4s, v0.4s, v5.4s
+; CHECK-GI-NEXT:    eor v1.16b, v0.16b, v1.16b
 ; CHECK-GI-NEXT:    and v0.16b, v2.16b, v0.16b
 ; CHECK-GI-NEXT:    and v1.16b, v3.16b, v1.16b
 ; CHECK-GI-NEXT:    orr v0.16b, v0.16b, v1.16b
@@ -429,15 +424,13 @@ define <3 x i32> @v3f32_i32(<3 x float> %a, <3 x float> %b, <3 x i32> %d, <3 x i
 ; CHECK-GI-NEXT:    mov v4.s[1], w8
 ; CHECK-GI-NEXT:    mov v4.s[2], w8
 ; CHECK-GI-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-GI-NEXT:    fmov s5, w8
-; CHECK-GI-NEXT:    mov v5.s[1], w8
-; CHECK-GI-NEXT:    mov v4.s[3], w8
-; CHECK-GI-NEXT:    mov v5.s[2], w8
-; CHECK-GI-NEXT:    neg v1.4s, v4.4s
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    mov v1.s[1], w8
+; CHECK-GI-NEXT:    neg v5.4s, v4.4s
 ; CHECK-GI-NEXT:    ushl v0.4s, v0.4s, v4.4s
-; CHECK-GI-NEXT:    mov v5.s[3], w8
-; CHECK-GI-NEXT:    sshl v0.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT:    eor v1.16b, v0.16b, v5.16b
+; CHECK-GI-NEXT:    mov v1.s[2], w8
+; CHECK-GI-NEXT:    sshl v0.4s, v0.4s, v5.4s
+; CHECK-GI-NEXT:    eor v1.16b, v0.16b, v1.16b
 ; CHECK-GI-NEXT:    and v0.16b, v2.16b, v0.16b
 ; CHECK-GI-NEXT:    and v1.16b, v3.16b, v1.16b
 ; CHECK-GI-NEXT:    orr v0.16b, v0.16b, v1.16b
@@ -554,44 +547,40 @@ define <7 x half> @v7f16_half(<7 x half> %a, <7 x half> %b, <7 x half> %d, <7 x
 ; CHECK-GI-NOFP16-NEXT:    mov w8, #15 // =0xf
 ; CHECK-GI-NOFP16-NEXT:    mov h6, v0.h[4]
 ; CHECK-GI-NOFP16-NEXT:    mov h7, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    fmov s5, w8
+; CHECK-GI-NOFP16-NEXT:    fmov s4, w8
 ; CHECK-GI-NOFP16-NEXT:    mov h16, v1.h[4]
 ; CHECK-GI-NOFP16-NEXT:    mov w8, #65535 // =0xffff
 ; CHECK-GI-NOFP16-NEXT:    mov h17, v1.h[5]
 ; CHECK-GI-NOFP16-NEXT:    mov h18, v0.h[6]
 ; CHECK-GI-NOFP16-NEXT:    mov h19, v1.h[6]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    mov v4.16b, v5.16b
+; CHECK-GI-NOFP16-NEXT:    mov v5.16b, v4.16b
 ; CHECK-GI-NOFP16-NEXT:    mov v6.h[1], v7.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fmov s7, w8
 ; CHECK-GI-NOFP16-NEXT:    mov v16.h[1], v17.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v5.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[1], v4.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v17.16b, v7.16b
+; CHECK-GI-NOFP16-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v6.h[2], v18.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v17.h[1], v7.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v16.h[2], v19.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v5.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v6.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v17.h[2], v7.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v16.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[3], v5.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[2], v4.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v6.4s, v6.4h
-; CHECK-GI-NOFP16-NEXT:    mov v17.h[3], v7.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v17.h[2], v7.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v16.4s, v16.4h
-; CHECK-GI-NOFP16-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[4], v5.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v17.h[4], v7.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[3], v4.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v17.h[3], v7.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcmgt v1.4s, v16.4s, v6.4s
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[5], v5.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[4], v4.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v17.h[4], v7.h[0]
+; CHECK-GI-NOFP16-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[5], v4.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v17.h[5], v7.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[6], v5.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[6], v4.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v17.h[6], v7.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[7], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
-; CHECK-GI-NOFP16-NEXT:    neg v1.8h, v4.8h
-; CHECK-GI-NOFP16-NEXT:    ushl v0.8h, v0.8h, v4.8h
-; CHECK-GI-NOFP16-NEXT:    mov v17.h[7], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    neg v1.8h, v5.8h
+; CHECK-GI-NOFP16-NEXT:    ushl v0.8h, v0.8h, v5.8h
 ; CHECK-GI-NOFP16-NEXT:    sshl v0.8h, v0.8h, v1.8h
 ; CHECK-GI-NOFP16-NEXT:    eor v1.16b, v0.16b, v17.16b
 ; CHECK-GI-NOFP16-NEXT:    and v0.16b, v2.16b, v0.16b
@@ -602,6 +591,7 @@ define <7 x half> @v7f16_half(<7 x half> %a, <7 x half> %b, <7 x half> %d, <7 x
 ; CHECK-GI-FP16-LABEL: v7f16_half:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
 ; CHECK-GI-FP16-NEXT:    mov w8, #15 // =0xf
+; CHECK-GI-FP16-NEXT:    fcmgt v0.8h, v1.8h, v0.8h
 ; CHECK-GI-FP16-NEXT:    fmov s4, w8
 ; CHECK-GI-FP16-NEXT:    mov w8, #65535 // =0xffff
 ; CHECK-GI-FP16-NEXT:    fmov s6, w8
@@ -619,11 +609,8 @@ define <7 x half> @v7f16_half(<7 x half> %a, <7 x half> %b, <7 x half> %d, <7 x
 ; CHECK-GI-FP16-NEXT:    mov v7.h[5], v6.h[0]
 ; CHECK-GI-FP16-NEXT:    mov v5.h[6], v4.h[0]
 ; CHECK-GI-FP16-NEXT:    mov v7.h[6], v6.h[0]
-; CHECK-GI-FP16-NEXT:    mov v5.h[7], v0.h[0]
-; CHECK-GI-FP16-NEXT:    fcmgt v0.8h, v1.8h, v0.8h
 ; CHECK-GI-FP16-NEXT:    neg v1.8h, v5.8h
 ; CHECK-GI-FP16-NEXT:    ushl v0.8h, v0.8h, v5.8h
-; CHECK-GI-FP16-NEXT:    mov v7.h[7], v0.h[0]
 ; CHECK-GI-FP16-NEXT:    sshl v0.8h, v0.8h, v1.8h
 ; CHECK-GI-FP16-NEXT:    eor v1.16b, v0.16b, v7.16b
 ; CHECK-GI-FP16-NEXT:    and v0.16b, v2.16b, v0.16b
@@ -1054,69 +1041,63 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32
 ; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[4]
 ; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[5]
 ; CHECK-GI-NOFP16-NEXT:    mov w8, #31 // =0x1f
-; CHECK-GI-NOFP16-NEXT:    mov h5, v1.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h4, v1.h[5]
-; CHECK-GI-NOFP16-NEXT:    ldr s16, [sp, #32]
+; CHECK-GI-NOFP16-NEXT:    mov h4, v1.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h5, v1.h[5]
+; CHECK-GI-NOFP16-NEXT:    ldr s17, [sp, #32]
 ; CHECK-GI-NOFP16-NEXT:    mov h6, v0.h[6]
 ; CHECK-GI-NOFP16-NEXT:    mov h7, v1.h[6]
-; CHECK-GI-NOFP16-NEXT:    ldr s18, [sp, #40]
-; CHECK-GI-NOFP16-NEXT:    fmov s17, w4
+; CHECK-GI-NOFP16-NEXT:    fmov s16, w0
+; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[1], v4.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmov s4, w8
-; CHECK-GI-NOFP16-NEXT:    mov v17.s[1], w5
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v6.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v4.s[1], w8
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[2], v7.h[0]
-; CHECK-GI-NOFP16-NEXT:    ldr s7, [sp]
-; CHECK-GI-NOFP16-NEXT:    mov v17.s[2], w6
-; CHECK-GI-NOFP16-NEXT:    fmov w9, s7
-; CHECK-GI-NOFP16-NEXT:    fmov s7, w7
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v4.s[2], w8
-; CHECK-GI-NOFP16-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[3], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fmov s3, w8
-; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    mov v7.s[1], w9
-; CHECK-GI-NOFP16-NEXT:    fcvtl v6.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v5.h[0]
+; CHECK-GI-NOFP16-NEXT:    ldr s5, [sp]
+; CHECK-GI-NOFP16-NEXT:    mov v16.s[1], w1
 ; CHECK-GI-NOFP16-NEXT:    mov v3.s[1], w8
-; CHECK-GI-NOFP16-NEXT:    ldr s2, [sp, #24]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v5.4s, v5.4h
-; CHECK-GI-NOFP16-NEXT:    mov v4.s[3], w8
+; CHECK-GI-NOFP16-NEXT:    fmov w9, s5
+; CHECK-GI-NOFP16-NEXT:    fmov s5, w7
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v6.h[0]
+; CHECK-GI-NOFP16-NEXT:    ldr s6, [sp, #8]
 ; CHECK-GI-NOFP16-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
-; CHECK-GI-NOFP16-NEXT:    mov v2.s[1], v16.s[0]
-; CHECK-GI-NOFP16-NEXT:    ldr s16, [sp, #8]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v7.h[0]
+; CHECK-GI-NOFP16-NEXT:    ldr s7, [sp, #24]
+; CHECK-GI-NOFP16-NEXT:    mov v16.s[2], w2
+; CHECK-GI-NOFP16-NEXT:    mov v5.s[1], w9
+; CHECK-GI-NOFP16-NEXT:    fmov w9, s6
+; CHECK-GI-NOFP16-NEXT:    ldr s6, [sp, #16]
 ; CHECK-GI-NOFP16-NEXT:    mov v3.s[2], w8
-; CHECK-GI-NOFP16-NEXT:    fmov w8, s16
-; CHECK-GI-NOFP16-NEXT:    fcmgt v5.4s, v5.4s, v6.4s
-; CHECK-GI-NOFP16-NEXT:    fmov s6, w0
-; CHECK-GI-NOFP16-NEXT:    neg v19.4s, v4.4s
-; CHECK-GI-NOFP16-NEXT:    mov v2.s[2], v18.s[0]
-; CHECK-GI-NOFP16-NEXT:    mov v7.s[2], w8
-; CHECK-GI-NOFP16-NEXT:    mov v17.s[3], w8
-; CHECK-GI-NOFP16-NEXT:    mov v6.s[1], w1
-; CHECK-GI-NOFP16-NEXT:    mov v3.s[3], w8
-; CHECK-GI-NOFP16-NEXT:    ushl v4.4s, v5.4s, v4.4s
-; CHECK-GI-NOFP16-NEXT:    ldr s5, [sp, #16]
-; CHECK-GI-NOFP16-NEXT:    mov v2.s[3], v0.s[0]
-; CHECK-GI-NOFP16-NEXT:    fmov w8, s5
-; CHECK-GI-NOFP16-NEXT:    mov v6.s[2], w2
-; CHECK-GI-NOFP16-NEXT:    sshl v4.4s, v4.4s, v19.4s
-; CHECK-GI-NOFP16-NEXT:    mov v7.s[3], w8
-; CHECK-GI-NOFP16-NEXT:    eor v1.16b, v4.16b, v3.16b
-; CHECK-GI-NOFP16-NEXT:    and v3.16b, v17.16b, v4.16b
-; CHECK-GI-NOFP16-NEXT:    mov v6.s[3], w3
-; CHECK-GI-NOFP16-NEXT:    and v1.16b, v2.16b, v1.16b
-; CHECK-GI-NOFP16-NEXT:    bsl v0.16b, v6.16b, v7.16b
-; CHECK-GI-NOFP16-NEXT:    orr v1.16b, v3.16b, v1.16b
+; CHECK-GI-NOFP16-NEXT:    mov w8, #-1 // =0xffffffff
+; CHECK-GI-NOFP16-NEXT:    mov v7.s[1], v17.s[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    ldr s17, [sp, #40]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v4.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT:    mov v16.s[3], w3
+; CHECK-GI-NOFP16-NEXT:    mov v5.s[2], w9
+; CHECK-GI-NOFP16-NEXT:    neg v18.4s, v3.4s
+; CHECK-GI-NOFP16-NEXT:    mov v7.s[2], v17.s[0]
+; CHECK-GI-NOFP16-NEXT:    fcmgt v2.4s, v4.4s, v2.4s
+; CHECK-GI-NOFP16-NEXT:    fmov s4, w8
+; CHECK-GI-NOFP16-NEXT:    mov v4.s[1], w8
+; CHECK-GI-NOFP16-NEXT:    ushl v2.4s, v2.4s, v3.4s
+; CHECK-GI-NOFP16-NEXT:    fmov s3, w4
+; CHECK-GI-NOFP16-NEXT:    mov v3.s[1], w5
+; CHECK-GI-NOFP16-NEXT:    mov v4.s[2], w8
+; CHECK-GI-NOFP16-NEXT:    sshl v2.4s, v2.4s, v18.4s
+; CHECK-GI-NOFP16-NEXT:    fmov w8, s6
+; CHECK-GI-NOFP16-NEXT:    mov v3.s[2], w6
+; CHECK-GI-NOFP16-NEXT:    eor v1.16b, v2.16b, v4.16b
+; CHECK-GI-NOFP16-NEXT:    mov v5.s[3], w8
+; CHECK-GI-NOFP16-NEXT:    and v1.16b, v7.16b, v1.16b
+; CHECK-GI-NOFP16-NEXT:    and v2.16b, v3.16b, v2.16b
+; CHECK-GI-NOFP16-NEXT:    bsl v0.16b, v16.16b, v5.16b
+; CHECK-GI-NOFP16-NEXT:    orr v1.16b, v2.16b, v1.16b
 ; CHECK-GI-NOFP16-NEXT:    mov s2, v0.s[1]
 ; CHECK-GI-NOFP16-NEXT:    mov s3, v0.s[2]
 ; CHECK-GI-NOFP16-NEXT:    mov s4, v0.s[3]
+; CHECK-GI-NOFP16-NEXT:    fmov w0, s0
 ; CHECK-GI-NOFP16-NEXT:    mov s5, v1.s[1]
 ; CHECK-GI-NOFP16-NEXT:    mov s6, v1.s[2]
-; CHECK-GI-NOFP16-NEXT:    fmov w0, s0
 ; CHECK-GI-NOFP16-NEXT:    fmov w4, s1
 ; CHECK-GI-NOFP16-NEXT:    fmov w1, s2
 ; CHECK-GI-NOFP16-NEXT:    fmov w2, s3
@@ -1127,65 +1108,60 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32
 ;
 ; CHECK-GI-FP16-LABEL: v7f16_i32:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    fcmgt v5.8h, v1.8h, v0.8h
-; CHECK-GI-FP16-NEXT:    mov w10, #31 // =0x1f
-; CHECK-GI-FP16-NEXT:    ldr s6, [sp]
-; CHECK-GI-FP16-NEXT:    fmov s2, w10
-; CHECK-GI-FP16-NEXT:    ldr s1, [sp, #24]
-; CHECK-GI-FP16-NEXT:    ldr s7, [sp, #32]
-; CHECK-GI-FP16-NEXT:    fmov s16, w0
-; CHECK-GI-FP16-NEXT:    ldr s17, [sp, #40]
-; CHECK-GI-FP16-NEXT:    mov v1.s[1], v7.s[0]
-; CHECK-GI-FP16-NEXT:    ldr s7, [sp, #8]
-; CHECK-GI-FP16-NEXT:    umov w8, v5.h[4]
-; CHECK-GI-FP16-NEXT:    umov w9, v5.h[5]
-; CHECK-GI-FP16-NEXT:    umov w11, v5.h[0]
-; CHECK-GI-FP16-NEXT:    umov w12, v5.h[1]
-; CHECK-GI-FP16-NEXT:    mov v2.s[1], w10
-; CHECK-GI-FP16-NEXT:    mov v16.s[1], w1
-; CHECK-GI-FP16-NEXT:    mov v1.s[2], v17.s[0]
-; CHECK-GI-FP16-NEXT:    fmov s3, w8
-; CHECK-GI-FP16-NEXT:    umov w8, v5.h[6]
-; CHECK-GI-FP16-NEXT:    fmov s0, w11
-; CHECK-GI-FP16-NEXT:    mov v2.s[2], w10
-; CHECK-GI-FP16-NEXT:    umov w10, v5.h[3]
-; CHECK-GI-FP16-NEXT:    mov v16.s[2], w2
-; CHECK-GI-FP16-NEXT:    mov v3.s[1], w9
-; CHECK-GI-FP16-NEXT:    umov w9, v5.h[2]
-; CHECK-GI-FP16-NEXT:    mov v0.s[1], w12
-; CHECK-GI-FP16-NEXT:    fmov s5, w4
-; CHECK-GI-FP16-NEXT:    mov v16.s[3], w3
+; CHECK-GI-FP16-NEXT:    fcmgt v1.8h, v1.8h, v0.8h
+; CHECK-GI-FP16-NEXT:    mov w12, #31 // =0x1f
+; CHECK-GI-FP16-NEXT:    ldr s4, [sp]
+; CHECK-GI-FP16-NEXT:    fmov s2, w12
+; CHECK-GI-FP16-NEXT:    fmov s6, w0
+; CHECK-GI-FP16-NEXT:    ldr s5, [sp, #8]
+; CHECK-GI-FP16-NEXT:    ldr s7, [sp, #24]
+; CHECK-GI-FP16-NEXT:    ldr s16, [sp, #32]
+; CHECK-GI-FP16-NEXT:    umov w9, v1.h[4]
+; CHECK-GI-FP16-NEXT:    umov w8, v1.h[0]
+; CHECK-GI-FP16-NEXT:    umov w11, v1.h[5]
+; CHECK-GI-FP16-NEXT:    umov w10, v1.h[1]
+; CHECK-GI-FP16-NEXT:    mov v2.s[1], w12
+; CHECK-GI-FP16-NEXT:    umov w13, v1.h[2]
+; CHECK-GI-FP16-NEXT:    mov v6.s[1], w1
+; CHECK-GI-FP16-NEXT:    mov v7.s[1], v16.s[0]
+; CHECK-GI-FP16-NEXT:    ldr s16, [sp, #40]
+; CHECK-GI-FP16-NEXT:    fmov s3, w9
+; CHECK-GI-FP16-NEXT:    fmov s0, w8
+; CHECK-GI-FP16-NEXT:    umov w8, v1.h[6]
+; CHECK-GI-FP16-NEXT:    mov v2.s[2], w12
+; CHECK-GI-FP16-NEXT:    umov w9, v1.h[3]
+; CHECK-GI-FP16-NEXT:    mov v6.s[2], w2
+; CHECK-GI-FP16-NEXT:    mov v7.s[2], v16.s[0]
+; CHECK-GI-FP16-NEXT:    mov v3.s[1], w11
+; CHECK-GI-FP16-NEXT:    mov v0.s[1], w10
+; CHECK-GI-FP16-NEXT:    mov w10, #-1 // =0xffffffff
+; CHECK-GI-FP16-NEXT:    fmov s1, w10
+; CHECK-GI-FP16-NEXT:    neg v17.4s, v2.4s
+; CHECK-GI-FP16-NEXT:    mov v6.s[3], w3
 ; CHECK-GI-FP16-NEXT:    mov v3.s[2], w8
-; CHECK-GI-FP16-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-GI-FP16-NEXT:    mov v0.s[2], w9
-; CHECK-GI-FP16-NEXT:    fmov s4, w8
-; CHECK-GI-FP16-NEXT:    mov v2.s[3], w8
-; CHECK-GI-FP16-NEXT:    mov v5.s[1], w5
-; CHECK-GI-FP16-NEXT:    fmov w9, s6
-; CHECK-GI-FP16-NEXT:    fmov s6, w7
+; CHECK-GI-FP16-NEXT:    fmov w8, s4
+; CHECK-GI-FP16-NEXT:    fmov s4, w7
+; CHECK-GI-FP16-NEXT:    mov v0.s[2], w13
+; CHECK-GI-FP16-NEXT:    mov v1.s[1], w10
 ; CHECK-GI-FP16-NEXT:    mov v4.s[1], w8
-; CHECK-GI-FP16-NEXT:    mov v3.s[3], w8
-; CHECK-GI-FP16-NEXT:    mov v0.s[3], w10
-; CHECK-GI-FP16-NEXT:    mov v6.s[1], w9
-; CHECK-GI-FP16-NEXT:    neg v18.4s, v2.4s
-; CHECK-GI-FP16-NEXT:    mov v5.s[2], w6
-; CHECK-GI-FP16-NEXT:    mov v4.s[2], w8
-; CHECK-GI-FP16-NEXT:    fmov w8, s7
+; CHECK-GI-FP16-NEXT:    fmov w8, s5
+; CHECK-GI-FP16-NEXT:    ldr s5, [sp, #16]
 ; CHECK-GI-FP16-NEXT:    ushl v2.4s, v3.4s, v2.4s
-; CHECK-GI-FP16-NEXT:    ldr s3, [sp, #16]
+; CHECK-GI-FP16-NEXT:    fmov s3, w4
+; CHECK-GI-FP16-NEXT:    mov v0.s[3], w9
+; CHECK-GI-FP16-NEXT:    mov v1.s[2], w10
+; CHECK-GI-FP16-NEXT:    mov v3.s[1], w5
+; CHECK-GI-FP16-NEXT:    mov v4.s[2], w8
+; CHECK-GI-FP16-NEXT:    sshl v2.4s, v2.4s, v17.4s
+; CHECK-GI-FP16-NEXT:    fmov w8, s5
 ; CHECK-GI-FP16-NEXT:    shl v0.4s, v0.4s, #31
-; CHECK-GI-FP16-NEXT:    mov v6.s[2], w8
-; CHECK-GI-FP16-NEXT:    sshl v2.4s, v2.4s, v18.4s
-; CHECK-GI-FP16-NEXT:    mov v5.s[3], w8
+; CHECK-GI-FP16-NEXT:    eor v1.16b, v2.16b, v1.16b
+; CHECK-GI-FP16-NEXT:    mov v3.s[2], w6
 ; CHECK-GI-FP16-NEXT:    mov v4.s[3], w8
-; CHECK-GI-FP16-NEXT:    fmov w8, s3
-; CHECK-GI-FP16-NEXT:    mov v1.s[3], v0.s[0]
 ; CHECK-GI-FP16-NEXT:    sshr v0.4s, v0.4s, #31
-; CHECK-GI-FP16-NEXT:    mov v6.s[3], w8
-; CHECK-GI-FP16-NEXT:    eor v3.16b, v2.16b, v4.16b
-; CHECK-GI-FP16-NEXT:    and v2.16b, v5.16b, v2.16b
-; CHECK-GI-FP16-NEXT:    and v1.16b, v1.16b, v3.16b
-; CHECK-GI-FP16-NEXT:    bsl v0.16b, v16.16b, v6.16b
+; CHECK-GI-FP16-NEXT:    and v1.16b, v7.16b, v1.16b
+; CHECK-GI-FP16-NEXT:    and v2.16b, v3.16b, v2.16b
+; CHECK-GI-FP16-NEXT:    bsl v0.16b, v6.16b, v4.16b
 ; CHECK-GI-FP16-NEXT:    orr v1.16b, v2.16b, v1.16b
 ; CHECK-GI-FP16-NEXT:    mov s2, v0.s[1]
 ; CHECK-GI-FP16-NEXT:    mov s3, v0.s[2]
diff --git a/llvm/test/CodeGen/AArch64/fcopysign.ll b/llvm/test/CodeGen/AArch64/fcopysign.ll
index 78fd38ca9f268..84376107679d8 100644
--- a/llvm/test/CodeGen/AArch64/fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/fcopysign.ll
@@ -162,8 +162,6 @@ define <3 x float> @copysign_v3f32(<3 x float> %a, <3 x float> %b) {
 ; CHECK-GI-NEXT:    mov v3.s[1], w8
 ; CHECK-GI-NEXT:    mov v2.s[2], w9
 ; CHECK-GI-NEXT:    mov v3.s[2], w8
-; CHECK-GI-NEXT:    mov v2.s[3], w8
-; CHECK-GI-NEXT:    mov v3.s[3], w8
 ; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-GI-NEXT:    and v1.16b, v1.16b, v3.16b
 ; CHECK-GI-NEXT:    orr v0.16b, v0.16b, v1.16b
@@ -223,8 +221,6 @@ define <7 x half> @copysign_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-GI-NEXT:    mov v5.h[5], v3.h[0]
 ; CHECK-GI-NEXT:    mov v4.h[6], v2.h[0]
 ; CHECK-GI-NEXT:    mov v5.h[6], v3.h[0]
-; CHECK-GI-NEXT:    mov v4.h[7], v0.h[0]
-; CHECK-GI-NEXT:    mov v5.h[7], v0.h[0]
 ; CHECK-GI-NEXT:    and v0.16b, v0.16b, v4.16b
 ; CHECK-GI-NEXT:    and v1.16b, v1.16b, v5.16b
 ; CHECK-GI-NEXT:    orr v0.16b, v0.16b, v1.16b
diff --git a/llvm/test/CodeGen/AArch64/fcvt.ll b/llvm/test/CodeGen/AArch64/fcvt.ll
index 3b8a22a052b83..1c761ea083028 100644
--- a/llvm/test/CodeGen/AArch64/fcvt.ll
+++ b/llvm/test/CodeGen/AArch64/fcvt.ll
@@ -163,21 +163,20 @@ define <7 x half> @ceil_v7f16(<7 x half> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: ceil_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    frintp v2.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    frintp v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v4.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
 ; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    frintp v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    frintp v1.4s, v2.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v4.h[0]
@@ -186,7 +185,6 @@ define <7 x half> @ceil_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[7], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: ceil_v7f16:
@@ -470,21 +468,20 @@ define <7 x half> @floor_v7f16(<7 x half> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: floor_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    frintm v2.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    frintm v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v4.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
 ; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    frintm v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    frintm v1.4s, v2.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v4.h[0]
@@ -493,7 +490,6 @@ define <7 x half> @floor_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[7], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: floor_v7f16:
@@ -777,21 +773,20 @@ define <7 x half> @nearbyint_v7f16(<7 x half> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: nearbyint_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    frinti v2.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    frinti v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v4.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
 ; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    frinti v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    frinti v1.4s, v2.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v4.h[0]
@@ -800,7 +795,6 @@ define <7 x half> @nearbyint_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[7], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: nearbyint_v7f16:
@@ -1084,21 +1078,20 @@ define <7 x half> @roundeven_v7f16(<7 x half> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: roundeven_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    frintn v2.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    frintn v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v4.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
 ; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    frintn v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    frintn v1.4s, v2.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v4.h[0]
@@ -1107,7 +1100,6 @@ define <7 x half> @roundeven_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[7], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: roundeven_v7f16:
@@ -1391,21 +1383,20 @@ define <7 x half> @rint_v7f16(<7 x half> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: rint_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    frintx v2.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    frintx v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v4.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
 ; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    frintx v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    frintx v1.4s, v2.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v4.h[0]
@@ -1414,7 +1405,6 @@ define <7 x half> @rint_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[7], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: rint_v7f16:
@@ -1698,21 +1688,20 @@ define <7 x half> @round_v7f16(<7 x half> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: round_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    frinta v2.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    frinta v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v4.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
 ; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    frinta v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    frinta v1.4s, v2.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v4.h[0]
@@ -1721,7 +1710,6 @@ define <7 x half> @round_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[7], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: round_v7f16:
@@ -2005,21 +1993,20 @@ define <7 x half> @trunc_v7f16(<7 x half> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: trunc_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    frintz v2.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    frintz v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v4.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
 ; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    frintz v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    frintz v1.4s, v2.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v4.h[0]
@@ -2028,7 +2015,6 @@ define <7 x half> @trunc_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[7], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: trunc_v7f16:
diff --git a/llvm/test/CodeGen/AArch64/fdiv.ll b/llvm/test/CodeGen/AArch64/fdiv.ll
index e73124fbb595b..d73a5dc73eefc 100644
--- a/llvm/test/CodeGen/AArch64/fdiv.ll
+++ b/llvm/test/CodeGen/AArch64/fdiv.ll
@@ -186,25 +186,23 @@ define <7 x half> @fdiv_v7f16(<7 x half> %a, <7 x half> %b) {
 ;
 ; CHECK-GI-NOFP16-LABEL: fdiv_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h4, v1.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h5, v1.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v6.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v7.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov h6, v1.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h7, v1.h[5]
 ; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[6]
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    fdiv v2.4s, v2.4s, v3.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v5.h[0]
-; CHECK-GI-NOFP16-NEXT:    fdiv v3.4s, v6.4s, v7.4s
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v4.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v3.4s
-; CHECK-GI-NOFP16-NEXT:    fdiv v1.4s, v1.4s, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v6.h[1], v7.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v6.h[2], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v6.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT:    fdiv v1.4s, v1.4s, v3.4s
 ; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[3]
@@ -217,7 +215,6 @@ define <7 x half> @fdiv_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[7], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: fdiv_v7f16:
diff --git a/llvm/test/CodeGen/AArch64/fexplog.ll b/llvm/test/CodeGen/AArch64/fexplog.ll
index e3c0ced79f07a..519a2978d8604 100644
--- a/llvm/test/CodeGen/AArch64/fexplog.ll
+++ b/llvm/test/CodeGen/AArch64/fexplog.ll
@@ -332,7 +332,6 @@ define <3 x float> @exp_v3f32(<3 x float> %a) {
 ; CHECK-GI-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.s[1], v2.s[0]
 ; CHECK-GI-NEXT:    mov v1.s[2], v0.s[0]
-; CHECK-GI-NEXT:    mov v1.s[3], v0.s[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    add sp, sp, #64
 ; CHECK-GI-NEXT:    ret
@@ -703,7 +702,6 @@ define <7 x half> @exp_v7f16(<7 x half> %a) {
 ; CHECK-GI-NEXT:    mov v1.h[4], v3.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT:    mov v1.h[7], v0.h[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    add sp, sp, #160
 ; CHECK-GI-NEXT:    ret
@@ -1591,7 +1589,6 @@ define <3 x float> @exp2_v3f32(<3 x float> %a) {
 ; CHECK-GI-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.s[1], v2.s[0]
 ; CHECK-GI-NEXT:    mov v1.s[2], v0.s[0]
-; CHECK-GI-NEXT:    mov v1.s[3], v0.s[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    add sp, sp, #64
 ; CHECK-GI-NEXT:    ret
@@ -1962,7 +1959,6 @@ define <7 x half> @exp2_v7f16(<7 x half> %a) {
 ; CHECK-GI-NEXT:    mov v1.h[4], v3.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT:    mov v1.h[7], v0.h[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    add sp, sp, #160
 ; CHECK-GI-NEXT:    ret
@@ -2850,7 +2846,6 @@ define <3 x float> @log_v3f32(<3 x float> %a) {
 ; CHECK-GI-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.s[1], v2.s[0]
 ; CHECK-GI-NEXT:    mov v1.s[2], v0.s[0]
-; CHECK-GI-NEXT:    mov v1.s[3], v0.s[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    add sp, sp, #64
 ; CHECK-GI-NEXT:    ret
@@ -3221,7 +3216,6 @@ define <7 x half> @log_v7f16(<7 x half> %a) {
 ; CHECK-GI-NEXT:    mov v1.h[4], v3.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT:    mov v1.h[7], v0.h[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    add sp, sp, #160
 ; CHECK-GI-NEXT:    ret
@@ -4109,7 +4103,6 @@ define <3 x float> @log2_v3f32(<3 x float> %a) {
 ; CHECK-GI-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.s[1], v2.s[0]
 ; CHECK-GI-NEXT:    mov v1.s[2], v0.s[0]
-; CHECK-GI-NEXT:    mov v1.s[3], v0.s[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    add sp, sp, #64
 ; CHECK-GI-NEXT:    ret
@@ -4480,7 +4473,6 @@ define <7 x half> @log2_v7f16(<7 x half> %a) {
 ; CHECK-GI-NEXT:    mov v1.h[4], v3.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT:    mov v1.h[7], v0.h[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    add sp, sp, #160
 ; CHECK-GI-NEXT:    ret
@@ -5368,7 +5360,6 @@ define <3 x float> @log10_v3f32(<3 x float> %a) {
 ; CHECK-GI-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.s[1], v2.s[0]
 ; CHECK-GI-NEXT:    mov v1.s[2], v0.s[0]
-; CHECK-GI-NEXT:    mov v1.s[3], v0.s[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    add sp, sp, #64
 ; CHECK-GI-NEXT:    ret
@@ -5739,7 +5730,6 @@ define <7 x half> @log10_v7f16(<7 x half> %a) {
 ; CHECK-GI-NEXT:    mov v1.h[4], v3.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT:    mov v1.h[7], v0.h[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    add sp, sp, #160
 ; CHECK-GI-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/fminimummaximum.ll b/llvm/test/CodeGen/AArch64/fminimummaximum.ll
index f0e946c139987..357d91960624b 100644
--- a/llvm/test/CodeGen/AArch64/fminimummaximum.ll
+++ b/llvm/test/CodeGen/AArch64/fminimummaximum.ll
@@ -334,41 +334,39 @@ define <7 x float> @min_v7f32(<7 x float> %a, <7 x float> %b) {
 ;
 ; CHECK-GI-LABEL: min_v7f32:
 ; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ldr s16, [sp]
 ; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-GI-NEXT:    // kill: def $s7 killed $s7 def $q7
 ; CHECK-GI-NEXT:    // kill: def $s1 killed $s1 def $q1
-; CHECK-GI-NEXT:    ldr s16, [sp]
-; CHECK-GI-NEXT:    ldr s17, [sp, #24]
+; CHECK-GI-NEXT:    ldr s17, [sp, #32]
 ; CHECK-GI-NEXT:    // kill: def $s4 killed $s4 def $q4
-; CHECK-GI-NEXT:    // kill: def $s7 killed $s7 def $q7
 ; CHECK-GI-NEXT:    // kill: def $s2 killed $s2 def $q2
 ; CHECK-GI-NEXT:    // kill: def $s5 killed $s5 def $q5
-; CHECK-GI-NEXT:    // kill: def $s6 killed $s6 def $q6
 ; CHECK-GI-NEXT:    // kill: def $s3 killed $s3 def $q3
-; CHECK-GI-NEXT:    ldr s18, [sp, #32]
+; CHECK-GI-NEXT:    // kill: def $s6 killed $s6 def $q6
 ; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT:    mov v4.s[1], v5.s[0]
 ; CHECK-GI-NEXT:    ldr s1, [sp, #8]
+; CHECK-GI-NEXT:    mov v4.s[1], v5.s[0]
 ; CHECK-GI-NEXT:    mov v7.s[1], v16.s[0]
-; CHECK-GI-NEXT:    mov v17.s[1], v18.s[0]
-; CHECK-GI-NEXT:    ldr s5, [sp, #40]
+; CHECK-GI-NEXT:    ldr s16, [sp, #24]
+; CHECK-GI-NEXT:    mov v16.s[1], v17.s[0]
 ; CHECK-GI-NEXT:    mov v0.s[2], v2.s[0]
+; CHECK-GI-NEXT:    ldr s2, [sp, #40]
 ; CHECK-GI-NEXT:    mov v4.s[2], v6.s[0]
 ; CHECK-GI-NEXT:    mov v7.s[2], v1.s[0]
-; CHECK-GI-NEXT:    mov v17.s[2], v5.s[0]
 ; CHECK-GI-NEXT:    ldr s1, [sp, #16]
+; CHECK-GI-NEXT:    mov v16.s[2], v2.s[0]
 ; CHECK-GI-NEXT:    mov v0.s[3], v3.s[0]
 ; CHECK-GI-NEXT:    mov v7.s[3], v1.s[0]
-; CHECK-GI-NEXT:    mov v4.s[3], v0.s[0]
-; CHECK-GI-NEXT:    mov v17.s[3], v0.s[0]
+; CHECK-GI-NEXT:    fmin v4.4s, v4.4s, v16.4s
 ; CHECK-GI-NEXT:    fmin v0.4s, v0.4s, v7.4s
-; CHECK-GI-NEXT:    fmin v4.4s, v4.4s, v17.4s
+; CHECK-GI-NEXT:    mov s5, v4.s[1]
+; CHECK-GI-NEXT:    mov s6, v4.s[2]
+; CHECK-GI-NEXT:    // kill: def $s4 killed $s4 killed $q4
 ; CHECK-GI-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NEXT:    mov s2, v0.s[2]
 ; CHECK-GI-NEXT:    mov s3, v0.s[3]
 ; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-GI-NEXT:    mov s5, v4.s[1]
-; CHECK-GI-NEXT:    mov s6, v4.s[2]
-; CHECK-GI-NEXT:    // kill: def $s4 killed $s4 killed $q4
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = call <7 x float> @llvm.minimum.v7f32(<7 x float> %a, <7 x float> %b)
@@ -415,41 +413,39 @@ define <7 x float> @max_v7f32(<7 x float> %a, <7 x float> %b) {
 ;
 ; CHECK-GI-LABEL: max_v7f32:
 ; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ldr s16, [sp]
 ; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-GI-NEXT:    // kill: def $s7 killed $s7 def $q7
 ; CHECK-GI-NEXT:    // kill: def $s1 killed $s1 def $q1
-; CHECK-GI-NEXT:    ldr s16, [sp]
-; CHECK-GI-NEXT:    ldr s17, [sp, #24]
+; CHECK-GI-NEXT:    ldr s17, [sp, #32]
 ; CHECK-GI-NEXT:    // kill: def $s4 killed $s4 def $q4
-; CHECK-GI-NEXT:    // kill: def $s7 killed $s7 def $q7
 ; CHECK-GI-NEXT:    // kill: def $s2 killed $s2 def $q2
 ; CHECK-GI-NEXT:    // kill: def $s5 killed $s5 def $q5
-; CHECK-GI-NEXT:    // kill: def $s6 killed $s6 def $q6
 ; CHECK-GI-NEXT:    // kill: def $s3 killed $s3 def $q3
-; CHECK-GI-NEXT:    ldr s18, [sp, #32]
+; CHECK-GI-NEXT:    // kill: def $s6 killed $s6 def $q6
 ; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT:    mov v4.s[1], v5.s[0]
 ; CHECK-GI-NEXT:    ldr s1, [sp, #8]
+; CHECK-GI-NEXT:    mov v4.s[1], v5.s[0]
 ; CHECK-GI-NEXT:    mov v7.s[1], v16.s[0]
-; CHECK-GI-NEXT:    mov v17.s[1], v18.s[0]
-; CHECK-GI-NEXT:    ldr s5, [sp, #40]
+; CHECK-GI-NEXT:    ldr s16, [sp, #24]
+; CHECK-GI-NEXT:    mov v16.s[1], v17.s[0]
 ; CHECK-GI-NEXT:    mov v0.s[2], v2.s[0]
+; CHECK-GI-NEXT:    ldr s2, [sp, #40]
 ; CHECK-GI-NEXT:    mov v4.s[2], v6.s[0]
 ; CHECK-GI-NEXT:    mov v7.s[2], v1.s[0]
-; CHECK-GI-NEXT:    mov v17.s[2], v5.s[0]
 ; CHECK-GI-NEXT:    ldr s1, [sp, #16]
+; CHECK-GI-NEXT:    mov v16.s[2], v2.s[0]
 ; CHECK-GI-NEXT:    mov v0.s[3], v3.s[0]
 ; CHECK-GI-NEXT:    mov v7.s[3], v1.s[0]
-; CHECK-GI-NEXT:    mov v4.s[3], v0.s[0]
-; CHECK-GI-NEXT:    mov v17.s[3], v0.s[0]
+; CHECK-GI-NEXT:    fmax v4.4s, v4.4s, v16.4s
 ; CHECK-GI-NEXT:    fmax v0.4s, v0.4s, v7.4s
-; CHECK-GI-NEXT:    fmax v4.4s, v4.4s, v17.4s
+; CHECK-GI-NEXT:    mov s5, v4.s[1]
+; CHECK-GI-NEXT:    mov s6, v4.s[2]
+; CHECK-GI-NEXT:    // kill: def $s4 killed $s4 killed $q4
 ; CHECK-GI-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NEXT:    mov s2, v0.s[2]
 ; CHECK-GI-NEXT:    mov s3, v0.s[3]
 ; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-GI-NEXT:    mov s5, v4.s[1]
-; CHECK-GI-NEXT:    mov s6, v4.s[2]
-; CHECK-GI-NEXT:    // kill: def $s4 killed $s4 killed $q4
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = call <7 x float> @llvm.maximum.v7f32(<7 x float> %a, <7 x float> %b)
@@ -666,26 +662,24 @@ define <7 x half> @min_v7f16(<7 x half> %a, <7 x half> %b) {
 ;
 ; CHECK-NOFP16-GI-LABEL: min_v7f16:
 ; CHECK-NOFP16-GI:       // %bb.0: // %entry
-; CHECK-NOFP16-GI-NEXT:    mov h2, v0.h[4]
-; CHECK-NOFP16-GI-NEXT:    mov h3, v0.h[5]
-; CHECK-NOFP16-GI-NEXT:    mov h4, v1.h[4]
-; CHECK-NOFP16-GI-NEXT:    mov h5, v1.h[5]
-; CHECK-NOFP16-GI-NEXT:    fcvtl v6.4s, v0.4h
-; CHECK-NOFP16-GI-NEXT:    fcvtl v7.4s, v1.4h
-; CHECK-NOFP16-GI-NEXT:    mov h0, v0.h[6]
+; CHECK-NOFP16-GI-NEXT:    fcvtl v2.4s, v0.4h
+; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v1.4h
+; CHECK-NOFP16-GI-NEXT:    mov h4, v0.h[4]
+; CHECK-NOFP16-GI-NEXT:    mov h5, v0.h[5]
+; CHECK-NOFP16-GI-NEXT:    mov h6, v1.h[4]
+; CHECK-NOFP16-GI-NEXT:    mov h7, v1.h[5]
 ; CHECK-NOFP16-GI-NEXT:    mov h1, v1.h[6]
-; CHECK-NOFP16-GI-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-NOFP16-GI-NEXT:    fmin v2.4s, v2.4s, v3.4s
+; CHECK-NOFP16-GI-NEXT:    mov h3, v0.h[6]
 ; CHECK-NOFP16-GI-NEXT:    mov v4.h[1], v5.h[0]
-; CHECK-NOFP16-GI-NEXT:    fmin v3.4s, v6.4s, v7.4s
-; CHECK-NOFP16-GI-NEXT:    mov v2.h[2], v0.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v4.h[2], v1.h[0]
-; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v3.4s
-; CHECK-NOFP16-GI-NEXT:    mov v2.h[3], v0.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v4.h[3], v0.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v6.h[1], v7.h[0]
+; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v2.4s
+; CHECK-NOFP16-GI-NEXT:    mov v4.h[2], v3.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v6.h[2], v1.h[0]
 ; CHECK-NOFP16-GI-NEXT:    mov h1, v0.h[1]
 ; CHECK-NOFP16-GI-NEXT:    mov h5, v0.h[3]
-; CHECK-NOFP16-GI-NEXT:    fcvtl v2.4s, v2.4h
-; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v4.4h
+; CHECK-NOFP16-GI-NEXT:    fcvtl v2.4s, v4.4h
+; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v6.4h
 ; CHECK-NOFP16-GI-NEXT:    mov h4, v0.h[2]
 ; CHECK-NOFP16-GI-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-NOFP16-GI-NEXT:    fmin v1.4s, v2.4s, v3.4s
@@ -697,7 +691,6 @@ define <7 x half> @min_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-NOFP16-GI-NEXT:    mov h1, v1.h[2]
 ; CHECK-NOFP16-GI-NEXT:    mov v0.h[5], v2.h[0]
 ; CHECK-NOFP16-GI-NEXT:    mov v0.h[6], v1.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[7], v0.h[0]
 ; CHECK-NOFP16-GI-NEXT:    ret
 ;
 ; CHECK-FP16-GI-LABEL: min_v7f16:
@@ -775,26 +768,24 @@ define <7 x half> @max_v7f16(<7 x half> %a, <7 x half> %b) {
 ;
 ; CHECK-NOFP16-GI-LABEL: max_v7f16:
 ; CHECK-NOFP16-GI:       // %bb.0: // %entry
-; CHECK-NOFP16-GI-NEXT:    mov h2, v0.h[4]
-; CHECK-NOFP16-GI-NEXT:    mov h3, v0.h[5]
-; CHECK-NOFP16-GI-NEXT:    mov h4, v1.h[4]
-; CHECK-NOFP16-GI-NEXT:    mov h5, v1.h[5]
-; CHECK-NOFP16-GI-NEXT:    fcvtl v6.4s, v0.4h
-; CHECK-NOFP16-GI-NEXT:    fcvtl v7.4s, v1.4h
-; CHECK-NOFP16-GI-NEXT:    mov h0, v0.h[6]
+; CHECK-NOFP16-GI-NEXT:    fcvtl v2.4s, v0.4h
+; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v1.4h
+; CHECK-NOFP16-GI-NEXT:    mov h4, v0.h[4]
+; CHECK-NOFP16-GI-NEXT:    mov h5, v0.h[5]
+; CHECK-NOFP16-GI-NEXT:    mov h6, v1.h[4]
+; CHECK-NOFP16-GI-NEXT:    mov h7, v1.h[5]
 ; CHECK-NOFP16-GI-NEXT:    mov h1, v1.h[6]
-; CHECK-NOFP16-GI-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-NOFP16-GI-NEXT:    fmax v2.4s, v2.4s, v3.4s
+; CHECK-NOFP16-GI-NEXT:    mov h3, v0.h[6]
 ; CHECK-NOFP16-GI-NEXT:    mov v4.h[1], v5.h[0]
-; CHECK-NOFP16-GI-NEXT:    fmax v3.4s, v6.4s, v7.4s
-; CHECK-NOFP16-GI-NEXT:    mov v2.h[2], v0.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v4.h[2], v1.h[0]
-; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v3.4s
-; CHECK-NOFP16-GI-NEXT:    mov v2.h[3], v0.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v4.h[3], v0.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v6.h[1], v7.h[0]
+; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v2.4s
+; CHECK-NOFP16-GI-NEXT:    mov v4.h[2], v3.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v6.h[2], v1.h[0]
 ; CHECK-NOFP16-GI-NEXT:    mov h1, v0.h[1]
 ; CHECK-NOFP16-GI-NEXT:    mov h5, v0.h[3]
-; CHECK-NOFP16-GI-NEXT:    fcvtl v2.4s, v2.4h
-; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v4.4h
+; CHECK-NOFP16-GI-NEXT:    fcvtl v2.4s, v4.4h
+; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v6.4h
 ; CHECK-NOFP16-GI-NEXT:    mov h4, v0.h[2]
 ; CHECK-NOFP16-GI-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-NOFP16-GI-NEXT:    fmax v1.4s, v2.4s, v3.4s
@@ -806,7 +797,6 @@ define <7 x half> @max_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-NOFP16-GI-NEXT:    mov h1, v1.h[2]
 ; CHECK-NOFP16-GI-NEXT:    mov v0.h[5], v2.h[0]
 ; CHECK-NOFP16-GI-NEXT:    mov v0.h[6], v1.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[7], v0.h[0]
 ; CHECK-NOFP16-GI-NEXT:    ret
 ;
 ; CHECK-FP16-GI-LABEL: max_v7f16:
diff --git a/llvm/test/CodeGen/AArch64/fminmax.ll b/llvm/test/CodeGen/AArch64/fminmax.ll
index cdf9973b49f46..61199f82615bb 100644
--- a/llvm/test/CodeGen/AArch64/fminmax.ll
+++ b/llvm/test/CodeGen/AArch64/fminmax.ll
@@ -334,41 +334,39 @@ define <7 x float> @min_v7f32(<7 x float> %a, <7 x float> %b) {
 ;
 ; CHECK-GI-LABEL: min_v7f32:
 ; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ldr s16, [sp]
 ; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-GI-NEXT:    // kill: def $s7 killed $s7 def $q7
 ; CHECK-GI-NEXT:    // kill: def $s1 killed $s1 def $q1
-; CHECK-GI-NEXT:    ldr s16, [sp]
-; CHECK-GI-NEXT:    ldr s17, [sp, #24]
+; CHECK-GI-NEXT:    ldr s17, [sp, #32]
 ; CHECK-GI-NEXT:    // kill: def $s4 killed $s4 def $q4
-; CHECK-GI-NEXT:    // kill: def $s7 killed $s7 def $q7
 ; CHECK-GI-NEXT:    // kill: def $s2 killed $s2 def $q2
 ; CHECK-GI-NEXT:    // kill: def $s5 killed $s5 def $q5
-; CHECK-GI-NEXT:    // kill: def $s6 killed $s6 def $q6
 ; CHECK-GI-NEXT:    // kill: def $s3 killed $s3 def $q3
-; CHECK-GI-NEXT:    ldr s18, [sp, #32]
+; CHECK-GI-NEXT:    // kill: def $s6 killed $s6 def $q6
 ; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT:    mov v4.s[1], v5.s[0]
 ; CHECK-GI-NEXT:    ldr s1, [sp, #8]
+; CHECK-GI-NEXT:    mov v4.s[1], v5.s[0]
 ; CHECK-GI-NEXT:    mov v7.s[1], v16.s[0]
-; CHECK-GI-NEXT:    mov v17.s[1], v18.s[0]
-; CHECK-GI-NEXT:    ldr s5, [sp, #40]
+; CHECK-GI-NEXT:    ldr s16, [sp, #24]
+; CHECK-GI-NEXT:    mov v16.s[1], v17.s[0]
 ; CHECK-GI-NEXT:    mov v0.s[2], v2.s[0]
+; CHECK-GI-NEXT:    ldr s2, [sp, #40]
 ; CHECK-GI-NEXT:    mov v4.s[2], v6.s[0]
 ; CHECK-GI-NEXT:    mov v7.s[2], v1.s[0]
-; CHECK-GI-NEXT:    mov v17.s[2], v5.s[0]
 ; CHECK-GI-NEXT:    ldr s1, [sp, #16]
+; CHECK-GI-NEXT:    mov v16.s[2], v2.s[0]
 ; CHECK-GI-NEXT:    mov v0.s[3], v3.s[0]
 ; CHECK-GI-NEXT:    mov v7.s[3], v1.s[0]
-; CHECK-GI-NEXT:    mov v4.s[3], v0.s[0]
-; CHECK-GI-NEXT:    mov v17.s[3], v0.s[0]
+; CHECK-GI-NEXT:    fminnm v4.4s, v4.4s, v16.4s
 ; CHECK-GI-NEXT:    fminnm v0.4s, v0.4s, v7.4s
-; CHECK-GI-NEXT:    fminnm v4.4s, v4.4s, v17.4s
+; CHECK-GI-NEXT:    mov s5, v4.s[1]
+; CHECK-GI-NEXT:    mov s6, v4.s[2]
+; CHECK-GI-NEXT:    // kill: def $s4 killed $s4 killed $q4
 ; CHECK-GI-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NEXT:    mov s2, v0.s[2]
 ; CHECK-GI-NEXT:    mov s3, v0.s[3]
 ; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-GI-NEXT:    mov s5, v4.s[1]
-; CHECK-GI-NEXT:    mov s6, v4.s[2]
-; CHECK-GI-NEXT:    // kill: def $s4 killed $s4 killed $q4
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = call <7 x float> @llvm.minnum.v7f32(<7 x float> %a, <7 x float> %b)
@@ -415,41 +413,39 @@ define <7 x float> @max_v7f32(<7 x float> %a, <7 x float> %b) {
 ;
 ; CHECK-GI-LABEL: max_v7f32:
 ; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ldr s16, [sp]
 ; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-GI-NEXT:    // kill: def $s7 killed $s7 def $q7
 ; CHECK-GI-NEXT:    // kill: def $s1 killed $s1 def $q1
-; CHECK-GI-NEXT:    ldr s16, [sp]
-; CHECK-GI-NEXT:    ldr s17, [sp, #24]
+; CHECK-GI-NEXT:    ldr s17, [sp, #32]
 ; CHECK-GI-NEXT:    // kill: def $s4 killed $s4 def $q4
-; CHECK-GI-NEXT:    // kill: def $s7 killed $s7 def $q7
 ; CHECK-GI-NEXT:    // kill: def $s2 killed $s2 def $q2
 ; CHECK-GI-NEXT:    // kill: def $s5 killed $s5 def $q5
-; CHECK-GI-NEXT:    // kill: def $s6 killed $s6 def $q6
 ; CHECK-GI-NEXT:    // kill: def $s3 killed $s3 def $q3
-; CHECK-GI-NEXT:    ldr s18, [sp, #32]
+; CHECK-GI-NEXT:    // kill: def $s6 killed $s6 def $q6
 ; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT:    mov v4.s[1], v5.s[0]
 ; CHECK-GI-NEXT:    ldr s1, [sp, #8]
+; CHECK-GI-NEXT:    mov v4.s[1], v5.s[0]
 ; CHECK-GI-NEXT:    mov v7.s[1], v16.s[0]
-; CHECK-GI-NEXT:    mov v17.s[1], v18.s[0]
-; CHECK-GI-NEXT:    ldr s5, [sp, #40]
+; CHECK-GI-NEXT:    ldr s16, [sp, #24]
+; CHECK-GI-NEXT:    mov v16.s[1], v17.s[0]
 ; CHECK-GI-NEXT:    mov v0.s[2], v2.s[0]
+; CHECK-GI-NEXT:    ldr s2, [sp, #40]
 ; CHECK-GI-NEXT:    mov v4.s[2], v6.s[0]
 ; CHECK-GI-NEXT:    mov v7.s[2], v1.s[0]
-; CHECK-GI-NEXT:    mov v17.s[2], v5.s[0]
 ; CHECK-GI-NEXT:    ldr s1, [sp, #16]
+; CHECK-GI-NEXT:    mov v16.s[2], v2.s[0]
 ; CHECK-GI-NEXT:    mov v0.s[3], v3.s[0]
 ; CHECK-GI-NEXT:    mov v7.s[3], v1.s[0]
-; CHECK-GI-NEXT:    mov v4.s[3], v0.s[0]
-; CHECK-GI-NEXT:    mov v17.s[3], v0.s[0]
+; CHECK-GI-NEXT:    fmaxnm v4.4s, v4.4s, v16.4s
 ; CHECK-GI-NEXT:    fmaxnm v0.4s, v0.4s, v7.4s
-; CHECK-GI-NEXT:    fmaxnm v4.4s, v4.4s, v17.4s
+; CHECK-GI-NEXT:    mov s5, v4.s[1]
+; CHECK-GI-NEXT:    mov s6, v4.s[2]
+; CHECK-GI-NEXT:    // kill: def $s4 killed $s4 killed $q4
 ; CHECK-GI-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NEXT:    mov s2, v0.s[2]
 ; CHECK-GI-NEXT:    mov s3, v0.s[3]
 ; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-GI-NEXT:    mov s5, v4.s[1]
-; CHECK-GI-NEXT:    mov s6, v4.s[2]
-; CHECK-GI-NEXT:    // kill: def $s4 killed $s4 killed $q4
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = call <7 x float> @llvm.maxnum.v7f32(<7 x float> %a, <7 x float> %b)
@@ -666,26 +662,24 @@ define <7 x half> @min_v7f16(<7 x half> %a, <7 x half> %b) {
 ;
 ; CHECK-NOFP16-GI-LABEL: min_v7f16:
 ; CHECK-NOFP16-GI:       // %bb.0: // %entry
-; CHECK-NOFP16-GI-NEXT:    mov h2, v0.h[4]
-; CHECK-NOFP16-GI-NEXT:    mov h3, v0.h[5]
-; CHECK-NOFP16-GI-NEXT:    mov h4, v1.h[4]
-; CHECK-NOFP16-GI-NEXT:    mov h5, v1.h[5]
-; CHECK-NOFP16-GI-NEXT:    fcvtl v6.4s, v0.4h
-; CHECK-NOFP16-GI-NEXT:    fcvtl v7.4s, v1.4h
-; CHECK-NOFP16-GI-NEXT:    mov h0, v0.h[6]
+; CHECK-NOFP16-GI-NEXT:    fcvtl v2.4s, v0.4h
+; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v1.4h
+; CHECK-NOFP16-GI-NEXT:    mov h4, v0.h[4]
+; CHECK-NOFP16-GI-NEXT:    mov h5, v0.h[5]
+; CHECK-NOFP16-GI-NEXT:    mov h6, v1.h[4]
+; CHECK-NOFP16-GI-NEXT:    mov h7, v1.h[5]
 ; CHECK-NOFP16-GI-NEXT:    mov h1, v1.h[6]
-; CHECK-NOFP16-GI-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-NOFP16-GI-NEXT:    fminnm v2.4s, v2.4s, v3.4s
+; CHECK-NOFP16-GI-NEXT:    mov h3, v0.h[6]
 ; CHECK-NOFP16-GI-NEXT:    mov v4.h[1], v5.h[0]
-; CHECK-NOFP16-GI-NEXT:    fminnm v3.4s, v6.4s, v7.4s
-; CHECK-NOFP16-GI-NEXT:    mov v2.h[2], v0.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v4.h[2], v1.h[0]
-; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v3.4s
-; CHECK-NOFP16-GI-NEXT:    mov v2.h[3], v0.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v4.h[3], v0.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v6.h[1], v7.h[0]
+; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v2.4s
+; CHECK-NOFP16-GI-NEXT:    mov v4.h[2], v3.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v6.h[2], v1.h[0]
 ; CHECK-NOFP16-GI-NEXT:    mov h1, v0.h[1]
 ; CHECK-NOFP16-GI-NEXT:    mov h5, v0.h[3]
-; CHECK-NOFP16-GI-NEXT:    fcvtl v2.4s, v2.4h
-; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v4.4h
+; CHECK-NOFP16-GI-NEXT:    fcvtl v2.4s, v4.4h
+; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v6.4h
 ; CHECK-NOFP16-GI-NEXT:    mov h4, v0.h[2]
 ; CHECK-NOFP16-GI-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-NOFP16-GI-NEXT:    fminnm v1.4s, v2.4s, v3.4s
@@ -697,7 +691,6 @@ define <7 x half> @min_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-NOFP16-GI-NEXT:    mov h1, v1.h[2]
 ; CHECK-NOFP16-GI-NEXT:    mov v0.h[5], v2.h[0]
 ; CHECK-NOFP16-GI-NEXT:    mov v0.h[6], v1.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[7], v0.h[0]
 ; CHECK-NOFP16-GI-NEXT:    ret
 ;
 ; CHECK-FP16-GI-LABEL: min_v7f16:
@@ -775,26 +768,24 @@ define <7 x half> @max_v7f16(<7 x half> %a, <7 x half> %b) {
 ;
 ; CHECK-NOFP16-GI-LABEL: max_v7f16:
 ; CHECK-NOFP16-GI:       // %bb.0: // %entry
-; CHECK-NOFP16-GI-NEXT:    mov h2, v0.h[4]
-; CHECK-NOFP16-GI-NEXT:    mov h3, v0.h[5]
-; CHECK-NOFP16-GI-NEXT:    mov h4, v1.h[4]
-; CHECK-NOFP16-GI-NEXT:    mov h5, v1.h[5]
-; CHECK-NOFP16-GI-NEXT:    fcvtl v6.4s, v0.4h
-; CHECK-NOFP16-GI-NEXT:    fcvtl v7.4s, v1.4h
-; CHECK-NOFP16-GI-NEXT:    mov h0, v0.h[6]
+; CHECK-NOFP16-GI-NEXT:    fcvtl v2.4s, v0.4h
+; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v1.4h
+; CHECK-NOFP16-GI-NEXT:    mov h4, v0.h[4]
+; CHECK-NOFP16-GI-NEXT:    mov h5, v0.h[5]
+; CHECK-NOFP16-GI-NEXT:    mov h6, v1.h[4]
+; CHECK-NOFP16-GI-NEXT:    mov h7, v1.h[5]
 ; CHECK-NOFP16-GI-NEXT:    mov h1, v1.h[6]
-; CHECK-NOFP16-GI-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-NOFP16-GI-NEXT:    fmaxnm v2.4s, v2.4s, v3.4s
+; CHECK-NOFP16-GI-NEXT:    mov h3, v0.h[6]
 ; CHECK-NOFP16-GI-NEXT:    mov v4.h[1], v5.h[0]
-; CHECK-NOFP16-GI-NEXT:    fmaxnm v3.4s, v6.4s, v7.4s
-; CHECK-NOFP16-GI-NEXT:    mov v2.h[2], v0.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v4.h[2], v1.h[0]
-; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v3.4s
-; CHECK-NOFP16-GI-NEXT:    mov v2.h[3], v0.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v4.h[3], v0.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v6.h[1], v7.h[0]
+; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v2.4s
+; CHECK-NOFP16-GI-NEXT:    mov v4.h[2], v3.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v6.h[2], v1.h[0]
 ; CHECK-NOFP16-GI-NEXT:    mov h1, v0.h[1]
 ; CHECK-NOFP16-GI-NEXT:    mov h5, v0.h[3]
-; CHECK-NOFP16-GI-NEXT:    fcvtl v2.4s, v2.4h
-; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v4.4h
+; CHECK-NOFP16-GI-NEXT:    fcvtl v2.4s, v4.4h
+; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v6.4h
 ; CHECK-NOFP16-GI-NEXT:    mov h4, v0.h[2]
 ; CHECK-NOFP16-GI-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-NOFP16-GI-NEXT:    fmaxnm v1.4s, v2.4s, v3.4s
@@ -806,7 +797,6 @@ define <7 x half> @max_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-NOFP16-GI-NEXT:    mov h1, v1.h[2]
 ; CHECK-NOFP16-GI-NEXT:    mov v0.h[5], v2.h[0]
 ; CHECK-NOFP16-GI-NEXT:    mov v0.h[6], v1.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[7], v0.h[0]
 ; CHECK-NOFP16-GI-NEXT:    ret
 ;
 ; CHECK-FP16-GI-LABEL: max_v7f16:
diff --git a/llvm/test/CodeGen/AArch64/fmla.ll b/llvm/test/CodeGen/AArch64/fmla.ll
index 336c9705f399d..4b019b57d968d 100644
--- a/llvm/test/CodeGen/AArch64/fmla.ll
+++ b/llvm/test/CodeGen/AArch64/fmla.ll
@@ -254,35 +254,32 @@ define <7 x half> @fma_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) {
 ;
 ; CHECK-GI-NOFP16-LABEL: fma_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h6, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h4, v1.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h7, v1.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h5, v2.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h16, v2.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v17.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v18.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v19.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v4.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v5.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    mov h6, v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h7, v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov h16, v1.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h17, v1.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov h18, v2.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h19, v2.h[5]
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[6]
 ; CHECK-GI-NOFP16-NEXT:    mov h2, v2.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v6.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v7.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[1], v16.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmla v19.4s, v18.4s, v17.4s
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[2], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v19.4s
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[3], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    fmla v5.4s, v4.4s, v3.4s
+; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v6.h[1], v7.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v16.h[1], v17.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v18.h[1], v19.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v5.4s
+; CHECK-GI-NOFP16-NEXT:    mov v6.h[2], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v16.h[2], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v18.h[2], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov h6, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v3.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v4.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v4.4s, v5.4h
 ; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[2]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v6.4h
+; CHECK-GI-NOFP16-NEXT:    mov h6, v0.h[3]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v16.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v4.4s, v18.4h
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fmla v4.4s, v3.4s, v2.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v5.h[0]
@@ -293,7 +290,6 @@ define <7 x half> @fma_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) {
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[7], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: fma_v7f16:
@@ -866,43 +862,40 @@ define <7 x half> @fmuladd_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) {
 ;
 ; CHECK-GI-NOFP16-LABEL: fmuladd_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h5, v1.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h6, v1.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v7.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v16.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v4.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h6, v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov h7, v1.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h16, v1.h[5]
 ; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[6]
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v4.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[1], v6.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmul v4.4s, v7.4s, v16.4s
-; CHECK-GI-NOFP16-NEXT:    fcvtl v6.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[2], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v4.4s
-; CHECK-GI-NOFP16-NEXT:    mov h1, v2.h[4]
+; CHECK-GI-NOFP16-NEXT:    fmul v3.4s, v3.4s, v4.4s
 ; CHECK-GI-NOFP16-NEXT:    mov h4, v2.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v2.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v4.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v3.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v4.4s, v5.4h
-; CHECK-GI-NOFP16-NEXT:    fadd v0.4s, v0.4s, v6.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmul v2.4s, v3.4s, v4.4s
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[1], v6.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v7.h[1], v16.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v3.4h, v3.4s
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[2], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v7.h[2], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v3.4h
+; CHECK-GI-NOFP16-NEXT:    mov h3, v2.h[4]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v5.4s, v5.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v6.4s, v7.4h
+; CHECK-GI-NOFP16-NEXT:    fadd v0.4s, v0.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov h1, v2.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v4.h[0]
+; CHECK-GI-NOFP16-NEXT:    fmul v2.4s, v5.4s, v6.4s
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v3.4h
 ; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v3.h[0]
-; CHECK-GI-NOFP16-NEXT:    fadd v1.4s, v2.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    fadd v1.4s, v2.4s, v3.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v4.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v5.h[0]
@@ -911,7 +904,6 @@ define <7 x half> @fmuladd_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) {
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[7], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: fmuladd_v7f16:
@@ -1368,43 +1360,40 @@ define <7 x half> @fmul_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) {
 ;
 ; CHECK-GI-NOFP16-LABEL: fmul_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h5, v1.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h6, v1.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v7.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v16.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v4.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h6, v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov h7, v1.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h16, v1.h[5]
 ; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[6]
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v4.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[1], v6.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmul v4.4s, v7.4s, v16.4s
-; CHECK-GI-NOFP16-NEXT:    fcvtl v6.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[2], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v4.4s
-; CHECK-GI-NOFP16-NEXT:    mov h1, v2.h[4]
+; CHECK-GI-NOFP16-NEXT:    fmul v3.4s, v3.4s, v4.4s
 ; CHECK-GI-NOFP16-NEXT:    mov h4, v2.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v2.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v4.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v3.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v4.4s, v5.4h
-; CHECK-GI-NOFP16-NEXT:    fadd v0.4s, v0.4s, v6.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmul v2.4s, v3.4s, v4.4s
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[1], v6.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v7.h[1], v16.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v3.4h, v3.4s
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[2], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v7.h[2], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v3.4h
+; CHECK-GI-NOFP16-NEXT:    mov h3, v2.h[4]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v5.4s, v5.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v6.4s, v7.4h
+; CHECK-GI-NOFP16-NEXT:    fadd v0.4s, v0.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov h1, v2.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v4.h[0]
+; CHECK-GI-NOFP16-NEXT:    fmul v2.4s, v5.4s, v6.4s
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v3.4h
 ; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v3.h[0]
-; CHECK-GI-NOFP16-NEXT:    fadd v1.4s, v2.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    fadd v1.4s, v2.4s, v3.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v4.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v5.h[0]
@@ -1413,7 +1402,6 @@ define <7 x half> @fmul_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) {
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[7], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: fmul_v7f16:
diff --git a/llvm/test/CodeGen/AArch64/fmul.ll b/llvm/test/CodeGen/AArch64/fmul.ll
index 1f49601a18272..1f41f2385c335 100644
--- a/llvm/test/CodeGen/AArch64/fmul.ll
+++ b/llvm/test/CodeGen/AArch64/fmul.ll
@@ -186,26 +186,24 @@ define <7 x half> @fmul_v7f16(<7 x half> %a, <7 x half> %b) {
 ;
 ; CHECK-GI-NOFP16-LABEL: fmul_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h4, v1.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h5, v1.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v6.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v7.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov h6, v1.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h7, v1.h[5]
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    fmul v2.4s, v2.4s, v3.4s
+; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[6]
 ; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v5.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmul v3.4s, v6.4s, v7.4s
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[3], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v6.h[1], v7.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v6.h[2], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v6.4h
 ; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fmul v1.4s, v2.4s, v3.4s
@@ -217,7 +215,6 @@ define <7 x half> @fmul_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[7], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: fmul_v7f16:
diff --git a/llvm/test/CodeGen/AArch64/fneg.ll b/llvm/test/CodeGen/AArch64/fneg.ll
index d5010cf360841..cc0f7d2fd6075 100644
--- a/llvm/test/CodeGen/AArch64/fneg.ll
+++ b/llvm/test/CodeGen/AArch64/fneg.ll
@@ -161,21 +161,20 @@ define <7 x half> @fabs_v7f16(<7 x half> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: fabs_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    fneg v2.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    fneg v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v4.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
 ; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    fneg v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    fneg v1.4s, v2.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v4.h[0]
@@ -184,7 +183,6 @@ define <7 x half> @fabs_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[7], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: fabs_v7f16:
diff --git a/llvm/test/CodeGen/AArch64/fpext.ll b/llvm/test/CodeGen/AArch64/fpext.ll
index 86f7322f7c4ee..24a2451df4842 100644
--- a/llvm/test/CodeGen/AArch64/fpext.ll
+++ b/llvm/test/CodeGen/AArch64/fpext.ll
@@ -168,8 +168,6 @@ define <2 x float> @fpext_v2f16_v2f32(<2 x half> %a) {
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NEXT:    fcvtl v0.4s, v0.4h
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/fpow.ll b/llvm/test/CodeGen/AArch64/fpow.ll
index 1dd5450c271cb..c2ad1aafd65fc 100644
--- a/llvm/test/CodeGen/AArch64/fpow.ll
+++ b/llvm/test/CodeGen/AArch64/fpow.ll
@@ -395,7 +395,6 @@ define <3 x float> @pow_v3f32(<3 x float> %a, <3 x float> %b) {
 ; CHECK-GI-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.s[1], v2.s[0]
 ; CHECK-GI-NEXT:    mov v1.s[2], v0.s[0]
-; CHECK-GI-NEXT:    mov v1.s[3], v0.s[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    add sp, sp, #80
 ; CHECK-GI-NEXT:    ret
@@ -856,7 +855,6 @@ define <7 x half> @pow_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-GI-NEXT:    ldr q2, [sp, #80] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT:    mov v1.h[7], v0.h[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    add sp, sp, #176
 ; CHECK-GI-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/fpowi.ll b/llvm/test/CodeGen/AArch64/fpowi.ll
index b496c7d15eef3..5dbcaa4a5fda1 100644
--- a/llvm/test/CodeGen/AArch64/fpowi.ll
+++ b/llvm/test/CodeGen/AArch64/fpowi.ll
@@ -370,7 +370,6 @@ define <3 x float> @powi_v3f32(<3 x float> %a, i32 %b) {
 ; CHECK-GI-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.s[1], v2.s[0]
 ; CHECK-GI-NEXT:    mov v1.s[2], v0.s[0]
-; CHECK-GI-NEXT:    mov v1.s[3], v0.s[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    add sp, sp, #64
 ; CHECK-GI-NEXT:    ret
@@ -787,7 +786,6 @@ define <7 x half> @powi_v7f16(<7 x half> %a, i32 %b) {
 ; CHECK-GI-NEXT:    mov v1.h[4], v3.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT:    mov v1.h[7], v0.h[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    add sp, sp, #160
 ; CHECK-GI-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/fptoi.ll b/llvm/test/CodeGen/AArch64/fptoi.ll
index facb89671056f..67190e8596c46 100644
--- a/llvm/test/CodeGen/AArch64/fptoi.ll
+++ b/llvm/test/CodeGen/AArch64/fptoi.ll
@@ -2708,7 +2708,6 @@ define <3 x i16> @fptos_v3f32_v3i16(<3 x float> %a) {
 ; CHECK-GI-NEXT:    mov s2, v0.s[2]
 ; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-NEXT:    mov v0.h[2], v2.h[0]
-; CHECK-GI-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -2730,7 +2729,6 @@ define <3 x i16> @fptou_v3f32_v3i16(<3 x float> %a) {
 ; CHECK-GI-NEXT:    mov s2, v0.s[2]
 ; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-NEXT:    mov v0.h[2], v2.h[0]
-; CHECK-GI-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -3243,8 +3241,6 @@ define <2 x i64> @fptos_v2f16_v2i64(<2 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v0.2d, v0.2s
 ; CHECK-GI-NOFP16-NEXT:    fcvtzs v0.2d, v0.2d
@@ -3292,8 +3288,6 @@ define <2 x i64> @fptou_v2f16_v2i64(<2 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v0.2d, v0.2s
 ; CHECK-GI-NOFP16-NEXT:    fcvtzu v0.2d, v0.2d
@@ -4996,8 +4990,6 @@ define <2 x i32> @fptos_v2f16_v2i32(<2 x half> %a) {
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NEXT:    fcvtl v0.4s, v0.4h
 ; CHECK-GI-NEXT:    fcvtzs v0.2s, v0.2s
 ; CHECK-GI-NEXT:    ret
@@ -5019,8 +5011,6 @@ define <2 x i32> @fptou_v2f16_v2i32(<2 x half> %a) {
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NEXT:    fcvtl v0.4s, v0.4h
 ; CHECK-GI-NEXT:    fcvtzu v0.2s, v0.2s
 ; CHECK-GI-NEXT:    ret
@@ -5276,8 +5266,6 @@ define <2 x i16> @fptos_v2f16_v2i16(<2 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtzs v0.2s, v0.2s
 ; CHECK-GI-NOFP16-NEXT:    ret
@@ -5306,8 +5294,6 @@ define <2 x i16> @fptou_v2f16_v2i16(<2 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtzu v0.2s, v0.2s
 ; CHECK-GI-NOFP16-NEXT:    ret
@@ -5344,7 +5330,6 @@ define <3 x i16> @fptos_v3f16_v3i16(<3 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    mov s2, v0.s[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
@@ -5378,7 +5363,6 @@ define <3 x i16> @fptou_v3f16_v3i16(<3 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    mov s2, v0.s[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
@@ -5756,8 +5740,6 @@ define <2 x i8> @fptos_v2f16_v2i8(<2 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtzs v0.2s, v0.2s
 ; CHECK-GI-NOFP16-NEXT:    ret
@@ -5786,8 +5768,6 @@ define <2 x i8> @fptou_v2f16_v2i8(<2 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtzu v0.2s, v0.2s
 ; CHECK-GI-NOFP16-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/fptrunc.ll b/llvm/test/CodeGen/AArch64/fptrunc.ll
index 3efc98ab5fd53..9d0672d1c95ea 100644
--- a/llvm/test/CodeGen/AArch64/fptrunc.ll
+++ b/llvm/test/CodeGen/AArch64/fptrunc.ll
@@ -63,7 +63,6 @@ define <3 x float> @fptrunc_v3f64_v3f32(<3 x double> %a) {
 ; CHECK-GI-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
 ; CHECK-GI-NEXT:    mov v0.s[2], v2.s[0]
-; CHECK-GI-NEXT:    mov v0.s[3], v0.s[0]
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = fptrunc <3 x double> %a to <3 x float>
@@ -94,8 +93,6 @@ define <2 x half> @fptrunc_v2f64_v2f16(<2 x double> %a) {
 ; CHECK-GI-NEXT:    fcvt h0, d0
 ; CHECK-GI-NEXT:    fcvt h1, d1
 ; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -121,7 +118,6 @@ define <3 x half> @fptrunc_v3f64_v3f16(<3 x double> %a) {
 ; CHECK-GI-NEXT:    fcvt h2, d2
 ; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-NEXT:    mov v0.h[2], v2.h[0]
-; CHECK-GI-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -167,13 +163,9 @@ define <2 x half> @fptrunc_v2f32_v2f16(<2 x float> %a) {
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT:    mov v0.s[2], v0.s[0]
-; CHECK-GI-NEXT:    mov v0.s[3], v0.s[0]
 ; CHECK-GI-NEXT:    fcvtn v0.4h, v0.4s
 ; CHECK-GI-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/frem.ll b/llvm/test/CodeGen/AArch64/frem.ll
index 03caf0a33eb45..1a10fd2f1cdc3 100644
--- a/llvm/test/CodeGen/AArch64/frem.ll
+++ b/llvm/test/CodeGen/AArch64/frem.ll
@@ -397,7 +397,6 @@ define <3 x float> @frem_v3f32(<3 x float> %a, <3 x float> %b) {
 ; CHECK-GI-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.s[1], v2.s[0]
 ; CHECK-GI-NEXT:    mov v1.s[2], v0.s[0]
-; CHECK-GI-NEXT:    mov v1.s[3], v0.s[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    add sp, sp, #80
 ; CHECK-GI-NEXT:    ret
@@ -858,7 +857,6 @@ define <7 x half> @frem_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-GI-NEXT:    ldr q2, [sp, #80] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT:    mov v1.h[7], v0.h[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    add sp, sp, #176
 ; CHECK-GI-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/fsincos.ll b/llvm/test/CodeGen/AArch64/fsincos.ll
index 2c76d969d6efe..2ab1610edcc7f 100644
--- a/llvm/test/CodeGen/AArch64/fsincos.ll
+++ b/llvm/test/CodeGen/AArch64/fsincos.ll
@@ -332,7 +332,6 @@ define <3 x float> @sin_v3f32(<3 x float> %a) {
 ; CHECK-GI-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.s[1], v2.s[0]
 ; CHECK-GI-NEXT:    mov v1.s[2], v0.s[0]
-; CHECK-GI-NEXT:    mov v1.s[3], v0.s[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    add sp, sp, #64
 ; CHECK-GI-NEXT:    ret
@@ -703,7 +702,6 @@ define <7 x half> @sin_v7f16(<7 x half> %a) {
 ; CHECK-GI-NEXT:    mov v1.h[4], v3.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT:    mov v1.h[7], v0.h[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    add sp, sp, #160
 ; CHECK-GI-NEXT:    ret
@@ -1591,7 +1589,6 @@ define <3 x float> @cos_v3f32(<3 x float> %a) {
 ; CHECK-GI-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.s[1], v2.s[0]
 ; CHECK-GI-NEXT:    mov v1.s[2], v0.s[0]
-; CHECK-GI-NEXT:    mov v1.s[3], v0.s[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    add sp, sp, #64
 ; CHECK-GI-NEXT:    ret
@@ -1962,7 +1959,6 @@ define <7 x half> @cos_v7f16(<7 x half> %a) {
 ; CHECK-GI-NEXT:    mov v1.h[4], v3.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT:    mov v1.h[7], v0.h[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    add sp, sp, #160
 ; CHECK-GI-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/fsqrt.ll b/llvm/test/CodeGen/AArch64/fsqrt.ll
index 683544a69ebe1..4b48bcc5508db 100644
--- a/llvm/test/CodeGen/AArch64/fsqrt.ll
+++ b/llvm/test/CodeGen/AArch64/fsqrt.ll
@@ -195,17 +195,16 @@ define <7 x half> @sqrt_v7f16(<7 x half> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: sqrt_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[5]
 ; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    fsqrt v2.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v2.4s
 ; CHECK-GI-NOFP16-NEXT:    fsqrt v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT:    fsqrt v1.4s, v2.4s
 ; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[3]
@@ -218,7 +217,6 @@ define <7 x half> @sqrt_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[7], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: sqrt_v7f16:
diff --git a/llvm/test/CodeGen/AArch64/icmp.ll b/llvm/test/CodeGen/AArch64/icmp.ll
index 2e8c93a00a0d8..e7352fe03d01a 100644
--- a/llvm/test/CodeGen/AArch64/icmp.ll
+++ b/llvm/test/CodeGen/AArch64/icmp.ll
@@ -177,15 +177,13 @@ define <3 x i32> @v3i32_i32(<3 x i32> %a, <3 x i32> %b, <3 x i32> %d, <3 x i32>
 ; CHECK-GI-NEXT:    mov v4.s[1], w8
 ; CHECK-GI-NEXT:    mov v4.s[2], w8
 ; CHECK-GI-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-GI-NEXT:    fmov s5, w8
-; CHECK-GI-NEXT:    mov v5.s[1], w8
-; CHECK-GI-NEXT:    mov v4.s[3], w8
-; CHECK-GI-NEXT:    mov v5.s[2], w8
-; CHECK-GI-NEXT:    neg v1.4s, v4.4s
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    mov v1.s[1], w8
+; CHECK-GI-NEXT:    neg v5.4s, v4.4s
 ; CHECK-GI-NEXT:    ushl v0.4s, v0.4s, v4.4s
-; CHECK-GI-NEXT:    mov v5.s[3], w8
-; CHECK-GI-NEXT:    sshl v0.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT:    eor v1.16b, v0.16b, v5.16b
+; CHECK-GI-NEXT:    mov v1.s[2], w8
+; CHECK-GI-NEXT:    sshl v0.4s, v0.4s, v5.4s
+; CHECK-GI-NEXT:    eor v1.16b, v0.16b, v1.16b
 ; CHECK-GI-NEXT:    and v0.16b, v2.16b, v0.16b
 ; CHECK-GI-NEXT:    and v1.16b, v3.16b, v1.16b
 ; CHECK-GI-NEXT:    orr v0.16b, v0.16b, v1.16b
diff --git a/llvm/test/CodeGen/AArch64/insertextract.ll b/llvm/test/CodeGen/AArch64/insertextract.ll
index b0df5cb3d8371..5c2dd761bdc0d 100644
--- a/llvm/test/CodeGen/AArch64/insertextract.ll
+++ b/llvm/test/CodeGen/AArch64/insertextract.ll
@@ -233,7 +233,6 @@ define <3 x float> @insert_v3f32_0(<3 x float> %a, float %b, i32 %c) {
 ; CHECK-GI-NEXT:    mov s0, v0.s[2]
 ; CHECK-GI-NEXT:    mov v1.s[1], v2.s[0]
 ; CHECK-GI-NEXT:    mov v1.s[2], v0.s[0]
-; CHECK-GI-NEXT:    mov v1.s[3], v0.s[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -254,7 +253,6 @@ define <3 x float> @insert_v3f32_2(<3 x float> %a, float %b, i32 %c) {
 ; CHECK-GI-NEXT:    // kill: def $s1 killed $s1 def $q1
 ; CHECK-GI-NEXT:    mov v0.s[1], v2.s[0]
 ; CHECK-GI-NEXT:    mov v0.s[2], v1.s[0]
-; CHECK-GI-NEXT:    mov v0.s[3], v0.s[0]
 ; CHECK-GI-NEXT:    ret
 entry:
   %d = insertelement <3 x float> %a, float %b, i32 2
@@ -766,7 +764,6 @@ define <3 x i32> @insert_v3i32_0(<3 x i32> %a, i32 %b, i32 %c) {
 ; CHECK-GI-NEXT:    mov v0.s[1], w8
 ; CHECK-GI-NEXT:    fmov w8, s2
 ; CHECK-GI-NEXT:    mov v0.s[2], w8
-; CHECK-GI-NEXT:    mov v0.s[3], w8
 ; CHECK-GI-NEXT:    ret
 entry:
   %d = insertelement <3 x i32> %a, i32 %b, i32 0
@@ -785,7 +782,6 @@ define <3 x i32> @insert_v3i32_2(<3 x i32> %a, i32 %b, i32 %c) {
 ; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
 ; CHECK-GI-NEXT:    fmov s1, w0
 ; CHECK-GI-NEXT:    mov v0.s[2], v1.s[0]
-; CHECK-GI-NEXT:    mov v0.s[3], v0.s[0]
 ; CHECK-GI-NEXT:    ret
 entry:
   %d = insertelement <3 x i32> %a, i32 %b, i32 2
diff --git a/llvm/test/CodeGen/AArch64/itofp.ll b/llvm/test/CodeGen/AArch64/itofp.ll
index 708bb43887f86..2164c2aad2011 100644
--- a/llvm/test/CodeGen/AArch64/itofp.ll
+++ b/llvm/test/CodeGen/AArch64/itofp.ll
@@ -2605,7 +2605,6 @@ define <3 x float> @stofp_v3i64_v3f32(<3 x i64> %a) {
 ; CHECK-GI-NEXT:    mov s2, v0.s[1]
 ; CHECK-GI-NEXT:    mov v0.s[1], v2.s[0]
 ; CHECK-GI-NEXT:    mov v0.s[2], v1.s[0]
-; CHECK-GI-NEXT:    mov v0.s[3], v0.s[0]
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = sitofp <3 x i64> %a to <3 x float>
@@ -2638,7 +2637,6 @@ define <3 x float> @utofp_v3i64_v3f32(<3 x i64> %a) {
 ; CHECK-GI-NEXT:    mov s2, v0.s[1]
 ; CHECK-GI-NEXT:    mov v0.s[1], v2.s[0]
 ; CHECK-GI-NEXT:    mov v0.s[2], v1.s[0]
-; CHECK-GI-NEXT:    mov v0.s[3], v0.s[0]
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = uitofp <3 x i64> %a to <3 x float>
@@ -3754,13 +3752,9 @@ define <2 x half> @stofp_v2i64_v2f16(<2 x i64> %a) {
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.2s, v0.2d
 ; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.s[2], v0.s[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.s[3], v0.s[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
@@ -3771,8 +3765,6 @@ define <2 x half> @stofp_v2i64_v2f16(<2 x i64> %a) {
 ; CHECK-GI-FP16-NEXT:    fcvt h0, d0
 ; CHECK-GI-FP16-NEXT:    fcvt h1, d1
 ; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
@@ -3809,13 +3801,9 @@ define <2 x half> @utofp_v2i64_v2f16(<2 x i64> %a) {
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.2s, v0.2d
 ; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.s[2], v0.s[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.s[3], v0.s[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
@@ -3826,8 +3814,6 @@ define <2 x half> @utofp_v2i64_v2f16(<2 x i64> %a) {
 ; CHECK-GI-FP16-NEXT:    fcvt h0, d0
 ; CHECK-GI-FP16-NEXT:    fcvt h1, d1
 ; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
@@ -3876,7 +3862,6 @@ define <3 x half> @stofp_v3i64_v3f16(<3 x i64> %a) {
 ; CHECK-GI-FP16-NEXT:    fcvt h1, d1
 ; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-FP16-NEXT:    mov v0.h[2], v2.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
@@ -3925,7 +3910,6 @@ define <3 x half> @utofp_v3i64_v3f16(<3 x i64> %a) {
 ; CHECK-GI-FP16-NEXT:    fcvt h1, d1
 ; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-FP16-NEXT:    mov v0.h[2], v2.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
@@ -4756,13 +4740,9 @@ define <2 x half> @stofp_v2i32_v2f16(<2 x i32> %a) {
 ; CHECK-GI-NEXT:    scvtf v0.2s, v0.2s
 ; CHECK-GI-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT:    mov v0.s[2], v0.s[0]
-; CHECK-GI-NEXT:    mov v0.s[3], v0.s[0]
 ; CHECK-GI-NEXT:    fcvtn v0.4h, v0.4s
 ; CHECK-GI-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -4783,13 +4763,9 @@ define <2 x half> @utofp_v2i32_v2f16(<2 x i32> %a) {
 ; CHECK-GI-NEXT:    ucvtf v0.2s, v0.2s
 ; CHECK-GI-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT:    mov v0.s[2], v0.s[0]
-; CHECK-GI-NEXT:    mov v0.s[3], v0.s[0]
 ; CHECK-GI-NEXT:    fcvtn v0.4h, v0.4s
 ; CHECK-GI-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -4997,13 +4973,9 @@ define <2 x half> @stofp_v2i16_v2f16(<2 x i16> %a) {
 ; CHECK-GI-NOFP16-NEXT:    scvtf v0.2s, v0.2s
 ; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.s[2], v0.s[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.s[3], v0.s[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
@@ -5012,13 +4984,9 @@ define <2 x half> @stofp_v2i16_v2f16(<2 x i16> %a) {
 ; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-FP16-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-FP16-NEXT:    scvtf v0.4h, v0.4h
 ; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
@@ -5048,13 +5016,9 @@ define <2 x half> @utofp_v2i16_v2f16(<2 x i16> %a) {
 ; CHECK-GI-NOFP16-NEXT:    ucvtf v0.2s, v0.2s
 ; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.s[2], v0.s[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.s[3], v0.s[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
@@ -5063,13 +5027,9 @@ define <2 x half> @utofp_v2i16_v2f16(<2 x i16> %a) {
 ; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-FP16-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-FP16-NEXT:    ucvtf v0.4h, v0.4h
 ; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
@@ -5551,13 +5511,9 @@ define <2 x half> @stofp_v2i8_v2f16(<2 x i8> %a) {
 ; CHECK-GI-NOFP16-NEXT:    scvtf v0.2s, v0.2s
 ; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.s[2], v0.s[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.s[3], v0.s[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
@@ -5566,19 +5522,13 @@ define <2 x half> @stofp_v2i8_v2f16(<2 x i8> %a) {
 ; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-FP16-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-FP16-NEXT:    shl v0.4h, v0.4h, #8
 ; CHECK-GI-FP16-NEXT:    sshr v0.4h, v0.4h, #8
 ; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-FP16-NEXT:    scvtf v0.4h, v0.4h
 ; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
@@ -5622,13 +5572,9 @@ define <2 x half> @utofp_v2i8_v2f16(<2 x i8> %a) {
 ; CHECK-GI-NOFP16-NEXT:    ucvtf v0.2s, v0.2s
 ; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.s[2], v0.s[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.s[3], v0.s[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
@@ -5638,13 +5584,9 @@ define <2 x half> @utofp_v2i8_v2f16(<2 x i8> %a) {
 ; CHECK-GI-FP16-NEXT:    and v0.8b, v0.8b, v1.8b
 ; CHECK-GI-FP16-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-FP16-NEXT:    ucvtf v0.4h, v0.4h
 ; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
@@ -5694,7 +5636,6 @@ define <3 x half> @stofp_v3i8_v3f16(<3 x i8> %a) {
 ; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-FP16-NEXT:    fmov s1, w2
 ; CHECK-GI-FP16-NEXT:    mov v0.h[2], v1.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-FP16-NEXT:    shl v0.4h, v0.4h, #8
 ; CHECK-GI-FP16-NEXT:    sshr v0.4h, v0.4h, #8
 ; CHECK-GI-FP16-NEXT:    scvtf v0.4h, v0.4h
@@ -5744,7 +5685,6 @@ define <3 x half> @utofp_v3i8_v3f16(<3 x i8> %a) {
 ; CHECK-GI-FP16-NEXT:    fmov s1, w2
 ; CHECK-GI-FP16-NEXT:    mov v0.h[2], v1.h[0]
 ; CHECK-GI-FP16-NEXT:    movi d1, #0xff00ff00ff00ff
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-FP16-NEXT:    and v0.8b, v0.8b, v1.8b
 ; CHECK-GI-FP16-NEXT:    ucvtf v0.4h, v0.4h
 ; CHECK-GI-FP16-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/llvm.exp10.ll b/llvm/test/CodeGen/AArch64/llvm.exp10.ll
index 70df88ba9f898..56f4272c4363c 100644
--- a/llvm/test/CodeGen/AArch64/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AArch64/llvm.exp10.ll
@@ -109,14 +109,11 @@ define <2 x half> @exp10_v2f16(<2 x half> %x) {
 ; GISEL-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; GISEL-NEXT:    fmov s0, s1
 ; GISEL-NEXT:    bl exp10f
-; GISEL-NEXT:    fcvt h0, s0
-; GISEL-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; GISEL-NEXT:    fcvt h1, s0
+; GISEL-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; GISEL-NEXT:    ldr x30, [sp, #24] // 8-byte Folded Reload
 ; GISEL-NEXT:    ldr d8, [sp, #16] // 8-byte Folded Reload
-; GISEL-NEXT:    mov v1.h[1], v0.h[0]
-; GISEL-NEXT:    mov v1.h[2], v0.h[0]
-; GISEL-NEXT:    mov v1.h[3], v0.h[0]
-; GISEL-NEXT:    mov v0.16b, v1.16b
+; GISEL-NEXT:    mov v0.h[1], v1.h[0]
 ; GISEL-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; GISEL-NEXT:    add sp, sp, #32
 ; GISEL-NEXT:    ret
@@ -196,7 +193,6 @@ define <3 x half> @exp10_v3f16(<3 x half> %x) {
 ; GISEL-NEXT:    ldr x30, [sp, #48] // 8-byte Folded Reload
 ; GISEL-NEXT:    mov v1.h[1], v2.h[0]
 ; GISEL-NEXT:    mov v1.h[2], v0.h[0]
-; GISEL-NEXT:    mov v1.h[3], v0.h[0]
 ; GISEL-NEXT:    mov v0.16b, v1.16b
 ; GISEL-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; GISEL-NEXT:    add sp, sp, #64
@@ -440,7 +436,6 @@ define <3 x float> @exp10_v3f32(<3 x float> %x) {
 ; GISEL-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
 ; GISEL-NEXT:    mov v1.s[1], v2.s[0]
 ; GISEL-NEXT:    mov v1.s[2], v0.s[0]
-; GISEL-NEXT:    mov v1.s[3], v0.s[0]
 ; GISEL-NEXT:    mov v0.16b, v1.16b
 ; GISEL-NEXT:    add sp, sp, #64
 ; GISEL-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/load.ll b/llvm/test/CodeGen/AArch64/load.ll
index 7f4540d915ab3..39143e5c53ffd 100644
--- a/llvm/test/CodeGen/AArch64/load.ll
+++ b/llvm/test/CodeGen/AArch64/load.ll
@@ -245,7 +245,6 @@ define <7 x i8> @load_v7i8(ptr %ptr){
 ; CHECK-GI-NEXT:    mov v0.b[5], v1.b[0]
 ; CHECK-GI-NEXT:    ldr b1, [x0, #6]
 ; CHECK-GI-NEXT:    mov v0.b[6], v1.b[0]
-; CHECK-GI-NEXT:    mov v0.b[7], v0.b[0]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
     %a = load <7 x i8>, ptr %ptr
@@ -265,7 +264,6 @@ define <3 x i16> @load_v3i16(ptr %ptr){
 ; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-NEXT:    ldr h1, [x0, #4]
 ; CHECK-GI-NEXT:    mov v0.h[2], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
     %a = load <3 x i16>, ptr %ptr
@@ -293,7 +291,6 @@ define <7 x i16> @load_v7i16(ptr %ptr){
 ; CHECK-GI-NEXT:    mov v0.h[5], v1.h[0]
 ; CHECK-GI-NEXT:    ldr h1, [x0, #12]
 ; CHECK-GI-NEXT:    mov v0.h[6], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[7], v0.h[0]
 ; CHECK-GI-NEXT:    ret
     %a = load <7 x i16>, ptr %ptr
     ret <7 x i16> %a
@@ -311,7 +308,6 @@ define <3 x i32> @load_v3i32(ptr %ptr){
 ; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
 ; CHECK-GI-NEXT:    ldr s1, [x0, #8]
 ; CHECK-GI-NEXT:    mov v0.s[2], v1.s[0]
-; CHECK-GI-NEXT:    mov v0.s[3], v0.s[0]
 ; CHECK-GI-NEXT:    ret
     %a = load <3 x i32>, ptr %ptr
     ret <3 x i32> %a
diff --git a/llvm/test/CodeGen/AArch64/sext.ll b/llvm/test/CodeGen/AArch64/sext.ll
index f319721e0f2f0..61f04fbf0484f 100644
--- a/llvm/test/CodeGen/AArch64/sext.ll
+++ b/llvm/test/CodeGen/AArch64/sext.ll
@@ -222,7 +222,6 @@ define <3 x i16> @sext_v3i8_v3i16(<3 x i8> %a) {
 ; CHECK-GI-NEXT:    fmov s0, w0
 ; CHECK-GI-NEXT:    mov v0.s[1], w1
 ; CHECK-GI-NEXT:    mov v0.s[2], w2
-; CHECK-GI-NEXT:    mov v0.s[3], w8
 ; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-GI-NEXT:    shl v0.4h, v0.4h, #8
 ; CHECK-GI-NEXT:    sshr v0.4h, v0.4h, #8
@@ -252,8 +251,6 @@ define <3 x i32> @sext_v3i8_v3i32(<3 x i8> %a) {
 ; CHECK-GI-NEXT:    mov v0.s[1], w8
 ; CHECK-GI-NEXT:    mov v1.s[2], w2
 ; CHECK-GI-NEXT:    mov v0.s[2], w8
-; CHECK-GI-NEXT:    mov v1.s[3], w8
-; CHECK-GI-NEXT:    mov v0.s[3], w8
 ; CHECK-GI-NEXT:    neg v2.4s, v0.4s
 ; CHECK-GI-NEXT:    ushl v0.4s, v1.4s, v0.4s
 ; CHECK-GI-NEXT:    sshl v0.4s, v0.4s, v2.4s
@@ -315,7 +312,6 @@ define <3 x i32> @sext_v3i16_v3i32(<3 x i16> %a) {
 ; CHECK-GI-NEXT:    smov w8, v0.h[2]
 ; CHECK-GI-NEXT:    mov v1.s[1], w9
 ; CHECK-GI-NEXT:    mov v1.s[2], w8
-; CHECK-GI-NEXT:    mov v1.s[3], w8
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -390,7 +386,6 @@ define <3 x i16> @sext_v3i10_v3i16(<3 x i10> %a) {
 ; CHECK-GI-NEXT:    fmov s0, w0
 ; CHECK-GI-NEXT:    mov v0.s[1], w1
 ; CHECK-GI-NEXT:    mov v0.s[2], w2
-; CHECK-GI-NEXT:    mov v0.s[3], w8
 ; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-GI-NEXT:    shl v0.4h, v0.4h, #6
 ; CHECK-GI-NEXT:    sshr v0.4h, v0.4h, #6
@@ -420,8 +415,6 @@ define <3 x i32> @sext_v3i10_v3i32(<3 x i10> %a) {
 ; CHECK-GI-NEXT:    mov v0.s[1], w8
 ; CHECK-GI-NEXT:    mov v1.s[2], w2
 ; CHECK-GI-NEXT:    mov v0.s[2], w8
-; CHECK-GI-NEXT:    mov v1.s[3], w8
-; CHECK-GI-NEXT:    mov v0.s[3], w8
 ; CHECK-GI-NEXT:    neg v2.4s, v0.4s
 ; CHECK-GI-NEXT:    ushl v0.4s, v1.4s, v0.4s
 ; CHECK-GI-NEXT:    sshl v0.4s, v0.4s, v2.4s
diff --git a/llvm/test/CodeGen/AArch64/shift.ll b/llvm/test/CodeGen/AArch64/shift.ll
index ccc06f2e1058d..5287839ee7b70 100644
--- a/llvm/test/CodeGen/AArch64/shift.ll
+++ b/llvm/test/CodeGen/AArch64/shift.ll
@@ -594,7 +594,6 @@ define <1 x i32> @shl_v1i32(<1 x i32> %0, <1 x i32> %1){
 ; CHECK-GI-NEXT:    fmov w9, s1
 ; CHECK-GI-NEXT:    lsl w8, w8, w9
 ; CHECK-GI-NEXT:    fmov s0, w8
-; CHECK-GI-NEXT:    mov v0.s[1], w8
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
     %3 = shl <1 x i32> %0, %1
@@ -697,7 +696,6 @@ define <1 x i32> @ashr_v1i32(<1 x i32> %0, <1 x i32> %1){
 ; CHECK-GI-NEXT:    fmov w9, s1
 ; CHECK-GI-NEXT:    asr w8, w8, w9
 ; CHECK-GI-NEXT:    fmov s0, w8
-; CHECK-GI-NEXT:    mov v0.s[1], w8
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
     %3 = ashr <1 x i32> %0, %1
@@ -790,7 +788,6 @@ define <1 x i32> @lshr_v1i32(<1 x i32> %0, <1 x i32> %1){
 ; CHECK-GI-NEXT:    fmov w9, s1
 ; CHECK-GI-NEXT:    lsr w8, w8, w9
 ; CHECK-GI-NEXT:    fmov s0, w8
-; CHECK-GI-NEXT:    mov v0.s[1], w8
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
     %3 = lshr <1 x i32> %0, %1
@@ -851,16 +848,6 @@ define <3 x i8> @shl_v3i8(<3 x i8> %0, <3 x i8> %1){
 ; CHECK-GI-NEXT:    fmov s3, w5
 ; CHECK-GI-NEXT:    mov v0.b[2], v1.b[0]
 ; CHECK-GI-NEXT:    mov v2.b[2], v3.b[0]
-; CHECK-GI-NEXT:    mov v0.b[3], v0.b[0]
-; CHECK-GI-NEXT:    mov v2.b[3], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[4], v0.b[0]
-; CHECK-GI-NEXT:    mov v2.b[4], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[5], v0.b[0]
-; CHECK-GI-NEXT:    mov v2.b[5], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[6], v0.b[0]
-; CHECK-GI-NEXT:    mov v2.b[6], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[7], v0.b[0]
-; CHECK-GI-NEXT:    mov v2.b[7], v0.b[0]
 ; CHECK-GI-NEXT:    ushl v0.8b, v0.8b, v2.8b
 ; CHECK-GI-NEXT:    umov w0, v0.b[0]
 ; CHECK-GI-NEXT:    umov w1, v0.b[1]
@@ -937,16 +924,6 @@ define <3 x i8> @ashr_v3i8(<3 x i8> %0, <3 x i8> %1){
 ; CHECK-GI-NEXT:    mov v0.b[2], v2.b[0]
 ; CHECK-GI-NEXT:    fmov s2, w2
 ; CHECK-GI-NEXT:    mov v1.b[2], v2.b[0]
-; CHECK-GI-NEXT:    mov v0.b[3], v0.b[0]
-; CHECK-GI-NEXT:    mov v1.b[3], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[4], v0.b[0]
-; CHECK-GI-NEXT:    mov v1.b[4], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[5], v0.b[0]
-; CHECK-GI-NEXT:    mov v1.b[5], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[6], v0.b[0]
-; CHECK-GI-NEXT:    mov v1.b[6], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[7], v0.b[0]
-; CHECK-GI-NEXT:    mov v1.b[7], v0.b[0]
 ; CHECK-GI-NEXT:    neg v0.8b, v0.8b
 ; CHECK-GI-NEXT:    sshl v0.8b, v1.8b, v0.8b
 ; CHECK-GI-NEXT:    umov w0, v0.b[0]
@@ -1027,16 +1004,6 @@ define <3 x i8> @lshr_v3i8(<3 x i8> %0, <3 x i8> %1){
 ; CHECK-GI-NEXT:    mov v0.b[2], v2.b[0]
 ; CHECK-GI-NEXT:    fmov s2, w2
 ; CHECK-GI-NEXT:    mov v1.b[2], v2.b[0]
-; CHECK-GI-NEXT:    mov v0.b[3], v0.b[0]
-; CHECK-GI-NEXT:    mov v1.b[3], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[4], v0.b[0]
-; CHECK-GI-NEXT:    mov v1.b[4], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[5], v0.b[0]
-; CHECK-GI-NEXT:    mov v1.b[5], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[6], v0.b[0]
-; CHECK-GI-NEXT:    mov v1.b[6], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[7], v0.b[0]
-; CHECK-GI-NEXT:    mov v1.b[7], v0.b[0]
 ; CHECK-GI-NEXT:    neg v0.8b, v0.8b
 ; CHECK-GI-NEXT:    ushl v0.8b, v1.8b, v0.8b
 ; CHECK-GI-NEXT:    umov w0, v0.b[0]
diff --git a/llvm/test/CodeGen/AArch64/shufflevector.ll b/llvm/test/CodeGen/AArch64/shufflevector.ll
index b408bc1c38976..d79f3ae11167f 100644
--- a/llvm/test/CodeGen/AArch64/shufflevector.ll
+++ b/llvm/test/CodeGen/AArch64/shufflevector.ll
@@ -210,8 +210,8 @@ define i32 @shufflevector_v4i8(<4 x i8> %a, <4 x i8> %b){
 ; CHECK-GI-LABEL: shufflevector_v4i8:
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    mov h2, v0.h[1]
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    mov h2, v0.h[1]
 ; CHECK-GI-NEXT:    mov h3, v1.h[1]
 ; CHECK-GI-NEXT:    adrp x8, .LCPI15_0
 ; CHECK-GI-NEXT:    mov h4, v0.h[2]
@@ -224,14 +224,6 @@ define i32 @shufflevector_v4i8(<4 x i8> %a, <4 x i8> %b){
 ; CHECK-GI-NEXT:    mov v1.b[2], v2.b[0]
 ; CHECK-GI-NEXT:    mov v0.b[3], v5.b[0]
 ; CHECK-GI-NEXT:    mov v1.b[3], v6.b[0]
-; CHECK-GI-NEXT:    mov v0.b[4], v0.b[0]
-; CHECK-GI-NEXT:    mov v1.b[4], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[5], v0.b[0]
-; CHECK-GI-NEXT:    mov v1.b[5], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[6], v0.b[0]
-; CHECK-GI-NEXT:    mov v1.b[6], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[7], v0.b[0]
-; CHECK-GI-NEXT:    mov v1.b[7], v0.b[0]
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI15_0]
 ; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b }, v1.16b
@@ -287,16 +279,12 @@ define i32 @shufflevector_v2i16(<2 x i16> %a, <2 x i16> %b){
 ; CHECK-GI-LABEL: shufflevector_v2i16:
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    mov s2, v0.s[1]
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    mov s2, v0.s[1]
 ; CHECK-GI-NEXT:    mov s3, v1.s[1]
 ; CHECK-GI-NEXT:    adrp x8, .LCPI17_0
 ; CHECK-GI-NEXT:    mov v0.h[1], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[1], v3.h[0]
-; CHECK-GI-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-NEXT:    mov v1.h[2], v0.h[0]
-; CHECK-GI-NEXT:    mov v0.h[3], v0.h[0]
-; CHECK-GI-NEXT:    mov v1.h[3], v0.h[0]
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI17_0]
 ; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b }, v1.16b
@@ -516,16 +504,6 @@ define <3 x i8> @shufflevector_v3i8(<3 x i8> %a, <3 x i8> %b) {
 ; CHECK-GI-NEXT:    mov v0.b[2], v1.b[0]
 ; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI30_0]
 ; CHECK-GI-NEXT:    mov v2.b[2], v3.b[0]
-; CHECK-GI-NEXT:    mov v0.b[3], v0.b[0]
-; CHECK-GI-NEXT:    mov v2.b[3], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[4], v0.b[0]
-; CHECK-GI-NEXT:    mov v2.b[4], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[5], v0.b[0]
-; CHECK-GI-NEXT:    mov v2.b[5], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[6], v0.b[0]
-; CHECK-GI-NEXT:    mov v2.b[6], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[7], v0.b[0]
-; CHECK-GI-NEXT:    mov v2.b[7], v0.b[0]
 ; CHECK-GI-NEXT:    mov v0.d[1], v2.d[0]
 ; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b }, v1.16b
 ; CHECK-GI-NEXT:    mov b1, v0.b[1]
diff --git a/llvm/test/CodeGen/AArch64/xtn.ll b/llvm/test/CodeGen/AArch64/xtn.ll
index 21982fadbe803..3c86f4bf9eb21 100644
--- a/llvm/test/CodeGen/AArch64/xtn.ll
+++ b/llvm/test/CodeGen/AArch64/xtn.ll
@@ -298,7 +298,6 @@ define <3 x i16> @xtn_v3i32_v3i16(<3 x i32> %a) {
 ; CHECK-GI-NEXT:    mov s2, v0.s[2]
 ; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-NEXT:    mov v0.h[2], v2.h[0]
-; CHECK-GI-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -327,7 +326,6 @@ define <3 x i16> @xtn_v3i64_v3i16(<3 x i64> %a) {
 ; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-NEXT:    fmov s1, w8
 ; CHECK-GI-NEXT:    mov v0.h[2], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -353,7 +351,6 @@ define <3 x i32> @xtn_v3i64_v3i32(<3 x i64> %a) {
 ; CHECK-GI-NEXT:    fmov x8, d2
 ; CHECK-GI-NEXT:    mov v0.s[1], w9
 ; CHECK-GI-NEXT:    mov v0.s[2], w8
-; CHECK-GI-NEXT:    mov v0.s[3], w8
 ; CHECK-GI-NEXT:    ret
 entry:
   %arg1 = trunc <3 x i64> %a to <3 x i32>
diff --git a/llvm/test/CodeGen/AArch64/zext.ll b/llvm/test/CodeGen/AArch64/zext.ll
index e513340f5b18a..54b29be2132cd 100644
--- a/llvm/test/CodeGen/AArch64/zext.ll
+++ b/llvm/test/CodeGen/AArch64/zext.ll
@@ -249,10 +249,8 @@ define <3 x i16> @zext_v3i8_v3i16(<3 x i8> %a) {
 ; CHECK-GI-NEXT:    mov v2.16b, v1.16b
 ; CHECK-GI-NEXT:    mov v0.s[2], w2
 ; CHECK-GI-NEXT:    mov v2.h[1], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.s[3], w8
-; CHECK-GI-NEXT:    mov v2.h[2], v1.h[0]
 ; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
-; CHECK-GI-NEXT:    mov v2.h[3], v0.h[0]
+; CHECK-GI-NEXT:    mov v2.h[2], v1.h[0]
 ; CHECK-GI-NEXT:    and v0.8b, v0.8b, v2.8b
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -280,8 +278,6 @@ define <3 x i32> @zext_v3i8_v3i32(<3 x i8> %a) {
 ; CHECK-GI-NEXT:    mov v1.s[1], w8
 ; CHECK-GI-NEXT:    mov v0.s[2], w2
 ; CHECK-GI-NEXT:    mov v1.s[2], w8
-; CHECK-GI-NEXT:    mov v0.s[3], w8
-; CHECK-GI-NEXT:    mov v1.s[3], w8
 ; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -341,7 +337,6 @@ define <3 x i32> @zext_v3i16_v3i32(<3 x i16> %a) {
 ; CHECK-GI-NEXT:    umov w8, v0.h[2]
 ; CHECK-GI-NEXT:    mov v1.s[1], w9
 ; CHECK-GI-NEXT:    mov v1.s[2], w8
-; CHECK-GI-NEXT:    mov v1.s[3], w8
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -420,10 +415,8 @@ define <3 x i16> @zext_v3i10_v3i16(<3 x i10> %a) {
 ; CHECK-GI-NEXT:    mov v2.16b, v1.16b
 ; CHECK-GI-NEXT:    mov v0.s[2], w2
 ; CHECK-GI-NEXT:    mov v2.h[1], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.s[3], w8
-; CHECK-GI-NEXT:    mov v2.h[2], v1.h[0]
 ; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
-; CHECK-GI-NEXT:    mov v2.h[3], v0.h[0]
+; CHECK-GI-NEXT:    mov v2.h[2], v1.h[0]
 ; CHECK-GI-NEXT:    and v0.8b, v0.8b, v2.8b
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -451,8 +444,6 @@ define <3 x i32> @zext_v3i10_v3i32(<3 x i10> %a) {
 ; CHECK-GI-NEXT:    mov v1.s[1], w8
 ; CHECK-GI-NEXT:    mov v0.s[2], w2
 ; CHECK-GI-NEXT:    mov v1.s[2], w8
-; CHECK-GI-NEXT:    mov v0.s[3], w8
-; CHECK-GI-NEXT:    mov v1.s[3], w8
 ; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-GI-NEXT:    ret
 entry:

From 1dd104db59d145d516a5e9cbb081ed01262961ef Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= <tbaeder@redhat.com>
Date: Tue, 12 Mar 2024 07:47:48 +0100
Subject: [PATCH 80/95] [clang][Interp] Implement _Complex Not unary operators

This only happens in C as far as I can tell. The complex variable
will have undergone a conversion to bool in C++ before reaching
the unary operator.
---
 clang/lib/AST/Interp/ByteCodeExprGen.cpp | 11 +++++++++++
 clang/test/AST/Interp/complex.c          |  5 +++++
 2 files changed, 16 insertions(+)

diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
index 0dd645990d1d5..da4a8f88f1396 100644
--- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp
+++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
@@ -3182,6 +3182,17 @@ bool ByteCodeExprGen<Emitter>::VisitComplexUnaryOperator(
   case UO_AddrOf:
     return this->delegate(SubExpr);
 
+  case UO_LNot:
+    if (!this->visit(SubExpr))
+      return false;
+    if (!this->emitComplexBoolCast(SubExpr))
+      return false;
+    if (!this->emitInvBool(E))
+      return false;
+    if (PrimType ET = classifyPrim(E->getType()); ET != PT_Bool)
+      return this->emitCast(PT_Bool, ET, E);
+    return true;
+
   case UO_Real:
     return this->emitComplexReal(SubExpr);
 
diff --git a/clang/test/AST/Interp/complex.c b/clang/test/AST/Interp/complex.c
index c9c2efb597453..b5f30b87baa79 100644
--- a/clang/test/AST/Interp/complex.c
+++ b/clang/test/AST/Interp/complex.c
@@ -14,3 +14,8 @@ void blah() {
 _Static_assert((0.0 + 0.0j) == (0.0 + 0.0j), "");
 _Static_assert((0.0 + 0.0j) != (0.0 + 0.0j), ""); // both-error {{static assertion}} \
                                                   // both-note {{evaluates to}}
+
+const _Complex float FC = {0.0f, 0.0f};
+_Static_assert(!FC, "");
+const _Complex float FI = {0, 0};
+_Static_assert(!FI, "");

From 103469b5f7467d5df15799c2d8ad150729bc33bd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= <tbaeder@redhat.com>
Date: Tue, 12 Mar 2024 08:50:51 +0100
Subject: [PATCH 81/95] [clang][Interp] Implement more easy _Complex unary
 operators

---
 clang/lib/AST/Interp/ByteCodeExprGen.cpp | 25 ++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
index da4a8f88f1396..86304a54473ce 100644
--- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp
+++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
@@ -3138,16 +3138,17 @@ bool ByteCodeExprGen<Emitter>::VisitComplexUnaryOperator(
     return this->discard(SubExpr);
 
   std::optional<PrimType> ResT = classify(E);
+  auto prepareResult = [=]() -> bool {
+    if (!ResT && !Initializing) {
+      std::optional<unsigned> LocalIndex =
+          allocateLocal(SubExpr, /*IsExtended=*/false);
+      if (!LocalIndex)
+        return false;
+      return this->emitGetPtrLocal(*LocalIndex, E);
+    }
 
-  // Prepare storage for result.
-  if (!ResT && !Initializing) {
-    std::optional<unsigned> LocalIndex =
-        allocateLocal(SubExpr, /*IsExtended=*/false);
-    if (!LocalIndex)
-      return false;
-    if (!this->emitGetPtrLocal(*LocalIndex, E))
-      return false;
-  }
+    return true;
+  };
 
   // The offset of the temporary, if we created one.
   unsigned SubExprOffset = ~0u;
@@ -3167,6 +3168,8 @@ bool ByteCodeExprGen<Emitter>::VisitComplexUnaryOperator(
 
   switch (E->getOpcode()) {
   case UO_Minus:
+    if (!prepareResult())
+      return false;
     if (!createTemp())
       return false;
     for (unsigned I = 0; I != 2; ++I) {
@@ -3179,7 +3182,9 @@ bool ByteCodeExprGen<Emitter>::VisitComplexUnaryOperator(
     }
     break;
 
-  case UO_AddrOf:
+  case UO_Plus:   // +x
+  case UO_AddrOf: // &x
+  case UO_Deref:  // *x
     return this->delegate(SubExpr);
 
   case UO_LNot:

From 85f6669de59b2bb75c6848afa79de63be988721c Mon Sep 17 00:00:00 2001
From: jeanPerier <jperier@nvidia.com>
Date: Tue, 12 Mar 2024 09:04:25 +0100
Subject: [PATCH 82/95] [flang] implement sizeof lowering for polymorphic
 entities (#84498)

For non-polymorphic entities, semantics knows the type size and rewrites
sizeof to `"cst element size" * size(x)`.

Lowering has to deal with the polymorphic case where the type size must
be retrieved from the descriptor (note that the lowering implementation
would work with any entity, polymorphic or not; it is just not used for
the non-polymorphic cases).
---
 .../flang/Optimizer/Builder/IntrinsicCall.h   |  1 +
 flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 18 +++++++++++++++
 flang/test/Lower/Intrinsics/sizeof.f90        | 23 +++++++++++++++++++
 3 files changed, 42 insertions(+)
 create mode 100644 flang/test/Lower/Intrinsics/sizeof.f90

diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
index 7cb99d61a686e..ca15b4bc34b29 100644
--- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
+++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
@@ -338,6 +338,7 @@ struct IntrinsicLibrary {
   mlir::Value genSign(mlir::Type, llvm::ArrayRef<mlir::Value>);
   mlir::Value genSind(mlir::Type, llvm::ArrayRef<mlir::Value>);
   fir::ExtendedValue genSize(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
+  fir::ExtendedValue genSizeOf(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
   mlir::Value genSpacing(mlir::Type resultType,
                          llvm::ArrayRef<mlir::Value> args);
   fir::ExtendedValue genSpread(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index 2f7ace658e475..ca5ab6fcea342 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -567,6 +567,10 @@ static constexpr IntrinsicHandler handlers[]{
        {"dim", asAddr, handleDynamicOptional},
        {"kind", asValue}}},
      /*isElemental=*/false},
+    {"sizeof",
+     &I::genSizeOf,
+     {{{"a", asBox}}},
+     /*isElemental=*/false},
     {"sleep", &I::genSleep, {{{"seconds", asValue}}}, /*isElemental=*/false},
     {"spacing", &I::genSpacing},
     {"spread",
@@ -5946,6 +5950,20 @@ IntrinsicLibrary::genSize(mlir::Type resultType,
       .getResults()[0];
 }
 
+// SIZEOF
+fir::ExtendedValue
+IntrinsicLibrary::genSizeOf(mlir::Type resultType,
+                            llvm::ArrayRef<fir::ExtendedValue> args) {
+  assert(args.size() == 1);
+  mlir::Value box = fir::getBase(args[0]);
+  mlir::Value eleSize = builder.create<fir::BoxEleSizeOp>(loc, resultType, box);
+  if (!fir::isArray(args[0]))
+    return eleSize;
+  mlir::Value arraySize = builder.createConvert(
+      loc, resultType, fir::runtime::genSize(builder, loc, box));
+  return builder.create<mlir::arith::MulIOp>(loc, eleSize, arraySize);
+}
+
 // TAND
 mlir::Value IntrinsicLibrary::genTand(mlir::Type resultType,
                                       llvm::ArrayRef<mlir::Value> args) {
diff --git a/flang/test/Lower/Intrinsics/sizeof.f90 b/flang/test/Lower/Intrinsics/sizeof.f90
new file mode 100644
index 0000000000000..959ca1692b514
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/sizeof.f90
@@ -0,0 +1,23 @@
+! Test SIZEOF lowering for polymorphic entities.
+! RUN: bbc -emit-hlfir --polymorphic-type -o - %s | FileCheck %s
+
+integer(8) function test1(x)
+  class(*) :: x
+  test1 = sizeof(x)
+end function
+! CHECK-LABEL:   func.func @_QPtest1(
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFtest1Ex"} : (!fir.class<none>) -> (!fir.class<none>, !fir.class<none>)
+! CHECK:           %[[VAL_4:.*]] = fir.box_elesize %[[VAL_3]]#1 : (!fir.class<none>) -> i64
+! CHECK:           hlfir.assign %[[VAL_4]] to %{{.*}} : i64, !fir.ref<i64>
+
+integer(8) function test2(x)
+  class(*) :: x(:, :)
+  test2 = sizeof(x)
+end function
+! CHECK-LABEL:   func.func @_QPtest2(
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFtest2Ex"} : (!fir.class<!fir.array<?x?xnone>>) -> (!fir.class<!fir.array<?x?xnone>>, !fir.class<!fir.array<?x?xnone>>)
+! CHECK:           %[[VAL_4:.*]] = fir.box_elesize %[[VAL_3]]#1 : (!fir.class<!fir.array<?x?xnone>>) -> i64
+! CHECK:           %[[VAL_7:.*]] = fir.convert %[[VAL_3]]#1 : (!fir.class<!fir.array<?x?xnone>>) -> !fir.box<none>
+! CHECK:           %[[VAL_9:.*]] = fir.call @_FortranASize(%[[VAL_7]], %{{.*}}, %{{.*}}) fastmath<contract> : (!fir.box<none>, !fir.ref<i8>, i32) -> i64
+! CHECK:           %[[VAL_10:.*]] = arith.muli %[[VAL_4]], %[[VAL_9]] : i64
+! CHECK:           hlfir.assign %[[VAL_10]] to %{{.*}} : i64, !fir.ref<i64>

From 8e0f4b943fee13afc970ca8277a8e76b9da63b96 Mon Sep 17 00:00:00 2001
From: Adrian Kuegel <akuegel@google.com>
Date: Tue, 12 Mar 2024 09:12:44 +0100
Subject: [PATCH 83/95] [NVPTX] Add support for atomic add for f16 type
 (#84295)

atom.add.noftz.f16 is supported since SM 7.0 and PTX 6.3
---
 llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp |   3 +
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td    |  15 +++
 llvm/test/CodeGen/NVPTX/atomics-sm70.ll     | 121 ++++++++++++++++++++
 llvm/test/CodeGen/NVPTX/atomics.ll          |   7 ++
 4 files changed, 146 insertions(+)
 create mode 100644 llvm/test/CodeGen/NVPTX/atomics-sm70.ll

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index c979c03dc1b83..c411c8ef9528d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -6100,6 +6100,9 @@ NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
 
   if (AI->isFloatingPointOperation()) {
     if (AI->getOperation() == AtomicRMWInst::BinOp::FAdd) {
+      if (Ty->isHalfTy() && STI.getSmVersion() >= 70 &&
+          STI.getPTXVersion() >= 63)
+        return AtomicExpansionKind::None;
       if (Ty->isFloatTy())
         return AtomicExpansionKind::None;
       if (Ty->isDoubleTy() && STI.hasAtomAddF64())
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 477789a164ead..869b13369e87e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1630,6 +1630,13 @@ defm INT_PTX_ATOM_ADD_GEN_64 : F_ATOMIC_2<i64, Int64Regs, "", ".u64", ".add",
 defm INT_PTX_ATOM_ADD_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global", ".u64",
   ".add", atomic_load_add_64_gen, i64imm, imm>;
 
+defm INT_PTX_ATOM_ADD_G_F16 : F_ATOMIC_2<f16, Int16Regs, ".global", ".f16", ".add.noftz",
+  atomic_load_add_g, f16imm, fpimm, [hasSM<70>, hasPTX<63>]>;
+defm INT_PTX_ATOM_ADD_S_F16 : F_ATOMIC_2<f16, Int16Regs, ".shared", ".f16", ".add.noftz",
+  atomic_load_add_s, f16imm, fpimm, [hasSM<70>, hasPTX<63>]>;
+defm INT_PTX_ATOM_ADD_GEN_F16 : F_ATOMIC_2<f16, Int16Regs, "", ".f16", ".add.noftz",
+  atomic_load_add_gen, f16imm, fpimm, [hasSM<70>, hasPTX<63>]>;
+
 defm INT_PTX_ATOM_ADD_G_F32 : F_ATOMIC_2<f32, Float32Regs, ".global", ".f32", ".add",
   atomic_load_add_g, f32imm, fpimm>;
 defm INT_PTX_ATOM_ADD_S_F32 : F_ATOMIC_2<f32, Float32Regs, ".shared", ".f32", ".add",
@@ -2007,6 +2014,9 @@ multiclass ATOM2P_impl<string AsmStr,  Intrinsic Intr,
                        SDNode Imm, ValueType ImmTy,
                        list<Predicate> Preds> {
   let AddedComplexity = 1 in {
+    def : ATOM23_impl<AsmStr, regT, regclass, Preds,
+                      (ins Int16Regs:$src, regclass:$b),
+                      (Intr (i16 Int16Regs:$src), (regT regclass:$b))>;
     def : ATOM23_impl<AsmStr, regT, regclass, Preds,
                       (ins Int32Regs:$src, regclass:$b),
                       (Intr (i32 Int32Regs:$src), (regT regclass:$b))>;
@@ -2017,6 +2027,9 @@ multiclass ATOM2P_impl<string AsmStr,  Intrinsic Intr,
   // tablegen can't infer argument types from Intrinsic (though it can
   // from Instruction) so we have to enforce specific type on
   // immediates via explicit cast to ImmTy.
+  def : ATOM23_impl<AsmStr, regT, regclass, Preds,
+                    (ins Int16Regs:$src, ImmType:$b),
+                    (Intr (i16 Int16Regs:$src), (ImmTy Imm:$b))>;
   def : ATOM23_impl<AsmStr, regT, regclass, Preds,
                     (ins Int32Regs:$src, ImmType:$b),
                     (Intr (i32 Int32Regs:$src), (ImmTy Imm:$b))>;
@@ -2136,6 +2149,8 @@ multiclass ATOM2_add_impl<string OpStr> {
    defm _s32  : ATOM2S_impl<OpStr, "i", "s32", i32, Int32Regs, i32imm, imm, i32, []>;
    defm _u32  : ATOM2S_impl<OpStr, "i", "u32", i32, Int32Regs, i32imm, imm, i32, []>;
    defm _u64  : ATOM2S_impl<OpStr, "i", "u64", i64, Int64Regs, i64imm, imm, i64, []>;
+   defm _f16  : ATOM2S_impl<OpStr, "f", "f16", f16, Int16Regs, f16imm, fpimm, f16,
+                            [hasSM<70>, hasPTX<63>]>;
    defm _f32  : ATOM2S_impl<OpStr, "f", "f32", f32, Float32Regs, f32imm, fpimm, f32,
                             []>;
    defm _f64  : ATOM2S_impl<OpStr, "f", "f64", f64, Float64Regs, f64imm, fpimm, f64,
diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm70.ll b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll
new file mode 100644
index 0000000000000..f68bb2049d006
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll
@@ -0,0 +1,121 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx63 | FileCheck %s --check-prefixes=CHECK
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | FileCheck %s --check-prefixes=CHECK64
+; RUN: llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx62 | FileCheck %s --check-prefixes=CHECKPTX62
+; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %}
+; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx62 | %ptxas-verify -arch=sm_70 %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half %val) {
+; CHECK-LABEL: test(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [test_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs1, [test_param_3];
+; CHECK-NEXT:    atom.add.noftz.f16 %rs2, [%r1], %rs1;
+; CHECK-NEXT:    ld.param.u32 %r2, [test_param_1];
+; CHECK-NEXT:    atom.global.add.noftz.f16 %rs3, [%r2], %rs1;
+; CHECK-NEXT:    ld.param.u32 %r3, [test_param_2];
+; CHECK-NEXT:    atom.shared.add.noftz.f16 %rs4, [%r3], %rs1;
+; CHECK-NEXT:    ret;
+;
+; CHECK64-LABEL: test(
+; CHECK64:       {
+; CHECK64-NEXT:    .reg .b16 %rs<5>;
+; CHECK64-NEXT:    .reg .b64 %rd<4>;
+; CHECK64-EMPTY:
+; CHECK64-NEXT:  // %bb.0:
+; CHECK64-NEXT:    ld.param.u64 %rd1, [test_param_0];
+; CHECK64-NEXT:    ld.param.b16 %rs1, [test_param_3];
+; CHECK64-NEXT:    atom.add.noftz.f16 %rs2, [%rd1], %rs1;
+; CHECK64-NEXT:    ld.param.u64 %rd2, [test_param_1];
+; CHECK64-NEXT:    atom.global.add.noftz.f16 %rs3, [%rd2], %rs1;
+; CHECK64-NEXT:    ld.param.u64 %rd3, [test_param_2];
+; CHECK64-NEXT:    atom.shared.add.noftz.f16 %rs4, [%rd3], %rs1;
+; CHECK64-NEXT:    ret;
+;
+; CHECKPTX62-LABEL: test(
+; CHECKPTX62:       {
+; CHECKPTX62-NEXT:    .reg .pred %p<4>;
+; CHECKPTX62-NEXT:    .reg .b16 %rs<14>;
+; CHECKPTX62-NEXT:    .reg .b32 %r<49>;
+; CHECKPTX62-EMPTY:
+; CHECKPTX62-NEXT:  // %bb.0:
+; CHECKPTX62-NEXT:    ld.param.b16 %rs1, [test_param_3];
+; CHECKPTX62-NEXT:    ld.param.u32 %r20, [test_param_2];
+; CHECKPTX62-NEXT:    ld.param.u32 %r19, [test_param_1];
+; CHECKPTX62-NEXT:    ld.param.u32 %r21, [test_param_0];
+; CHECKPTX62-NEXT:    and.b32 %r1, %r21, -4;
+; CHECKPTX62-NEXT:    and.b32 %r22, %r21, 3;
+; CHECKPTX62-NEXT:    shl.b32 %r2, %r22, 3;
+; CHECKPTX62-NEXT:    mov.b32 %r23, 65535;
+; CHECKPTX62-NEXT:    shl.b32 %r24, %r23, %r2;
+; CHECKPTX62-NEXT:    not.b32 %r3, %r24;
+; CHECKPTX62-NEXT:    ld.u32 %r46, [%r1];
+; CHECKPTX62-NEXT:  $L__BB0_1: // %atomicrmw.start
+; CHECKPTX62-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECKPTX62-NEXT:    shr.u32 %r25, %r46, %r2;
+; CHECKPTX62-NEXT:    cvt.u16.u32 %rs2, %r25;
+; CHECKPTX62-NEXT:    add.rn.f16 %rs4, %rs2, %rs1;
+; CHECKPTX62-NEXT:    cvt.u32.u16 %r26, %rs4;
+; CHECKPTX62-NEXT:    shl.b32 %r27, %r26, %r2;
+; CHECKPTX62-NEXT:    and.b32 %r28, %r46, %r3;
+; CHECKPTX62-NEXT:    or.b32 %r29, %r28, %r27;
+; CHECKPTX62-NEXT:    atom.cas.b32 %r6, [%r1], %r46, %r29;
+; CHECKPTX62-NEXT:    setp.ne.s32 %p1, %r6, %r46;
+; CHECKPTX62-NEXT:    mov.u32 %r46, %r6;
+; CHECKPTX62-NEXT:    @%p1 bra $L__BB0_1;
+; CHECKPTX62-NEXT:  // %bb.2: // %atomicrmw.end
+; CHECKPTX62-NEXT:    and.b32 %r7, %r19, -4;
+; CHECKPTX62-NEXT:    shl.b32 %r30, %r19, 3;
+; CHECKPTX62-NEXT:    and.b32 %r8, %r30, 24;
+; CHECKPTX62-NEXT:    shl.b32 %r32, %r23, %r8;
+; CHECKPTX62-NEXT:    not.b32 %r9, %r32;
+; CHECKPTX62-NEXT:    ld.global.u32 %r47, [%r7];
+; CHECKPTX62-NEXT:  $L__BB0_3: // %atomicrmw.start9
+; CHECKPTX62-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECKPTX62-NEXT:    shr.u32 %r33, %r47, %r8;
+; CHECKPTX62-NEXT:    cvt.u16.u32 %rs6, %r33;
+; CHECKPTX62-NEXT:    add.rn.f16 %rs8, %rs6, %rs1;
+; CHECKPTX62-NEXT:    cvt.u32.u16 %r34, %rs8;
+; CHECKPTX62-NEXT:    shl.b32 %r35, %r34, %r8;
+; CHECKPTX62-NEXT:    and.b32 %r36, %r47, %r9;
+; CHECKPTX62-NEXT:    or.b32 %r37, %r36, %r35;
+; CHECKPTX62-NEXT:    atom.global.cas.b32 %r12, [%r7], %r47, %r37;
+; CHECKPTX62-NEXT:    setp.ne.s32 %p2, %r12, %r47;
+; CHECKPTX62-NEXT:    mov.u32 %r47, %r12;
+; CHECKPTX62-NEXT:    @%p2 bra $L__BB0_3;
+; CHECKPTX62-NEXT:  // %bb.4: // %atomicrmw.end8
+; CHECKPTX62-NEXT:    and.b32 %r13, %r20, -4;
+; CHECKPTX62-NEXT:    shl.b32 %r38, %r20, 3;
+; CHECKPTX62-NEXT:    and.b32 %r14, %r38, 24;
+; CHECKPTX62-NEXT:    shl.b32 %r40, %r23, %r14;
+; CHECKPTX62-NEXT:    not.b32 %r15, %r40;
+; CHECKPTX62-NEXT:    ld.shared.u32 %r48, [%r13];
+; CHECKPTX62-NEXT:  $L__BB0_5: // %atomicrmw.start27
+; CHECKPTX62-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECKPTX62-NEXT:    shr.u32 %r41, %r48, %r14;
+; CHECKPTX62-NEXT:    cvt.u16.u32 %rs10, %r41;
+; CHECKPTX62-NEXT:    add.rn.f16 %rs12, %rs10, %rs1;
+; CHECKPTX62-NEXT:    cvt.u32.u16 %r42, %rs12;
+; CHECKPTX62-NEXT:    shl.b32 %r43, %r42, %r14;
+; CHECKPTX62-NEXT:    and.b32 %r44, %r48, %r15;
+; CHECKPTX62-NEXT:    or.b32 %r45, %r44, %r43;
+; CHECKPTX62-NEXT:    atom.shared.cas.b32 %r18, [%r13], %r48, %r45;
+; CHECKPTX62-NEXT:    setp.ne.s32 %p3, %r18, %r48;
+; CHECKPTX62-NEXT:    mov.u32 %r48, %r18;
+; CHECKPTX62-NEXT:    @%p3 bra $L__BB0_5;
+; CHECKPTX62-NEXT:  // %bb.6: // %atomicrmw.end26
+; CHECKPTX62-NEXT:    ret;
+  %r1 = atomicrmw fadd ptr %dp0, half %val seq_cst
+  %r2 = atomicrmw fadd ptr addrspace(1) %dp1, half %val seq_cst
+  %ret = atomicrmw fadd ptr addrspace(3) %dp3, half %val seq_cst
+  ret void
+}
+
+attributes #1 = { argmemonly nounwind }
diff --git a/llvm/test/CodeGen/NVPTX/atomics.ll b/llvm/test/CodeGen/NVPTX/atomics.ll
index e99d0fd05e346..6f2b5dcf47f13 100644
--- a/llvm/test/CodeGen/NVPTX/atomics.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics.ll
@@ -175,6 +175,13 @@ define float @atomicrmw_add_f32_generic(ptr %addr, float %val) {
   ret float %ret
 }
 
+; CHECK-LABEL: atomicrmw_add_f16_generic
+define half @atomicrmw_add_f16_generic(ptr %addr, half %val) {
+; CHECK: atom.cas
+  %ret = atomicrmw fadd ptr %addr, half %val seq_cst
+  ret half %ret
+}
+
 ; CHECK-LABEL: atomicrmw_add_f32_addrspace1
 define float @atomicrmw_add_f32_addrspace1(ptr addrspace(1) %addr, float %val) {
 ; CHECK: atom.global.add.f32

From 36dece001325bbf00129c48ddb3c83668b0ac36e Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad@amd.com>
Date: Tue, 12 Mar 2024 08:20:08 +0000
Subject: [PATCH 84/95] [AMDGPU] Add missing GFX10 buffer format d16 hi
 instructions (#84809)

---
 llvm/lib/Target/AMDGPU/BUFInstructions.td        | 5 ++---
 llvm/test/MC/AMDGPU/gfx10_asm_mubuf.s            | 6 ++++++
 llvm/test/MC/Disassembler/AMDGPU/gfx10_mubuf.txt | 6 ++++++
 3 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index a1bbe170ee29a..c7091028b3b5e 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -2691,9 +2691,8 @@ defm BUFFER_LOAD_SBYTE_D16        : MUBUF_Real_AllAddr_gfx10<0x022>;
 defm BUFFER_LOAD_SBYTE_D16_HI     : MUBUF_Real_AllAddr_gfx10<0x023>;
 defm BUFFER_LOAD_SHORT_D16        : MUBUF_Real_AllAddr_gfx10<0x024>;
 defm BUFFER_LOAD_SHORT_D16_HI     : MUBUF_Real_AllAddr_gfx10<0x025>;
-// FIXME-GFX10: Add following instructions:
-//defm BUFFER_LOAD_FORMAT_D16_HI_X  : MUBUF_Real_AllAddr_gfx10<0x026>;
-//defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Real_AllAddr_gfx10<0x027>;
+defm BUFFER_LOAD_FORMAT_D16_HI_X  : MUBUF_Real_AllAddr_gfx10<0x026>;
+defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Real_AllAddr_gfx10<0x027>;
 defm BUFFER_LOAD_FORMAT_D16_X     : MUBUF_Real_AllAddr_gfx10<0x080>;
 defm BUFFER_LOAD_FORMAT_D16_XY    : MUBUF_Real_AllAddr_gfx10<0x081>;
 defm BUFFER_LOAD_FORMAT_D16_XYZ   : MUBUF_Real_AllAddr_gfx10<0x082>;
diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_mubuf.s b/llvm/test/MC/AMDGPU/gfx10_asm_mubuf.s
index aacdfcb4e871e..b77f8e0a31927 100644
--- a/llvm/test/MC/AMDGPU/gfx10_asm_mubuf.s
+++ b/llvm/test/MC/AMDGPU/gfx10_asm_mubuf.s
@@ -17,6 +17,9 @@ buffer_load_format_d16_xyz v[1:2], off, s[4:7], s1
 buffer_load_format_d16_xyzw v[1:2], off, s[4:7], s1
 // GFX10: encoding: [0x00,0x00,0x0c,0xe2,0x00,0x01,0x01,0x01]
 
+buffer_load_format_d16_hi_x v1, off, s[4:7], s1
+// GFX10: encoding: [0x00,0x00,0x98,0xe0,0x00,0x01,0x01,0x01]
+
 buffer_load_format_x v5, off, s[8:11], s3 offset:4095
 // GFX10: encoding: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x02,0x03]
 
@@ -245,6 +248,9 @@ buffer_store_format_d16_xyz v[1:2], off, s[4:7], s1
 buffer_store_format_d16_xyzw v[1:2], off, s[4:7], s1
 // GFX10: encoding: [0x00,0x00,0x1c,0xe2,0x00,0x01,0x01,0x01]
 
+buffer_store_format_d16_hi_x v1, off, s[4:7], s1
+// GFX10: encoding: [0x00,0x00,0x9c,0xe0,0x00,0x01,0x01,0x01]
+
 buffer_store_format_x v1, off, s[12:15], s4 offset:4095
 // GFX10: encoding: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x03,0x04]
 
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_mubuf.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_mubuf.txt
index b0731be4484c7..849c89e37011f 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_mubuf.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_mubuf.txt
@@ -1328,6 +1328,9 @@
 # GFX10: buffer_load_format_d16_xyzw v[1:2], off, s[4:7], s1 ; encoding: [0x00,0x00,0x0c,0xe2,0x00,0x01,0x01,0x01]
 0x00,0x00,0x0c,0xe2,0x00,0x01,0x01,0x01
 
+# GFX10: buffer_load_format_d16_hi_x v1, off, s[4:7], s1 ; encoding: [0x00,0x00,0x98,0xe0,0x00,0x01,0x01,0x01]
+0x00,0x00,0x98,0xe0,0x00,0x01,0x01,0x01
+
 # GFX10: buffer_load_format_x v255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe0,0x00,0xff,0x02,0x03]
 0xff,0x0f,0x00,0xe0,0x00,0xff,0x02,0x03
 
@@ -2039,6 +2042,9 @@
 # GFX10: buffer_store_format_d16_xyzw v[1:2], off, s[4:7], s1 ; encoding: [0x00,0x00,0x1c,0xe2,0x00,0x01,0x01,0x01]
 0x00,0x00,0x1c,0xe2,0x00,0x01,0x01,0x01
 
+# GFX10: buffer_store_format_d16_hi_x v1, off, s[4:7], s1 ; encoding: [0x00,0x00,0x9c,0xe0,0x00,0x01,0x01,0x01]
+0x00,0x00,0x9c,0xe0,0x00,0x01,0x01,0x01
+
 # GFX10: buffer_store_format_x v1, off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x03,0xc1]
 0xff,0x0f,0x10,0xe0,0x00,0x01,0x03,0xc1
 

From 6bbe8a296ee91754d423c59c35727eaa624f7140 Mon Sep 17 00:00:00 2001
From: Aiden Grossman <agrossman154@yahoo.com>
Date: Tue, 12 Mar 2024 01:24:21 -0700
Subject: [PATCH 85/95] [llvm-exegesis] Add thread IDs to subprocess memory
 names (#84451)

This patch adds the thread ID to the names of the shared memory objects
used for subprocess memory. This avoids conflicts for downstream consumers
that might want to run llvm-exegesis across multiple threads, which would
otherwise collide because multiple instances run under the same PID.
---
 .../llvm-exegesis/lib/BenchmarkRunner.cpp     |  9 +++---
 .../llvm-exegesis/lib/SubprocessMemory.cpp    | 28 +++++++++++++------
 .../llvm-exegesis/lib/SubprocessMemory.h      |  5 +++-
 .../X86/SubprocessMemoryTest.cpp              |  5 +++-
 4 files changed, 33 insertions(+), 14 deletions(-)

diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
index 5c9848f3c6888..4e97d188d1725 100644
--- a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
+++ b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
@@ -301,6 +301,7 @@ class SubProcessFunctionExecutorImpl
     if (AddMemDefError)
       return AddMemDefError;
 
+    long ParentTID = SubprocessMemory::getCurrentTID();
     pid_t ParentOrChildPID = fork();
 
     if (ParentOrChildPID == -1) {
@@ -314,7 +315,7 @@ class SubProcessFunctionExecutorImpl
       // Unregister handlers, signal handling is now handled through ptrace in
       // the host process.
       sys::unregisterHandlers();
-      prepareAndRunBenchmark(PipeFiles[0], Key);
+      prepareAndRunBenchmark(PipeFiles[0], Key, ParentTID);
       // The child process terminates in the above function, so we should never
       // get to this point.
       llvm_unreachable("Child process didn't exit when expected.");
@@ -415,8 +416,8 @@ class SubProcessFunctionExecutorImpl
     setrlimit(RLIMIT_CORE, &rlim);
   }
 
-  [[noreturn]] void prepareAndRunBenchmark(int Pipe,
-                                           const BenchmarkKey &Key) const {
+  [[noreturn]] void prepareAndRunBenchmark(int Pipe, const BenchmarkKey &Key,
+                                           long ParentTID) const {
     // Disable core dumps in the child process as otherwise everytime we
     // encounter an execution failure like a segmentation fault, we will create
     // a core dump. We report the information directly rather than require the
@@ -473,7 +474,7 @@ class SubProcessFunctionExecutorImpl
 
     Expected<int> AuxMemFDOrError =
         SubprocessMemory::setupAuxiliaryMemoryInSubprocess(
-            Key.MemoryValues, ParentPID, CounterFileDescriptor);
+            Key.MemoryValues, ParentPID, ParentTID, CounterFileDescriptor);
     if (!AuxMemFDOrError)
       exit(ChildProcessExitCodeE::AuxiliaryMemorySetupFailed);
 
diff --git a/llvm/tools/llvm-exegesis/lib/SubprocessMemory.cpp b/llvm/tools/llvm-exegesis/lib/SubprocessMemory.cpp
index a49fa077257d0..11ad72a914c4e 100644
--- a/llvm/tools/llvm-exegesis/lib/SubprocessMemory.cpp
+++ b/llvm/tools/llvm-exegesis/lib/SubprocessMemory.cpp
@@ -9,11 +9,13 @@
 #include "SubprocessMemory.h"
 #include "Error.h"
 #include "llvm/Support/Error.h"
+#include "llvm/Support/FormatVariadic.h"
 #include <cerrno>
 
 #ifdef __linux__
 #include <fcntl.h>
 #include <sys/mman.h>
+#include <sys/syscall.h>
 #include <unistd.h>
 #endif
 
@@ -22,12 +24,21 @@ namespace exegesis {
 
 #if defined(__linux__) && !defined(__ANDROID__)
 
+long SubprocessMemory::getCurrentTID() {
+  // We're using the raw syscall here rather than the gettid() function provided
+  // by most libcs for compatibility as gettid() was only added to glibc in
+  // version 2.30.
+  return syscall(SYS_gettid);
+}
+
 Error SubprocessMemory::initializeSubprocessMemory(pid_t ProcessID) {
   // Add the PID to the shared memory name so that if we're running multiple
   // processes at the same time, they won't interfere with each other.
   // This comes up particularly often when running the exegesis tests with
-  // llvm-lit
-  std::string AuxiliaryMemoryName = "/auxmem" + std::to_string(ProcessID);
+  // llvm-lit. Additionally add the TID so that downstream consumers
+  // using multiple threads don't run into conflicts.
+  std::string AuxiliaryMemoryName =
+      formatv("/{0}auxmem{1}", getCurrentTID(), ProcessID);
   int AuxiliaryMemoryFD = shm_open(AuxiliaryMemoryName.c_str(),
                                    O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
   if (AuxiliaryMemoryFD == -1)
@@ -47,8 +58,8 @@ Error SubprocessMemory::addMemoryDefinition(
     pid_t ProcessPID) {
   SharedMemoryNames.reserve(MemoryDefinitions.size());
   for (auto &[Name, MemVal] : MemoryDefinitions) {
-    std::string SharedMemoryName = "/" + std::to_string(ProcessPID) + "memdef" +
-                                   std::to_string(MemVal.Index);
+    std::string SharedMemoryName =
+        formatv("/{0}t{1}memdef{2}", ProcessPID, getCurrentTID(), MemVal.Index);
     SharedMemoryNames.push_back(SharedMemoryName);
     int SharedMemoryFD =
         shm_open(SharedMemoryName.c_str(), O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
@@ -82,8 +93,9 @@ Error SubprocessMemory::addMemoryDefinition(
 
 Expected<int> SubprocessMemory::setupAuxiliaryMemoryInSubprocess(
     std::unordered_map<std::string, MemoryValue> MemoryDefinitions,
-    pid_t ParentPID, int CounterFileDescriptor) {
-  std::string AuxiliaryMemoryName = "/auxmem" + std::to_string(ParentPID);
+    pid_t ParentPID, long ParentTID, int CounterFileDescriptor) {
+  std::string AuxiliaryMemoryName =
+      formatv("/{0}auxmem{1}", ParentTID, ParentPID);
   int AuxiliaryMemoryFileDescriptor =
       shm_open(AuxiliaryMemoryName.c_str(), O_RDWR, S_IRUSR | S_IWUSR);
   if (AuxiliaryMemoryFileDescriptor == -1)
@@ -97,8 +109,8 @@ Expected<int> SubprocessMemory::setupAuxiliaryMemoryInSubprocess(
     return make_error<Failure>("Mapping auxiliary memory failed");
   AuxiliaryMemoryMapping[0] = CounterFileDescriptor;
   for (auto &[Name, MemVal] : MemoryDefinitions) {
-    std::string MemoryValueName = "/" + std::to_string(ParentPID) + "memdef" +
-                                  std::to_string(MemVal.Index);
+    std::string MemoryValueName =
+        formatv("/{0}t{1}memdef{2}", ParentPID, ParentTID, MemVal.Index);
     AuxiliaryMemoryMapping[AuxiliaryMemoryOffset + MemVal.Index] =
         shm_open(MemoryValueName.c_str(), O_RDWR, S_IRUSR | S_IWUSR);
     if (AuxiliaryMemoryMapping[AuxiliaryMemoryOffset + MemVal.Index] == -1)
diff --git a/llvm/tools/llvm-exegesis/lib/SubprocessMemory.h b/llvm/tools/llvm-exegesis/lib/SubprocessMemory.h
index e20b50cdc8118..572d1085d9cff 100644
--- a/llvm/tools/llvm-exegesis/lib/SubprocessMemory.h
+++ b/llvm/tools/llvm-exegesis/lib/SubprocessMemory.h
@@ -35,6 +35,9 @@ class SubprocessMemory {
   static constexpr const size_t AuxiliaryMemoryOffset = 1;
   static constexpr const size_t AuxiliaryMemorySize = 4096;
 
+  // Gets the thread ID for the calling thread.
+  static long getCurrentTID();
+
   Error initializeSubprocessMemory(pid_t ProcessID);
 
   // The following function sets up memory definitions. It creates shared
@@ -54,7 +57,7 @@ class SubprocessMemory {
   // section.
   static Expected<int> setupAuxiliaryMemoryInSubprocess(
       std::unordered_map<std::string, MemoryValue> MemoryDefinitions,
-      pid_t ParentPID, int CounterFileDescriptor);
+      pid_t ParentPID, long ParentTID, int CounterFileDescriptor);
 
   ~SubprocessMemory();
 
diff --git a/llvm/unittests/tools/llvm-exegesis/X86/SubprocessMemoryTest.cpp b/llvm/unittests/tools/llvm-exegesis/X86/SubprocessMemoryTest.cpp
index c07ec188a602c..7c23e7b7e9c5a 100644
--- a/llvm/unittests/tools/llvm-exegesis/X86/SubprocessMemoryTest.cpp
+++ b/llvm/unittests/tools/llvm-exegesis/X86/SubprocessMemoryTest.cpp
@@ -17,6 +17,7 @@
 #include <endian.h>
 #include <fcntl.h>
 #include <sys/mman.h>
+#include <sys/syscall.h>
 #include <unistd.h>
 #endif // __linux__
 
@@ -49,7 +50,9 @@ class SubprocessMemoryTest : public X86TestBase {
 
   std::string getSharedMemoryName(const unsigned TestNumber,
                                   const unsigned DefinitionNumber) {
-    return "/" + std::to_string(getSharedMemoryNumber(TestNumber)) + "memdef" +
+    long CurrentTID = syscall(SYS_gettid);
+    return "/" + std::to_string(getSharedMemoryNumber(TestNumber)) + "t" +
+           std::to_string(CurrentTID) + "memdef" +
            std::to_string(DefinitionNumber);
   }
 

From aefad27096bba513f06162fac2763089578f3de4 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Tue, 12 Mar 2024 08:51:45 +0000
Subject: [PATCH 86/95] Revert "[llvm-exegesis] Add thread IDs to subprocess
 memory names (#84451)"

This reverts commit 6bbe8a296ee91754d423c59c35727eaa624f7140.

This breaks building LLVM on macOS, failing with

    llvm/tools/llvm-exegesis/lib/SubprocessMemory.cpp:146:33: error: out-of-line definition of 'setupAuxiliaryMemoryInSubprocess' does not match any declaration in 'llvm::exegesis::SubprocessMemory'
    Expected<int> SubprocessMemory::setupAuxiliaryMemoryInSubprocess(
---
 .../llvm-exegesis/lib/BenchmarkRunner.cpp     |  9 +++---
 .../llvm-exegesis/lib/SubprocessMemory.cpp    | 28 ++++++-------------
 .../llvm-exegesis/lib/SubprocessMemory.h      |  5 +---
 .../X86/SubprocessMemoryTest.cpp              |  5 +---
 4 files changed, 14 insertions(+), 33 deletions(-)

diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
index 4e97d188d1725..5c9848f3c6888 100644
--- a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
+++ b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
@@ -301,7 +301,6 @@ class SubProcessFunctionExecutorImpl
     if (AddMemDefError)
       return AddMemDefError;
 
-    long ParentTID = SubprocessMemory::getCurrentTID();
     pid_t ParentOrChildPID = fork();
 
     if (ParentOrChildPID == -1) {
@@ -315,7 +314,7 @@ class SubProcessFunctionExecutorImpl
       // Unregister handlers, signal handling is now handled through ptrace in
       // the host process.
       sys::unregisterHandlers();
-      prepareAndRunBenchmark(PipeFiles[0], Key, ParentTID);
+      prepareAndRunBenchmark(PipeFiles[0], Key);
       // The child process terminates in the above function, so we should never
       // get to this point.
       llvm_unreachable("Child process didn't exit when expected.");
@@ -416,8 +415,8 @@ class SubProcessFunctionExecutorImpl
     setrlimit(RLIMIT_CORE, &rlim);
   }
 
-  [[noreturn]] void prepareAndRunBenchmark(int Pipe, const BenchmarkKey &Key,
-                                           long ParentTID) const {
+  [[noreturn]] void prepareAndRunBenchmark(int Pipe,
+                                           const BenchmarkKey &Key) const {
     // Disable core dumps in the child process as otherwise everytime we
     // encounter an execution failure like a segmentation fault, we will create
     // a core dump. We report the information directly rather than require the
@@ -474,7 +473,7 @@ class SubProcessFunctionExecutorImpl
 
     Expected<int> AuxMemFDOrError =
         SubprocessMemory::setupAuxiliaryMemoryInSubprocess(
-            Key.MemoryValues, ParentPID, ParentTID, CounterFileDescriptor);
+            Key.MemoryValues, ParentPID, CounterFileDescriptor);
     if (!AuxMemFDOrError)
       exit(ChildProcessExitCodeE::AuxiliaryMemorySetupFailed);
 
diff --git a/llvm/tools/llvm-exegesis/lib/SubprocessMemory.cpp b/llvm/tools/llvm-exegesis/lib/SubprocessMemory.cpp
index 11ad72a914c4e..a49fa077257d0 100644
--- a/llvm/tools/llvm-exegesis/lib/SubprocessMemory.cpp
+++ b/llvm/tools/llvm-exegesis/lib/SubprocessMemory.cpp
@@ -9,13 +9,11 @@
 #include "SubprocessMemory.h"
 #include "Error.h"
 #include "llvm/Support/Error.h"
-#include "llvm/Support/FormatVariadic.h"
 #include <cerrno>
 
 #ifdef __linux__
 #include <fcntl.h>
 #include <sys/mman.h>
-#include <sys/syscall.h>
 #include <unistd.h>
 #endif
 
@@ -24,21 +22,12 @@ namespace exegesis {
 
 #if defined(__linux__) && !defined(__ANDROID__)
 
-long SubprocessMemory::getCurrentTID() {
-  // We're using the raw syscall here rather than the gettid() function provided
-  // by most libcs for compatibility as gettid() was only added to glibc in
-  // version 2.30.
-  return syscall(SYS_gettid);
-}
-
 Error SubprocessMemory::initializeSubprocessMemory(pid_t ProcessID) {
   // Add the PID to the shared memory name so that if we're running multiple
   // processes at the same time, they won't interfere with each other.
   // This comes up particularly often when running the exegesis tests with
-  // llvm-lit. Additionally add the TID so that downstream consumers
-  // using multiple threads don't run into conflicts.
-  std::string AuxiliaryMemoryName =
-      formatv("/{0}auxmem{1}", getCurrentTID(), ProcessID);
+  // llvm-lit
+  std::string AuxiliaryMemoryName = "/auxmem" + std::to_string(ProcessID);
   int AuxiliaryMemoryFD = shm_open(AuxiliaryMemoryName.c_str(),
                                    O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
   if (AuxiliaryMemoryFD == -1)
@@ -58,8 +47,8 @@ Error SubprocessMemory::addMemoryDefinition(
     pid_t ProcessPID) {
   SharedMemoryNames.reserve(MemoryDefinitions.size());
   for (auto &[Name, MemVal] : MemoryDefinitions) {
-    std::string SharedMemoryName =
-        formatv("/{0}t{1}memdef{2}", ProcessPID, getCurrentTID(), MemVal.Index);
+    std::string SharedMemoryName = "/" + std::to_string(ProcessPID) + "memdef" +
+                                   std::to_string(MemVal.Index);
     SharedMemoryNames.push_back(SharedMemoryName);
     int SharedMemoryFD =
         shm_open(SharedMemoryName.c_str(), O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
@@ -93,9 +82,8 @@ Error SubprocessMemory::addMemoryDefinition(
 
 Expected<int> SubprocessMemory::setupAuxiliaryMemoryInSubprocess(
     std::unordered_map<std::string, MemoryValue> MemoryDefinitions,
-    pid_t ParentPID, long ParentTID, int CounterFileDescriptor) {
-  std::string AuxiliaryMemoryName =
-      formatv("/{0}auxmem{1}", ParentTID, ParentPID);
+    pid_t ParentPID, int CounterFileDescriptor) {
+  std::string AuxiliaryMemoryName = "/auxmem" + std::to_string(ParentPID);
   int AuxiliaryMemoryFileDescriptor =
       shm_open(AuxiliaryMemoryName.c_str(), O_RDWR, S_IRUSR | S_IWUSR);
   if (AuxiliaryMemoryFileDescriptor == -1)
@@ -109,8 +97,8 @@ Expected<int> SubprocessMemory::setupAuxiliaryMemoryInSubprocess(
     return make_error<Failure>("Mapping auxiliary memory failed");
   AuxiliaryMemoryMapping[0] = CounterFileDescriptor;
   for (auto &[Name, MemVal] : MemoryDefinitions) {
-    std::string MemoryValueName =
-        formatv("/{0}t{1}memdef{2}", ParentPID, ParentTID, MemVal.Index);
+    std::string MemoryValueName = "/" + std::to_string(ParentPID) + "memdef" +
+                                  std::to_string(MemVal.Index);
     AuxiliaryMemoryMapping[AuxiliaryMemoryOffset + MemVal.Index] =
         shm_open(MemoryValueName.c_str(), O_RDWR, S_IRUSR | S_IWUSR);
     if (AuxiliaryMemoryMapping[AuxiliaryMemoryOffset + MemVal.Index] == -1)
diff --git a/llvm/tools/llvm-exegesis/lib/SubprocessMemory.h b/llvm/tools/llvm-exegesis/lib/SubprocessMemory.h
index 572d1085d9cff..e20b50cdc8118 100644
--- a/llvm/tools/llvm-exegesis/lib/SubprocessMemory.h
+++ b/llvm/tools/llvm-exegesis/lib/SubprocessMemory.h
@@ -35,9 +35,6 @@ class SubprocessMemory {
   static constexpr const size_t AuxiliaryMemoryOffset = 1;
   static constexpr const size_t AuxiliaryMemorySize = 4096;
 
-  // Gets the thread ID for the calling thread.
-  static long getCurrentTID();
-
   Error initializeSubprocessMemory(pid_t ProcessID);
 
   // The following function sets up memory definitions. It creates shared
@@ -57,7 +54,7 @@ class SubprocessMemory {
   // section.
   static Expected<int> setupAuxiliaryMemoryInSubprocess(
       std::unordered_map<std::string, MemoryValue> MemoryDefinitions,
-      pid_t ParentPID, long ParentTID, int CounterFileDescriptor);
+      pid_t ParentPID, int CounterFileDescriptor);
 
   ~SubprocessMemory();
 
diff --git a/llvm/unittests/tools/llvm-exegesis/X86/SubprocessMemoryTest.cpp b/llvm/unittests/tools/llvm-exegesis/X86/SubprocessMemoryTest.cpp
index 7c23e7b7e9c5a..c07ec188a602c 100644
--- a/llvm/unittests/tools/llvm-exegesis/X86/SubprocessMemoryTest.cpp
+++ b/llvm/unittests/tools/llvm-exegesis/X86/SubprocessMemoryTest.cpp
@@ -17,7 +17,6 @@
 #include <endian.h>
 #include <fcntl.h>
 #include <sys/mman.h>
-#include <sys/syscall.h>
 #include <unistd.h>
 #endif // __linux__
 
@@ -50,9 +49,7 @@ class SubprocessMemoryTest : public X86TestBase {
 
   std::string getSharedMemoryName(const unsigned TestNumber,
                                   const unsigned DefinitionNumber) {
-    long CurrentTID = syscall(SYS_gettid);
-    return "/" + std::to_string(getSharedMemoryNumber(TestNumber)) + "t" +
-           std::to_string(CurrentTID) + "memdef" +
+    return "/" + std::to_string(getSharedMemoryNumber(TestNumber)) + "memdef" +
            std::to_string(DefinitionNumber);
   }
 

From b274b23665dec30f3ae4fb83ccca8b77e6d3ada3 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Tue, 12 Mar 2024 08:55:03 +0000
Subject: [PATCH 87/95] [ValueTracking] Treat phi as underlying obj when not
 decomposing further (#84339)

At the moment, getUnderlyingObjects simply continues for phis that do
not refer to the same underlying object in loops, without adding them to
the list of underlying objects, effectively ignoring those phis.

Instead of ignoring those phis, add them to the list of underlying
objects. This fixes a miscompile where LoopAccessAnalysis fails to
identify a memory dependence, because no underlying objects can be found
for a set of memory accesses.

Fixes https://github.com/llvm/llvm-project/issues/82665.

PR: https://github.com/llvm/llvm-project/pull/84339
---
 llvm/lib/Analysis/ValueTracking.cpp                        | 2 ++
 .../underlying-object-loop-varying-phi.ll                  | 7 ++++++-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index d7f60d85b4523..371ad41ee9656 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -6131,6 +6131,8 @@ void llvm::getUnderlyingObjects(const Value *V,
       if (!LI || !LI->isLoopHeader(PN->getParent()) ||
           isSameUnderlyingObjectInLoop(PN, LI))
         append_range(Worklist, PN->incoming_values());
+      else
+        Objects.push_back(P);
       continue;
     }
 
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/underlying-object-loop-varying-phi.ll b/llvm/test/Analysis/LoopAccessAnalysis/underlying-object-loop-varying-phi.ll
index 1a5a6ac08d404..106dc8c13a49f 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/underlying-object-loop-varying-phi.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/underlying-object-loop-varying-phi.ll
@@ -7,8 +7,13 @@ target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 define void @indirect_ptr_recurrences_read_write(ptr %A, ptr %B) {
 ; CHECK-LABEL: 'indirect_ptr_recurrences_read_write'
 ; CHECK-NEXT:    loop:
-; CHECK-NEXT:      Memory dependences are safe
+; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT:  Unsafe indirect dependence.
 ; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:        IndidrectUnsafe:
+; CHECK-NEXT:            %l = load i32, ptr %ptr.recur, align 4, !tbaa !4 ->
+; CHECK-NEXT:            store i32 %xor, ptr %ptr.recur, align 4, !tbaa !4
+; CHECK-EMPTY:
 ; CHECK-NEXT:      Run-time memory checks:
 ; CHECK-NEXT:      Grouped accesses:
 ; CHECK-EMPTY:

From 939f038296e601abd9143955f1b347aee1e99c06 Mon Sep 17 00:00:00 2001
From: jeanPerier <jperier@nvidia.com>
Date: Tue, 12 Mar 2024 10:29:19 +0100
Subject: [PATCH 88/95] [flang] lower vector subscripted polymorphic
 designators (#84778)

A mold argument needs to be added to the hlfir.elemental_addr and set in
lowering so that when the hlfir.elemental_addr needs to be turned into an
hlfir.elemental operation because the designator must be turned into a
value, the mold can be set on the hlfir.elemental to later allocate the
temporary according to the dynamic type.

This situation happens whenever the vector subscripted polymorphic
designator does not appear as an assignment left-hand side, or as an
IO-input item.


I initially thought retrieving the mold would be tricky if the dynamic
type of the designator was set by a part-ref to the right of the vector
subscripts ("array(vector)%polymorphic_comp"), but this turned out to be
impossible because:
1. A derived type component can be polymorphic only if it has the
POINTER or ALLOCATABLE attribute (F2023 C708).
2. Vector-subscripted parts are ranked and F2023 C919 prohibits any
part-ref to the right of the ranked part from having the POINTER or
ALLOCATABLE attribute.

=> If a vector subscripted designator is polymorphic, the vector
subscripted part is the rightmost part, and the mold is the base of the
vector subscripted part. This makes the retrieval of the mold easy in
lowering. The mold argument is always set to be the base of the vector
subscripted part when lowering the vector subscripted part, and it is
removed at the end of the designator lowering if the designator is not
polymorphic. This way there is no need to recover the mold from
inside the hlfir.elemental_addr body.
---
 .../include/flang/Optimizer/HLFIR/HLFIROps.td | 13 +++--
 flang/lib/Lower/ConvertExprToHLFIR.cpp        | 27 +++++-----
 flang/lib/Optimizer/Builder/HLFIRTools.cpp    |  6 +--
 flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp     | 50 ++++++++++---------
 flang/test/HLFIR/element-addr.fir             | 42 ++++++++++++++++
 .../Lower/HLFIR/vector-subscript-as-value.f90 | 36 ++++++++++++-
 6 files changed, 132 insertions(+), 42 deletions(-)

diff --git a/flang/include/flang/Optimizer/HLFIR/HLFIROps.td b/flang/include/flang/Optimizer/HLFIR/HLFIROps.td
index c82eae154d31a..743a6c98ec1a0 100644
--- a/flang/include/flang/Optimizer/HLFIR/HLFIROps.td
+++ b/flang/include/flang/Optimizer/HLFIR/HLFIROps.td
@@ -1358,7 +1358,9 @@ def hlfir_YieldOp : hlfir_Op<"yield", [Terminator, ParentOneOf<["RegionAssignOp"
   let assemblyFormat = "$entity attr-dict `:` type($entity) custom<YieldOpCleanup>($cleanup)";
 }
 
-def hlfir_ElementalAddrOp : hlfir_Op<"elemental_addr", [Terminator, HasParent<"RegionAssignOp">, RecursiveMemoryEffects, RecursivelySpeculatable, hlfir_ElementalOpInterface]> {
+def hlfir_ElementalAddrOp : hlfir_Op<"elemental_addr", [Terminator, HasParent<"RegionAssignOp">,
+    RecursiveMemoryEffects, RecursivelySpeculatable, hlfir_ElementalOpInterface,
+    AttrSizedOperandSegments]> {
   let summary = "Yield the address of a vector subscripted variable inside an hlfir.region_assign";
   let description = [{
     Special terminator node for the left-hand side region of an hlfir.region_assign
@@ -1398,6 +1400,7 @@ def hlfir_ElementalAddrOp : hlfir_Op<"elemental_addr", [Terminator, HasParent<"R
 
   let arguments = (ins
     fir_ShapeType:$shape,
+    Optional<AnyPolymorphicObject>:$mold,
     Variadic<AnyIntegerType>:$typeparams,
     OptionalAttr<UnitAttr>:$unordered
   );
@@ -1406,11 +1409,15 @@ def hlfir_ElementalAddrOp : hlfir_Op<"elemental_addr", [Terminator, HasParent<"R
                          MaxSizedRegion<1>:$cleanup);
 
   let builders = [
-    OpBuilder<(ins "mlir::Value":$shape, CArg<"bool", "false">:$isUnordered)>
+    OpBuilder<(ins "mlir::Value":$shape,
+          CArg<"mlir::Value", "{}">:$mold,
+      CArg<"mlir::ValueRange", "{}">:$typeparams,
+      CArg<"bool", "false">:$isUnordered)>
   ];
 
   let assemblyFormat = [{
-    $shape (`typeparams` $typeparams^)? (`unordered` $unordered^)?
+    $shape (`mold` $mold^)? (`typeparams` $typeparams^)?
+    (`unordered` $unordered^)?
     attr-dict `:` type(operands) $body
     custom<YieldOpCleanup>($cleanup)}];
 
diff --git a/flang/lib/Lower/ConvertExprToHLFIR.cpp b/flang/lib/Lower/ConvertExprToHLFIR.cpp
index 731c5072c45c5..c5bfbdf6b8c11 100644
--- a/flang/lib/Lower/ConvertExprToHLFIR.cpp
+++ b/flang/lib/Lower/ConvertExprToHLFIR.cpp
@@ -761,9 +761,17 @@ class HlfirDesignatorBuilder {
     // of the whole designator (not the ones of the vector subscripted part).
     // These are not yet known and will be added when finalizing the designator
     // lowering.
-    auto elementalAddrOp =
-        builder.create<hlfir::ElementalAddrOp>(loc, shape,
-                                               /*isUnordered=*/true);
+    // The resulting designator may be polymorphic, in which case the resulting
+    // type is the base of the vector subscripted part because
+    // allocatable/pointer components cannot be referenced after a vector
+    // subscripted part. Set the mold to the current base. It will be erased if
+    // the resulting designator is not polymorphic.
+    assert(partInfo.base.has_value() &&
+           "vector subscripted part must have a base");
+    mlir::Value mold = *partInfo.base;
+    auto elementalAddrOp = builder.create<hlfir::ElementalAddrOp>(
+        loc, shape, mold, mlir::ValueRange{},
+        /*isUnordered=*/true);
     setVectorSubscriptElementAddrOp(elementalAddrOp);
     builder.setInsertionPointToEnd(&elementalAddrOp.getBody().front());
     mlir::Region::BlockArgListType indices = elementalAddrOp.getIndices();
@@ -804,15 +812,8 @@ class HlfirDesignatorBuilder {
                              hlfir::EntityWithAttributes elementAddr) {
     fir::FirOpBuilder &builder = getBuilder();
     builder.setInsertionPointToEnd(&elementalAddrOp.getBody().front());
-    // For polymorphic entities, it will be needed to add a mold on the
-    // hlfir.elemental so that we are able to create temporary storage
-    // for it using the dynamic type. It seems that a reference to the mold
-    // entity can be created by evaluating the hlfir.elemental_addr
-    // for a single index. The evaluation should be legal as long as
-    // the hlfir.elemental_addr has no side effects, otherwise,
-    // it is not clear how to get the mold reference.
-    if (elementAddr.isPolymorphic())
-      TODO(loc, "vector subscripted polymorphic entity in HLFIR");
+    if (!elementAddr.isPolymorphic())
+      elementalAddrOp.getMoldMutable().clear();
     builder.create<hlfir::YieldOp>(loc, elementAddr);
     builder.setInsertionPointAfter(elementalAddrOp);
   }
@@ -929,6 +930,8 @@ HlfirDesignatorBuilder::convertVectorSubscriptedExprToElementalAddr(
   hlfir::genLengthParameters(loc, builder, elementAddrEntity, lengths);
   if (!lengths.empty())
     elementalAddrOp.getTypeparamsMutable().assign(lengths);
+  if (!elementAddrEntity.isPolymorphic())
+    elementalAddrOp.getMoldMutable().clear();
   // Create the hlfir.yield terminator inside the hlfir.elemental_body.
   builder.setInsertionPointToEnd(&elementalAddrOp.getBody().front());
   builder.create<hlfir::YieldOp>(loc, elementAddrEntity);
diff --git a/flang/lib/Optimizer/Builder/HLFIRTools.cpp b/flang/lib/Optimizer/Builder/HLFIRTools.cpp
index 0e0b14e8d6909..c7a550814e1d5 100644
--- a/flang/lib/Optimizer/Builder/HLFIRTools.cpp
+++ b/flang/lib/Optimizer/Builder/HLFIRTools.cpp
@@ -1036,9 +1036,9 @@ hlfir::cloneToElementalOp(mlir::Location loc, fir::FirOpBuilder &builder,
     return hlfir::loadTrivialScalar(l, b, newAddr);
   };
   mlir::Type elementType = scalarAddress.getFortranElementType();
-  return hlfir::genElementalOp(loc, builder, elementType,
-                               elementalAddrOp.getShape(), typeParams,
-                               genKernel, !elementalAddrOp.isOrdered());
+  return hlfir::genElementalOp(
+      loc, builder, elementType, elementalAddrOp.getShape(), typeParams,
+      genKernel, !elementalAddrOp.isOrdered(), elementalAddrOp.getMold());
 }
 
 bool hlfir::elementalOpMustProduceTemp(hlfir::ElementalOp elemental) {
diff --git a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp
index 3568fe202caf1..8bad4e445082d 100644
--- a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp
+++ b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp
@@ -1406,33 +1406,45 @@ void hlfir::AsExprOp::getEffects(
 // ElementalOp
 //===----------------------------------------------------------------------===//
 
-void hlfir::ElementalOp::build(mlir::OpBuilder &builder,
-                               mlir::OperationState &odsState,
-                               mlir::Type resultType, mlir::Value shape,
-                               mlir::Value mold, mlir::ValueRange typeparams,
-                               bool isUnordered) {
+/// Common builder for ElementalOp and ElementalAddrOp to add the arguments and
+/// create the elemental body. Result and clean-up body must be handled in
+/// specific builders.
+template <typename Op>
+static void buildElemental(mlir::OpBuilder &builder,
+                           mlir::OperationState &odsState, mlir::Value shape,
+                           mlir::Value mold, mlir::ValueRange typeparams,
+                           bool isUnordered) {
   odsState.addOperands(shape);
   if (mold)
     odsState.addOperands(mold);
   odsState.addOperands(typeparams);
-  odsState.addTypes(resultType);
   odsState.addAttribute(
-      getOperandSegmentSizesAttrName(odsState.name),
+      Op::getOperandSegmentSizesAttrName(odsState.name),
       builder.getDenseI32ArrayAttr({/*shape=*/1, (mold ? 1 : 0),
                                     static_cast<int32_t>(typeparams.size())}));
   if (isUnordered)
-    odsState.addAttribute(getUnorderedAttrName(odsState.name),
+    odsState.addAttribute(Op::getUnorderedAttrName(odsState.name),
                           isUnordered ? builder.getUnitAttr() : nullptr);
   mlir::Region *bodyRegion = odsState.addRegion();
   bodyRegion->push_back(new mlir::Block{});
-  if (auto exprType = resultType.dyn_cast<hlfir::ExprType>()) {
-    unsigned dim = exprType.getRank();
+  if (auto shapeType = shape.getType().dyn_cast<fir::ShapeType>()) {
+    unsigned dim = shapeType.getRank();
     mlir::Type indexType = builder.getIndexType();
     for (unsigned d = 0; d < dim; ++d)
       bodyRegion->front().addArgument(indexType, odsState.location);
   }
 }
 
+void hlfir::ElementalOp::build(mlir::OpBuilder &builder,
+                               mlir::OperationState &odsState,
+                               mlir::Type resultType, mlir::Value shape,
+                               mlir::Value mold, mlir::ValueRange typeparams,
+                               bool isUnordered) {
+  odsState.addTypes(resultType);
+  buildElemental<hlfir::ElementalOp>(builder, odsState, shape, mold, typeparams,
+                                     isUnordered);
+}
+
 mlir::Value hlfir::ElementalOp::getElementEntity() {
   return mlir::cast<hlfir::YieldElementOp>(getBody()->back()).getElementValue();
 }
@@ -1681,19 +1693,11 @@ static void printYieldOpCleanup(mlir::OpAsmPrinter &p, YieldOp yieldOp,
 
 void hlfir::ElementalAddrOp::build(mlir::OpBuilder &builder,
                                    mlir::OperationState &odsState,
-                                   mlir::Value shape, bool isUnordered) {
-  odsState.addOperands(shape);
-  if (isUnordered)
-    odsState.addAttribute(getUnorderedAttrName(odsState.name),
-                          isUnordered ? builder.getUnitAttr() : nullptr);
-  mlir::Region *bodyRegion = odsState.addRegion();
-  bodyRegion->push_back(new mlir::Block{});
-  if (auto shapeType = shape.getType().dyn_cast<fir::ShapeType>()) {
-    unsigned dim = shapeType.getRank();
-    mlir::Type indexType = builder.getIndexType();
-    for (unsigned d = 0; d < dim; ++d)
-      bodyRegion->front().addArgument(indexType, odsState.location);
-  }
+                                   mlir::Value shape, mlir::Value mold,
+                                   mlir::ValueRange typeparams,
+                                   bool isUnordered) {
+  buildElemental<hlfir::ElementalAddrOp>(builder, odsState, shape, mold,
+                                         typeparams, isUnordered);
   // Push cleanUp region.
   odsState.addRegion();
 }
diff --git a/flang/test/HLFIR/element-addr.fir b/flang/test/HLFIR/element-addr.fir
index 73946f8b40e3d..c3c48edd9b563 100644
--- a/flang/test/HLFIR/element-addr.fir
+++ b/flang/test/HLFIR/element-addr.fir
@@ -114,3 +114,45 @@ func.func @unordered() {
 // CHECK:           }
 // CHECK:           return
 // CHECK:         }
+
+// "X(VECTOR) = Y" with polymorphic X and Y and user defined assignment.
+func.func @test_mold(%x: !fir.class<!fir.array<?x!fir.type<t>>>, %y: !fir.class<!fir.array<?x!fir.type<t>>>, %vector: !fir.box<!fir.array<?xi64>>) {
+  hlfir.region_assign {
+    hlfir.yield %y : !fir.class<!fir.array<?x!fir.type<t>>>
+  } to {
+    %c0 = arith.constant 0 : index
+    %0:3 = fir.box_dims %vector, %c0 : (!fir.box<!fir.array<?xi64>>, index) -> (index, index, index)
+    %1 = fir.shape %0#1 : (index) -> !fir.shape<1>
+    hlfir.elemental_addr %1 mold %x unordered : !fir.shape<1>, !fir.class<!fir.array<?x!fir.type<t>>> {
+    ^bb0(%arg3: index):
+      %2 = hlfir.designate %vector (%arg3)  : (!fir.box<!fir.array<?xi64>>, index) -> !fir.ref<i64>
+      %3 = fir.load %2 : !fir.ref<i64>
+      %4 = hlfir.designate %x (%3)  : (!fir.class<!fir.array<?x!fir.type<t>>>, i64) -> !fir.class<!fir.type<t>>
+      hlfir.yield %4 : !fir.class<!fir.type<t>>
+    }
+  } user_defined_assign  (%arg3: !fir.class<!fir.type<t>>) to (%arg4: !fir.class<!fir.type<t>>) {
+    fir.call @user_def_assign(%arg4, %arg3) : (!fir.class<!fir.type<t>>, !fir.class<!fir.type<t>>) -> ()
+  }
+  return
+}
+func.func private @user_def_assign(!fir.class<!fir.type<t>>, !fir.class<!fir.type<t>>)
+// CHECK-LABEL: func.func @test_mold(
+// CHECK-SAME:                       %[[VAL_0:[^:]*]]: !fir.class<!fir.array<?x!fir.type<t>>>,
+// CHECK-SAME:                       %[[VAL_1:.*]]: !fir.class<!fir.array<?x!fir.type<t>>>,
+// CHECK-SAME:                       %[[VAL_2:.*]]: !fir.box<!fir.array<?xi64>>) {
+// CHECK:         hlfir.region_assign {
+// CHECK:           hlfir.yield %[[VAL_1]] : !fir.class<!fir.array<?x!fir.type<t>>>
+// CHECK:         } to {
+// CHECK:           %[[VAL_3:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_4:.*]]:3 = fir.box_dims %[[VAL_2]], %[[VAL_3]] : (!fir.box<!fir.array<?xi64>>, index) -> (index, index, index)
+// CHECK:           %[[VAL_5:.*]] = fir.shape %[[VAL_4]]#1 : (index) -> !fir.shape<1>
+// CHECK:           hlfir.elemental_addr %[[VAL_5]] mold %[[VAL_0]] unordered : !fir.shape<1>, !fir.class<!fir.array<?x!fir.type<t>>> {
+// CHECK:           ^bb0(%[[VAL_6:.*]]: index):
+// CHECK:             %[[VAL_7:.*]] = hlfir.designate %[[VAL_2]] (%[[VAL_6]])  : (!fir.box<!fir.array<?xi64>>, index) -> !fir.ref<i64>
+// CHECK:             %[[VAL_8:.*]] = fir.load %[[VAL_7]] : !fir.ref<i64>
+// CHECK:             %[[VAL_9:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_8]])  : (!fir.class<!fir.array<?x!fir.type<t>>>, i64) -> !fir.class<!fir.type<t>>
+// CHECK:             hlfir.yield %[[VAL_9]] : !fir.class<!fir.type<t>>
+// CHECK:           }
+// CHECK:         } user_defined_assign  (%[[VAL_10:.*]]: !fir.class<!fir.type<t>>) to (%[[VAL_11:.*]]: !fir.class<!fir.type<t>>) {
+// CHECK:           fir.call @user_def_assign(%[[VAL_11]], %[[VAL_10]]) : (!fir.class<!fir.type<t>>, !fir.class<!fir.type<t>>) -> ()
+// CHECK:         }
diff --git a/flang/test/Lower/HLFIR/vector-subscript-as-value.f90 b/flang/test/Lower/HLFIR/vector-subscript-as-value.f90
index 2f463cfaa8b07..d4026a37720f7 100644
--- a/flang/test/Lower/HLFIR/vector-subscript-as-value.f90
+++ b/flang/test/Lower/HLFIR/vector-subscript-as-value.f90
@@ -1,6 +1,6 @@
 ! Test lowering of vector subscript designators outside of the
 ! assignment left-and side and input IO context.
-! RUN: bbc -emit-hlfir -o - -I nw %s 2>&1 | FileCheck %s
+! RUN: bbc -emit-hlfir -o - -I nw %s --polymorphic-type 2>&1 | FileCheck %s
 
 subroutine foo(x, y)
   integer :: x(100)
@@ -182,3 +182,37 @@ subroutine substring(c, vector, i, j)
 ! CHECK:    %[[VAL_27:.*]] = hlfir.designate %[[VAL_4]]#0 (%[[VAL_26]]) substr %[[VAL_15]], %[[VAL_16]]  typeparams %[[VAL_22]] : (!fir.box<!fir.array<?x!fir.char<1,?>>>, i64, index, index, index) -> !fir.boxchar<1>
 ! CHECK:    hlfir.yield_element %[[VAL_27]] : !fir.boxchar<1>
 ! CHECK:  }
+
+subroutine test_passing_subscripted_poly(x, vector)
+  interface
+    subroutine do_something(x)
+      class(*) :: x(:)
+    end subroutine
+  end interface
+  class(*) :: x(:, :)
+  integer(8) :: vector(:)
+  call do_something(x(314, vector))
+end subroutine
+! CHECK-LABEL:   func.func @_QPtest_passing_subscripted_poly(
+! CHECK-SAME:                                                %[[VAL_0:.*]]: !fir.class<!fir.array<?x?xnone>>
+! CHECK-SAME:                                                %[[VAL_1:.*]]: !fir.box<!fir.array<?xi64>>
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFtest_passing_subscripted_polyEvector"} : (!fir.box<!fir.array<?xi64>>) -> (!fir.box<!fir.array<?xi64>>, !fir.box<!fir.array<?xi64>>)
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFtest_passing_subscripted_polyEx"} : (!fir.class<!fir.array<?x?xnone>>) -> (!fir.class<!fir.array<?x?xnone>>, !fir.class<!fir.array<?x?xnone>>)
+! CHECK:           %[[VAL_4:.*]] = arith.constant 314 : index
+! CHECK:           %[[VAL_5:.*]] = arith.constant 0 : index
+! CHECK:           %[[VAL_6:.*]]:3 = fir.box_dims %[[VAL_2]]#0, %[[VAL_5]] : (!fir.box<!fir.array<?xi64>>, index) -> (index, index, index)
+! CHECK:           %[[VAL_7:.*]] = fir.shape %[[VAL_6]]#1 : (index) -> !fir.shape<1>
+! CHECK:           %[[VAL_8:.*]] = hlfir.elemental %[[VAL_7]] mold %[[VAL_3]]#0 unordered : (!fir.shape<1>, !fir.class<!fir.array<?x?xnone>>) -> !hlfir.expr<?xnone?> {
+! CHECK:           ^bb0(%[[VAL_9:.*]]: index):
+! CHECK:             %[[VAL_10:.*]] = hlfir.designate %[[VAL_2]]#0 (%[[VAL_9]])  : (!fir.box<!fir.array<?xi64>>, index) -> !fir.ref<i64>
+! CHECK:             %[[VAL_11:.*]] = fir.load %[[VAL_10]] : !fir.ref<i64>
+! CHECK:             %[[VAL_12:.*]] = hlfir.designate %[[VAL_3]]#0 (%[[VAL_4]], %[[VAL_11]])  : (!fir.class<!fir.array<?x?xnone>>, index, i64) -> !fir.class<none>
+! CHECK:             hlfir.yield_element %[[VAL_12]] : !fir.class<none>
+! CHECK:           }
+! CHECK:           %[[VAL_13:.*]]:3 = hlfir.associate %[[VAL_8]](%[[VAL_7]]) {adapt.valuebyref} : (!hlfir.expr<?xnone?>, !fir.shape<1>) -> (!fir.class<!fir.heap<!fir.array<?xnone>>>, !fir.class<!fir.heap<!fir.array<?xnone>>>, i1)
+! CHECK:           %[[VAL_14:.*]] = fir.rebox %[[VAL_13]]#0 : (!fir.class<!fir.heap<!fir.array<?xnone>>>) -> !fir.class<!fir.array<?xnone>>
+! CHECK:           fir.call @_QPdo_something(%[[VAL_14]]) fastmath<contract> : (!fir.class<!fir.array<?xnone>>) -> ()
+! CHECK:           hlfir.end_associate %[[VAL_13]]#0, %[[VAL_13]]#2 : !fir.class<!fir.heap<!fir.array<?xnone>>>, i1
+! CHECK:           hlfir.destroy %[[VAL_8]] : !hlfir.expr<?xnone?>
+! CHECK:           return
+! CHECK:         }

From 9d16e79aac09e68c46b89cbbad9fe7edd915b8c3 Mon Sep 17 00:00:00 2001
From: Dani <daniel.kiss@arm.com>
Date: Tue, 12 Mar 2024 10:33:16 +0100
Subject: [PATCH 89/95] [AArch64] Fix COMPILER_RT_HAS_AUXV for builtins.
 (#84816)

COMPILER_RT_HAS_AUXV is now used in builtins, so the test needs to be in
builtin-config-ix.cmake too.
---
 compiler-rt/cmake/builtin-config-ix.cmake | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/compiler-rt/cmake/builtin-config-ix.cmake b/compiler-rt/cmake/builtin-config-ix.cmake
index d10222b7530a8..33c97b1ac28af 100644
--- a/compiler-rt/cmake/builtin-config-ix.cmake
+++ b/compiler-rt/cmake/builtin-config-ix.cmake
@@ -1,4 +1,5 @@
 include(BuiltinTests)
+include(CheckIncludeFiles)
 include(CheckCSourceCompiles)
 
 # Make all the tests only check the compiler
@@ -43,6 +44,8 @@ void foo(void)  __arm_streaming_compatible {
 }
 ")
 
+check_include_files("sys/auxv.h"    COMPILER_RT_HAS_AUXV)
+
 if(ANDROID)
   set(OS_NAME "Android")
 else()

From 368db5683bb9f8c619a8a6d3d15522429ef615c6 Mon Sep 17 00:00:00 2001
From: Luke Weiler <163067703+lwmaia@users.noreply.github.com>
Date: Tue, 12 Mar 2024 02:38:36 -0700
Subject: [PATCH 90/95] [lldb] Fix build break on windows (#84863)

This is a one line fix for a Windows specific (I believe) build break.

The build failure looks like this:
`D:\a\_work\1\s\lldb\source\Symbol\Symtab.cpp(128): error C2440:
'<function-style-cast>': cannot convert from 'lldb_private::ConstString'
to 'llvm::StringRef'
D:\a\_work\1\s\lldb\source\Symbol\Symtab.cpp(128): note:
'llvm::StringRef::StringRef': ambiguous call to overloaded function
D:\a\_work\1\s\llvm\include\llvm/ADT/StringRef.h(840): note: could be
'llvm::StringRef::StringRef(llvm::StringRef &&)'
D:\a\_work\1\s\llvm\include\llvm/ADT/StringRef.h(104): note: or
'llvm::StringRef::StringRef(std::string_view)'
D:\a\_work\1\s\lldb\source\Symbol\Symtab.cpp(128): note: while trying to
match the argument list '(lldb_private::ConstString)'
D:\a\_work\1\s\lldb\source\Symbol\Symtab.cpp(128): error C2672:
'std::multimap<llvm::StringRef,const lldb_private::Symbol
*,std::less<llvm::StringRef>,std::allocator<std::pair<const
llvm::StringRef,const lldb_private::Symbol *>>>::emplace': no matching
overloaded function found
C:\Program Files\Microsoft Visual
Studio\2022\Enterprise\VC\Tools\MSVC\14.37.32822\include\map(557): note:
could be
'std::_Tree_iterator<std::_Tree_val<std::_Tree_simple_types<std::pair<const
llvm::StringRef,const lldb_private::Symbol *>>>>
std::multimap<llvm::StringRef,const lldb_private::Symbol
*,std::less<llvm::StringRef>,std::allocator<std::pair<const
llvm::StringRef,const lldb_private::Symbol *>>>::emplace(_Valty &&...)'
`

The StringRef constructor here is intended to take a ConstString object,
which I assume is implicitly converted to a std::string_view by
compilers other than Visual Studio's. To fix the VS build I made the
StringRef initialization more explicit, as you can see in the diff.
---
 lldb/source/Symbol/Symtab.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lldb/source/Symbol/Symtab.cpp b/lldb/source/Symbol/Symtab.cpp
index c63bbe94fece0..5b5bf5c3f6f8c 100644
--- a/lldb/source/Symbol/Symtab.cpp
+++ b/lldb/source/Symbol/Symtab.cpp
@@ -125,7 +125,7 @@ void Symtab::Dump(Stream *s, Target *target, SortOrder sort_order,
 
       std::multimap<llvm::StringRef, const Symbol *> name_map;
       for (const Symbol &symbol : m_symbols)
-        name_map.emplace(llvm::StringRef(symbol.GetName()), &symbol);
+        name_map.emplace(symbol.GetName().GetStringRef(), &symbol);
 
       for (const auto &name_to_symbol : name_map) {
         const Symbol *symbol = name_to_symbol.second;

From a3b52509d522442915a51d8aabcec1df49e95b23 Mon Sep 17 00:00:00 2001
From: Andreas Jonson <andjo403@hotmail.com>
Date: Tue, 12 Mar 2024 10:39:37 +0100
Subject: [PATCH 91/95] [InstSimplify] Use range attribute to simplify
 comparisons (#84627)

Use the new range attribute from https://github.com/llvm/llvm-project/pull/84617
to simplify comparisons where both sides have range information.
---
 llvm/include/llvm/IR/Attributes.h             |   5 +
 llvm/include/llvm/IR/Function.h               |   3 +
 llvm/include/llvm/IR/InstrTypes.h             |  12 ++
 llvm/lib/Analysis/InstructionSimplify.cpp     |  38 +++--
 llvm/lib/IR/Function.cpp                      |   4 +
 .../test/Transforms/InstCombine/icmp-range.ll | 144 +++++++++++++++++-
 6 files changed, 187 insertions(+), 19 deletions(-)

diff --git a/llvm/include/llvm/IR/Attributes.h b/llvm/include/llvm/IR/Attributes.h
index 0c2a02514ba0e..7dd8a329029a3 100644
--- a/llvm/include/llvm/IR/Attributes.h
+++ b/llvm/include/llvm/IR/Attributes.h
@@ -848,6 +848,11 @@ class AttributeList {
     return getAttributeAtIndex(FunctionIndex, Kind);
   }
 
+  /// Return the attribute for the given attribute kind for the return value.
+  Attribute getRetAttr(Attribute::AttrKind Kind) const {
+    return getAttributeAtIndex(ReturnIndex, Kind);
+  }
+
   /// Return the alignment of the return value.
   MaybeAlign getRetAlignment() const;
 
diff --git a/llvm/include/llvm/IR/Function.h b/llvm/include/llvm/IR/Function.h
index cb87a44980321..d96d506a9b05d 100644
--- a/llvm/include/llvm/IR/Function.h
+++ b/llvm/include/llvm/IR/Function.h
@@ -430,6 +430,9 @@ class LLVM_EXTERNAL_VISIBILITY Function : public GlobalObject,
   /// Return the attribute for the given attribute kind.
   Attribute getFnAttribute(StringRef Kind) const;
 
+  /// Return the attribute for the given attribute kind for the return value.
+  Attribute getRetAttribute(Attribute::AttrKind Kind) const;
+
   /// For a string attribute \p Kind, parse attribute as an integer.
   ///
   /// \returns \p Default if attribute is not present.
diff --git a/llvm/include/llvm/IR/InstrTypes.h b/llvm/include/llvm/IR/InstrTypes.h
index 0e81d3b391a08..fed21b992e3d1 100644
--- a/llvm/include/llvm/IR/InstrTypes.h
+++ b/llvm/include/llvm/IR/InstrTypes.h
@@ -1909,6 +1909,18 @@ class CallBase : public Instruction {
   /// Determine whether the return value has the given attribute.
   bool hasRetAttr(StringRef Kind) const { return hasRetAttrImpl(Kind); }
 
+  /// Return the attribute for the given attribute kind for the return value.
+  Attribute getRetAttr(Attribute::AttrKind Kind) const {
+    Attribute RetAttr = Attrs.getRetAttr(Kind);
+    if (RetAttr.isValid())
+      return RetAttr;
+
+    // Look at the callee, if available.
+    if (const Function *F = getCalledFunction())
+      return F->getAttributes().getRetAttr(Kind);
+    return Attribute();
+  }
+
   /// Determine whether the argument or parameter has the given attribute.
   bool paramHasAttr(unsigned ArgNo, Attribute::AttrKind Kind) const;
 
diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index 8c48174b9f525..ce651783caf16 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -3729,6 +3729,26 @@ static Value *simplifyICmpWithIntrinsicOnLHS(CmpInst::Predicate Pred,
   }
 }
 
+/// Helper method to get range from metadata or attribute.
+static std::optional<ConstantRange> getRange(Value *V,
+                                             const InstrInfoQuery &IIQ) {
+  if (Instruction *I = dyn_cast<Instruction>(V))
+    if (MDNode *MD = IIQ.getMetadata(I, LLVMContext::MD_range))
+      return getConstantRangeFromMetadata(*MD);
+
+  Attribute Range;
+  if (const Argument *A = dyn_cast<Argument>(V)) {
+    Range = A->getAttribute(llvm::Attribute::Range);
+  } else if (const CallBase *CB = dyn_cast<CallBase>(V)) {
+    Range = CB->getRetAttr(llvm::Attribute::Range);
+  }
+
+  if (Range.isValid())
+    return Range.getRange();
+
+  return std::nullopt;
+}
+
 /// Given operands for an ICmpInst, see if we can fold the result.
 /// If not, this returns null.
 static Value *simplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
@@ -3776,24 +3796,14 @@ static Value *simplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
 
   // If both operands have range metadata, use the metadata
   // to simplify the comparison.
-  if (isa<Instruction>(RHS) && isa<Instruction>(LHS)) {
-    auto RHS_Instr = cast<Instruction>(RHS);
-    auto LHS_Instr = cast<Instruction>(LHS);
-
-    if (Q.IIQ.getMetadata(RHS_Instr, LLVMContext::MD_range) &&
-        Q.IIQ.getMetadata(LHS_Instr, LLVMContext::MD_range)) {
-      auto RHS_CR = getConstantRangeFromMetadata(
-          *RHS_Instr->getMetadata(LLVMContext::MD_range));
-      auto LHS_CR = getConstantRangeFromMetadata(
-          *LHS_Instr->getMetadata(LLVMContext::MD_range));
-
-      if (LHS_CR.icmp(Pred, RHS_CR))
+  if (std::optional<ConstantRange> RhsCr = getRange(RHS, Q.IIQ))
+    if (std::optional<ConstantRange> LhsCr = getRange(LHS, Q.IIQ)) {
+      if (LhsCr->icmp(Pred, *RhsCr))
         return ConstantInt::getTrue(ITy);
 
-      if (LHS_CR.icmp(CmpInst::getInversePredicate(Pred), RHS_CR))
+      if (LhsCr->icmp(CmpInst::getInversePredicate(Pred), *RhsCr))
         return ConstantInt::getFalse(ITy);
     }
-  }
 
   // Compare of cast, for example (zext X) != 0 -> X != 0
   if (isa<CastInst>(LHS) && (isa<Constant>(RHS) || isa<CastInst>(RHS))) {
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index 056e4f31981a7..d22e1c1231118 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -700,6 +700,10 @@ Attribute Function::getFnAttribute(StringRef Kind) const {
   return AttributeSets.getFnAttr(Kind);
 }
 
+Attribute Function::getRetAttribute(Attribute::AttrKind Kind) const {
+  return AttributeSets.getRetAttr(Kind);
+}
+
 uint64_t Function::getFnAttributeAsParsedInteger(StringRef Name,
                                                  uint64_t Default) const {
   Attribute A = getFnAttribute(Name);
diff --git a/llvm/test/Transforms/InstCombine/icmp-range.ll b/llvm/test/Transforms/InstCombine/icmp-range.ll
index 77bb5fdb6bfd4..9ed2f2a4860c6 100644
--- a/llvm/test/Transforms/InstCombine/icmp-range.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-range.ll
@@ -149,6 +149,16 @@ define i1 @test_two_ranges(ptr nocapture readonly %arg1, ptr nocapture readonly
   ret i1 %rval
 }
 
+; Values' ranges overlap each other, so it can not be simplified.
+define i1 @test_two_attribute_ranges(i32 range(i32 5, 10) %arg1, i32 range(i32 8, 16) %arg2) {
+; CHECK-LABEL: @test_two_attribute_ranges(
+; CHECK-NEXT:    [[RVAL:%.*]] = icmp ult i32 [[ARG1:%.*]], [[ARG2:%.*]]
+; CHECK-NEXT:    ret i1 [[RVAL]]
+;
+  %rval = icmp ult i32 %arg2, %arg1
+  ret i1 %rval
+}
+
 ; Values' ranges do not overlap each other, so it can simplified to false.
 define i1 @test_two_ranges2(ptr nocapture readonly %arg1, ptr nocapture readonly %arg2) {
 ; CHECK-LABEL: @test_two_ranges2(
@@ -160,6 +170,35 @@ define i1 @test_two_ranges2(ptr nocapture readonly %arg1, ptr nocapture readonly
   ret i1 %rval
 }
 
+; Values' ranges do not overlap each other, so it can be simplified to false.
+define i1 @test_two_argument_ranges(i32 range(i32 1, 6) %arg1, i32 range(i32 8, 16) %arg2) {
+; CHECK-LABEL: @test_two_argument_ranges(
+; CHECK-NEXT:    ret i1 false
+;
+  %rval = icmp ult i32 %arg2, %arg1
+  ret i1 %rval
+}
+
+; Values' ranges do not overlap each other, so it can be simplified to false.
+define i1 @test_one_range_and_one_argument_range(ptr nocapture readonly %arg1, i32 range(i32 8, 16) %arg2) {
+; CHECK-LABEL: @test_one_range_and_one_argument_range(
+; CHECK-NEXT:    ret i1 false
+;
+  %val1 = load i32, ptr %arg1, !range !0
+  %rval = icmp ult i32 %arg2, %val1
+  ret i1 %rval
+}
+
+; Values' ranges do not overlap each other, so it can be simplified to false.
+define i1 @test_one_argument_range_and_one_range(i32 range(i32 1, 6) %arg1, ptr nocapture readonly %arg2) {
+; CHECK-LABEL: @test_one_argument_range_and_one_range(
+; CHECK-NEXT:    ret i1 false
+;
+  %val1 = load i32, ptr %arg2, !range !6
+  %rval = icmp ult i32 %val1, %arg1
+  ret i1 %rval
+}
+
 ; Values' ranges do not overlap each other, so it can simplified to true.
 define i1 @test_two_ranges3(ptr nocapture readonly %arg1, ptr nocapture readonly %arg2) {
 ; CHECK-LABEL: @test_two_ranges3(
@@ -186,8 +225,8 @@ define <2 x i1> @test_two_ranges_vec(ptr nocapture readonly %arg1, ptr nocapture
 }
 
 ; Values' ranges do not overlap each other, so it can simplified to false.
-define <2 x i1> @test_two_ranges_vec_true(ptr nocapture readonly %arg1, ptr nocapture readonly %arg2) {
-; CHECK-LABEL: @test_two_ranges_vec_true(
+define <2 x i1> @test_two_ranges_vec_false(ptr nocapture readonly %arg1, ptr nocapture readonly %arg2) {
+; CHECK-LABEL: @test_two_ranges_vec_false(
 ; CHECK-NEXT:    ret <2 x i1> zeroinitializer
 ;
   %val1 = load <2 x i32>, ptr %arg1, !range !0
@@ -196,9 +235,9 @@ define <2 x i1> @test_two_ranges_vec_true(ptr nocapture readonly %arg1, ptr noca
   ret <2 x i1> %rval
 }
 
-; Values' ranges do not overlap each other, so it can simplified to false.
-define <2 x i1> @test_two_ranges_vec_false(ptr nocapture readonly %arg1, ptr nocapture readonly %arg2) {
-; CHECK-LABEL: @test_two_ranges_vec_false(
+; Values' ranges do not overlap each other, so it can be simplified to true.
+define <2 x i1> @test_two_ranges_vec_true(ptr nocapture readonly %arg1, ptr nocapture readonly %arg2) {
+; CHECK-LABEL: @test_two_ranges_vec_true(
 ; CHECK-NEXT:    ret <2 x i1> <i1 true, i1 true>
 ;
   %val1 = load <2 x i32>, ptr %arg1, !range !0
@@ -207,6 +246,101 @@ define <2 x i1> @test_two_ranges_vec_false(ptr nocapture readonly %arg1, ptr noc
   ret <2 x i1> %rval
 }
 
+; Values' ranges overlap each other, so it can not be simplified.
+define <2 x i1> @test_two_argument_ranges_vec(<2 x i32> range(i32 5, 10) %arg1, <2 x i32> range(i32 8, 16) %arg2) {
+; CHECK-LABEL: @test_two_argument_ranges_vec(
+; CHECK-NEXT:    [[RVAL:%.*]] = icmp ult <2 x i32> [[VAL2:%.*]], [[VAL1:%.*]]
+; CHECK-NEXT:    ret <2 x i1> [[RVAL]]
+;
+  %rval = icmp ult <2 x i32> %arg2, %arg1
+  ret <2 x i1> %rval
+}
+
+; Values' ranges do not overlap each other, so it can be simplified to false.
+define <2 x i1> @test_two_argument_ranges_vec_false(<2 x i32> range(i32 1, 6) %arg1, <2 x i32> range(i32 8, 16) %arg2) {
+; CHECK-LABEL: @test_two_argument_ranges_vec_false(
+; CHECK-NEXT:    ret <2 x i1> zeroinitializer
+;
+  %rval = icmp ult <2 x i32> %arg2, %arg1
+  ret <2 x i1> %rval
+}
+
+; Values' ranges do not overlap each other, so it can be simplified to true.
+define <2 x i1> @test_two_argument_ranges_vec_true(<2 x i32> range(i32 1, 6) %arg1, <2 x i32> range(i32 8, 16) %arg2) {
+; CHECK-LABEL: @test_two_argument_ranges_vec_true(
+; CHECK-NEXT:    ret <2 x i1> <i1 true, i1 true>
+;
+  %rval = icmp ugt <2 x i32> %arg2, %arg1
+  ret <2 x i1> %rval
+}
+
+declare i32 @create_range1()
+declare range(i32 8, 16) i32 @create_range2()
+declare range(i32 1, 6) i32 @create_range3()
+
+; Values' ranges overlap each other, so it can not be simplified.
+define i1 @test_two_return_attribute_ranges_not_simplified() {
+; CHECK-LABEL: @test_two_return_attribute_ranges_not_simplified(
+; CHECK-NEXT:    [[ARG2:%.*]] = call range(i32 5, 10) i32 @create_range1()
+; CHECK-NEXT:    [[ARG1:%.*]] = call i32 @create_range2()
+; CHECK-NEXT:    [[RVAL:%.*]] = icmp ult i32 [[ARG1]], [[ARG2]]
+; CHECK-NEXT:    ret i1 [[RVAL]]
+;
+  %val1 = call range(i32 5, 10) i32 @create_range1()
+  %val2 = call i32 @create_range2()
+  %rval = icmp ult i32 %val2, %val1
+  ret i1 %rval
+}
+
+; Values' ranges do not overlap each other, so it can be simplified to false.
+define i1 @test_two_return_attribute_ranges_one_in_call() {
+; CHECK-LABEL: @test_two_return_attribute_ranges_one_in_call(
+; CHECK-NEXT:    [[VAL1:%.*]] = call range(i32 1, 6) i32 @create_range1()
+; CHECK-NEXT:    [[ARG1:%.*]] = call i32 @create_range2()
+; CHECK-NEXT:    ret i1 false
+;
+  %val1 = call range(i32 1, 6) i32 @create_range1()
+  %val2 = call i32 @create_range2()
+  %rval = icmp ult i32 %val2, %val1
+  ret i1 %rval
+}
+
+; Values' ranges do not overlap each other, so it can be simplified to false.
+define i1 @test_two_return_attribute_ranges() {
+; CHECK-LABEL: @test_two_return_attribute_ranges(
+; CHECK-NEXT:    [[VAL1:%.*]] = call i32 @create_range3()
+; CHECK-NEXT:    [[ARG1:%.*]] = call i32 @create_range2()
+; CHECK-NEXT:    ret i1 false
+;
+  %val1 = call i32 @create_range3()
+  %val2 = call i32 @create_range2()
+  %rval = icmp ult i32 %val2, %val1
+  ret i1 %rval
+}
+
+; Values' ranges do not overlap each other, so it can be simplified to false.
+define i1 @test_one_return_argument_and_one_argument_range(i32 range(i32 8, 16) %arg1) {
+; CHECK-LABEL: @test_one_return_argument_and_one_argument_range(
+; CHECK-NEXT:    [[VAL1:%.*]] = call i32 @create_range3()
+; CHECK-NEXT:    ret i1 false
+;
+  %val1 = call i32 @create_range3()
+  %rval = icmp ult i32 %arg1, %val1
+  ret i1 %rval
+}
+
+; Values' ranges do not overlap each other, so it can be simplified to false.
+define i1 @test_one_range_and_one_return_argument(ptr nocapture readonly %arg1) {
+; CHECK-LABEL: @test_one_range_and_one_return_argument(
+; CHECK-NEXT:    [[VAL1:%.*]] = call i32 @create_range3()
+; CHECK-NEXT:    ret i1 false
+;
+  %val1 = call i32 @create_range3()
+  %val2 = load i32, ptr %arg1, !range !6
+  %rval = icmp ult i32 %val2, %val1
+  ret i1 %rval
+}
+
 define i1 @ugt_zext(i1 %b, i8 %x) {
 ; CHECK-LABEL: @ugt_zext(
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i8 [[X:%.*]], 0

From bba4a1daff6ee09941f1369a4e56b4af95efdc5c Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Tue, 12 Mar 2024 09:47:42 +0000
Subject: [PATCH 92/95] [ArgPromotion] Remove incorrect TranspBlocks set for
 loads. (#84835)

The TranspBlocks set was used to cache aliasing decision for all
processed loads in the parent loop. This is incorrect, because each load
can access a different location, which means one load not being modified
in a block doesn't translate to another load not being modified in the
same block.

All loads access the same underlying object, so we could perhaps use a
location without size for all loads and retain the cache, but that would
mean we lose precision.

For now, just drop the cache.

Fixes https://github.com/llvm/llvm-project/issues/84807

PR: https://github.com/llvm/llvm-project/pull/84835
---
 llvm/lib/Transforms/IPO/ArgumentPromotion.cpp      |  6 +-----
 ...aliasing-and-non-aliasing-loads-with-clobber.ll | 14 +++++++-------
 2 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
index e89ec353487ee..3aa8ea3f51471 100644
--- a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
+++ b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -653,10 +653,6 @@ static bool findArgParts(Argument *Arg, const DataLayout &DL, AAResults &AAR,
   // check to see if the pointer is guaranteed to not be modified from entry of
   // the function to each of the load instructions.
 
-  // Because there could be several/many load instructions, remember which
-  // blocks we know to be transparent to the load.
-  df_iterator_default_set<BasicBlock *, 16> TranspBlocks;
-
   for (LoadInst *Load : Loads) {
     // Check to see if the load is invalidated from the start of the block to
     // the load itself.
@@ -670,7 +666,7 @@ static bool findArgParts(Argument *Arg, const DataLayout &DL, AAResults &AAR,
     // To do this, we perform a depth first search on the inverse CFG from the
     // loading block.
     for (BasicBlock *P : predecessors(BB)) {
-      for (BasicBlock *TranspBB : inverse_depth_first_ext(P, TranspBlocks))
+      for (BasicBlock *TranspBB : inverse_depth_first(P))
         if (AAR.canBasicBlockModify(*TranspBB, Loc))
           return false;
     }
diff --git a/llvm/test/Transforms/ArgumentPromotion/aliasing-and-non-aliasing-loads-with-clobber.ll b/llvm/test/Transforms/ArgumentPromotion/aliasing-and-non-aliasing-loads-with-clobber.ll
index 69385a7ea51a7..1e1669b29b0db 100644
--- a/llvm/test/Transforms/ArgumentPromotion/aliasing-and-non-aliasing-loads-with-clobber.ll
+++ b/llvm/test/Transforms/ArgumentPromotion/aliasing-and-non-aliasing-loads-with-clobber.ll
@@ -7,17 +7,14 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:
 
 ; Test case for https://github.com/llvm/llvm-project/issues/84807.
 
-; FIXME: Currently the loads from @callee are moved to @caller, even though
-;        the store in %then may aliases to load from %q.
+; Make sure the loads from @callee are not moved to @caller, as the store
+; in %then may alias the load from %q.
 
 define i32 @caller1(i1 %c) {
 ; CHECK-LABEL: define i32 @caller1(
 ; CHECK-SAME: i1 [[C:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[F_VAL:%.*]] = load i16, ptr @f, align 8
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr @f, i64 8
-; CHECK-NEXT:    [[F_VAL1:%.*]] = load i64, ptr [[TMP0]], align 8
-; CHECK-NEXT:    call void @callee1(i16 [[F_VAL]], i64 [[F_VAL1]], i1 [[C]])
+; CHECK-NEXT:    call void @callee1(ptr noundef nonnull @f, i1 [[C]])
 ; CHECK-NEXT:    ret i32 0
 ;
 entry:
@@ -27,13 +24,16 @@ entry:
 
 define internal void @callee1(ptr nocapture noundef readonly %q, i1 %c) {
 ; CHECK-LABEL: define internal void @callee1(
-; CHECK-SAME: i16 [[Q_0_VAL:%.*]], i64 [[Q_8_VAL:%.*]], i1 [[C:%.*]]) {
+; CHECK-SAME: ptr nocapture noundef readonly [[Q:%.*]], i1 [[C:%.*]]) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 [[C]], label [[THEN:%.*]], label [[EXIT:%.*]]
 ; CHECK:       then:
 ; CHECK-NEXT:    store i16 123, ptr @f, align 8
 ; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       exit:
+; CHECK-NEXT:    [[Q_0_VAL:%.*]] = load i16, ptr [[Q]], align 8
+; CHECK-NEXT:    [[GEP_8:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 8
+; CHECK-NEXT:    [[Q_8_VAL:%.*]] = load i64, ptr [[GEP_8]], align 8
 ; CHECK-NEXT:    call void @use(i16 [[Q_0_VAL]], i64 [[Q_8_VAL]])
 ; CHECK-NEXT:    ret void
 ;

From 9228859c2a5aed307dc61edb4cfd6bee7b4c5949 Mon Sep 17 00:00:00 2001
From: David Stuttard <david.stuttard@amd.com>
Date: Tue, 12 Mar 2024 10:07:02 +0000
Subject: [PATCH 93/95] [CMake] Add tablegen job pool support (#84762)

Add the ability to set the number of tablegen jobs that can run in
parallel, similar to the LLVM_PARALLEL_[COMPILE|LINK]_JOBS options that
already exist.
---
 llvm/cmake/modules/HandleLLVMOptions.cmake | 24 +++++++++++++++++++++-
 llvm/cmake/modules/TableGen.cmake          |  7 +++++++
 llvm/docs/CMake.rst                        |  8 ++++++++
 llvm/docs/GettingStarted.rst               |  6 +++---
 4 files changed, 41 insertions(+), 4 deletions(-)

diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake
index eca2962cf8207..745a8354f1189 100644
--- a/llvm/cmake/modules/HandleLLVMOptions.cmake
+++ b/llvm/cmake/modules/HandleLLVMOptions.cmake
@@ -36,7 +36,7 @@ string(TOUPPER "${LLVM_ENABLE_LTO}" uppercase_LLVM_ENABLE_LTO)
 # The following only works with the Ninja generator in CMake >= 3.0.
 set(LLVM_PARALLEL_COMPILE_JOBS "" CACHE STRING
   "Define the maximum number of concurrent compilation jobs (Ninja only).")
-if(LLVM_RAM_PER_COMPILE_JOB OR LLVM_RAM_PER_LINK_JOB)
+if(LLVM_RAM_PER_COMPILE_JOB OR LLVM_RAM_PER_LINK_JOB OR LLVM_RAM_PER_TABLEGEN_JOB)
   cmake_host_system_information(RESULT available_physical_memory QUERY AVAILABLE_PHYSICAL_MEMORY)
   cmake_host_system_information(RESULT number_of_logical_cores QUERY NUMBER_OF_LOGICAL_CORES)
 endif()
@@ -86,6 +86,28 @@ elseif(LLVM_PARALLEL_LINK_JOBS)
   message(WARNING "Job pooling is only available with Ninja generators.")
 endif()
 
+set(LLVM_PARALLEL_TABLEGEN_JOBS "" CACHE STRING
+  "Define the maximum number of concurrent tablegen jobs (Ninja only).")
+if(LLVM_RAM_PER_TABLEGEN_JOB)
+  math(EXPR jobs_with_sufficient_memory "${available_physical_memory} / ${LLVM_RAM_PER_TABLEGEN_JOB}" OUTPUT_FORMAT DECIMAL)
+  if (jobs_with_sufficient_memory LESS 1)
+    set(jobs_with_sufficient_memory 1)
+  endif()
+  if (jobs_with_sufficient_memory LESS number_of_logical_cores)
+    set(LLVM_PARALLEL_TABLEGEN_JOBS "${jobs_with_sufficient_memory}")
+  else()
+    set(LLVM_PARALLEL_TABLEGEN_JOBS "${number_of_logical_cores}")
+  endif()
+endif()
+if(LLVM_PARALLEL_TABLEGEN_JOBS)
+  if(NOT CMAKE_GENERATOR MATCHES "Ninja")
+    message(WARNING "Job pooling is only available with Ninja generators.")
+  else()
+    set_property(GLOBAL APPEND PROPERTY JOB_POOLS tablegen_job_pool=${LLVM_PARALLEL_TABLEGEN_JOBS})
+    # Job pool for tablegen is set on the add_custom_command
+  endif()
+endif()
+
 if( LLVM_ENABLE_ASSERTIONS )
   # MSVC doesn't like _DEBUG on release builds. See PR 4379.
   if( NOT MSVC )
diff --git a/llvm/cmake/modules/TableGen.cmake b/llvm/cmake/modules/TableGen.cmake
index 1d18fdde2bb98..df91598c404f5 100644
--- a/llvm/cmake/modules/TableGen.cmake
+++ b/llvm/cmake/modules/TableGen.cmake
@@ -125,6 +125,12 @@ function(tablegen project ofn)
   set(tablegen_exe ${${project}_TABLEGEN_EXE})
   set(tablegen_depends ${${project}_TABLEGEN_TARGET} ${tablegen_exe})
 
+  if(LLVM_PARALLEL_TABLEGEN_JOBS)
+    set(LLVM_TABLEGEN_JOB_POOL JOB_POOL tablegen_job_pool)
+  else()
+    set(LLVM_TABLEGEN_JOB_POOL "")
+  endif()
+
   add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${ofn}
     COMMAND ${tablegen_exe} ${ARG_UNPARSED_ARGUMENTS} -I ${CMAKE_CURRENT_SOURCE_DIR}
     ${tblgen_includes}
@@ -139,6 +145,7 @@ function(tablegen project ofn)
       ${local_tds} ${global_tds}
     ${LLVM_TARGET_DEFINITIONS_ABSOLUTE}
     ${LLVM_TARGET_DEPENDS}
+    ${LLVM_TABLEGEN_JOB_POOL}
     COMMENT "Building ${ofn}..."
     )
 
diff --git a/llvm/docs/CMake.rst b/llvm/docs/CMake.rst
index 1490b38feb1eb..d2f66d71d39af 100644
--- a/llvm/docs/CMake.rst
+++ b/llvm/docs/CMake.rst
@@ -762,6 +762,9 @@ enabled sub-projects. Nearly all of these variable names begin with
 **LLVM_PARALLEL_LINK_JOBS**:STRING
   Define the maximum number of concurrent link jobs.
 
+**LLVM_PARALLEL_TABLEGEN_JOBS**:STRING
+  Define the maximum number of concurrent tablegen jobs.
+
 **LLVM_RAM_PER_COMPILE_JOB**:STRING
   Calculates the amount of Ninja compile jobs according to available resources.
   Value has to be in MB, overwrites LLVM_PARALLEL_COMPILE_JOBS. Compile jobs 
@@ -775,6 +778,11 @@ enabled sub-projects. Nearly all of these variable names begin with
   to be sure its not terminated in your memory restricted environment. On ELF
   platforms also consider ``LLVM_USE_SPLIT_DWARF`` in Debug build.
 
+**LLVM_RAM_PER_TABLEGEN_JOB**:STRING
+  Calculates the number of Ninja tablegen jobs according to available resources.
+  Value has to be in MB, overrides LLVM_PARALLEL_TABLEGEN_JOBS. The number of
+  tablegen jobs will be between one and the number of logical cores.
+
 **LLVM_PROFDATA_FILE**:PATH
   Path to a profdata file to pass into clang's -fprofile-instr-use flag. This
   can only be specified if you're building with clang.
diff --git a/llvm/docs/GettingStarted.rst b/llvm/docs/GettingStarted.rst
index 7634199babbad..705f6427d9ed5 100644
--- a/llvm/docs/GettingStarted.rst
+++ b/llvm/docs/GettingStarted.rst
@@ -90,11 +90,11 @@ Getting the Source Code and Building LLVM
        is installed on your system. This can dramatically speed up link times
        if the default linker is slow.
 
-     * ``-DLLVM_PARALLEL_{COMPILE,LINK}_JOBS=N`` --- Limit the number of
-       compile/link jobs running in parallel at the same time. This is
+     * ``-DLLVM_PARALLEL_{COMPILE,LINK,TABLEGEN}_JOBS=N`` --- Limit the number of
+       compile/link/tablegen jobs running in parallel at the same time. This is
        especially important for linking since linking can use lots of memory. If
        you run into memory issues building LLVM, try setting this to limit the
-       maximum number of compile/link jobs running at the same time.
+       maximum number of compile/link/tablegen jobs running at the same time.
 
    * ``cmake --build build [--target <target>]`` or the build system specified
      above directly.

From ce1fd9281707c2163728085d126ff83041e1db51 Mon Sep 17 00:00:00 2001
From: Danial Klimkin <dklimkin@google.com>
Date: Tue, 12 Mar 2024 11:19:48 +0100
Subject: [PATCH 94/95] Update test past
 bdbad0d07bb600301cb324e87a6be37ca4af591a (#84889)

---
 .../data-formatter/builtin-formats/TestBuiltinFormats.py        | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lldb/test/API/functionalities/data-formatter/builtin-formats/TestBuiltinFormats.py b/lldb/test/API/functionalities/data-formatter/builtin-formats/TestBuiltinFormats.py
index 8c3bdabeaac1b..4d6f44db0195b 100644
--- a/lldb/test/API/functionalities/data-formatter/builtin-formats/TestBuiltinFormats.py
+++ b/lldb/test/API/functionalities/data-formatter/builtin-formats/TestBuiltinFormats.py
@@ -308,5 +308,5 @@ def test_pointer(self):
     @no_debug_info_test
     def test_instruction(self):
         self.assertIn(
-            "  addq   0xa(%rdi), %r8\n", self.getFormatted("instruction", "0x0a47034c")
+            "= addq   0xa(%rdi), %r8\n", self.getFormatted("instruction", "0x0a47034c")
         )

From 9997e0397156ff7e01aecbd17bdeb7bfe5fb15b0 Mon Sep 17 00:00:00 2001
From: Orlando Cazalet-Hyams <orlando.hyams@sony.com>
Date: Tue, 12 Mar 2024 10:25:58 +0000
Subject: [PATCH 95/95] [RemoveDIs] Update DIBuilder to conditionally insert
 DbgRecords (#84739)

Have DIBuilder conditionally insert either debug intrinsics or DbgRecords
depending on the module's IsNewDbgInfoFormat flag. The insertion methods
now return a `DbgInstPtr` (a
`PointerUnion<Instruction *, DbgRecord *>`).

Add a unittest for both modes (I couldn't find an existing test testing
insertion behaviours specifically).

This patch changes the existing assumption that DbgRecords are only ever
inserted if there's an instruction to insert-before because clang
currently inserts debug intrinsics while CodeGening (like any other
instruction) meaning it'll try inserting to the end of a block without a
terminator. We already have machinery in place to maintain the
DbgRecords when a terminator is removed - these become "trailing
DbgRecords" which are re-attached when a new instruction is inserted.
All I've done is allow this state to occur while inserting DbgRecords
too, i.e., it's not only removing terminators that causes this valid
transient state, but inserting DbgRecords into incomplete blocks too.

The C API will be updated in follow up patches.

---

Note: this doesn't mean clang is emitting DbgRecords yet, because the
modules it creates are still always in the old debug mode. That will
come in a future patch.
---
 llvm/include/llvm/IR/DIBuilder.h              |  73 +++++----
 llvm/lib/IR/BasicBlock.cpp                    |  13 +-
 llvm/lib/IR/DIBuilder.cpp                     | 131 ++++++++++-----
 llvm/lib/IR/DebugInfo.cpp                     |  89 +++++-----
 llvm/lib/IR/Instruction.cpp                   |   3 +-
 llvm/lib/Transforms/Scalar/SROA.cpp           |  47 +++---
 llvm/lib/Transforms/Utils/Local.cpp           |  12 +-
 .../Utils/PromoteMemoryToRegister.cpp         |  25 ++-
 llvm/unittests/IR/IRBuilderTest.cpp           | 152 +++++++++++++++---
 9 files changed, 358 insertions(+), 187 deletions(-)

diff --git a/llvm/include/llvm/IR/DIBuilder.h b/llvm/include/llvm/IR/DIBuilder.h
index edec161b39715..94af17af8160e 100644
--- a/llvm/include/llvm/IR/DIBuilder.h
+++ b/llvm/include/llvm/IR/DIBuilder.h
@@ -38,6 +38,9 @@ namespace llvm {
   class Module;
   class Value;
   class DbgAssignIntrinsic;
+  class DbgRecord;
+
+  using DbgInstPtr = PointerUnion<Instruction *, DbgRecord *>;
 
   class DIBuilder {
     Module &M;
@@ -90,13 +93,17 @@ namespace llvm {
     void trackIfUnresolved(MDNode *N);
 
     /// Internal helper for insertDeclare.
-    Instruction *insertDeclare(llvm::Value *Storage, DILocalVariable *VarInfo,
-                               DIExpression *Expr, const DILocation *DL,
-                               BasicBlock *InsertBB, Instruction *InsertBefore);
+    DbgInstPtr insertDeclare(llvm::Value *Storage, DILocalVariable *VarInfo,
+                             DIExpression *Expr, const DILocation *DL,
+                             BasicBlock *InsertBB, Instruction *InsertBefore);
 
     /// Internal helper for insertLabel.
-    Instruction *insertLabel(DILabel *LabelInfo, const DILocation *DL,
-                             BasicBlock *InsertBB, Instruction *InsertBefore);
+    DbgInstPtr insertLabel(DILabel *LabelInfo, const DILocation *DL,
+                           BasicBlock *InsertBB, Instruction *InsertBefore);
+
+    /// Internal helper. Track metadata if untracked and insert \p DPV.
+    void insertDPValue(DPValue *DPV, BasicBlock *InsertBB,
+                       Instruction *InsertBefore, bool InsertAtHead = false);
 
     /// Internal helper with common code used by insertDbg{Value,Addr}Intrinsic.
     Instruction *insertDbgIntrinsic(llvm::Function *Intrinsic, llvm::Value *Val,
@@ -106,10 +113,11 @@ namespace llvm {
                                     Instruction *InsertBefore);
 
     /// Internal helper for insertDbgValueIntrinsic.
-    Instruction *
-    insertDbgValueIntrinsic(llvm::Value *Val, DILocalVariable *VarInfo,
-                            DIExpression *Expr, const DILocation *DL,
-                            BasicBlock *InsertBB, Instruction *InsertBefore);
+    DbgInstPtr insertDbgValueIntrinsic(llvm::Value *Val,
+                                       DILocalVariable *VarInfo,
+                                       DIExpression *Expr, const DILocation *DL,
+                                       BasicBlock *InsertBB,
+                                       Instruction *InsertBefore);
 
   public:
     /// Construct a builder for a module.
@@ -921,9 +929,9 @@ namespace llvm {
     /// \param Expr        A complex location expression.
     /// \param DL          Debug info location.
     /// \param InsertAtEnd Location for the new intrinsic.
-    Instruction *insertDeclare(llvm::Value *Storage, DILocalVariable *VarInfo,
-                               DIExpression *Expr, const DILocation *DL,
-                               BasicBlock *InsertAtEnd);
+    DbgInstPtr insertDeclare(llvm::Value *Storage, DILocalVariable *VarInfo,
+                             DIExpression *Expr, const DILocation *DL,
+                             BasicBlock *InsertAtEnd);
 
     /// Insert a new llvm.dbg.assign intrinsic call.
     /// \param LinkedInstr   Instruction with a DIAssignID to link with the new
@@ -939,11 +947,10 @@ namespace llvm {
     /// \param DL            Debug info location, usually: (line: 0,
     ///                      column: 0, scope: var-decl-scope). See
     ///                      getDebugValueLoc.
-    DbgAssignIntrinsic *insertDbgAssign(Instruction *LinkedInstr, Value *Val,
-                                        DILocalVariable *SrcVar,
-                                        DIExpression *ValExpr, Value *Addr,
-                                        DIExpression *AddrExpr,
-                                        const DILocation *DL);
+    DbgInstPtr insertDbgAssign(Instruction *LinkedInstr, Value *Val,
+                               DILocalVariable *SrcVar, DIExpression *ValExpr,
+                               Value *Addr, DIExpression *AddrExpr,
+                               const DILocation *DL);
 
     /// Insert a new llvm.dbg.declare intrinsic call.
     /// \param Storage      llvm::Value of the variable
@@ -951,23 +958,23 @@ namespace llvm {
     /// \param Expr         A complex location expression.
     /// \param DL           Debug info location.
     /// \param InsertBefore Location for the new intrinsic.
-    Instruction *insertDeclare(llvm::Value *Storage, DILocalVariable *VarInfo,
-                               DIExpression *Expr, const DILocation *DL,
-                               Instruction *InsertBefore);
+    DbgInstPtr insertDeclare(llvm::Value *Storage, DILocalVariable *VarInfo,
+                             DIExpression *Expr, const DILocation *DL,
+                             Instruction *InsertBefore);
 
     /// Insert a new llvm.dbg.label intrinsic call.
     /// \param LabelInfo    Label's debug info descriptor.
     /// \param DL           Debug info location.
     /// \param InsertBefore Location for the new intrinsic.
-    Instruction *insertLabel(DILabel *LabelInfo, const DILocation *DL,
-                             Instruction *InsertBefore);
+    DbgInstPtr insertLabel(DILabel *LabelInfo, const DILocation *DL,
+                           Instruction *InsertBefore);
 
     /// Insert a new llvm.dbg.label intrinsic call.
     /// \param LabelInfo    Label's debug info descriptor.
     /// \param DL           Debug info location.
     /// \param InsertAtEnd Location for the new intrinsic.
-    Instruction *insertLabel(DILabel *LabelInfo, const DILocation *DL,
-                             BasicBlock *InsertAtEnd);
+    DbgInstPtr insertLabel(DILabel *LabelInfo, const DILocation *DL,
+                           BasicBlock *InsertAtEnd);
 
     /// Insert a new llvm.dbg.value intrinsic call.
     /// \param Val          llvm::Value of the variable
@@ -975,11 +982,10 @@ namespace llvm {
     /// \param Expr         A complex location expression.
     /// \param DL           Debug info location.
     /// \param InsertAtEnd Location for the new intrinsic.
-    Instruction *insertDbgValueIntrinsic(llvm::Value *Val,
-                                         DILocalVariable *VarInfo,
-                                         DIExpression *Expr,
-                                         const DILocation *DL,
-                                         BasicBlock *InsertAtEnd);
+    DbgInstPtr insertDbgValueIntrinsic(llvm::Value *Val,
+                                       DILocalVariable *VarInfo,
+                                       DIExpression *Expr, const DILocation *DL,
+                                       BasicBlock *InsertAtEnd);
 
     /// Insert a new llvm.dbg.value intrinsic call.
     /// \param Val          llvm::Value of the variable
@@ -987,11 +993,10 @@ namespace llvm {
     /// \param Expr         A complex location expression.
     /// \param DL           Debug info location.
     /// \param InsertBefore Location for the new intrinsic.
-    Instruction *insertDbgValueIntrinsic(llvm::Value *Val,
-                                         DILocalVariable *VarInfo,
-                                         DIExpression *Expr,
-                                         const DILocation *DL,
-                                         Instruction *InsertBefore);
+    DbgInstPtr insertDbgValueIntrinsic(llvm::Value *Val,
+                                       DILocalVariable *VarInfo,
+                                       DIExpression *Expr, const DILocation *DL,
+                                       Instruction *InsertBefore);
 
     /// Replace the vtable holder in the given type.
     ///
diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp
index c188d2f912d16..673e2f68249cd 100644
--- a/llvm/lib/IR/BasicBlock.cpp
+++ b/llvm/lib/IR/BasicBlock.cpp
@@ -754,8 +754,6 @@ void BasicBlock::spliceDebugInfoEmptyBlock(BasicBlock::iterator Dest,
   // occur when a block is optimised away and the terminator has been moved
   // somewhere else.
   if (Src->empty()) {
-    assert(Dest != end() &&
-           "Transferring trailing DPValues to another trailing position");
     DPMarker *SrcTrailingDPValues = Src->getTrailingDPValues();
     if (!SrcTrailingDPValues)
       return;
@@ -1040,15 +1038,10 @@ void BasicBlock::insertDPValueAfter(DbgRecord *DPV, Instruction *I) {
 
 void BasicBlock::insertDPValueBefore(DbgRecord *DPV,
                                      InstListType::iterator Where) {
-  // We should never directly insert at the end of the block, new DPValues
-  // shouldn't be generated at times when there's no terminator.
-  assert(Where != end());
-  assert(Where->getParent() == this);
-  if (!Where->DbgMarker)
-    createMarker(Where);
+  assert(Where == end() || Where->getParent() == this);
   bool InsertAtHead = Where.getHeadBit();
-  createMarker(&*Where);
-  Where->DbgMarker->insertDPValue(DPV, InsertAtHead);
+  DPMarker *M = createMarker(Where);
+  M->insertDPValue(DPV, InsertAtHead);
 }
 
 DPMarker *BasicBlock::getNextMarker(Instruction *I) {
diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp
index 62efaba025344..c0643f63c9725 100644
--- a/llvm/lib/IR/DIBuilder.cpp
+++ b/llvm/lib/IR/DIBuilder.cpp
@@ -925,35 +925,47 @@ DILexicalBlock *DIBuilder::createLexicalBlock(DIScope *Scope, DIFile *File,
                                      File, Line, Col);
 }
 
-Instruction *DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo,
-                                      DIExpression *Expr, const DILocation *DL,
-                                      Instruction *InsertBefore) {
+DbgInstPtr DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo,
+                                    DIExpression *Expr, const DILocation *DL,
+                                    Instruction *InsertBefore) {
   return insertDeclare(Storage, VarInfo, Expr, DL, InsertBefore->getParent(),
                        InsertBefore);
 }
 
-Instruction *DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo,
-                                      DIExpression *Expr, const DILocation *DL,
-                                      BasicBlock *InsertAtEnd) {
+DbgInstPtr DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo,
+                                    DIExpression *Expr, const DILocation *DL,
+                                    BasicBlock *InsertAtEnd) {
   // If this block already has a terminator then insert this intrinsic before
   // the terminator. Otherwise, put it at the end of the block.
   Instruction *InsertBefore = InsertAtEnd->getTerminator();
   return insertDeclare(Storage, VarInfo, Expr, DL, InsertAtEnd, InsertBefore);
 }
 
-DbgAssignIntrinsic *
-DIBuilder::insertDbgAssign(Instruction *LinkedInstr, Value *Val,
-                           DILocalVariable *SrcVar, DIExpression *ValExpr,
-                           Value *Addr, DIExpression *AddrExpr,
-                           const DILocation *DL) {
+DbgInstPtr DIBuilder::insertDbgAssign(Instruction *LinkedInstr, Value *Val,
+                                      DILocalVariable *SrcVar,
+                                      DIExpression *ValExpr, Value *Addr,
+                                      DIExpression *AddrExpr,
+                                      const DILocation *DL) {
+  auto *Link = cast_or_null<DIAssignID>(
+      LinkedInstr->getMetadata(LLVMContext::MD_DIAssignID));
+  assert(Link && "Linked instruction must have DIAssign metadata attached");
+
+  if (M.IsNewDbgInfoFormat) {
+    DPValue *DPV = DPValue::createDPVAssign(Val, SrcVar, ValExpr, Link, Addr,
+                                            AddrExpr, DL);
+    BasicBlock *InsertBB = LinkedInstr->getParent();
+    // Insert after LinkedInstr.
+    BasicBlock::iterator NextIt = std::next(LinkedInstr->getIterator());
+    Instruction *InsertBefore = NextIt == InsertBB->end() ? nullptr : &*NextIt;
+    insertDPValue(DPV, InsertBB, InsertBefore, true);
+    return DPV;
+  }
+
   LLVMContext &Ctx = LinkedInstr->getContext();
   Module *M = LinkedInstr->getModule();
   if (!AssignFn)
     AssignFn = Intrinsic::getDeclaration(M, Intrinsic::dbg_assign);
 
-  auto *Link = LinkedInstr->getMetadata(LLVMContext::MD_DIAssignID);
-  assert(Link && "Linked instruction must have DIAssign metadata attached");
-
   std::array<Value *, 6> Args = {
       MetadataAsValue::get(Ctx, ValueAsMetadata::get(Val)),
       MetadataAsValue::get(Ctx, SrcVar),
@@ -971,35 +983,36 @@ DIBuilder::insertDbgAssign(Instruction *LinkedInstr, Value *Val,
   return DVI;
 }
 
-Instruction *DIBuilder::insertLabel(DILabel *LabelInfo, const DILocation *DL,
-                                    Instruction *InsertBefore) {
+DbgInstPtr DIBuilder::insertLabel(DILabel *LabelInfo, const DILocation *DL,
+                                  Instruction *InsertBefore) {
   return insertLabel(LabelInfo, DL,
                      InsertBefore ? InsertBefore->getParent() : nullptr,
                      InsertBefore);
 }
 
-Instruction *DIBuilder::insertLabel(DILabel *LabelInfo, const DILocation *DL,
-                                    BasicBlock *InsertAtEnd) {
+DbgInstPtr DIBuilder::insertLabel(DILabel *LabelInfo, const DILocation *DL,
+                                  BasicBlock *InsertAtEnd) {
   return insertLabel(LabelInfo, DL, InsertAtEnd, nullptr);
 }
 
-Instruction *DIBuilder::insertDbgValueIntrinsic(Value *V,
-                                                DILocalVariable *VarInfo,
-                                                DIExpression *Expr,
-                                                const DILocation *DL,
-                                                Instruction *InsertBefore) {
-  Instruction *DVI = insertDbgValueIntrinsic(
+DbgInstPtr DIBuilder::insertDbgValueIntrinsic(Value *V,
+                                              DILocalVariable *VarInfo,
+                                              DIExpression *Expr,
+                                              const DILocation *DL,
+                                              Instruction *InsertBefore) {
+  DbgInstPtr DVI = insertDbgValueIntrinsic(
       V, VarInfo, Expr, DL, InsertBefore ? InsertBefore->getParent() : nullptr,
       InsertBefore);
-  cast<CallInst>(DVI)->setTailCall();
+  if (DVI.is<Instruction *>())
+    cast<CallInst>(DVI.get<Instruction *>())->setTailCall();
   return DVI;
 }
 
-Instruction *DIBuilder::insertDbgValueIntrinsic(Value *V,
-                                                DILocalVariable *VarInfo,
-                                                DIExpression *Expr,
-                                                const DILocation *DL,
-                                                BasicBlock *InsertAtEnd) {
+DbgInstPtr DIBuilder::insertDbgValueIntrinsic(Value *V,
+                                              DILocalVariable *VarInfo,
+                                              DIExpression *Expr,
+                                              const DILocation *DL,
+                                              BasicBlock *InsertAtEnd) {
   return insertDbgValueIntrinsic(V, VarInfo, Expr, DL, InsertAtEnd, nullptr);
 }
 
@@ -1023,24 +1036,37 @@ static Function *getDeclareIntrin(Module &M) {
   return Intrinsic::getDeclaration(&M, Intrinsic::dbg_declare);
 }
 
-Instruction *DIBuilder::insertDbgValueIntrinsic(
+DbgInstPtr DIBuilder::insertDbgValueIntrinsic(
     llvm::Value *Val, DILocalVariable *VarInfo, DIExpression *Expr,
     const DILocation *DL, BasicBlock *InsertBB, Instruction *InsertBefore) {
+  if (M.IsNewDbgInfoFormat) {
+    DPValue *DPV = DPValue::createDPValue(Val, VarInfo, Expr, DL);
+    insertDPValue(DPV, InsertBB, InsertBefore);
+    return DPV;
+  }
+
   if (!ValueFn)
     ValueFn = Intrinsic::getDeclaration(&M, Intrinsic::dbg_value);
   return insertDbgIntrinsic(ValueFn, Val, VarInfo, Expr, DL, InsertBB,
                             InsertBefore);
 }
 
-Instruction *DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo,
-                                      DIExpression *Expr, const DILocation *DL,
-                                      BasicBlock *InsertBB,
-                                      Instruction *InsertBefore) {
+DbgInstPtr DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo,
+                                    DIExpression *Expr, const DILocation *DL,
+                                    BasicBlock *InsertBB,
+                                    Instruction *InsertBefore) {
   assert(VarInfo && "empty or invalid DILocalVariable* passed to dbg.declare");
   assert(DL && "Expected debug loc");
   assert(DL->getScope()->getSubprogram() ==
              VarInfo->getScope()->getSubprogram() &&
          "Expected matching subprograms");
+
+  if (M.IsNewDbgInfoFormat) {
+    DPValue *DPV = DPValue::createDPVDeclare(Storage, VarInfo, Expr, DL);
+    insertDPValue(DPV, InsertBB, InsertBefore);
+    return DPV;
+  }
+
   if (!DeclareFn)
     DeclareFn = getDeclareIntrin(M);
 
@@ -1055,6 +1081,23 @@ Instruction *DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo,
   return B.CreateCall(DeclareFn, Args);
 }
 
+/// Place \p DPV before \p InsertBefore if given, else at \p InsertBB's end.
+void DIBuilder::insertDPValue(DPValue *DPV, BasicBlock *InsertBB,
+                              Instruction *InsertBefore, bool InsertAtHead) {
+  assert(InsertBefore || InsertBB);
+  trackIfUnresolved(DPV->getVariable());
+  trackIfUnresolved(DPV->getExpression());
+  if (DPV->isDbgAssign())
+    trackIfUnresolved(DPV->getAddressExpression());
+  // The assert permits a null InsertBB; recover it instead of dereferencing.
+  if (!InsertBB)
+    InsertBB = InsertBefore->getParent();
+  BasicBlock::iterator InsertPt =
+      InsertBefore ? InsertBefore->getIterator() : InsertBB->end();
+  InsertPt.setHeadBit(InsertAtHead);
+  InsertBB->insertDPValueBefore(DPV, InsertPt);
+}
+
 Instruction *DIBuilder::insertDbgIntrinsic(llvm::Function *IntrinsicFn,
                                            Value *V, DILocalVariable *VarInfo,
                                            DIExpression *Expr,
@@ -1081,18 +1124,28 @@ Instruction *DIBuilder::insertDbgIntrinsic(llvm::Function *IntrinsicFn,
   return B.CreateCall(IntrinsicFn, Args);
 }
 
-Instruction *DIBuilder::insertLabel(DILabel *LabelInfo, const DILocation *DL,
-                                    BasicBlock *InsertBB,
-                                    Instruction *InsertBefore) {
+DbgInstPtr DIBuilder::insertLabel(DILabel *LabelInfo, const DILocation *DL,
+                                  BasicBlock *InsertBB,
+                                  Instruction *InsertBefore) {
   assert(LabelInfo && "empty or invalid DILabel* passed to dbg.label");
   assert(DL && "Expected debug loc");
   assert(DL->getScope()->getSubprogram() ==
              LabelInfo->getScope()->getSubprogram() &&
          "Expected matching subprograms");
+
+  trackIfUnresolved(LabelInfo);
+  if (M.IsNewDbgInfoFormat) {
+    DPLabel *DPL = new DPLabel(LabelInfo, DL);
+    if (InsertBB && InsertBefore)
+      InsertBB->insertDPValueBefore(DPL, InsertBefore->getIterator());
+    else if (InsertBB)
+      InsertBB->insertDPValueBefore(DPL, InsertBB->end());
+    return DPL;
+  }
+
   if (!LabelFn)
     LabelFn = Intrinsic::getDeclaration(&M, Intrinsic::dbg_label);
 
-  trackIfUnresolved(LabelInfo);
   Value *Args[] = {MetadataAsValue::get(VMContext, LabelInfo)};
 
   IRBuilder<> B(DL->getContext());
diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp
index 1f3ff2246a445..68fd244e25697 100644
--- a/llvm/lib/IR/DebugInfo.cpp
+++ b/llvm/lib/IR/DebugInfo.cpp
@@ -1663,43 +1663,47 @@ LLVMValueRef
 LLVMDIBuilderInsertDeclareBefore(LLVMDIBuilderRef Builder, LLVMValueRef Storage,
                                  LLVMMetadataRef VarInfo, LLVMMetadataRef Expr,
                                  LLVMMetadataRef DL, LLVMValueRef Instr) {
-  return wrap(unwrap(Builder)->insertDeclare(
-                  unwrap(Storage), unwrap<DILocalVariable>(VarInfo),
-                  unwrap<DIExpression>(Expr), unwrap<DILocation>(DL),
-                  unwrap<Instruction>(Instr)));
-}
-
-LLVMValueRef LLVMDIBuilderInsertDeclareAtEnd(
-    LLVMDIBuilderRef Builder, LLVMValueRef Storage, LLVMMetadataRef VarInfo,
-    LLVMMetadataRef Expr, LLVMMetadataRef DL, LLVMBasicBlockRef Block) {
-  return wrap(unwrap(Builder)->insertDeclare(
-                  unwrap(Storage), unwrap<DILocalVariable>(VarInfo),
-                  unwrap<DIExpression>(Expr), unwrap<DILocation>(DL),
-                  unwrap(Block)));
-}
-
-LLVMValueRef LLVMDIBuilderInsertDbgValueBefore(LLVMDIBuilderRef Builder,
-                                               LLVMValueRef Val,
-                                               LLVMMetadataRef VarInfo,
-                                               LLVMMetadataRef Expr,
-                                               LLVMMetadataRef DebugLoc,
-                                               LLVMValueRef Instr) {
-  return wrap(unwrap(Builder)->insertDbgValueIntrinsic(
-                  unwrap(Val), unwrap<DILocalVariable>(VarInfo),
-                  unwrap<DIExpression>(Expr), unwrap<DILocation>(DebugLoc),
-                  unwrap<Instruction>(Instr)));
-}
-
-LLVMValueRef LLVMDIBuilderInsertDbgValueAtEnd(LLVMDIBuilderRef Builder,
-                                              LLVMValueRef Val,
-                                              LLVMMetadataRef VarInfo,
-                                              LLVMMetadataRef Expr,
-                                              LLVMMetadataRef DebugLoc,
-                                              LLVMBasicBlockRef Block) {
-  return wrap(unwrap(Builder)->insertDbgValueIntrinsic(
-                  unwrap(Val), unwrap<DILocalVariable>(VarInfo),
-                  unwrap<DIExpression>(Expr), unwrap<DILocation>(DebugLoc),
-                  unwrap(Block)));
+  DbgInstPtr DbgInst = unwrap(Builder)->insertDeclare(
+      unwrap(Storage), unwrap<DILocalVariable>(VarInfo),
+      unwrap<DIExpression>(Expr), unwrap<DILocation>(DL),
+      unwrap<Instruction>(Instr));
+  assert(isa<Instruction *>(DbgInst) &&
+         "Inserted a DbgRecord into function using old debug info mode");
+  return wrap(cast<Instruction *>(DbgInst));
+}
+
+LLVMValueRef
+LLVMDIBuilderInsertDeclareAtEnd(LLVMDIBuilderRef Builder, LLVMValueRef Storage,
+                                LLVMMetadataRef VarInfo, LLVMMetadataRef Expr,
+                                LLVMMetadataRef DL, LLVMBasicBlockRef Block) {
+  DbgInstPtr DbgInst = unwrap(Builder)->insertDeclare(
+      unwrap(Storage), unwrap<DILocalVariable>(VarInfo),
+      unwrap<DIExpression>(Expr), unwrap<DILocation>(DL), unwrap(Block));
+  assert(isa<Instruction *>(DbgInst) &&
+         "Inserted a DbgRecord into function using old debug info mode");
+  return wrap(cast<Instruction *>(DbgInst));
+}
+
+LLVMValueRef LLVMDIBuilderInsertDbgValueBefore(
+    LLVMDIBuilderRef Builder, LLVMValueRef Val, LLVMMetadataRef VarInfo,
+    LLVMMetadataRef Expr, LLVMMetadataRef DebugLoc, LLVMValueRef Instr) {
+  DbgInstPtr DbgInst = unwrap(Builder)->insertDbgValueIntrinsic(
+      unwrap(Val), unwrap<DILocalVariable>(VarInfo), unwrap<DIExpression>(Expr),
+      unwrap<DILocation>(DebugLoc), unwrap<Instruction>(Instr));
+  assert(isa<Instruction *>(DbgInst) &&
+         "Inserted a DbgRecord into function using old debug info mode");
+  return wrap(cast<Instruction *>(DbgInst));
+}
+
+LLVMValueRef LLVMDIBuilderInsertDbgValueAtEnd(
+    LLVMDIBuilderRef Builder, LLVMValueRef Val, LLVMMetadataRef VarInfo,
+    LLVMMetadataRef Expr, LLVMMetadataRef DebugLoc, LLVMBasicBlockRef Block) {
+  DbgInstPtr DbgInst = unwrap(Builder)->insertDbgValueIntrinsic(
+      unwrap(Val), unwrap<DILocalVariable>(VarInfo), unwrap<DIExpression>(Expr),
+      unwrap<DILocation>(DebugLoc), unwrap(Block));
+  assert(isa<Instruction *>(DbgInst) &&
+         "Inserted a DbgRecord into function using old debug info mode");
+  return wrap(cast<Instruction *>(DbgInst));
 }
 
 LLVMMetadataRef LLVMDIBuilderCreateAutoVariable(
@@ -2115,10 +2119,15 @@ static void emitDbgAssign(AssignmentInfo Info, Value *Val, Value *Dest,
     LLVM_DEBUG(if (Assign) errs() << " > INSERT: " << *Assign << "\n");
     return;
   }
-  auto *Assign = DIB.insertDbgAssign(&StoreLikeInst, Val, VarRec.Var, Expr,
-                                     Dest, AddrExpr, VarRec.DL);
+  auto Assign = DIB.insertDbgAssign(&StoreLikeInst, Val, VarRec.Var, Expr, Dest,
+                                    AddrExpr, VarRec.DL);
   (void)Assign;
-  LLVM_DEBUG(if (Assign) errs() << " > INSERT: " << *Assign << "\n");
+  LLVM_DEBUG(if (!Assign.isNull()) {
+    if (Assign.is<DbgRecord *>())
+      errs() << " > INSERT: " << *Assign.get<DbgRecord *>() << "\n";
+    else
+      errs() << " > INSERT: " << *Assign.get<Instruction *>() << "\n";
+  });
 }
 
 #undef DEBUG_TYPE // Silence redefinition warning (from ConstantsContext.h).
diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
index e863ef3eb8d6d..6b8c6e0c85ed9 100644
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -166,7 +166,8 @@ void Instruction::insertBefore(BasicBlock &BB,
   }
 
   // If we're inserting a terminator, check if we need to flush out
-  // TrailingDPValues.
+  // TrailingDPValues. Inserting instructions at the end of an incomplete
+  // block is handled by the code block above.
   if (isTerminator())
     getParent()->flushTerminatorDbgValues();
 }
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index e11b984f13bbc..190fee11618bf 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -324,23 +324,16 @@ static DebugVariable getAggregateVariable(DPValue *DPV) {
                        DPV->getDebugLoc().getInlinedAt());
 }
 
-static DPValue *createLinkedAssign(DPValue *, DIBuilder &DIB,
-                                   Instruction *LinkedInstr, Value *NewValue,
-                                   DILocalVariable *Variable,
-                                   DIExpression *Expression, Value *Address,
-                                   DIExpression *AddressExpression,
-                                   const DILocation *DI) {
-  (void)DIB;
-  return DPValue::createLinkedDPVAssign(LinkedInstr, NewValue, Variable,
-                                        Expression, Address, AddressExpression,
-                                        DI);
+/// Helpers for handling new and old debug info modes in migrateDebugInfo.
+/// These overloads unwrap a DbgInstPtr {Instruction* | DbgRecord*} union based
+/// on the \p Unused parameter type.
+static DPValue *UnwrapDbgInstPtr(DbgInstPtr P,
+                                 [[maybe_unused]] DPValue *Unused) {
+  return static_cast<DPValue *>(cast<DbgRecord *>(P));
 }
-static DbgAssignIntrinsic *createLinkedAssign(
-    DbgAssignIntrinsic *, DIBuilder &DIB, Instruction *LinkedInstr,
-    Value *NewValue, DILocalVariable *Variable, DIExpression *Expression,
-    Value *Address, DIExpression *AddressExpression, const DILocation *DI) {
-  return DIB.insertDbgAssign(LinkedInstr, NewValue, Variable, Expression,
-                             Address, AddressExpression, DI);
+static DbgAssignIntrinsic *
+UnwrapDbgInstPtr(DbgInstPtr P, [[maybe_unused]] DbgAssignIntrinsic *Unused) {
+  return static_cast<DbgAssignIntrinsic *>(cast<Instruction *>(P));
 }
 
 /// Find linked dbg.assign and generate a new one with the correct
@@ -398,7 +391,7 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit,
   DIBuilder DIB(*OldInst->getModule(), /*AllowUnresolved*/ false);
   assert(OldAlloca->isStaticAlloca());
 
-  auto MigrateDbgAssign = [&](auto DbgAssign) {
+  auto MigrateDbgAssign = [&](auto *DbgAssign) {
     LLVM_DEBUG(dbgs() << "      existing dbg.assign is: " << *DbgAssign
                       << "\n");
     auto *Expr = DbgAssign->getExpression();
@@ -452,10 +445,12 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit,
     }
 
     ::Value *NewValue = Value ? Value : DbgAssign->getValue();
-    auto *NewAssign = createLinkedAssign(
-        DbgAssign, DIB, Inst, NewValue, DbgAssign->getVariable(), Expr, Dest,
-        DIExpression::get(Expr->getContext(), std::nullopt),
-        DbgAssign->getDebugLoc());
+    auto *NewAssign = UnwrapDbgInstPtr(
+        DIB.insertDbgAssign(Inst, NewValue, DbgAssign->getVariable(), Expr,
+                            Dest,
+                            DIExpression::get(Expr->getContext(), std::nullopt),
+                            DbgAssign->getDebugLoc()),
+        DbgAssign);
 
     // If we've updated the value but the original dbg.assign has an arglist
     // then kill it now - we can't use the requested new value.
@@ -5031,9 +5026,11 @@ static void insertNewDbgInst(DIBuilder &DIB, DbgAssignIntrinsic *Orig,
     NewAddr->setMetadata(LLVMContext::MD_DIAssignID,
                          DIAssignID::getDistinct(NewAddr->getContext()));
   }
-  auto *NewAssign = DIB.insertDbgAssign(
-      NewAddr, Orig->getValue(), Orig->getVariable(), NewFragmentExpr, NewAddr,
-      Orig->getAddressExpression(), Orig->getDebugLoc());
+  Instruction *NewAssign =
+      DIB.insertDbgAssign(NewAddr, Orig->getValue(), Orig->getVariable(),
+                          NewFragmentExpr, NewAddr,
+                          Orig->getAddressExpression(), Orig->getDebugLoc())
+          .get<Instruction *>();
   LLVM_DEBUG(dbgs() << "Created new assign intrinsic: " << *NewAssign << "\n");
   (void)NewAssign;
 }
@@ -5052,7 +5049,7 @@ static void insertNewDbgInst(DIBuilder &DIB, DPValue *Orig, AllocaInst *NewAddr,
     NewAddr->setMetadata(LLVMContext::MD_DIAssignID,
                          DIAssignID::getDistinct(NewAddr->getContext()));
   }
-  auto *NewAssign = DPValue::createLinkedDPVAssign(
+  DPValue *NewAssign = DPValue::createLinkedDPVAssign(
       NewAddr, Orig->getValue(), Orig->getVariable(), NewFragmentExpr, NewAddr,
       Orig->getAddressExpression(), Orig->getDebugLoc());
   LLVM_DEBUG(dbgs() << "Created new DPVAssign: " << *NewAssign << "\n");
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index d3bb89075015e..a44536e34c922 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -1649,9 +1649,9 @@ static void insertDbgValueOrDPValue(DIBuilder &Builder, Value *DV,
                                     const DebugLoc &NewLoc,
                                     BasicBlock::iterator Instr) {
   if (!UseNewDbgInfoFormat) {
-    auto *DbgVal = Builder.insertDbgValueIntrinsic(DV, DIVar, DIExpr, NewLoc,
-                                                   (Instruction *)nullptr);
-    DbgVal->insertBefore(Instr);
+    auto DbgVal = Builder.insertDbgValueIntrinsic(DV, DIVar, DIExpr, NewLoc,
+                                                  (Instruction *)nullptr);
+    DbgVal.get<Instruction *>()->insertBefore(Instr);
   } else {
     // RemoveDIs: if we're using the new debug-info format, allocate a
     // DPValue directly instead of a dbg.value intrinsic.
@@ -1667,9 +1667,9 @@ static void insertDbgValueOrDPValueAfter(DIBuilder &Builder, Value *DV,
                                          const DebugLoc &NewLoc,
                                          BasicBlock::iterator Instr) {
   if (!UseNewDbgInfoFormat) {
-    auto *DbgVal = Builder.insertDbgValueIntrinsic(DV, DIVar, DIExpr, NewLoc,
-                                                   (Instruction *)nullptr);
-    DbgVal->insertAfter(&*Instr);
+    auto DbgVal = Builder.insertDbgValueIntrinsic(DV, DIVar, DIExpr, NewLoc,
+                                                  (Instruction *)nullptr);
+    DbgVal.get<Instruction *>()->insertAfter(&*Instr);
   } else {
     // RemoveDIs: if we're using the new debug-info format, allocate a
     // DPValue directly instead of a dbg.value intrinsic.
diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
index 88b05aab8db4d..b462803bad38c 100644
--- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -101,21 +101,20 @@ bool llvm::isAllocaPromotable(const AllocaInst *AI) {
 
 namespace {
 
-static DPValue *createDebugValue(DIBuilder &DIB, Value *NewValue,
-                                 DILocalVariable *Variable,
-                                 DIExpression *Expression, const DILocation *DI,
-                                 DPValue *InsertBefore) {
+static void createDebugValue(DIBuilder &DIB, Value *NewValue,
+                             DILocalVariable *Variable,
+                             DIExpression *Expression, const DILocation *DI,
+                             DPValue *InsertBefore) {
+  // FIXME: Merge these two functions now that DIBuilder supports DPValues.
+  // We need the API to accept DPValues as an insert point for that to work.
   (void)DIB;
-  return DPValue::createDPValue(NewValue, Variable, Expression, DI,
-                                *InsertBefore);
+  DPValue::createDPValue(NewValue, Variable, Expression, DI, *InsertBefore);
 }
-static DbgValueInst *createDebugValue(DIBuilder &DIB, Value *NewValue,
-                                      DILocalVariable *Variable,
-                                      DIExpression *Expression,
-                                      const DILocation *DI,
-                                      Instruction *InsertBefore) {
-  return static_cast<DbgValueInst *>(DIB.insertDbgValueIntrinsic(
-      NewValue, Variable, Expression, DI, InsertBefore));
+static void createDebugValue(DIBuilder &DIB, Value *NewValue,
+                             DILocalVariable *Variable,
+                             DIExpression *Expression, const DILocation *DI,
+                             Instruction *InsertBefore) {
+  DIB.insertDbgValueIntrinsic(NewValue, Variable, Expression, DI, InsertBefore);
 }
 
 /// Helper for updating assignment tracking debug info when promoting allocas.
diff --git a/llvm/unittests/IR/IRBuilderTest.cpp b/llvm/unittests/IR/IRBuilderTest.cpp
index d15ff9dd51a4c..cece65974c013 100644
--- a/llvm/unittests/IR/IRBuilderTest.cpp
+++ b/llvm/unittests/IR/IRBuilderTest.cpp
@@ -871,25 +871,139 @@ TEST_F(IRBuilderTest, createFunction) {
 }
 
 TEST_F(IRBuilderTest, DIBuilder) {
-  IRBuilder<> Builder(BB);
-  DIBuilder DIB(*M);
-  auto File = DIB.createFile("F.CBL", "/");
-  auto CU = DIB.createCompileUnit(dwarf::DW_LANG_Cobol74,
-                                  DIB.createFile("F.CBL", "/"), "llvm-cobol74",
-                                  true, "", 0);
-  auto Type = DIB.createSubroutineType(DIB.getOrCreateTypeArray(std::nullopt));
-  auto SP = DIB.createFunction(
-      CU, "foo", "", File, 1, Type, 1, DINode::FlagZero,
-      DISubprogram::SPFlagDefinition | DISubprogram::SPFlagOptimized);
-  F->setSubprogram(SP);
-  AllocaInst *I = Builder.CreateAlloca(Builder.getInt8Ty());
-  auto BarSP = DIB.createFunction(
-      CU, "bar", "", File, 1, Type, 1, DINode::FlagZero,
-      DISubprogram::SPFlagDefinition | DISubprogram::SPFlagOptimized);
-  auto BadScope = DIB.createLexicalBlockFile(BarSP, File, 0);
-  I->setDebugLoc(DILocation::get(Ctx, 2, 0, BadScope));
-  DIB.finalize();
-  EXPECT_TRUE(verifyModule(*M));
+  auto GetLastDbgRecord = [](const Instruction *I) -> DbgRecord * {
+    if (I->getDbgValueRange().empty())
+      return nullptr;
+    return &*std::prev(I->getDbgValueRange().end());
+  };
+
+  auto ExpectOrder = [&](DbgInstPtr First, BasicBlock::iterator Second) {
+    if (M->IsNewDbgInfoFormat) {
+      EXPECT_TRUE(First.is<DbgRecord *>());
+      EXPECT_FALSE(Second->getDbgValueRange().empty());
+      EXPECT_EQ(GetLastDbgRecord(&*Second), First.get<DbgRecord *>());
+    } else {
+      EXPECT_TRUE(First.is<Instruction *>());
+      EXPECT_EQ(&*std::prev(Second), First.get<Instruction *>());
+    }
+  };
+
+  auto RunTest = [&]() {
+    IRBuilder<> Builder(BB);
+    DIBuilder DIB(*M);
+    auto File = DIB.createFile("F.CBL", "/");
+    auto CU = DIB.createCompileUnit(dwarf::DW_LANG_Cobol74,
+                                    DIB.createFile("F.CBL", "/"),
+                                    "llvm-cobol74", true, "", 0);
+    auto Type =
+        DIB.createSubroutineType(DIB.getOrCreateTypeArray(std::nullopt));
+    auto SP = DIB.createFunction(
+        CU, "foo", "", File, 1, Type, 1, DINode::FlagZero,
+        DISubprogram::SPFlagDefinition | DISubprogram::SPFlagOptimized);
+    F->setSubprogram(SP);
+    AllocaInst *I = Builder.CreateAlloca(Builder.getInt8Ty());
+    auto BarSP = DIB.createFunction(
+        CU, "bar", "", File, 1, Type, 1, DINode::FlagZero,
+        DISubprogram::SPFlagDefinition | DISubprogram::SPFlagOptimized);
+    auto BarScope = DIB.createLexicalBlockFile(BarSP, File, 0);
+    I->setDebugLoc(DILocation::get(Ctx, 2, 0, BarScope));
+
+    // Create another instruction so that there's one before the alloca we're
+    // inserting debug intrinsics before, to make end-checking easier.
+    I = Builder.CreateAlloca(Builder.getInt1Ty());
+
+    // Label metadata and records
+    // --------------------------
+    DILocation *LabelLoc = DILocation::get(Ctx, 1, 0, BarScope);
+    DILabel *AlwaysPreserveLabel = DIB.createLabel(
+        BarScope, "meles_meles", File, 1, /*AlwaysPreserve*/ true);
+    DILabel *Label =
+        DIB.createLabel(BarScope, "badger", File, 1, /*AlwaysPreserve*/ false);
+
+    { /* dbg.label | DPLabel */
+      // Insert before I and check order.
+      ExpectOrder(DIB.insertLabel(Label, LabelLoc, I), I->getIterator());
+
+      // We should be able to insert at the end of the block, even if there's
+      // no terminator yet. Note that in RemoveDIs mode this record won't get
+      // inserted into the block until another instruction is added.
+      DbgInstPtr LabelRecord = DIB.insertLabel(Label, LabelLoc, BB);
+      // Specifically do not insert a terminator, to check this works. `I`
+      // should have absorbed the DPLabel in the new debug info mode.
+      I = Builder.CreateAlloca(Builder.getInt32Ty());
+      ExpectOrder(LabelRecord, I->getIterator());
+    }
+
+    // Variable metadata and records
+    // -----------------------------
+    DILocation *VarLoc = DILocation::get(Ctx, 2, 0, BarScope);
+    auto *IntType = DIB.createBasicType("int", 32, dwarf::DW_ATE_signed);
+    DILocalVariable *VarX =
+        DIB.createAutoVariable(BarSP, "X", File, 2, IntType, true);
+    DILocalVariable *VarY =
+        DIB.createAutoVariable(BarSP, "Y", File, 2, IntType, true);
+    { /* dbg.value | DPValue::Value */
+      ExpectOrder(DIB.insertDbgValueIntrinsic(I, VarX, DIB.createExpression(),
+                                              VarLoc, I),
+                  I->getIterator());
+      // Check inserting at end of the block works as with labels.
+      DbgInstPtr VarXValue = DIB.insertDbgValueIntrinsic(
+          I, VarX, DIB.createExpression(), VarLoc, BB);
+      I = Builder.CreateAlloca(Builder.getInt32Ty());
+      ExpectOrder(VarXValue, I->getIterator());
+      EXPECT_EQ(BB->getTrailingDPValues(), nullptr);
+    }
+    { /* dbg.declare | DPValue::Declare */
+      ExpectOrder(DIB.insertDeclare(I, VarY, DIB.createExpression(), VarLoc, I),
+                  I->getIterator());
+      // Check inserting at end of the block works as with labels.
+      DbgInstPtr VarYDeclare =
+          DIB.insertDeclare(I, VarY, DIB.createExpression(), VarLoc, BB);
+      I = Builder.CreateAlloca(Builder.getInt32Ty());
+      ExpectOrder(VarYDeclare, I->getIterator());
+      EXPECT_EQ(BB->getTrailingDPValues(), nullptr);
+    }
+    { /* dbg.assign | DPValue::Assign */
+      I = Builder.CreateAlloca(Builder.getInt32Ty());
+      I->setMetadata(LLVMContext::MD_DIAssignID, DIAssignID::getDistinct(Ctx));
+      // DbgAssign interface is slightly different - it always inserts after the
+      // linked instr. Check we can do this with no instruction to insert
+      // before.
+      DbgInstPtr VarXAssign =
+          DIB.insertDbgAssign(I, I, VarX, DIB.createExpression(), I,
+                              DIB.createExpression(), VarLoc);
+      I = Builder.CreateAlloca(Builder.getInt32Ty());
+      ExpectOrder(VarXAssign, I->getIterator());
+      EXPECT_EQ(BB->getTrailingDPValues(), nullptr);
+    }
+
+    Builder.CreateRet(nullptr);
+    DIB.finalize();
+    // Check the labels are not/are added to Bar's retainedNodes array
+    // (AlwaysPreserve).
+    EXPECT_EQ(find(BarSP->getRetainedNodes(), Label),
+              BarSP->getRetainedNodes().end());
+    EXPECT_NE(find(BarSP->getRetainedNodes(), AlwaysPreserveLabel),
+              BarSP->getRetainedNodes().end());
+    EXPECT_NE(find(BarSP->getRetainedNodes(), VarX),
+              BarSP->getRetainedNodes().end());
+    EXPECT_NE(find(BarSP->getRetainedNodes(), VarY),
+              BarSP->getRetainedNodes().end());
+    EXPECT_TRUE(verifyModule(*M));
+  };
+
+  // Test in old-debug mode.
+  EXPECT_FALSE(M->IsNewDbgInfoFormat);
+  RunTest();
+
+  // Test in new-debug mode.
+  // Reset the test then call convertToNewDbgValues to flip the flag
+  // on the test's Module, Function and BasicBlock.
+  TearDown();
+  SetUp();
+  M->convertToNewDbgValues();
+  EXPECT_TRUE(M->IsNewDbgInfoFormat);
+  RunTest();
 }
 
 TEST_F(IRBuilderTest, createArtificialSubprogram) {