diff --git a/.ci/monolithic-linux.sh b/.ci/monolithic-linux.sh
index 1e7b2d2a36c24..9e670c447fbad 100755
--- a/.ci/monolithic-linux.sh
+++ b/.ci/monolithic-linux.sh
@@ -54,4 +54,4 @@ cmake -S ${MONOREPO_ROOT}/llvm -B ${BUILD_DIR} \
 
 echo "--- ninja"
 # Targets are not escaped as they are passed as separate arguments.
-ninja -C "${BUILD_DIR}" ${targets}
+ninja -C "${BUILD_DIR}" -k 0 ${targets}
diff --git a/.ci/monolithic-windows.sh b/.ci/monolithic-windows.sh
index 9561bf668a90c..52ba13036f915 100755
--- a/.ci/monolithic-windows.sh
+++ b/.ci/monolithic-windows.sh
@@ -62,4 +62,4 @@ cmake -S ${MONOREPO_ROOT}/llvm -B ${BUILD_DIR} \
 
 echo "--- ninja"
 # Targets are not escaped as they are passed as separate arguments.
-ninja -C "${BUILD_DIR}" ${targets}
+ninja -C "${BUILD_DIR}" -k 0 ${targets}
diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml
index 1dba91746dae5..131ad3004f457 100644
--- a/.github/workflows/release-binaries.yml
+++ b/.github/workflows/release-binaries.yml
@@ -71,8 +71,8 @@ jobs:
       # | X.Y.Z     | -final
       run: |
         tag="${{ github.ref_name }}"
-        trimmed=$(echo ${{ inputs.tag }} | xargs)
-        [[ "$trimmed" != "" ]] && tag="$trimmed"
+        trimmed=$(echo ${{ inputs.release-version }} | xargs)
+        [[ "$trimmed" != "" ]] && tag="llvmorg-$trimmed"
         if [ "$tag" = "main" ]; then
           # If tag is main, then we've been triggered by a scheduled so pass so
           # use the head commit as the tag.
diff --git a/bolt/lib/Core/DebugNames.cpp b/bolt/lib/Core/DebugNames.cpp
index 1a7792afbbd94..384e63695dfdc 100644
--- a/bolt/lib/Core/DebugNames.cpp
+++ b/bolt/lib/Core/DebugNames.cpp
@@ -345,8 +345,13 @@ void DWARF5AcceleratorTable::finalize() {
 std::optional<DWARF5AccelTable::UnitIndexAndEncoding>
 DWARF5AcceleratorTable::getIndexForEntry(
     const BOLTDWARF5AccelTableData &Value) const {
+  // The foreign TU list immediately follows the local TU list and they both
+  // use the same index, so that if there are N local TU entries, the index for
+  // the first foreign TU is N.
   if (Value.isTU())
-    return {{Value.getUnitID(), {dwarf::DW_IDX_type_unit, TUIndexForm}}};
+    return {{(Value.getSecondUnitID() ? (unsigned)LocalTUList.size() : 0) +
+                 Value.getUnitID(),
+             {dwarf::DW_IDX_type_unit, TUIndexForm}}};
   if (CUList.size() > 1)
     return {{Value.getUnitID(), {dwarf::DW_IDX_compile_unit, CUIndexForm}}};
   return std::nullopt;
diff --git a/bolt/test/X86/Inputs/dwarf5-debug-names-ftu-ltu-mix-helper.s b/bolt/test/X86/Inputs/dwarf5-debug-names-ftu-ltu-mix-helper.s
new file mode 100644
index 0000000000000..68eee45ec9833
--- /dev/null
+++ b/bolt/test/X86/Inputs/dwarf5-debug-names-ftu-ltu-mix-helper.s
@@ -0,0 +1,314 @@
+# struct AMono {
+#   int x;
+# };
+#
+# AMono globalMono;
+# # clang++ -g2 -gdwarf-5 -gpubnames -S -fdebug-types-section -o
+
+	.text
+	.file	"helper.cpp"
+	.file	0 "/home" "helper.cpp" md5 0x3c0ac73d7b074961c6e8202230a76228
+	.section	.debug_info,"G",@progbits,6412503741467814911,comdat
+.Ltu_begin0:
+	.long	.Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
+.Ldebug_info_start0:
+	.short	5                               # DWARF version number
+	.byte	2                               # DWARF Unit Type
+	.byte	8                               # Address Size (in bytes)
+	.long	.debug_abbrev                   # Offset Into Abbrev. Section
+	.quad	6412503741467814911             # Type Signature
+	.long	35                              # Type DIE Offset
+	.byte	1                               # Abbrev [1] 0x18:0x20 DW_TAG_type_unit
+	.short	33                              # DW_AT_language
+	.long	.Lline_table_start0             # DW_AT_stmt_list
+	.long	.Lstr_offsets_base0             # DW_AT_str_offsets_base
+	.byte	2                               # Abbrev [2] 0x23:0x10 DW_TAG_structure_type
+	.byte	5                               # DW_AT_calling_convention
+	.byte	6                               # DW_AT_name
+	.byte	4                               # DW_AT_byte_size
+	.byte	0                               # DW_AT_decl_file
+	.byte	1                               # DW_AT_decl_line
+	.byte	3                               # Abbrev [3] 0x29:0x9 DW_TAG_member
+	.byte	4                               # DW_AT_name
+	.long	51                              # DW_AT_type
+	.byte	0                               # DW_AT_decl_file
+	.byte	2                               # DW_AT_decl_line
+	.byte	0                               # DW_AT_data_member_location
+	.byte	0                               # End Of Children Mark
+	.byte	4                               # Abbrev [4] 0x33:0x4 DW_TAG_base_type
+	.byte	5                               # DW_AT_name
+	.byte	5                               # DW_AT_encoding
+	.byte	4                               # DW_AT_byte_size
+	.byte	0                               # End Of Children Mark
+.Ldebug_info_end0:
+	.type	globalMono,@object              # @globalMono
+	.bss
+	.globl	globalMono
+	.p2align	2, 0x0
+globalMono:
+	.zero	4
+	.size	globalMono, 4
+
+	.section	.debug_abbrev,"",@progbits
+	.byte	1                               # Abbreviation Code
+	.byte	65                              # DW_TAG_type_unit
+	.byte	1                               # DW_CHILDREN_yes
+	.byte	19                              # DW_AT_language
+	.byte	5                               # DW_FORM_data2
+	.byte	16                              # DW_AT_stmt_list
+	.byte	23                              # DW_FORM_sec_offset
+	.byte	114                             # DW_AT_str_offsets_base
+	.byte	23                              # DW_FORM_sec_offset
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	2                               # Abbreviation Code
+	.byte	19                              # DW_TAG_structure_type
+	.byte	1                               # DW_CHILDREN_yes
+	.byte	54                              # DW_AT_calling_convention
+	.byte	11                              # DW_FORM_data1
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	11                              # DW_AT_byte_size
+	.byte	11                              # DW_FORM_data1
+	.byte	58                              # DW_AT_decl_file
+	.byte	11                              # DW_FORM_data1
+	.byte	59                              # DW_AT_decl_line
+	.byte	11                              # DW_FORM_data1
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	3                               # Abbreviation Code
+	.byte	13                              # DW_TAG_member
+	.byte	0                               # DW_CHILDREN_no
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	73                              # DW_AT_type
+	.byte	19                              # DW_FORM_ref4
+	.byte	58                              # DW_AT_decl_file
+	.byte	11                              # DW_FORM_data1
+	.byte	59                              # DW_AT_decl_line
+	.byte	11                              # DW_FORM_data1
+	.byte	56                              # DW_AT_data_member_location
+	.byte	11                              # DW_FORM_data1
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	4                               # Abbreviation Code
+	.byte	36                              # DW_TAG_base_type
+	.byte	0                               # DW_CHILDREN_no
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	62                              # DW_AT_encoding
+	.byte	11                              # DW_FORM_data1
+	.byte	11                              # DW_AT_byte_size
+	.byte	11                              # DW_FORM_data1
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	5                               # Abbreviation Code
+	.byte	17                              # DW_TAG_compile_unit
+	.byte	1                               # DW_CHILDREN_yes
+	.byte	37                              # DW_AT_producer
+	.byte	37                              # DW_FORM_strx1
+	.byte	19                              # DW_AT_language
+	.byte	5                               # DW_FORM_data2
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	114                             # DW_AT_str_offsets_base
+	.byte	23                              # DW_FORM_sec_offset
+	.byte	16                              # DW_AT_stmt_list
+	.byte	23                              # DW_FORM_sec_offset
+	.byte	27                              # DW_AT_comp_dir
+	.byte	37                              # DW_FORM_strx1
+	.byte	115                             # DW_AT_addr_base
+	.byte	23                              # DW_FORM_sec_offset
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	6                               # Abbreviation Code
+	.byte	52                              # DW_TAG_variable
+	.byte	0                               # DW_CHILDREN_no
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	73                              # DW_AT_type
+	.byte	19                              # DW_FORM_ref4
+	.byte	63                              # DW_AT_external
+	.byte	25                              # DW_FORM_flag_present
+	.byte	58                              # DW_AT_decl_file
+	.byte	11                              # DW_FORM_data1
+	.byte	59                              # DW_AT_decl_line
+	.byte	11                              # DW_FORM_data1
+	.byte	2                               # DW_AT_location
+	.byte	24                              # DW_FORM_exprloc
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	7                               # Abbreviation Code
+	.byte	19                              # DW_TAG_structure_type
+	.byte	0                               # DW_CHILDREN_no
+	.byte	60                              # DW_AT_declaration
+	.byte	25                              # DW_FORM_flag_present
+	.byte	105                             # DW_AT_signature
+	.byte	32                              # DW_FORM_ref_sig8
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	0                               # EOM(3)
+	.section	.debug_info,"",@progbits
+.Lcu_begin0:
+	.long	.Ldebug_info_end1-.Ldebug_info_start1 # Length of Unit
+.Ldebug_info_start1:
+	.short	5                               # DWARF version number
+	.byte	1                               # DWARF Unit Type
+	.byte	8                               # Address Size (in bytes)
+	.long	.debug_abbrev                   # Offset Into Abbrev. Section
+	.byte	5                               # Abbrev [5] 0xc:0x27 DW_TAG_compile_unit
+	.byte	0                               # DW_AT_producer
+	.short	33                              # DW_AT_language
+	.byte	1                               # DW_AT_name
+	.long	.Lstr_offsets_base0             # DW_AT_str_offsets_base
+	.long	.Lline_table_start0             # DW_AT_stmt_list
+	.byte	2                               # DW_AT_comp_dir
+	.long	.Laddr_table_base0              # DW_AT_addr_base
+	.byte	6                               # Abbrev [6] 0x1e:0xb DW_TAG_variable
+	.byte	3                               # DW_AT_name
+	.long	41                              # DW_AT_type
+                                        # DW_AT_external
+	.byte	0                               # DW_AT_decl_file
+	.byte	5                               # DW_AT_decl_line
+	.byte	2                               # DW_AT_location
+	.byte	161
+	.byte	0
+	.byte	7                               # Abbrev [7] 0x29:0x9 DW_TAG_structure_type
+                                        # DW_AT_declaration
+	.quad	6412503741467814911             # DW_AT_signature
+	.byte	0                               # End Of Children Mark
+.Ldebug_info_end1:
+	.section	.debug_str_offsets,"",@progbits
+	.long	32                              # Length of String Offsets Set
+	.short	5
+	.short	0
+.Lstr_offsets_base0:
+	.section	.debug_str,"MS",@progbits,1
+.Linfo_string0:
+	.asciz	"clang version 19.0.0git (git@github.com:llvm/llvm-project.git ced1fac8a32e35b63733bda27c7f5b9a2b635403)" # string offset=0
+.Linfo_string1:
+	.asciz	"helper.cpp"                    # string offset=104
+.Linfo_string2:
+	.asciz	"/home" # string offset=115
+.Linfo_string3:
+	.asciz	"globalMono"                    # string offset=153
+.Linfo_string4:
+	.asciz	"AMono"                         # string offset=164
+.Linfo_string5:
+	.asciz	"x"                             # string offset=170
+.Linfo_string6:
+	.asciz	"int"                           # string offset=172
+	.section	.debug_str_offsets,"",@progbits
+	.long	.Linfo_string0
+	.long	.Linfo_string1
+	.long	.Linfo_string2
+	.long	.Linfo_string3
+	.long	.Linfo_string5
+	.long	.Linfo_string6
+	.long	.Linfo_string4
+	.section	.debug_addr,"",@progbits
+	.long	.Ldebug_addr_end0-.Ldebug_addr_start0 # Length of contribution
+.Ldebug_addr_start0:
+	.short	5                               # DWARF version number
+	.byte	8                               # Address size
+	.byte	0                               # Segment selector size
+.Laddr_table_base0:
+	.quad	globalMono
+.Ldebug_addr_end0:
+	.section	.debug_names,"",@progbits
+	.long	.Lnames_end0-.Lnames_start0     # Header: unit length
+.Lnames_start0:
+	.short	5                               # Header: version
+	.short	0                               # Header: padding
+	.long	1                               # Header: compilation unit count
+	.long	1                               # Header: local type unit count
+	.long	0                               # Header: foreign type unit count
+	.long	3                               # Header: bucket count
+	.long	3                               # Header: name count
+	.long	.Lnames_abbrev_end0-.Lnames_abbrev_start0 # Header: abbreviation table size
+	.long	8                               # Header: augmentation string size
+	.ascii	"LLVM0700"                      # Header: augmentation string
+	.long	.Lcu_begin0                     # Compilation unit 0
+	.long	.Ltu_begin0                     # Type unit 0
+	.long	0                               # Bucket 0
+	.long	0                               # Bucket 1
+	.long	1                               # Bucket 2
+	.long	193495088                       # Hash in Bucket 2
+	.long	253228319                       # Hash in Bucket 2
+	.long	-857151761                      # Hash in Bucket 2
+	.long	.Linfo_string6                  # String in Bucket 2: int
+	.long	.Linfo_string4                  # String in Bucket 2: AMono
+	.long	.Linfo_string3                  # String in Bucket 2: globalMono
+	.long	.Lnames1-.Lnames_entries0       # Offset in Bucket 2
+	.long	.Lnames0-.Lnames_entries0       # Offset in Bucket 2
+	.long	.Lnames2-.Lnames_entries0       # Offset in Bucket 2
+.Lnames_abbrev_start0:
+	.byte	1                               # Abbrev code
+	.byte	36                              # DW_TAG_base_type
+	.byte	2                               # DW_IDX_type_unit
+	.byte	11                              # DW_FORM_data1
+	.byte	3                               # DW_IDX_die_offset
+	.byte	19                              # DW_FORM_ref4
+	.byte	4                               # DW_IDX_parent
+	.byte	25                              # DW_FORM_flag_present
+	.byte	0                               # End of abbrev
+	.byte	0                               # End of abbrev
+	.byte	2                               # Abbrev code
+	.byte	19                              # DW_TAG_structure_type
+	.byte	2                               # DW_IDX_type_unit
+	.byte	11                              # DW_FORM_data1
+	.byte	3                               # DW_IDX_die_offset
+	.byte	19                              # DW_FORM_ref4
+	.byte	4                               # DW_IDX_parent
+	.byte	25                              # DW_FORM_flag_present
+	.byte	0                               # End of abbrev
+	.byte	0                               # End of abbrev
+	.byte	3                               # Abbrev code
+	.byte	19                              # DW_TAG_structure_type
+	.byte	3                               # DW_IDX_die_offset
+	.byte	19                              # DW_FORM_ref4
+	.byte	4                               # DW_IDX_parent
+	.byte	25                              # DW_FORM_flag_present
+	.byte	0                               # End of abbrev
+	.byte	0                               # End of abbrev
+	.byte	4                               # Abbrev code
+	.byte	52                              # DW_TAG_variable
+	.byte	3                               # DW_IDX_die_offset
+	.byte	19                              # DW_FORM_ref4
+	.byte	4                               # DW_IDX_parent
+	.byte	25                              # DW_FORM_flag_present
+	.byte	0                               # End of abbrev
+	.byte	0                               # End of abbrev
+	.byte	0                               # End of abbrev list
+.Lnames_abbrev_end0:
+.Lnames_entries0:
+.Lnames1:
+.L1:
+	.byte	1                               # Abbreviation code
+	.byte	0                               # DW_IDX_type_unit
+	.long	51                              # DW_IDX_die_offset
+	.byte	0                               # DW_IDX_parent
+                                        # End of list: int
+.Lnames0:
+.L2:
+	.byte	2                               # Abbreviation code
+	.byte	0                               # DW_IDX_type_unit
+	.long	35                              # DW_IDX_die_offset
+.L3:                                    # DW_IDX_parent
+	.byte	3                               # Abbreviation code
+	.long	41                              # DW_IDX_die_offset
+	.byte	0                               # DW_IDX_parent
+                                        # End of list: AMono
+.Lnames2:
+.L0:
+	.byte	4                               # Abbreviation code
+	.long	30                              # DW_IDX_die_offset
+	.byte	0                               # DW_IDX_parent
+                                        # End of list: globalMono
+	.p2align	2, 0x0
+.Lnames_end0:
+	.ident	"clang version 19.0.0git (git@github.com:llvm/llvm-project.git ced1fac8a32e35b63733bda27c7f5b9a2b635403)"
+	.section	".note.GNU-stack","",@progbits
+	.addrsig
+	.section	.debug_line,"",@progbits
+.Lline_table_start0:
diff --git a/bolt/test/X86/Inputs/dwarf5-debug-names-ftu-ltu-mix-helper1.s b/bolt/test/X86/Inputs/dwarf5-debug-names-ftu-ltu-mix-helper1.s
new file mode 100644
index 0000000000000..8b28c19dc87de
--- /dev/null
+++ b/bolt/test/X86/Inputs/dwarf5-debug-names-ftu-ltu-mix-helper1.s
@@ -0,0 +1,315 @@
+# struct BMono {
+#   int x;
+# };
+#
+# BMono globalMono1;
+# clang++ -g2 -gdwarf-5 -gpubnames -S -fdebug-types-section -o
+
+
+	.text
+	.file	"helper1.cpp"
+	.file	0 "/home" "helper1.cpp" md5 0x1fdaf911330b73495aed962bc02cfb3a
+	.section	.debug_info,"G",@progbits,5884764266900841573,comdat
+.Ltu_begin0:
+	.long	.Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
+.Ldebug_info_start0:
+	.short	5                               # DWARF version number
+	.byte	2                               # DWARF Unit Type
+	.byte	8                               # Address Size (in bytes)
+	.long	.debug_abbrev                   # Offset Into Abbrev. Section
+	.quad	5884764266900841573             # Type Signature
+	.long	35                              # Type DIE Offset
+	.byte	1                               # Abbrev [1] 0x18:0x20 DW_TAG_type_unit
+	.short	33                              # DW_AT_language
+	.long	.Lline_table_start0             # DW_AT_stmt_list
+	.long	.Lstr_offsets_base0             # DW_AT_str_offsets_base
+	.byte	2                               # Abbrev [2] 0x23:0x10 DW_TAG_structure_type
+	.byte	5                               # DW_AT_calling_convention
+	.byte	6                               # DW_AT_name
+	.byte	4                               # DW_AT_byte_size
+	.byte	0                               # DW_AT_decl_file
+	.byte	1                               # DW_AT_decl_line
+	.byte	3                               # Abbrev [3] 0x29:0x9 DW_TAG_member
+	.byte	4                               # DW_AT_name
+	.long	51                              # DW_AT_type
+	.byte	0                               # DW_AT_decl_file
+	.byte	2                               # DW_AT_decl_line
+	.byte	0                               # DW_AT_data_member_location
+	.byte	0                               # End Of Children Mark
+	.byte	4                               # Abbrev [4] 0x33:0x4 DW_TAG_base_type
+	.byte	5                               # DW_AT_name
+	.byte	5                               # DW_AT_encoding
+	.byte	4                               # DW_AT_byte_size
+	.byte	0                               # End Of Children Mark
+.Ldebug_info_end0:
+	.type	globalMono1,@object             # @globalMono1
+	.bss
+	.globl	globalMono1
+	.p2align	2, 0x0
+globalMono1:
+	.zero	4
+	.size	globalMono1, 4
+
+	.section	.debug_abbrev,"",@progbits
+	.byte	1                               # Abbreviation Code
+	.byte	65                              # DW_TAG_type_unit
+	.byte	1                               # DW_CHILDREN_yes
+	.byte	19                              # DW_AT_language
+	.byte	5                               # DW_FORM_data2
+	.byte	16                              # DW_AT_stmt_list
+	.byte	23                              # DW_FORM_sec_offset
+	.byte	114                             # DW_AT_str_offsets_base
+	.byte	23                              # DW_FORM_sec_offset
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	2                               # Abbreviation Code
+	.byte	19                              # DW_TAG_structure_type
+	.byte	1                               # DW_CHILDREN_yes
+	.byte	54                              # DW_AT_calling_convention
+	.byte	11                              # DW_FORM_data1
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	11                              # DW_AT_byte_size
+	.byte	11                              # DW_FORM_data1
+	.byte	58                              # DW_AT_decl_file
+	.byte	11                              # DW_FORM_data1
+	.byte	59                              # DW_AT_decl_line
+	.byte	11                              # DW_FORM_data1
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	3                               # Abbreviation Code
+	.byte	13                              # DW_TAG_member
+	.byte	0                               # DW_CHILDREN_no
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	73                              # DW_AT_type
+	.byte	19                              # DW_FORM_ref4
+	.byte	58                              # DW_AT_decl_file
+	.byte	11                              # DW_FORM_data1
+	.byte	59                              # DW_AT_decl_line
+	.byte	11                              # DW_FORM_data1
+	.byte	56                              # DW_AT_data_member_location
+	.byte	11                              # DW_FORM_data1
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	4                               # Abbreviation Code
+	.byte	36                              # DW_TAG_base_type
+	.byte	0                               # DW_CHILDREN_no
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	62                              # DW_AT_encoding
+	.byte	11                              # DW_FORM_data1
+	.byte	11                              # DW_AT_byte_size
+	.byte	11                              # DW_FORM_data1
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	5                               # Abbreviation Code
+	.byte	17                              # DW_TAG_compile_unit
+	.byte	1                               # DW_CHILDREN_yes
+	.byte	37                              # DW_AT_producer
+	.byte	37                              # DW_FORM_strx1
+	.byte	19                              # DW_AT_language
+	.byte	5                               # DW_FORM_data2
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	114                             # DW_AT_str_offsets_base
+	.byte	23                              # DW_FORM_sec_offset
+	.byte	16                              # DW_AT_stmt_list
+	.byte	23                              # DW_FORM_sec_offset
+	.byte	27                              # DW_AT_comp_dir
+	.byte	37                              # DW_FORM_strx1
+	.byte	115                             # DW_AT_addr_base
+	.byte	23                              # DW_FORM_sec_offset
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	6                               # Abbreviation Code
+	.byte	52                              # DW_TAG_variable
+	.byte	0                               # DW_CHILDREN_no
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	73                              # DW_AT_type
+	.byte	19                              # DW_FORM_ref4
+	.byte	63                              # DW_AT_external
+	.byte	25                              # DW_FORM_flag_present
+	.byte	58                              # DW_AT_decl_file
+	.byte	11                              # DW_FORM_data1
+	.byte	59                              # DW_AT_decl_line
+	.byte	11                              # DW_FORM_data1
+	.byte	2                               # DW_AT_location
+	.byte	24                              # DW_FORM_exprloc
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	7                               # Abbreviation Code
+	.byte	19                              # DW_TAG_structure_type
+	.byte	0                               # DW_CHILDREN_no
+	.byte	60                              # DW_AT_declaration
+	.byte	25                              # DW_FORM_flag_present
+	.byte	105                             # DW_AT_signature
+	.byte	32                              # DW_FORM_ref_sig8
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	0                               # EOM(3)
+	.section	.debug_info,"",@progbits
+.Lcu_begin0:
+	.long	.Ldebug_info_end1-.Ldebug_info_start1 # Length of Unit
+.Ldebug_info_start1:
+	.short	5                               # DWARF version number
+	.byte	1                               # DWARF Unit Type
+	.byte	8                               # Address Size (in bytes)
+	.long	.debug_abbrev                   # Offset Into Abbrev. Section
+	.byte	5                               # Abbrev [5] 0xc:0x27 DW_TAG_compile_unit
+	.byte	0                               # DW_AT_producer
+	.short	33                              # DW_AT_language
+	.byte	1                               # DW_AT_name
+	.long	.Lstr_offsets_base0             # DW_AT_str_offsets_base
+	.long	.Lline_table_start0             # DW_AT_stmt_list
+	.byte	2                               # DW_AT_comp_dir
+	.long	.Laddr_table_base0              # DW_AT_addr_base
+	.byte	6                               # Abbrev [6] 0x1e:0xb DW_TAG_variable
+	.byte	3                               # DW_AT_name
+	.long	41                              # DW_AT_type
+                                        # DW_AT_external
+	.byte	0                               # DW_AT_decl_file
+	.byte	5                               # DW_AT_decl_line
+	.byte	2                               # DW_AT_location
+	.byte	161
+	.byte	0
+	.byte	7                               # Abbrev [7] 0x29:0x9 DW_TAG_structure_type
+                                        # DW_AT_declaration
+	.quad	5884764266900841573             # DW_AT_signature
+	.byte	0                               # End Of Children Mark
+.Ldebug_info_end1:
+	.section	.debug_str_offsets,"",@progbits
+	.long	32                              # Length of String Offsets Set
+	.short	5
+	.short	0
+.Lstr_offsets_base0:
+	.section	.debug_str,"MS",@progbits,1
+.Linfo_string0:
+	.asciz	"clang version 19.0.0git (git@github.com:llvm/llvm-project.git ced1fac8a32e35b63733bda27c7f5b9a2b635403)" # string offset=0
+.Linfo_string1:
+	.asciz	"helper1.cpp"                   # string offset=104
+.Linfo_string2:
+	.asciz	"/home" # string offset=116
+.Linfo_string3:
+	.asciz	"globalMono1"                   # string offset=154
+.Linfo_string4:
+	.asciz	"BMono"                         # string offset=166
+.Linfo_string5:
+	.asciz	"x"                             # string offset=172
+.Linfo_string6:
+	.asciz	"int"                           # string offset=174
+	.section	.debug_str_offsets,"",@progbits
+	.long	.Linfo_string0
+	.long	.Linfo_string1
+	.long	.Linfo_string2
+	.long	.Linfo_string3
+	.long	.Linfo_string5
+	.long	.Linfo_string6
+	.long	.Linfo_string4
+	.section	.debug_addr,"",@progbits
+	.long	.Ldebug_addr_end0-.Ldebug_addr_start0 # Length of contribution
+.Ldebug_addr_start0:
+	.short	5                               # DWARF version number
+	.byte	8                               # Address size
+	.byte	0                               # Segment selector size
+.Laddr_table_base0:
+	.quad	globalMono1
+.Ldebug_addr_end0:
+	.section	.debug_names,"",@progbits
+	.long	.Lnames_end0-.Lnames_start0     # Header: unit length
+.Lnames_start0:
+	.short	5                               # Header: version
+	.short	0                               # Header: padding
+	.long	1                               # Header: compilation unit count
+	.long	1                               # Header: local type unit count
+	.long	0                               # Header: foreign type unit count
+	.long	3                               # Header: bucket count
+	.long	3                               # Header: name count
+	.long	.Lnames_abbrev_end0-.Lnames_abbrev_start0 # Header: abbreviation table size
+	.long	8                               # Header: augmentation string size
+	.ascii	"LLVM0700"                      # Header: augmentation string
+	.long	.Lcu_begin0                     # Compilation unit 0
+	.long	.Ltu_begin0                     # Type unit 0
+	.long	0                               # Bucket 0
+	.long	0                               # Bucket 1
+	.long	1                               # Bucket 2
+	.long	193495088                       # Hash in Bucket 2
+	.long	254414240                       # Hash in Bucket 2
+	.long	1778763008                      # Hash in Bucket 2
+	.long	.Linfo_string6                  # String in Bucket 2: int
+	.long	.Linfo_string4                  # String in Bucket 2: BMono
+	.long	.Linfo_string3                  # String in Bucket 2: globalMono1
+	.long	.Lnames1-.Lnames_entries0       # Offset in Bucket 2
+	.long	.Lnames0-.Lnames_entries0       # Offset in Bucket 2
+	.long	.Lnames2-.Lnames_entries0       # Offset in Bucket 2
+.Lnames_abbrev_start0:
+	.byte	1                               # Abbrev code
+	.byte	36                              # DW_TAG_base_type
+	.byte	2                               # DW_IDX_type_unit
+	.byte	11                              # DW_FORM_data1
+	.byte	3                               # DW_IDX_die_offset
+	.byte	19                              # DW_FORM_ref4
+	.byte	4                               # DW_IDX_parent
+	.byte	25                              # DW_FORM_flag_present
+	.byte	0                               # End of abbrev
+	.byte	0                               # End of abbrev
+	.byte	2                               # Abbrev code
+	.byte	19                              # DW_TAG_structure_type
+	.byte	2                               # DW_IDX_type_unit
+	.byte	11                              # DW_FORM_data1
+	.byte	3                               # DW_IDX_die_offset
+	.byte	19                              # DW_FORM_ref4
+	.byte	4                               # DW_IDX_parent
+	.byte	25                              # DW_FORM_flag_present
+	.byte	0                               # End of abbrev
+	.byte	0                               # End of abbrev
+	.byte	3                               # Abbrev code
+	.byte	19                              # DW_TAG_structure_type
+	.byte	3                               # DW_IDX_die_offset
+	.byte	19                              # DW_FORM_ref4
+	.byte	4                               # DW_IDX_parent
+	.byte	25                              # DW_FORM_flag_present
+	.byte	0                               # End of abbrev
+	.byte	0                               # End of abbrev
+	.byte	4                               # Abbrev code
+	.byte	52                              # DW_TAG_variable
+	.byte	3                               # DW_IDX_die_offset
+	.byte	19                              # DW_FORM_ref4
+	.byte	4                               # DW_IDX_parent
+	.byte	25                              # DW_FORM_flag_present
+	.byte	0                               # End of abbrev
+	.byte	0                               # End of abbrev
+	.byte	0                               # End of abbrev list
+.Lnames_abbrev_end0:
+.Lnames_entries0:
+.Lnames1:
+.L1:
+	.byte	1                               # Abbreviation code
+	.byte	0                               # DW_IDX_type_unit
+	.long	51                              # DW_IDX_die_offset
+	.byte	0                               # DW_IDX_parent
+                                        # End of list: int
+.Lnames0:
+.L2:
+	.byte	2                               # Abbreviation code
+	.byte	0                               # DW_IDX_type_unit
+	.long	35                              # DW_IDX_die_offset
+.L3:                                    # DW_IDX_parent
+	.byte	3                               # Abbreviation code
+	.long	41                              # DW_IDX_die_offset
+	.byte	0                               # DW_IDX_parent
+                                        # End of list: BMono
+.Lnames2:
+.L0:
+	.byte	4                               # Abbreviation code
+	.long	30                              # DW_IDX_die_offset
+	.byte	0                               # DW_IDX_parent
+                                        # End of list: globalMono1
+	.p2align	2, 0x0
+.Lnames_end0:
+	.ident	"clang version 19.0.0git (git@github.com:llvm/llvm-project.git ced1fac8a32e35b63733bda27c7f5b9a2b635403)"
+	.section	".note.GNU-stack","",@progbits
+	.addrsig
+	.section	.debug_line,"",@progbits
+.Lline_table_start0:
diff --git a/bolt/test/X86/Inputs/dwarf5-df-debug-names-ftu-ltu-mix-main.s b/bolt/test/X86/Inputs/dwarf5-df-debug-names-ftu-ltu-mix-main.s
new file mode 100644
index 0000000000000..69f6c5a5376a0
--- /dev/null
+++ b/bolt/test/X86/Inputs/dwarf5-df-debug-names-ftu-ltu-mix-main.s
@@ -0,0 +1,505 @@
+# struct ASplit {
+#   int x;
+# };
+#
+# ASplit globalSplit;
+# int main() {
+#   return 0;
+# }
+# clang++ -g2 -gdwarf-5 -gpubnames -S -fdebug-types-section -gsplit-dwarf -fdebug-compilation-dir='.'
+
+	.text
+	.file	"main.cpp"
+	.file	0 "." "main.cpp" md5 0xbb74a3c2960dafa324547ebbd87d13ea
+	.section	.debug_info.dwo,"e",@progbits
+	.long	.Ldebug_info_dwo_end0-.Ldebug_info_dwo_start0 # Length of Unit
+.Ldebug_info_dwo_start0:
+	.short	5                               # DWARF version number
+	.byte	6                               # DWARF Unit Type
+	.byte	8                               # Address Size (in bytes)
+	.long	0                               # Offset Into Abbrev. Section
+	.quad	-8602855756067469281            # Type Signature
+	.long	33                              # Type DIE Offset
+	.byte	1                               # Abbrev [1] 0x18:0x1e DW_TAG_type_unit
+	.short	33                              # DW_AT_language
+	.byte	1                               # DW_AT_comp_dir
+	.byte	2                               # DW_AT_dwo_name
+	.long	0                               # DW_AT_stmt_list
+	.byte	2                               # Abbrev [2] 0x21:0x10 DW_TAG_structure_type
+	.byte	5                               # DW_AT_calling_convention
+	.byte	5                               # DW_AT_name
+	.byte	4                               # DW_AT_byte_size
+	.byte	0                               # DW_AT_decl_file
+	.byte	1                               # DW_AT_decl_line
+	.byte	3                               # Abbrev [3] 0x27:0x9 DW_TAG_member
+	.byte	3                               # DW_AT_name
+	.long	49                              # DW_AT_type
+	.byte	0                               # DW_AT_decl_file
+	.byte	2                               # DW_AT_decl_line
+	.byte	0                               # DW_AT_data_member_location
+	.byte	0                               # End Of Children Mark
+	.byte	4                               # Abbrev [4] 0x31:0x4 DW_TAG_base_type
+	.byte	4                               # DW_AT_name
+	.byte	5                               # DW_AT_encoding
+	.byte	4                               # DW_AT_byte_size
+	.byte	0                               # End Of Children Mark
+.Ldebug_info_dwo_end0:
+	.text
+	.globl	main                            # -- Begin function main
+	.p2align	4, 0x90
+	.type	main,@function
+main:                                   # @main
+.Lfunc_begin0:
+	.loc	0 6 0                           # main.cpp:6:0
+	.cfi_startproc
+# %bb.0:                                # %entry
+	pushq	%rbp
+	.cfi_def_cfa_offset 16
+	.cfi_offset %rbp, -16
+	movq	%rsp, %rbp
+	.cfi_def_cfa_register %rbp
+	movl	$0, -4(%rbp)
+.Ltmp0:
+	.loc	0 7 3 prologue_end              # main.cpp:7:3
+	xorl	%eax, %eax
+	.loc	0 7 3 epilogue_begin is_stmt 0  # main.cpp:7:3
+	popq	%rbp
+	.cfi_def_cfa %rsp, 8
+	retq
+.Ltmp1:
+.Lfunc_end0:
+	.size	main, .Lfunc_end0-main
+	.cfi_endproc
+                                        # -- End function
+	.type	globalSplit,@object             # @globalSplit
+	.bss
+	.globl	globalSplit
+	.p2align	2, 0x0
+globalSplit:
+	.zero	4
+	.size	globalSplit, 4
+
+	.section	.debug_abbrev,"",@progbits
+	.byte	1                               # Abbreviation Code
+	.byte	74                              # DW_TAG_skeleton_unit
+	.byte	0                               # DW_CHILDREN_no
+	.byte	16                              # DW_AT_stmt_list
+	.byte	23                              # DW_FORM_sec_offset
+	.byte	114                             # DW_AT_str_offsets_base
+	.byte	23                              # DW_FORM_sec_offset
+	.byte	27                              # DW_AT_comp_dir
+	.byte	37                              # DW_FORM_strx1
+	.byte	118                             # DW_AT_dwo_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	17                              # DW_AT_low_pc
+	.byte	27                              # DW_FORM_addrx
+	.byte	18                              # DW_AT_high_pc
+	.byte	6                               # DW_FORM_data4
+	.byte	115                             # DW_AT_addr_base
+	.byte	23                              # DW_FORM_sec_offset
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	0                               # EOM(3)
+	.section	.debug_info,"",@progbits
+.Lcu_begin0:
+	.long	.Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
+.Ldebug_info_start0:
+	.short	5                               # DWARF version number
+	.byte	4                               # DWARF Unit Type
+	.byte	8                               # Address Size (in bytes)
+	.long	.debug_abbrev                   # Offset Into Abbrev. Section
+	.quad	5806847994123082226
+	.byte	1                               # Abbrev [1] 0x14:0x14 DW_TAG_skeleton_unit
+	.long	.Lline_table_start0             # DW_AT_stmt_list
+	.long	.Lstr_offsets_base0             # DW_AT_str_offsets_base
+	.byte	0                               # DW_AT_comp_dir
+	.byte	1                               # DW_AT_dwo_name
+	.byte	1                               # DW_AT_low_pc
+	.long	.Lfunc_end0-.Lfunc_begin0       # DW_AT_high_pc
+	.long	.Laddr_table_base0              # DW_AT_addr_base
+.Ldebug_info_end0:
+	.section	.debug_str_offsets,"",@progbits
+	.long	12                              # Length of String Offsets Set
+	.short	5
+	.short	0
+.Lstr_offsets_base0:
+	.section	.debug_str,"MS",@progbits,1
+.Lskel_string0:
+	.asciz	"."                             # string offset=0
+.Lskel_string1:
+	.asciz	"ASplit"                        # string offset=2
+.Lskel_string2:
+	.asciz	"int"                           # string offset=9
+.Lskel_string3:
+	.asciz	"globalSplit"                   # string offset=13
+.Lskel_string4:
+	.asciz	"main"                          # string offset=25
+.Lskel_string5:
+	.asciz	"main.dwo"                      # string offset=30
+	.section	.debug_str_offsets,"",@progbits
+	.long	.Lskel_string0
+	.long	.Lskel_string5
+	.section	.debug_str_offsets.dwo,"e",@progbits
+	.long	40                              # Length of String Offsets Set
+	.short	5
+	.short	0
+	.section	.debug_str.dwo,"eMS",@progbits,1
+.Linfo_string0:
+	.asciz	"globalSplit"                   # string offset=0
+.Linfo_string1:
+	.asciz	"."                             # string offset=12
+.Linfo_string2:
+	.asciz	"main.dwo"                      # string offset=14
+.Linfo_string3:
+	.asciz	"x"                             # string offset=23
+.Linfo_string4:
+	.asciz	"int"                           # string offset=25
+.Linfo_string5:
+	.asciz	"ASplit"                        # string offset=29
+.Linfo_string6:
+	.asciz	"main"                          # string offset=36
+.Linfo_string7:
+	.asciz	"clang version 19.0.0git (git@github.com:llvm/llvm-project.git ced1fac8a32e35b63733bda27c7f5b9a2b635403)" # string offset=41
+.Linfo_string8:
+	.asciz	"main.cpp"                      # string offset=145
+	.section	.debug_str_offsets.dwo,"e",@progbits
+	.long	0
+	.long	12
+	.long	14
+	.long	23
+	.long	25
+	.long	29
+	.long	36
+	.long	41
+	.long	145
+	.section	.debug_info.dwo,"e",@progbits
+	.long	.Ldebug_info_dwo_end1-.Ldebug_info_dwo_start1 # Length of Unit
+.Ldebug_info_dwo_start1:
+	.short	5                               # DWARF version number
+	.byte	5                               # DWARF Unit Type
+	.byte	8                               # Address Size (in bytes)
+	.long	0                               # Offset Into Abbrev. Section
+	.quad	5806847994123082226
+	.byte	5                               # Abbrev [5] 0x14:0x2e DW_TAG_compile_unit
+	.byte	7                               # DW_AT_producer
+	.short	33                              # DW_AT_language
+	.byte	8                               # DW_AT_name
+	.byte	2                               # DW_AT_dwo_name
+	.byte	6                               # Abbrev [6] 0x1a:0xb DW_TAG_variable
+	.byte	0                               # DW_AT_name
+	.long	37                              # DW_AT_type
+                                        # DW_AT_external
+	.byte	0                               # DW_AT_decl_file
+	.byte	5                               # DW_AT_decl_line
+	.byte	2                               # DW_AT_location
+	.byte	161
+	.byte	0
+	.byte	7                               # Abbrev [7] 0x25:0x9 DW_TAG_structure_type
+                                        # DW_AT_declaration
+	.quad	-8602855756067469281            # DW_AT_signature
+	.byte	8                               # Abbrev [8] 0x2e:0xf DW_TAG_subprogram
+	.byte	1                               # DW_AT_low_pc
+	.long	.Lfunc_end0-.Lfunc_begin0       # DW_AT_high_pc
+	.byte	1                               # DW_AT_frame_base
+	.byte	86
+	.byte	6                               # DW_AT_name
+	.byte	0                               # DW_AT_decl_file
+	.byte	6                               # DW_AT_decl_line
+	.long	61                              # DW_AT_type
+                                        # DW_AT_external
+	.byte	4                               # Abbrev [4] 0x3d:0x4 DW_TAG_base_type
+	.byte	4                               # DW_AT_name
+	.byte	5                               # DW_AT_encoding
+	.byte	4                               # DW_AT_byte_size
+	.byte	0                               # End Of Children Mark
+.Ldebug_info_dwo_end1:
+	.section	.debug_abbrev.dwo,"e",@progbits
+	.byte	1                               # Abbreviation Code
+	.byte	65                              # DW_TAG_type_unit
+	.byte	1                               # DW_CHILDREN_yes
+	.byte	19                              # DW_AT_language
+	.byte	5                               # DW_FORM_data2
+	.byte	27                              # DW_AT_comp_dir
+	.byte	37                              # DW_FORM_strx1
+	.byte	118                             # DW_AT_dwo_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	16                              # DW_AT_stmt_list
+	.byte	23                              # DW_FORM_sec_offset
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	2                               # Abbreviation Code
+	.byte	19                              # DW_TAG_structure_type
+	.byte	1                               # DW_CHILDREN_yes
+	.byte	54                              # DW_AT_calling_convention
+	.byte	11                              # DW_FORM_data1
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	11                              # DW_AT_byte_size
+	.byte	11                              # DW_FORM_data1
+	.byte	58                              # DW_AT_decl_file
+	.byte	11                              # DW_FORM_data1
+	.byte	59                              # DW_AT_decl_line
+	.byte	11                              # DW_FORM_data1
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	3                               # Abbreviation Code
+	.byte	13                              # DW_TAG_member
+	.byte	0                               # DW_CHILDREN_no
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	73                              # DW_AT_type
+	.byte	19                              # DW_FORM_ref4
+	.byte	58                              # DW_AT_decl_file
+	.byte	11                              # DW_FORM_data1
+	.byte	59                              # DW_AT_decl_line
+	.byte	11                              # DW_FORM_data1
+	.byte	56                              # DW_AT_data_member_location
+	.byte	11                              # DW_FORM_data1
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	4                               # Abbreviation Code
+	.byte	36                              # DW_TAG_base_type
+	.byte	0                               # DW_CHILDREN_no
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	62                              # DW_AT_encoding
+	.byte	11                              # DW_FORM_data1
+	.byte	11                              # DW_AT_byte_size
+	.byte	11                              # DW_FORM_data1
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	5                               # Abbreviation Code
+	.byte	17                              # DW_TAG_compile_unit
+	.byte	1                               # DW_CHILDREN_yes
+	.byte	37                              # DW_AT_producer
+	.byte	37                              # DW_FORM_strx1
+	.byte	19                              # DW_AT_language
+	.byte	5                               # DW_FORM_data2
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	118                             # DW_AT_dwo_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	6                               # Abbreviation Code
+	.byte	52                              # DW_TAG_variable
+	.byte	0                               # DW_CHILDREN_no
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	73                              # DW_AT_type
+	.byte	19                              # DW_FORM_ref4
+	.byte	63                              # DW_AT_external
+	.byte	25                              # DW_FORM_flag_present
+	.byte	58                              # DW_AT_decl_file
+	.byte	11                              # DW_FORM_data1
+	.byte	59                              # DW_AT_decl_line
+	.byte	11                              # DW_FORM_data1
+	.byte	2                               # DW_AT_location
+	.byte	24                              # DW_FORM_exprloc
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	7                               # Abbreviation Code
+	.byte	19                              # DW_TAG_structure_type
+	.byte	0                               # DW_CHILDREN_no
+	.byte	60                              # DW_AT_declaration
+	.byte	25                              # DW_FORM_flag_present
+	.byte	105                             # DW_AT_signature
+	.byte	32                              # DW_FORM_ref_sig8
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	8                               # Abbreviation Code
+	.byte	46                              # DW_TAG_subprogram
+	.byte	0                               # DW_CHILDREN_no
+	.byte	17                              # DW_AT_low_pc
+	.byte	27                              # DW_FORM_addrx
+	.byte	18                              # DW_AT_high_pc
+	.byte	6                               # DW_FORM_data4
+	.byte	64                              # DW_AT_frame_base
+	.byte	24                              # DW_FORM_exprloc
+	.byte	3                               # DW_AT_name
+	.byte	37                              # DW_FORM_strx1
+	.byte	58                              # DW_AT_decl_file
+	.byte	11                              # DW_FORM_data1
+	.byte	59                              # DW_AT_decl_line
+	.byte	11                              # DW_FORM_data1
+	.byte	73                              # DW_AT_type
+	.byte	19                              # DW_FORM_ref4
+	.byte	63                              # DW_AT_external
+	.byte	25                              # DW_FORM_flag_present
+	.byte	0                               # EOM(1)
+	.byte	0                               # EOM(2)
+	.byte	0                               # EOM(3)
+	.section	.debug_line.dwo,"e",@progbits
+.Ltmp2:
+	.long	.Ldebug_line_end0-.Ldebug_line_start0 # unit length
+.Ldebug_line_start0:
+	.short	5
+	.byte	8
+	.byte	0
+	.long	.Lprologue_end0-.Lprologue_start0
+.Lprologue_start0:
+	.byte	1
+	.byte	1
+	.byte	1
+	.byte	-5
+	.byte	14
+	.byte	1
+	.byte	1
+	.byte	1
+	.byte	8
+	.byte	1
+	.byte	46
+	.byte	0
+	.byte	3
+	.byte	1
+	.byte	8
+	.byte	2
+	.byte	15
+	.byte	5
+	.byte	30
+	.byte	1
+	.ascii	"main.cpp"
+	.byte	0
+	.byte	0
+	.byte	0xbb, 0x74, 0xa3, 0xc2
+	.byte	0x96, 0x0d, 0xaf, 0xa3
+	.byte	0x24, 0x54, 0x7e, 0xbb
+	.byte	0xd8, 0x7d, 0x13, 0xea
+.Lprologue_end0:
+.Ldebug_line_end0:
+	.section	.debug_addr,"",@progbits
+	.long	.Ldebug_addr_end0-.Ldebug_addr_start0 # Length of contribution
+.Ldebug_addr_start0:
+	.short	5                               # DWARF version number
+	.byte	8                               # Address size
+	.byte	0                               # Segment selector size
+.Laddr_table_base0:
+	.quad	globalSplit
+	.quad	.Lfunc_begin0
+.Ldebug_addr_end0:
+	.section	.debug_names,"",@progbits
+	.long	.Lnames_end0-.Lnames_start0     # Header: unit length
+.Lnames_start0:
+	.short	5                               # Header: version
+	.short	0                               # Header: padding
+	.long	1                               # Header: compilation unit count
+	.long	0                               # Header: local type unit count
+	.long	1                               # Header: foreign type unit count
+	.long	4                               # Header: bucket count
+	.long	4                               # Header: name count
+	.long	.Lnames_abbrev_end0-.Lnames_abbrev_start0 # Header: abbreviation table size
+	.long	8                               # Header: augmentation string size
+	.ascii	"LLVM0700"                      # Header: augmentation string
+	.long	.Lcu_begin0                     # Compilation unit 0
+	.quad	-8602855756067469281            # Type unit 0
+	.long	1                               # Bucket 0
+	.long	0                               # Bucket 1
+	.long	2                               # Bucket 2
+	.long	0                               # Bucket 3
+	.long	193495088                       # Hash in Bucket 0
+	.long	1785912162                      # Hash in Bucket 2
+	.long	2090499946                      # Hash in Bucket 2
+	.long	-226250862                      # Hash in Bucket 2
+	.long	.Lskel_string2                  # String in Bucket 0: int
+	.long	.Lskel_string3                  # String in Bucket 2: globalSplit
+	.long	.Lskel_string4                  # String in Bucket 2: main
+	.long	.Lskel_string1                  # String in Bucket 2: ASplit
+	.long	.Lnames1-.Lnames_entries0       # Offset in Bucket 0
+	.long	.Lnames2-.Lnames_entries0       # Offset in Bucket 2
+	.long	.Lnames3-.Lnames_entries0       # Offset in Bucket 2
+	.long	.Lnames0-.Lnames_entries0       # Offset in Bucket 2
+.Lnames_abbrev_start0:
+	.byte	1                               # Abbrev code
+	.byte	36                              # DW_TAG_base_type
+	.byte	2                               # DW_IDX_type_unit
+	.byte	11                              # DW_FORM_data1
+	.byte	3                               # DW_IDX_die_offset
+	.byte	19                              # DW_FORM_ref4
+	.byte	4                               # DW_IDX_parent
+	.byte	25                              # DW_FORM_flag_present
+	.byte	0                               # End of abbrev
+	.byte	0                               # End of abbrev
+	.byte	2                               # Abbrev code
+	.byte	36                              # DW_TAG_base_type
+	.byte	3                               # DW_IDX_die_offset
+	.byte	19                              # DW_FORM_ref4
+	.byte	4                               # DW_IDX_parent
+	.byte	25                              # DW_FORM_flag_present
+	.byte	0                               # End of abbrev
+	.byte	0                               # End of abbrev
+	.byte	3                               # Abbrev code
+	.byte	52                              # DW_TAG_variable
+	.byte	3                               # DW_IDX_die_offset
+	.byte	19                              # DW_FORM_ref4
+	.byte	4                               # DW_IDX_parent
+	.byte	25                              # DW_FORM_flag_present
+	.byte	0                               # End of abbrev
+	.byte	0                               # End of abbrev
+	.byte	4                               # Abbrev code
+	.byte	46                              # DW_TAG_subprogram
+	.byte	3                               # DW_IDX_die_offset
+	.byte	19                              # DW_FORM_ref4
+	.byte	4                               # DW_IDX_parent
+	.byte	25                              # DW_FORM_flag_present
+	.byte	0                               # End of abbrev
+	.byte	0                               # End of abbrev
+	.byte	5                               # Abbrev code
+	.byte	19                              # DW_TAG_structure_type
+	.byte	2                               # DW_IDX_type_unit
+	.byte	11                              # DW_FORM_data1
+	.byte	3                               # DW_IDX_die_offset
+	.byte	19                              # DW_FORM_ref4
+	.byte	4                               # DW_IDX_parent
+	.byte	25                              # DW_FORM_flag_present
+	.byte	0                               # End of abbrev
+	.byte	0                               # End of abbrev
+	.byte	6                               # Abbrev code
+	.byte	19                              # DW_TAG_structure_type
+	.byte	3                               # DW_IDX_die_offset
+	.byte	19                              # DW_FORM_ref4
+	.byte	4                               # DW_IDX_parent
+	.byte	25                              # DW_FORM_flag_present
+	.byte	0                               # End of abbrev
+	.byte	0                               # End of abbrev
+	.byte	0                               # End of abbrev list
+.Lnames_abbrev_end0:
+.Lnames_entries0:
+.Lnames1:
+.L5:
+	.byte	1                               # Abbreviation code
+	.byte	0                               # DW_IDX_type_unit
+	.long	49                              # DW_IDX_die_offset
+.L1:                                    # DW_IDX_parent
+	.byte	2                               # Abbreviation code
+	.long	61                              # DW_IDX_die_offset
+	.byte	0                               # DW_IDX_parent
+                                        # End of list: int
+.Lnames2:
+.L0:
+	.byte	3                               # Abbreviation code
+	.long	26                              # DW_IDX_die_offset
+	.byte	0                               # DW_IDX_parent
+                                        # End of list: globalSplit
+.Lnames3:
+.L2:
+	.byte	4                               # Abbreviation code
+	.long	46                              # DW_IDX_die_offset
+	.byte	0                               # DW_IDX_parent
+                                        # End of list: main
+.Lnames0:
+.L4:
+	.byte	5                               # Abbreviation code
+	.byte	0                               # DW_IDX_type_unit
+	.long	33                              # DW_IDX_die_offset
+.L3:                                    # DW_IDX_parent
+	.byte	6                               # Abbreviation code
+	.long	37                              # DW_IDX_die_offset
+	.byte	0                               # DW_IDX_parent
+                                        # End of list: ASplit
+	.p2align	2, 0x0
+.Lnames_end0:
+	.ident	"clang version 19.0.0git (git@github.com:llvm/llvm-project.git ced1fac8a32e35b63733bda27c7f5b9a2b635403)"
+	.section	".note.GNU-stack","",@progbits
+	.addrsig
+	.section	.debug_line,"",@progbits
+.Lline_table_start0:
diff --git a/bolt/test/X86/dwarf5-df-main-debug-names-ftu-ltu-mix.test b/bolt/test/X86/dwarf5-df-main-debug-names-ftu-ltu-mix.test
new file mode 100644
index 0000000000000..8a8a4b118b8c0
--- /dev/null
+++ b/bolt/test/X86/dwarf5-df-main-debug-names-ftu-ltu-mix.test
@@ -0,0 +1,56 @@
+; RUN: rm -rf %t
+; RUN: mkdir %t
+; RUN: cd %t
+; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-df-debug-names-ftu-ltu-mix-main.s \
+; RUN: -split-dwarf-file=main.dwo -o main.o
+; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-debug-names-ftu-ltu-mix-helper.s -o helper.o
+; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-debug-names-ftu-ltu-mix-helper1.s -o helper1.o
+; RUN: %clang %cflags -gdwarf-5 -gsplit-dwarf=split main.o helper.o helper1.o -o main.exe -fno-pic -no-pie
+; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections --create-debug-names-section=true
+; RUN: llvm-dwarfdump --debug-names main.exe.bolt | FileCheck -check-prefix=BOLT %s
+
+;; Tests BOLT correctly sets foreign TU Index when there are local TUs.
+
+; BOLT:        Compilation Unit offsets [
+; BOLT-NEXT:     CU[0]: {{.+}}
+; BOLT-NEXT:     CU[1]: {{.+}}
+; BOLT-NEXT:     CU[2]: {{.+}}
+; BOLT-NEXT:   ]
+; BOLT-NEXT:   Local Type Unit offsets [
+; BOLT-NEXT:     LocalTU[0]: {{.+}}
+; BOLT-NEXT:     LocalTU[1]: {{.+}}
+; BOLT-NEXT:   ]
+; BOLT-NEXT:   Foreign Type Unit signatures [
+; BOLT-NEXT:     ForeignTU[0]: 0x889c84450dac881f
+; BOLT-NEXT:   ]
+; BOLT:        Name 3 {
+; BOLT-NEXT:     Hash: 0x6A05C500
+; BOLT-NEXT:     String: {{.+}} "globalMono1"
+; BOLT-NEXT:     Entry @ {{.+}} {
+; BOLT-NEXT:       Abbrev: 0x5
+; BOLT-NEXT:       Tag: DW_TAG_variable
+; BOLT-NEXT:       DW_IDX_compile_unit: 0x02
+; BOLT-NEXT:       DW_IDX_die_offset: 0x0000001e
+; BOLT-NEXT:     }
+; BOLT-NEXT:   }
+; BOLT:        Name 6 {
+; BOLT-NEXT:     Hash: 0xF283AF92
+; BOLT-NEXT:     String: {{.+}} "ASplit"
+; BOLT-NEXT:     Entry @ {{.+}} {
+; BOLT-NEXT:       Abbrev: 0x7
+; BOLT-NEXT:       Tag: DW_TAG_structure_type
+; BOLT-NEXT:       DW_IDX_type_unit: 0x02
+; BOLT-NEXT:       DW_IDX_compile_unit: 0x00
+; BOLT-NEXT:       DW_IDX_die_offset: 0x00000021
+; BOLT-NEXT:     }
+; BOLT-NEXT:   }
+; BOLT:        Name 7 {
+; BOLT-NEXT:     Hash: 0xF17F51F
+; BOLT-NEXT:     String: {{.+}} "AMono"
+; BOLT-NEXT:     Entry @ {{.+}} {
+; BOLT-NEXT:       Abbrev: 0x4
+; BOLT-NEXT:       Tag: DW_TAG_structure_type
+; BOLT-NEXT:       DW_IDX_type_unit: 0x00
+; BOLT-NEXT:       DW_IDX_die_offset: 0x00000023
+; BOLT-NEXT:     }
+; BOLT-NEXT:   }
diff --git a/clang-tools-extra/clangd/unittests/DiagnosticsTests.cpp b/clang-tools-extra/clangd/unittests/DiagnosticsTests.cpp
index 2f6dd0611b662..25d2f03e0b366 100644
--- a/clang-tools-extra/clangd/unittests/DiagnosticsTests.cpp
+++ b/clang-tools-extra/clangd/unittests/DiagnosticsTests.cpp
@@ -544,7 +544,7 @@ TEST(DiagnosticTest, RespectsDiagnosticConfig) {
                   Diag(Main.range("ret"),
                        "void function 'x' should not return a value")));
   Config Cfg;
-  Cfg.Diagnostics.Suppress.insert("return-type");
+  Cfg.Diagnostics.Suppress.insert("return-mismatch");
   WithContextValue WithCfg(Config::Key, std::move(Cfg));
   EXPECT_THAT(TU.build().getDiagnostics(),
               ElementsAre(Diag(Main.range(),
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index bce27dc8c4a99..88e552d5c4611 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -183,6 +183,9 @@ Deprecated Compiler Flags
 
 Modified Compiler Flags
 -----------------------
+- Added a new diagnostic flag ``-Wreturn-mismatch`` which is grouped under
+  ``-Wreturn-type``, and moved some of the diagnostics previously controlled by
+  ``-Wreturn-type`` under this new flag. Fixes #GH72116.
 
 Removed Compiler Flags
 -------------------------
diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td
index ba1d4b2352e3d..3f14167d6b846 100644
--- a/clang/include/clang/Basic/DiagnosticGroups.td
+++ b/clang/include/clang/Basic/DiagnosticGroups.td
@@ -617,7 +617,9 @@ def GNURedeclaredEnum : DiagGroup<"gnu-redeclared-enum">;
 def RedundantMove : DiagGroup<"redundant-move">;
 def Register : DiagGroup<"register", [DeprecatedRegister]>;
 def ReturnTypeCLinkage : DiagGroup<"return-type-c-linkage">;
-def ReturnType : DiagGroup<"return-type", [ReturnTypeCLinkage]>;
+def ReturnMismatch : DiagGroup<"return-mismatch">;
+def ReturnType : DiagGroup<"return-type", [ReturnTypeCLinkage, ReturnMismatch]>;
+
 def BindToTemporaryCopy : DiagGroup<"bind-to-temporary-copy",
                                     [CXX98CompatBindToTemporaryCopy]>;
 def SelfAssignmentField : DiagGroup<"self-assign-field">;
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 9b5245695153e..c54105507753e 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -10248,14 +10248,14 @@ def warn_second_parameter_to_va_arg_never_compatible : Warning<
 
 def warn_return_missing_expr : Warning<
   "non-void %select{function|method}1 %0 should return a value">, DefaultError,
-  InGroup<ReturnType>;
+  InGroup<ReturnMismatch>;
 def ext_return_missing_expr : ExtWarn<
   "non-void %select{function|method}1 %0 should return a value">, DefaultError,
-  InGroup<ReturnType>;
+  InGroup<ReturnMismatch>;
 def ext_return_has_expr : ExtWarn<
   "%select{void function|void method|constructor|destructor}1 %0 "
   "should not return a value">,
-  DefaultError, InGroup<ReturnType>;
+  DefaultError, InGroup<ReturnMismatch>;
 def ext_return_has_void_expr : Extension<
   "void %select{function|method|block}1 %0 should not return void expression">;
 def err_return_init_list : Error<
diff --git a/clang/include/clang/InstallAPI/Visitor.h b/clang/include/clang/InstallAPI/Visitor.h
index 71d4d9894f420..9ac948ded3e33 100644
--- a/clang/include/clang/InstallAPI/Visitor.h
+++ b/clang/include/clang/InstallAPI/Visitor.h
@@ -21,6 +21,7 @@
 #include "llvm/ADT/Twine.h"
 
 namespace clang {
+struct AvailabilityInfo;
 namespace installapi {
 
 /// ASTVisitor for collecting declarations that represent global symbols.
@@ -33,6 +34,7 @@ class InstallAPIVisitor final : public ASTConsumer,
         MC(ItaniumMangleContext::create(ASTCtx, ASTCtx.getDiagnostics())),
         Layout(ASTCtx.getTargetInfo().getDataLayoutString()) {}
   void HandleTranslationUnit(ASTContext &ASTCtx) override;
+  bool shouldVisitTemplateInstantiations() const { return true; }
 
   /// Collect global variables.
   bool VisitVarDecl(const VarDecl *D);
@@ -51,9 +53,19 @@ class InstallAPIVisitor final : public ASTConsumer,
   /// is therefore itself not collected.
   bool VisitObjCCategoryDecl(const ObjCCategoryDecl *D);
 
+  /// Collect global c++ declarations.
+  bool VisitCXXRecordDecl(const CXXRecordDecl *D);
+
 private:
   std::string getMangledName(const NamedDecl *D) const;
   std::string getBackendMangledName(llvm::Twine Name) const;
+  std::string getMangledCXXVTableName(const CXXRecordDecl *D) const;
+  std::string getMangledCXXThunk(const GlobalDecl &D,
+                                 const ThunkInfo &Thunk) const;
+  std::string getMangledCXXRTTI(const CXXRecordDecl *D) const;
+  std::string getMangledCXXRTTIName(const CXXRecordDecl *D) const;
+  std::string getMangledCtorDtor(const CXXMethodDecl *D, int Type) const;
+
   std::optional<HeaderType> getAccessForDecl(const NamedDecl *D) const;
   void recordObjCInstanceVariables(
       const ASTContext &ASTCtx, llvm::MachO::ObjCContainerRecord *Record,
@@ -61,6 +73,8 @@ class InstallAPIVisitor final : public ASTConsumer,
       const llvm::iterator_range<
           DeclContext::specific_decl_iterator<ObjCIvarDecl>>
           Ivars);
+  void emitVTableSymbols(const CXXRecordDecl *D, const AvailabilityInfo &Avail,
+                         const HeaderType Access, bool EmittedVTable = false);
 
   InstallAPIContext &Ctx;
   SourceManager &SrcMgr;
diff --git a/clang/lib/AST/DeclPrinter.cpp b/clang/lib/AST/DeclPrinter.cpp
index 43d221968ea3f..b701581b2474a 100644
--- a/clang/lib/AST/DeclPrinter.cpp
+++ b/clang/lib/AST/DeclPrinter.cpp
@@ -1517,6 +1517,11 @@ void DeclPrinter::VisitObjCInterfaceDecl(ObjCInterfaceDecl *OID) {
     return;
   }
   bool eolnOut = false;
+  if (OID->hasAttrs()) {
+    prettyPrintAttributes(OID);
+    Out << "\n";
+  }
+
   Out << "@interface " << I;
 
   if (auto TypeParams = OID->getTypeParamListAsWritten()) {
diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
index a384e191464fe..86304a54473ce 100644
--- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp
+++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
@@ -2959,6 +2959,8 @@ bool ByteCodeExprGen<Emitter>::VisitCXXThisExpr(const CXXThisExpr *E) {
 template <class Emitter>
 bool ByteCodeExprGen<Emitter>::VisitUnaryOperator(const UnaryOperator *E) {
   const Expr *SubExpr = E->getSubExpr();
+  if (SubExpr->getType()->isAnyComplexType())
+    return this->VisitComplexUnaryOperator(E);
   std::optional<PrimType> T = classify(SubExpr->getType());
 
   switch (E->getOpcode()) {
@@ -3109,16 +3111,97 @@ bool ByteCodeExprGen<Emitter>::VisitUnaryOperator(const UnaryOperator *E) {
       return false;
     return DiscardResult ? this->emitPop(*T, E) : this->emitComp(*T, E);
   case UO_Real: // __real x
-    if (T)
-      return this->delegate(SubExpr);
-    return this->emitComplexReal(SubExpr);
+    assert(T);
+    return this->delegate(SubExpr);
   case UO_Imag: { // __imag x
-    if (T) {
-      if (!this->discard(SubExpr))
+    assert(T);
+    if (!this->discard(SubExpr))
+      return false;
+    return this->visitZeroInitializer(*T, SubExpr->getType(), SubExpr);
+  }
+  case UO_Extension:
+    return this->delegate(SubExpr);
+  case UO_Coawait:
+    assert(false && "Unhandled opcode");
+  }
+
+  return false;
+}
+
+template <class Emitter>
+bool ByteCodeExprGen<Emitter>::VisitComplexUnaryOperator(
+    const UnaryOperator *E) {
+  const Expr *SubExpr = E->getSubExpr();
+  assert(SubExpr->getType()->isAnyComplexType());
+
+  if (DiscardResult)
+    return this->discard(SubExpr);
+
+  std::optional<PrimType> ResT = classify(E);
+  auto prepareResult = [=]() -> bool {
+    if (!ResT && !Initializing) {
+      std::optional<unsigned> LocalIndex =
+          allocateLocal(SubExpr, /*IsExtended=*/false);
+      if (!LocalIndex)
+        return false;
+      return this->emitGetPtrLocal(*LocalIndex, E);
+    }
+
+    return true;
+  };
+
+  // The offset of the temporary, if we created one.
+  unsigned SubExprOffset = ~0u;
+  auto createTemp = [=, &SubExprOffset]() -> bool {
+    SubExprOffset = this->allocateLocalPrimitive(SubExpr, PT_Ptr, true, false);
+    if (!this->visit(SubExpr))
+      return false;
+    return this->emitSetLocal(PT_Ptr, SubExprOffset, E);
+  };
+
+  PrimType ElemT = classifyComplexElementType(SubExpr->getType());
+  auto getElem = [=](unsigned Offset, unsigned Index) -> bool {
+    if (!this->emitGetLocal(PT_Ptr, Offset, E))
+      return false;
+    return this->emitArrayElemPop(ElemT, Index, E);
+  };
+
+  switch (E->getOpcode()) {
+  case UO_Minus:
+    if (!prepareResult())
+      return false;
+    if (!createTemp())
+      return false;
+    for (unsigned I = 0; I != 2; ++I) {
+      if (!getElem(SubExprOffset, I))
+        return false;
+      if (!this->emitNeg(ElemT, E))
+        return false;
+      if (!this->emitInitElem(ElemT, I, E))
         return false;
-      return this->visitZeroInitializer(*T, SubExpr->getType(), SubExpr);
     }
+    break;
+
+  case UO_Plus:   // +x
+  case UO_AddrOf: // &x
+  case UO_Deref:  // *x
+    return this->delegate(SubExpr);
+
+  case UO_LNot:
+    if (!this->visit(SubExpr))
+      return false;
+    if (!this->emitComplexBoolCast(SubExpr))
+      return false;
+    if (!this->emitInvBool(E))
+      return false;
+    if (PrimType ET = classifyPrim(E->getType()); ET != PT_Bool)
+      return this->emitCast(PT_Bool, ET, E);
+    return true;
+
+  case UO_Real:
+    return this->emitComplexReal(SubExpr);
 
+  case UO_Imag:
     if (!this->visit(SubExpr))
       return false;
 
@@ -3131,14 +3214,12 @@ bool ByteCodeExprGen<Emitter>::VisitUnaryOperator(const UnaryOperator *E) {
     // Since our _Complex implementation does not map to a primitive type,
     // we sometimes have to do the lvalue-to-rvalue conversion here manually.
     return this->emitArrayElemPop(classifyPrim(E->getType()), 1, E);
-  }
-  case UO_Extension:
-    return this->delegate(SubExpr);
-  case UO_Coawait:
-    assert(false && "Unhandled opcode");
+
+  default:
+    return this->emitInvalid(E);
   }
 
-  return false;
+  return true;
 }
 
 template <class Emitter>
diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.h b/clang/lib/AST/Interp/ByteCodeExprGen.h
index 5977bb5e6ff25..5ad2e74d7c269 100644
--- a/clang/lib/AST/Interp/ByteCodeExprGen.h
+++ b/clang/lib/AST/Interp/ByteCodeExprGen.h
@@ -75,6 +75,7 @@ class ByteCodeExprGen : public ConstStmtVisitor<ByteCodeExprGen<Emitter>, bool>,
   bool VisitGNUNullExpr(const GNUNullExpr *E);
   bool VisitCXXThisExpr(const CXXThisExpr *E);
   bool VisitUnaryOperator(const UnaryOperator *E);
+  bool VisitComplexUnaryOperator(const UnaryOperator *E);
   bool VisitDeclRefExpr(const DeclRefExpr *E);
   bool VisitImplicitValueInitExpr(const ImplicitValueInitExpr *E);
   bool VisitSubstNonTypeTemplateParmExpr(const SubstNonTypeTemplateParmExpr *E);
diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index 7310e3817c79a..82b30b8d81562 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -76,6 +76,7 @@
 #include "llvm/Transforms/Instrumentation/MemProfiler.h"
 #include "llvm/Transforms/Instrumentation/MemorySanitizer.h"
 #include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
+#include "llvm/Transforms/Instrumentation/RemoveTrapsPass.h"
 #include "llvm/Transforms/Instrumentation/SanitizerBinaryMetadata.h"
 #include "llvm/Transforms/Instrumentation/SanitizerCoverage.h"
 #include "llvm/Transforms/Instrumentation/ThreadSanitizer.h"
@@ -83,6 +84,7 @@
 #include "llvm/Transforms/Scalar/EarlyCSE.h"
 #include "llvm/Transforms/Scalar/GVN.h"
 #include "llvm/Transforms/Scalar/JumpThreading.h"
+#include "llvm/Transforms/Scalar/SimplifyCFG.h"
 #include "llvm/Transforms/Utils/Debugify.h"
 #include "llvm/Transforms/Utils/EntryExitInstrumenter.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
@@ -98,6 +100,10 @@ using namespace llvm;
 namespace llvm {
 extern cl::opt<bool> PrintPipelinePasses;
 
+cl::opt<bool> ClRemoveTraps("clang-remove-traps", cl::Optional,
+                            cl::desc("Insert remove-traps pass."),
+                            cl::init(false));
+
 // Experiment to move sanitizers earlier.
 static cl::opt<bool> ClSanitizeOnOptimizerEarlyEP(
     "sanitizer-early-opt-ep", cl::Optional,
@@ -744,6 +750,21 @@ static void addSanitizers(const Triple &TargetTriple,
     // LastEP does not need GlobalsAA.
     PB.registerOptimizerLastEPCallback(SanitizersCallback);
   }
+
+  if (ClRemoveTraps) {
+    // We can optimize after inliner, and PGO profile matching. The hook below
+    // is called at the end `buildFunctionSimplificationPipeline`, which called
+    // from `buildInlinerPipeline`, which called after profile matching.
+    PB.registerScalarOptimizerLateEPCallback(
+        [](FunctionPassManager &FPM, OptimizationLevel Level) {
+          // RemoveTrapsPass expects trap blocks preceded by conditional
+          // branches, which usually is not the case without SimplifyCFG.
+          // TODO: Remove `SimplifyCFGPass` after switching to dedicated
+          // intrinsic.
+          FPM.addPass(SimplifyCFGPass());
+          FPM.addPass(RemoveTrapsPass());
+        });
+  }
 }
 
 void EmitAssemblyHelper::RunOptimizationPipeline(
diff --git a/clang/lib/CodeGen/CodeGenTBAA.cpp b/clang/lib/CodeGen/CodeGenTBAA.cpp
index 1f07205a5af22..a1e14c5f0a8c7 100644
--- a/clang/lib/CodeGen/CodeGenTBAA.cpp
+++ b/clang/lib/CodeGen/CodeGenTBAA.cpp
@@ -286,6 +286,14 @@ CodeGenTBAA::CollectFields(uint64_t BaseOffset,
   /* Things not handled yet include: C++ base classes, bitfields, */
 
   if (const RecordType *TTy = QTy->getAs<RecordType>()) {
+    if (TTy->isUnionType()) {
+      uint64_t Size = Context.getTypeSizeInChars(QTy).getQuantity();
+      llvm::MDNode *TBAAType = getChar();
+      llvm::MDNode *TBAATag = getAccessTagInfo(TBAAAccessInfo(TBAAType, Size));
+      Fields.push_back(
+          llvm::MDBuilder::TBAAStructField(BaseOffset, Size, TBAATag));
+      return true;
+    }
     const RecordDecl *RD = TTy->getDecl()->getDefinition();
     if (RD->hasFlexibleArrayMember())
       return false;
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index fce43430a9137..190782a79a245 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -4638,7 +4638,12 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
     }
   }
 
-  if (offloadDeviceOnly())
+  // All kinds exit now in device-only mode except for non-RDC mode HIP.
+  if (offloadDeviceOnly() &&
+      (!C.isOffloadingHostKind(Action::OFK_HIP) ||
+       !Args.hasFlag(options::OPT_gpu_bundle_output,
+                     options::OPT_no_gpu_bundle_output, true) ||
+       Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false)))
     return C.MakeAction<OffloadAction>(DDeps, types::TY_Nothing);
 
   if (OffloadActions.empty())
@@ -4671,6 +4676,10 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
              nullptr, C.getActiveOffloadKinds());
   }
 
+  // HIP wants '--offload-device-only' to create a fatbinary by default.
+  if (offloadDeviceOnly())
+    return C.MakeAction<OffloadAction>(DDep, types::TY_Nothing);
+
   // If we are unable to embed a single device output into the host, we need to
   // add each device output as a host dependency to ensure they are still built.
   bool SingleDeviceOutput = !llvm::any_of(OffloadActions, [](Action *A) {
diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h
index ebe295f160b2a..984f0cf917e99 100644
--- a/clang/lib/Headers/emmintrin.h
+++ b/clang/lib/Headers/emmintrin.h
@@ -410,8 +410,9 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_xor_pd(__m128d __a,
 }
 
 /// Compares each of the corresponding double-precision values of the
-///    128-bit vectors of [2 x double] for equality. Each comparison yields 0x0
-///    for false, 0xFFFFFFFFFFFFFFFF for true.
+///    128-bit vectors of [2 x double] for equality.
+///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -429,8 +430,9 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_pd(__m128d __a,
 
 /// Compares each of the corresponding double-precision values of the
 ///    128-bit vectors of [2 x double] to determine if the values in the first
-///    operand are less than those in the second operand. Each comparison
-///    yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///    operand are less than those in the second operand.
+///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -949,8 +951,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a,
 /// Compares the lower double-precision floating-point values in each of
 ///    the two 128-bit floating-point vectors of [2 x double] for equality.
 ///
-///    The comparison yields 0 for false, 1 for true. If either of the two
-///    lower double-precision values is NaN, 0 is returned.
+///    The comparison returns 0 for false, 1 for true. If either of the two
+///    lower double-precision values is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -962,8 +964,7 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a,
 /// \param __b
 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
 ///    compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison results. If either of the two
-///    lower double-precision values is NaN, 0 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a,
                                                        __m128d __b) {
   return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
@@ -974,8 +975,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a,
 ///    the value in the first parameter is less than the corresponding value in
 ///    the second parameter.
 ///
-///    The comparison yields 0 for false, 1 for true. If either of the two
-///    lower double-precision values is NaN, 0 is returned.
+///    The comparison returns 0 for false, 1 for true. If either of the two
+///    lower double-precision values is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -987,8 +988,7 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a,
 /// \param __b
 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
 ///    compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison results. If either of the two
-///     lower double-precision values is NaN, 0 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a,
                                                        __m128d __b) {
   return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
@@ -999,8 +999,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a,
 ///    the value in the first parameter is less than or equal to the
 ///    corresponding value in the second parameter.
 ///
-///    The comparison yields 0 for false, 1 for true. If either of the two
-///    lower double-precision values is NaN, 0 is returned.
+///    The comparison returns 0 for false, 1 for true. If either of the two
+///    lower double-precision values is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1012,8 +1012,7 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a,
 /// \param __b
 ///     A 128-bit vector of [2 x double]. The lower double-precision value is
 ///     compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison results. If either of the two
-///     lower double-precision values is NaN, 0 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a,
                                                        __m128d __b) {
   return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
@@ -1024,8 +1023,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a,
 ///    the value in the first parameter is greater than the corresponding value
 ///    in the second parameter.
 ///
-///    The comparison yields 0 for false, 1 for true. If either of the two
-///    lower double-precision values is NaN, 0 is returned.
+///    The comparison returns 0 for false, 1 for true. If either of the two
+///    lower double-precision values is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1037,8 +1036,7 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a,
 /// \param __b
 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
 ///    compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison results. If either of the two
-///     lower double-precision values is NaN, 0 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a,
                                                        __m128d __b) {
   return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
@@ -1049,8 +1047,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a,
 ///    the value in the first parameter is greater than or equal to the
 ///    corresponding value in the second parameter.
 ///
-///    The comparison yields 0 for false, 1 for true. If either of the two
-///    lower double-precision values is NaN, 0 is returned.
+///    The comparison returns 0 for false, 1 for true. If either of the two
+///    lower double-precision values is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1062,8 +1060,7 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a,
 /// \param __b
 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
 ///    compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison results. If either of the two
-///    lower double-precision values is NaN, 0 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a,
                                                        __m128d __b) {
   return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
@@ -1074,7 +1071,7 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a,
 ///    the value in the first parameter is unequal to the corresponding value in
 ///    the second parameter.
 ///
-///    The comparison yields 0 for false, 1 for true. If either of the two
+///    The comparison returns 0 for false, 1 for true. If either of the two
 ///    lower double-precision values is NaN, 1 is returned.
 ///
 /// \headerfile <x86intrin.h>
@@ -1087,18 +1084,17 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a,
 /// \param __b
 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
 ///    compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison results. If either of the two
-///     lower double-precision values is NaN, 1 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a,
                                                         __m128d __b) {
   return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
 }
 
 /// Compares the lower double-precision floating-point values in each of
-///    the two 128-bit floating-point vectors of [2 x double] for equality. The
-///    comparison yields 0 for false, 1 for true.
+///    the two 128-bit floating-point vectors of [2 x double] for equality.
 ///
-///    If either of the two lower double-precision values is NaN, 0 is returned.
+///    The comparison returns 0 for false, 1 for true. If either of the two
+///    lower double-precision values is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1110,8 +1106,7 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a,
 /// \param __b
 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
 ///    compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison results. If either of the two
-///    lower double-precision values is NaN, 0 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a,
                                                         __m128d __b) {
   return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
@@ -1122,8 +1117,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a,
 ///    the value in the first parameter is less than the corresponding value in
 ///    the second parameter.
 ///
-///    The comparison yields 0 for false, 1 for true. If either of the two lower
-///    double-precision values is NaN, 0 is returned.
+///    The comparison returns 0 for false, 1 for true. If either of the two
+///    lower double-precision values is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1135,8 +1130,7 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a,
 /// \param __b
 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
 ///    compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison results. If either of the two
-///    lower double-precision values is NaN, 0 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a,
                                                         __m128d __b) {
   return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
@@ -1147,8 +1141,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a,
 ///    the value in the first parameter is less than or equal to the
 ///    corresponding value in the second parameter.
 ///
-///    The comparison yields 0 for false, 1 for true. If either of the two lower
-///    double-precision values is NaN, 0 is returned.
+///    The comparison returns 0 for false, 1 for true. If either of the two
+///    lower double-precision values is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1160,8 +1154,7 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a,
 /// \param __b
 ///     A 128-bit vector of [2 x double]. The lower double-precision value is
 ///     compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison results. If either of the two
-///     lower double-precision values is NaN, 0 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a,
                                                         __m128d __b) {
   return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
@@ -1172,8 +1165,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a,
 ///    the value in the first parameter is greater than the corresponding value
 ///    in the second parameter.
 ///
-///    The comparison yields 0 for false, 1 for true. If either of the two lower
-///    double-precision values is NaN, 0 is returned.
+///    The comparison returns 0 for false, 1 for true. If either of the two
+///    lower double-precision values is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1185,8 +1178,7 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a,
 /// \param __b
 ///     A 128-bit vector of [2 x double]. The lower double-precision value is
 ///     compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison results. If either of the two
-///     lower double-precision values is NaN, 0 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a,
                                                         __m128d __b) {
   return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
@@ -1197,8 +1189,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a,
 ///    the value in the first parameter is greater than or equal to the
 ///    corresponding value in the second parameter.
 ///
-///    The comparison yields 0 for false, 1 for true.  If either of the two
-///    lower double-precision values is NaN, 0 is returned.
+///    The comparison returns 0 for false, 1 for true.  If either of the two
+///    lower double-precision values is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1210,8 +1202,7 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a,
 /// \param __b
 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
 ///    compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison results. If either of the two
-///    lower double-precision values is NaN, 0 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a,
                                                         __m128d __b) {
   return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
@@ -1222,8 +1213,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a,
 ///    the value in the first parameter is unequal to the corresponding value in
 ///    the second parameter.
 ///
-///    The comparison yields 0 for false, 1 for true. If either of the two lower
-///    double-precision values is NaN, 1 is returned.
+///    The comparison returns 0 for false, 1 for true. If either of the two
+///    lower double-precision values is NaN, 1 is returned.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1235,8 +1226,7 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a,
 /// \param __b
 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
 ///    compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison result. If either of the two
-///    lower double-precision values is NaN, 1 is returned.
+/// \returns An integer containing the comparison result.
 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a,
                                                          __m128d __b) {
   return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
@@ -3023,8 +3013,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a,
 }
 
 /// Compares each of the corresponding 8-bit values of the 128-bit
-///    integer vectors for equality. Each comparison yields 0x0 for false, 0xFF
-///    for true.
+///    integer vectors for equality.
+///
+///    Each comparison yields 0x0 for false, 0xFF for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -3041,8 +3032,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a,
 }
 
 /// Compares each of the corresponding 16-bit values of the 128-bit
-///    integer vectors for equality. Each comparison yields 0x0 for false,
-///    0xFFFF for true.
+///    integer vectors for equality.
+///
+///    Each comparison yields 0x0 for false, 0xFFFF for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -3059,8 +3051,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a,
 }
 
 /// Compares each of the corresponding 32-bit values of the 128-bit
-///    integer vectors for equality. Each comparison yields 0x0 for false,
-///    0xFFFFFFFF for true.
+///    integer vectors for equality.
+///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -3078,8 +3071,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a,
 
 /// Compares each of the corresponding signed 8-bit values of the 128-bit
 ///    integer vectors to determine if the values in the first operand are
-///    greater than those in the second operand. Each comparison yields 0x0 for
-///    false, 0xFF for true.
+///    greater than those in the second operand.
+///
+///    Each comparison yields 0x0 for false, 0xFF for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
diff --git a/clang/lib/Headers/smmintrin.h b/clang/lib/Headers/smmintrin.h
index c52ffb77e33d5..9fb9cc9b01348 100644
--- a/clang/lib/Headers/smmintrin.h
+++ b/clang/lib/Headers/smmintrin.h
@@ -1188,6 +1188,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_testnzc_si128(__m128i __M,
 /// Compares each of the corresponding 64-bit values of the 128-bit
 ///    integer vectors for equality.
 ///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VPCMPEQQ / PCMPEQQ </c> instruction.
@@ -2301,6 +2303,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_minpos_epu16(__m128i __V) {
 ///    integer vectors to determine if the values in the first operand are
 ///    greater than those in the second operand.
 ///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VPCMPGTQ / PCMPGTQ </c> instruction.
diff --git a/clang/lib/Headers/xmmintrin.h b/clang/lib/Headers/xmmintrin.h
index 1f5993e0c368d..8e386a72cde78 100644
--- a/clang/lib/Headers/xmmintrin.h
+++ b/clang/lib/Headers/xmmintrin.h
@@ -474,7 +474,9 @@ _mm_xor_ps(__m128 __a, __m128 __b)
 }
 
 /// Compares two 32-bit float values in the low-order bits of both
-///    operands for equality and returns the result of the comparison in the
+///    operands for equality.
+///
+///    The comparison yields 0x0 for false, 0xFFFFFFFF for true, in the
 ///    low-order bits of a vector [4 x float].
 ///
 /// \headerfile <x86intrin.h>
@@ -498,6 +500,8 @@ _mm_cmpeq_ss(__m128 __a, __m128 __b)
 /// Compares each of the corresponding 32-bit float values of the
 ///    128-bit vectors of [4 x float] for equality.
 ///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions.
@@ -515,8 +519,10 @@ _mm_cmpeq_ps(__m128 __a, __m128 __b)
 
 /// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the value in the first operand is less than the
-///    corresponding value in the second operand and returns the result of the
-///    comparison in the low-order bits of a vector of [4 x float].
+///    corresponding value in the second operand.
+///
+///    The comparison yields 0x0 for false, 0xFFFFFFFF for true, in the
+///    low-order bits of a vector of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -540,6 +546,8 @@ _mm_cmplt_ss(__m128 __a, __m128 __b)
 ///    128-bit vectors of [4 x float] to determine if the values in the first
 ///    operand are less than those in the second operand.
 ///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
@@ -557,9 +565,10 @@ _mm_cmplt_ps(__m128 __a, __m128 __b)
 
 /// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the value in the first operand is less than or
-///    equal to the corresponding value in the second operand and returns the
-///    result of the comparison in the low-order bits of a vector of
-///    [4 x float].
+///    equal to the corresponding value in the second operand.
+///
+///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true, in
+///    the low-order bits of a vector of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -583,6 +592,8 @@ _mm_cmple_ss(__m128 __a, __m128 __b)
 ///    128-bit vectors of [4 x float] to determine if the values in the first
 ///    operand are less than or equal to those in the second operand.
 ///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
@@ -600,8 +611,10 @@ _mm_cmple_ps(__m128 __a, __m128 __b)
 
 /// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the value in the first operand is greater than
-///    the corresponding value in the second operand and returns the result of
-///    the comparison in the low-order bits of a vector of [4 x float].
+///    the corresponding value in the second operand.
+///
+///    The comparison yields 0x0 for false, 0xFFFFFFFF for true, in the
+///    low-order bits of a vector of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -627,6 +640,8 @@ _mm_cmpgt_ss(__m128 __a, __m128 __b)
 ///    128-bit vectors of [4 x float] to determine if the values in the first
 ///    operand are greater than those in the second operand.
 ///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
@@ -644,9 +659,10 @@ _mm_cmpgt_ps(__m128 __a, __m128 __b)
 
 /// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the value in the first operand is greater than
-///    or equal to the corresponding value in the second operand and returns
-///    the result of the comparison in the low-order bits of a vector of
-///    [4 x float].
+///    or equal to the corresponding value in the second operand.
+///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true, in the
+///    low-order bits of a vector of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -672,6 +688,8 @@ _mm_cmpge_ss(__m128 __a, __m128 __b)
 ///    128-bit vectors of [4 x float] to determine if the values in the first
 ///    operand are greater than or equal to those in the second operand.
 ///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
@@ -687,8 +705,10 @@ _mm_cmpge_ps(__m128 __a, __m128 __b)
   return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
 }
 
-/// Compares two 32-bit float values in the low-order bits of both
-///    operands for inequality and returns the result of the comparison in the
+/// Compares two 32-bit float values in the low-order bits of both operands
+///    for inequality.
+///
+///    The comparison yields 0x0 for false, 0xFFFFFFFF for true, in the
 ///    low-order bits of a vector of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
@@ -713,6 +733,8 @@ _mm_cmpneq_ss(__m128 __a, __m128 __b)
 /// Compares each of the corresponding 32-bit float values of the
 ///    128-bit vectors of [4 x float] for inequality.
 ///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c>
@@ -731,8 +753,10 @@ _mm_cmpneq_ps(__m128 __a, __m128 __b)
 
 /// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the value in the first operand is not less than
-///    the corresponding value in the second operand and returns the result of
-///    the comparison in the low-order bits of a vector of [4 x float].
+///    the corresponding value in the second operand.
+///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true, in the
+///    low-order bits of a vector of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -757,6 +781,8 @@ _mm_cmpnlt_ss(__m128 __a, __m128 __b)
 ///    128-bit vectors of [4 x float] to determine if the values in the first
 ///    operand are not less than those in the second operand.
 ///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
@@ -775,9 +801,10 @@ _mm_cmpnlt_ps(__m128 __a, __m128 __b)
 
 /// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the value in the first operand is not less than
-///    or equal to the corresponding value in the second operand and returns
-///    the result of the comparison in the low-order bits of a vector of
-///    [4 x float].
+///    or equal to the corresponding value in the second operand.
+///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true, in the
+///    low-order bits of a vector of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -802,6 +829,8 @@ _mm_cmpnle_ss(__m128 __a, __m128 __b)
 ///    128-bit vectors of [4 x float] to determine if the values in the first
 ///    operand are not less than or equal to those in the second operand.
 ///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
@@ -820,9 +849,10 @@ _mm_cmpnle_ps(__m128 __a, __m128 __b)
 
 /// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the value in the first operand is not greater
-///    than the corresponding value in the second operand and returns the
-///    result of the comparison in the low-order bits of a vector of
-///    [4 x float].
+///    than the corresponding value in the second operand.
+///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true, in the
+///    low-order bits of a vector of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -849,6 +879,8 @@ _mm_cmpngt_ss(__m128 __a, __m128 __b)
 ///    128-bit vectors of [4 x float] to determine if the values in the first
 ///    operand are not greater than those in the second operand.
 ///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
@@ -867,9 +899,10 @@ _mm_cmpngt_ps(__m128 __a, __m128 __b)
 
 /// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the value in the first operand is not greater
-///    than or equal to the corresponding value in the second operand and
-///    returns the result of the comparison in the low-order bits of a vector
-///    of [4 x float].
+///    than or equal to the corresponding value in the second operand.
+///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true, in the
+///    low-order bits of a vector of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -896,6 +929,8 @@ _mm_cmpnge_ss(__m128 __a, __m128 __b)
 ///    128-bit vectors of [4 x float] to determine if the values in the first
 ///    operand are not greater than or equal to those in the second operand.
 ///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
@@ -914,9 +949,10 @@ _mm_cmpnge_ps(__m128 __a, __m128 __b)
 
 /// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the value in the first operand is ordered with
-///    respect to the corresponding value in the second operand and returns the
-///    result of the comparison in the low-order bits of a vector of
-///    [4 x float].
+///    respect to the corresponding value in the second operand.
+///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true, in the
+///    low-order bits of a vector of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -941,6 +977,8 @@ _mm_cmpord_ss(__m128 __a, __m128 __b)
 ///    128-bit vectors of [4 x float] to determine if the values in the first
 ///    operand are ordered with respect to those in the second operand.
 ///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c>
@@ -959,9 +997,10 @@ _mm_cmpord_ps(__m128 __a, __m128 __b)
 
 /// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the value in the first operand is unordered
-///    with respect to the corresponding value in the second operand and
-///    returns the result of the comparison in the low-order bits of a vector
-///    of [4 x float].
+///    with respect to the corresponding value in the second operand.
+///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true, in the
+///    low-order bits of a vector of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -986,6 +1025,8 @@ _mm_cmpunord_ss(__m128 __a, __m128 __b)
 ///    128-bit vectors of [4 x float] to determine if the values in the first
 ///    operand are unordered with respect to those in the second operand.
 ///
+///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c>
@@ -1003,9 +1044,10 @@ _mm_cmpunord_ps(__m128 __a, __m128 __b)
 }
 
 /// Compares two 32-bit float values in the low-order bits of both
-///    operands for equality and returns the result of the comparison.
+///    operands for equality.
 ///
-///    If either of the two lower 32-bit values is NaN, 0 is returned.
+///    The comparison returns 0 for false, 1 for true. If either of the two
+///    lower floating-point values is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1018,8 +1060,7 @@ _mm_cmpunord_ps(__m128 __a, __m128 __b)
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results. If either of the
-///    two lower 32-bit values is NaN, 0 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comieq_ss(__m128 __a, __m128 __b)
 {
@@ -1028,9 +1069,10 @@ _mm_comieq_ss(__m128 __a, __m128 __b)
 
 /// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the first operand is less than the second
-///    operand and returns the result of the comparison.
+///    operand.
 ///
-///    If either of the two lower 32-bit values is NaN, 0 is returned.
+///    The comparison returns 0 for false, 1 for true. If either of the two
+///    lower floating-point values is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1043,8 +1085,7 @@ _mm_comieq_ss(__m128 __a, __m128 __b)
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results. If either of the two
-///     lower 32-bit values is NaN, 0 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comilt_ss(__m128 __a, __m128 __b)
 {
@@ -1053,9 +1094,10 @@ _mm_comilt_ss(__m128 __a, __m128 __b)
 
 /// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the first operand is less than or equal to the
-///    second operand and returns the result of the comparison.
+///    second operand.
 ///
-///    If either of the two lower 32-bit values is NaN, 0 is returned.
+///    The comparison returns 0 for false, 1 for true. If either of the two
+///    lower floating-point values is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1067,8 +1109,7 @@ _mm_comilt_ss(__m128 __a, __m128 __b)
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results. If either of the two
-///     lower 32-bit values is NaN, 0 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comile_ss(__m128 __a, __m128 __b)
 {
@@ -1077,9 +1118,10 @@ _mm_comile_ss(__m128 __a, __m128 __b)
 
 /// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the first operand is greater than the second
-///    operand and returns the result of the comparison.
+///    operand.
 ///
-///    If either of the two lower 32-bit values is NaN, 0 is returned.
+///    The comparison returns 0 for false, 1 for true. If either of the two
+///    lower floating-point values is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1091,8 +1133,7 @@ _mm_comile_ss(__m128 __a, __m128 __b)
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results. If either of the
-///     two lower 32-bit values is NaN, 0 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comigt_ss(__m128 __a, __m128 __b)
 {
@@ -1101,9 +1142,10 @@ _mm_comigt_ss(__m128 __a, __m128 __b)
 
 /// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the first operand is greater than or equal to
-///    the second operand and returns the result of the comparison.
+///    the second operand.
 ///
-///    If either of the two lower 32-bit values is NaN, 0 is returned.
+///    The comparison returns 0 for false, 1 for true. If either of the two
+///    lower floating-point values is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1115,8 +1157,7 @@ _mm_comigt_ss(__m128 __a, __m128 __b)
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results. If either of the two
-///    lower 32-bit values is NaN, 0 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comige_ss(__m128 __a, __m128 __b)
 {
@@ -1125,9 +1166,10 @@ _mm_comige_ss(__m128 __a, __m128 __b)
 
 /// Compares two 32-bit float values in the low-order bits of both
 ///    operands to determine if the first operand is not equal to the second
-///    operand and returns the result of the comparison.
+///    operand.
 ///
-///    If either of the two lower 32-bit values is NaN, 1 is returned.
+///    The comparison returns 0 for false, 1 for true. If either of the two
+///    lower floating-point values is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1139,8 +1181,7 @@ _mm_comige_ss(__m128 __a, __m128 __b)
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results. If either of the
-///     two lower 32-bit values is NaN, 1 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comineq_ss(__m128 __a, __m128 __b)
 {
@@ -1148,10 +1189,10 @@ _mm_comineq_ss(__m128 __a, __m128 __b)
 }
 
 /// Performs an unordered comparison of two 32-bit float values using
-///    the low-order bits of both operands to determine equality and returns
-///    the result of the comparison.
+///    the low-order bits of both operands to determine equality.
 ///
-///    If either of the two lower 32-bit values is NaN, 0 is returned.
+///    The comparison returns 0 for false, 1 for true.  If either of the two
+///    lower floating-point values is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1163,8 +1204,7 @@ _mm_comineq_ss(__m128 __a, __m128 __b)
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results. If either of the two
-///     lower 32-bit values is NaN, 0 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomieq_ss(__m128 __a, __m128 __b)
 {
@@ -1173,9 +1213,10 @@ _mm_ucomieq_ss(__m128 __a, __m128 __b)
 
 /// Performs an unordered comparison of two 32-bit float values using
 ///    the low-order bits of both operands to determine if the first operand is
-///    less than the second operand and returns the result of the comparison.
+///    less than the second operand.
 ///
-///    If either of the two lower 32-bit values is NaN, 0 is returned.
+///    The comparison returns 0 for false, 1 for true.  If either of the two
+///    lower floating-point values is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1187,8 +1228,7 @@ _mm_ucomieq_ss(__m128 __a, __m128 __b)
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results. If either of the two
-///    lower 32-bit values is NaN, 0 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomilt_ss(__m128 __a, __m128 __b)
 {
@@ -1197,10 +1237,10 @@ _mm_ucomilt_ss(__m128 __a, __m128 __b)
 
 /// Performs an unordered comparison of two 32-bit float values using
 ///    the low-order bits of both operands to determine if the first operand is
-///    less than or equal to the second operand and returns the result of the
-///    comparison.
+///    less than or equal to the second operand.
 ///
-///    If either of the two lower 32-bit values is NaN, 0 is returned.
+///    The comparison returns 0 for false, 1 for true.  If either of the two
+///    lower floating-point values is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1212,8 +1252,7 @@ _mm_ucomilt_ss(__m128 __a, __m128 __b)
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results. If either of the two
-///     lower 32-bit values is NaN, 0 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomile_ss(__m128 __a, __m128 __b)
 {
@@ -1222,10 +1261,10 @@ _mm_ucomile_ss(__m128 __a, __m128 __b)
 
 /// Performs an unordered comparison of two 32-bit float values using
 ///    the low-order bits of both operands to determine if the first operand is
-///    greater than the second operand and returns the result of the
-///    comparison.
+///    greater than the second operand.
 ///
-///    If either of the two lower 32-bit values is NaN, 0 is returned.
+///    The comparison returns 0 for false, 1 for true.  If either of the two
+///    lower floating-point values is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1237,8 +1276,7 @@ _mm_ucomile_ss(__m128 __a, __m128 __b)
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results. If either of the two
-///     lower 32-bit values is NaN, 0 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomigt_ss(__m128 __a, __m128 __b)
 {
@@ -1247,10 +1285,10 @@ _mm_ucomigt_ss(__m128 __a, __m128 __b)
 
 /// Performs an unordered comparison of two 32-bit float values using
 ///    the low-order bits of both operands to determine if the first operand is
-///    greater than or equal to the second operand and returns the result of
-///    the comparison.
+///    greater than or equal to the second operand.
 ///
-///    If either of the two lower 32-bit values is NaN, 0 is returned.
+///    The comparison returns 0 for false, 1 for true.  If either of the two
+///    lower floating-point values is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1262,8 +1300,7 @@ _mm_ucomigt_ss(__m128 __a, __m128 __b)
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results. If either of the two
-///     lower 32-bit values is NaN, 0 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomige_ss(__m128 __a, __m128 __b)
 {
@@ -1271,10 +1308,10 @@ _mm_ucomige_ss(__m128 __a, __m128 __b)
 }
 
 /// Performs an unordered comparison of two 32-bit float values using
-///    the low-order bits of both operands to determine inequality and returns
-///    the result of the comparison.
+///    the low-order bits of both operands to determine inequality.
 ///
-///    If either of the two lower 32-bit values is NaN, 1 is returned.
+///    The comparison returns 0 for false, 1 for true.  If either of the two
+///    lower floating-point values is NaN, returns 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1286,8 +1323,7 @@ _mm_ucomige_ss(__m128 __a, __m128 __b)
 /// \param __b
 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
 ///    used in the comparison.
-/// \returns An integer containing the comparison results. If either of the two
-///    lower 32-bit values is NaN, 1 is returned.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomineq_ss(__m128 __a, __m128 __b)
 {
diff --git a/clang/lib/InstallAPI/Frontend.cpp b/clang/lib/InstallAPI/Frontend.cpp
index 1edbdf5bb9836..0d526fe1da666 100644
--- a/clang/lib/InstallAPI/Frontend.cpp
+++ b/clang/lib/InstallAPI/Frontend.cpp
@@ -137,9 +137,9 @@ std::unique_ptr<MemoryBuffer> createInputBuffer(InstallAPIContext &Ctx) {
     else
       OS << "#import ";
     if (H.useIncludeName())
-      OS << "<" << H.getIncludeName() << ">";
+      OS << "<" << H.getIncludeName() << ">\n";
     else
-      OS << "\"" << H.getPath() << "\"";
+      OS << "\"" << H.getPath() << "\"\n";
 
     Ctx.addKnownHeader(H);
   }
diff --git a/clang/lib/InstallAPI/Visitor.cpp b/clang/lib/InstallAPI/Visitor.cpp
index 1f2ef08e5aa25..aded94f7a94a3 100644
--- a/clang/lib/InstallAPI/Visitor.cpp
+++ b/clang/lib/InstallAPI/Visitor.cpp
@@ -7,7 +7,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/InstallAPI/Visitor.h"
+#include "clang/AST/Availability.h"
 #include "clang/AST/ParentMapContext.h"
+#include "clang/AST/VTableBuilder.h"
 #include "clang/Basic/Linkage.h"
 #include "clang/InstallAPI/Frontend.h"
 #include "llvm/ADT/SmallString.h"
@@ -18,6 +20,15 @@
 using namespace llvm;
 using namespace llvm::MachO;
 
+namespace {
+enum class CXXLinkage {
+  ExternalLinkage,
+  LinkOnceODRLinkage,
+  WeakODRLinkage,
+  PrivateLinkage,
+};
+}
+
 namespace clang::installapi {
 
 // Exported NamedDecl needs to have external linkage and
@@ -53,7 +64,7 @@ static bool isInlined(const FunctionDecl *D) {
   return true;
 }
 
-static SymbolFlags getFlags(bool WeakDef, bool ThreadLocal) {
+static SymbolFlags getFlags(bool WeakDef, bool ThreadLocal = false) {
   SymbolFlags Result = SymbolFlags::None;
   if (WeakDef)
     Result |= SymbolFlags::WeakDefined;
@@ -277,8 +288,417 @@ bool InstallAPIVisitor::VisitFunctionDecl(const FunctionDecl *D) {
                                     ? RecordLinkage::Internal
                                     : RecordLinkage::Exported;
   Ctx.Slice->addGlobal(Name, Linkage, GlobalRecord::Kind::Function, Avail, D,
-                       *Access, getFlags(WeakDef, /*ThreadLocal=*/false),
-                       Inlined);
+                       *Access, getFlags(WeakDef), Inlined);
+  return true;
+}
+
+static bool hasVTable(const CXXRecordDecl *D) {
+  // Check if vtable symbols should be emitted, only dynamic classes need
+  // vtables.
+  if (!D->hasDefinition() || !D->isDynamicClass())
+    return false;
+
+  assert(D->isExternallyVisible() && "Should be externally visible");
+  assert(D->isCompleteDefinition() && "Only works on complete definitions");
+
+  const CXXMethodDecl *KeyFunctionD =
+      D->getASTContext().getCurrentKeyFunction(D);
+  // If this class has a key function, then there is a vtable, possibly internal
+  // though.
+  if (KeyFunctionD) {
+    switch (KeyFunctionD->getTemplateSpecializationKind()) {
+    case TSK_Undeclared:
+    case TSK_ExplicitSpecialization:
+    case TSK_ImplicitInstantiation:
+    case TSK_ExplicitInstantiationDefinition:
+      return true;
+    case TSK_ExplicitInstantiationDeclaration:
+      llvm_unreachable(
+          "Unexpected TemplateSpecializationKind for key function");
+    }
+  } else if (D->isAbstract()) {
+    // If the class is abstract and it doesn't have a key function, it is a
+    // 'pure' virtual class. It doesn't need a vtable.
+    return false;
+  }
+
+  switch (D->getTemplateSpecializationKind()) {
+  case TSK_Undeclared:
+  case TSK_ExplicitSpecialization:
+  case TSK_ImplicitInstantiation:
+    return false;
+
+  case TSK_ExplicitInstantiationDeclaration:
+  case TSK_ExplicitInstantiationDefinition:
+    return true;
+  }
+
+  llvm_unreachable("Invalid TemplateSpecializationKind!");
+}
+
+static CXXLinkage getVTableLinkage(const CXXRecordDecl *D) {
+  assert((D->hasDefinition() && D->isDynamicClass()) && "Record has no vtable");
+  assert(D->isExternallyVisible() && "Record should be externally visible");
+  if (D->getVisibility() == HiddenVisibility)
+    return CXXLinkage::PrivateLinkage;
+
+  const CXXMethodDecl *KeyFunctionD =
+      D->getASTContext().getCurrentKeyFunction(D);
+  if (KeyFunctionD) {
+    // If this class has a key function, use that to determine the
+    // linkage of the vtable.
+    switch (KeyFunctionD->getTemplateSpecializationKind()) {
+    case TSK_Undeclared:
+    case TSK_ExplicitSpecialization:
+      if (isInlined(KeyFunctionD))
+        return CXXLinkage::LinkOnceODRLinkage;
+      return CXXLinkage::ExternalLinkage;
+    case TSK_ImplicitInstantiation:
+      llvm_unreachable("No external vtable for implicit instantiations");
+    case TSK_ExplicitInstantiationDefinition:
+      return CXXLinkage::WeakODRLinkage;
+    case TSK_ExplicitInstantiationDeclaration:
+      llvm_unreachable(
+          "Unexpected TemplateSpecializationKind for key function");
+    }
+  }
+
+  switch (D->getTemplateSpecializationKind()) {
+  case TSK_Undeclared:
+  case TSK_ExplicitSpecialization:
+  case TSK_ImplicitInstantiation:
+    return CXXLinkage::LinkOnceODRLinkage;
+  case TSK_ExplicitInstantiationDeclaration:
+  case TSK_ExplicitInstantiationDefinition:
+    return CXXLinkage::WeakODRLinkage;
+  }
+
+  llvm_unreachable("Invalid TemplateSpecializationKind!");
+}
+
+static bool isRTTIWeakDef(const CXXRecordDecl *D) {
+  if (D->hasAttr<WeakAttr>())
+    return true;
+
+  if (D->isAbstract() && D->getASTContext().getCurrentKeyFunction(D) == nullptr)
+    return true;
+
+  if (D->isDynamicClass())
+    return getVTableLinkage(D) != CXXLinkage::ExternalLinkage;
+
+  return false;
+}
+
+static bool hasRTTI(const CXXRecordDecl *D) {
+  if (!D->getASTContext().getLangOpts().RTTI)
+    return false;
+
+  if (!D->hasDefinition())
+    return false;
+
+  if (!D->isDynamicClass())
+    return false;
+
+  // Don't emit weak-def RTTI information. InstallAPI cannot reliably determine
+  // if the final binary will have those weak defined RTTI symbols. This depends
+  // on the optimization level and if the class has been instantiated and used.
+  //
+  // Luckily, the Apple static linker doesn't need those weak defined RTTI
+  // symbols for linking. They are only needed by the runtime linker. That means
+  // they can be safely dropped.
+  if (isRTTIWeakDef(D))
+    return false;
+
+  return true;
+}
+
+std::string
+InstallAPIVisitor::getMangledCXXRTTIName(const CXXRecordDecl *D) const {
+  SmallString<256> Name;
+  raw_svector_ostream NameStream(Name);
+  MC->mangleCXXRTTIName(QualType(D->getTypeForDecl(), 0), NameStream);
+
+  return getBackendMangledName(Name);
+}
+
+std::string InstallAPIVisitor::getMangledCXXRTTI(const CXXRecordDecl *D) const {
+  SmallString<256> Name;
+  raw_svector_ostream NameStream(Name);
+  MC->mangleCXXRTTI(QualType(D->getTypeForDecl(), 0), NameStream);
+
+  return getBackendMangledName(Name);
+}
+
+std::string
+InstallAPIVisitor::getMangledCXXVTableName(const CXXRecordDecl *D) const {
+  SmallString<256> Name;
+  raw_svector_ostream NameStream(Name);
+  MC->mangleCXXVTable(D, NameStream);
+
+  return getBackendMangledName(Name);
+}
+
+std::string
+InstallAPIVisitor::getMangledCXXThunk(const GlobalDecl &D,
+                                      const ThunkInfo &Thunk) const {
+  SmallString<256> Name;
+  raw_svector_ostream NameStream(Name);
+  const auto *Method = cast<CXXMethodDecl>(D.getDecl());
+  if (const auto *Dtor = dyn_cast<CXXDestructorDecl>(Method))
+    MC->mangleCXXDtorThunk(Dtor, D.getDtorType(), Thunk.This, NameStream);
+  else
+    MC->mangleThunk(Method, Thunk, NameStream);
+
+  return getBackendMangledName(Name);
+}
+
+std::string InstallAPIVisitor::getMangledCtorDtor(const CXXMethodDecl *D,
+                                                  int Type) const {
+  SmallString<256> Name;
+  raw_svector_ostream NameStream(Name);
+  GlobalDecl GD;
+  if (const auto *Ctor = dyn_cast<CXXConstructorDecl>(D))
+    GD = GlobalDecl(Ctor, CXXCtorType(Type));
+  else {
+    const auto *Dtor = cast<CXXDestructorDecl>(D);
+    GD = GlobalDecl(Dtor, CXXDtorType(Type));
+  }
+  MC->mangleName(GD, NameStream);
+  return getBackendMangledName(Name);
+}
+
+void InstallAPIVisitor::emitVTableSymbols(const CXXRecordDecl *D,
+                                          const AvailabilityInfo &Avail,
+                                          const HeaderType Access,
+                                          bool EmittedVTable) {
+  if (hasVTable(D)) {
+    EmittedVTable = true;
+    const CXXLinkage VTableLinkage = getVTableLinkage(D);
+    if (VTableLinkage == CXXLinkage::ExternalLinkage ||
+        VTableLinkage == CXXLinkage::WeakODRLinkage) {
+      const std::string Name = getMangledCXXVTableName(D);
+      const bool WeakDef = VTableLinkage == CXXLinkage::WeakODRLinkage;
+      Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
+                           GlobalRecord::Kind::Variable, Avail, D, Access,
+                           getFlags(WeakDef));
+      if (!D->getDescribedClassTemplate() && !D->isInvalidDecl()) {
+        VTableContextBase *VTable = D->getASTContext().getVTableContext();
+        auto AddThunk = [&](GlobalDecl GD) {
+          const ItaniumVTableContext::ThunkInfoVectorTy *Thunks =
+              VTable->getThunkInfo(GD);
+          if (!Thunks)
+            return;
+
+          for (const auto &Thunk : *Thunks) {
+            const std::string Name = getMangledCXXThunk(GD, Thunk);
+            Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
+                                 GlobalRecord::Kind::Function, Avail,
+                                 GD.getDecl(), Access);
+          }
+        };
+
+        for (const auto *Method : D->methods()) {
+          if (isa<CXXConstructorDecl>(Method) || !Method->isVirtual())
+            continue;
+
+          if (auto Dtor = dyn_cast<CXXDestructorDecl>(Method)) {
+            // Skip default destructor.
+            if (Dtor->isDefaulted())
+              continue;
+            AddThunk({Dtor, Dtor_Deleting});
+            AddThunk({Dtor, Dtor_Complete});
+          } else
+            AddThunk(Method);
+        }
+      }
+    }
+  }
+
+  if (!EmittedVTable)
+    return;
+
+  if (hasRTTI(D)) {
+    std::string Name = getMangledCXXRTTI(D);
+    Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
+                         GlobalRecord::Kind::Variable, Avail, D, Access);
+
+    Name = getMangledCXXRTTIName(D);
+    Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
+                         GlobalRecord::Kind::Variable, Avail, D, Access);
+  }
+
+  for (const auto &It : D->bases()) {
+    const CXXRecordDecl *Base =
+        cast<CXXRecordDecl>(It.getType()->castAs<RecordType>()->getDecl());
+    const auto BaseAccess = getAccessForDecl(Base);
+    if (!BaseAccess)
+      continue;
+    const AvailabilityInfo BaseAvail = AvailabilityInfo::createFromDecl(Base);
+    emitVTableSymbols(Base, BaseAvail, *BaseAccess, /*EmittedVTable=*/true);
+  }
+}
+
+bool InstallAPIVisitor::VisitCXXRecordDecl(const CXXRecordDecl *D) {
+  if (!D->isCompleteDefinition())
+    return true;
+
+  // Skip templated classes.
+  if (D->getDescribedClassTemplate() != nullptr)
+    return true;
+
+  // Skip partial templated classes too.
+  if (isa<ClassTemplatePartialSpecializationDecl>(D))
+    return true;
+
+  auto Access = getAccessForDecl(D);
+  if (!Access)
+    return true;
+  const AvailabilityInfo Avail = AvailabilityInfo::createFromDecl(D);
+
+  // Check whether to emit the vtable/rtti symbols.
+  if (isExported(D))
+    emitVTableSymbols(D, Avail, *Access);
+
+  TemplateSpecializationKind ClassSK = TSK_Undeclared;
+  bool KeepInlineAsWeak = false;
+  if (auto *Templ = dyn_cast<ClassTemplateSpecializationDecl>(D)) {
+    ClassSK = Templ->getTemplateSpecializationKind();
+    if (ClassSK == TSK_ExplicitInstantiationDeclaration)
+      KeepInlineAsWeak = true;
+  }
+
+  // Record the class methods.
+  for (const auto *M : D->methods()) {
+    // Inlined methods are usually not emitted, except when it comes from a
+    // specialized template.
+    bool WeakDef = false;
+    if (isInlined(M)) {
+      if (!KeepInlineAsWeak)
+        continue;
+
+      WeakDef = true;
+    }
+
+    if (!isExported(M))
+      continue;
+
+    switch (M->getTemplateSpecializationKind()) {
+    case TSK_Undeclared:
+    case TSK_ExplicitSpecialization:
+      break;
+    case TSK_ImplicitInstantiation:
+      continue;
+    case TSK_ExplicitInstantiationDeclaration:
+      if (ClassSK == TSK_ExplicitInstantiationDeclaration)
+        WeakDef = true;
+      break;
+    case TSK_ExplicitInstantiationDefinition:
+      WeakDef = true;
+      break;
+    }
+
+    if (!M->isUserProvided())
+      continue;
+
+    // Methods that are deleted are not exported.
+    if (M->isDeleted())
+      continue;
+
+    const auto Access = getAccessForDecl(M);
+    if (!Access)
+      return true;
+    const AvailabilityInfo Avail = AvailabilityInfo::createFromDecl(M);
+
+    if (const auto *Ctor = dyn_cast<CXXConstructorDecl>(M)) {
+      // Defaulted constructors are not exported.
+      if (Ctor->isDefaulted())
+        continue;
+
+      std::string Name = getMangledCtorDtor(M, Ctor_Base);
+      Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
+                           GlobalRecord::Kind::Function, Avail, D, *Access,
+                           getFlags(WeakDef));
+
+      if (!D->isAbstract()) {
+        std::string Name = getMangledCtorDtor(M, Ctor_Complete);
+        Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
+                             GlobalRecord::Kind::Function, Avail, D, *Access,
+                             getFlags(WeakDef));
+      }
+
+      continue;
+    }
+
+    if (const auto *Dtor = dyn_cast<CXXDestructorDecl>(M)) {
+      // Defaulted destructors are not exported.
+      if (Dtor->isDefaulted())
+        continue;
+
+      std::string Name = getMangledCtorDtor(M, Dtor_Base);
+      Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
+                           GlobalRecord::Kind::Function, Avail, D, *Access,
+                           getFlags(WeakDef));
+
+      Name = getMangledCtorDtor(M, Dtor_Complete);
+      Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
+                           GlobalRecord::Kind::Function, Avail, D, *Access,
+                           getFlags(WeakDef));
+
+      if (Dtor->isVirtual()) {
+        Name = getMangledCtorDtor(M, Dtor_Deleting);
+        Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
+                             GlobalRecord::Kind::Function, Avail, D, *Access,
+                             getFlags(WeakDef));
+      }
+
+      continue;
+    }
+
+    // Though abstract methods can map to exports, this is generally unexpected.
+    // Except in the case of destructors. Only ignore pure virtuals after
+    // checking if the member function was a destructor.
+    if (M->isPureVirtual())
+      continue;
+
+    std::string Name = getMangledName(M);
+    Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
+                         GlobalRecord::Kind::Function, Avail, D, *Access,
+                         getFlags(WeakDef));
+  }
+
+  if (auto *Templ = dyn_cast<ClassTemplateSpecializationDecl>(D)) {
+    if (!Templ->isExplicitInstantiationOrSpecialization())
+      return true;
+  }
+
+  using var_iter = CXXRecordDecl::specific_decl_iterator<VarDecl>;
+  using var_range = iterator_range<var_iter>;
+  for (const auto *Var : var_range(D->decls())) {
+    // Skip const static member variables.
+    // \code
+    // struct S {
+    //   static const int x = 0;
+    // };
+    // \endcode
+    if (Var->isStaticDataMember() && Var->hasInit())
+      continue;
+
+    // Skip unexported var decls.
+    if (!isExported(Var))
+      continue;
+
+    const std::string Name = getMangledName(Var);
+    const auto Access = getAccessForDecl(Var);
+    if (!Access)
+      return true;
+    const AvailabilityInfo Avail = AvailabilityInfo::createFromDecl(Var);
+    const bool WeakDef = Var->hasAttr<WeakAttr>() || KeepInlineAsWeak;
+
+    Ctx.Slice->addGlobal(Name, RecordLinkage::Exported,
+                         GlobalRecord::Kind::Variable, Avail, D, *Access,
+                         getFlags(WeakDef));
+  }
+
   return true;
 }
 
diff --git a/clang/test/AST/Interp/complex.c b/clang/test/AST/Interp/complex.c
index c9c2efb597453..b5f30b87baa79 100644
--- a/clang/test/AST/Interp/complex.c
+++ b/clang/test/AST/Interp/complex.c
@@ -14,3 +14,8 @@ void blah() {
 _Static_assert((0.0 + 0.0j) == (0.0 + 0.0j), "");
 _Static_assert((0.0 + 0.0j) != (0.0 + 0.0j), ""); // both-error {{static assertion}} \
                                                   // both-note {{evaluates to}}
+
+const _Complex float FC = {0.0f, 0.0f};
+_Static_assert(!FC, "");
+const _Complex float FI = {0, 0};
+_Static_assert(!FI, "");
diff --git a/clang/test/AST/ast-print-objectivec.m b/clang/test/AST/ast-print-objectivec.m
index 05a0a5d4aa74c..a0652f38e713f 100644
--- a/clang/test/AST/ast-print-objectivec.m
+++ b/clang/test/AST/ast-print-objectivec.m
@@ -21,6 +21,10 @@ - (void)MethI __attribute__((availability(macosx,introduced=10.1.0,deprecated=10
 - (void)methodWithArg:(int)x andAnotherOne:(int)y { }
 @end
 
+__attribute__((availability(macosx,introduced=10.1.0,deprecated=10.2)))
+@interface InterfaceWithAttribute
+@end
+
 // CHECK: @protocol P
 // CHECK: - (void)MethP __attribute__((availability(macos, introduced=10.1.0, deprecated=10.2)));
 // CHECK: @end
@@ -45,6 +49,10 @@ - (void)methodWithArg:(int)x andAnotherOne:(int)y { }
 
 // CHECK: @end
 
+// CHECK: __attribute__((availability(macos, introduced=10.1.0, deprecated=10.2)))
+// CHECK: @interface InterfaceWithAttribute
+// CHECK: @end
+
 @class C1;
 struct __attribute__((objc_bridge_related(C1,,))) S1;
 
diff --git a/clang/test/CodeGen/remote-traps.c b/clang/test/CodeGen/remote-traps.c
new file mode 100644
index 0000000000000..6751afb96d25f
--- /dev/null
+++ b/clang/test/CodeGen/remote-traps.c
@@ -0,0 +1,15 @@
+// RUN: %clang_cc1 -O1 -emit-llvm -fsanitize=signed-integer-overflow -fsanitize-trap=signed-integer-overflow %s -o - | FileCheck %s 
+// RUN: %clang_cc1 -O1 -emit-llvm -fsanitize=signed-integer-overflow -fsanitize-trap=signed-integer-overflow -mllvm -clang-remove-traps -mllvm -remove-traps-random-rate=1 %s -o - | FileCheck %s --implicit-check-not="call void @llvm.ubsantrap" --check-prefixes=REMOVE
+
+int test(int x) {
+  return x + 123;
+}
+
+// CHECK-LABEL: define {{.*}}i32 @test(
+// CHECK: call { i32, i1 } @llvm.sadd.with.overflow.i32(
+// CHECK: trap:
+// CHECK-NEXT: call void @llvm.ubsantrap(i8 0)
+// CHECK-NEXT: unreachable
+
+// REMOVE-LABEL: define {{.*}}i32 @test(
+// REMOVE: call { i32, i1 } @llvm.sadd.with.overflow.i32(
diff --git a/clang/test/CodeGen/tbaa-struct.cpp b/clang/test/CodeGen/tbaa-struct.cpp
index 63e4097946448..9b4b7415142d9 100644
--- a/clang/test/CodeGen/tbaa-struct.cpp
+++ b/clang/test/CodeGen/tbaa-struct.cpp
@@ -191,7 +191,7 @@ void copy12(UnionMember2 *a1, UnionMember2 *a2) {
 // (offset, size) = (0,1) char; (4,2) short; (8,4) int; (12,1) char; (16,4) int; (20,4) int
 // CHECK-OLD: [[TS2]] = !{i64 0, i64 1, !{{.*}}, i64 4, i64 2, !{{.*}}, i64 8, i64 4, !{{.*}}, i64 12, i64 1, !{{.*}}, i64 16, i64 4, {{.*}}, i64 20, i64 4, {{.*}}}
 // (offset, size) = (0,8) char; (0,2) char; (4,8) char
-// CHECK-OLD: [[TS3]] = !{i64 0, i64 8, !{{.*}}, i64 0, i64 2, !{{.*}}, i64 4, i64 8, !{{.*}}}
+// CHECK-OLD: [[TS3]] = !{i64 0, i64 12, [[TAG_CHAR]]}
 // CHECK-OLD: [[TS4]] = !{i64 0, i64 1, [[TAG_CHAR]], i64 1, i64 1, [[TAG_CHAR]], i64 2, i64 1, [[TAG_CHAR]]}
 // CHECK-OLD: [[TS5]] = !{i64 0, i64 1, [[TAG_CHAR]], i64 4, i64 1, [[TAG_CHAR]], i64 5, i64 1, [[TAG_CHAR]]}
 // CHECK-OLD: [[TS6]] = !{i64 0, i64 2, [[TAG_CHAR]], i64 2, i64 1, [[TAG_CHAR]], i64 8, i64 8, [[TAG_DOUBLE:!.+]]}
@@ -199,10 +199,8 @@ void copy12(UnionMember2 *a1, UnionMember2 *a2) {
 // CHECK-OLD  [[DOUBLE]] = !{!"double", [[CHAR]], i64 0}
 // CHECK-OLD: [[TS7]] = !{i64 0, i64 1, [[TAG_CHAR]], i64 1, i64 1, [[TAG_CHAR]], i64 2, i64 1, [[TAG_CHAR]], i64 3, i64 1, [[TAG_CHAR]], i64 4, i64 1, [[TAG_CHAR]], i64 8, i64 8, [[TAG_DOUBLE]], i64 16, i64 1, [[TAG_CHAR]]}
 // CHECK-OLD: [[TS8]] = !{i64 0, i64 4, [[TAG_CHAR]], i64 8, i64 8, [[TAG_DOUBLE]]}
-// CHECK-OLD: [[TS9]] = !{i64 0, i64 8, [[TAG_DOUBLE]], i64 0, i64 4, [[TAG_FLOAT:!.+]], i64 8, i64 4, [[TAG_INT]]}
-// CHECK-OLD: [[TAG_FLOAT]]  = !{[[FLOAT:!.+]], [[FLOAT]], i64 0}
-// CHECK-OLD: [[FLOAT]] = !{!"float", [[CHAR]], i64 0}
-// CHECK-OLD: [[TS10]] = !{i64 0, i64 4, [[TAG_INT]], i64 8, i64 8, [[TAG_DOUBLE]], i64 8, i64 4, [[TAG_FLOAT:!.+]]}
+// CHECK-OLD: [[TS9]] = !{i64 0, i64 8, [[TAG_CHAR]], i64 8, i64 4, [[TAG_INT]]}
+// CHECK-OLD: [[TS10]] = !{i64 0, i64 4, [[TAG_INT]], i64 8, i64 8, [[TAG_CHAR]]}
 
 // CHECK-NEW-DAG: [[TYPE_char:!.*]] = !{{{.*}}, i64 1, !"omnipotent char"}
 // CHECK-NEW-DAG: [[TAG_char]] = !{[[TYPE_char]], [[TYPE_char]], i64 0, i64 0}
diff --git a/clang/test/Driver/hip-binding.hip b/clang/test/Driver/hip-binding.hip
index 79ec2039edb74..c116ad80a8ad8 100644
--- a/clang/test/Driver/hip-binding.hip
+++ b/clang/test/Driver/hip-binding.hip
@@ -65,9 +65,18 @@
 // MULTI-D-ONLY-NEXT: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[GFX90a]]"], output: "[[GFX90a_OUT:.+]]"
 //
 // RUN: not %clang -### --target=x86_64-linux-gnu --offload-new-driver -ccc-print-bindings -nogpulib -nogpuinc \
-// RUN:        --offload-arch=gfx90a --offload-arch=gfx908 --offload-device-only -c -o %t %s 2>&1 \
+// RUN:        --no-gpu-bundle-output --offload-arch=gfx90a --offload-arch=gfx908 --offload-device-only -c -o %t %s 2>&1 \
+// RUN: | FileCheck -check-prefix=MULTI-D-ONLY-NO-BUNDLE-O %s
+// MULTI-D-ONLY-NO-BUNDLE-O: error: cannot specify -o when generating multiple output files
+
+// RUN: %clang -### --target=x86_64-linux-gnu --offload-new-driver -ccc-print-bindings -nogpulib -nogpuinc \
+// RUN:        --gpu-bundle-output --offload-arch=gfx90a --offload-arch=gfx908 --offload-device-only -c -o a.out %s 2>&1 \
 // RUN: | FileCheck -check-prefix=MULTI-D-ONLY-O %s
-// MULTI-D-ONLY-O: error: cannot specify -o when generating multiple output files
+//      MULTI-D-ONLY-O: "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[GFX908_OBJ:.+]]"
+// MULTI-D-ONLY-O-NEXT: "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[GFX908_OBJ]]"], output: "[[GFX908:.+]]"
+// MULTI-D-ONLY-O-NEXT: "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT]]"], output: "[[GFX90A_OBJ:.+]]"
+// MULTI-D-ONLY-O-NEXT: "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[GFX90A_OBJ]]"], output: "[[GFX90A:.+]]"
+// MULTI-D-ONLY-O-NEXT: "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[GFX908]]", "[[GFX90A]]"], output: "a.out"
 
 //
 // Check to ensure that we can use '-fsyntax-only' for HIP output with the new
diff --git a/clang/test/InstallAPI/cpp.test b/clang/test/InstallAPI/cpp.test
new file mode 100644
index 0000000000000..4817899095302
--- /dev/null
+++ b/clang/test/InstallAPI/cpp.test
@@ -0,0 +1,530 @@
+// RUN: rm -rf %t
+// RUN: split-file %s %t
+// RUN: sed -e "s|DSTROOT|%/t|g" %t/inputs.json.in > %t/inputs.json
+
+// Invoke C++ with no-rtti.
+// RUN: clang-installapi -target arm64-apple-macos13.1 \
+// RUN: -I%t/usr/include -I%t/usr/local/include -x c++ \
+// RUN: -install_name @rpath/lib/libcpp.dylib -fno-rtti \
+// RUN: %t/inputs.json -o %t/no-rtti.tbd 2>&1 | FileCheck %s --allow-empty
+
+// RUN: llvm-readtapi -compare %t/no-rtti.tbd \
+// RUN: %t/expected-no-rtti.tbd 2>&1 | FileCheck %s --allow-empty
+
+// Invoke C++ with rtti.
+// RUN: clang-installapi -target arm64-apple-macos13.1 \
+// RUN: -I%t/usr/include -I%t/usr/local/include -x c++ \
+// RUN: -install_name @rpath/lib/libcpp.dylib -frtti \
+// RUN: %t/inputs.json -o %t/rtti.tbd 2>&1 | FileCheck %s --allow-empty
+// RUN: llvm-readtapi -compare %t/rtti.tbd \
+// RUN: %t/expected-rtti.tbd 2>&1 | FileCheck %s --allow-empty
+
+// CHECK-NOT: error: 
+// CHECK-NOT: warning: 
+
+//--- usr/include/basic.h
+#ifndef CPP_H
+#define CPP_H
+
+inline int foo(int x) { return x + 1; }
+
+extern int bar(int x) { return x + 1; }
+
+inline int baz(int x) {
+  static const int a[] = {1, 2, 3};
+  return a[x];
+}
+
+extern "C" {
+  int cFunc(const char*);
+}
+
+class Bar {
+public:
+  static const int x = 0;
+  static int y;
+
+  inline int func1(int x) { return x + 2; }
+  inline int func2(int x);
+  int func3(int x);
+};
+
+class __attribute__((visibility("hidden"))) BarI {
+  static const int x = 0;
+  static int y;
+
+  inline int func1(int x) { return x + 2; }
+  inline int func2(int x);
+  int func3(int x);
+};
+
+int Bar::func2(int x) { return x + 3; }
+inline int Bar::func3(int x) { return x + 4; }
+
+int BarI::func2(int x) { return x + 3; }
+inline int BarI::func3(int x) { return x + 4; }
+#endif
+
+//--- usr/local/include/vtable.h
+// Simple test class with no virtual functions. There should be no vtable or
+// RTTI.
+namespace test1 {
+class Simple {
+public:
+  void run();
+};
+} // end namespace test1
+
+// Simple test class with virtual function. There should be an external vtable
+// and RTTI.
+namespace test2 {
+class Simple {
+public:
+  virtual void run();
+};
+} // end namespace test2
+
+// Abstract class with no sub classes. There should be no vtable or RTTI.
+namespace test3 {
+class Abstract {
+public:
+  virtual ~Abstract() {}
+  virtual void run() = 0;
+};
+} // end namespace test3
+
+// Abstract base class with a sub class. There should be weak-def RTTI for the
+// abstract base class.
+// The sub-class should have vtable and RTTI.
+namespace test4 {
+class Base {
+public:
+  virtual ~Base() {}
+  virtual void run() = 0;
+};
+
+class Sub : public Base {
+public:
+  void run() override;
+};
+} // end namespace test4
+
+// Abstract base class with a sub class. Same as above, but with a user defined
+// inlined destructor.
+namespace test5 {
+class Base {
+public:
+  virtual ~Base() {}
+  virtual void run() = 0;
+};
+
+class Sub : public Base {
+public:
+  virtual ~Sub() {}
+  void run() override;
+};
+} // end namespace test5
+
+// Abstract base class with a sub class. Same as above, but with a different
+// inlined key method.
+namespace test6 {
+class Base {
+public:
+  virtual ~Base() {}
+  virtual void run() = 0;
+};
+
+class Sub : public Base {
+public:
+  virtual void foo() {}
+  void run() override;
+};
+} // end namespace test6
+
+// Abstract base class with a sub class. Overloaded method is implemented
+// inline. No vtable or RTTI.
+namespace test7 {
+class Base {
+public:
+  virtual ~Base() {}
+  virtual bool run() = 0;
+};
+
+class Sub : public Base {
+public:
+  bool run() override { return true; }
+};
+} // end namespace test7
+
+// Abstract base class with a sub class. Overloaded method has no inline
+// attribute and is recognized as key method,
+// but is later implemented inline. Weak-def RTTI only.
+namespace test8 {
+class Base {
+public:
+  virtual ~Base() {}
+  virtual void run() = 0;
+};
+
+class Sub : public Base {
+public:
+  void run() override;
+};
+
+inline void Sub::run() {}
+} // end namespace test8
+
+namespace test9 {
+class Base {
+public:
+  virtual ~Base() {}
+  virtual void run1() = 0;
+  virtual void run2() = 0;
+};
+
+class Sub : public Base {
+public:
+  void run1() override {}
+  void run2() override;
+};
+
+inline void Sub::run2() {}
+} // end namespace test9
+
+namespace test10 {
+class Base {
+public:
+  virtual ~Base() {}
+  virtual void run1() = 0;
+  virtual void run2() = 0;
+};
+
+class Sub : public Base {
+public:
+  void run1() override {}
+  inline void run2() override;
+};
+
+void Sub::run2() {}
+} // end namespace test10
+
+namespace test11 {
+class Base {
+public:
+  virtual ~Base() {}
+  virtual void run1() = 0;
+  virtual void run2() = 0;
+  virtual void run3() = 0;
+};
+
+class Sub : public Base {
+public:
+  void run1() override {}
+  void run2() override;
+  void run3() override;
+};
+
+inline void Sub::run2() {}
+} // end namespace test11
+
+namespace test12 {
+template <class T> class Simple {
+public:
+  virtual void foo() {}
+};
+extern template class Simple<int>;
+} // end namespace test12
+
+namespace test13 {
+class Base {
+public:
+  virtual ~Base() {}
+  virtual void run1() = 0;
+  virtual void run2() {};
+  virtual void run3(); // key function.
+};
+
+class Sub : public Base {
+public:
+  void run1() override {}
+  void run2() override {}
+};
+
+} // end namespace test13
+
+namespace test14 {
+
+class __attribute__((visibility("hidden"))) Base
+{
+public:
+    Base() {}
+    virtual ~Base(); // keyfunction.
+    virtual void run1() const = 0;
+};
+
+class Sub : public Base
+{
+public:
+    Sub();
+    virtual ~Sub();
+    virtual void run1() const;
+    void run2() const {}
+};
+
+} // end namespace test14
+
+namespace test15 {
+
+class Base {
+public:
+  virtual ~Base() {}
+  virtual void run() {};
+};
+
+class Base1 {
+public:
+  virtual ~Base1() {}
+  virtual void run1() {};
+};
+
+class Sub : public Base, public Base1 {
+public:
+  Sub() {}
+  ~Sub();
+  void run() override;
+  void run1() override;
+};
+
+class Sub1 : public Base, public Base1 {
+public:
+  Sub1() {}
+  ~Sub1() = default;
+  void run() override;
+  void run1() override;
+};
+
+} // end namespace test15
+
+//--- usr/local/include/templates.h
+#ifndef TEMPLATES_H
+#define TEMPLATES_H
+
+namespace templates {
+
+// Full specialization.
+template <class T> int foo1(T a) { return 1; }
+template <> int foo1<int>(int a);
+extern template int foo1<short>(short a);
+
+template <class T> int foo2(T a);
+
+// Partial specialization.
+template <class A, class B> class Partial {
+  static int run(A a, B b) { return a + b; }
+};
+
+template <class A> class Partial<A, int> {
+  static int run(A a, int b) { return a - b; }
+};
+
+template <class T> class Foo {
+public:
+  Foo();
+  ~Foo();
+};
+
+template <class T> class Bar {
+public:
+  Bar();
+  ~Bar() {}
+
+  inline int bazinga() { return 7; }
+};
+
+extern template class Bar<int>;
+
+class Bazz {
+public:
+  Bazz() {}
+
+  template <class T> int buzz(T a);
+
+  float implicit() const { return foo1(0.0f); }
+};
+
+template <class T> int Bazz::buzz(T a) { return sizeof(T); }
+
+template <class T> struct S { static int x; };
+
+template <class T> int S<T>::x = 0;
+
+} // end namespace templates.
+
+#endif
+
+
+//--- inputs.json.in
+{
+  "headers": [ {
+    "path" : "DSTROOT/usr/include/basic.h",
+    "type" : "public"
+  }, 
+  {
+    "path" : "DSTROOT/usr/local/include/vtable.h",
+    "type" : "private"
+  },
+  {
+    "path" : "DSTROOT/usr/local/include/templates.h",
+    "type" : "private"
+  }
+  ],
+  "version": "3"
+}
+
+//--- expected-no-rtti.tbd
+{
+  "main_library": {
+    "compatibility_versions": [
+      {
+        "version": "0"
+      }
+    ],
+    "current_versions": [
+      {
+        "version": "0"
+      }
+    ],
+    "exported_symbols": [
+      {
+        "data": {
+          "global": [
+            "__ZTVN6test143SubE", "__ZTVN6test113SubE", "__ZTVN5test26SimpleE",
+            "__ZTVN5test53SubE", "__ZTVN6test154Sub1E", "__ZTVN6test153SubE",
+            "__ZN3Bar1yE", "__ZTVN5test43SubE", "__ZTVN5test63SubE",
+            "__ZTVN6test134BaseE"
+          ],
+          "weak": [
+            "__ZTVN6test126SimpleIiEE"
+          ]
+        },
+        "text": {
+          "global": [
+            "__ZN6test153Sub3runEv", "__ZN6test154Sub13runEv",
+            "__Z3bari", "__ZThn8_N6test153SubD1Ev",
+            "__ZNK6test143Sub4run1Ev", "__ZN6test154Sub14run1Ev",
+            "__ZThn8_N6test153Sub4run1Ev", "__ZN6test143SubD1Ev",
+            "__ZN6test134Base4run3Ev", "__ZN5test16Simple3runEv",
+            "__ZN5test43Sub3runEv", "__ZN6test113Sub4run3Ev", "__ZN6test153SubD2Ev",
+            "__ZN5test53Sub3runEv", "__ZN6test153SubD1Ev", "__ZN6test143SubC1Ev",
+            "__ZN9templates4foo1IiEEiT_", "__ZN6test143SubC2Ev", "__ZN5test63Sub3runEv",
+            "__ZN5test26Simple3runEv", "__ZN6test153SubD0Ev",
+            "__ZN6test143SubD2Ev", "__ZN6test153Sub4run1Ev", "__ZN6test143SubD0Ev",
+            "__ZThn8_N6test153SubD0Ev", "__ZThn8_N6test154Sub14run1Ev", "_cFunc"
+          ],
+          "weak": [
+            "__ZN9templates3BarIiED2Ev", "__ZN9templates3BarIiEC2Ev",
+            "__ZN9templates3BarIiEC1Ev", "__ZN9templates3BarIiED1Ev",
+            "__ZN6test126SimpleIiE3fooEv", "__ZN9templates3BarIiE7bazingaEv",
+            "__ZN9templates4foo1IsEEiT_"
+          ]
+        }
+      }
+    ],
+    "flags": [
+      {
+        "attributes": [
+          "not_app_extension_safe"
+        ]
+      }
+    ],
+    "install_names": [
+      {
+        "name": "@rpath/lib/libcpp.dylib"
+      }
+    ],
+    "target_info": [
+      {
+        "min_deployment": "13.1",
+        "target": "arm64-macos"
+      }
+    ]
+  },
+  "tapi_tbd_version": 5
+}
+
+//--- expected-rtti.tbd
+{
+  "main_library": {
+    "compatibility_versions": [
+      {
+        "version": "0"
+      }
+    ],
+    "current_versions": [
+      {
+        "version": "0"
+      }
+    ],
+    "exported_symbols": [
+      {
+        "data": {
+          "global": [
+            "__ZTVN6test143SubE", "__ZTIN5test63SubE", "__ZTSN5test26SimpleE",
+            "__ZTIN6test153SubE", "__ZTVN6test113SubE", "__ZTIN5test43SubE",
+            "__ZTIN6test134BaseE", "__ZTVN5test26SimpleE", "__ZTIN5test26SimpleE",
+            "__ZTSN6test134BaseE", "__ZTVN6test154Sub1E", "__ZTVN5test43SubE",
+            "__ZTVN5test63SubE", "__ZTSN5test43SubE", "__ZTSN6test113SubE",
+            "__ZTIN6test154Sub1E", "__ZTSN6test153SubE", "__ZTSN5test63SubE",
+            "__ZTSN6test154Sub1E", "__ZTIN6test113SubE", "__ZTSN6test143SubE",
+            "__ZTVN5test53SubE", "__ZTIN6test143SubE", "__ZTVN6test153SubE",
+            "__ZTIN5test53SubE", "__ZN3Bar1yE", "__ZTVN6test134BaseE",
+            "__ZTSN5test53SubE"
+          ],
+          "weak": [
+            "__ZTVN6test126SimpleIiEE"
+          ]
+        },
+        "text": {
+          "global": [
+            "__ZN6test154Sub13runEv", "__ZN6test153Sub3runEv", "__ZNK6test143Sub4run1Ev",
+            "__ZN6test134Base4run3Ev", "__ZN5test16Simple3runEv", "__ZN6test153SubD2Ev",
+            "__ZN6test143SubC2Ev", "__ZN5test63Sub3runEv", "__ZN6test153SubD0Ev", 
+            "__ZN6test143SubD2Ev", "__ZThn8_N6test154Sub14run1Ev",
+            "__ZThn8_N6test153SubD0Ev", "__Z3bari", "__ZThn8_N6test153SubD1Ev",
+            "__ZN6test154Sub14run1Ev", "__ZThn8_N6test153Sub4run1Ev",
+            "__ZN6test143SubD1Ev", "__ZN5test43Sub3runEv",
+            "__ZN6test113Sub4run3Ev", "__ZN5test53Sub3runEv", "__ZN6test143SubC1Ev",
+            "__ZN6test153SubD1Ev", "__ZN9templates4foo1IiEEiT_", "__ZN5test26Simple3runEv",
+            "__ZN6test153Sub4run1Ev", "__ZN6test143SubD0Ev", "_cFunc"
+          ],
+          "weak": [
+            "__ZN9templates3BarIiEC2Ev", "__ZN9templates3BarIiEC1Ev",
+            "__ZN9templates3BarIiED1Ev", "__ZN6test126SimpleIiE3fooEv",
+            "__ZN9templates4foo1IsEEiT_", "__ZN9templates3BarIiED2Ev",
+            "__ZN9templates3BarIiE7bazingaEv"
+          ]
+        }
+      }
+    ],
+    "flags": [
+      {
+        "attributes": [
+          "not_app_extension_safe"
+        ]
+      }
+    ],
+    "install_names": [
+      {
+        "name": "@rpath/lib/libcpp.dylib"
+      }
+    ],
+    "target_info": [
+      {
+        "min_deployment": "13.1",
+        "target": "arm64-macos"
+      }
+    ]
+  },
+  "tapi_tbd_version": 5
+}
+
diff --git a/clang/test/Misc/warning-wall.c b/clang/test/Misc/warning-wall.c
index 05a82770e26de..4909ab034ef30 100644
--- a/clang/test/Misc/warning-wall.c
+++ b/clang/test/Misc/warning-wall.c
@@ -44,6 +44,7 @@ CHECK-NEXT:      -Wreorder-ctor
 CHECK-NEXT:      -Wreorder-init-list
 CHECK-NEXT:    -Wreturn-type
 CHECK-NEXT:      -Wreturn-type-c-linkage
+CHECK-NEXT:      -Wreturn-mismatch
 CHECK-NEXT:    -Wself-assign
 CHECK-NEXT:      -Wself-assign-overloaded
 CHECK-NEXT:      -Wself-assign-field
diff --git a/clang/test/Sema/return-type-mismatch.c b/clang/test/Sema/return-type-mismatch.c
new file mode 100644
index 0000000000000..79a625d7df1f5
--- /dev/null
+++ b/clang/test/Sema/return-type-mismatch.c
@@ -0,0 +1,36 @@
+// RUN: %clang_cc1 -Wreturn-type -Wno-return-mismatch -fsyntax-only -verify=return-type %s
+// RUN: %clang_cc1 -Wno-return-type -Wreturn-mismatch -fsyntax-only -verify=return-mismatch %s
+
+int foo(void) __attribute__((noreturn));
+int bar(void);
+
+void test1(void) {
+  return 1; // return-mismatch-warning{{void function 'test1' should not return a value}}
+}
+
+int test2(void) { 
+    return; // return-mismatch-warning{{non-void function 'test2' should return a value}}
+} 
+
+int test3(void) { 
+    // return-type-warning@+1 {{non-void function does not return a value}}
+} 
+
+int test4(void) {
+    (void)(bar() || foo()); // return-type-warning@+1 {{non-void function does not return a value in all control paths}}
+} 
+
+void test5(void) {
+} // no-warning
+
+int test6(void) {
+  return 0; // no-warning
+}
+
+int test7(void) {
+  foo(); // no warning
+}
+
+int test8(void) {
+  bar(); // return-type-warning@+1 {{non-void function does not return a value}}
+}
diff --git a/clang/tools/clang-installapi/Options.cpp b/clang/tools/clang-installapi/Options.cpp
index b9c36eab2ad3b..701ab81c57c3d 100644
--- a/clang/tools/clang-installapi/Options.cpp
+++ b/clang/tools/clang-installapi/Options.cpp
@@ -99,6 +99,33 @@ bool Options::processLinkerOptions(InputArgList &Args) {
   return true;
 }
 
+bool Options::processFrontendOptions(InputArgList &Args) {
+  // Do not claim any arguments, as they will be passed along for CC1
+  // invocations.
+  if (auto *A = Args.getLastArgNoClaim(OPT_x)) {
+    FEOpts.LangMode = llvm::StringSwitch<clang::Language>(A->getValue())
+                          .Case("c", clang::Language::C)
+                          .Case("c++", clang::Language::CXX)
+                          .Case("objective-c", clang::Language::ObjC)
+                          .Case("objective-c++", clang::Language::ObjCXX)
+                          .Default(clang::Language::Unknown);
+
+    if (FEOpts.LangMode == clang::Language::Unknown) {
+      Diags->Report(clang::diag::err_drv_invalid_value)
+          << A->getAsString(Args) << A->getValue();
+      return false;
+    }
+  }
+  for (auto *A : Args.filtered(OPT_ObjC, OPT_ObjCXX)) {
+    if (A->getOption().matches(OPT_ObjC))
+      FEOpts.LangMode = clang::Language::ObjC;
+    else
+      FEOpts.LangMode = clang::Language::ObjCXX;
+  }
+
+  return true;
+}
+
 Options::Options(DiagnosticsEngine &Diag, FileManager *FM,
                  InputArgList &ArgList)
     : Diags(&Diag), FM(FM) {
@@ -108,7 +135,10 @@ Options::Options(DiagnosticsEngine &Diag, FileManager *FM,
   if (!processLinkerOptions(ArgList))
     return;
 
-  /// Any remaining arguments should be handled by invoking the clang frontend.
+  if (!processFrontendOptions(ArgList))
+    return;
+
+  /// Any unclaimed arguments should be handled by invoking the clang frontend.
   for (const Arg *A : ArgList) {
     if (A->isClaimed())
       continue;
@@ -132,6 +162,7 @@ InstallAPIContext Options::createContext() {
   Ctx.BA.AppExtensionSafe = LinkerOpts.AppExtensionSafe;
   Ctx.FT = DriverOpts.OutFT;
   Ctx.OutputLoc = DriverOpts.OutputPath;
+  Ctx.LangMode = FEOpts.LangMode;
 
   // Process inputs.
   for (const std::string &ListPath : DriverOpts.FileLists) {
diff --git a/clang/tools/clang-installapi/Options.h b/clang/tools/clang-installapi/Options.h
index f68addf197288..9d4d841284fd1 100644
--- a/clang/tools/clang-installapi/Options.h
+++ b/clang/tools/clang-installapi/Options.h
@@ -62,15 +62,22 @@ struct LinkerOptions {
   bool IsDylib = false;
 };
 
+struct FrontendOptions {
+  /// \brief The language mode to parse headers in.
+  Language LangMode = Language::ObjC;
+};
+
 class Options {
 private:
   bool processDriverOptions(llvm::opt::InputArgList &Args);
   bool processLinkerOptions(llvm::opt::InputArgList &Args);
+  bool processFrontendOptions(llvm::opt::InputArgList &Args);
 
 public:
   /// The various options grouped together.
   DriverOptions DriverOpts;
   LinkerOptions LinkerOpts;
+  FrontendOptions FEOpts;
 
   Options() = delete;
 
diff --git a/cmake/Modules/LLVMVersion.cmake b/cmake/Modules/LLVMVersion.cmake
new file mode 100644
index 0000000000000..5e28283fbc1c6
--- /dev/null
+++ b/cmake/Modules/LLVMVersion.cmake
@@ -0,0 +1,15 @@
+# The LLVM Version number information
+
+if(NOT DEFINED LLVM_VERSION_MAJOR)
+  set(LLVM_VERSION_MAJOR 19)
+endif()
+if(NOT DEFINED LLVM_VERSION_MINOR)
+  set(LLVM_VERSION_MINOR 0)
+endif()
+if(NOT DEFINED LLVM_VERSION_PATCH)
+  set(LLVM_VERSION_PATCH 0)
+endif()
+if(NOT DEFINED LLVM_VERSION_SUFFIX)
+  set(LLVM_VERSION_SUFFIX git)
+endif()
+
diff --git a/compiler-rt/cmake/builtin-config-ix.cmake b/compiler-rt/cmake/builtin-config-ix.cmake
index d10222b7530a8..33c97b1ac28af 100644
--- a/compiler-rt/cmake/builtin-config-ix.cmake
+++ b/compiler-rt/cmake/builtin-config-ix.cmake
@@ -1,4 +1,5 @@
 include(BuiltinTests)
+include(CheckIncludeFiles)
 include(CheckCSourceCompiles)
 
 # Make all the tests only check the compiler
@@ -43,6 +44,8 @@ void foo(void)  __arm_streaming_compatible {
 }
 ")
 
+check_include_files("sys/auxv.h"    COMPILER_RT_HAS_AUXV)
+
 if(ANDROID)
   set(OS_NAME "Android")
 else()
diff --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h
index 069b5f64475db..9e1fd6d6dca3c 100644
--- a/compiler-rt/lib/scudo/standalone/combined.h
+++ b/compiler-rt/lib/scudo/standalone/combined.h
@@ -1081,6 +1081,11 @@ class Allocator {
     // An array of Size (at least one) elements of type Entry is immediately
     // following to this struct.
   };
+  static_assert(sizeof(AllocationRingBuffer) %
+                        alignof(typename AllocationRingBuffer::Entry) ==
+                    0,
+                "invalid alignment");
+
   // Pointer to memory mapped area starting with AllocationRingBuffer struct,
   // and immediately followed by Size elements of type Entry.
   atomic_uptr RingBufferAddress = {};
@@ -1553,16 +1558,6 @@ class Allocator {
     constexpr u32 kFramesPerStack = 16;
     static_assert(isPowerOfTwo(kFramesPerStack));
 
-    // We need StackDepot to be aligned to 8-bytes so the ring we store after
-    // is correctly assigned.
-    static_assert(sizeof(StackDepot) % alignof(atomic_u64) == 0);
-
-    // Make sure the maximum sized StackDepot fits withint a uintptr_t to
-    // simplify the overflow checking.
-    static_assert(sizeof(StackDepot) + UINT32_MAX * sizeof(atomic_u64) *
-                                           UINT32_MAX * sizeof(atomic_u32) <
-                  UINTPTR_MAX);
-
     if (AllocationRingBufferSize > kMaxU32Pow2 / kStacksPerRingBufferEntry)
       return;
     u32 TabSize = static_cast<u32>(roundUpPowerOfTwo(kStacksPerRingBufferEntry *
@@ -1595,10 +1590,6 @@ class Allocator {
 
     atomic_store(&RingBufferAddress, reinterpret_cast<uptr>(RB),
                  memory_order_release);
-    static_assert(sizeof(AllocationRingBuffer) %
-                          alignof(typename AllocationRingBuffer::Entry) ==
-                      0,
-                  "invalid alignment");
   }
 
   void unmapRingBuffer() {
diff --git a/compiler-rt/lib/scudo/standalone/stack_depot.h b/compiler-rt/lib/scudo/standalone/stack_depot.h
index 620137e44f372..cf3cabf7085b6 100644
--- a/compiler-rt/lib/scudo/standalone/stack_depot.h
+++ b/compiler-rt/lib/scudo/standalone/stack_depot.h
@@ -199,6 +199,10 @@ class alignas(atomic_u64) StackDepot {
   void enable() NO_THREAD_SAFETY_ANALYSIS { RingEndMu.unlock(); }
 };
 
+// We need StackDepot to be aligned to 8-bytes so the ring we store after
+// is correctly assigned.
+static_assert(sizeof(StackDepot) % alignof(atomic_u64) == 0);
+
 } // namespace scudo
 
 #endif // SCUDO_STACK_DEPOT_H_
diff --git a/flang/include/flang/Evaluate/integer.h b/flang/include/flang/Evaluate/integer.h
index 977d35c7eecf4..7395645701265 100644
--- a/flang/include/flang/Evaluate/integer.h
+++ b/flang/include/flang/Evaluate/integer.h
@@ -27,6 +27,10 @@
 #include <string>
 #include <type_traits>
 
+// Some environments, viz. glibc 2.17, allow the macro HUGE
+// to leak out of <math.h>.
+#undef HUGE
+
 namespace Fortran::evaluate::value {
 
 // Implements an integer as an assembly of smaller host integer parts
@@ -150,7 +154,10 @@ class Integer {
           }
         }
       } else {
-        INT signExtension{-(n < 0)};
+        // Avoid left shifts of negative signed values (that's an undefined
+        // behavior in C++).
+        auto signExtension{std::make_unsigned_t<INT>(n < 0)};
+        signExtension = ~signExtension + 1;
         static_assert(nBits >= partBits);
         if constexpr (nBits > partBits) {
           signExtension <<= nBits - partBits;
@@ -474,7 +481,12 @@ class Integer {
     SINT n = ToUInt<UINT>();
     constexpr std::size_t maxBits{CHAR_BIT * sizeof n};
     if constexpr (bits < maxBits) {
-      n |= -(n >> (bits - 1)) << bits;
+      // Avoid left shifts of negative signed values (that's an undefined
+      // behavior in C++).
+      auto u{std::make_unsigned_t<SINT>(ToUInt())};
+      u = (u >> (bits - 1)) << (bits - 1); // Get the sign bit only.
+      u = ~u + 1; // Negate top bits if not 0.
+      n |= static_cast<SINT>(u);
     }
     return n;
   }
diff --git a/flang/include/flang/Evaluate/real.h b/flang/include/flang/Evaluate/real.h
index 5266bd0ef64bf..d0da9634651f3 100644
--- a/flang/include/flang/Evaluate/real.h
+++ b/flang/include/flang/Evaluate/real.h
@@ -18,6 +18,10 @@
 #include <limits>
 #include <string>
 
+// Some environments, viz. glibc 2.17, allow the macro HUGE
+// to leak out of <math.h>.
+#undef HUGE
+
 namespace llvm {
 class raw_ostream;
 }
diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
index 7cb99d61a686e..ca15b4bc34b29 100644
--- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
+++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
@@ -338,6 +338,7 @@ struct IntrinsicLibrary {
   mlir::Value genSign(mlir::Type, llvm::ArrayRef<mlir::Value>);
   mlir::Value genSind(mlir::Type, llvm::ArrayRef<mlir::Value>);
   fir::ExtendedValue genSize(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
+  fir::ExtendedValue genSizeOf(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
   mlir::Value genSpacing(mlir::Type resultType,
                          llvm::ArrayRef<mlir::Value> args);
   fir::ExtendedValue genSpread(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
diff --git a/flang/include/flang/Optimizer/Dialect/CMakeLists.txt b/flang/include/flang/Optimizer/Dialect/CMakeLists.txt
index fe9864a26295d..f00993d4d3778 100644
--- a/flang/include/flang/Optimizer/Dialect/CMakeLists.txt
+++ b/flang/include/flang/Optimizer/Dialect/CMakeLists.txt
@@ -1,6 +1,10 @@
 # This replicates part of the add_mlir_dialect cmake function from MLIR that
 # cannot be used her because it expects to be run inside MLIR directory which
 # is not the case for FIR.
+set(LLVM_TARGET_DEFINITIONS FIRDialect.td)
+mlir_tablegen(FIRDialect.h.inc -gen-dialect-decls -dialect=fir)
+mlir_tablegen(FIRDialect.cpp.inc -gen-dialect-defs -dialect=fir)
+
 set(LLVM_TARGET_DEFINITIONS FIRAttr.td)
 mlir_tablegen(FIREnumAttr.h.inc -gen-enum-decls)
 mlir_tablegen(FIREnumAttr.cpp.inc -gen-enum-defs)
diff --git a/flang/include/flang/Optimizer/Dialect/FIRAttr.td b/flang/include/flang/Optimizer/Dialect/FIRAttr.td
index 66d6cd471116b..2ac4af9e66aa8 100644
--- a/flang/include/flang/Optimizer/Dialect/FIRAttr.td
+++ b/flang/include/flang/Optimizer/Dialect/FIRAttr.td
@@ -16,7 +16,7 @@
 include "flang/Optimizer/Dialect/FIRDialect.td"
 include "mlir/IR/EnumAttr.td"
 
-class fir_Attr<string name> : AttrDef<fir_Dialect, name>;
+class fir_Attr<string name> : AttrDef<FIROpsDialect, name>;
 
 def FIRnoAttributes  : I32BitEnumAttrCaseNone<"None">;
 def FIRallocatable  : I32BitEnumAttrCaseBit<"allocatable", 0>;
@@ -91,7 +91,7 @@ def fir_CUDADataAttribute : I32EnumAttr<
 }
 
 def fir_CUDADataAttributeAttr :
-    EnumAttr<fir_Dialect, fir_CUDADataAttribute, "cuda"> {
+    EnumAttr<FIROpsDialect, fir_CUDADataAttribute, "cuda"> {
   let assemblyFormat = [{ ```<` $value `>` }];
 }
 
@@ -109,7 +109,7 @@ def fir_CUDAProcAttribute : I32EnumAttr<
 }
 
 def fir_CUDAProcAttributeAttr :
-    EnumAttr<fir_Dialect, fir_CUDAProcAttribute, "cuda_proc"> {
+    EnumAttr<FIROpsDialect, fir_CUDAProcAttribute, "cuda_proc"> {
   let assemblyFormat = [{ ```<` $value `>` }];
 }
 
diff --git a/flang/include/flang/Optimizer/Dialect/FIRDialect.h b/flang/include/flang/Optimizer/Dialect/FIRDialect.h
index 238385505dbff..ed7c98ec82e2d 100644
--- a/flang/include/flang/Optimizer/Dialect/FIRDialect.h
+++ b/flang/include/flang/Optimizer/Dialect/FIRDialect.h
@@ -15,43 +15,14 @@
 
 #include "mlir/IR/Dialect.h"
 
+#include "flang/Optimizer/Dialect/FIRDialect.h.inc"
+
 namespace mlir {
 class IRMapping;
 } // namespace mlir
 
 namespace fir {
 
-/// FIR dialect
-class FIROpsDialect final : public mlir::Dialect {
-public:
-  explicit FIROpsDialect(mlir::MLIRContext *ctx);
-  virtual ~FIROpsDialect();
-
-  static llvm::StringRef getDialectNamespace() { return "fir"; }
-
-  mlir::Type parseType(mlir::DialectAsmParser &parser) const override;
-  void printType(mlir::Type ty, mlir::DialectAsmPrinter &p) const override;
-
-  mlir::Attribute parseAttribute(mlir::DialectAsmParser &parser,
-                                 mlir::Type type) const override;
-  void printAttribute(mlir::Attribute attr,
-                      mlir::DialectAsmPrinter &p) const override;
-
-  /// Return string name of fir.runtime attribute.
-  static constexpr llvm::StringRef getFirRuntimeAttrName() {
-    return "fir.runtime";
-  }
-
-private:
-  // Register the Attributes of this dialect.
-  void registerAttributes();
-  // Register the Types of this dialect.
-  void registerTypes();
-  // Register external interfaces on operations of
-  // this dialect.
-  void registerOpExternalInterfaces();
-};
-
 /// The FIR codegen dialect is a dialect containing a small set of transient
 /// operations used exclusively during code generation.
 class FIRCodeGenDialect final : public mlir::Dialect {
diff --git a/flang/include/flang/Optimizer/Dialect/FIRDialect.td b/flang/include/flang/Optimizer/Dialect/FIRDialect.td
index b366b6d40e4e2..0dfb3eda585ce 100644
--- a/flang/include/flang/Optimizer/Dialect/FIRDialect.td
+++ b/flang/include/flang/Optimizer/Dialect/FIRDialect.td
@@ -21,7 +21,7 @@ include "mlir/Interfaces/InferTypeOpInterface.td"
 include "mlir/Interfaces/LoopLikeInterface.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
 
-def fir_Dialect : Dialect {
+def FIROpsDialect : Dialect {
   let name = "fir";
   let cppNamespace = "::fir";
   let useDefaultTypePrinterParser = 0;
@@ -30,10 +30,33 @@ def fir_Dialect : Dialect {
   let dependentDialects = [
     // Arith dialect provides FastMathFlagsAttr
     // supported by some FIR operations.
-    "arith::ArithDialect",
+    "mlir::arith::ArithDialect",
     // TBAA Tag types
-    "LLVM::LLVMDialect"
+    "mlir::LLVM::LLVMDialect"
   ];
+  let extraClassDeclaration = [{
+  private:
+    // Register the builtin Attributes.
+    void registerAttributes();
+    // Register the builtin Types.
+    void registerTypes();
+    // Register external interfaces on operations of
+    // this dialect.
+    void registerOpExternalInterfaces();
+  public:
+    mlir::Type parseType(mlir::DialectAsmParser &parser) const override;
+    void printType(mlir::Type ty, mlir::DialectAsmPrinter &p) const override;
+ 
+    mlir::Attribute parseAttribute(mlir::DialectAsmParser &parser,
+                                   mlir::Type type) const override;
+    void printAttribute(mlir::Attribute attr,
+                        mlir::DialectAsmPrinter &p) const override;
+
+    // Return string name of fir.runtime attribute.
+    static constexpr llvm::StringRef getFirRuntimeAttrName() {
+      return "fir.runtime";
+    }
+  }];
 }
 
 #endif // FORTRAN_DIALECT_FIR_DIALECT
diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td
index db5e5f4bc682e..65a86d25333b5 100644
--- a/flang/include/flang/Optimizer/Dialect/FIROps.td
+++ b/flang/include/flang/Optimizer/Dialect/FIROps.td
@@ -27,7 +27,7 @@ include "mlir/IR/BuiltinAttributes.td"
 // Base class for FIR operations.
 // All operations automatically get a prefix of "fir.".
 class fir_Op<string mnemonic, list<Trait> traits>
-  : Op<fir_Dialect, mnemonic, traits>;
+  : Op<FIROpsDialect, mnemonic, traits>;
 
 // Base class for FIR operations that take a single argument
 class fir_SimpleOp<string mnemonic, list<Trait> traits>
diff --git a/flang/include/flang/Optimizer/Dialect/FIRTypes.td b/flang/include/flang/Optimizer/Dialect/FIRTypes.td
index 2a2f50720859e..4c6a8064991ab 100644
--- a/flang/include/flang/Optimizer/Dialect/FIRTypes.td
+++ b/flang/include/flang/Optimizer/Dialect/FIRTypes.td
@@ -22,7 +22,7 @@ include "flang/Optimizer/Dialect/FIRDialect.td"
 
 class FIR_Type<string name, string typeMnemonic, list<Trait> traits = [],
                string baseCppClass = "::mlir::Type">
-    : TypeDef<fir_Dialect, name, traits, baseCppClass> {
+    : TypeDef<FIROpsDialect, name, traits, baseCppClass> {
   let mnemonic = typeMnemonic;
 }
 
diff --git a/flang/include/flang/Optimizer/HLFIR/HLFIROps.td b/flang/include/flang/Optimizer/HLFIR/HLFIROps.td
index c82eae154d31a..743a6c98ec1a0 100644
--- a/flang/include/flang/Optimizer/HLFIR/HLFIROps.td
+++ b/flang/include/flang/Optimizer/HLFIR/HLFIROps.td
@@ -1358,7 +1358,9 @@ def hlfir_YieldOp : hlfir_Op<"yield", [Terminator, ParentOneOf<["RegionAssignOp"
   let assemblyFormat = "$entity attr-dict `:` type($entity) custom<YieldOpCleanup>($cleanup)";
 }
 
-def hlfir_ElementalAddrOp : hlfir_Op<"elemental_addr", [Terminator, HasParent<"RegionAssignOp">, RecursiveMemoryEffects, RecursivelySpeculatable, hlfir_ElementalOpInterface]> {
+def hlfir_ElementalAddrOp : hlfir_Op<"elemental_addr", [Terminator, HasParent<"RegionAssignOp">,
+    RecursiveMemoryEffects, RecursivelySpeculatable, hlfir_ElementalOpInterface,
+    AttrSizedOperandSegments]> {
   let summary = "Yield the address of a vector subscripted variable inside an hlfir.region_assign";
   let description = [{
     Special terminator node for the left-hand side region of an hlfir.region_assign
@@ -1398,6 +1400,7 @@ def hlfir_ElementalAddrOp : hlfir_Op<"elemental_addr", [Terminator, HasParent<"R
 
   let arguments = (ins
     fir_ShapeType:$shape,
+    Optional<AnyPolymorphicObject>:$mold,
     Variadic<AnyIntegerType>:$typeparams,
     OptionalAttr<UnitAttr>:$unordered
   );
@@ -1406,11 +1409,15 @@ def hlfir_ElementalAddrOp : hlfir_Op<"elemental_addr", [Terminator, HasParent<"R
                          MaxSizedRegion<1>:$cleanup);
 
   let builders = [
-    OpBuilder<(ins "mlir::Value":$shape, CArg<"bool", "false">:$isUnordered)>
+    OpBuilder<(ins "mlir::Value":$shape,
+          CArg<"mlir::Value", "{}">:$mold,
+      CArg<"mlir::ValueRange", "{}">:$typeparams,
+      CArg<"bool", "false">:$isUnordered)>
   ];
 
   let assemblyFormat = [{
-    $shape (`typeparams` $typeparams^)? (`unordered` $unordered^)?
+    $shape (`mold` $mold^)? (`typeparams` $typeparams^)?
+    (`unordered` $unordered^)?
     attr-dict `:` type(operands) $body
     custom<YieldOpCleanup>($cleanup)}];
 
diff --git a/flang/lib/Evaluate/fold-implementation.h b/flang/lib/Evaluate/fold-implementation.h
index 6b3c9416724cb..9dd8c3843465d 100644
--- a/flang/lib/Evaluate/fold-implementation.h
+++ b/flang/lib/Evaluate/fold-implementation.h
@@ -39,6 +39,10 @@
 #include <type_traits>
 #include <variant>
 
+// Some environments, viz. glibc 2.17, allow the macro HUGE
+// to leak out of <math.h>.
+#undef HUGE
+
 namespace Fortran::evaluate {
 
 // Utilities
diff --git a/flang/lib/Lower/ConvertExprToHLFIR.cpp b/flang/lib/Lower/ConvertExprToHLFIR.cpp
index 731c5072c45c5..c5bfbdf6b8c11 100644
--- a/flang/lib/Lower/ConvertExprToHLFIR.cpp
+++ b/flang/lib/Lower/ConvertExprToHLFIR.cpp
@@ -761,9 +761,17 @@ class HlfirDesignatorBuilder {
     // of the whole designator (not the ones of the vector subscripted part).
     // These are not yet known and will be added when finalizing the designator
     // lowering.
-    auto elementalAddrOp =
-        builder.create<hlfir::ElementalAddrOp>(loc, shape,
-                                               /*isUnordered=*/true);
+    // The resulting designator may be polymorphic, in which case the resulting
+    // type is the base of the vector subscripted part because
+    // allocatable/pointer components cannot be referenced after a vector
+    // subscripted part. Set the mold to the current base. It will be erased if
+    // the resulting designator is not polymorphic.
+    assert(partInfo.base.has_value() &&
+           "vector subscripted part must have a base");
+    mlir::Value mold = *partInfo.base;
+    auto elementalAddrOp = builder.create<hlfir::ElementalAddrOp>(
+        loc, shape, mold, mlir::ValueRange{},
+        /*isUnordered=*/true);
     setVectorSubscriptElementAddrOp(elementalAddrOp);
     builder.setInsertionPointToEnd(&elementalAddrOp.getBody().front());
     mlir::Region::BlockArgListType indices = elementalAddrOp.getIndices();
@@ -804,15 +812,8 @@ class HlfirDesignatorBuilder {
                              hlfir::EntityWithAttributes elementAddr) {
     fir::FirOpBuilder &builder = getBuilder();
     builder.setInsertionPointToEnd(&elementalAddrOp.getBody().front());
-    // For polymorphic entities, it will be needed to add a mold on the
-    // hlfir.elemental so that we are able to create temporary storage
-    // for it using the dynamic type. It seems that a reference to the mold
-    // entity can be created by evaluating the hlfir.elemental_addr
-    // for a single index. The evaluation should be legal as long as
-    // the hlfir.elemental_addr has no side effects, otherwise,
-    // it is not clear how to get the mold reference.
-    if (elementAddr.isPolymorphic())
-      TODO(loc, "vector subscripted polymorphic entity in HLFIR");
+    if (!elementAddr.isPolymorphic())
+      elementalAddrOp.getMoldMutable().clear();
     builder.create<hlfir::YieldOp>(loc, elementAddr);
     builder.setInsertionPointAfter(elementalAddrOp);
   }
@@ -929,6 +930,8 @@ HlfirDesignatorBuilder::convertVectorSubscriptedExprToElementalAddr(
   hlfir::genLengthParameters(loc, builder, elementAddrEntity, lengths);
   if (!lengths.empty())
     elementalAddrOp.getTypeparamsMutable().assign(lengths);
+  if (!elementAddrEntity.isPolymorphic())
+    elementalAddrOp.getMoldMutable().clear();
   // Create the hlfir.yield terminator inside the hlfir.elemental_body.
   builder.setInsertionPointToEnd(&elementalAddrOp.getBody().front());
   builder.create<hlfir::YieldOp>(loc, elementAddrEntity);
diff --git a/flang/lib/Optimizer/Builder/HLFIRTools.cpp b/flang/lib/Optimizer/Builder/HLFIRTools.cpp
index 0e0b14e8d6909..c7a550814e1d5 100644
--- a/flang/lib/Optimizer/Builder/HLFIRTools.cpp
+++ b/flang/lib/Optimizer/Builder/HLFIRTools.cpp
@@ -1036,9 +1036,9 @@ hlfir::cloneToElementalOp(mlir::Location loc, fir::FirOpBuilder &builder,
     return hlfir::loadTrivialScalar(l, b, newAddr);
   };
   mlir::Type elementType = scalarAddress.getFortranElementType();
-  return hlfir::genElementalOp(loc, builder, elementType,
-                               elementalAddrOp.getShape(), typeParams,
-                               genKernel, !elementalAddrOp.isOrdered());
+  return hlfir::genElementalOp(
+      loc, builder, elementType, elementalAddrOp.getShape(), typeParams,
+      genKernel, !elementalAddrOp.isOrdered(), elementalAddrOp.getMold());
 }
 
 bool hlfir::elementalOpMustProduceTemp(hlfir::ElementalOp elemental) {
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index 2f7ace658e475..ca5ab6fcea342 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -567,6 +567,10 @@ static constexpr IntrinsicHandler handlers[]{
        {"dim", asAddr, handleDynamicOptional},
        {"kind", asValue}}},
      /*isElemental=*/false},
+    {"sizeof",
+     &I::genSizeOf,
+     {{{"a", asBox}}},
+     /*isElemental=*/false},
     {"sleep", &I::genSleep, {{{"seconds", asValue}}}, /*isElemental=*/false},
     {"spacing", &I::genSpacing},
     {"spread",
@@ -5946,6 +5950,20 @@ IntrinsicLibrary::genSize(mlir::Type resultType,
       .getResults()[0];
 }
 
+// SIZEOF
+fir::ExtendedValue
+IntrinsicLibrary::genSizeOf(mlir::Type resultType,
+                            llvm::ArrayRef<fir::ExtendedValue> args) {
+  assert(args.size() == 1);
+  mlir::Value box = fir::getBase(args[0]);
+  mlir::Value eleSize = builder.create<fir::BoxEleSizeOp>(loc, resultType, box);
+  if (!fir::isArray(args[0]))
+    return eleSize;
+  mlir::Value arraySize = builder.createConvert(
+      loc, resultType, fir::runtime::genSize(builder, loc, box));
+  return builder.create<mlir::arith::MulIOp>(loc, eleSize, arraySize);
+}
+
 // TAND
 mlir::Value IntrinsicLibrary::genTand(mlir::Type resultType,
                                       llvm::ArrayRef<mlir::Value> args) {
diff --git a/flang/lib/Optimizer/Dialect/FIRDialect.cpp b/flang/lib/Optimizer/Dialect/FIRDialect.cpp
index 850b6120b2a00..4d1e8cd1405af 100644
--- a/flang/lib/Optimizer/Dialect/FIRDialect.cpp
+++ b/flang/lib/Optimizer/Dialect/FIRDialect.cpp
@@ -18,6 +18,8 @@
 #include "mlir/Target/LLVMIR/ModuleTranslation.h"
 #include "mlir/Transforms/InliningUtils.h"
 
+#include "flang/Optimizer/Dialect/FIRDialect.cpp.inc"
+
 using namespace fir;
 
 namespace {
@@ -58,9 +60,7 @@ struct FIRInlinerInterface : public mlir::DialectInlinerInterface {
 };
 } // namespace
 
-fir::FIROpsDialect::FIROpsDialect(mlir::MLIRContext *ctx)
-    : mlir::Dialect("fir", ctx, mlir::TypeID::get<FIROpsDialect>()) {
-  getContext()->loadDialect<mlir::LLVM::LLVMDialect>();
+void fir::FIROpsDialect::initialize() {
   registerTypes();
   registerAttributes();
   addOperations<
@@ -94,11 +94,6 @@ void fir::addFIRToLLVMIRExtension(mlir::DialectRegistry &registry) {
       });
 }
 
-// anchor the class vtable to this compilation unit
-fir::FIROpsDialect::~FIROpsDialect() {
-  // do nothing
-}
-
 mlir::Type fir::FIROpsDialect::parseType(mlir::DialectAsmParser &parser) const {
   return parseFirType(const_cast<FIROpsDialect *>(this), parser);
 }
diff --git a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp
index 3568fe202caf1..8bad4e445082d 100644
--- a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp
+++ b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp
@@ -1406,33 +1406,45 @@ void hlfir::AsExprOp::getEffects(
 // ElementalOp
 //===----------------------------------------------------------------------===//
 
-void hlfir::ElementalOp::build(mlir::OpBuilder &builder,
-                               mlir::OperationState &odsState,
-                               mlir::Type resultType, mlir::Value shape,
-                               mlir::Value mold, mlir::ValueRange typeparams,
-                               bool isUnordered) {
+/// Common builder for ElementalOp and ElementalAddrOp to add the arguments and
+/// create the elemental body. Result and clean-up body must be handled in
+/// specific builders.
+template <typename Op>
+static void buildElemental(mlir::OpBuilder &builder,
+                           mlir::OperationState &odsState, mlir::Value shape,
+                           mlir::Value mold, mlir::ValueRange typeparams,
+                           bool isUnordered) {
   odsState.addOperands(shape);
   if (mold)
     odsState.addOperands(mold);
   odsState.addOperands(typeparams);
-  odsState.addTypes(resultType);
   odsState.addAttribute(
-      getOperandSegmentSizesAttrName(odsState.name),
+      Op::getOperandSegmentSizesAttrName(odsState.name),
       builder.getDenseI32ArrayAttr({/*shape=*/1, (mold ? 1 : 0),
                                     static_cast<int32_t>(typeparams.size())}));
   if (isUnordered)
-    odsState.addAttribute(getUnorderedAttrName(odsState.name),
+    odsState.addAttribute(Op::getUnorderedAttrName(odsState.name),
                           isUnordered ? builder.getUnitAttr() : nullptr);
   mlir::Region *bodyRegion = odsState.addRegion();
   bodyRegion->push_back(new mlir::Block{});
-  if (auto exprType = resultType.dyn_cast<hlfir::ExprType>()) {
-    unsigned dim = exprType.getRank();
+  if (auto shapeType = shape.getType().dyn_cast<fir::ShapeType>()) {
+    unsigned dim = shapeType.getRank();
     mlir::Type indexType = builder.getIndexType();
     for (unsigned d = 0; d < dim; ++d)
       bodyRegion->front().addArgument(indexType, odsState.location);
   }
 }
 
+void hlfir::ElementalOp::build(mlir::OpBuilder &builder,
+                               mlir::OperationState &odsState,
+                               mlir::Type resultType, mlir::Value shape,
+                               mlir::Value mold, mlir::ValueRange typeparams,
+                               bool isUnordered) {
+  odsState.addTypes(resultType);
+  buildElemental<hlfir::ElementalOp>(builder, odsState, shape, mold, typeparams,
+                                     isUnordered);
+}
+
 mlir::Value hlfir::ElementalOp::getElementEntity() {
   return mlir::cast<hlfir::YieldElementOp>(getBody()->back()).getElementValue();
 }
@@ -1681,19 +1693,11 @@ static void printYieldOpCleanup(mlir::OpAsmPrinter &p, YieldOp yieldOp,
 
 void hlfir::ElementalAddrOp::build(mlir::OpBuilder &builder,
                                    mlir::OperationState &odsState,
-                                   mlir::Value shape, bool isUnordered) {
-  odsState.addOperands(shape);
-  if (isUnordered)
-    odsState.addAttribute(getUnorderedAttrName(odsState.name),
-                          isUnordered ? builder.getUnitAttr() : nullptr);
-  mlir::Region *bodyRegion = odsState.addRegion();
-  bodyRegion->push_back(new mlir::Block{});
-  if (auto shapeType = shape.getType().dyn_cast<fir::ShapeType>()) {
-    unsigned dim = shapeType.getRank();
-    mlir::Type indexType = builder.getIndexType();
-    for (unsigned d = 0; d < dim; ++d)
-      bodyRegion->front().addArgument(indexType, odsState.location);
-  }
+                                   mlir::Value shape, mlir::Value mold,
+                                   mlir::ValueRange typeparams,
+                                   bool isUnordered) {
+  buildElemental<hlfir::ElementalAddrOp>(builder, odsState, shape, mold,
+                                         typeparams, isUnordered);
   // Push cleanUp region.
   odsState.addRegion();
 }
diff --git a/flang/lib/Parser/token-sequence.cpp b/flang/lib/Parser/token-sequence.cpp
index c5a630c471d16..799d13a423660 100644
--- a/flang/lib/Parser/token-sequence.cpp
+++ b/flang/lib/Parser/token-sequence.cpp
@@ -136,7 +136,10 @@ void TokenSequence::Put(
 }
 
 void TokenSequence::Put(const CharBlock &t, Provenance provenance) {
-  Put(&t[0], t.size(), provenance);
+  // Avoid t[0] if t is empty: it would create a reference to nullptr,
+  // which is UB.
+  const char *addr{t.size() ? &t[0] : nullptr};
+  Put(addr, t.size(), provenance);
 }
 
 void TokenSequence::Put(const std::string &s, Provenance provenance) {
diff --git a/flang/runtime/command.cpp b/flang/runtime/command.cpp
index 7c44890545bd3..fabfe601688bb 100644
--- a/flang/runtime/command.cpp
+++ b/flang/runtime/command.cpp
@@ -196,11 +196,11 @@ std::int32_t RTNAME(GetCommand)(const Descriptor *value,
 }
 
 static std::size_t LengthWithoutTrailingSpaces(const Descriptor &d) {
-  std::size_t s{d.ElementBytes() - 1};
-  while (*d.OffsetElement(s) == ' ') {
+  std::size_t s{d.ElementBytes()}; // This can be 0.
+  while (s != 0 && *d.OffsetElement(s - 1) == ' ') {
     --s;
   }
-  return s + 1;
+  return s;
 }
 
 std::int32_t RTNAME(GetEnvVariable)(const Descriptor &name,
diff --git a/flang/test/HLFIR/element-addr.fir b/flang/test/HLFIR/element-addr.fir
index 73946f8b40e3d..c3c48edd9b563 100644
--- a/flang/test/HLFIR/element-addr.fir
+++ b/flang/test/HLFIR/element-addr.fir
@@ -114,3 +114,45 @@ func.func @unordered() {
 // CHECK:           }
 // CHECK:           return
 // CHECK:         }
+
+// "X(VECTOR) = Y" with polymorphic X and Y and user defined assignment.
+func.func @test_mold(%x: !fir.class<!fir.array<?x!fir.type<t>>>, %y: !fir.class<!fir.array<?x!fir.type<t>>>, %vector: !fir.box<!fir.array<?xi64>>) {
+  hlfir.region_assign {
+    hlfir.yield %y : !fir.class<!fir.array<?x!fir.type<t>>>
+  } to {
+    %c0 = arith.constant 0 : index
+    %0:3 = fir.box_dims %vector, %c0 : (!fir.box<!fir.array<?xi64>>, index) -> (index, index, index)
+    %1 = fir.shape %0#1 : (index) -> !fir.shape<1>
+    hlfir.elemental_addr %1 mold %x unordered : !fir.shape<1>, !fir.class<!fir.array<?x!fir.type<t>>> {
+    ^bb0(%arg3: index):
+      %2 = hlfir.designate %vector (%arg3)  : (!fir.box<!fir.array<?xi64>>, index) -> !fir.ref<i64>
+      %3 = fir.load %2 : !fir.ref<i64>
+      %4 = hlfir.designate %x (%3)  : (!fir.class<!fir.array<?x!fir.type<t>>>, i64) -> !fir.class<!fir.type<t>>
+      hlfir.yield %4 : !fir.class<!fir.type<t>>
+    }
+  } user_defined_assign  (%arg3: !fir.class<!fir.type<t>>) to (%arg4: !fir.class<!fir.type<t>>) {
+    fir.call @user_def_assign(%arg4, %arg3) : (!fir.class<!fir.type<t>>, !fir.class<!fir.type<t>>) -> ()
+  }
+  return
+}
+func.func private @user_def_assign(!fir.class<!fir.type<t>>, !fir.class<!fir.type<t>>)
+// CHECK-LABEL: func.func @test_mold(
+// CHECK-SAME:                       %[[VAL_0:[^:]*]]: !fir.class<!fir.array<?x!fir.type<t>>>,
+// CHECK-SAME:                       %[[VAL_1:.*]]: !fir.class<!fir.array<?x!fir.type<t>>>,
+// CHECK-SAME:                       %[[VAL_2:.*]]: !fir.box<!fir.array<?xi64>>) {
+// CHECK:         hlfir.region_assign {
+// CHECK:           hlfir.yield %[[VAL_1]] : !fir.class<!fir.array<?x!fir.type<t>>>
+// CHECK:         } to {
+// CHECK:           %[[VAL_3:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_4:.*]]:3 = fir.box_dims %[[VAL_2]], %[[VAL_3]] : (!fir.box<!fir.array<?xi64>>, index) -> (index, index, index)
+// CHECK:           %[[VAL_5:.*]] = fir.shape %[[VAL_4]]#1 : (index) -> !fir.shape<1>
+// CHECK:           hlfir.elemental_addr %[[VAL_5]] mold %[[VAL_0]] unordered : !fir.shape<1>, !fir.class<!fir.array<?x!fir.type<t>>> {
+// CHECK:           ^bb0(%[[VAL_6:.*]]: index):
+// CHECK:             %[[VAL_7:.*]] = hlfir.designate %[[VAL_2]] (%[[VAL_6]])  : (!fir.box<!fir.array<?xi64>>, index) -> !fir.ref<i64>
+// CHECK:             %[[VAL_8:.*]] = fir.load %[[VAL_7]] : !fir.ref<i64>
+// CHECK:             %[[VAL_9:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_8]])  : (!fir.class<!fir.array<?x!fir.type<t>>>, i64) -> !fir.class<!fir.type<t>>
+// CHECK:             hlfir.yield %[[VAL_9]] : !fir.class<!fir.type<t>>
+// CHECK:           }
+// CHECK:         } user_defined_assign  (%[[VAL_10:.*]]: !fir.class<!fir.type<t>>) to (%[[VAL_11:.*]]: !fir.class<!fir.type<t>>) {
+// CHECK:           fir.call @user_def_assign(%[[VAL_11]], %[[VAL_10]]) : (!fir.class<!fir.type<t>>, !fir.class<!fir.type<t>>) -> ()
+// CHECK:         }
diff --git a/flang/test/Lower/HLFIR/vector-subscript-as-value.f90 b/flang/test/Lower/HLFIR/vector-subscript-as-value.f90
index 2f463cfaa8b07..d4026a37720f7 100644
--- a/flang/test/Lower/HLFIR/vector-subscript-as-value.f90
+++ b/flang/test/Lower/HLFIR/vector-subscript-as-value.f90
@@ -1,6 +1,6 @@
 ! Test lowering of vector subscript designators outside of the
 ! assignment left-and side and input IO context.
-! RUN: bbc -emit-hlfir -o - -I nw %s 2>&1 | FileCheck %s
+! RUN: bbc -emit-hlfir -o - -I nw %s --polymorphic-type 2>&1 | FileCheck %s
 
 subroutine foo(x, y)
   integer :: x(100)
@@ -182,3 +182,37 @@ subroutine substring(c, vector, i, j)
 ! CHECK:    %[[VAL_27:.*]] = hlfir.designate %[[VAL_4]]#0 (%[[VAL_26]]) substr %[[VAL_15]], %[[VAL_16]]  typeparams %[[VAL_22]] : (!fir.box<!fir.array<?x!fir.char<1,?>>>, i64, index, index, index) -> !fir.boxchar<1>
 ! CHECK:    hlfir.yield_element %[[VAL_27]] : !fir.boxchar<1>
 ! CHECK:  }
+
+subroutine test_passing_subscripted_poly(x, vector)
+  interface
+    subroutine do_something(x)
+      class(*) :: x(:)
+    end subroutine
+  end interface
+  class(*) :: x(:, :)
+  integer(8) :: vector(:)
+  call do_something(x(314, vector))
+end subroutine
+! CHECK-LABEL:   func.func @_QPtest_passing_subscripted_poly(
+! CHECK-SAME:                                                %[[VAL_0:.*]]: !fir.class<!fir.array<?x?xnone>>
+! CHECK-SAME:                                                %[[VAL_1:.*]]: !fir.box<!fir.array<?xi64>>
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFtest_passing_subscripted_polyEvector"} : (!fir.box<!fir.array<?xi64>>) -> (!fir.box<!fir.array<?xi64>>, !fir.box<!fir.array<?xi64>>)
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFtest_passing_subscripted_polyEx"} : (!fir.class<!fir.array<?x?xnone>>) -> (!fir.class<!fir.array<?x?xnone>>, !fir.class<!fir.array<?x?xnone>>)
+! CHECK:           %[[VAL_4:.*]] = arith.constant 314 : index
+! CHECK:           %[[VAL_5:.*]] = arith.constant 0 : index
+! CHECK:           %[[VAL_6:.*]]:3 = fir.box_dims %[[VAL_2]]#0, %[[VAL_5]] : (!fir.box<!fir.array<?xi64>>, index) -> (index, index, index)
+! CHECK:           %[[VAL_7:.*]] = fir.shape %[[VAL_6]]#1 : (index) -> !fir.shape<1>
+! CHECK:           %[[VAL_8:.*]] = hlfir.elemental %[[VAL_7]] mold %[[VAL_3]]#0 unordered : (!fir.shape<1>, !fir.class<!fir.array<?x?xnone>>) -> !hlfir.expr<?xnone?> {
+! CHECK:           ^bb0(%[[VAL_9:.*]]: index):
+! CHECK:             %[[VAL_10:.*]] = hlfir.designate %[[VAL_2]]#0 (%[[VAL_9]])  : (!fir.box<!fir.array<?xi64>>, index) -> !fir.ref<i64>
+! CHECK:             %[[VAL_11:.*]] = fir.load %[[VAL_10]] : !fir.ref<i64>
+! CHECK:             %[[VAL_12:.*]] = hlfir.designate %[[VAL_3]]#0 (%[[VAL_4]], %[[VAL_11]])  : (!fir.class<!fir.array<?x?xnone>>, index, i64) -> !fir.class<none>
+! CHECK:             hlfir.yield_element %[[VAL_12]] : !fir.class<none>
+! CHECK:           }
+! CHECK:           %[[VAL_13:.*]]:3 = hlfir.associate %[[VAL_8]](%[[VAL_7]]) {adapt.valuebyref} : (!hlfir.expr<?xnone?>, !fir.shape<1>) -> (!fir.class<!fir.heap<!fir.array<?xnone>>>, !fir.class<!fir.heap<!fir.array<?xnone>>>, i1)
+! CHECK:           %[[VAL_14:.*]] = fir.rebox %[[VAL_13]]#0 : (!fir.class<!fir.heap<!fir.array<?xnone>>>) -> !fir.class<!fir.array<?xnone>>
+! CHECK:           fir.call @_QPdo_something(%[[VAL_14]]) fastmath<contract> : (!fir.class<!fir.array<?xnone>>) -> ()
+! CHECK:           hlfir.end_associate %[[VAL_13]]#0, %[[VAL_13]]#2 : !fir.class<!fir.heap<!fir.array<?xnone>>>, i1
+! CHECK:           hlfir.destroy %[[VAL_8]] : !hlfir.expr<?xnone?>
+! CHECK:           return
+! CHECK:         }
diff --git a/flang/test/Lower/Intrinsics/sizeof.f90 b/flang/test/Lower/Intrinsics/sizeof.f90
new file mode 100644
index 0000000000000..959ca1692b514
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/sizeof.f90
@@ -0,0 +1,23 @@
+! Test SIZEOF lowering for polymorphic entities.
+! RUN: bbc -emit-hlfir --polymorphic-type -o - %s | FileCheck %s
+
+integer(8) function test1(x)
+  class(*) :: x
+  test1 = sizeof(x)
+end function
+! CHECK-LABEL:   func.func @_QPtest1(
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFtest1Ex"} : (!fir.class<none>) -> (!fir.class<none>, !fir.class<none>)
+! CHECK:           %[[VAL_4:.*]] = fir.box_elesize %[[VAL_3]]#1 : (!fir.class<none>) -> i64
+! CHECK:           hlfir.assign %[[VAL_4]] to %{{.*}} : i64, !fir.ref<i64>
+
+integer(8) function test2(x)
+  class(*) :: x(:, :)
+  test2 = sizeof(x)
+end function
+! CHECK-LABEL:   func.func @_QPtest2(
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFtest2Ex"} : (!fir.class<!fir.array<?x?xnone>>) -> (!fir.class<!fir.array<?x?xnone>>, !fir.class<!fir.array<?x?xnone>>)
+! CHECK:           %[[VAL_4:.*]] = fir.box_elesize %[[VAL_3]]#1 : (!fir.class<!fir.array<?x?xnone>>) -> i64
+! CHECK:           %[[VAL_7:.*]] = fir.convert %[[VAL_3]]#1 : (!fir.class<!fir.array<?x?xnone>>) -> !fir.box<none>
+! CHECK:           %[[VAL_9:.*]] = fir.call @_FortranASize(%[[VAL_7]], %{{.*}}, %{{.*}}) fastmath<contract> : (!fir.box<none>, !fir.ref<i8>, i32) -> i64
+! CHECK:           %[[VAL_10:.*]] = arith.muli %[[VAL_4]], %[[VAL_9]] : i64
+! CHECK:           hlfir.assign %[[VAL_10]] to %{{.*}} : i64, !fir.ref<i64>
diff --git a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
index 893a807b5b61c..5bc0898298ce3 100644
--- a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
+++ b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
@@ -108,7 +108,7 @@ function(_get_common_compile_options output_var flags)
   set(${output_var} ${compile_options} PARENT_SCOPE)
 endfunction()
 
-function(_get_common_test_compile_options output_var flags)
+function(_get_common_test_compile_options output_var c_test flags)
   _get_compile_options_from_flags(compile_flags ${flags})
 
   set(compile_options ${LIBC_COMPILE_OPTIONS_DEFAULT} ${compile_flags})
@@ -122,7 +122,9 @@ function(_get_common_test_compile_options output_var flags)
       list(APPEND compile_options "-fno-exceptions")
       list(APPEND compile_options "-fno-unwind-tables")
       list(APPEND compile_options "-fno-asynchronous-unwind-tables")
-      list(APPEND compile_options "-fno-rtti")
+      if(NOT ${c_test})
+        list(APPEND compile_options "-fno-rtti")
+      endif()
     endif()
 
     if(LIBC_COMPILER_HAS_FIXED_POINT)
diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake
index 0bdd72091fe85..eb6be91b55e26 100644
--- a/libc/cmake/modules/LLVMLibCTestRules.cmake
+++ b/libc/cmake/modules/LLVMLibCTestRules.cmake
@@ -111,7 +111,7 @@ function(create_libc_unittest fq_target_name)
 
   cmake_parse_arguments(
     "LIBC_UNITTEST"
-    "NO_RUN_POSTBUILD" # Optional arguments
+    "NO_RUN_POSTBUILD;C_TEST" # Optional arguments
     "SUITE;CXX_STANDARD" # Single value arguments
     "SRCS;HDRS;DEPENDS;COMPILE_OPTIONS;LINK_LIBRARIES;FLAGS" # Multi-value arguments
     ${ARGN}
@@ -126,11 +126,14 @@ function(create_libc_unittest fq_target_name)
   endif()
 
   get_fq_deps_list(fq_deps_list ${LIBC_UNITTEST_DEPENDS})
-  list(APPEND fq_deps_list libc.src.__support.StringUtil.error_to_string
-                           libc.test.UnitTest.ErrnoSetterMatcher)
+  if(NOT LIBC_UNITTEST_C_TEST)
+    list(APPEND fq_deps_list libc.src.__support.StringUtil.error_to_string
+                             libc.test.UnitTest.ErrnoSetterMatcher)
+  endif()
   list(REMOVE_DUPLICATES fq_deps_list)
 
-  _get_common_test_compile_options(compile_options "${LIBC_UNITTEST_FLAGS}")
+  _get_common_test_compile_options(compile_options "${LIBC_UNITTEST_C_TEST}"
+                                   "${LIBC_UNITTEST_FLAGS}")
   list(APPEND compile_options ${LIBC_UNITTEST_COMPILE_OPTIONS})
 
   if(SHOW_INTERMEDIATE_OBJECTS)
@@ -214,7 +217,9 @@ function(create_libc_unittest fq_target_name)
   )
 
   # LibcUnitTest should not depend on anything in LINK_LIBRARIES.
-  list(APPEND link_libraries LibcDeathTestExecutors.unit LibcTest.unit)
+  if(NOT LIBC_UNITTEST_C_TEST)
+    list(APPEND link_libraries LibcDeathTestExecutors.unit LibcTest.unit)
+  endif()
 
   target_link_libraries(${fq_build_target_name} PRIVATE ${link_libraries})
 
diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt
index b447b5dfe0989..1656973cb27c8 100644
--- a/libc/config/linux/aarch64/entrypoints.txt
+++ b/libc/config/linux/aarch64/entrypoints.txt
@@ -335,6 +335,7 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.fminl
     libc.src.math.fmod
     libc.src.math.fmodf
+    libc.src.math.fmodl
     libc.src.math.frexp
     libc.src.math.frexpf
     libc.src.math.frexpl
@@ -426,6 +427,7 @@ if(LIBC_TYPES_HAS_FLOAT128)
     libc.src.math.floorf128
     libc.src.math.fmaxf128
     libc.src.math.fminf128
+    libc.src.math.fmodf128
     libc.src.math.frexpf128
     libc.src.math.ilogbf128
     libc.src.math.ldexpf128
diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt
index 5175b14adf2e7..07d1acfcfe079 100644
--- a/libc/config/linux/riscv/entrypoints.txt
+++ b/libc/config/linux/riscv/entrypoints.txt
@@ -343,6 +343,7 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.fmaxl
     libc.src.math.fmod
     libc.src.math.fmodf
+    libc.src.math.fmodl
     libc.src.math.frexp
     libc.src.math.frexpf
     libc.src.math.frexpl
@@ -434,6 +435,7 @@ if(LIBC_TYPES_HAS_FLOAT128)
     libc.src.math.floorf128
     libc.src.math.fmaxf128
     libc.src.math.fminf128
+    libc.src.math.fmodf128
     libc.src.math.frexpf128
     libc.src.math.ilogbf128
     libc.src.math.ldexpf128
diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index b8bec14a3d2a6..e0324061a9c78 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -376,6 +376,7 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.fmaxl
     libc.src.math.fmod
     libc.src.math.fmodf
+    libc.src.math.fmodl
     libc.src.math.frexp
     libc.src.math.frexpf
     libc.src.math.frexpl
@@ -469,6 +470,7 @@ if(LIBC_TYPES_HAS_FLOAT128)
     libc.src.math.floorf128
     libc.src.math.fmaxf128
     libc.src.math.fminf128
+    libc.src.math.fmodf128
     libc.src.math.frexpf128
     libc.src.math.ilogbf128
     libc.src.math.ldexpf128
diff --git a/libc/config/windows/entrypoints.txt b/libc/config/windows/entrypoints.txt
index 1c9ed7bbcfed6..d6227a427afe2 100644
--- a/libc/config/windows/entrypoints.txt
+++ b/libc/config/windows/entrypoints.txt
@@ -155,6 +155,7 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.fmaxl
     libc.src.math.fmod
     libc.src.math.fmodf
+    libc.src.math.fmodl
     libc.src.math.frexp
     libc.src.math.frexpf
     libc.src.math.frexpl
diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst
index b22ed5127c179..6984b785125f1 100644
--- a/libc/docs/math/index.rst
+++ b/libc/docs/math/index.rst
@@ -169,7 +169,9 @@ Basic Operations
 +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
 | fmodf        | |check| | |check| | |check| | |check| | |check| |         |         | |check| | |check| | |check| |         |         |
 +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
-| fmodl        |         |         |         |         |         |         |         |         |         |         |         |         |
+| fmodl        | |check| | |check| |         | |check| | |check| |         |         | |check| |         |         |         |         |
++--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
+| fmodf128     | |check| | |check| |         | |check| |         |         |         |         |         |         |         |         |
 +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
 | frexp        | |check| | |check| | |check| | |check| | |check| |         |         | |check| | |check| | |check| |         |         |
 +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td
index d91f5c1f72334..1f14fe758130e 100644
--- a/libc/spec/stdc.td
+++ b/libc/spec/stdc.td
@@ -405,8 +405,9 @@ def StdC : StandardSpec<"stdc"> {
           FunctionSpec<"fmaf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatType>, ArgSpec<FloatType>]>,
 
           FunctionSpec<"fmod", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>]>,
-
           FunctionSpec<"fmodf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatType>]>,
+          FunctionSpec<"fmodl", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>, ArgSpec<LongDoubleType>]>,
+          GuardedFunctionSpec<"fmodf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>, ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT128">,
 
           FunctionSpec<"frexp", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<IntPtr>]>,
           FunctionSpec<"frexpf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<IntPtr>]>,
diff --git a/libc/src/__support/FPUtil/FPBits.h b/libc/src/__support/FPUtil/FPBits.h
index 7b3882dde1b72..b06b3f7b73959 100644
--- a/libc/src/__support/FPUtil/FPBits.h
+++ b/libc/src/__support/FPUtil/FPBits.h
@@ -640,6 +640,7 @@ struct FPRepImpl : public FPRepSem<fp_type, RetT> {
   using UP::EXP_MASK;
   using UP::FRACTION_MASK;
   using UP::SIG_LEN;
+  using UP::SIG_MASK;
   using UP::SIGN_MASK;
   LIBC_INLINE_VAR static constexpr int MAX_BIASED_EXPONENT =
       (1 << UP::EXP_LEN) - 1;
@@ -729,6 +730,9 @@ struct FPRepImpl : public FPRepSem<fp_type, RetT> {
     bits = UP::merge(bits, mantVal, FRACTION_MASK);
   }
 
+  LIBC_INLINE constexpr void set_significand(StorageType sigVal) {
+    bits = UP::merge(bits, sigVal, SIG_MASK);
+  }
   // Unsafe function to create a floating point representation.
   // It simply packs the sign, biased exponent and mantissa values without
   // checking bound nor normalization.
@@ -755,20 +759,19 @@ struct FPRepImpl : public FPRepSem<fp_type, RetT> {
   //   4) "number" zero value is not processed correctly.
   //   5) Number is unsigned, so the result can be only positive.
   LIBC_INLINE static constexpr RetT make_value(StorageType number, int ep) {
-    static_assert(fp_type != FPType::X86_Binary80,
-                  "This function is not tested for X86 Extended Precision");
-    FPRepImpl result;
-    // offset: +1 for sign, but -1 for implicit first bit
-    int lz = cpp::countl_zero(number) - UP::EXP_LEN;
+    FPRepImpl result(0);
+    int lz =
+        UP::FRACTION_LEN + 1 - (UP::STORAGE_LEN - cpp::countl_zero(number));
+
     number <<= lz;
     ep -= lz;
 
     if (LIBC_LIKELY(ep >= 0)) {
       // Implicit number bit will be removed by mask
-      result.set_mantissa(number);
+      result.set_significand(number);
       result.set_biased_exponent(ep + 1);
     } else {
-      result.set_mantissa(number >> -ep);
+      result.set_significand(number >> -ep);
     }
     return RetT(result.uintval());
   }
diff --git a/libc/src/__support/FPUtil/generic/FMod.h b/libc/src/__support/FPUtil/generic/FMod.h
index 2d31290bc4bc2..24fb264b779b7 100644
--- a/libc/src/__support/FPUtil/generic/FMod.h
+++ b/libc/src/__support/FPUtil/generic/FMod.h
@@ -117,63 +117,9 @@ namespace generic {
 // be implemented in another handler.
 // Signaling NaN converted to quiet NaN with FE_INVALID exception.
 //    https://www.open-std.org/JTC1/SC22/WG14/www/docs/n1011.htm
-template <typename T> struct FModExceptionalInputHandler {
-
-  static_assert(cpp::is_floating_point_v<T>,
-                "FModCStandardWrapper instantiated with invalid type.");
-
-  LIBC_INLINE static bool pre_check(T x, T y, T &out) {
-    using FPB = fputil::FPBits<T>;
-    const T quiet_nan = FPB::quiet_nan().get_val();
-    FPB sx(x), sy(y);
-    if (LIBC_LIKELY(!sy.is_zero() && !sy.is_inf_or_nan() &&
-                    !sx.is_inf_or_nan())) {
-      return false;
-    }
-
-    if (sx.is_nan() || sy.is_nan()) {
-      if ((sx.is_nan() && !sx.is_quiet_nan()) ||
-          (sy.is_nan() && !sy.is_quiet_nan()))
-        fputil::raise_except_if_required(FE_INVALID);
-      out = quiet_nan;
-      return true;
-    }
-
-    if (sx.is_inf() || sy.is_zero()) {
-      fputil::raise_except_if_required(FE_INVALID);
-      fputil::set_errno_if_required(EDOM);
-      out = quiet_nan;
-      return true;
-    }
-
-    if (sy.is_inf()) {
-      out = x;
-      return true;
-    }
-
-    // case where x == 0
-    out = x;
-    return true;
-  }
-};
-
-template <typename T> struct FModFastMathWrapper {
-
-  static_assert(cpp::is_floating_point_v<T>,
-                "FModFastMathWrapper instantiated with invalid type.");
-
-  static bool pre_check(T, T, T &) { return false; }
-};
-
-template <typename T> class FModDivisionSimpleHelper {
-private:
-  using StorageType = typename FPBits<T>::StorageType;
-
-public:
-  LIBC_INLINE constexpr static StorageType execute(int exp_diff,
-                                                   int sides_zeroes_count,
-                                                   StorageType m_x,
-                                                   StorageType m_y) {
+template <typename T> struct FModDivisionSimpleHelper {
+  LIBC_INLINE constexpr static T execute(int exp_diff, int sides_zeroes_count,
+                                         T m_x, T m_y) {
     while (exp_diff > sides_zeroes_count) {
       exp_diff -= sides_zeroes_count;
       m_x <<= sides_zeroes_count;
@@ -185,28 +131,21 @@ template <typename T> class FModDivisionSimpleHelper {
   }
 };
 
-template <typename T> class FModDivisionInvMultHelper {
-private:
-  using FPB = FPBits<T>;
-  using StorageType = typename FPB::StorageType;
-
-public:
-  LIBC_INLINE constexpr static StorageType execute(int exp_diff,
-                                                   int sides_zeroes_count,
-                                                   StorageType m_x,
-                                                   StorageType m_y) {
+template <typename T> struct FModDivisionInvMultHelper {
+  LIBC_INLINE constexpr static T execute(int exp_diff, int sides_zeroes_count,
+                                         T m_x, T m_y) {
+    constexpr int LENGTH = sizeof(T) * CHAR_BIT;
     if (exp_diff > sides_zeroes_count) {
-      StorageType inv_hy = (cpp::numeric_limits<StorageType>::max() / m_y);
+      T inv_hy = (cpp::numeric_limits<T>::max() / m_y);
       while (exp_diff > sides_zeroes_count) {
         exp_diff -= sides_zeroes_count;
-        StorageType hd =
-            (m_x * inv_hy) >> (FPB::TOTAL_LEN - sides_zeroes_count);
+        T hd = (m_x * inv_hy) >> (LENGTH - sides_zeroes_count);
         m_x <<= sides_zeroes_count;
         m_x -= hd * m_y;
         while (LIBC_UNLIKELY(m_x > m_y))
           m_x -= m_y;
       }
-      StorageType hd = (m_x * inv_hy) >> (FPB::TOTAL_LEN - exp_diff);
+      T hd = (m_x * inv_hy) >> (LENGTH - exp_diff);
       m_x <<= exp_diff;
       m_x -= hd * m_y;
       while (LIBC_UNLIKELY(m_x > m_y))
@@ -219,22 +158,49 @@ template <typename T> class FModDivisionInvMultHelper {
   }
 };
 
-template <typename T, class Wrapper = FModExceptionalInputHandler<T>,
-          class DivisionHelper = FModDivisionSimpleHelper<T>>
+template <typename T, typename U = typename FPBits<T>::StorageType,
+          typename DivisionHelper = FModDivisionSimpleHelper<U>>
 class FMod {
-  static_assert(cpp::is_floating_point_v<T>,
+  static_assert(cpp::is_floating_point_v<T> && cpp::is_unsigned_v<U> &&
+                    (sizeof(U) * CHAR_BIT > FPBits<T>::FRACTION_LEN),
                 "FMod instantiated with invalid type.");
 
 private:
   using FPB = FPBits<T>;
   using StorageType = typename FPB::StorageType;
 
+  LIBC_INLINE static bool pre_check(T x, T y, T &out) {
+    using FPB = fputil::FPBits<T>;
+    const T quiet_nan = FPB::quiet_nan().get_val();
+    FPB sx(x), sy(y);
+    if (LIBC_LIKELY(!sy.is_zero() && !sy.is_inf_or_nan() &&
+                    !sx.is_inf_or_nan()))
+      return false;
+
+    if (sx.is_nan() || sy.is_nan()) {
+      if (sx.is_signaling_nan() || sy.is_signaling_nan())
+        fputil::raise_except_if_required(FE_INVALID);
+      out = quiet_nan;
+      return true;
+    }
+
+    if (sx.is_inf() || sy.is_zero()) {
+      fputil::raise_except_if_required(FE_INVALID);
+      fputil::set_errno_if_required(EDOM);
+      out = quiet_nan;
+      return true;
+    }
+
+    out = x;
+    return true;
+  }
+
   LIBC_INLINE static constexpr FPB eval_internal(FPB sx, FPB sy) {
 
     if (LIBC_LIKELY(sx.uintval() <= sy.uintval())) {
       if (sx.uintval() < sy.uintval())
         return sx;             // |x|<|y| return x
-      return FPB(FPB::zero()); // |x|=|y| return 0.0
+      return FPB::zero();      // |x|=|y| return 0.0
     }
 
     int e_x = sx.get_biased_exponent();
@@ -247,11 +213,11 @@ class FMod {
       StorageType m_y = sy.get_explicit_mantissa();
       StorageType d = (e_x == e_y) ? (m_x - m_y) : (m_x << (e_x - e_y)) % m_y;
       if (d == 0)
-        return FPB(FPB::zero());
+        return FPB::zero();
       // iy - 1 because of "zero power" for number with power 1
       return FPB::make_value(d, e_y - 1);
     }
-    /* Both subnormal special case. */
+    // Both subnormal special case.
     if (LIBC_UNLIKELY(e_x == 0 && e_y == 0)) {
       FPB d;
       d.set_mantissa(sx.uintval() % sy.uintval());
@@ -259,15 +225,17 @@ class FMod {
     }
 
     // Note that hx is not subnormal by conditions above.
-    StorageType m_x = sx.get_explicit_mantissa();
+    U m_x = static_cast<U>(sx.get_explicit_mantissa());
     e_x--;
 
-    StorageType m_y = sy.get_explicit_mantissa();
-    int lead_zeros_m_y = FPB::EXP_LEN;
+    U m_y = static_cast<U>(sy.get_explicit_mantissa());
+    constexpr int DEFAULT_LEAD_ZEROS =
+        sizeof(U) * CHAR_BIT - FPB::FRACTION_LEN - 1;
+    int lead_zeros_m_y = DEFAULT_LEAD_ZEROS;
     if (LIBC_LIKELY(e_y > 0)) {
       e_y--;
     } else {
-      m_y = sy.get_mantissa();
+      m_y = static_cast<U>(sy.get_mantissa());
       lead_zeros_m_y = cpp::countl_zero(m_y);
     }
 
@@ -286,26 +254,27 @@ class FMod {
 
     {
       // Shift hx left until the end or n = 0
-      int left_shift = exp_diff < int(FPB::EXP_LEN) ? exp_diff : FPB::EXP_LEN;
+      int left_shift =
+          exp_diff < DEFAULT_LEAD_ZEROS ? exp_diff : DEFAULT_LEAD_ZEROS;
       m_x <<= left_shift;
       exp_diff -= left_shift;
     }
 
     m_x %= m_y;
     if (LIBC_UNLIKELY(m_x == 0))
-      return FPB(FPB::zero());
+      return FPB::zero();
 
     if (exp_diff == 0)
-      return FPB::make_value(m_x, e_y);
+      return FPB::make_value(static_cast<StorageType>(m_x), e_y);
 
-    /* hx next can't be 0, because hx < hy, hy % 2 == 1 hx * 2^i % hy != 0 */
+    // hx next can't be 0, because hx < hy, hy % 2 == 1 hx * 2^i % hy != 0
     m_x = DivisionHelper::execute(exp_diff, sides_zeroes_count, m_x, m_y);
-    return FPB::make_value(m_x, e_y);
+    return FPB::make_value(static_cast<StorageType>(m_x), e_y);
   }
 
 public:
   LIBC_INLINE static T eval(T x, T y) {
-    if (T out; Wrapper::pre_check(x, y, out))
+    if (T out; LIBC_UNLIKELY(pre_check(x, y, out)))
       return out;
     FPB sx(x), sy(y);
     Sign sign = sx.sign();
diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt
index 6c06d383ec2b0..bba02aa78a231 100644
--- a/libc/src/math/CMakeLists.txt
+++ b/libc/src/math/CMakeLists.txt
@@ -119,6 +119,8 @@ add_math_entrypoint_object(fminf128)
 
 add_math_entrypoint_object(fmod)
 add_math_entrypoint_object(fmodf)
+add_math_entrypoint_object(fmodl)
+add_math_entrypoint_object(fmodf128)
 
 add_math_entrypoint_object(frexp)
 add_math_entrypoint_object(frexpf)
diff --git a/libc/src/math/fmodf128.h b/libc/src/math/fmodf128.h
new file mode 100644
index 0000000000000..b3242705f025e
--- /dev/null
+++ b/libc/src/math/fmodf128.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for fmodf128 ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_FMODF128_H
+#define LLVM_LIBC_SRC_MATH_FMODF128_H
+
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE {
+
+float128 fmodf128(float128 x, float128 y);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_MATH_FMODF128_H
diff --git a/libc/src/math/fmodl.h b/libc/src/math/fmodl.h
new file mode 100644
index 0000000000000..f259ddb238a8e
--- /dev/null
+++ b/libc/src/math/fmodl.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for fmodl -------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_FMODL_H
+#define LLVM_LIBC_SRC_MATH_FMODL_H
+
+namespace LIBC_NAMESPACE {
+
+long double fmodl(long double x, long double y);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_MATH_FMODL_H
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index 933a05dad157c..bc4e9b34cfc2f 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -1859,7 +1859,6 @@ add_entrypoint_object(
   HDRS
     ../fmod.h
   DEPENDS
-    libc.include.math
     libc.src.__support.FPUtil.generic.fmod
   COMPILE_OPTIONS
     -O3
@@ -1872,7 +1871,31 @@ add_entrypoint_object(
   HDRS
     ../fmodf.h
   DEPENDS
-    libc.include.math
+    libc.src.__support.FPUtil.generic.fmod
+  COMPILE_OPTIONS
+    -O3
+)
+
+add_entrypoint_object(
+  fmodl
+  SRCS
+    fmodl.cpp
+  HDRS
+    ../fmodl.h
+  DEPENDS
+    libc.src.__support.FPUtil.generic.fmod
+  COMPILE_OPTIONS
+    -O3
+)
+
+add_entrypoint_object(
+  fmodf128
+  SRCS
+    fmodf128.cpp
+  HDRS
+    ../fmodf128.h
+  DEPENDS
+    libc.src.__support.macros.properties.types
     libc.src.__support.FPUtil.generic.fmod
   COMPILE_OPTIONS
     -O3
diff --git a/libc/src/math/generic/fmodf.cpp b/libc/src/math/generic/fmodf.cpp
index 7a29ff1f18d31..9a9e46e29b466 100644
--- a/libc/src/math/generic/fmodf.cpp
+++ b/libc/src/math/generic/fmodf.cpp
@@ -13,7 +13,7 @@
 namespace LIBC_NAMESPACE {
 
 LLVM_LIBC_FUNCTION(float, fmodf, (float x, float y)) {
-  return fputil::generic::FMod<float>::eval(x, y);
+  return fputil::generic::FMod<float, uint64_t>::eval(x, y);
 }
 
 } // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/fmodf128.cpp b/libc/src/math/generic/fmodf128.cpp
new file mode 100644
index 0000000000000..08a379702d889
--- /dev/null
+++ b/libc/src/math/generic/fmodf128.cpp
@@ -0,0 +1,19 @@
+//===-- Single-precision fmodf128 function --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/fmodf128.h"
+#include "src/__support/FPUtil/generic/FMod.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float128, fmodf128, (float128 x, float128 y)) {
+  return fputil::generic::FMod<float128>::eval(x, y);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/fmodl.cpp b/libc/src/math/generic/fmodl.cpp
new file mode 100644
index 0000000000000..23a3702890557
--- /dev/null
+++ b/libc/src/math/generic/fmodl.cpp
@@ -0,0 +1,19 @@
+//===-- Single-precision fmodl function -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/fmodl.h"
+#include "src/__support/FPUtil/generic/FMod.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(long double, fmodl, (long double x, long double y)) {
+  return fputil::generic::FMod<long double>::eval(x, y);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/test/include/CMakeLists.txt b/libc/test/include/CMakeLists.txt
index bf845c94170f9..8d8dff53169f6 100644
--- a/libc/test/include/CMakeLists.txt
+++ b/libc/test/include/CMakeLists.txt
@@ -22,16 +22,42 @@ if(LLVM_LIBC_FULL_BUILD AND libc.include.stdbit IN_LIST TARGET_PUBLIC_HEADERS)
     stdbit_test
     SUITE
       libc_include_tests
+    HDRS
+      stdbit_stub.h
     SRCS
       stdbit_test.cpp
     DEPENDS
       libc.include.llvm-libc-macros.stdbit_macros
+      libc.include.llvm_libc_common_h
       libc.include.stdbit
       # Intentionally do not depend on libc.src.stdbit.*. The include test is
       # simply testing the macros provided by stdbit.h, not the implementation
       # of the underlying functions which the type generic macros may dispatch
       # to.
   )
+  add_libc_test(
+    stdbit_c_test
+    C_TEST
+    UNIT_TEST_ONLY
+    SUITE
+      libc_include_tests
+    HDRS
+      stdbit_stub.h
+    SRCS
+      stdbit_test.c
+    COMPILE_OPTIONS
+      -Wall
+      -Werror
+    DEPENDS
+      libc.include.llvm-libc-macros.stdbit_macros
+      libc.include.llvm_libc_common_h
+      libc.include.stdbit
+      libc.src.assert.__assert_fail
+      # Intentionally do not depend on libc.src.stdbit.*. The include test is
+      # simply testing the macros provided by stdbit.h, not the implementation
+      # of the underlying functions which the type generic macros may dispatch
+      # to.
+  )
 endif()
 
 add_libc_test(
diff --git a/libc/test/include/stdbit_stub.h b/libc/test/include/stdbit_stub.h
new file mode 100644
index 0000000000000..65b1ca3b2c297
--- /dev/null
+++ b/libc/test/include/stdbit_stub.h
@@ -0,0 +1,73 @@
+//===-- Utilities for testing stdbit --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDSList-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+/*
+ * Declare these BEFORE including stdbit-macros.h so that this test may still be
+ * run even if a given target doesn't yet have these individual entrypoints
+ * enabled.
+ */
+
+#include "include/__llvm-libc-common.h"
+
+#include <stdbool.h> // bool in C
+
+#define STDBIT_STUB_FUNCTION(FUNC_NAME, LEADING_VAL)                           \
+  unsigned FUNC_NAME##_uc(unsigned char x) __NOEXCEPT {                        \
+    return LEADING_VAL##AU;                                                    \
+  }                                                                            \
+  unsigned FUNC_NAME##_us(unsigned short x) __NOEXCEPT {                       \
+    return LEADING_VAL##BU;                                                    \
+  }                                                                            \
+  unsigned FUNC_NAME##_ui(unsigned int x) __NOEXCEPT {                         \
+    return LEADING_VAL##CU;                                                    \
+  }                                                                            \
+  unsigned FUNC_NAME##_ul(unsigned long x) __NOEXCEPT {                        \
+    return LEADING_VAL##DU;                                                    \
+  }                                                                            \
+  unsigned FUNC_NAME##_ull(unsigned long long x) __NOEXCEPT {                  \
+    return LEADING_VAL##EU;                                                    \
+  }
+
+__BEGIN_C_DECLS
+
+STDBIT_STUB_FUNCTION(stdc_leading_zeros, 0xA)
+STDBIT_STUB_FUNCTION(stdc_leading_ones, 0xB)
+STDBIT_STUB_FUNCTION(stdc_trailing_zeros, 0xC)
+STDBIT_STUB_FUNCTION(stdc_trailing_ones, 0xD)
+STDBIT_STUB_FUNCTION(stdc_first_leading_zero, 0xE)
+STDBIT_STUB_FUNCTION(stdc_first_leading_one, 0xF)
+STDBIT_STUB_FUNCTION(stdc_first_trailing_zero, 0x0)
+STDBIT_STUB_FUNCTION(stdc_first_trailing_one, 0x1)
+STDBIT_STUB_FUNCTION(stdc_count_zeros, 0x2)
+STDBIT_STUB_FUNCTION(stdc_count_ones, 0x3)
+
+bool stdc_has_single_bit_uc(unsigned char x) __NOEXCEPT { return false; }
+bool stdc_has_single_bit_us(unsigned short x) __NOEXCEPT { return false; }
+bool stdc_has_single_bit_ui(unsigned x) __NOEXCEPT { return false; }
+bool stdc_has_single_bit_ul(unsigned long x) __NOEXCEPT { return false; }
+bool stdc_has_single_bit_ull(unsigned long long x) __NOEXCEPT { return false; }
+
+STDBIT_STUB_FUNCTION(stdc_bit_width, 0x4)
+
+unsigned char stdc_bit_floor_uc(unsigned char x) __NOEXCEPT { return 0x5AU; }
+unsigned short stdc_bit_floor_us(unsigned short x) __NOEXCEPT { return 0x5BU; }
+unsigned stdc_bit_floor_ui(unsigned x) __NOEXCEPT { return 0x5CU; }
+unsigned long stdc_bit_floor_ul(unsigned long x) __NOEXCEPT { return 0x5DUL; }
+unsigned long long stdc_bit_floor_ull(unsigned long long x) __NOEXCEPT {
+  return 0x5EULL;
+}
+
+unsigned char stdc_bit_ceil_uc(unsigned char x) __NOEXCEPT { return 0x6AU; }
+unsigned short stdc_bit_ceil_us(unsigned short x) __NOEXCEPT { return 0x6BU; }
+unsigned stdc_bit_ceil_ui(unsigned x) __NOEXCEPT { return 0x6CU; }
+unsigned long stdc_bit_ceil_ul(unsigned long x) __NOEXCEPT { return 0x6DUL; }
+unsigned long long stdc_bit_ceil_ull(unsigned long long x) __NOEXCEPT {
+  return 0x6EULL;
+}
+
+__END_C_DECLS
diff --git a/libc/test/include/stdbit_test.c b/libc/test/include/stdbit_test.c
new file mode 100644
index 0000000000000..e278e9a7374e0
--- /dev/null
+++ b/libc/test/include/stdbit_test.c
@@ -0,0 +1,61 @@
+//===-- Unittests for stdbit ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDSList-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+/*
+ * The intent of this test is validate that:
+ * 1. We provide the definition of the various type generic macros of stdbit.h
+ * (the macros are transitively included from stdbit-macros.h by stdbit.h).
+ * 2. It dispatches to the correct underlying function.
+ * Because unit tests build without public packaging, the object files produced
+ * do not contain non-namespaced symbols.
+ */
+
+/*
+ * Declare these BEFORE including stdbit-macros.h so that this test may still be
+ * run even if a given target doesn't yet have these individual entrypoints
+ * enabled.
+ */
+#include "stdbit_stub.h"
+
+#include "include/llvm-libc-macros/stdbit-macros.h"
+
+#include <assert.h>
+
+#define CHECK_FUNCTION(FUNC_NAME, VAL)                                         \
+  do {                                                                         \
+    assert(FUNC_NAME((unsigned char)0U) == VAL##AU);                           \
+    assert(FUNC_NAME((unsigned short)0U) == VAL##BU);                          \
+    assert(FUNC_NAME(0U) == VAL##CU);                                          \
+    assert(FUNC_NAME(0UL) == VAL##DU);                                         \
+    assert(FUNC_NAME(0ULL) == VAL##EU);                                        \
+  } while (0)
+
+int main(void) {
+  CHECK_FUNCTION(stdc_leading_zeros, 0xA);
+  CHECK_FUNCTION(stdc_leading_ones, 0xB);
+  CHECK_FUNCTION(stdc_trailing_zeros, 0xC);
+  CHECK_FUNCTION(stdc_trailing_ones, 0xD);
+  CHECK_FUNCTION(stdc_first_leading_zero, 0xE);
+  CHECK_FUNCTION(stdc_first_leading_one, 0xF);
+  CHECK_FUNCTION(stdc_first_trailing_zero, 0x0);
+  CHECK_FUNCTION(stdc_first_trailing_one, 0x1);
+  CHECK_FUNCTION(stdc_count_zeros, 0x2);
+  CHECK_FUNCTION(stdc_count_ones, 0x3);
+
+  assert(!stdc_has_single_bit((unsigned char)1U));
+  assert(!stdc_has_single_bit((unsigned short)1U));
+  assert(!stdc_has_single_bit(1U));
+  assert(!stdc_has_single_bit(1UL));
+  assert(!stdc_has_single_bit(1ULL));
+
+  CHECK_FUNCTION(stdc_bit_width, 0x4);
+  CHECK_FUNCTION(stdc_bit_floor, 0x5);
+  CHECK_FUNCTION(stdc_bit_ceil, 0x6);
+
+  return 0;
+}
diff --git a/libc/test/include/stdbit_test.cpp b/libc/test/include/stdbit_test.cpp
index 6c12665c4454d..f3227eb86959e 100644
--- a/libc/test/include/stdbit_test.cpp
+++ b/libc/test/include/stdbit_test.cpp
@@ -22,90 +22,7 @@
  * run even if a given target doesn't yet have these individual entrypoints
  * enabled.
  */
-extern "C" {
-unsigned stdc_leading_zeros_uc(unsigned char) noexcept { return 0xAAU; }
-unsigned stdc_leading_zeros_us(unsigned short) noexcept { return 0xABU; }
-unsigned stdc_leading_zeros_ui(unsigned) noexcept { return 0xACU; }
-unsigned stdc_leading_zeros_ul(unsigned long) noexcept { return 0xADU; }
-unsigned stdc_leading_zeros_ull(unsigned long long) noexcept { return 0xAEU; }
-unsigned stdc_leading_ones_uc(unsigned char) noexcept { return 0xBAU; }
-unsigned stdc_leading_ones_us(unsigned short) noexcept { return 0xBBU; }
-unsigned stdc_leading_ones_ui(unsigned) noexcept { return 0xBCU; }
-unsigned stdc_leading_ones_ul(unsigned long) noexcept { return 0xBDU; }
-unsigned stdc_leading_ones_ull(unsigned long long) noexcept { return 0xBEU; }
-unsigned stdc_trailing_zeros_uc(unsigned char) noexcept { return 0xCAU; }
-unsigned stdc_trailing_zeros_us(unsigned short) noexcept { return 0xCBU; }
-unsigned stdc_trailing_zeros_ui(unsigned) noexcept { return 0xCCU; }
-unsigned stdc_trailing_zeros_ul(unsigned long) noexcept { return 0xCDU; }
-unsigned stdc_trailing_zeros_ull(unsigned long long) noexcept { return 0xCEU; }
-unsigned stdc_trailing_ones_uc(unsigned char) noexcept { return 0xDAU; }
-unsigned stdc_trailing_ones_us(unsigned short) noexcept { return 0xDBU; }
-unsigned stdc_trailing_ones_ui(unsigned) noexcept { return 0xDCU; }
-unsigned stdc_trailing_ones_ul(unsigned long) noexcept { return 0xDDU; }
-unsigned stdc_trailing_ones_ull(unsigned long long) noexcept { return 0xDEU; }
-unsigned stdc_first_leading_zero_uc(unsigned char) noexcept { return 0xEAU; }
-unsigned stdc_first_leading_zero_us(unsigned short) noexcept { return 0xEBU; }
-unsigned stdc_first_leading_zero_ui(unsigned) noexcept { return 0xECU; }
-unsigned stdc_first_leading_zero_ul(unsigned long) noexcept { return 0xEDU; }
-unsigned stdc_first_leading_zero_ull(unsigned long long) noexcept {
-  return 0xEEU;
-}
-unsigned stdc_first_leading_one_uc(unsigned char) noexcept { return 0xFAU; }
-unsigned stdc_first_leading_one_us(unsigned short) noexcept { return 0xFBU; }
-unsigned stdc_first_leading_one_ui(unsigned) noexcept { return 0xFCU; }
-unsigned stdc_first_leading_one_ul(unsigned long) noexcept { return 0xFDU; }
-unsigned stdc_first_leading_one_ull(unsigned long long) noexcept {
-  return 0xFEU;
-}
-unsigned stdc_first_trailing_zero_uc(unsigned char) noexcept { return 0x0AU; }
-unsigned stdc_first_trailing_zero_us(unsigned short) noexcept { return 0x0BU; }
-unsigned stdc_first_trailing_zero_ui(unsigned) noexcept { return 0x0CU; }
-unsigned stdc_first_trailing_zero_ul(unsigned long) noexcept { return 0x0DU; }
-unsigned stdc_first_trailing_zero_ull(unsigned long long) noexcept {
-  return 0x0EU;
-}
-unsigned stdc_first_trailing_one_uc(unsigned char) noexcept { return 0x1AU; }
-unsigned stdc_first_trailing_one_us(unsigned short) noexcept { return 0x1BU; }
-unsigned stdc_first_trailing_one_ui(unsigned) noexcept { return 0x1CU; }
-unsigned stdc_first_trailing_one_ul(unsigned long) noexcept { return 0x1DU; }
-unsigned stdc_first_trailing_one_ull(unsigned long long) noexcept {
-  return 0x1EU;
-}
-unsigned stdc_count_zeros_uc(unsigned char) noexcept { return 0x2AU; }
-unsigned stdc_count_zeros_us(unsigned short) noexcept { return 0x2BU; }
-unsigned stdc_count_zeros_ui(unsigned) noexcept { return 0x2CU; }
-unsigned stdc_count_zeros_ul(unsigned long) noexcept { return 0x2DU; }
-unsigned stdc_count_zeros_ull(unsigned long long) noexcept { return 0x2EU; }
-unsigned stdc_count_ones_uc(unsigned char) noexcept { return 0x3AU; }
-unsigned stdc_count_ones_us(unsigned short) noexcept { return 0x3BU; }
-unsigned stdc_count_ones_ui(unsigned) noexcept { return 0x3CU; }
-unsigned stdc_count_ones_ul(unsigned long) noexcept { return 0x3DU; }
-unsigned stdc_count_ones_ull(unsigned long long) noexcept { return 0x3EU; }
-bool stdc_has_single_bit_uc(unsigned char) noexcept { return false; }
-bool stdc_has_single_bit_us(unsigned short) noexcept { return false; }
-bool stdc_has_single_bit_ui(unsigned) noexcept { return false; }
-bool stdc_has_single_bit_ul(unsigned long) noexcept { return false; }
-bool stdc_has_single_bit_ull(unsigned long long) noexcept { return false; }
-unsigned stdc_bit_width_uc(unsigned char) noexcept { return 0x4AU; }
-unsigned stdc_bit_width_us(unsigned short) noexcept { return 0x4BU; }
-unsigned stdc_bit_width_ui(unsigned) noexcept { return 0x4CU; }
-unsigned stdc_bit_width_ul(unsigned long) noexcept { return 0x4DU; }
-unsigned stdc_bit_width_ull(unsigned long long) noexcept { return 0x4EU; }
-unsigned char stdc_bit_floor_uc(unsigned char) noexcept { return 0x5AU; }
-unsigned short stdc_bit_floor_us(unsigned short) noexcept { return 0x5BU; }
-unsigned stdc_bit_floor_ui(unsigned) noexcept { return 0x5CU; }
-unsigned long stdc_bit_floor_ul(unsigned long) noexcept { return 0x5DU; }
-unsigned long long stdc_bit_floor_ull(unsigned long long) noexcept {
-  return 0x5EU;
-}
-unsigned char stdc_bit_ceil_uc(unsigned char) noexcept { return 0x6AU; }
-unsigned short stdc_bit_ceil_us(unsigned short) noexcept { return 0x6BU; }
-unsigned stdc_bit_ceil_ui(unsigned) noexcept { return 0x6CU; }
-unsigned long stdc_bit_ceil_ul(unsigned long) noexcept { return 0x6DU; }
-unsigned long long stdc_bit_ceil_ull(unsigned long long) noexcept {
-  return 0x6EU;
-}
-}
+#include "stdbit_stub.h"
 
 #include "include/llvm-libc-macros/stdbit-macros.h"
 
diff --git a/libc/test/src/math/exhaustive/fmod_generic_impl_test.cpp b/libc/test/src/math/exhaustive/fmod_generic_impl_test.cpp
index b47d24c54869b..25a5e3898599a 100644
--- a/libc/test/src/math/exhaustive/fmod_generic_impl_test.cpp
+++ b/libc/test/src/math/exhaustive/fmod_generic_impl_test.cpp
@@ -19,10 +19,11 @@ namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 template <typename T, bool InverseMultiplication>
 class LlvmLibcFModTest : public LIBC_NAMESPACE::testing::Test {
 
+  using U = typename LIBC_NAMESPACE::fputil::FPBits<T>::StorageType;
   using DivisionHelper = LIBC_NAMESPACE::cpp::conditional_t<
       InverseMultiplication,
-      LIBC_NAMESPACE::fputil::generic::FModDivisionInvMultHelper<T>,
-      LIBC_NAMESPACE::fputil::generic::FModDivisionSimpleHelper<T>>;
+      LIBC_NAMESPACE::fputil::generic::FModDivisionInvMultHelper<U>,
+      LIBC_NAMESPACE::fputil::generic::FModDivisionSimpleHelper<U>>;
 
   static constexpr std::array<T, 11> test_bases = {
       T(0.0),
@@ -39,9 +40,7 @@ class LlvmLibcFModTest : public LIBC_NAMESPACE::testing::Test {
 
 public:
   void testExtensive() {
-    using FMod = LIBC_NAMESPACE::fputil::generic::FMod<
-        T, LIBC_NAMESPACE::fputil::generic::FModFastMathWrapper<T>,
-        DivisionHelper>;
+    using FMod = LIBC_NAMESPACE::fputil::generic::FMod<T, U, DivisionHelper>;
     using nl = std::numeric_limits<T>;
     int min2 = nl::min_exponent - nl::digits - 5;
     int max2 = nl::max_exponent + 3;
diff --git a/libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h b/libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h
index 68d37b46b77c7..504d1be94b891 100644
--- a/libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h
+++ b/libc/test/src/math/performance_testing/BinaryOpSingleOutputPerf.h
@@ -86,7 +86,7 @@ template <typename T> class BinaryOpSingleOutputPerf {
            "close to each other:\n";
     run_perf_in_range(
         myFunc, otherFunc, /* startingBit= */ FPBits(T(0x1.0p-10)).uintval(),
-        /* endingBit= */ FPBits(T(0x1.0p+10)).uintval(), 10'000'001, log);
+        /* endingBit= */ FPBits(T(0x1.0p+10)).uintval(), 1'001'001, log);
   }
 
   static void run_diff(Func myFunc, Func otherFunc, const char *logFile) {
diff --git a/libc/test/src/math/performance_testing/CMakeLists.txt b/libc/test/src/math/performance_testing/CMakeLists.txt
index d20c2eb303a7c..d1fb24e37f728 100644
--- a/libc/test/src/math/performance_testing/CMakeLists.txt
+++ b/libc/test/src/math/performance_testing/CMakeLists.txt
@@ -331,3 +331,25 @@ add_perf_binary(
   COMPILE_OPTIONS
     -fno-builtin
 )
+
+add_perf_binary(
+  fmodl_perf
+  SRCS
+    fmodl_perf.cpp
+  DEPENDS
+    .single_input_single_output_diff
+    libc.src.math.fmodl
+  COMPILE_OPTIONS
+    -fno-builtin
+)
+
+add_perf_binary(
+  fmodf128_perf
+  SRCS
+    fmodf128_perf.cpp
+  DEPENDS
+    .single_input_single_output_diff
+    libc.src.math.fmodf128
+  COMPILE_OPTIONS
+    -fno-builtin
+)
diff --git a/libc/test/src/math/performance_testing/fmodf128_perf.cpp b/libc/test/src/math/performance_testing/fmodf128_perf.cpp
new file mode 100644
index 0000000000000..8165e9254dd56
--- /dev/null
+++ b/libc/test/src/math/performance_testing/fmodf128_perf.cpp
@@ -0,0 +1,16 @@
+//===-- Differential test for fmodf128 ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "BinaryOpSingleOutputDiff.h"
+
+#include "src/math/fmodf128.h"
+
+#include <math.h>
+
+BINARY_OP_SINGLE_OUTPUT_PERF(float, LIBC_NAMESPACE::fmodf128, ::fmodf128,
+                             "fmodf128_perf.log")
diff --git a/libc/test/src/math/performance_testing/fmodl_perf.cpp b/libc/test/src/math/performance_testing/fmodl_perf.cpp
new file mode 100644
index 0000000000000..aefdf2d6b42fc
--- /dev/null
+++ b/libc/test/src/math/performance_testing/fmodl_perf.cpp
@@ -0,0 +1,16 @@
+//===-- Differential test for fmodl ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "BinaryOpSingleOutputDiff.h"
+
+#include "src/math/fmodl.h"
+
+#include <math.h>
+
+BINARY_OP_SINGLE_OUTPUT_PERF(long double, LIBC_NAMESPACE::fmodl, ::fmodl,
+                             "fmodl_perf.log")
diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt
index 8d3871dd427aa..d9be172056a83 100644
--- a/libc/test/src/math/smoke/CMakeLists.txt
+++ b/libc/test/src/math/smoke/CMakeLists.txt
@@ -1793,6 +1793,42 @@ add_fp_unittest(
   UNIT_TEST_ONLY
 )
 
+add_fp_unittest(
+  fmodl_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    fmodl_test.cpp
+  HDRS
+    FModTest.h
+  DEPENDS
+    libc.include.math
+    libc.src.errno.errno
+    libc.src.math.fmodl
+    libc.src.__support.FPUtil.basic_operations
+    libc.src.__support.FPUtil.nearest_integer_operations
+  # FIXME: Currently fails on the GPU build.
+  UNIT_TEST_ONLY
+)
+
+add_fp_unittest(
+  fmodf128_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    fmodf128_test.cpp
+  HDRS
+    FModTest.h
+  DEPENDS
+    libc.include.math
+    libc.src.errno.errno
+    libc.src.math.fmodf128
+    libc.src.__support.FPUtil.basic_operations
+    libc.src.__support.FPUtil.nearest_integer_operations
+  # FIXME: Currently fails on the GPU build.
+  UNIT_TEST_ONLY
+)
+
 add_fp_unittest(
   coshf_test
   SUITE
diff --git a/libc/test/src/math/smoke/fmodf128_test.cpp b/libc/test/src/math/smoke/fmodf128_test.cpp
new file mode 100644
index 0000000000000..f75aadac84386
--- /dev/null
+++ b/libc/test/src/math/smoke/fmodf128_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for fmodf128 --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "FModTest.h"
+
+#include "src/math/fmodf128.h"
+
+LIST_FMOD_TESTS(float128, LIBC_NAMESPACE::fmodf128)
diff --git a/libc/test/src/math/smoke/fmodl_test.cpp b/libc/test/src/math/smoke/fmodl_test.cpp
new file mode 100644
index 0000000000000..b69ed8ec85c84
--- /dev/null
+++ b/libc/test/src/math/smoke/fmodl_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for fmodl -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "FModTest.h"
+
+#include "src/math/fmodl.h"
+
+LIST_FMOD_TESTS(long double, LIBC_NAMESPACE::fmodl)
diff --git a/libc/test/src/string/memory_utils/CMakeLists.txt b/libc/test/src/string/memory_utils/CMakeLists.txt
index 567f85e37bfab..a0dddd2f97b58 100644
--- a/libc/test/src/string/memory_utils/CMakeLists.txt
+++ b/libc/test/src/string/memory_utils/CMakeLists.txt
@@ -12,6 +12,7 @@ add_libc_test(
     libc.src.__support.CPP.array
     libc.src.__support.CPP.cstddef
     libc.src.__support.CPP.span
+    libc.src.__support.macros.properties.types
     libc.src.__support.macros.sanitizer
     libc.src.string.memory_utils.memory_utils
   UNIT_TEST_ONLY
diff --git a/libc/test/src/string/memory_utils/op_tests.cpp b/libc/test/src/string/memory_utils/op_tests.cpp
index 95a04755eb4db..703a26b16b03f 100644
--- a/libc/test/src/string/memory_utils/op_tests.cpp
+++ b/libc/test/src/string/memory_utils/op_tests.cpp
@@ -10,6 +10,7 @@
 #include "src/__support/macros/properties/types.h" // LIBC_TYPES_HAS_INT64
 #include "src/string/memory_utils/op_aarch64.h"
 #include "src/string/memory_utils/op_builtin.h"
+#include "src/string/memory_utils/op_generic.h"
 #include "src/string/memory_utils/op_riscv.h"
 #include "src/string/memory_utils/op_x86.h"
 #include "test/UnitTest/Test.h"
diff --git a/libcxx/include/__hash_table b/libcxx/include/__hash_table
index ec7d694c4a55f..e6691e78a267f 100644
--- a/libcxx/include/__hash_table
+++ b/libcxx/include/__hash_table
@@ -284,13 +284,21 @@ public:
 
   _LIBCPP_HIDE_FROM_ABI __hash_iterator() _NOEXCEPT : __node_(nullptr) {}
 
-  _LIBCPP_HIDE_FROM_ABI reference operator*() const { return __node_->__upcast()->__get_value(); }
+  _LIBCPP_HIDE_FROM_ABI reference operator*() const {
+    _LIBCPP_ASSERT_NON_NULL(
+        __node_ != nullptr, "Attempted to dereference a non-dereferenceable unordered container iterator");
+    return __node_->__upcast()->__get_value();
+  }
 
   _LIBCPP_HIDE_FROM_ABI pointer operator->() const {
+    _LIBCPP_ASSERT_NON_NULL(
+        __node_ != nullptr, "Attempted to dereference a non-dereferenceable unordered container iterator");
     return pointer_traits<pointer>::pointer_to(__node_->__upcast()->__get_value());
   }
 
   _LIBCPP_HIDE_FROM_ABI __hash_iterator& operator++() {
+    _LIBCPP_ASSERT_NON_NULL(
+        __node_ != nullptr, "Attempted to increment a non-incrementable unordered container iterator");
     __node_ = __node_->__next_;
     return *this;
   }
@@ -345,12 +353,20 @@ public:
 
   _LIBCPP_HIDE_FROM_ABI __hash_const_iterator(const __non_const_iterator& __x) _NOEXCEPT : __node_(__x.__node_) {}
 
-  _LIBCPP_HIDE_FROM_ABI reference operator*() const { return __node_->__upcast()->__get_value(); }
+  _LIBCPP_HIDE_FROM_ABI reference operator*() const {
+    _LIBCPP_ASSERT_NON_NULL(
+        __node_ != nullptr, "Attempted to dereference a non-dereferenceable unordered container const_iterator");
+    return __node_->__upcast()->__get_value();
+  }
   _LIBCPP_HIDE_FROM_ABI pointer operator->() const {
+    _LIBCPP_ASSERT_NON_NULL(
+        __node_ != nullptr, "Attempted to dereference a non-dereferenceable unordered container const_iterator");
     return pointer_traits<pointer>::pointer_to(__node_->__upcast()->__get_value());
   }
 
   _LIBCPP_HIDE_FROM_ABI __hash_const_iterator& operator++() {
+    _LIBCPP_ASSERT_NON_NULL(
+        __node_ != nullptr, "Attempted to increment a non-incrementable unordered container const_iterator");
     __node_ = __node_->__next_;
     return *this;
   }
@@ -400,13 +416,21 @@ public:
 
   _LIBCPP_HIDE_FROM_ABI __hash_local_iterator() _NOEXCEPT : __node_(nullptr) {}
 
-  _LIBCPP_HIDE_FROM_ABI reference operator*() const { return __node_->__upcast()->__get_value(); }
+  _LIBCPP_HIDE_FROM_ABI reference operator*() const {
+    _LIBCPP_ASSERT_NON_NULL(
+        __node_ != nullptr, "Attempted to dereference a non-dereferenceable unordered container local_iterator");
+    return __node_->__upcast()->__get_value();
+  }
 
   _LIBCPP_HIDE_FROM_ABI pointer operator->() const {
+    _LIBCPP_ASSERT_NON_NULL(
+        __node_ != nullptr, "Attempted to dereference a non-dereferenceable unordered container local_iterator");
     return pointer_traits<pointer>::pointer_to(__node_->__upcast()->__get_value());
   }
 
   _LIBCPP_HIDE_FROM_ABI __hash_local_iterator& operator++() {
+    _LIBCPP_ASSERT_NON_NULL(
+        __node_ != nullptr, "Attempted to increment a non-incrementable unordered container local_iterator");
     __node_ = __node_->__next_;
     if (__node_ != nullptr && std::__constrain_hash(__node_->__hash(), __bucket_count_) != __bucket_)
       __node_ = nullptr;
@@ -475,13 +499,21 @@ public:
         __bucket_(__x.__bucket_),
         __bucket_count_(__x.__bucket_count_) {}
 
-  _LIBCPP_HIDE_FROM_ABI reference operator*() const { return __node_->__upcast()->__get_value(); }
+  _LIBCPP_HIDE_FROM_ABI reference operator*() const {
+    _LIBCPP_ASSERT_NON_NULL(
+        __node_ != nullptr, "Attempted to dereference a non-dereferenceable unordered container const_local_iterator");
+    return __node_->__upcast()->__get_value();
+  }
 
   _LIBCPP_HIDE_FROM_ABI pointer operator->() const {
+    _LIBCPP_ASSERT_NON_NULL(
+        __node_ != nullptr, "Attempted to dereference a non-dereferenceable unordered container const_local_iterator");
     return pointer_traits<pointer>::pointer_to(__node_->__upcast()->__get_value());
   }
 
   _LIBCPP_HIDE_FROM_ABI __hash_const_local_iterator& operator++() {
+    _LIBCPP_ASSERT_NON_NULL(
+        __node_ != nullptr, "Attempted to increment a non-incrementable unordered container const_local_iterator");
     __node_ = __node_->__next_;
     if (__node_ != nullptr && std::__constrain_hash(__node_->__hash(), __bucket_count_) != __bucket_)
       __node_ = nullptr;
diff --git a/libcxx/include/__iterator/bounded_iter.h b/libcxx/include/__iterator/bounded_iter.h
index 906ba3df0c578..a1a941ffbaaf1 100644
--- a/libcxx/include/__iterator/bounded_iter.h
+++ b/libcxx/include/__iterator/bounded_iter.h
@@ -31,13 +31,20 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 // Iterator wrapper that carries the valid range it is allowed to access.
 //
 // This is a simple iterator wrapper for contiguous iterators that points
-// within a [begin, end) range and carries these bounds with it. The iterator
-// ensures that it is pointing within that [begin, end) range when it is
-// dereferenced.
+// within a [begin, end] range and carries these bounds with it. The iterator
+// ensures that it is pointing within [begin, end) range when it is
+// dereferenced. It also ensures that it is never iterated outside of
+// [begin, end]. This is important for two reasons:
 //
-// Arithmetic operations are allowed and the bounds of the resulting iterator
-// are not checked. Hence, it is possible to create an iterator pointing outside
-// its range, but it is not possible to dereference it.
+// 1. It allows `operator*` and `operator++` bounds checks to be `iter != end`.
+//    This is both less for the optimizer to prove, and aligns with how callers
+//    typically use iterators.
+//
+// 2. Advancing an iterator out of bounds is undefined behavior (see the table
+//    in [input.iterators]). In particular, when the underlying iterator is a
+//    pointer, it is undefined at the language level (see [expr.add]). If
+//    bounded iterators exhibited this undefined behavior, we risk compiler
+//    optimizations deleting non-redundant bounds checks.
 template <class _Iterator, class = __enable_if_t< __libcpp_is_contiguous_iterator<_Iterator>::value > >
 struct __bounded_iter {
   using value_type        = typename iterator_traits<_Iterator>::value_type;
@@ -51,8 +58,8 @@ struct __bounded_iter {
 
   // Create a singular iterator.
   //
-  // Such an iterator does not point to any object and is conceptually out of bounds, so it is
-  // not dereferenceable. Observing operations like comparison and assignment are valid.
+  // Such an iterator points past the end of an empty span, so it is not dereferenceable.
+  // Observing operations like comparison and assignment are valid.
   _LIBCPP_HIDE_FROM_ABI __bounded_iter() = default;
 
   _LIBCPP_HIDE_FROM_ABI __bounded_iter(__bounded_iter const&) = default;
@@ -70,18 +77,20 @@ struct __bounded_iter {
 
 private:
   // Create an iterator wrapping the given iterator, and whose bounds are described
-  // by the provided [begin, end) range.
+  // by the provided [begin, end] range.
   //
-  // This constructor does not check whether the resulting iterator is within its bounds.
-  // However, it does check that the provided [begin, end) range is a valid range (that
-  // is, begin <= end).
+  // The constructor does not check whether the resulting iterator is within its bounds. It is a
+  // responsibility of the container to ensure that the given bounds are valid.
   //
   // Since it is non-standard for iterators to have this constructor, __bounded_iter must
   // be created via `std::__make_bounded_iter`.
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 explicit __bounded_iter(
       _Iterator __current, _Iterator __begin, _Iterator __end)
       : __current_(__current), __begin_(__begin), __end_(__end) {
-    _LIBCPP_ASSERT_INTERNAL(__begin <= __end, "__bounded_iter(current, begin, end): [begin, end) is not a valid range");
+    _LIBCPP_ASSERT_INTERNAL(
+        __begin <= __current, "__bounded_iter(current, begin, end): current and begin are inconsistent");
+    _LIBCPP_ASSERT_INTERNAL(
+        __current <= __end, "__bounded_iter(current, begin, end): current and end are inconsistent");
   }
 
   template <class _It>
@@ -90,30 +99,37 @@ struct __bounded_iter {
 public:
   // Dereference and indexing operations.
   //
-  // These operations check that the iterator is dereferenceable, that is within [begin, end).
+  // These operations check that the iterator is dereferenceable. Since the class invariant is
+  // that the iterator is always within `[begin, end]`, we only need to check it's not pointing to
+  // `end`. This is easier for the optimizer because it aligns with the `iter != container.end()`
+  // checks that typical callers already use (see
+  // https://github.com/llvm/llvm-project/issues/78829).
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 reference operator*() const _NOEXCEPT {
     _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
-        __in_bounds(__current_), "__bounded_iter::operator*: Attempt to dereference an out-of-range iterator");
+        __current_ != __end_, "__bounded_iter::operator*: Attempt to dereference an iterator at the end");
     return *__current_;
   }
 
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pointer operator->() const _NOEXCEPT {
     _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
-        __in_bounds(__current_), "__bounded_iter::operator->: Attempt to dereference an out-of-range iterator");
+        __current_ != __end_, "__bounded_iter::operator->: Attempt to dereference an iterator at the end");
     return std::__to_address(__current_);
   }
 
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 reference operator[](difference_type __n) const _NOEXCEPT {
     _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
-        __in_bounds(__current_ + __n), "__bounded_iter::operator[]: Attempt to index an iterator out-of-range");
+        __n >= __begin_ - __current_, "__bounded_iter::operator[]: Attempt to index an iterator past the start");
+    _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
+        __n < __end_ - __current_, "__bounded_iter::operator[]: Attempt to index an iterator at or past the end");
     return __current_[__n];
   }
 
   // Arithmetic operations.
   //
-  // These operations do not check that the resulting iterator is within the bounds, since that
-  // would make it impossible to create a past-the-end iterator.
+  // These operations check that the iterator remains within `[begin, end]`.
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 __bounded_iter& operator++() _NOEXCEPT {
+    _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
+        __current_ != __end_, "__bounded_iter::operator++: Attempt to advance an iterator past the end");
     ++__current_;
     return *this;
   }
@@ -124,6 +140,8 @@ struct __bounded_iter {
   }
 
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 __bounded_iter& operator--() _NOEXCEPT {
+    _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
+        __current_ != __begin_, "__bounded_iter::operator--: Attempt to rewind an iterator past the start");
     --__current_;
     return *this;
   }
@@ -134,6 +152,10 @@ struct __bounded_iter {
   }
 
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 __bounded_iter& operator+=(difference_type __n) _NOEXCEPT {
+    _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
+        __n >= __begin_ - __current_, "__bounded_iter::operator+=: Attempt to rewind an iterator past the start");
+    _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
+        __n <= __end_ - __current_, "__bounded_iter::operator+=: Attempt to advance an iterator past the end");
     __current_ += __n;
     return *this;
   }
@@ -151,6 +173,10 @@ struct __bounded_iter {
   }
 
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 __bounded_iter& operator-=(difference_type __n) _NOEXCEPT {
+    _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
+        __n <= __current_ - __begin_, "__bounded_iter::operator-=: Attempt to rewind an iterator past the start");
+    _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
+        __n >= __current_ - __end_, "__bounded_iter::operator-=: Attempt to advance an iterator past the end");
     __current_ -= __n;
     return *this;
   }
@@ -197,15 +223,10 @@ struct __bounded_iter {
   }
 
 private:
-  // Return whether the given iterator is in the bounds of this __bounded_iter.
-  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool __in_bounds(_Iterator const& __iter) const {
-    return __iter >= __begin_ && __iter < __end_;
-  }
-
   template <class>
   friend struct pointer_traits;
   _Iterator __current_;       // current iterator
-  _Iterator __begin_, __end_; // valid range represented as [begin, end)
+  _Iterator __begin_, __end_; // valid range represented as [begin, end]
 };
 
 template <class _It>
diff --git a/libcxx/include/string_view b/libcxx/include/string_view
index e0dd5c5b19ace..e8584a69c1e1b 100644
--- a/libcxx/include/string_view
+++ b/libcxx/include/string_view
@@ -310,9 +310,10 @@ public:
       : __data_(__s),
         __size_(__len) {
 #if _LIBCPP_STD_VER >= 14
-    // This will result in creating an invalid `string_view` object -- some calculations involving `size` would
-    // overflow, making it effectively truncated.
-    _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(
+    // Allocations must fit in `ptrdiff_t` for pointer arithmetic to work. If `__len` exceeds it, the input
+    // range could not have been valid. Most likely the caller underflowed some arithmetic and inadvertently
+    // passed in a negative length.
+    _LIBCPP_ASSERT_VALID_INPUT_RANGE(
         __len <= static_cast<size_type>(numeric_limits<difference_type>::max()),
         "string_view::string_view(_CharT *, size_t): length does not fit in difference_type");
     _LIBCPP_ASSERT_NON_NULL(
diff --git a/libcxx/test/libcxx/atomics/atomics.align/align.pass.cpp b/libcxx/test/libcxx/atomics/atomics.align/align.pass.cpp
index e5cafde467603..5990fc411e504 100644
--- a/libcxx/test/libcxx/atomics/atomics.align/align.pass.cpp
+++ b/libcxx/test/libcxx/atomics/atomics.align/align.pass.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 //
 // UNSUPPORTED: c++03
-// REQUIRES: has-128-bit-atomics
+// REQUIRES: has-1024-bit-atomics
 // ADDITIONAL_COMPILE_FLAGS: -Wno-psabi
 // ... since C++20 std::__atomic_base initializes, so we get a warning about an
 // ABI change for vector variants since the constructor code for that is
diff --git a/libcxx/test/libcxx/clang_modules_include.gen.py b/libcxx/test/libcxx/clang_modules_include.gen.py
index e3593eefad2fe..61a9258237640 100644
--- a/libcxx/test/libcxx/clang_modules_include.gen.py
+++ b/libcxx/test/libcxx/clang_modules_include.gen.py
@@ -47,11 +47,8 @@
 #include <{header}>
 """)
 
-# TODO: Remove the UNSUPPORTED{BLOCKLIT}: clang-modules-build once issues with this test have been figured out.
 print(f"""\
 //--- __std_clang_module.compile.pass.mm
-// UNSUPPORTED{BLOCKLIT}: clang-modules-build
-
 // RUN{BLOCKLIT}: %{{cxx}} %s %{{flags}} %{{compile_flags}} -fmodules -fcxx-modules -fmodules-cache-path=%t -fsyntax-only
 
 // REQUIRES{BLOCKLIT}: clang-modules-build
diff --git a/libcxx/test/libcxx/containers/unord/unord.map/assert.iterator.dereference.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.map/assert.iterator.dereference.pass.cpp
new file mode 100644
index 0000000000000..f57341d64ff39
--- /dev/null
+++ b/libcxx/test/libcxx/containers/unord/unord.map/assert.iterator.dereference.pass.cpp
@@ -0,0 +1,52 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <unordered_map>
+
+// Dereference non-dereferenceable iterator.
+
+// REQUIRES: has-unix-headers, libcpp-hardening-mode={{extensive|debug}}
+// UNSUPPORTED: c++03
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+#include <unordered_map>
+#include <string>
+
+#include "check_assertion.h"
+#include "min_allocator.h"
+
+int main(int, char**) {
+  {
+    typedef std::unordered_map<int, std::string> C;
+    C c;
+    c.insert(std::make_pair(1, "one"));
+    C::iterator i = c.end();
+    TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable unordered container iterator");
+    C::const_iterator i2 = c.cend();
+    TEST_LIBCPP_ASSERT_FAILURE(
+        *i2, "Attempted to dereference a non-dereferenceable unordered container const_iterator");
+  }
+
+  {
+    typedef std::unordered_map<int,
+                               std::string,
+                               std::hash<int>,
+                               std::equal_to<int>,
+                               min_allocator<std::pair<const int, std::string>>>
+        C;
+    C c;
+    c.insert(std::make_pair(1, "one"));
+    C::iterator i = c.end();
+    TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable unordered container iterator");
+    C::const_iterator i2 = c.cend();
+    TEST_LIBCPP_ASSERT_FAILURE(
+        *i2, "Attempted to dereference a non-dereferenceable unordered container const_iterator");
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/containers/unord/unord.map/assert.iterator.increment.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.map/assert.iterator.increment.pass.cpp
new file mode 100644
index 0000000000000..3f4d1c2d3bdbb
--- /dev/null
+++ b/libcxx/test/libcxx/containers/unord/unord.map/assert.iterator.increment.pass.cpp
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <unordered_map>
+
+// Increment iterator past end.
+
+// REQUIRES: has-unix-headers, libcpp-hardening-mode={{extensive|debug}}
+// UNSUPPORTED: c++03
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+#include <unordered_map>
+#include <cassert>
+#include <string>
+
+#include "check_assertion.h"
+#include "min_allocator.h"
+
+int main(int, char**) {
+  {
+    typedef std::unordered_map<int, std::string> C;
+    C c;
+    c.insert(std::make_pair(1, "one"));
+    C::iterator i = c.begin();
+    ++i;
+    assert(i == c.end());
+    TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container iterator");
+    C::const_iterator i2 = c.cbegin();
+    ++i2;
+    assert(i2 == c.cend());
+    TEST_LIBCPP_ASSERT_FAILURE(++i2, "Attempted to increment a non-incrementable unordered container const_iterator");
+  }
+
+  {
+    typedef std::unordered_map<int,
+                               std::string,
+                               std::hash<int>,
+                               std::equal_to<int>,
+                               min_allocator<std::pair<const int, std::string>>>
+        C;
+    C c;
+    c.insert(std::make_pair(1, "one"));
+    C::iterator i = c.begin();
+    ++i;
+    assert(i == c.end());
+    TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container iterator");
+    C::const_iterator i2 = c.cbegin();
+    ++i2;
+    assert(i2 == c.cend());
+    TEST_LIBCPP_ASSERT_FAILURE(++i2, "Attempted to increment a non-incrementable unordered container const_iterator");
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/containers/unord/unord.map/assert.local_iterator.dereference.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.map/assert.local_iterator.dereference.pass.cpp
new file mode 100644
index 0000000000000..8b47f54895560
--- /dev/null
+++ b/libcxx/test/libcxx/containers/unord/unord.map/assert.local_iterator.dereference.pass.cpp
@@ -0,0 +1,50 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <unordered_map>
+
+// Dereference non-dereferenceable iterator.
+
+// REQUIRES: has-unix-headers, libcpp-hardening-mode={{extensive|debug}}
+// UNSUPPORTED: c++03
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+#include <unordered_map>
+#include <string>
+
+#include "check_assertion.h"
+#include "min_allocator.h"
+
+int main(int, char**) {
+  {
+    typedef std::unordered_map<int, std::string> C;
+    C c(1);
+    C::local_iterator i = c.end(0);
+    TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable unordered container local_iterator");
+    C::const_local_iterator i2 = c.cend(0);
+    TEST_LIBCPP_ASSERT_FAILURE(
+        *i2, "Attempted to dereference a non-dereferenceable unordered container const_local_iterator");
+  }
+
+  {
+    typedef std::unordered_map<int,
+                               std::string,
+                               std::hash<int>,
+                               std::equal_to<int>,
+                               min_allocator<std::pair<const int, std::string>>>
+        C;
+    C c(1);
+    C::local_iterator i = c.end(0);
+    TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable unordered container local_iterator");
+    C::const_local_iterator i2 = c.cend(0);
+    TEST_LIBCPP_ASSERT_FAILURE(
+        *i2, "Attempted to dereference a non-dereferenceable unordered container const_local_iterator");
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/containers/unord/unord.map/assert.local_iterator.increment.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.map/assert.local_iterator.increment.pass.cpp
new file mode 100644
index 0000000000000..8f8305833e077
--- /dev/null
+++ b/libcxx/test/libcxx/containers/unord/unord.map/assert.local_iterator.increment.pass.cpp
@@ -0,0 +1,66 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <unordered_map>
+
+// Increment local_iterator past end.
+
+// REQUIRES: has-unix-headers, libcpp-hardening-mode={{extensive|debug}}
+// UNSUPPORTED: c++03
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+#include <unordered_map>
+#include <string>
+#include <cassert>
+
+#include "check_assertion.h"
+#include "min_allocator.h"
+
+int main(int, char**) {
+  {
+    typedef std::unordered_map<int, std::string> C;
+    C c;
+    c.insert(std::make_pair(42, std::string()));
+    C::size_type b      = c.bucket(42);
+    C::local_iterator i = c.begin(b);
+    assert(i != c.end(b));
+    ++i;
+    assert(i == c.end(b));
+    TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container local_iterator");
+    C::const_local_iterator i2 = c.cbegin(b);
+    assert(i2 != c.cend(b));
+    ++i2;
+    assert(i2 == c.cend(b));
+    TEST_LIBCPP_ASSERT_FAILURE(
+        ++i2, "Attempted to increment a non-incrementable unordered container const_local_iterator");
+  }
+
+  {
+    typedef std::unordered_map<int,
+                               std::string,
+                               std::hash<int>,
+                               std::equal_to<int>,
+                               min_allocator<std::pair<const int, std::string>>>
+        C;
+    C c({{42, std::string()}});
+    C::size_type b      = c.bucket(42);
+    C::local_iterator i = c.begin(b);
+    assert(i != c.end(b));
+    ++i;
+    assert(i == c.end(b));
+    TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container local_iterator");
+    C::const_local_iterator i2 = c.cbegin(b);
+    assert(i2 != c.cend(b));
+    ++i2;
+    assert(i2 == c.cend(b));
+    TEST_LIBCPP_ASSERT_FAILURE(
+        ++i2, "Attempted to increment a non-incrementable unordered container const_local_iterator");
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/containers/unord/unord.map/debug.iterator.dereference.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.map/debug.iterator.dereference.pass.cpp
deleted file mode 100644
index 5ea7f4d97fcc1..0000000000000
--- a/libcxx/test/libcxx/containers/unord/unord.map/debug.iterator.dereference.pass.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <unordered_map>
-
-// Dereference non-dereferenceable iterator.
-
-// REQUIRES: has-unix-headers
-// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
-
-#include <unordered_map>
-#include <string>
-
-#include "check_assertion.h"
-#include "min_allocator.h"
-
-int main(int, char**) {
-    {
-        typedef std::unordered_map<int, std::string> C;
-        C c;
-        c.insert(std::make_pair(1, "one"));
-        C::iterator i = c.end();
-        TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable unordered container iterator");
-    }
-
-    {
-        typedef std::unordered_map<int, std::string, std::hash<int>, std::equal_to<int>,
-                                   min_allocator<std::pair<const int, std::string>>> C;
-        C c;
-        c.insert(std::make_pair(1, "one"));
-        C::iterator i = c.end();
-        TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable unordered container iterator");
-    }
-
-    return 0;
-}
diff --git a/libcxx/test/libcxx/containers/unord/unord.map/debug.iterator.increment.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.map/debug.iterator.increment.pass.cpp
deleted file mode 100644
index 2ed09bc81aaa9..0000000000000
--- a/libcxx/test/libcxx/containers/unord/unord.map/debug.iterator.increment.pass.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <unordered_map>
-
-// Increment iterator past end.
-
-// REQUIRES: has-unix-headers
-// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
-
-#include <unordered_map>
-#include <cassert>
-#include <string>
-
-#include "check_assertion.h"
-#include "min_allocator.h"
-
-int main(int, char**) {
-    {
-        typedef std::unordered_map<int, std::string> C;
-        C c;
-        c.insert(std::make_pair(1, "one"));
-        C::iterator i = c.begin();
-        ++i;
-        assert(i == c.end());
-        TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container iterator");
-    }
-
-    {
-        typedef std::unordered_map<int, std::string, std::hash<int>, std::equal_to<int>,
-                                   min_allocator<std::pair<const int, std::string>>> C;
-        C c;
-        c.insert(std::make_pair(1, "one"));
-        C::iterator i = c.begin();
-        ++i;
-        assert(i == c.end());
-        TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container iterator");
-    }
-
-    return 0;
-}
diff --git a/libcxx/test/libcxx/containers/unord/unord.map/debug.local_iterator.dereference.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.map/debug.local_iterator.dereference.pass.cpp
deleted file mode 100644
index 2e4e62dbb41f4..0000000000000
--- a/libcxx/test/libcxx/containers/unord/unord.map/debug.local_iterator.dereference.pass.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <unordered_map>
-
-// Dereference non-dereferenceable iterator.
-
-// REQUIRES: has-unix-headers
-// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
-
-#include <unordered_map>
-#include <string>
-
-#include "check_assertion.h"
-#include "min_allocator.h"
-
-int main(int, char**) {
-    {
-        typedef std::unordered_map<int, std::string> C;
-        C c(1);
-        C::local_iterator i = c.end(0);
-        TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable unordered container local_iterator");
-    }
-
-    {
-        typedef std::unordered_map<int, std::string, std::hash<int>, std::equal_to<int>,
-                                   min_allocator<std::pair<const int, std::string>>> C;
-        C c(1);
-        C::local_iterator i = c.end(0);
-        TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable unordered container local_iterator");
-    }
-
-    return 0;
-}
diff --git a/libcxx/test/libcxx/containers/unord/unord.map/debug.local_iterator.increment.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.map/debug.local_iterator.increment.pass.cpp
deleted file mode 100644
index 28599263447a5..0000000000000
--- a/libcxx/test/libcxx/containers/unord/unord.map/debug.local_iterator.increment.pass.cpp
+++ /dev/null
@@ -1,49 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <unordered_map>
-
-// Increment local_iterator past end.
-
-// REQUIRES: has-unix-headers
-// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
-
-#include <unordered_map>
-#include <string>
-#include <cassert>
-
-#include "check_assertion.h"
-#include "min_allocator.h"
-
-int main(int, char**) {
-    {
-        typedef std::unordered_map<int, std::string> C;
-        C c;
-        c.insert(std::make_pair(42, std::string()));
-        C::size_type b = c.bucket(42);
-        C::local_iterator i = c.begin(b);
-        assert(i != c.end(b));
-        ++i;
-        assert(i == c.end(b));
-        TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container local_iterator");
-    }
-
-    {
-        typedef std::unordered_map<int, std::string, std::hash<int>, std::equal_to<int>,
-                                   min_allocator<std::pair<const int, std::string>>> C;
-        C c({{42, std::string()}});
-        C::size_type b = c.bucket(42);
-        C::local_iterator i = c.begin(b);
-        assert(i != c.end(b));
-        ++i;
-        assert(i == c.end(b));
-        TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container local_iterator");
-    }
-
-    return 0;
-}
diff --git a/libcxx/test/libcxx/containers/unord/unord.multimap/assert.iterator.dereference.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.multimap/assert.iterator.dereference.pass.cpp
new file mode 100644
index 0000000000000..d295a82a8a1f5
--- /dev/null
+++ b/libcxx/test/libcxx/containers/unord/unord.multimap/assert.iterator.dereference.pass.cpp
@@ -0,0 +1,52 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <unordered_map>
+
+// Dereference non-dereferenceable iterator.
+
+// REQUIRES: has-unix-headers, libcpp-hardening-mode={{extensive|debug}}
+// UNSUPPORTED: c++03
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+#include <string>
+#include <unordered_map>
+
+#include "check_assertion.h"
+#include "min_allocator.h"
+
+int main(int, char**) {
+  {
+    typedef std::unordered_multimap<int, std::string> C;
+    C c;
+    c.insert(std::make_pair(1, "one"));
+    C::iterator i = c.end();
+    TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable unordered container iterator");
+    C::const_iterator i2 = c.cend();
+    TEST_LIBCPP_ASSERT_FAILURE(
+        *i2, "Attempted to dereference a non-dereferenceable unordered container const_iterator");
+  }
+
+  {
+    typedef std::unordered_multimap<int,
+                                    std::string,
+                                    std::hash<int>,
+                                    std::equal_to<int>,
+                                    min_allocator<std::pair<const int, std::string>>>
+        C;
+    C c;
+    c.insert(std::make_pair(1, "one"));
+    C::iterator i = c.end();
+    TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable unordered container iterator");
+    C::const_iterator i2 = c.cend();
+    TEST_LIBCPP_ASSERT_FAILURE(
+        *i2, "Attempted to dereference a non-dereferenceable unordered container const_iterator");
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/containers/unord/unord.multimap/assert.iterator.increment.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.multimap/assert.iterator.increment.pass.cpp
new file mode 100644
index 0000000000000..4247edc8def97
--- /dev/null
+++ b/libcxx/test/libcxx/containers/unord/unord.multimap/assert.iterator.increment.pass.cpp
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <unordered_map>
+
+// Increment iterator past end.
+
+// REQUIRES: has-unix-headers, libcpp-hardening-mode={{extensive|debug}}
+// UNSUPPORTED: c++03
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+#include <unordered_map>
+#include <cassert>
+#include <string>
+
+#include "check_assertion.h"
+#include "min_allocator.h"
+
+int main(int, char**) {
+  {
+    typedef std::unordered_multimap<int, std::string> C;
+    C c;
+    c.insert(std::make_pair(1, "one"));
+    C::iterator i = c.begin();
+    ++i;
+    assert(i == c.end());
+    TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container iterator");
+    C::const_iterator i2 = c.cbegin();
+    ++i2;
+    assert(i2 == c.cend());
+    TEST_LIBCPP_ASSERT_FAILURE(++i2, "Attempted to increment a non-incrementable unordered container const_iterator");
+  }
+
+  {
+    typedef std::unordered_multimap<int,
+                                    std::string,
+                                    std::hash<int>,
+                                    std::equal_to<int>,
+                                    min_allocator<std::pair<const int, std::string>>>
+        C;
+    C c;
+    c.insert(std::make_pair(1, "one"));
+    C::iterator i = c.begin();
+    ++i;
+    assert(i == c.end());
+    TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container iterator");
+    C::const_iterator i2 = c.cbegin();
+    ++i2;
+    assert(i2 == c.cend());
+    TEST_LIBCPP_ASSERT_FAILURE(++i2, "Attempted to increment a non-incrementable unordered container const_iterator");
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/containers/unord/unord.multimap/assert.local_iterator.dereference.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.multimap/assert.local_iterator.dereference.pass.cpp
new file mode 100644
index 0000000000000..7ea87964e05f0
--- /dev/null
+++ b/libcxx/test/libcxx/containers/unord/unord.multimap/assert.local_iterator.dereference.pass.cpp
@@ -0,0 +1,50 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <unordered_map>
+
+// Dereference non-dereferenceable iterator.
+
+// REQUIRES: has-unix-headers, libcpp-hardening-mode={{extensive|debug}}
+// UNSUPPORTED: c++03
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+#include <unordered_map>
+#include <string>
+
+#include "check_assertion.h"
+#include "min_allocator.h"
+
+int main(int, char**) {
+  {
+    typedef std::unordered_multimap<int, std::string> C;
+    C c(1);
+    C::local_iterator i = c.end(0);
+    TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable unordered container local_iterator");
+    C::const_local_iterator i2 = c.cend(0);
+    TEST_LIBCPP_ASSERT_FAILURE(
+        *i2, "Attempted to dereference a non-dereferenceable unordered container const_local_iterator");
+  }
+
+  {
+    typedef std::unordered_multimap<int,
+                                    std::string,
+                                    std::hash<int>,
+                                    std::equal_to<int>,
+                                    min_allocator<std::pair<const int, std::string>>>
+        C;
+    C c(1);
+    C::local_iterator i = c.end(0);
+    TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable unordered container local_iterator");
+    C::const_local_iterator i2 = c.cend(0);
+    TEST_LIBCPP_ASSERT_FAILURE(
+        *i2, "Attempted to dereference a non-dereferenceable unordered container const_local_iterator");
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/containers/unord/unord.multimap/assert.local_iterator.increment.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.multimap/assert.local_iterator.increment.pass.cpp
new file mode 100644
index 0000000000000..ffa3fec0ca1f1
--- /dev/null
+++ b/libcxx/test/libcxx/containers/unord/unord.multimap/assert.local_iterator.increment.pass.cpp
@@ -0,0 +1,67 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <unordered_map>
+
+// Increment local_iterator past end.
+
+// REQUIRES: has-unix-headers, libcpp-hardening-mode={{extensive|debug}}
+// UNSUPPORTED: c++03
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+#include <unordered_map>
+#include <cassert>
+#include <string>
+
+#include "check_assertion.h"
+#include "min_allocator.h"
+
+int main(int, char**) {
+  {
+    typedef std::unordered_multimap<int, std::string> C;
+    C c;
+    c.insert(std::make_pair(42, std::string()));
+    C::size_type b      = c.bucket(42);
+    C::local_iterator i = c.begin(b);
+    assert(i != c.end(b));
+    ++i;
+    assert(i == c.end(b));
+    TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container local_iterator");
+    C::const_local_iterator i2 = c.cbegin(b);
+    assert(i2 != c.cend(b));
+    ++i2;
+    assert(i2 == c.cend(b));
+    TEST_LIBCPP_ASSERT_FAILURE(
+        ++i2, "Attempted to increment a non-incrementable unordered container const_local_iterator");
+  }
+
+  {
+    typedef std::unordered_multimap<int,
+                                    std::string,
+                                    std::hash<int>,
+                                    std::equal_to<int>,
+                                    min_allocator<std::pair<const int, std::string>>>
+        C;
+    C c({{1, std::string()}});
+    c.insert(std::make_pair(42, std::string()));
+    C::size_type b      = c.bucket(42);
+    C::local_iterator i = c.begin(b);
+    assert(i != c.end(b));
+    ++i;
+    assert(i == c.end(b));
+    TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container local_iterator");
+    C::const_local_iterator i2 = c.cbegin(b);
+    assert(i2 != c.cend(b));
+    ++i2;
+    assert(i2 == c.cend(b));
+    TEST_LIBCPP_ASSERT_FAILURE(
+        ++i2, "Attempted to increment a non-incrementable unordered container const_local_iterator");
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/containers/unord/unord.multimap/debug.iterator.dereference.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.multimap/debug.iterator.dereference.pass.cpp
deleted file mode 100644
index 3dad48b3925d1..0000000000000
--- a/libcxx/test/libcxx/containers/unord/unord.multimap/debug.iterator.dereference.pass.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <unordered_map>
-
-// Dereference non-dereferenceable iterator.
-
-// REQUIRES: has-unix-headers
-// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
-
-#include <string>
-#include <unordered_map>
-
-#include "check_assertion.h"
-#include "min_allocator.h"
-
-int main(int, char**) {
-    {
-        typedef std::unordered_multimap<int, std::string> C;
-        C c;
-        c.insert(std::make_pair(1, "one"));
-        C::iterator i = c.end();
-        TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable unordered container iterator");
-    }
-
-    {
-        typedef std::unordered_multimap<int, std::string, std::hash<int>, std::equal_to<int>,
-                                        min_allocator<std::pair<const int, std::string>>> C;
-        C c;
-        c.insert(std::make_pair(1, "one"));
-        C::iterator i = c.end();
-        TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable unordered container iterator");
-    }
-
-    return 0;
-}
diff --git a/libcxx/test/libcxx/containers/unord/unord.multimap/debug.iterator.increment.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.multimap/debug.iterator.increment.pass.cpp
deleted file mode 100644
index b02bac6022f7c..0000000000000
--- a/libcxx/test/libcxx/containers/unord/unord.multimap/debug.iterator.increment.pass.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <unordered_map>
-
-// Increment iterator past end.
-
-// REQUIRES: has-unix-headers
-// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
-
-#include <unordered_map>
-#include <cassert>
-#include <string>
-
-#include "check_assertion.h"
-#include "min_allocator.h"
-
-int main(int, char**) {
-    {
-        typedef std::unordered_multimap<int, std::string> C;
-        C c;
-        c.insert(std::make_pair(1, "one"));
-        C::iterator i = c.begin();
-        ++i;
-        assert(i == c.end());
-        TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container iterator");
-    }
-
-    {
-        typedef std::unordered_multimap<int, std::string, std::hash<int>, std::equal_to<int>,
-                            min_allocator<std::pair<const int, std::string>>> C;
-        C c;
-        c.insert(std::make_pair(1, "one"));
-        C::iterator i = c.begin();
-        ++i;
-        assert(i == c.end());
-        TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container iterator");
-    }
-
-    return 0;
-}
diff --git a/libcxx/test/libcxx/containers/unord/unord.multimap/debug.local_iterator.dereference.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.multimap/debug.local_iterator.dereference.pass.cpp
deleted file mode 100644
index 9719ba5889759..0000000000000
--- a/libcxx/test/libcxx/containers/unord/unord.multimap/debug.local_iterator.dereference.pass.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <unordered_map>
-
-// Dereference non-dereferenceable iterator.
-
-// REQUIRES: has-unix-headers
-// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
-
-#include <unordered_map>
-#include <string>
-
-#include "check_assertion.h"
-#include "min_allocator.h"
-
-int main(int, char**) {
-    {
-        typedef std::unordered_multimap<int, std::string> C;
-        C c(1);
-        C::local_iterator i = c.end(0);
-        TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable unordered container local_iterator");
-    }
-
-    {
-        typedef std::unordered_multimap<int, std::string, std::hash<int>, std::equal_to<int>,
-                                        min_allocator<std::pair<const int, std::string>>> C;
-        C c(1);
-        C::local_iterator i = c.end(0);
-        TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable unordered container local_iterator");
-    }
-
-    return 0;
-}
diff --git a/libcxx/test/libcxx/containers/unord/unord.multimap/debug.local_iterator.increment.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.multimap/debug.local_iterator.increment.pass.cpp
deleted file mode 100644
index 2f74a191e8acd..0000000000000
--- a/libcxx/test/libcxx/containers/unord/unord.multimap/debug.local_iterator.increment.pass.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <unordered_map>
-
-// Increment local_iterator past end.
-
-// REQUIRES: has-unix-headers
-// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
-
-#include <unordered_map>
-#include <cassert>
-#include <string>
-
-#include "check_assertion.h"
-#include "min_allocator.h"
-
-int main(int, char**) {
-    {
-        typedef std::unordered_multimap<int, std::string> C;
-        C c;
-        c.insert(std::make_pair(42, std::string()));
-        C::size_type b = c.bucket(42);
-        C::local_iterator i = c.begin(b);
-        assert(i != c.end(b));
-        ++i;
-        assert(i == c.end(b));
-        TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container local_iterator");
-    }
-
-    {
-        typedef std::unordered_multimap<int, std::string, std::hash<int>, std::equal_to<int>,
-                                        min_allocator<std::pair<const int, std::string>>> C;
-        C c({{1, std::string()}});
-        c.insert(std::make_pair(42, std::string()));
-        C::size_type b = c.bucket(42);
-        C::local_iterator i = c.begin(b);
-        assert(i != c.end(b));
-        ++i;
-        assert(i == c.end(b));
-        TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container local_iterator");
-    }
-
-    return 0;
-}
diff --git a/libcxx/test/libcxx/containers/unord/unord.multiset/assert.iterator.dereference.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.multiset/assert.iterator.dereference.pass.cpp
new file mode 100644
index 0000000000000..31edd6099c965
--- /dev/null
+++ b/libcxx/test/libcxx/containers/unord/unord.multiset/assert.iterator.dereference.pass.cpp
@@ -0,0 +1,46 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <unordered_set>
+
+// Dereference non-dereferenceable iterator.
+
+// REQUIRES: has-unix-headers, libcpp-hardening-mode={{extensive|debug}}
+// UNSUPPORTED: c++03
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+#include <unordered_set>
+
+#include "check_assertion.h"
+#include "min_allocator.h"
+
+int main(int, char**) {
+  {
+    typedef int T;
+    typedef std::unordered_multiset<T> C;
+    C c(1);
+    C::iterator i = c.end();
+    TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable unordered container const_iterator");
+    C::const_iterator i2 = c.cend();
+    TEST_LIBCPP_ASSERT_FAILURE(
+        *i2, "Attempted to dereference a non-dereferenceable unordered container const_iterator");
+  }
+
+  {
+    typedef int T;
+    typedef std::unordered_multiset<T, std::hash<T>, std::equal_to<T>, min_allocator<T>> C;
+    C c(1);
+    C::iterator i = c.end();
+    TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable unordered container const_iterator");
+    C::const_iterator i2 = c.cend();
+    TEST_LIBCPP_ASSERT_FAILURE(
+        *i2, "Attempted to dereference a non-dereferenceable unordered container const_iterator");
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/containers/unord/unord.multiset/assert.iterator.increment.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.multiset/assert.iterator.increment.pass.cpp
new file mode 100644
index 0000000000000..0e0e4aab303cd
--- /dev/null
+++ b/libcxx/test/libcxx/containers/unord/unord.multiset/assert.iterator.increment.pass.cpp
@@ -0,0 +1,58 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <unordered_set>
+
+// Increment iterator past end.
+
+// REQUIRES: has-unix-headers, libcpp-hardening-mode={{extensive|debug}}
+// UNSUPPORTED: c++03
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+#include <unordered_set>
+#include <cassert>
+
+#include "check_assertion.h"
+#include "min_allocator.h"
+
+int main(int, char**) {
+  {
+    typedef int T;
+    typedef std::unordered_multiset<T> C;
+    C c;
+    c.insert(42);
+    C::iterator i = c.begin();
+    assert(i != c.end());
+    ++i;
+    assert(i == c.end());
+    TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container const_iterator");
+    C::const_iterator i2 = c.cbegin();
+    assert(i2 != c.cend());
+    ++i2;
+    assert(i2 == c.cend());
+    TEST_LIBCPP_ASSERT_FAILURE(++i2, "Attempted to increment a non-incrementable unordered container const_iterator");
+  }
+
+  {
+    typedef int T;
+    typedef std::unordered_multiset<T, std::hash<T>, std::equal_to<T>, min_allocator<T>> C;
+    C c({42});
+    C::iterator i = c.begin();
+    assert(i != c.end());
+    ++i;
+    assert(i == c.end());
+    TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container const_iterator");
+    C::const_iterator i2 = c.cbegin();
+    assert(i2 != c.cend());
+    ++i2;
+    assert(i2 == c.cend());
+    TEST_LIBCPP_ASSERT_FAILURE(++i2, "Attempted to increment a non-incrementable unordered container const_iterator");
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/containers/unord/unord.multiset/assert.local_iterator.dereference.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.multiset/assert.local_iterator.dereference.pass.cpp
new file mode 100644
index 0000000000000..fe833c40bc351
--- /dev/null
+++ b/libcxx/test/libcxx/containers/unord/unord.multiset/assert.local_iterator.dereference.pass.cpp
@@ -0,0 +1,48 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <unordered_set>
+
+// Dereference non-dereferenceable iterator.
+
+// REQUIRES: has-unix-headers, libcpp-hardening-mode={{extensive|debug}}
+// UNSUPPORTED: c++03
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+#include <unordered_set>
+
+#include "check_assertion.h"
+#include "min_allocator.h"
+
+int main(int, char**) {
+  {
+    typedef int T;
+    typedef std::unordered_multiset<T> C;
+    C c(1);
+    C::local_iterator i = c.end(0);
+    TEST_LIBCPP_ASSERT_FAILURE(
+        *i, "Attempted to dereference a non-dereferenceable unordered container const_local_iterator");
+    C::const_local_iterator i2 = c.cend(0);
+    TEST_LIBCPP_ASSERT_FAILURE(
+        *i2, "Attempted to dereference a non-dereferenceable unordered container const_local_iterator");
+  }
+
+  {
+    typedef int T;
+    typedef std::unordered_multiset<T, std::hash<T>, std::equal_to<T>, min_allocator<T>> C;
+    C c(1);
+    C::local_iterator i = c.end(0);
+    TEST_LIBCPP_ASSERT_FAILURE(
+        *i, "Attempted to dereference a non-dereferenceable unordered container const_local_iterator");
+    C::const_local_iterator i2 = c.cend(0);
+    TEST_LIBCPP_ASSERT_FAILURE(
+        *i2, "Attempted to dereference a non-dereferenceable unordered container const_local_iterator");
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/containers/unord/unord.multiset/assert.local_iterator.increment.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.multiset/assert.local_iterator.increment.pass.cpp
new file mode 100644
index 0000000000000..142c07f83c066
--- /dev/null
+++ b/libcxx/test/libcxx/containers/unord/unord.multiset/assert.local_iterator.increment.pass.cpp
@@ -0,0 +1,64 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <unordered_set>
+
+// Increment local_iterator past end.
+
+// REQUIRES: has-unix-headers, libcpp-hardening-mode={{extensive|debug}}
+// UNSUPPORTED: c++03
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+#include <unordered_set>
+#include <cassert>
+
+#include "check_assertion.h"
+#include "min_allocator.h"
+
+int main(int, char**) {
+  {
+    typedef int T;
+    typedef std::unordered_multiset<T> C;
+    C c;
+    c.insert(42);
+    C::size_type b      = c.bucket(42);
+    C::local_iterator i = c.begin(b);
+    assert(i != c.end(b));
+    ++i;
+    assert(i == c.end(b));
+    TEST_LIBCPP_ASSERT_FAILURE(
+        ++i, "Attempted to increment a non-incrementable unordered container const_local_iterator");
+    C::const_local_iterator i2 = c.cbegin(b);
+    assert(i2 != c.cend(b));
+    ++i2;
+    assert(i2 == c.cend(b));
+    TEST_LIBCPP_ASSERT_FAILURE(
+        ++i2, "Attempted to increment a non-incrementable unordered container const_local_iterator");
+  }
+
+  {
+    typedef int T;
+    typedef std::unordered_multiset<T, std::hash<T>, std::equal_to<T>, min_allocator<T>> C;
+    C c({42});
+    C::size_type b      = c.bucket(42);
+    C::local_iterator i = c.begin(b);
+    assert(i != c.end(b));
+    ++i;
+    assert(i == c.end(b));
+    TEST_LIBCPP_ASSERT_FAILURE(
+        ++i, "Attempted to increment a non-incrementable unordered container const_local_iterator");
+    C::const_local_iterator i2 = c.cbegin(b);
+    assert(i2 != c.cend(b));
+    ++i2;
+    assert(i2 == c.cend(b));
+    TEST_LIBCPP_ASSERT_FAILURE(
+        ++i2, "Attempted to increment a non-incrementable unordered container const_local_iterator");
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/containers/unord/unord.multiset/debug.iterator.dereference.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.multiset/debug.iterator.dereference.pass.cpp
deleted file mode 100644
index 51cb9a6bff643..0000000000000
--- a/libcxx/test/libcxx/containers/unord/unord.multiset/debug.iterator.dereference.pass.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <unordered_set>
-
-// Dereference non-dereferenceable iterator.
-
-// REQUIRES: has-unix-headers
-// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
-
-#include <unordered_set>
-
-#include "check_assertion.h"
-#include "min_allocator.h"
-
-int main(int, char**) {
-    {
-        typedef int T;
-        typedef std::unordered_multiset<T> C;
-        C c(1);
-        C::iterator i = c.end();
-        TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable unordered container const_iterator");
-    }
-
-    {
-        typedef int T;
-        typedef std::unordered_multiset<T, std::hash<T>, std::equal_to<T>, min_allocator<T>> C;
-        C c(1);
-        C::iterator i = c.end();
-        TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable unordered container const_iterator");
-    }
-
-    return 0;
-}
diff --git a/libcxx/test/libcxx/containers/unord/unord.multiset/debug.iterator.increment.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.multiset/debug.iterator.increment.pass.cpp
deleted file mode 100644
index 17b8c77aadd1d..0000000000000
--- a/libcxx/test/libcxx/containers/unord/unord.multiset/debug.iterator.increment.pass.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <unordered_set>
-
-// Increment iterator past end.
-
-// REQUIRES: has-unix-headers
-// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
-
-#include <unordered_set>
-#include <cassert>
-
-#include "check_assertion.h"
-#include "min_allocator.h"
-
-int main(int, char**) {
-    {
-        typedef int T;
-        typedef std::unordered_multiset<T> C;
-        C c;
-        c.insert(42);
-        C::iterator i = c.begin();
-        assert(i != c.end());
-        ++i;
-        assert(i == c.end());
-        TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container const_iterator");
-    }
-
-    {
-        typedef int T;
-        typedef std::unordered_multiset<T, std::hash<T>, std::equal_to<T>, min_allocator<T>> C;
-        C c({42});
-        C::iterator i = c.begin();
-        assert(i != c.end());
-        ++i;
-        assert(i == c.end());
-        TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container const_iterator");
-    }
-
-    return 0;
-}
diff --git a/libcxx/test/libcxx/containers/unord/unord.multiset/debug.local_iterator.dereference.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.multiset/debug.local_iterator.dereference.pass.cpp
deleted file mode 100644
index 24102a47802fd..0000000000000
--- a/libcxx/test/libcxx/containers/unord/unord.multiset/debug.local_iterator.dereference.pass.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <unordered_set>
-
-// Dereference non-dereferenceable iterator.
-
-// REQUIRES: has-unix-headers
-// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
-
-#include <unordered_set>
-
-#include "check_assertion.h"
-#include "min_allocator.h"
-
-int main(int, char**) {
-    {
-        typedef int T;
-        typedef std::unordered_multiset<T> C;
-        C c(1);
-        C::local_iterator i = c.end(0);
-        TEST_LIBCPP_ASSERT_FAILURE(
-            *i, "Attempted to dereference a non-dereferenceable unordered container const_local_iterator");
-    }
-
-    {
-        typedef int T;
-        typedef std::unordered_multiset<T, std::hash<T>, std::equal_to<T>, min_allocator<T>> C;
-        C c(1);
-        C::local_iterator i = c.end(0);
-        TEST_LIBCPP_ASSERT_FAILURE(
-            *i, "Attempted to dereference a non-dereferenceable unordered container const_local_iterator");
-    }
-
-    return 0;
-}
diff --git a/libcxx/test/libcxx/containers/unord/unord.multiset/debug.local_iterator.increment.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.multiset/debug.local_iterator.increment.pass.cpp
deleted file mode 100644
index 3f70ba2971581..0000000000000
--- a/libcxx/test/libcxx/containers/unord/unord.multiset/debug.local_iterator.increment.pass.cpp
+++ /dev/null
@@ -1,51 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <unordered_set>
-
-// Increment local_iterator past end.
-
-// REQUIRES: has-unix-headers
-// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
-
-#include <unordered_set>
-#include <cassert>
-
-#include "check_assertion.h"
-#include "min_allocator.h"
-
-int main(int, char**) {
-    {
-        typedef int T;
-        typedef std::unordered_multiset<T> C;
-        C c;
-        c.insert(42);
-        C::size_type b = c.bucket(42);
-        C::local_iterator i = c.begin(b);
-        assert(i != c.end(b));
-        ++i;
-        assert(i == c.end(b));
-        TEST_LIBCPP_ASSERT_FAILURE(++i,
-                                "Attempted to increment a non-incrementable unordered container const_local_iterator");
-    }
-
-    {
-        typedef int T;
-        typedef std::unordered_multiset<T, std::hash<T>, std::equal_to<T>, min_allocator<T>> C;
-        C c({42});
-        C::size_type b = c.bucket(42);
-        C::local_iterator i = c.begin(b);
-        assert(i != c.end(b));
-        ++i;
-        assert(i == c.end(b));
-        TEST_LIBCPP_ASSERT_FAILURE(++i,
-                                "Attempted to increment a non-incrementable unordered container const_local_iterator");
-    }
-
-    return 0;
-}
diff --git a/libcxx/test/libcxx/containers/unord/unord.set/assert.iterator.dereference.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.set/assert.iterator.dereference.pass.cpp
new file mode 100644
index 0000000000000..8464601f61046
--- /dev/null
+++ b/libcxx/test/libcxx/containers/unord/unord.set/assert.iterator.dereference.pass.cpp
@@ -0,0 +1,46 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <unordered_set>
+
+// Dereference non-dereferenceable iterator.
+
+// REQUIRES: has-unix-headers, libcpp-hardening-mode={{extensive|debug}}
+// UNSUPPORTED: c++03
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+#include <unordered_set>
+
+#include "check_assertion.h"
+#include "min_allocator.h"
+
+int main(int, char**) {
+  {
+    typedef int T;
+    typedef std::unordered_set<T> C;
+    C c(1);
+    C::iterator i = c.end();
+    TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable unordered container const_iterator");
+    C::const_iterator i2 = c.cend();
+    TEST_LIBCPP_ASSERT_FAILURE(
+        *i2, "Attempted to dereference a non-dereferenceable unordered container const_iterator");
+  }
+
+  {
+    typedef int T;
+    typedef std::unordered_set<T, std::hash<T>, std::equal_to<T>, min_allocator<T>> C;
+    C c(1);
+    C::iterator i = c.end();
+    TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable unordered container const_iterator");
+    C::const_iterator i2 = c.cend();
+    TEST_LIBCPP_ASSERT_FAILURE(
+        *i2, "Attempted to dereference a non-dereferenceable unordered container const_iterator");
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/containers/unord/unord.set/assert.iterator.increment.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.set/assert.iterator.increment.pass.cpp
new file mode 100644
index 0000000000000..29446880900bc
--- /dev/null
+++ b/libcxx/test/libcxx/containers/unord/unord.set/assert.iterator.increment.pass.cpp
@@ -0,0 +1,58 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <unordered_set>
+
+// Increment iterator past end.
+
+// REQUIRES: has-unix-headers, libcpp-hardening-mode={{extensive|debug}}
+// UNSUPPORTED: c++03
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+#include <unordered_set>
+#include <cassert>
+
+#include "check_assertion.h"
+#include "min_allocator.h"
+
+int main(int, char**) {
+  {
+    typedef int T;
+    typedef std::unordered_set<T> C;
+    C c;
+    c.insert(42);
+    C::iterator i = c.begin();
+    assert(i != c.end());
+    ++i;
+    assert(i == c.end());
+    TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container const_iterator");
+    C::const_iterator i2 = c.cbegin();
+    assert(i2 != c.cend());
+    ++i2;
+    assert(i2 == c.cend());
+    TEST_LIBCPP_ASSERT_FAILURE(++i2, "Attempted to increment a non-incrementable unordered container const_iterator");
+  }
+
+  {
+    typedef int T;
+    typedef std::unordered_set<T, std::hash<T>, std::equal_to<T>, min_allocator<T>> C;
+    C c({42});
+    C::iterator i = c.begin();
+    assert(i != c.end());
+    ++i;
+    assert(i == c.end());
+    TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container const_iterator");
+    C::const_iterator i2 = c.cbegin();
+    assert(i2 != c.cend());
+    ++i2;
+    assert(i2 == c.cend());
+    TEST_LIBCPP_ASSERT_FAILURE(++i2, "Attempted to increment a non-incrementable unordered container const_iterator");
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/containers/unord/unord.set/assert.local_iterator.dereference.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.set/assert.local_iterator.dereference.pass.cpp
new file mode 100644
index 0000000000000..7163e3735cee0
--- /dev/null
+++ b/libcxx/test/libcxx/containers/unord/unord.set/assert.local_iterator.dereference.pass.cpp
@@ -0,0 +1,48 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <unordered_set>
+
+// Dereference non-dereferenceable iterator.
+
+// REQUIRES: has-unix-headers, libcpp-hardening-mode={{extensive|debug}}
+// UNSUPPORTED: c++03
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+#include <unordered_set>
+
+#include "check_assertion.h"
+#include "min_allocator.h"
+
+int main(int, char**) {
+  {
+    typedef int T;
+    typedef std::unordered_set<T> C;
+    C c(1);
+    C::local_iterator i = c.end(0);
+    TEST_LIBCPP_ASSERT_FAILURE(
+        *i, "Attempted to dereference a non-dereferenceable unordered container const_local_iterator");
+    C::const_local_iterator i2 = c.cend(0);
+    TEST_LIBCPP_ASSERT_FAILURE(
+        *i2, "Attempted to dereference a non-dereferenceable unordered container const_local_iterator");
+  }
+
+  {
+    typedef int T;
+    typedef std::unordered_set<T, std::hash<T>, std::equal_to<T>, min_allocator<T>> C;
+    C c(1);
+    C::local_iterator i = c.end(0);
+    TEST_LIBCPP_ASSERT_FAILURE(
+        *i, "Attempted to dereference a non-dereferenceable unordered container const_local_iterator");
+    C::const_local_iterator i2 = c.cend(0);
+    TEST_LIBCPP_ASSERT_FAILURE(
+        *i2, "Attempted to dereference a non-dereferenceable unordered container const_local_iterator");
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/containers/unord/unord.set/assert.local_iterator.increment.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.set/assert.local_iterator.increment.pass.cpp
new file mode 100644
index 0000000000000..c9fe5afd09702
--- /dev/null
+++ b/libcxx/test/libcxx/containers/unord/unord.set/assert.local_iterator.increment.pass.cpp
@@ -0,0 +1,64 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <unordered_set>
+
+// Increment local_iterator past end.
+
+// REQUIRES: has-unix-headers, libcpp-hardening-mode={{extensive|debug}}
+// UNSUPPORTED: c++03
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+#include <unordered_set>
+#include <cassert>
+
+#include "check_assertion.h"
+#include "min_allocator.h"
+
+int main(int, char**) {
+  {
+    typedef int T;
+    typedef std::unordered_set<T> C;
+    C c;
+    c.insert(42);
+    C::size_type b      = c.bucket(42);
+    C::local_iterator i = c.begin(b);
+    assert(i != c.end(b));
+    ++i;
+    assert(i == c.end(b));
+    TEST_LIBCPP_ASSERT_FAILURE(
+        ++i, "Attempted to increment a non-incrementable unordered container const_local_iterator");
+    C::const_local_iterator i2 = c.cbegin(b);
+    assert(i2 != c.cend(b));
+    ++i2;
+    assert(i2 == c.cend(b));
+    TEST_LIBCPP_ASSERT_FAILURE(
+        ++i2, "Attempted to increment a non-incrementable unordered container const_local_iterator");
+  }
+
+  {
+    typedef int T;
+    typedef std::unordered_set<T, std::hash<T>, std::equal_to<T>, min_allocator<T>> C;
+    C c({42});
+    C::size_type b      = c.bucket(42);
+    C::local_iterator i = c.begin(b);
+    assert(i != c.end(b));
+    ++i;
+    assert(i == c.end(b));
+    TEST_LIBCPP_ASSERT_FAILURE(
+        ++i, "Attempted to increment a non-incrementable unordered container const_local_iterator");
+    C::const_local_iterator i2 = c.cbegin(b);
+    assert(i2 != c.cend(b));
+    ++i2;
+    assert(i2 == c.cend(b));
+    TEST_LIBCPP_ASSERT_FAILURE(
+        ++i2, "Attempted to increment a non-incrementable unordered container const_local_iterator");
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/containers/unord/unord.set/debug.iterator.dereference.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.set/debug.iterator.dereference.pass.cpp
deleted file mode 100644
index 49663b4f824ae..0000000000000
--- a/libcxx/test/libcxx/containers/unord/unord.set/debug.iterator.dereference.pass.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <unordered_set>
-
-// Dereference non-dereferenceable iterator.
-
-// REQUIRES: has-unix-headers
-// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
-
-#include <unordered_set>
-
-#include "check_assertion.h"
-#include "min_allocator.h"
-
-int main(int, char**) {
-    {
-        typedef int T;
-        typedef std::unordered_set<T> C;
-        C c(1);
-        C::iterator i = c.end();
-        TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable unordered container const_iterator");
-    }
-
-    {
-        typedef int T;
-        typedef std::unordered_set<T, std::hash<T>, std::equal_to<T>, min_allocator<T>> C;
-        C c(1);
-        C::iterator i = c.end();
-        TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable unordered container const_iterator");
-    }
-
-    return 0;
-}
diff --git a/libcxx/test/libcxx/containers/unord/unord.set/debug.iterator.increment.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.set/debug.iterator.increment.pass.cpp
deleted file mode 100644
index da3fbdc5a6e8d..0000000000000
--- a/libcxx/test/libcxx/containers/unord/unord.set/debug.iterator.increment.pass.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <unordered_set>
-
-// Increment iterator past end.
-
-// REQUIRES: has-unix-headers
-// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
-
-#include <unordered_set>
-#include <cassert>
-
-#include "check_assertion.h"
-#include "min_allocator.h"
-
-int main(int, char**) {
-    {
-        typedef int T;
-        typedef std::unordered_set<T> C;
-        C c;
-        c.insert(42);
-        C::iterator i = c.begin();
-        assert(i != c.end());
-        ++i;
-        assert(i == c.end());
-        TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container const_iterator");
-    }
-
-    {
-        typedef int T;
-        typedef std::unordered_set<T, std::hash<T>, std::equal_to<T>, min_allocator<T>> C;
-        C c({42});
-        C::iterator i = c.begin();
-        assert(i != c.end());
-        ++i;
-        assert(i == c.end());
-        TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container const_iterator");
-    }
-
-    return 0;
-}
diff --git a/libcxx/test/libcxx/containers/unord/unord.set/debug.local_iterator.dereference.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.set/debug.local_iterator.dereference.pass.cpp
deleted file mode 100644
index 912edc2e4bf47..0000000000000
--- a/libcxx/test/libcxx/containers/unord/unord.set/debug.local_iterator.dereference.pass.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <unordered_set>
-
-// Dereference non-dereferenceable iterator.
-
-// REQUIRES: has-unix-headers
-// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
-
-#include <unordered_set>
-
-#include "check_assertion.h"
-#include "min_allocator.h"
-
-int main(int, char**) {
-    {
-        typedef int T;
-        typedef std::unordered_set<T> C;
-        C c(1);
-        C::local_iterator i = c.end(0);
-        TEST_LIBCPP_ASSERT_FAILURE(
-            *i, "Attempted to dereference a non-dereferenceable unordered container const_local_iterator");
-    }
-
-    {
-        typedef int T;
-        typedef std::unordered_set<T, std::hash<T>, std::equal_to<T>, min_allocator<T>> C;
-        C c(1);
-        C::local_iterator i = c.end(0);
-        TEST_LIBCPP_ASSERT_FAILURE(
-            *i, "Attempted to dereference a non-dereferenceable unordered container const_local_iterator");
-    }
-
-    return 0;
-}
diff --git a/libcxx/test/libcxx/containers/unord/unord.set/debug.local_iterator.increment.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.set/debug.local_iterator.increment.pass.cpp
deleted file mode 100644
index 42a62aed472ca..0000000000000
--- a/libcxx/test/libcxx/containers/unord/unord.set/debug.local_iterator.increment.pass.cpp
+++ /dev/null
@@ -1,49 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <unordered_set>
-
-// Increment local_iterator past end.
-
-// REQUIRES: has-unix-headers
-// UNSUPPORTED: !libcpp-has-legacy-debug-mode, c++03
-
-#include <unordered_set>
-#include <cassert>
-
-#include "check_assertion.h"
-#include "min_allocator.h"
-
-int main(int, char**) {
-    {
-        typedef int T;
-        typedef std::unordered_set<T> C;
-        C c;
-        c.insert(42);
-        C::size_type b = c.bucket(42);
-        C::local_iterator i = c.begin(b);
-        assert(i != c.end(b));
-        ++i;
-        assert(i == c.end(b));
-        TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container const_local_iterator");
-    }
-
-    {
-        typedef int T;
-        typedef std::unordered_set<T, std::hash<T>, std::equal_to<T>, min_allocator<T>> C;
-        C c({42});
-        C::size_type b = c.bucket(42);
-        C::local_iterator i = c.begin(b);
-        assert(i != c.end(b));
-        ++i;
-        assert(i == c.end(b));
-        TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-incrementable unordered container const_local_iterator");
-    }
-
-    return 0;
-}
diff --git a/libcxx/test/libcxx/containers/views/views.span/assert.iterator-indexing.pass.cpp b/libcxx/test/libcxx/containers/views/views.span/assert.iterator-indexing.pass.cpp
new file mode 100644
index 0000000000000..d4dacb1f2f1c7
--- /dev/null
+++ b/libcxx/test/libcxx/containers/views/views.span/assert.iterator-indexing.pass.cpp
@@ -0,0 +1,174 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+// Make sure that std::span's iterators check for OOB accesses when the debug mode is enabled.
+
+// REQUIRES: has-unix-headers, libcpp-has-abi-bounded-iterators
+// UNSUPPORTED: libcpp-hardening-mode=none
+
+#include <span>
+
+#include "check_assertion.h"
+
+struct Foo {
+  int x;
+};
+
+template <typename Iter>
+void test_iterator(Iter begin, Iter end, bool reverse) {
+  std::ptrdiff_t distance = std::distance(begin, end);
+
+  // Dereferencing an iterator at the end.
+  {
+    TEST_LIBCPP_ASSERT_FAILURE(
+        *end,
+        reverse ? "__bounded_iter::operator--: Attempt to rewind an iterator past the start"
+                : "__bounded_iter::operator*: Attempt to dereference an iterator at the end");
+#if _LIBCPP_STD_VER >= 20
+    // In C++20 mode, std::reverse_iterator implements operator->, but not operator*, with
+    // std::prev instead of operator--. std::prev ultimately calls operator+
+    TEST_LIBCPP_ASSERT_FAILURE(
+        end->x,
+        reverse ? "__bounded_iter::operator+=: Attempt to rewind an iterator past the start"
+                : "__bounded_iter::operator->: Attempt to dereference an iterator at the end");
+#else
+    TEST_LIBCPP_ASSERT_FAILURE(
+        end->x,
+        reverse ? "__bounded_iter::operator--: Attempt to rewind an iterator past the start"
+                : "__bounded_iter::operator->: Attempt to dereference an iterator at the end");
+#endif
+  }
+
+  // Incrementing an iterator past the end.
+  {
+    [[maybe_unused]] const char* msg =
+        reverse ? "__bounded_iter::operator--: Attempt to rewind an iterator past the start"
+                : "__bounded_iter::operator++: Attempt to advance an iterator past the end";
+    auto it = end;
+    TEST_LIBCPP_ASSERT_FAILURE(it++, msg);
+    TEST_LIBCPP_ASSERT_FAILURE(++it, msg);
+  }
+
+  // Decrementing an iterator past the start.
+  {
+    [[maybe_unused]] const char* msg =
+        reverse ? "__bounded_iter::operator++: Attempt to advance an iterator past the end"
+                : "__bounded_iter::operator--: Attempt to rewind an iterator past the start";
+    auto it = begin;
+    TEST_LIBCPP_ASSERT_FAILURE(it--, msg);
+    TEST_LIBCPP_ASSERT_FAILURE(--it, msg);
+  }
+
+  // Advancing past the end with operator+= and operator+.
+  {
+    [[maybe_unused]] const char* msg =
+        reverse ? "__bounded_iter::operator-=: Attempt to rewind an iterator past the start"
+                : "__bounded_iter::operator+=: Attempt to advance an iterator past the end";
+    auto it = end;
+    TEST_LIBCPP_ASSERT_FAILURE(it += 1, msg);
+    TEST_LIBCPP_ASSERT_FAILURE(end + 1, msg);
+    it = begin;
+    TEST_LIBCPP_ASSERT_FAILURE(it += (distance + 1), msg);
+    TEST_LIBCPP_ASSERT_FAILURE(begin + (distance + 1), msg);
+  }
+
+  // Advancing past the end with operator-= and operator-.
+  {
+    [[maybe_unused]] const char* msg =
+        reverse ? "__bounded_iter::operator+=: Attempt to rewind an iterator past the start"
+                : "__bounded_iter::operator-=: Attempt to advance an iterator past the end";
+    auto it = end;
+    TEST_LIBCPP_ASSERT_FAILURE(it -= (-1), msg);
+    TEST_LIBCPP_ASSERT_FAILURE(end - (-1), msg);
+    it = begin;
+    TEST_LIBCPP_ASSERT_FAILURE(it -= (-distance - 1), msg);
+    TEST_LIBCPP_ASSERT_FAILURE(begin - (-distance - 1), msg);
+  }
+
+  // Rewinding past the start with operator+= and operator+.
+  {
+    [[maybe_unused]] const char* msg =
+        reverse ? "__bounded_iter::operator-=: Attempt to advance an iterator past the end"
+                : "__bounded_iter::operator+=: Attempt to rewind an iterator past the start";
+    auto it = begin;
+    TEST_LIBCPP_ASSERT_FAILURE(it += (-1), msg);
+    TEST_LIBCPP_ASSERT_FAILURE(begin + (-1), msg);
+    it = end;
+    TEST_LIBCPP_ASSERT_FAILURE(it += (-distance - 1), msg);
+    TEST_LIBCPP_ASSERT_FAILURE(end + (-distance - 1), msg);
+  }
+
+  // Rewinding past the start with operator-= and operator-.
+  {
+    [[maybe_unused]] const char* msg =
+        reverse ? "__bounded_iter::operator+=: Attempt to advance an iterator past the end"
+                : "__bounded_iter::operator-=: Attempt to rewind an iterator past the start";
+    auto it = begin;
+    TEST_LIBCPP_ASSERT_FAILURE(it -= 1, msg);
+    TEST_LIBCPP_ASSERT_FAILURE(begin - 1, msg);
+    it = end;
+    TEST_LIBCPP_ASSERT_FAILURE(it -= (distance + 1), msg);
+    TEST_LIBCPP_ASSERT_FAILURE(end - (distance + 1), msg);
+  }
+
+  // Out-of-bounds operator[].
+  {
+    [[maybe_unused]] const char* end_msg =
+        reverse ? "__bounded_iter::operator--: Attempt to rewind an iterator past the start"
+                : "__bounded_iter::operator[]: Attempt to index an iterator at or past the end";
+    [[maybe_unused]] const char* past_end_msg =
+        reverse ? "__bounded_iter::operator-=: Attempt to rewind an iterator past the start"
+                : "__bounded_iter::operator[]: Attempt to index an iterator at or past the end";
+    [[maybe_unused]] const char* past_start_msg =
+        reverse ? "__bounded_iter::operator-=: Attempt to advance an iterator past the end"
+                : "__bounded_iter::operator[]: Attempt to index an iterator past the start";
+    TEST_LIBCPP_ASSERT_FAILURE(begin[distance], end_msg);
+    TEST_LIBCPP_ASSERT_FAILURE(begin[distance + 1], past_end_msg);
+    TEST_LIBCPP_ASSERT_FAILURE(begin[-1], past_start_msg);
+    TEST_LIBCPP_ASSERT_FAILURE(begin[-99], past_start_msg);
+
+    auto it = begin + 1;
+    TEST_LIBCPP_ASSERT_FAILURE(it[distance - 1], end_msg);
+    TEST_LIBCPP_ASSERT_FAILURE(it[distance], past_end_msg);
+    TEST_LIBCPP_ASSERT_FAILURE(it[-2], past_start_msg);
+    TEST_LIBCPP_ASSERT_FAILURE(it[-99], past_start_msg);
+  }
+}
+
+int main(int, char**) {
+  // span<T>::iterator
+  {
+    Foo array[] = {{0}, {1}, {2}};
+    std::span<Foo> const span(array, 3);
+    test_iterator(span.begin(), span.end(), /*reverse=*/false);
+  }
+
+  // span<T, N>::iterator
+  {
+    Foo array[] = {{0}, {1}, {2}};
+    std::span<Foo, 3> const span(array, 3);
+    test_iterator(span.begin(), span.end(), /*reverse=*/false);
+  }
+
+  // span<T>::reverse_iterator
+  {
+    Foo array[] = {{0}, {1}, {2}};
+    std::span<Foo> const span(array, 3);
+    test_iterator(span.rbegin(), span.rend(), /*reverse=*/true);
+  }
+
+  // span<T, N>::reverse_iterator
+  {
+    Foo array[] = {{0}, {1}, {2}};
+    std::span<Foo, 3> const span(array, 3);
+    test_iterator(span.rbegin(), span.rend(), /*reverse=*/true);
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/containers/views/views.span/debug.iterator-indexing.pass.cpp b/libcxx/test/libcxx/containers/views/views.span/debug.iterator-indexing.pass.cpp
deleted file mode 100644
index 360e7a981a0df..0000000000000
--- a/libcxx/test/libcxx/containers/views/views.span/debug.iterator-indexing.pass.cpp
+++ /dev/null
@@ -1,97 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// UNSUPPORTED: c++03, c++11, c++14, c++17
-
-// Make sure that std::span's iterators check for OOB accesses when the debug mode is enabled.
-
-// REQUIRES: has-unix-headers, libcpp-has-abi-bounded-iterators
-// UNSUPPORTED: libcpp-hardening-mode=none
-
-#include <span>
-
-#include "check_assertion.h"
-
-struct Foo {
-    int x;
-};
-
-int main(int, char**) {
-    // span<T>::iterator
-    {
-        Foo array[] = {{0}, {1}, {2}};
-        std::span<Foo> const span(array, 3);
-        {
-            auto it = span.end();
-            TEST_LIBCPP_ASSERT_FAILURE(*it, "__bounded_iter::operator*: Attempt to dereference an out-of-range iterator");
-        }
-        {
-            auto it = span.end();
-            TEST_LIBCPP_ASSERT_FAILURE(it->x, "__bounded_iter::operator->: Attempt to dereference an out-of-range iterator");
-        }
-        {
-            auto it = span.begin();
-            TEST_LIBCPP_ASSERT_FAILURE(it[3], "__bounded_iter::operator[]: Attempt to index an iterator out-of-range");
-        }
-    }
-
-    // span<T, N>::iterator
-    {
-        Foo array[] = {{0}, {1}, {2}};
-        std::span<Foo, 3> const span(array, 3);
-        {
-            auto it = span.end();
-            TEST_LIBCPP_ASSERT_FAILURE(*it, "__bounded_iter::operator*: Attempt to dereference an out-of-range iterator");
-        }
-        {
-            auto it = span.end();
-            TEST_LIBCPP_ASSERT_FAILURE(it->x, "__bounded_iter::operator->: Attempt to dereference an out-of-range iterator");
-        }
-        {
-            auto it = span.begin();
-            TEST_LIBCPP_ASSERT_FAILURE(it[3], "__bounded_iter::operator[]: Attempt to index an iterator out-of-range");
-        }
-    }
-
-    // span<T>::reverse_iterator
-    {
-        Foo array[] = {{0}, {1}, {2}};
-        std::span<Foo> const span(array, 3);
-        {
-            auto it = span.rend();
-            TEST_LIBCPP_ASSERT_FAILURE(*it, "__bounded_iter::operator*: Attempt to dereference an out-of-range iterator");
-        }
-        {
-            auto it = span.rend();
-            TEST_LIBCPP_ASSERT_FAILURE(it->x, "__bounded_iter::operator->: Attempt to dereference an out-of-range iterator");
-        }
-        {
-            auto it = span.rbegin();
-            TEST_LIBCPP_ASSERT_FAILURE(it[3], "__bounded_iter::operator*: Attempt to dereference an out-of-range iterator");
-        }
-    }
-
-    // span<T, N>::reverse_iterator
-    {
-        Foo array[] = {{0}, {1}, {2}};
-        std::span<Foo, 3> const span(array, 3);
-        {
-            auto it = span.rend();
-            TEST_LIBCPP_ASSERT_FAILURE(*it, "__bounded_iter::operator*: Attempt to dereference an out-of-range iterator");
-        }
-        {
-            auto it = span.rend();
-            TEST_LIBCPP_ASSERT_FAILURE(it->x, "__bounded_iter::operator->: Attempt to dereference an out-of-range iterator");
-        }
-        {
-            auto it = span.rbegin();
-            TEST_LIBCPP_ASSERT_FAILURE(it[3], "__bounded_iter::operator*: Attempt to dereference an out-of-range iterator");
-        }
-    }
-
-    return 0;
-}
diff --git a/libcxx/test/libcxx/iterators/bounded_iter/dereference.pass.cpp b/libcxx/test/libcxx/iterators/bounded_iter/dereference.pass.cpp
index 8eee4ad2f319a..bf723f14e80a9 100644
--- a/libcxx/test/libcxx/iterators/bounded_iter/dereference.pass.cpp
+++ b/libcxx/test/libcxx/iterators/bounded_iter/dereference.pass.cpp
@@ -58,15 +58,15 @@ void test_death() {
   std::__bounded_iter<Iter> const oob  = std::__make_bounded_iter(Iter(e), Iter(b), Iter(e));
 
   // operator*
-  TEST_LIBCPP_ASSERT_FAILURE(*oob, "__bounded_iter::operator*: Attempt to dereference an out-of-range iterator");
+  TEST_LIBCPP_ASSERT_FAILURE(*oob, "__bounded_iter::operator*: Attempt to dereference an iterator at the end");
   // operator->
-  TEST_LIBCPP_ASSERT_FAILURE(oob->x, "__bounded_iter::operator->: Attempt to dereference an out-of-range iterator");
+  TEST_LIBCPP_ASSERT_FAILURE(oob->x, "__bounded_iter::operator->: Attempt to dereference an iterator at the end");
   // operator[]
-  TEST_LIBCPP_ASSERT_FAILURE(iter[-1], "__bounded_iter::operator[]: Attempt to index an iterator out-of-range");
-  TEST_LIBCPP_ASSERT_FAILURE(iter[5], "__bounded_iter::operator[]: Attempt to index an iterator out-of-range");
-  TEST_LIBCPP_ASSERT_FAILURE(oob[0], "__bounded_iter::operator[]: Attempt to index an iterator out-of-range");
-  TEST_LIBCPP_ASSERT_FAILURE(oob[1], "__bounded_iter::operator[]: Attempt to index an iterator out-of-range");
-  TEST_LIBCPP_ASSERT_FAILURE(oob[-6], "__bounded_iter::operator[]: Attempt to index an iterator out-of-range");
+  TEST_LIBCPP_ASSERT_FAILURE(iter[-1], "__bounded_iter::operator[]: Attempt to index an iterator past the start");
+  TEST_LIBCPP_ASSERT_FAILURE(iter[5], "__bounded_iter::operator[]: Attempt to index an iterator at or past the end");
+  TEST_LIBCPP_ASSERT_FAILURE(oob[0], "__bounded_iter::operator[]: Attempt to index an iterator at or past the end");
+  TEST_LIBCPP_ASSERT_FAILURE(oob[1], "__bounded_iter::operator[]: Attempt to index an iterator at or past the end");
+  TEST_LIBCPP_ASSERT_FAILURE(oob[-6], "__bounded_iter::operator[]: Attempt to index an iterator past the start");
 }
 
 int main(int, char**) {
diff --git a/libcxx/test/libcxx/strings/string.view/string.view.iterators/assert.iterator-indexing.pass.cpp b/libcxx/test/libcxx/strings/string.view/string.view.iterators/assert.iterator-indexing.pass.cpp
new file mode 100644
index 0000000000000..5043a88cbc3da
--- /dev/null
+++ b/libcxx/test/libcxx/strings/string.view/string.view.iterators/assert.iterator-indexing.pass.cpp
@@ -0,0 +1,158 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Make sure that std::string_view's iterators check for OOB accesses when the debug mode is enabled.
+
+// REQUIRES: has-unix-headers, libcpp-has-abi-bounded-iterators
+// UNSUPPORTED: libcpp-hardening-mode=none
+
+#include <iterator>
+#include <string_view>
+
+#include "check_assertion.h"
+
+template <typename Iter>
+void test_iterator(Iter begin, Iter end, bool reverse) {
+  ptrdiff_t distance = std::distance(begin, end);
+
+  // Dereferencing an iterator at the end.
+  {
+    TEST_LIBCPP_ASSERT_FAILURE(
+        *end,
+        reverse ? "__bounded_iter::operator--: Attempt to rewind an iterator past the start"
+                : "__bounded_iter::operator*: Attempt to dereference an iterator at the end");
+#if _LIBCPP_STD_VER >= 20
+    // In C++20 mode, std::reverse_iterator implements operator->, but not operator*, with
+    // std::prev instead of operator--. std::prev ultimately calls operator+
+    TEST_LIBCPP_ASSERT_FAILURE(
+        end.operator->(),
+        reverse ? "__bounded_iter::operator+=: Attempt to rewind an iterator past the start"
+                : "__bounded_iter::operator->: Attempt to dereference an iterator at the end");
+#else
+    TEST_LIBCPP_ASSERT_FAILURE(
+        end.operator->(),
+        reverse ? "__bounded_iter::operator--: Attempt to rewind an iterator past the start"
+                : "__bounded_iter::operator->: Attempt to dereference an iterator at the end");
+#endif
+  }
+
+  // Incrementing an iterator past the end.
+  {
+    [[maybe_unused]] const char* msg =
+        reverse ? "__bounded_iter::operator--: Attempt to rewind an iterator past the start"
+                : "__bounded_iter::operator++: Attempt to advance an iterator past the end";
+    auto it = end;
+    TEST_LIBCPP_ASSERT_FAILURE(it++, msg);
+    it = end;
+    TEST_LIBCPP_ASSERT_FAILURE(++it, msg);
+  }
+
+  // Decrementing an iterator past the start.
+  {
+    [[maybe_unused]] const char* msg =
+        reverse ? "__bounded_iter::operator++: Attempt to advance an iterator past the end"
+                : "__bounded_iter::operator--: Attempt to rewind an iterator past the start";
+    auto it = begin;
+    TEST_LIBCPP_ASSERT_FAILURE(it--, msg);
+    it = begin;
+    TEST_LIBCPP_ASSERT_FAILURE(--it, msg);
+  }
+
+  // Advancing past the end with operator+= and operator+.
+  {
+    [[maybe_unused]] const char* msg =
+        reverse ? "__bounded_iter::operator-=: Attempt to rewind an iterator past the start"
+                : "__bounded_iter::operator+=: Attempt to advance an iterator past the end";
+    auto it = end;
+    TEST_LIBCPP_ASSERT_FAILURE(it += 1, msg);
+    TEST_LIBCPP_ASSERT_FAILURE(end + 1, msg);
+    it = begin;
+    TEST_LIBCPP_ASSERT_FAILURE(it += (distance + 1), msg);
+    TEST_LIBCPP_ASSERT_FAILURE(begin + (distance + 1), msg);
+  }
+
+  // Advancing past the end with operator-= and operator-.
+  {
+    [[maybe_unused]] const char* msg =
+        reverse ? "__bounded_iter::operator+=: Attempt to rewind an iterator past the start"
+                : "__bounded_iter::operator-=: Attempt to advance an iterator past the end";
+    auto it = end;
+    TEST_LIBCPP_ASSERT_FAILURE(it -= (-1), msg);
+    TEST_LIBCPP_ASSERT_FAILURE(end - (-1), msg);
+    it = begin;
+    TEST_LIBCPP_ASSERT_FAILURE(it -= (-distance - 1), msg);
+    TEST_LIBCPP_ASSERT_FAILURE(begin - (-distance - 1), msg);
+  }
+
+  // Rewinding past the start with operator+= and operator+.
+  {
+    [[maybe_unused]] const char* msg =
+        reverse ? "__bounded_iter::operator-=: Attempt to advance an iterator past the end"
+                : "__bounded_iter::operator+=: Attempt to rewind an iterator past the start";
+    auto it = begin;
+    TEST_LIBCPP_ASSERT_FAILURE(it += (-1), msg);
+    TEST_LIBCPP_ASSERT_FAILURE(begin + (-1), msg);
+    it = end;
+    TEST_LIBCPP_ASSERT_FAILURE(it += (-distance - 1), msg);
+    TEST_LIBCPP_ASSERT_FAILURE(end + (-distance - 1), msg);
+  }
+
+  // Rewinding past the start with operator-= and operator-.
+  {
+    [[maybe_unused]] const char* msg =
+        reverse ? "__bounded_iter::operator+=: Attempt to advance an iterator past the end"
+                : "__bounded_iter::operator-=: Attempt to rewind an iterator past the start";
+    auto it = begin;
+    TEST_LIBCPP_ASSERT_FAILURE(it -= 1, msg);
+    TEST_LIBCPP_ASSERT_FAILURE(begin - 1, msg);
+    it = end;
+    TEST_LIBCPP_ASSERT_FAILURE(it -= (distance + 1), msg);
+    TEST_LIBCPP_ASSERT_FAILURE(end - (distance + 1), msg);
+  }
+
+  // Out-of-bounds operator[].
+  {
+    [[maybe_unused]] const char* end_msg =
+        reverse ? "__bounded_iter::operator--: Attempt to rewind an iterator past the start"
+                : "__bounded_iter::operator[]: Attempt to index an iterator at or past the end";
+    [[maybe_unused]] const char* past_end_msg =
+        reverse ? "__bounded_iter::operator-=: Attempt to rewind an iterator past the start"
+                : "__bounded_iter::operator[]: Attempt to index an iterator at or past the end";
+    [[maybe_unused]] const char* past_start_msg =
+        reverse ? "__bounded_iter::operator-=: Attempt to advance an iterator past the end"
+                : "__bounded_iter::operator[]: Attempt to index an iterator past the start";
+    TEST_LIBCPP_ASSERT_FAILURE(begin[distance], end_msg);
+    TEST_LIBCPP_ASSERT_FAILURE(begin[distance + 1], past_end_msg);
+    TEST_LIBCPP_ASSERT_FAILURE(begin[-1], past_start_msg);
+    TEST_LIBCPP_ASSERT_FAILURE(begin[-99], past_start_msg);
+
+    auto it = begin + 1;
+    TEST_LIBCPP_ASSERT_FAILURE(it[distance - 1], end_msg);
+    TEST_LIBCPP_ASSERT_FAILURE(it[distance], past_end_msg);
+    TEST_LIBCPP_ASSERT_FAILURE(it[-2], past_start_msg);
+    TEST_LIBCPP_ASSERT_FAILURE(it[-99], past_start_msg);
+  }
+}
+
+int main(int, char**) {
+  std::string_view const str("hello world");
+
+  // string_view::iterator
+  test_iterator(str.begin(), str.end(), /*reverse=*/false);
+
+  // string_view::const_iterator
+  test_iterator(str.cbegin(), str.cend(), /*reverse=*/false);
+
+  // string_view::reverse_iterator
+  test_iterator(str.rbegin(), str.rend(), /*reverse=*/true);
+
+  // string_view::const_reverse_iterator
+  test_iterator(str.crbegin(), str.crend(), /*reverse=*/true);
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/strings/string.view/string.view.iterators/debug.iterator-indexing.pass.cpp b/libcxx/test/libcxx/strings/string.view/string.view.iterators/debug.iterator-indexing.pass.cpp
deleted file mode 100644
index 5064319a0aee1..0000000000000
--- a/libcxx/test/libcxx/strings/string.view/string.view.iterators/debug.iterator-indexing.pass.cpp
+++ /dev/null
@@ -1,92 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// Make sure that std::string_view's iterators check for OOB accesses when the debug mode is enabled.
-
-// REQUIRES: has-unix-headers, libcpp-has-abi-bounded-iterators
-// UNSUPPORTED: libcpp-hardening-mode=none
-
-#include <string_view>
-
-#include "check_assertion.h"
-
-int main(int, char**) {
-  // string_view::iterator
-  {
-    std::string_view const str("hello world");
-    {
-      auto it = str.end();
-      TEST_LIBCPP_ASSERT_FAILURE(*it, "__bounded_iter::operator*: Attempt to dereference an out-of-range iterator");
-    }
-    {
-      auto it = str.end();
-      TEST_LIBCPP_ASSERT_FAILURE(
-          it.operator->(), "__bounded_iter::operator->: Attempt to dereference an out-of-range iterator");
-    }
-    {
-      auto it = str.begin();
-      TEST_LIBCPP_ASSERT_FAILURE(it[99], "__bounded_iter::operator[]: Attempt to index an iterator out-of-range");
-    }
-  }
-
-  // string_view::const_iterator
-  {
-    std::string_view const str("hello world");
-    {
-      auto it = str.cend();
-      TEST_LIBCPP_ASSERT_FAILURE(*it, "__bounded_iter::operator*: Attempt to dereference an out-of-range iterator");
-    }
-    {
-      auto it = str.cend();
-      TEST_LIBCPP_ASSERT_FAILURE(
-          it.operator->(), "__bounded_iter::operator->: Attempt to dereference an out-of-range iterator");
-    }
-    {
-      auto it = str.cbegin();
-      TEST_LIBCPP_ASSERT_FAILURE(it[99], "__bounded_iter::operator[]: Attempt to index an iterator out-of-range");
-    }
-  }
-
-  // string_view::reverse_iterator
-  {
-    std::string_view const str("hello world");
-    {
-      auto it = str.rend();
-      TEST_LIBCPP_ASSERT_FAILURE(*it, "__bounded_iter::operator*: Attempt to dereference an out-of-range iterator");
-    }
-    {
-      auto it = str.rend();
-      TEST_LIBCPP_ASSERT_FAILURE(
-          it.operator->(), "__bounded_iter::operator->: Attempt to dereference an out-of-range iterator");
-    }
-    {
-      auto it = str.rbegin();
-      TEST_LIBCPP_ASSERT_FAILURE(it[99], "__bounded_iter::operator*: Attempt to dereference an out-of-range iterator");
-    }
-  }
-
-  // string_view::const_reverse_iterator
-  {
-    std::string_view const str("hello world");
-    {
-      auto it = str.crend();
-      TEST_LIBCPP_ASSERT_FAILURE(*it, "__bounded_iter::operator*: Attempt to dereference an out-of-range iterator");
-    }
-    {
-      auto it = str.crend();
-      TEST_LIBCPP_ASSERT_FAILURE(
-          it.operator->(), "__bounded_iter::operator->: Attempt to dereference an out-of-range iterator");
-    }
-    {
-      auto it = str.crbegin();
-      TEST_LIBCPP_ASSERT_FAILURE(it[99], "__bounded_iter::operator*: Attempt to dereference an out-of-range iterator");
-    }
-  }
-
-  return 0;
-}
diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_compare_exchange_strong.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_compare_exchange_strong.pass.cpp
index 1f0f61ed3e6ea..73c74fc6589f9 100644
--- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_compare_exchange_strong.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_compare_exchange_strong.pass.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-// XFAIL: !has-128-bit-atomics
+// XFAIL: !has-1024-bit-atomics
 
 // <atomic>
 
diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_compare_exchange_strong_explicit.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_compare_exchange_strong_explicit.pass.cpp
index 0b6fcacb3d66d..8d7803a15cd55 100644
--- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_compare_exchange_strong_explicit.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_compare_exchange_strong_explicit.pass.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-// XFAIL: !has-128-bit-atomics
+// XFAIL: !has-1024-bit-atomics
 
 // <atomic>
 
diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_compare_exchange_weak.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_compare_exchange_weak.pass.cpp
index 5de2f519ea435..6c1aa06bd2619 100644
--- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_compare_exchange_weak.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_compare_exchange_weak.pass.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-// XFAIL: !has-128-bit-atomics
+// XFAIL: !has-1024-bit-atomics
 
 // <atomic>
 
diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_compare_exchange_weak_explicit.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_compare_exchange_weak_explicit.pass.cpp
index fc0ad8a10acd1..b00940684907a 100644
--- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_compare_exchange_weak_explicit.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_compare_exchange_weak_explicit.pass.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-// XFAIL: !has-128-bit-atomics
+// XFAIL: !has-1024-bit-atomics
 
 // <atomic>
 
diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_exchange.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_exchange.pass.cpp
index 31cd316e023a3..6ebe64087fa0a 100644
--- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_exchange.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_exchange.pass.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-// XFAIL: !has-128-bit-atomics
+// XFAIL: !has-1024-bit-atomics
 
 // <atomic>
 
diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_exchange_explicit.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_exchange_explicit.pass.cpp
index 834a811c64342..3a505c8aa1569 100644
--- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_exchange_explicit.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_exchange_explicit.pass.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-// XFAIL: !has-128-bit-atomics
+// XFAIL: !has-1024-bit-atomics
 
 // <atomic>
 
diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_init.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_init.pass.cpp
index 4eced1d2b7f37..88775cf32d56f 100644
--- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_init.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_init.pass.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-// XFAIL: !has-128-bit-atomics
+// XFAIL: !has-1024-bit-atomics
 // ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
 
 // <atomic>
diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_is_lock_free.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_is_lock_free.pass.cpp
index 1a3b8393d8f9f..11a6b002a7865 100644
--- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_is_lock_free.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_is_lock_free.pass.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-// XFAIL: !has-128-bit-atomics
+// XFAIL: !has-1024-bit-atomics
 
 // <atomic>
 
diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_load.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_load.pass.cpp
index 5bb2bb2b614f9..36f1ee3ba370f 100644
--- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_load.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_load.pass.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-// XFAIL: !has-128-bit-atomics
+// XFAIL: !has-1024-bit-atomics
 
 // <atomic>
 
diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_load_explicit.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_load_explicit.pass.cpp
index ecb27a261eb65..476e268c971c1 100644
--- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_load_explicit.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_load_explicit.pass.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-// XFAIL: !has-128-bit-atomics
+// XFAIL: !has-1024-bit-atomics
 
 // <atomic>
 
diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_store.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_store.pass.cpp
index 25a845e9e1f8f..94c570fb5d962 100644
--- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_store.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_store.pass.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-// XFAIL: !has-128-bit-atomics
+// XFAIL: !has-1024-bit-atomics
 
 // <atomic>
 
diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_store_explicit.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_store_explicit.pass.cpp
index d22657237327f..6d1acebe615f5 100644
--- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_store_explicit.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.req/atomic_store_explicit.pass.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-// XFAIL: !has-128-bit-atomics
+// XFAIL: !has-1024-bit-atomics
 
 // <atomic>
 
diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_notify_all.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_notify_all.pass.cpp
index 93ed607d413b2..2b9f34b731f87 100644
--- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_notify_all.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_notify_all.pass.cpp
@@ -8,7 +8,7 @@
 //
 // UNSUPPORTED: no-threads
 // XFAIL: c++03
-// XFAIL: !has-128-bit-atomics
+// XFAIL: !has-1024-bit-atomics
 
 // XFAIL: availability-synchronization_library-missing
 
diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_notify_one.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_notify_one.pass.cpp
index ad48ef1441f47..dfa781c566009 100644
--- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_notify_one.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_notify_one.pass.cpp
@@ -8,7 +8,7 @@
 //
 // UNSUPPORTED: no-threads
 // XFAIL: c++03
-// XFAIL: !has-128-bit-atomics
+// XFAIL: !has-1024-bit-atomics
 
 // XFAIL: availability-synchronization_library-missing
 
diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait.pass.cpp
index 449e50fa12b5f..38142b336e72c 100644
--- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait.pass.cpp
@@ -8,7 +8,7 @@
 //
 // UNSUPPORTED: no-threads
 // XFAIL: c++03
-// XFAIL: !has-128-bit-atomics
+// XFAIL: !has-1024-bit-atomics
 
 // XFAIL: availability-synchronization_library-missing
 
diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait_explicit.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait_explicit.pass.cpp
index a6ee4fc632797..2db95a0b67a7f 100644
--- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait_explicit.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait_explicit.pass.cpp
@@ -8,7 +8,7 @@
 //
 // UNSUPPORTED: no-threads
 // XFAIL: c++03
-// XFAIL: !has-128-bit-atomics
+// XFAIL: !has-1024-bit-atomics
 
 // XFAIL: availability-synchronization_library-missing
 
diff --git a/libcxx/test/std/experimental/simd/simd.class/simd_ctor_conversion.pass.cpp b/libcxx/test/std/experimental/simd/simd.class/simd_ctor_conversion.pass.cpp
index 5920d62e0e5a6..7ce4bed9c7db8 100644
--- a/libcxx/test/std/experimental/simd/simd.class/simd_ctor_conversion.pass.cpp
+++ b/libcxx/test/std/experimental/simd/simd.class/simd_ctor_conversion.pass.cpp
@@ -9,10 +9,6 @@
 // UNSUPPORTED: c++03, c++11, c++14
 // XFAIL: target=powerpc{{.*}}le-unknown-linux-gnu
 
-// TODO: This test makes incorrect assumptions about floating point conversions.
-//       See https://github.com/llvm/llvm-project/issues/74327.
-// XFAIL: optimization=speed
-
 // <experimental/simd>
 //
 // [simd.class]
diff --git a/libcxx/utils/libcxx/test/features.py b/libcxx/utils/libcxx/test/features.py
index 3f0dc0c50a0d0..4fd8798b794a1 100644
--- a/libcxx/utils/libcxx/test/features.py
+++ b/libcxx/utils/libcxx/test/features.py
@@ -171,12 +171,12 @@ def _getAndroidDeviceApi(cfg):
         ),
     ),
     Feature(
-        name="has-128-bit-atomics",
+        name="has-1024-bit-atomics",
         when=lambda cfg: sourceBuilds(
             cfg,
             """
             #include <atomic>
-            struct Large { char storage[128/8]; };
+            struct Large { int storage[1024/8]; };
             std::atomic<Large> x;
             int main(int, char**) { (void)x.load(); (void)x.is_lock_free(); return 0; }
           """,
diff --git a/lld/COFF/DLL.cpp b/lld/COFF/DLL.cpp
index d0b74ac445499..5f00eaded76d3 100644
--- a/lld/COFF/DLL.cpp
+++ b/lld/COFF/DLL.cpp
@@ -172,7 +172,7 @@ binImports(COFFLinkerContext &ctx,
 // A chunk for the delay import descriptor table etnry.
 class DelayDirectoryChunk : public NonSectionChunk {
 public:
-  explicit DelayDirectoryChunk(Chunk *n) : dllName(n) {}
+  explicit DelayDirectoryChunk(Chunk *n) : dllName(n) { setAlignment(4); }
 
   size_t getSize() const override {
     return sizeof(delay_import_directory_table_entry);
diff --git a/lld/ELF/Arch/RISCV.cpp b/lld/ELF/Arch/RISCV.cpp
index 4798c86f7d1b6..20de1b9b7bde9 100644
--- a/lld/ELF/Arch/RISCV.cpp
+++ b/lld/ELF/Arch/RISCV.cpp
@@ -1074,12 +1074,12 @@ static void mergeArch(RISCVISAInfo::OrderedExtensionMap &mergedExts,
     mergedXlen = info.getXLen();
   } else {
     for (const auto &ext : info.getExtensions()) {
-      if (auto it = mergedExts.find(ext.first); it != mergedExts.end()) {
-        if (std::tie(it->second.Major, it->second.Minor) >=
+      auto p = mergedExts.insert(ext);
+      if (!p.second) {
+        if (std::tie(p.first->second.Major, p.first->second.Minor) <
             std::tie(ext.second.Major, ext.second.Minor))
-          continue;
+          p.first->second = ext.second;
       }
-      mergedExts[ext.first] = ext.second;
     }
   }
 }
diff --git a/lld/test/COFF/delayimports-armnt.yaml b/lld/test/COFF/delayimports-armnt.yaml
index 7d9bc38c5c360..ea96d864ef53d 100644
--- a/lld/test/COFF/delayimports-armnt.yaml
+++ b/lld/test/COFF/delayimports-armnt.yaml
@@ -6,6 +6,7 @@
 # RUN: llvm-readobj --coff-imports %t.exe | FileCheck -check-prefix=IMPORT %s
 # RUN: llvm-readobj --coff-basereloc %t.exe | FileCheck -check-prefix=BASEREL %s
 # RUN: llvm-objdump --no-print-imm-hex -d %t.exe | FileCheck --check-prefix=DISASM %s
+# RUN: llvm-readobj --file-headers %t.exe | FileCheck -check-prefix=DIR %s
 
 # IMPORT:      Format: COFF-ARM
 # IMPORT-NEXT: Arch: thumb
@@ -13,9 +14,9 @@
 # IMPORT-NEXT: DelayImport {
 # IMPORT-NEXT:   Name: library.dll
 # IMPORT-NEXT:   Attributes: 0x1
-# IMPORT-NEXT:   ModuleHandle: 0x3000
-# IMPORT-NEXT:   ImportAddressTable: 0x3008
-# IMPORT-NEXT:   ImportNameTable: 0x2040
+# IMPORT-NEXT:   ModuleHandle: 0x3008
+# IMPORT-NEXT:   ImportAddressTable: 0x3010
+# IMPORT-NEXT:   ImportNameTable: 0x2044
 # IMPORT-NEXT:   BoundDelayImportTable: 0x0
 # IMPORT-NEXT:   UnloadDelayImportTable: 0x0
 # IMPORT-NEXT:   Import {
@@ -43,7 +44,7 @@
 # BASEREL-NEXT:   }
 # BASEREL-NEXT:   Entry {
 # BASEREL-NEXT:     Type: HIGHLOW
-# BASEREL-NEXT:     Address: 0x3008
+# BASEREL-NEXT:     Address: 0x3010
 # BASEREL-NEXT:   }
 # BASEREL-NEXT:   Entry {
 # BASEREL-NEXT:     Type: ABSOLUTE
@@ -52,20 +53,24 @@
 # BASEREL-NEXT: ]
 #
 # DISASM:    00401000 <.text>:
-# DISASM:      40100c:       f243 0c08       movw r12, #12296
+# DISASM:      40100c:       f243 0c10       movw r12, #12304
 # DISASM-NEXT:               f2c0 0c40       movt    r12, #64
 # DISASM-NEXT:               f000 b800       b.w     {{.+}} @ imm = #0
 # DISASM-NEXT:               e92d 480f       push.w  {r0, r1, r2, r3, r11, lr}
 # DISASM-NEXT:               f20d 0b10       addw    r11, sp, #16
 # DISASM-NEXT:               ed2d 0b10       vpush   {d0, d1, d2, d3, d4, d5, d6, d7}
 # DISASM-NEXT:               4661            mov     r1, r12
-# DISASM-NEXT:               f242 0000       movw r0, #8192
+# DISASM-NEXT:               f242 0004       movw r0, #8196
 # DISASM-NEXT:               f2c0 0040       movt    r0, #64
 # DISASM-NEXT:               f7ff ffe7       bl      0x401000 <.text>
 # DISASM-NEXT:               4684            mov     r12, r0
 # DISASM-NEXT:               ecbd 0b10       vpop    {d0, d1, d2, d3, d4, d5, d6, d7}
 # DISASM-NEXT:               e8bd 480f       pop.w   {r0, r1, r2, r3, r11, lr}
 # DISASM-NEXT:               4760            bx      r12
+#
+# DIR:         DelayImportDescriptorRVA: 0x2004
+# DIR-NEXT:    DelayImportDescriptorSize: 0x40
+
 
 --- !COFF
 header:
@@ -80,6 +85,14 @@ sections:
       - VirtualAddress:  0
         SymbolName:      __imp_function
         Type:            IMAGE_REL_ARM_MOV32T
+  - Name:            .rdata
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ ]
+    Alignment:       1
+    SectionData:     01
+  - Name:            .data
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ]
+    Alignment:       1
+    SectionData:     02
 symbols:
   - Name:            .text
     Value:           0
diff --git a/lldb/docs/resources/build.rst b/lldb/docs/resources/build.rst
index 995273a97b653..09d3d15a94083 100644
--- a/lldb/docs/resources/build.rst
+++ b/lldb/docs/resources/build.rst
@@ -331,7 +331,7 @@ macOS
 ^^^^^
 
 On macOS the LLDB test suite requires libc++. Either add
-``LLVM_ENABLE_RUNTIMES="libcxx;libcxxabi"`` or disable the test suite with
+``LLVM_ENABLE_RUNTIMES="libcxx;libcxxabi;libunwind"`` or disable the test suite with
 ``LLDB_INCLUDE_TESTS=OFF``. Further useful options:
 
 * ``LLDB_BUILD_FRAMEWORK:BOOL``: Builds the LLDB.framework.
@@ -370,7 +370,7 @@ LLVM <https://llvm.org/docs/BuildingADistribution.html>`_):
   $ cmake -B /path/to/lldb-build -G Ninja \
           -C /path/to/llvm-project/lldb/cmake/caches/Apple-lldb-macOS.cmake \
           -DLLVM_ENABLE_PROJECTS="clang;lldb" \
-          -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi" \
+          -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi;libunwind" \
           llvm-project/llvm
 
   $ DESTDIR=/path/to/lldb-install ninja -C /path/to/lldb-build check-lldb install-distribution
@@ -386,7 +386,7 @@ Build LLDB standalone for development with Xcode:
   $ cmake -B /path/to/llvm-build -G Ninja \
           -C /path/to/llvm-project/lldb/cmake/caches/Apple-lldb-base.cmake \
           -DLLVM_ENABLE_PROJECTS="clang" \
-          -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi" \
+          -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi;libunwind" \
           llvm-project/llvm
   $ ninja -C /path/to/llvm-build
 
diff --git a/lldb/include/lldb/DataFormatters/ValueObjectPrinter.h b/lldb/include/lldb/DataFormatters/ValueObjectPrinter.h
index fe46321c3186c..32b101a2f9843 100644
--- a/lldb/include/lldb/DataFormatters/ValueObjectPrinter.h
+++ b/lldb/include/lldb/DataFormatters/ValueObjectPrinter.h
@@ -127,7 +127,7 @@ class ValueObjectPrinter {
   void PrintChild(lldb::ValueObjectSP child_sp,
                   const DumpValueObjectOptions::PointerDepth &curr_ptr_depth);
 
-  uint32_t GetMaxNumChildrenToPrint(bool &print_dotdotdot);
+  llvm::Expected<uint32_t> GetMaxNumChildrenToPrint(bool &print_dotdotdot);
 
   void
   PrintChildren(bool value_printed, bool summary_printed,
diff --git a/lldb/source/Core/DumpDataExtractor.cpp b/lldb/source/Core/DumpDataExtractor.cpp
index 986c9a181919e..826edd7bab046 100644
--- a/lldb/source/Core/DumpDataExtractor.cpp
+++ b/lldb/source/Core/DumpDataExtractor.cpp
@@ -150,8 +150,8 @@ static lldb::offset_t DumpInstructions(const DataExtractor &DE, Stream *s,
       if (bytes_consumed) {
         offset += bytes_consumed;
         const bool show_address = base_addr != LLDB_INVALID_ADDRESS;
-        const bool show_bytes = true;
-        const bool show_control_flow_kind = true;
+        const bool show_bytes = false;
+        const bool show_control_flow_kind = false;
         ExecutionContext exe_ctx;
         exe_scope->CalculateExecutionContext(exe_ctx);
         disassembler_sp->GetInstructionList().Dump(
diff --git a/lldb/source/Core/ValueObject.cpp b/lldb/source/Core/ValueObject.cpp
index d813044d02ff5..f39bd07a25536 100644
--- a/lldb/source/Core/ValueObject.cpp
+++ b/lldb/source/Core/ValueObject.cpp
@@ -2744,8 +2744,19 @@ ValueObjectSP ValueObject::DoCast(const CompilerType &compiler_type) {
 
 ValueObjectSP ValueObject::Cast(const CompilerType &compiler_type) {
   // Only allow casts if the original type is equal or larger than the cast
-  // type.  We don't know how to fetch more data for all the ConstResult types,
-  // so we can't guarantee this will work:
+  // type, unless we know this is a load address.  Getting the size wrong for
+  // a host side storage could leak lldb memory, so we absolutely want to 
+  // prevent that.  We may not always get the right value, for instance if we
+  // have an expression result value that's copied into a storage location in
+  // the target may not have copied enough memory.  I'm not trying to fix that
+  // here, I'm just making Cast from a smaller to a larger possible in all the
+  // cases where that doesn't risk making a Value out of random lldb memory.
+  // You have to check the ValueObject's Value for the address types, since
+  // ValueObjects that use live addresses will tell you they fetch data from the
+  // live address, but once they are made, they actually don't.
+  // FIXME: Can we make ValueObject's with a live address fetch "more data" from
+  // the live address if it is still valid?
+
   Status error;
   CompilerType my_type = GetCompilerType();
 
@@ -2753,9 +2764,10 @@ ValueObjectSP ValueObject::Cast(const CompilerType &compiler_type) {
       = ExecutionContext(GetExecutionContextRef())
           .GetBestExecutionContextScope();
   if (compiler_type.GetByteSize(exe_scope)
-      <= GetCompilerType().GetByteSize(exe_scope)) {
+      <= GetCompilerType().GetByteSize(exe_scope) 
+      || m_value.GetValueType() == Value::ValueType::LoadAddress)
         return DoCast(compiler_type);
-  }
+
   error.SetErrorString("Can only cast to a type that is equal to or smaller "
                        "than the orignal type.");
 
diff --git a/lldb/source/Core/ValueObjectVariable.cpp b/lldb/source/Core/ValueObjectVariable.cpp
index fb29c22c0ab5a..67d71c90a959d 100644
--- a/lldb/source/Core/ValueObjectVariable.cpp
+++ b/lldb/source/Core/ValueObjectVariable.cpp
@@ -99,7 +99,8 @@ ValueObjectVariable::CalculateNumChildren(uint32_t max) {
   CompilerType type(GetCompilerType());
 
   if (!type.IsValid())
-    return 0;
+    return llvm::make_error<llvm::StringError>("invalid type",
+                                               llvm::inconvertibleErrorCode());
 
   ExecutionContext exe_ctx(GetExecutionContextRef());
   const bool omit_empty_base_classes = true;
diff --git a/lldb/source/DataFormatters/ValueObjectPrinter.cpp b/lldb/source/DataFormatters/ValueObjectPrinter.cpp
index b853199e878c9..bbdc2a9981570 100644
--- a/lldb/source/DataFormatters/ValueObjectPrinter.cpp
+++ b/lldb/source/DataFormatters/ValueObjectPrinter.cpp
@@ -621,13 +621,17 @@ void ValueObjectPrinter::PrintChild(
   }
 }
 
-uint32_t ValueObjectPrinter::GetMaxNumChildrenToPrint(bool &print_dotdotdot) {
+llvm::Expected<uint32_t>
+ValueObjectPrinter::GetMaxNumChildrenToPrint(bool &print_dotdotdot) {
   ValueObject &synth_valobj = GetValueObjectForChildrenGeneration();
 
   if (m_options.m_pointer_as_array)
     return m_options.m_pointer_as_array.m_element_count;
 
-  uint32_t num_children = synth_valobj.GetNumChildrenIgnoringErrors();
+  auto num_children_or_err = synth_valobj.GetNumChildren();
+  if (!num_children_or_err)
+    return num_children_or_err;
+  uint32_t num_children = *num_children_or_err;
   print_dotdotdot = false;
   if (num_children) {
     const size_t max_num_children = GetMostSpecializedValue()
@@ -704,7 +708,12 @@ void ValueObjectPrinter::PrintChildren(
   ValueObject &synth_valobj = GetValueObjectForChildrenGeneration();
 
   bool print_dotdotdot = false;
-  size_t num_children = GetMaxNumChildrenToPrint(print_dotdotdot);
+  auto num_children_or_err = GetMaxNumChildrenToPrint(print_dotdotdot);
+  if (!num_children_or_err) {
+    *m_stream << " <" << llvm::toString(num_children_or_err.takeError()) << '>';
+    return;
+  }
+  uint32_t num_children = *num_children_or_err;
   if (num_children) {
     bool any_children_printed = false;
 
@@ -753,7 +762,12 @@ bool ValueObjectPrinter::PrintChildrenOneLiner(bool hide_names) {
   ValueObject &synth_valobj = GetValueObjectForChildrenGeneration();
 
   bool print_dotdotdot = false;
-  size_t num_children = GetMaxNumChildrenToPrint(print_dotdotdot);
+  auto num_children_or_err = GetMaxNumChildrenToPrint(print_dotdotdot);
+  if (!num_children_or_err) {
+    *m_stream << '<' << llvm::toString(num_children_or_err.takeError()) << '>';
+    return true;
+  }
+  uint32_t num_children = *num_children_or_err;
 
   if (num_children) {
     m_stream->PutChar('(');
diff --git a/lldb/source/Expression/IRExecutionUnit.cpp b/lldb/source/Expression/IRExecutionUnit.cpp
index 0682746e448e3..e4e131d70d431 100644
--- a/lldb/source/Expression/IRExecutionUnit.cpp
+++ b/lldb/source/Expression/IRExecutionUnit.cpp
@@ -201,7 +201,7 @@ Status IRExecutionUnit::DisassembleFunction(Stream &stream,
                                       UINT32_MAX, false, false);
 
   InstructionList &instruction_list = disassembler_sp->GetInstructionList();
-  instruction_list.Dump(&stream, true, true, /*show_control_flow_kind=*/true,
+  instruction_list.Dump(&stream, true, true, /*show_control_flow_kind=*/false,
                         &exe_ctx);
 
   return ret;
diff --git a/lldb/source/Host/common/Editline.cpp b/lldb/source/Host/common/Editline.cpp
index e66271e8a6ee9..ed61aecc23b9b 100644
--- a/lldb/source/Host/common/Editline.cpp
+++ b/lldb/source/Host/common/Editline.cpp
@@ -1597,6 +1597,7 @@ bool Editline::GetLines(int first_line_number, StringList &lines,
 void Editline::PrintAsync(Stream *stream, const char *s, size_t len) {
   std::lock_guard<std::recursive_mutex> guard(m_output_mutex);
   if (m_editor_status == EditorStatus::Editing) {
+    SaveEditedLine();
     MoveCursor(CursorLocation::EditingCursor, CursorLocation::BlockStart);
     fprintf(m_output_file, ANSI_CLEAR_BELOW);
   }
diff --git a/lldb/source/Plugins/InstrumentationRuntime/ASanLibsanitizers/InstrumentationRuntimeASanLibsanitizers.cpp b/lldb/source/Plugins/InstrumentationRuntime/ASanLibsanitizers/InstrumentationRuntimeASanLibsanitizers.cpp
index d84cd36d7ce17..cd91f4a6ff1bc 100644
--- a/lldb/source/Plugins/InstrumentationRuntime/ASanLibsanitizers/InstrumentationRuntimeASanLibsanitizers.cpp
+++ b/lldb/source/Plugins/InstrumentationRuntime/ASanLibsanitizers/InstrumentationRuntimeASanLibsanitizers.cpp
@@ -90,9 +90,16 @@ void InstrumentationRuntimeASanLibsanitizers::Activate() {
   if (!process_sp)
     return;
 
+  lldb::ModuleSP module_sp = GetRuntimeModuleSP();
+
   Breakpoint *breakpoint = ReportRetriever::SetupBreakpoint(
-      GetRuntimeModuleSP(), process_sp,
-      ConstString("_Z22raise_sanitizers_error23sanitizer_error_context"));
+      module_sp, process_sp, ConstString("sanitizers_address_on_report"));
+
+  if (!breakpoint) {
+    breakpoint = ReportRetriever::SetupBreakpoint(
+        module_sp, process_sp,
+        ConstString("_Z22raise_sanitizers_error23sanitizer_error_context"));
+  }
 
   if (!breakpoint)
     return;
diff --git a/lldb/source/Plugins/InstrumentationRuntime/Utility/ReportRetriever.cpp b/lldb/source/Plugins/InstrumentationRuntime/Utility/ReportRetriever.cpp
index ff58c4cababae..298b63bc716fc 100644
--- a/lldb/source/Plugins/InstrumentationRuntime/Utility/ReportRetriever.cpp
+++ b/lldb/source/Plugins/InstrumentationRuntime/Utility/ReportRetriever.cpp
@@ -219,6 +219,7 @@ bool ReportRetriever::NotifyBreakpointHit(ProcessSP process_sp,
   return true; // Return true to stop the target
 }
 
+// FIXME: Setup the breakpoint using a less fragile SPI. rdar://124399066
 Breakpoint *ReportRetriever::SetupBreakpoint(ModuleSP module_sp,
                                              ProcessSP process_sp,
                                              ConstString symbol_name) {
diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
index c02b08cb47828..68d9165b90a47 100644
--- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
+++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
@@ -5268,7 +5268,8 @@ TypeSystemClang::GetNumChildren(lldb::opaque_compiler_type_t type,
                                 bool omit_empty_base_classes,
                                 const ExecutionContext *exe_ctx) {
   if (!type)
-    return 0;
+    return llvm::make_error<llvm::StringError>("invalid clang type",
+                                               llvm::inconvertibleErrorCode());
 
   uint32_t num_children = 0;
   clang::QualType qual_type(RemoveWrappingTypes(GetQualType(type)));
@@ -5325,9 +5326,11 @@ TypeSystemClang::GetNumChildren(lldb::opaque_compiler_type_t type,
       }
       num_children += std::distance(record_decl->field_begin(),
                                record_decl->field_end());
-    }
+    } else
+      return llvm::make_error<llvm::StringError>(
+          "incomplete type \"" + GetDisplayTypeName(type).GetString() + "\"",
+          llvm::inconvertibleErrorCode());
     break;
-
   case clang::Type::ObjCObject:
   case clang::Type::ObjCInterface:
     if (GetCompleteQualType(&getASTContext(), qual_type)) {
diff --git a/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp b/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp
index 7ff5cd2c23b07..c4a171ec7d01b 100644
--- a/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp
+++ b/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp
@@ -83,7 +83,7 @@ bool UnwindAssemblyInstEmulation::GetNonCallSiteUnwindPlanFromAssembly(
       const uint32_t addr_byte_size = m_arch.GetAddressByteSize();
       const bool show_address = true;
       const bool show_bytes = true;
-      const bool show_control_flow_kind = true;
+      const bool show_control_flow_kind = false;
       m_cfa_reg_info = *m_inst_emulator_up->GetRegisterInfo(
           unwind_plan.GetRegisterKind(), unwind_plan.GetInitialCFARegister());
       m_fp_is_cfa = false;
diff --git a/lldb/source/Symbol/CompilerType.cpp b/lldb/source/Symbol/CompilerType.cpp
index 85dd2d841a5a0..8e4c3c761f784 100644
--- a/lldb/source/Symbol/CompilerType.cpp
+++ b/lldb/source/Symbol/CompilerType.cpp
@@ -777,7 +777,8 @@ CompilerType::GetNumChildren(bool omit_empty_base_classes,
     if (auto type_system_sp = GetTypeSystem())
       return type_system_sp->GetNumChildren(m_type, omit_empty_base_classes,
                                        exe_ctx);
-  return 0;
+  return llvm::make_error<llvm::StringError>("invalid type",
+                                             llvm::inconvertibleErrorCode());
 }
 
 lldb::BasicType CompilerType::GetBasicTypeEnumeration() const {
diff --git a/lldb/source/Symbol/Symtab.cpp b/lldb/source/Symbol/Symtab.cpp
index c63bbe94fece0..5b5bf5c3f6f8c 100644
--- a/lldb/source/Symbol/Symtab.cpp
+++ b/lldb/source/Symbol/Symtab.cpp
@@ -125,7 +125,7 @@ void Symtab::Dump(Stream *s, Target *target, SortOrder sort_order,
 
       std::multimap<llvm::StringRef, const Symbol *> name_map;
       for (const Symbol &symbol : m_symbols)
-        name_map.emplace(llvm::StringRef(symbol.GetName()), &symbol);
+        name_map.emplace(symbol.GetName().GetStringRef(), &symbol);
 
       for (const auto &name_to_symbol : name_map) {
         const Symbol *symbol = name_to_symbol.second;
diff --git a/lldb/test/API/functionalities/data-formatter/builtin-formats/TestBuiltinFormats.py b/lldb/test/API/functionalities/data-formatter/builtin-formats/TestBuiltinFormats.py
index 8c3bdabeaac1b..4d6f44db0195b 100644
--- a/lldb/test/API/functionalities/data-formatter/builtin-formats/TestBuiltinFormats.py
+++ b/lldb/test/API/functionalities/data-formatter/builtin-formats/TestBuiltinFormats.py
@@ -308,5 +308,5 @@ def test_pointer(self):
     @no_debug_info_test
     def test_instruction(self):
         self.assertIn(
-            "  addq   0xa(%rdi), %r8\n", self.getFormatted("instruction", "0x0a47034c")
+            "= addq   0xa(%rdi), %r8\n", self.getFormatted("instruction", "0x0a47034c")
         )
diff --git a/lldb/test/API/functionalities/valobj_errors/Makefile b/lldb/test/API/functionalities/valobj_errors/Makefile
new file mode 100644
index 0000000000000..d2c966a71411b
--- /dev/null
+++ b/lldb/test/API/functionalities/valobj_errors/Makefile
@@ -0,0 +1,9 @@
+C_SOURCES := main.c
+LD_EXTRAS = hidden.o
+
+a.out: hidden.o
+
+hidden.o: hidden.c
+	$(CC) -g0 -c -o $@ $<
+
+include Makefile.rules
diff --git a/lldb/test/API/functionalities/valobj_errors/TestValueObjectErrors.py b/lldb/test/API/functionalities/valobj_errors/TestValueObjectErrors.py
new file mode 100644
index 0000000000000..8a114005c493b
--- /dev/null
+++ b/lldb/test/API/functionalities/valobj_errors/TestValueObjectErrors.py
@@ -0,0 +1,14 @@
+import lldb
+from lldbsuite.test.decorators import *
+from lldbsuite.test.lldbtest import *
+from lldbsuite.test import lldbutil
+
+
+class ValueObjectErrorsTestCase(TestBase):
+    def test(self):
+        """Test that the error message for a missing type
+        is visible when printing an object"""
+        self.build()
+        lldbutil.run_to_source_breakpoint(self, "break here",
+                                          lldb.SBFileSpec('main.c'))
+        self.expect('v -ptr-depth 1 x', substrs=['<incomplete type "Opaque">'])
diff --git a/lldb/test/API/functionalities/valobj_errors/hidden.c b/lldb/test/API/functionalities/valobj_errors/hidden.c
new file mode 100644
index 0000000000000..d3b93ce1ab9cf
--- /dev/null
+++ b/lldb/test/API/functionalities/valobj_errors/hidden.c
@@ -0,0 +1,4 @@
+struct Opaque {
+  int i, j, k;
+} *global;
+struct Opaque *getOpaque() { return &global; }
diff --git a/lldb/test/API/functionalities/valobj_errors/main.c b/lldb/test/API/functionalities/valobj_errors/main.c
new file mode 100644
index 0000000000000..fabdca9d3a2ec
--- /dev/null
+++ b/lldb/test/API/functionalities/valobj_errors/main.c
@@ -0,0 +1,9 @@
+struct Opaque;
+struct Opaque *getOpaque();
+void puts(const char *);
+
+int main() {
+  struct Opaque *x = getOpaque();
+  puts("break here\n");
+  return (int)x;
+}
diff --git a/lldb/test/API/python_api/value/TestValueAPI.py b/lldb/test/API/python_api/value/TestValueAPI.py
index 18376f76e3c85..512100912d6fe 100644
--- a/lldb/test/API/python_api/value/TestValueAPI.py
+++ b/lldb/test/API/python_api/value/TestValueAPI.py
@@ -148,14 +148,66 @@ def test(self):
 
         # Test some other cases of the Cast API.  We allow casts from one struct type
         # to another, which is a little weird, but we don't support casting from a
-        # smaller type to a larger as we often wouldn't know how to get the extra data:
-        val_f = target.EvaluateExpression("f")
-        bad_cast = val_s.Cast(val_f.GetType())
-        self.assertFailure(
-            bad_cast.GetError(),
-            "Can only cast to a type that is equal to or smaller than the orignal type.",
-        )
-        weird_cast = val_f.Cast(val_s.GetType())
+        # smaller type to a larger when the underlying data is not in the inferior,
+        # since then we have no way to fetch the out-of-bounds values.
+        # For an expression that references a variable, or a FindVariable result,
+        # or an SBValue made from an address and a type, we can get back to the target,
+        # so those will work.  Make sure they do and get the right extra values as well.
+
+        # We're casting everything to the type of "f", so get that first:
+        f_var = frame0.FindVariable("f")
+        self.assertSuccess(f_var.error, "Got f")
+        bigger_type = f_var.GetType()
+
+        # First try a value that we got from FindVariable
+        container = frame0.FindVariable("my_container")
+        self.assertSuccess(container.error, "Found my_container")
+        fv_small = container.GetValueForExpressionPath(".data.small")
+        self.assertSuccess(fv_small.error, "Found small in my_container")
+        fv_cast = fv_small.Cast(bigger_type)
+        self.assertSuccess(fv_cast.error, "Can cast up from FindVariable")
+        child_checks = [
+            ValueCheck(name="a", value="33", type="int"),
+            ValueCheck(name="b", value="44", type="int"),
+            ValueCheck(name="c", value="55", type="int"),
+        ]
+        cast_check = ValueCheck(type=bigger_type.name, children=child_checks)
+
+        # Now try one we made with expr.  This one should fail, because expr
+        # stores the "canonical value" in host memory, and doesn't know how
+        # to augment that from the live address.
+        expr_cont = frame0.EvaluateExpression("my_container")
+        self.assertSuccess(expr_cont.error, "Got my_container by expr")
+        expr_small = expr_cont.GetValueForExpressionPath(".data.small")
+        self.assertSuccess(expr_small.error, "Got small by expr")
+        expr_cast = expr_small.Cast(bigger_type)
+        self.assertFailure(expr_cast.error, msg="Cannot cast expr result")
+
+        # Now try one we made with CreateValueFromAddress.  That will succeed
+        # because this directly tracks the inferior memory.
+        small_addr = fv_small.addr
+        self.assertTrue(small_addr.IsValid())
+        small_type = fv_small.GetType()
+        vfa_small = target.CreateValueFromAddress(
+            "small_from_addr", small_addr, small_type
+        )
+        self.assertSuccess(vfa_small.error, "Made small from address")
+        vfa_cast = vfa_small.Cast(bigger_type)
+        self.assertSuccess(vfa_cast.error, "Made a cast from vfa_small")
+        cast_check.check_value(self, vfa_cast, "Cast of ValueFromAddress succeeds")
+
+        # Next try ValueObject created from data.  They should fail as there's no
+        # way to grow the data:
+        data_small = target.CreateValueFromData(
+            "small_from_data", fv_small.data, fv_small.type
+        )
+        self.assertSuccess(data_small.error, "Made a valid object from data")
+        data_cast = data_small.Cast(bigger_type)
+        self.assertFailure(data_cast.error, msg="Cannot cast data backed SBValue")
+
+        # Now check casting from a larger type to a smaller, we can always do this,
+        # so just test one case:
+        weird_cast = f_var.Cast(val_s.GetType())
         self.assertSuccess(weird_cast.GetError(), "Can cast from a larger to a smaller")
         self.assertEqual(
             weird_cast.GetChildMemberWithName("a").GetValueAsSigned(0),
diff --git a/lldb/test/API/python_api/value/main.c b/lldb/test/API/python_api/value/main.c
index 672b0df376dc5..cdb2aa2f6147b 100644
--- a/lldb/test/API/python_api/value/main.c
+++ b/lldb/test/API/python_api/value/main.c
@@ -22,7 +22,7 @@ const char *weekdays[5] = { "Monday",
 const char **g_table[2] = { days_of_week, weekdays };
 
 typedef int MyInt;
-
+  
 struct MyStruct
 {
   int a;
@@ -36,6 +36,15 @@ struct MyBiggerStruct
   int c;
 };
 
+struct Container
+{
+  int discriminator;
+  union Data {
+    struct MyStruct small;
+    struct MyBiggerStruct big;
+  } data;
+};
+  
 int main (int argc, char const *argv[])
 {
     uint32_t uinthex = 0xE0A35F10;
@@ -43,8 +52,10 @@ int main (int argc, char const *argv[])
 
     int i;
     MyInt a = 12345;
-    struct MyStruct s = { 11, 22 };
+    struct MyStruct s = {11, 22};
     struct MyBiggerStruct f = { 33, 44, 55 };
+    struct Container my_container;
+    my_container.data.big = f;
     int *my_int_ptr = &g_my_int;
     printf("my_int_ptr points to location %p\n", my_int_ptr);
     int *fixed_int_ptr = (int*)(void*)0xAA;
diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/DW_AT_declaration-with-children.s b/lldb/test/Shell/SymbolFile/DWARF/x86/DW_AT_declaration-with-children.s
index bc462ca32e9ce..8633d02f492e6 100644
--- a/lldb/test/Shell/SymbolFile/DWARF/x86/DW_AT_declaration-with-children.s
+++ b/lldb/test/Shell/SymbolFile/DWARF/x86/DW_AT_declaration-with-children.s
@@ -12,7 +12,7 @@
 target var a
 # CHECK-LABEL: target var a
 # FIXME: This should also produce some kind of an error.
-# CHECK: (A) a = {}
+# CHECK: (A) a = <incomplete type "A">
 expr a
 # CHECK-LABEL: expr a
 # CHECK: incomplete type 'A' where a complete type is required
diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/debug-types-missing-signature.test b/lldb/test/Shell/SymbolFile/DWARF/x86/debug-types-missing-signature.test
index e94b10a68d4e9..548dd6cdbc275 100644
--- a/lldb/test/Shell/SymbolFile/DWARF/x86/debug-types-missing-signature.test
+++ b/lldb/test/Shell/SymbolFile/DWARF/x86/debug-types-missing-signature.test
@@ -21,6 +21,6 @@ RUN: not %lldb %t -b -o "expression (EC) 1" 2>&1 | FileCheck --check-prefix=PRIN
 PRINTEC: use of undeclared identifier 'EC'
 
 RUN: %lldb %t -b -o "target variable a e ec" | FileCheck --check-prefix=VARS %s
-VARS: (const (unnamed struct)) a = {}
+VARS: (const (unnamed struct)) a = <incomplete type "const (unnamed struct)">
 VARS: (const (unnamed enum)) e = 0x1
 VARS: (const (unnamed enum)) ec = 0x1
diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index 494d8abeb64d2..bd141619d03fd 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -15,18 +15,7 @@ if(NOT LLVM_NO_INSTALL_NAME_DIR_FOR_BUILD_TREE)
   set(CMAKE_BUILD_WITH_INSTALL_NAME_DIR ON)
 endif()
 
-if(NOT DEFINED LLVM_VERSION_MAJOR)
-  set(LLVM_VERSION_MAJOR 19)
-endif()
-if(NOT DEFINED LLVM_VERSION_MINOR)
-  set(LLVM_VERSION_MINOR 0)
-endif()
-if(NOT DEFINED LLVM_VERSION_PATCH)
-  set(LLVM_VERSION_PATCH 0)
-endif()
-if(NOT DEFINED LLVM_VERSION_SUFFIX)
-  set(LLVM_VERSION_SUFFIX git)
-endif()
+include(${LLVM_COMMON_CMAKE_UTILS}/Modules/LLVMVersion.cmake)
 
 set_directory_properties(PROPERTIES LLVM_VERSION_MAJOR "${LLVM_VERSION_MAJOR}")
 
@@ -723,8 +712,6 @@ if(LLVM_INDIVIDUAL_TEST_COVERAGE)
 endif()
 set(LLVM_LIT_ARGS "${LIT_ARGS_DEFAULT}" CACHE STRING "Default options for lit")
 
-option(LLVM_PARALLEL_LIT "Enable multiple lit suites to run in parallel" OFF)
-
 # On Win32 hosts, provide an option to specify the path to the GnuWin32 tools.
 if( WIN32 AND NOT CYGWIN )
   set(LLVM_LIT_TOOLS_DIR "" CACHE PATH "Path to GnuWin32 tools")
diff --git a/llvm/cmake/modules/AddLLVM.cmake b/llvm/cmake/modules/AddLLVM.cmake
index 5b5d6742a3d0b..df72a9e7e6be1 100644
--- a/llvm/cmake/modules/AddLLVM.cmake
+++ b/llvm/cmake/modules/AddLLVM.cmake
@@ -1951,18 +1951,11 @@ function(add_lit_target target comment)
     list(APPEND LIT_COMMAND --param ${param})
   endforeach()
   if (ARG_UNPARSED_ARGUMENTS)
-    if (LLVM_PARALLEL_LIT)
-     add_custom_target(${target}
-       COMMAND ${LIT_COMMAND} ${ARG_UNPARSED_ARGUMENTS}
-       COMMENT "${comment}"
-       )
-    else()
-     add_custom_target(${target}
-       COMMAND ${LIT_COMMAND} ${ARG_UNPARSED_ARGUMENTS}
-       COMMENT "${comment}"
-       USES_TERMINAL
-       )
-    endif()
+    add_custom_target(${target}
+      COMMAND ${LIT_COMMAND} ${ARG_UNPARSED_ARGUMENTS}
+      COMMENT "${comment}"
+      USES_TERMINAL
+      )
   else()
     add_custom_target(${target}
       COMMAND ${CMAKE_COMMAND} -E echo "${target} does nothing, no tools built.")
diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake
index eca2962cf8207..745a8354f1189 100644
--- a/llvm/cmake/modules/HandleLLVMOptions.cmake
+++ b/llvm/cmake/modules/HandleLLVMOptions.cmake
@@ -36,7 +36,7 @@ string(TOUPPER "${LLVM_ENABLE_LTO}" uppercase_LLVM_ENABLE_LTO)
 # The following only works with the Ninja generator in CMake >= 3.0.
 set(LLVM_PARALLEL_COMPILE_JOBS "" CACHE STRING
   "Define the maximum number of concurrent compilation jobs (Ninja only).")
-if(LLVM_RAM_PER_COMPILE_JOB OR LLVM_RAM_PER_LINK_JOB)
+if(LLVM_RAM_PER_COMPILE_JOB OR LLVM_RAM_PER_LINK_JOB OR LLVM_RAM_PER_TABLEGEN_JOB)
   cmake_host_system_information(RESULT available_physical_memory QUERY AVAILABLE_PHYSICAL_MEMORY)
   cmake_host_system_information(RESULT number_of_logical_cores QUERY NUMBER_OF_LOGICAL_CORES)
 endif()
@@ -86,6 +86,28 @@ elseif(LLVM_PARALLEL_LINK_JOBS)
   message(WARNING "Job pooling is only available with Ninja generators.")
 endif()
 
+set(LLVM_PARALLEL_TABLEGEN_JOBS "" CACHE STRING
+  "Define the maximum number of concurrent tablegen jobs (Ninja only).")
+if(LLVM_RAM_PER_TABLEGEN_JOB)
+  math(EXPR jobs_with_sufficient_memory "${available_physical_memory} / ${LLVM_RAM_PER_TABLEGEN_JOB}" OUTPUT_FORMAT DECIMAL)
+  if (jobs_with_sufficient_memory LESS 1)
+    set(jobs_with_sufficient_memory 1)
+  endif()
+  if (jobs_with_sufficient_memory LESS number_of_logical_cores)
+    set(LLVM_PARALLEL_TABLEGEN_JOBS "${jobs_with_sufficient_memory}")
+  else()
+    set(LLVM_PARALLEL_TABLEGEN_JOBS "${number_of_logical_cores}")
+  endif()
+endif()
+if(LLVM_PARALLEL_TABLEGEN_JOBS)
+  if(NOT CMAKE_GENERATOR MATCHES "Ninja")
+    message(WARNING "Job pooling is only available with Ninja generators.")
+  else()
+    set_property(GLOBAL APPEND PROPERTY JOB_POOLS tablegen_job_pool=${LLVM_PARALLEL_TABLEGEN_JOBS})
+    # Job pool for tablegen is set on the add_custom_command
+  endif()
+endif()
+
 if( LLVM_ENABLE_ASSERTIONS )
   # MSVC doesn't like _DEBUG on release builds. See PR 4379.
   if( NOT MSVC )
diff --git a/llvm/cmake/modules/TableGen.cmake b/llvm/cmake/modules/TableGen.cmake
index 1d18fdde2bb98..df91598c404f5 100644
--- a/llvm/cmake/modules/TableGen.cmake
+++ b/llvm/cmake/modules/TableGen.cmake
@@ -125,6 +125,12 @@ function(tablegen project ofn)
   set(tablegen_exe ${${project}_TABLEGEN_EXE})
   set(tablegen_depends ${${project}_TABLEGEN_TARGET} ${tablegen_exe})
 
+  if(LLVM_PARALLEL_TABLEGEN_JOBS)
+    set(LLVM_TABLEGEN_JOB_POOL JOB_POOL tablegen_job_pool)
+  else()
+    set(LLVM_TABLEGEN_JOB_POOL "")
+  endif()
+
   add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${ofn}
     COMMAND ${tablegen_exe} ${ARG_UNPARSED_ARGUMENTS} -I ${CMAKE_CURRENT_SOURCE_DIR}
     ${tblgen_includes}
@@ -139,6 +145,7 @@ function(tablegen project ofn)
       ${local_tds} ${global_tds}
     ${LLVM_TARGET_DEFINITIONS_ABSOLUTE}
     ${LLVM_TARGET_DEPENDS}
+    ${LLVM_TABLEGEN_JOB_POOL}
     COMMENT "Building ${ofn}..."
     )
 
diff --git a/llvm/docs/CMake.rst b/llvm/docs/CMake.rst
index be5da5652e31e..d2f66d71d39af 100644
--- a/llvm/docs/CMake.rst
+++ b/llvm/docs/CMake.rst
@@ -762,11 +762,8 @@ enabled sub-projects. Nearly all of these variable names begin with
 **LLVM_PARALLEL_LINK_JOBS**:STRING
   Define the maximum number of concurrent link jobs.
 
-**LLVM_PARALLEL_LIT**:BOOL
-  Defaults to ``OFF``. If set to ``OFF``, lit testsuites will be configured
-  with CMake's ``USES_TERMINAL`` flag to give direct access to the terminal. If
-  set to ``ON``, that flag will be removed allowing Ninja to schedule multiple
-  lit testsuites in parallel.
+**LLVM_PARALLEL_TABLEGEN_JOBS**:STRING
+  Define the maximum number of concurrent tablegen jobs.
 
 **LLVM_RAM_PER_COMPILE_JOB**:STRING
   Calculates the amount of Ninja compile jobs according to available resources.
@@ -781,6 +778,11 @@ enabled sub-projects. Nearly all of these variable names begin with
   to be sure its not terminated in your memory restricted environment. On ELF
   platforms also consider ``LLVM_USE_SPLIT_DWARF`` in Debug build.
 
+**LLVM_RAM_PER_TABLEGEN_JOB**:STRING
+  Calculates the amount of Ninja tablegen jobs according to available resources.
+  Value has to be in MB, overwrites LLVM_PARALLEL_TABLEGEN_JOBS. Tablegen jobs
+  will be between one and amount of logical cores.
+
 **LLVM_PROFDATA_FILE**:PATH
   Path to a profdata file to pass into clang's -fprofile-instr-use flag. This
   can only be specified if you're building with clang.
diff --git a/llvm/docs/GettingStarted.rst b/llvm/docs/GettingStarted.rst
index 7634199babbad..705f6427d9ed5 100644
--- a/llvm/docs/GettingStarted.rst
+++ b/llvm/docs/GettingStarted.rst
@@ -90,11 +90,11 @@ Getting the Source Code and Building LLVM
        is installed on your system. This can dramatically speed up link times
        if the default linker is slow.
 
-     * ``-DLLVM_PARALLEL_{COMPILE,LINK}_JOBS=N`` --- Limit the number of
-       compile/link jobs running in parallel at the same time. This is
+     * ``-DLLVM_PARALLEL_{COMPILE,LINK,TABLEGEN}_JOBS=N`` --- Limit the number of
+       compile/link/tablegen jobs running in parallel at the same time. This is
        especially important for linking since linking can use lots of memory. If
        you run into memory issues building LLVM, try setting this to limit the
-       maximum number of compile/link jobs running at the same time.
+       maximum number of compile/link/tablegen jobs running at the same time.
 
    * ``cmake --build build [--target <target>]`` or the build system specified
      above directly.
diff --git a/llvm/docs/GlobalISel/GenericOpcode.rst b/llvm/docs/GlobalISel/GenericOpcode.rst
index dda367607d043..f9f9e1186460e 100644
--- a/llvm/docs/GlobalISel/GenericOpcode.rst
+++ b/llvm/docs/GlobalISel/GenericOpcode.rst
@@ -607,6 +607,41 @@ See the LLVM LangRef entry on '``llvm.lround.*'`` for details on behaviour.
 Vector Specific Operations
 --------------------------
 
+G_INSERT_SUBVECTOR
+^^^^^^^^^^^^^^^^^^
+
+Insert the second source vector into the first source vector. The index operand
+represents the starting index in the first source vector at which the second
+source vector should be inserted into.
+
+The index must be a constant multiple of the second source vector's minimum
+vector length. If the vectors are scalable, then the index is first scaled by
+the runtime scaling factor. The indices inserted in the source vector must be
+valid indicies of that vector. If this condition cannot be determined statically
+but is false at runtime, then the result vector is undefined.
+
+.. code-block:: none
+
+  %2:_(<vscale x 4 x i64>) = G_INSERT_SUBVECTOR %0:_(<vscale x 4 x i64>), %1:_(<vscale x 2 x i64>), 0
+
+G_EXTRACT_SUBVECTOR
+^^^^^^^^^^^^^^^^^^^
+
+Extract a vector of destination type from the source vector. The index operand
+represents the starting index from which a subvector is extracted from
+the source vector.
+
+The index must be a constant multiple of the source vector's minimum vector
+length. If the source vector is a scalable vector, then the index is first
+scaled by the runtime scaling factor. The indices extracted from the source
+vector must be valid indicies of that vector. If this condition cannot be
+determined statically but is false at runtime, then the result vector is
+undefined.
+
+.. code-block:: none
+
+  %3:_(<vscale x 4 x i64>) = G_EXTRACT_SUBVECTOR %2:_(<vscale x 8 x i64>), 2
+
 G_CONCAT_VECTORS
 ^^^^^^^^^^^^^^^^
 
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index b70220dec9261..77ec72f176d6e 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -143,7 +143,7 @@ It also shows a convention that we follow in this document. When
 demonstrating instructions, we will follow an instruction with a comment
 that defines the type and name of value produced.
 
-.. _strings:
+.. _string_constants:
 
 String constants
 ----------------
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
index 6762b1b360d5e..4732eaf4ee27c 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
@@ -1121,6 +1121,25 @@ class MachineIRBuilder {
   MachineInstrBuilder buildConcatVectors(const DstOp &Res,
                                          ArrayRef<Register> Ops);
 
+  /// Build and insert `Res = G_INSERT_SUBVECTOR Src0, Src1, Idx`.
+  ///
+  /// \pre setBasicBlock or setMI must have been called.
+  /// \pre \p Res, \p Src0, and \p Src1 must be generic virtual registers with
+  /// vector type.
+  ///
+  /// \return a MachineInstrBuilder for the newly created instruction.
+  MachineInstrBuilder buildInsertSubvector(const DstOp &Res, const SrcOp &Src0,
+                                           const SrcOp &Src1, unsigned Index);
+
+  /// Build and insert `Res = G_EXTRACT_SUBVECTOR Src, Idx0`.
+  ///
+  /// \pre setBasicBlock or setMI must have been called.
+  /// \pre \p Res and \p Src must be generic virtual registers with vector type.
+  ///
+  /// \return a MachineInstrBuilder for the newly created instruction.
+  MachineInstrBuilder buildExtractSubvector(const DstOp &Res, const SrcOp &Src,
+                                            unsigned Index);
+
   MachineInstrBuilder buildInsert(const DstOp &Res, const SrcOp &Src,
                                   const SrcOp &Op, unsigned Index);
 
diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
index 3f0fc160f9ea4..09d9a0b4ec402 100644
--- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -243,15 +243,6 @@ class MachineRegisterInfo {
   /// Returns true if the updated CSR list was initialized and false otherwise.
   bool isUpdatedCSRsInitialized() const { return IsUpdatedCSRsInitialized; }
 
-  /// Returns true if a register can be used as an argument to a function.
-  bool isArgumentRegister(MCRegister Reg) const;
-
-  /// Returns true if a register is a fixed register.
-  bool isFixedRegister(MCRegister Reg) const;
-
-  /// Returns true if a register is a general purpose register.
-  bool isGeneralPurposeRegister(MCRegister Reg) const;
-
   /// Disables the register from the list of CSRs.
   /// I.e. the register will not appear as part of the CSR mask.
   /// \see UpdatedCalleeSavedRegs.
diff --git a/llvm/include/llvm/IR/Attributes.h b/llvm/include/llvm/IR/Attributes.h
index 0c2a02514ba0e..7dd8a329029a3 100644
--- a/llvm/include/llvm/IR/Attributes.h
+++ b/llvm/include/llvm/IR/Attributes.h
@@ -848,6 +848,11 @@ class AttributeList {
     return getAttributeAtIndex(FunctionIndex, Kind);
   }
 
+  /// Return the attribute for the given attribute kind for the return value.
+  Attribute getRetAttr(Attribute::AttrKind Kind) const {
+    return getAttributeAtIndex(ReturnIndex, Kind);
+  }
+
   /// Return the alignment of the return value.
   MaybeAlign getRetAlignment() const;
 
diff --git a/llvm/include/llvm/IR/DIBuilder.h b/llvm/include/llvm/IR/DIBuilder.h
index edec161b39715..94af17af8160e 100644
--- a/llvm/include/llvm/IR/DIBuilder.h
+++ b/llvm/include/llvm/IR/DIBuilder.h
@@ -38,6 +38,9 @@ namespace llvm {
   class Module;
   class Value;
   class DbgAssignIntrinsic;
+  class DbgRecord;
+
+  using DbgInstPtr = PointerUnion<Instruction *, DbgRecord *>;
 
   class DIBuilder {
     Module &M;
@@ -90,13 +93,17 @@ namespace llvm {
     void trackIfUnresolved(MDNode *N);
 
     /// Internal helper for insertDeclare.
-    Instruction *insertDeclare(llvm::Value *Storage, DILocalVariable *VarInfo,
-                               DIExpression *Expr, const DILocation *DL,
-                               BasicBlock *InsertBB, Instruction *InsertBefore);
+    DbgInstPtr insertDeclare(llvm::Value *Storage, DILocalVariable *VarInfo,
+                             DIExpression *Expr, const DILocation *DL,
+                             BasicBlock *InsertBB, Instruction *InsertBefore);
 
     /// Internal helper for insertLabel.
-    Instruction *insertLabel(DILabel *LabelInfo, const DILocation *DL,
-                             BasicBlock *InsertBB, Instruction *InsertBefore);
+    DbgInstPtr insertLabel(DILabel *LabelInfo, const DILocation *DL,
+                           BasicBlock *InsertBB, Instruction *InsertBefore);
+
+    /// Internal helper. Track metadata if untracked and insert \p DPV.
+    void insertDPValue(DPValue *DPV, BasicBlock *InsertBB,
+                       Instruction *InsertBefore, bool InsertAtHead = false);
 
     /// Internal helper with common code used by insertDbg{Value,Addr}Intrinsic.
     Instruction *insertDbgIntrinsic(llvm::Function *Intrinsic, llvm::Value *Val,
@@ -106,10 +113,11 @@ namespace llvm {
                                     Instruction *InsertBefore);
 
     /// Internal helper for insertDbgValueIntrinsic.
-    Instruction *
-    insertDbgValueIntrinsic(llvm::Value *Val, DILocalVariable *VarInfo,
-                            DIExpression *Expr, const DILocation *DL,
-                            BasicBlock *InsertBB, Instruction *InsertBefore);
+    DbgInstPtr insertDbgValueIntrinsic(llvm::Value *Val,
+                                       DILocalVariable *VarInfo,
+                                       DIExpression *Expr, const DILocation *DL,
+                                       BasicBlock *InsertBB,
+                                       Instruction *InsertBefore);
 
   public:
     /// Construct a builder for a module.
@@ -921,9 +929,9 @@ namespace llvm {
     /// \param Expr        A complex location expression.
     /// \param DL          Debug info location.
     /// \param InsertAtEnd Location for the new intrinsic.
-    Instruction *insertDeclare(llvm::Value *Storage, DILocalVariable *VarInfo,
-                               DIExpression *Expr, const DILocation *DL,
-                               BasicBlock *InsertAtEnd);
+    DbgInstPtr insertDeclare(llvm::Value *Storage, DILocalVariable *VarInfo,
+                             DIExpression *Expr, const DILocation *DL,
+                             BasicBlock *InsertAtEnd);
 
     /// Insert a new llvm.dbg.assign intrinsic call.
     /// \param LinkedInstr   Instruction with a DIAssignID to link with the new
@@ -939,11 +947,10 @@ namespace llvm {
     /// \param DL            Debug info location, usually: (line: 0,
     ///                      column: 0, scope: var-decl-scope). See
     ///                      getDebugValueLoc.
-    DbgAssignIntrinsic *insertDbgAssign(Instruction *LinkedInstr, Value *Val,
-                                        DILocalVariable *SrcVar,
-                                        DIExpression *ValExpr, Value *Addr,
-                                        DIExpression *AddrExpr,
-                                        const DILocation *DL);
+    DbgInstPtr insertDbgAssign(Instruction *LinkedInstr, Value *Val,
+                               DILocalVariable *SrcVar, DIExpression *ValExpr,
+                               Value *Addr, DIExpression *AddrExpr,
+                               const DILocation *DL);
 
     /// Insert a new llvm.dbg.declare intrinsic call.
     /// \param Storage      llvm::Value of the variable
@@ -951,23 +958,23 @@ namespace llvm {
     /// \param Expr         A complex location expression.
     /// \param DL           Debug info location.
     /// \param InsertBefore Location for the new intrinsic.
-    Instruction *insertDeclare(llvm::Value *Storage, DILocalVariable *VarInfo,
-                               DIExpression *Expr, const DILocation *DL,
-                               Instruction *InsertBefore);
+    DbgInstPtr insertDeclare(llvm::Value *Storage, DILocalVariable *VarInfo,
+                             DIExpression *Expr, const DILocation *DL,
+                             Instruction *InsertBefore);
 
     /// Insert a new llvm.dbg.label intrinsic call.
     /// \param LabelInfo    Label's debug info descriptor.
     /// \param DL           Debug info location.
     /// \param InsertBefore Location for the new intrinsic.
-    Instruction *insertLabel(DILabel *LabelInfo, const DILocation *DL,
-                             Instruction *InsertBefore);
+    DbgInstPtr insertLabel(DILabel *LabelInfo, const DILocation *DL,
+                           Instruction *InsertBefore);
 
     /// Insert a new llvm.dbg.label intrinsic call.
     /// \param LabelInfo    Label's debug info descriptor.
     /// \param DL           Debug info location.
     /// \param InsertAtEnd Location for the new intrinsic.
-    Instruction *insertLabel(DILabel *LabelInfo, const DILocation *DL,
-                             BasicBlock *InsertAtEnd);
+    DbgInstPtr insertLabel(DILabel *LabelInfo, const DILocation *DL,
+                           BasicBlock *InsertAtEnd);
 
     /// Insert a new llvm.dbg.value intrinsic call.
     /// \param Val          llvm::Value of the variable
@@ -975,11 +982,10 @@ namespace llvm {
     /// \param Expr         A complex location expression.
     /// \param DL           Debug info location.
     /// \param InsertAtEnd Location for the new intrinsic.
-    Instruction *insertDbgValueIntrinsic(llvm::Value *Val,
-                                         DILocalVariable *VarInfo,
-                                         DIExpression *Expr,
-                                         const DILocation *DL,
-                                         BasicBlock *InsertAtEnd);
+    DbgInstPtr insertDbgValueIntrinsic(llvm::Value *Val,
+                                       DILocalVariable *VarInfo,
+                                       DIExpression *Expr, const DILocation *DL,
+                                       BasicBlock *InsertAtEnd);
 
     /// Insert a new llvm.dbg.value intrinsic call.
     /// \param Val          llvm::Value of the variable
@@ -987,11 +993,10 @@ namespace llvm {
     /// \param Expr         A complex location expression.
     /// \param DL           Debug info location.
     /// \param InsertBefore Location for the new intrinsic.
-    Instruction *insertDbgValueIntrinsic(llvm::Value *Val,
-                                         DILocalVariable *VarInfo,
-                                         DIExpression *Expr,
-                                         const DILocation *DL,
-                                         Instruction *InsertBefore);
+    DbgInstPtr insertDbgValueIntrinsic(llvm::Value *Val,
+                                       DILocalVariable *VarInfo,
+                                       DIExpression *Expr, const DILocation *DL,
+                                       Instruction *InsertBefore);
 
     /// Replace the vtable holder in the given type.
     ///
diff --git a/llvm/include/llvm/IR/Function.h b/llvm/include/llvm/IR/Function.h
index cb87a44980321..d96d506a9b05d 100644
--- a/llvm/include/llvm/IR/Function.h
+++ b/llvm/include/llvm/IR/Function.h
@@ -430,6 +430,9 @@ class LLVM_EXTERNAL_VISIBILITY Function : public GlobalObject,
   /// Return the attribute for the given attribute kind.
   Attribute getFnAttribute(StringRef Kind) const;
 
+  /// Return the attribute for the given attribute kind for the return value.
+  Attribute getRetAttribute(Attribute::AttrKind Kind) const;
+
   /// For a string attribute \p Kind, parse attribute as an integer.
   ///
   /// \returns \p Default if attribute is not present.
diff --git a/llvm/include/llvm/IR/InstrTypes.h b/llvm/include/llvm/IR/InstrTypes.h
index 0e81d3b391a08..fed21b992e3d1 100644
--- a/llvm/include/llvm/IR/InstrTypes.h
+++ b/llvm/include/llvm/IR/InstrTypes.h
@@ -1909,6 +1909,18 @@ class CallBase : public Instruction {
   /// Determine whether the return value has the given attribute.
   bool hasRetAttr(StringRef Kind) const { return hasRetAttrImpl(Kind); }
 
+  /// Return the attribute for the given attribute kind for the return value.
+  Attribute getRetAttr(Attribute::AttrKind Kind) const {
+    Attribute RetAttr = Attrs.getRetAttr(Kind);
+    if (RetAttr.isValid())
+      return RetAttr;
+
+    // Look at the callee, if available.
+    if (const Function *F = getCalledFunction())
+      return F->getAttributes().getRetAttr(Kind);
+    return Attribute();
+  }
+
   /// Determine whether the argument or parameter has the given attribute.
   bool paramHasAttr(unsigned ArgNo, Attribute::AttrKind Kind) const;
 
diff --git a/llvm/include/llvm/Support/KnownBits.h b/llvm/include/llvm/Support/KnownBits.h
index 46dbf0c2baa5f..06d2c90f7b0f6 100644
--- a/llvm/include/llvm/Support/KnownBits.h
+++ b/llvm/include/llvm/Support/KnownBits.h
@@ -402,12 +402,12 @@ struct KnownBits {
   /// Compute known bits for lshr(LHS, RHS).
   /// NOTE: RHS (shift amount) bitwidth doesn't need to be the same as LHS.
   static KnownBits lshr(const KnownBits &LHS, const KnownBits &RHS,
-                        bool ShAmtNonZero = false);
+                        bool ShAmtNonZero = false, bool Exact = false);
 
   /// Compute known bits for ashr(LHS, RHS).
   /// NOTE: RHS (shift amount) bitwidth doesn't need to be the same as LHS.
   static KnownBits ashr(const KnownBits &LHS, const KnownBits &RHS,
-                        bool ShAmtNonZero = false);
+                        bool ShAmtNonZero = false, bool Exact = false);
 
   /// Determine if these known bits always give the same ICMP_EQ result.
   static std::optional<bool> eq(const KnownBits &LHS, const KnownBits &RHS);
diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def
index 94fba491148b2..3dade14f043b6 100644
--- a/llvm/include/llvm/Support/TargetOpcodes.def
+++ b/llvm/include/llvm/Support/TargetOpcodes.def
@@ -727,6 +727,12 @@ HANDLE_TARGET_OPCODE(G_BR)
 /// Generic branch to jump table entry.
 HANDLE_TARGET_OPCODE(G_BRJT)
 
+/// Generic insert subvector.
+HANDLE_TARGET_OPCODE(G_INSERT_SUBVECTOR)
+
+/// Generic extract subvector.
+HANDLE_TARGET_OPCODE(G_EXTRACT_SUBVECTOR)
+
 /// Generic insertelement.
 HANDLE_TARGET_OPCODE(G_INSERT_VECTOR_ELT)
 
diff --git a/llvm/include/llvm/Target/GenericOpcodes.td b/llvm/include/llvm/Target/GenericOpcodes.td
index d967885aa2d75..8dc84fb0ba052 100644
--- a/llvm/include/llvm/Target/GenericOpcodes.td
+++ b/llvm/include/llvm/Target/GenericOpcodes.td
@@ -1426,6 +1426,20 @@ def G_WRITE_REGISTER : GenericInstruction {
 // Vector ops
 //------------------------------------------------------------------------------
 
+// Generic insert subvector.
+def G_INSERT_SUBVECTOR : GenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src0, type1:$src1, untyped_imm_0:$idx);
+  let hasSideEffects = false;
+}
+
+// Generic extract subvector.
+def G_EXTRACT_SUBVECTOR : GenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src, untyped_imm_0:$idx);
+  let hasSideEffects = false;
+}
+
 // Generic insertelement.
 def G_INSERT_VECTOR_ELT : GenericInstruction {
   let OutOperandList = (outs type0:$dst);
diff --git a/llvm/include/llvm/Target/TargetMachine.h b/llvm/include/llvm/Target/TargetMachine.h
index d7ce088cad49f..37df9589e30d6 100644
--- a/llvm/include/llvm/Target/TargetMachine.h
+++ b/llvm/include/llvm/Target/TargetMachine.h
@@ -241,7 +241,7 @@ class TargetMachine {
 
   bool isPositionIndependent() const;
 
-  bool shouldAssumeDSOLocal(const Module &M, const GlobalValue *GV) const;
+  bool shouldAssumeDSOLocal(const GlobalValue *GV) const;
 
   /// Returns true if this target uses emulated TLS.
   bool useEmulatedTLS() const;
diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index 8c48174b9f525..ce651783caf16 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -3729,6 +3729,26 @@ static Value *simplifyICmpWithIntrinsicOnLHS(CmpInst::Predicate Pred,
   }
 }
 
+/// Helper method to get range from metadata or attribute.
+static std::optional<ConstantRange> getRange(Value *V,
+                                             const InstrInfoQuery &IIQ) {
+  if (Instruction *I = dyn_cast<Instruction>(V))
+    if (MDNode *MD = IIQ.getMetadata(I, LLVMContext::MD_range))
+      return getConstantRangeFromMetadata(*MD);
+
+  Attribute Range;
+  if (const Argument *A = dyn_cast<Argument>(V)) {
+    Range = A->getAttribute(llvm::Attribute::Range);
+  } else if (const CallBase *CB = dyn_cast<CallBase>(V)) {
+    Range = CB->getRetAttr(llvm::Attribute::Range);
+  }
+
+  if (Range.isValid())
+    return Range.getRange();
+
+  return std::nullopt;
+}
+
 /// Given operands for an ICmpInst, see if we can fold the result.
 /// If not, this returns null.
 static Value *simplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
@@ -3776,24 +3796,14 @@ static Value *simplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
 
   // If both operands have range metadata, use the metadata
   // to simplify the comparison.
-  if (isa<Instruction>(RHS) && isa<Instruction>(LHS)) {
-    auto RHS_Instr = cast<Instruction>(RHS);
-    auto LHS_Instr = cast<Instruction>(LHS);
-
-    if (Q.IIQ.getMetadata(RHS_Instr, LLVMContext::MD_range) &&
-        Q.IIQ.getMetadata(LHS_Instr, LLVMContext::MD_range)) {
-      auto RHS_CR = getConstantRangeFromMetadata(
-          *RHS_Instr->getMetadata(LLVMContext::MD_range));
-      auto LHS_CR = getConstantRangeFromMetadata(
-          *LHS_Instr->getMetadata(LLVMContext::MD_range));
-
-      if (LHS_CR.icmp(Pred, RHS_CR))
+  if (std::optional<ConstantRange> RhsCr = getRange(RHS, Q.IIQ))
+    if (std::optional<ConstantRange> LhsCr = getRange(LHS, Q.IIQ)) {
+      if (LhsCr->icmp(Pred, *RhsCr))
         return ConstantInt::getTrue(ITy);
 
-      if (LHS_CR.icmp(CmpInst::getInversePredicate(Pred), RHS_CR))
+      if (LhsCr->icmp(CmpInst::getInversePredicate(Pred), *RhsCr))
         return ConstantInt::getFalse(ITy);
     }
-  }
 
   // Compare of cast, for example (zext X) != 0 -> X != 0
   if (isa<CastInst>(LHS) && (isa<Constant>(RHS) || isa<CastInst>(RHS))) {
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 6d0e79e11eed4..371ad41ee9656 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -1142,9 +1142,10 @@ static void computeKnownBitsFromOperator(const Operator *I,
     break;
   }
   case Instruction::LShr: {
-    auto KF = [](const KnownBits &KnownVal, const KnownBits &KnownAmt,
-                 bool ShAmtNonZero) {
-      return KnownBits::lshr(KnownVal, KnownAmt, ShAmtNonZero);
+    bool Exact = Q.IIQ.isExact(cast<BinaryOperator>(I));
+    auto KF = [Exact](const KnownBits &KnownVal, const KnownBits &KnownAmt,
+                      bool ShAmtNonZero) {
+      return KnownBits::lshr(KnownVal, KnownAmt, ShAmtNonZero, Exact);
     };
     computeKnownBitsFromShiftOperator(I, DemandedElts, Known, Known2, Depth, Q,
                                       KF);
@@ -1155,9 +1156,10 @@ static void computeKnownBitsFromOperator(const Operator *I,
     break;
   }
   case Instruction::AShr: {
-    auto KF = [](const KnownBits &KnownVal, const KnownBits &KnownAmt,
-                 bool ShAmtNonZero) {
-      return KnownBits::ashr(KnownVal, KnownAmt, ShAmtNonZero);
+    bool Exact = Q.IIQ.isExact(cast<BinaryOperator>(I));
+    auto KF = [Exact](const KnownBits &KnownVal, const KnownBits &KnownAmt,
+                      bool ShAmtNonZero) {
+      return KnownBits::ashr(KnownVal, KnownAmt, ShAmtNonZero, Exact);
     };
     computeKnownBitsFromShiftOperator(I, DemandedElts, Known, Known2, Depth, Q,
                                       KF);
@@ -6129,6 +6131,8 @@ void llvm::getUnderlyingObjects(const Value *V,
       if (!LI || !LI->isLoopHeader(PN->getParent()) ||
           isSameUnderlyingObjectInLoop(PN, LI))
         append_range(Worklist, PN->incoming_values());
+      else
+        Objects.push_back(P);
       continue;
     }
 
diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
index 28e5bf85ca9ce..9b12d443c96e9 100644
--- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
@@ -877,6 +877,21 @@ MachineIRBuilder::buildSelect(const DstOp &Res, const SrcOp &Tst,
   return buildInstr(TargetOpcode::G_SELECT, {Res}, {Tst, Op0, Op1}, Flags);
 }
 
+MachineInstrBuilder MachineIRBuilder::buildInsertSubvector(const DstOp &Res,
+                                                           const SrcOp &Src0,
+                                                           const SrcOp &Src1,
+                                                           unsigned Idx) {
+  return buildInstr(TargetOpcode::G_INSERT_SUBVECTOR, Res,
+                    {Src0, Src1, uint64_t(Idx)});
+}
+
+MachineInstrBuilder MachineIRBuilder::buildExtractSubvector(const DstOp &Res,
+                                                            const SrcOp &Src,
+                                                            unsigned Idx) {
+  return buildInstr(TargetOpcode::G_INSERT_SUBVECTOR, Res,
+                    {Src, uint64_t(Idx)});
+}
+
 MachineInstrBuilder
 MachineIRBuilder::buildInsertVectorElement(const DstOp &Res, const SrcOp &Val,
                                            const SrcOp &Elt, const SrcOp &Idx) {
diff --git a/llvm/lib/CodeGen/GlobalMerge.cpp b/llvm/lib/CodeGen/GlobalMerge.cpp
index a2b5cbf7bad9f..4941d5b01ae0f 100644
--- a/llvm/lib/CodeGen/GlobalMerge.cpp
+++ b/llvm/lib/CodeGen/GlobalMerge.cpp
@@ -641,7 +641,7 @@ bool GlobalMergeImpl::run(Module &M) {
       continue;
 
     // It's not safe to merge globals that may be preempted
-    if (TM && !TM->shouldAssumeDSOLocal(M, &GV))
+    if (TM && !TM->shouldAssumeDSOLocal(&GV))
       continue;
 
     if (!(Opt.MergeExternal && GV.hasExternalLinkage()) &&
diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
index 55d7c8370e9c4..b0c1838b3ff0e 100644
--- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
@@ -659,15 +659,3 @@ bool MachineRegisterInfo::isReservedRegUnit(unsigned Unit) const {
   }
   return false;
 }
-
-bool MachineRegisterInfo::isArgumentRegister(MCRegister Reg) const {
-  return getTargetRegisterInfo()->isArgumentRegister(*MF, Reg);
-}
-
-bool MachineRegisterInfo::isFixedRegister(MCRegister Reg) const {
-  return getTargetRegisterInfo()->isFixedRegister(*MF, Reg);
-}
-
-bool MachineRegisterInfo::isGeneralPurposeRegister(MCRegister Reg) const {
-  return getTargetRegisterInfo()->isGeneralPurposeRegister(*MF, Reg);
-}
diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp
index 9003f1dded87a..90cbf097370de 100644
--- a/llvm/lib/CodeGen/MachineVerifier.cpp
+++ b/llvm/lib/CodeGen/MachineVerifier.cpp
@@ -1613,6 +1613,104 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) {
       report("G_BSWAP size must be a multiple of 16 bits", MI);
     break;
   }
+  case TargetOpcode::G_INSERT_SUBVECTOR: {
+    const MachineOperand &Src0Op = MI->getOperand(1);
+    if (!Src0Op.isReg()) {
+      report("G_INSERT_SUBVECTOR first source must be a register", MI);
+      break;
+    }
+
+    const MachineOperand &Src1Op = MI->getOperand(2);
+    if (!Src1Op.isReg()) {
+      report("G_INSERT_SUBVECTOR second source must be a register", MI);
+      break;
+    }
+
+    const MachineOperand &IndexOp = MI->getOperand(3);
+    if (!IndexOp.isImm()) {
+      report("G_INSERT_SUBVECTOR index must be an immediate", MI);
+      break;
+    }
+
+    LLT DstTy = MRI->getType(MI->getOperand(0).getReg());
+    LLT Src0Ty = MRI->getType(Src0Op.getReg());
+    LLT Src1Ty = MRI->getType(Src1Op.getReg());
+
+    if (!DstTy.isVector()) {
+      report("Destination type must be a vector", MI);
+      break;
+    }
+
+    if (!Src0Ty.isVector()) {
+      report("First source must be a vector", MI);
+      break;
+    }
+
+    if (!Src1Ty.isVector()) {
+      report("Second source must be a vector", MI);
+      break;
+    }
+
+    if (DstTy != Src0Ty) {
+      report("Destination type must match the first source vector type", MI);
+      break;
+    }
+
+    if (Src0Ty.getElementType() != Src1Ty.getElementType()) {
+      report("Element type of source vectors must be the same", MI);
+      break;
+    }
+
+    if (IndexOp.getImm() != 0 &&
+        Src1Ty.getElementCount().getKnownMinValue() % IndexOp.getImm() != 0) {
+      report("Index must be a multiple of the second source vector's "
+             "minimum vector length",
+             MI);
+      break;
+    }
+    break;
+  }
+  case TargetOpcode::G_EXTRACT_SUBVECTOR: {
+    const MachineOperand &SrcOp = MI->getOperand(1);
+    if (!SrcOp.isReg()) {
+      report("G_EXTRACT_SUBVECTOR first source must be a register", MI);
+      break;
+    }
+
+    const MachineOperand &IndexOp = MI->getOperand(2);
+    if (!IndexOp.isImm()) {
+      report("G_EXTRACT_SUBVECTOR index must be an immediate", MI);
+      break;
+    }
+
+    LLT DstTy = MRI->getType(MI->getOperand(0).getReg());
+    LLT SrcTy = MRI->getType(SrcOp.getReg());
+
+    if (!DstTy.isVector()) {
+      report("Destination type must be a vector", MI);
+      break;
+    }
+
+    if (!SrcTy.isVector()) {
+      report("First source must be a vector", MI);
+      break;
+    }
+
+    if (DstTy.getElementType() != SrcTy.getElementType()) {
+      report("Element type of vectors must be the same", MI);
+      break;
+    }
+
+    if (IndexOp.getImm() != 0 &&
+        SrcTy.getElementCount().getKnownMinValue() % IndexOp.getImm() != 0) {
+      report("Index must be a multiple of the source vector's minimum vector "
+             "length",
+             MI);
+      break;
+    }
+
+    break;
+  }
   case TargetOpcode::G_SHUFFLE_VECTOR: {
     const MachineOperand &MaskOp = MI->getOperand(3);
     if (!MaskOp.isShuffleMask()) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 06fe716a22db0..7a0c1c328df1f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -3485,7 +3485,8 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
   case ISD::SRL:
     Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
     Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
-    Known = KnownBits::lshr(Known, Known2);
+    Known = KnownBits::lshr(Known, Known2, /*ShAmtNonZero=*/false,
+                            Op->getFlags().hasExact());
 
     // Minimum shift high bits are known zero.
     if (const APInt *ShMinAmt =
@@ -3495,7 +3496,8 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
   case ISD::SRA:
     Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
     Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
-    Known = KnownBits::ashr(Known, Known2);
+    Known = KnownBits::ashr(Known, Known2, /*ShAmtNonZero=*/false,
+                            Op->getFlags().hasExact());
     break;
   case ISD::FSHL:
   case ISD::FSHR:
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index a639cba5e35a8..b3dc9de713731 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -491,7 +491,7 @@ TargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
 
   // If the address is not even local to this DSO we will have to load it from
   // a got and then add the offset.
-  if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
+  if (!TM.shouldAssumeDSOLocal(GV))
     return false;
 
   // If the code is position independent we will have to add a base register.
diff --git a/llvm/lib/CodeGen/TypePromotion.cpp b/llvm/lib/CodeGen/TypePromotion.cpp
index 48ad8de778010..b0830308908d6 100644
--- a/llvm/lib/CodeGen/TypePromotion.cpp
+++ b/llvm/lib/CodeGen/TypePromotion.cpp
@@ -136,6 +136,7 @@ class IRPromoter {
 
 class TypePromotionImpl {
   unsigned TypeSize = 0;
+  const TargetLowering *TLI = nullptr;
   LLVMContext *Ctx = nullptr;
   unsigned RegisterBitWidth = 0;
   SmallPtrSet<Value *, 16> AllVisited;
@@ -272,64 +273,58 @@ bool TypePromotionImpl::isSink(Value *V) {
 
 /// Return whether this instruction can safely wrap.
 bool TypePromotionImpl::isSafeWrap(Instruction *I) {
-  // We can support a potentially wrapping instruction (I) if:
+  // We can support a potentially wrapping Add/Sub instruction (I) if:
   // - It is only used by an unsigned icmp.
   // - The icmp uses a constant.
-  // - The wrapping value (I) is decreasing, i.e would underflow - wrapping
-  //   around zero to become a larger number than before.
   // - The wrapping instruction (I) also uses a constant.
   //
-  // We can then use the two constants to calculate whether the result would
-  // wrap in respect to itself in the original bitwidth. If it doesn't wrap,
-  // just underflows the range, the icmp would give the same result whether the
-  // result has been truncated or not. We calculate this by:
-  // - Zero extending both constants, if needed, to RegisterBitWidth.
-  // - Take the absolute value of I's constant, adding this to the icmp const.
-  // - Check that this value is not out of range for small type. If it is, it
-  //   means that it has underflowed enough to wrap around the icmp constant.
+  // This a common pattern emitted to check if a value is within a range.
   //
   // For example:
   //
-  // %sub = sub i8 %a, 2
-  // %cmp = icmp ule i8 %sub, 254
+  // %sub = sub i8 %a, C1
+  // %cmp = icmp ule i8 %sub, C2
+  //
+  // or
+  //
+  // %add = add i8 %a, C1
+  // %cmp = icmp ule i8 %add, C2.
   //
-  // If %a = 0, %sub = -2 == FE == 254
-  // But if this is evalulated as a i32
-  // %sub = -2 == FF FF FF FE == 4294967294
-  // So the unsigned compares (i8 and i32) would not yield the same result.
+  // We will treat an add as though it were a subtract by -C1. To promote
+  // the Add/Sub we will zero extend the LHS and the subtracted amount. For Add,
+  // this means we need to negate the constant, zero extend to RegisterBitWidth,
+  // and negate in the larger type.
   //
-  // Another way to look at it is:
-  // %a - 2 <= 254
-  // %a + 2 <= 254 + 2
-  // %a <= 256
-  // And we can't represent 256 in the i8 format, so we don't support it.
+  // This will produce a value in the range [-zext(C1), zext(X)-zext(C1)] where
+  // C1 is the subtracted amount. This is either a small unsigned number or a
+  // large unsigned number in the promoted type.
   //
-  // Whereas:
+  // Now we need to correct the compare constant C2. Values >= C1 in the
+  // original add result range have been remapped to large values in the
+  // promoted range. If the compare constant fell into this range we need to
+  // remap it as well. We can do this as -(zext(-C2)).
   //
-  // %sub i8 %a, 1
+  // For example:
+  //
+  // %sub = sub i8 %a, 2
   // %cmp = icmp ule i8 %sub, 254
   //
-  // If %a = 0, %sub = -1 == FF == 255
-  // As i32:
-  // %sub = -1 == FF FF FF FF == 4294967295
+  // becomes
   //
-  // In this case, the unsigned compare results would be the same and this
-  // would also be true for ult, uge and ugt:
-  // - (255 < 254) == (0xFFFFFFFF < 254) == false
-  // - (255 <= 254) == (0xFFFFFFFF <= 254) == false
-  // - (255 > 254) == (0xFFFFFFFF > 254) == true
-  // - (255 >= 254) == (0xFFFFFFFF >= 254) == true
+  // %zext = zext %a to i32
+  // %sub = sub i32 %zext, 2
+  // %cmp = icmp ule i32 %sub, 4294967294
   //
-  // To demonstrate why we can't handle increasing values:
+  // Another example:
   //
-  // %add = add i8 %a, 2
-  // %cmp = icmp ult i8 %add, 127
+  // %sub = sub i8 %a, 1
+  // %cmp = icmp ule i8 %sub, 254
   //
-  // If %a = 254, %add = 256 == (i8 1)
-  // As i32:
-  // %add = 256
+  // becomes
   //
-  // (1 < 127) != (256 < 127)
+  // %zext = zext %a to i32
+  // %sub = sub i32 %zext, 1
+  // %cmp = icmp ule i32 %sub, 254
 
   unsigned Opc = I->getOpcode();
   if (Opc != Instruction::Add && Opc != Instruction::Sub)
@@ -356,21 +351,29 @@ bool TypePromotionImpl::isSafeWrap(Instruction *I) {
   APInt OverflowConst = cast<ConstantInt>(I->getOperand(1))->getValue();
   if (Opc == Instruction::Sub)
     OverflowConst = -OverflowConst;
-  if (!OverflowConst.isNonPositive())
-    return false;
+
+  // If the constant is positive, we will end up filling the promoted bits with
+  // all 1s. Make sure that results in a cheap add constant.
+  if (!OverflowConst.isNonPositive()) {
+    // We don't have the true promoted width, just use 64 so we can create an
+    // int64_t for the isLegalAddImmediate call.
+    if (OverflowConst.getBitWidth() >= 64)
+      return false;
+
+    APInt NewConst = -((-OverflowConst).zext(64));
+    if (!TLI->isLegalAddImmediate(NewConst.getSExtValue()))
+      return false;
+  }
 
   SafeWrap.insert(I);
 
-  // Using C1 = OverflowConst and C2 = ICmpConst, we can either prove that:
-  //   zext(x) + sext(C1) <u zext(C2)  if C1 < 0 and C1 >s C2
-  //   zext(x) + sext(C1) <u sext(C2)  if C1 < 0 and C1 <=s C2
-  if (OverflowConst.sgt(ICmpConst)) {
-    LLVM_DEBUG(dbgs() << "IR Promotion: Allowing safe overflow for sext "
+  if (OverflowConst == 0 || OverflowConst.ugt(ICmpConst)) {
+    LLVM_DEBUG(dbgs() << "IR Promotion: Allowing safe overflow for "
                       << "const of " << *I << "\n");
     return true;
   }
 
-  LLVM_DEBUG(dbgs() << "IR Promotion: Allowing safe overflow for sext "
+  LLVM_DEBUG(dbgs() << "IR Promotion: Allowing safe overflow for "
                     << "const of " << *I << " and " << *CI << "\n");
   SafeWrap.insert(CI);
   return true;
@@ -487,18 +490,24 @@ void IRPromoter::PromoteTree() {
         continue;
 
       if (auto *Const = dyn_cast<ConstantInt>(Op)) {
-        // For subtract, we don't need to sext the constant. We only put it in
+        // For subtract, we only need to zext the constant. We only put it in
         // SafeWrap because SafeWrap.size() is used elsewhere.
-        // For cmp, we need to sign extend a constant appearing in either
-        // operand. For add, we should only sign extend the RHS.
-        Constant *NewConst =
-            ConstantInt::get(Const->getContext(),
-                             (SafeWrap.contains(I) &&
-                              (I->getOpcode() == Instruction::ICmp || i == 1) &&
-                              I->getOpcode() != Instruction::Sub)
-                                 ? Const->getValue().sext(PromotedWidth)
-                                 : Const->getValue().zext(PromotedWidth));
-        I->setOperand(i, NewConst);
+        // For Add and ICmp we need to find how far the constant is from the
+        // top of its original unsigned range and place it the same distance
+        // from the top of its new unsigned range. We can do this by negating
+        // the constant, zero extending it, then negating in the new type.
+        APInt NewConst;
+        if (SafeWrap.contains(I)) {
+          if (I->getOpcode() == Instruction::ICmp)
+            NewConst = -((-Const->getValue()).zext(PromotedWidth));
+          else if (I->getOpcode() == Instruction::Add && i == 1)
+            NewConst = -((-Const->getValue()).zext(PromotedWidth));
+          else
+            NewConst = Const->getValue().zext(PromotedWidth);
+        } else
+          NewConst = Const->getValue().zext(PromotedWidth);
+
+        I->setOperand(i, ConstantInt::get(Const->getContext(), NewConst));
       } else if (isa<UndefValue>(Op))
         I->setOperand(i, ConstantInt::get(ExtTy, 0));
     }
@@ -917,7 +926,7 @@ bool TypePromotionImpl::run(Function &F, const TargetMachine *TM,
   bool MadeChange = false;
   const DataLayout &DL = F.getParent()->getDataLayout();
   const TargetSubtargetInfo *SubtargetInfo = TM->getSubtargetImpl(F);
-  const TargetLowering *TLI = SubtargetInfo->getTargetLowering();
+  TLI = SubtargetInfo->getTargetLowering();
   RegisterBitWidth =
       TTI.getRegisterBitWidth(TargetTransformInfo::RGK_Scalar).getFixedValue();
   Ctx = &F.getParent()->getContext();
diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp
index 25aa326116451..673e2f68249cd 100644
--- a/llvm/lib/IR/BasicBlock.cpp
+++ b/llvm/lib/IR/BasicBlock.cpp
@@ -348,6 +348,8 @@ const Instruction* BasicBlock::getFirstNonPHI() const {
 
 BasicBlock::const_iterator BasicBlock::getFirstNonPHIIt() const {
   const Instruction *I = getFirstNonPHI();
+  if (!I)
+    return end();
   BasicBlock::const_iterator It = I->getIterator();
   // Set the head-inclusive bit to indicate that this iterator includes
   // any debug-info at the start of the block. This is a no-op unless the
@@ -752,8 +754,6 @@ void BasicBlock::spliceDebugInfoEmptyBlock(BasicBlock::iterator Dest,
   // occur when a block is optimised away and the terminator has been moved
   // somewhere else.
   if (Src->empty()) {
-    assert(Dest != end() &&
-           "Transferring trailing DPValues to another trailing position");
     DPMarker *SrcTrailingDPValues = Src->getTrailingDPValues();
     if (!SrcTrailingDPValues)
       return;
@@ -1038,15 +1038,10 @@ void BasicBlock::insertDPValueAfter(DbgRecord *DPV, Instruction *I) {
 
 void BasicBlock::insertDPValueBefore(DbgRecord *DPV,
                                      InstListType::iterator Where) {
-  // We should never directly insert at the end of the block, new DPValues
-  // shouldn't be generated at times when there's no terminator.
-  assert(Where != end());
-  assert(Where->getParent() == this);
-  if (!Where->DbgMarker)
-    createMarker(Where);
+  assert(Where == end() || Where->getParent() == this);
   bool InsertAtHead = Where.getHeadBit();
-  createMarker(&*Where);
-  Where->DbgMarker->insertDPValue(DPV, InsertAtHead);
+  DPMarker *M = createMarker(Where);
+  M->insertDPValue(DPV, InsertAtHead);
 }
 
 DPMarker *BasicBlock::getNextMarker(Instruction *I) {
diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp
index 62efaba025344..c0643f63c9725 100644
--- a/llvm/lib/IR/DIBuilder.cpp
+++ b/llvm/lib/IR/DIBuilder.cpp
@@ -925,35 +925,47 @@ DILexicalBlock *DIBuilder::createLexicalBlock(DIScope *Scope, DIFile *File,
                                      File, Line, Col);
 }
 
-Instruction *DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo,
-                                      DIExpression *Expr, const DILocation *DL,
-                                      Instruction *InsertBefore) {
+DbgInstPtr DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo,
+                                    DIExpression *Expr, const DILocation *DL,
+                                    Instruction *InsertBefore) {
   return insertDeclare(Storage, VarInfo, Expr, DL, InsertBefore->getParent(),
                        InsertBefore);
 }
 
-Instruction *DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo,
-                                      DIExpression *Expr, const DILocation *DL,
-                                      BasicBlock *InsertAtEnd) {
+DbgInstPtr DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo,
+                                    DIExpression *Expr, const DILocation *DL,
+                                    BasicBlock *InsertAtEnd) {
   // If this block already has a terminator then insert this intrinsic before
   // the terminator. Otherwise, put it at the end of the block.
   Instruction *InsertBefore = InsertAtEnd->getTerminator();
   return insertDeclare(Storage, VarInfo, Expr, DL, InsertAtEnd, InsertBefore);
 }
 
-DbgAssignIntrinsic *
-DIBuilder::insertDbgAssign(Instruction *LinkedInstr, Value *Val,
-                           DILocalVariable *SrcVar, DIExpression *ValExpr,
-                           Value *Addr, DIExpression *AddrExpr,
-                           const DILocation *DL) {
+DbgInstPtr DIBuilder::insertDbgAssign(Instruction *LinkedInstr, Value *Val,
+                                      DILocalVariable *SrcVar,
+                                      DIExpression *ValExpr, Value *Addr,
+                                      DIExpression *AddrExpr,
+                                      const DILocation *DL) {
+  auto *Link = cast_or_null<DIAssignID>(
+      LinkedInstr->getMetadata(LLVMContext::MD_DIAssignID));
+  assert(Link && "Linked instruction must have DIAssign metadata attached");
+
+  if (M.IsNewDbgInfoFormat) {
+    DPValue *DPV = DPValue::createDPVAssign(Val, SrcVar, ValExpr, Link, Addr,
+                                            AddrExpr, DL);
+    BasicBlock *InsertBB = LinkedInstr->getParent();
+    // Insert after LinkedInstr.
+    BasicBlock::iterator NextIt = std::next(LinkedInstr->getIterator());
+    Instruction *InsertBefore = NextIt == InsertBB->end() ? nullptr : &*NextIt;
+    insertDPValue(DPV, InsertBB, InsertBefore, true);
+    return DPV;
+  }
+
   LLVMContext &Ctx = LinkedInstr->getContext();
   Module *M = LinkedInstr->getModule();
   if (!AssignFn)
     AssignFn = Intrinsic::getDeclaration(M, Intrinsic::dbg_assign);
 
-  auto *Link = LinkedInstr->getMetadata(LLVMContext::MD_DIAssignID);
-  assert(Link && "Linked instruction must have DIAssign metadata attached");
-
   std::array<Value *, 6> Args = {
       MetadataAsValue::get(Ctx, ValueAsMetadata::get(Val)),
       MetadataAsValue::get(Ctx, SrcVar),
@@ -971,35 +983,36 @@ DIBuilder::insertDbgAssign(Instruction *LinkedInstr, Value *Val,
   return DVI;
 }
 
-Instruction *DIBuilder::insertLabel(DILabel *LabelInfo, const DILocation *DL,
-                                    Instruction *InsertBefore) {
+DbgInstPtr DIBuilder::insertLabel(DILabel *LabelInfo, const DILocation *DL,
+                                  Instruction *InsertBefore) {
   return insertLabel(LabelInfo, DL,
                      InsertBefore ? InsertBefore->getParent() : nullptr,
                      InsertBefore);
 }
 
-Instruction *DIBuilder::insertLabel(DILabel *LabelInfo, const DILocation *DL,
-                                    BasicBlock *InsertAtEnd) {
+DbgInstPtr DIBuilder::insertLabel(DILabel *LabelInfo, const DILocation *DL,
+                                  BasicBlock *InsertAtEnd) {
   return insertLabel(LabelInfo, DL, InsertAtEnd, nullptr);
 }
 
-Instruction *DIBuilder::insertDbgValueIntrinsic(Value *V,
-                                                DILocalVariable *VarInfo,
-                                                DIExpression *Expr,
-                                                const DILocation *DL,
-                                                Instruction *InsertBefore) {
-  Instruction *DVI = insertDbgValueIntrinsic(
+DbgInstPtr DIBuilder::insertDbgValueIntrinsic(Value *V,
+                                              DILocalVariable *VarInfo,
+                                              DIExpression *Expr,
+                                              const DILocation *DL,
+                                              Instruction *InsertBefore) {
+  DbgInstPtr DVI = insertDbgValueIntrinsic(
       V, VarInfo, Expr, DL, InsertBefore ? InsertBefore->getParent() : nullptr,
       InsertBefore);
-  cast<CallInst>(DVI)->setTailCall();
+  if (DVI.is<Instruction *>())
+    cast<CallInst>(DVI.get<Instruction *>())->setTailCall();
   return DVI;
 }
 
-Instruction *DIBuilder::insertDbgValueIntrinsic(Value *V,
-                                                DILocalVariable *VarInfo,
-                                                DIExpression *Expr,
-                                                const DILocation *DL,
-                                                BasicBlock *InsertAtEnd) {
+DbgInstPtr DIBuilder::insertDbgValueIntrinsic(Value *V,
+                                              DILocalVariable *VarInfo,
+                                              DIExpression *Expr,
+                                              const DILocation *DL,
+                                              BasicBlock *InsertAtEnd) {
   return insertDbgValueIntrinsic(V, VarInfo, Expr, DL, InsertAtEnd, nullptr);
 }
 
@@ -1023,24 +1036,37 @@ static Function *getDeclareIntrin(Module &M) {
   return Intrinsic::getDeclaration(&M, Intrinsic::dbg_declare);
 }
 
-Instruction *DIBuilder::insertDbgValueIntrinsic(
+DbgInstPtr DIBuilder::insertDbgValueIntrinsic(
     llvm::Value *Val, DILocalVariable *VarInfo, DIExpression *Expr,
     const DILocation *DL, BasicBlock *InsertBB, Instruction *InsertBefore) {
+  if (M.IsNewDbgInfoFormat) {
+    DPValue *DPV = DPValue::createDPValue(Val, VarInfo, Expr, DL);
+    insertDPValue(DPV, InsertBB, InsertBefore);
+    return DPV;
+  }
+
   if (!ValueFn)
     ValueFn = Intrinsic::getDeclaration(&M, Intrinsic::dbg_value);
   return insertDbgIntrinsic(ValueFn, Val, VarInfo, Expr, DL, InsertBB,
                             InsertBefore);
 }
 
-Instruction *DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo,
-                                      DIExpression *Expr, const DILocation *DL,
-                                      BasicBlock *InsertBB,
-                                      Instruction *InsertBefore) {
+DbgInstPtr DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo,
+                                    DIExpression *Expr, const DILocation *DL,
+                                    BasicBlock *InsertBB,
+                                    Instruction *InsertBefore) {
   assert(VarInfo && "empty or invalid DILocalVariable* passed to dbg.declare");
   assert(DL && "Expected debug loc");
   assert(DL->getScope()->getSubprogram() ==
              VarInfo->getScope()->getSubprogram() &&
          "Expected matching subprograms");
+
+  if (M.IsNewDbgInfoFormat) {
+    DPValue *DPV = DPValue::createDPVDeclare(Storage, VarInfo, Expr, DL);
+    insertDPValue(DPV, InsertBB, InsertBefore);
+    return DPV;
+  }
+
   if (!DeclareFn)
     DeclareFn = getDeclareIntrin(M);
 
@@ -1055,6 +1081,23 @@ Instruction *DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo,
   return B.CreateCall(DeclareFn, Args);
 }
 
+void DIBuilder::insertDPValue(DPValue *DPV, BasicBlock *InsertBB,
+                              Instruction *InsertBefore, bool InsertAtHead) {
+  assert(InsertBefore || InsertBB);
+  trackIfUnresolved(DPV->getVariable());
+  trackIfUnresolved(DPV->getExpression());
+  if (DPV->isDbgAssign())
+    trackIfUnresolved(DPV->getAddressExpression());
+
+  BasicBlock::iterator InsertPt;
+  if (InsertBB && InsertBefore)
+    InsertPt = InsertBefore->getIterator();
+  else if (InsertBB)
+    InsertPt = InsertBB->end();
+  InsertPt.setHeadBit(InsertAtHead);
+  InsertBB->insertDPValueBefore(DPV, InsertPt);
+}
+
 Instruction *DIBuilder::insertDbgIntrinsic(llvm::Function *IntrinsicFn,
                                            Value *V, DILocalVariable *VarInfo,
                                            DIExpression *Expr,
@@ -1081,18 +1124,28 @@ Instruction *DIBuilder::insertDbgIntrinsic(llvm::Function *IntrinsicFn,
   return B.CreateCall(IntrinsicFn, Args);
 }
 
-Instruction *DIBuilder::insertLabel(DILabel *LabelInfo, const DILocation *DL,
-                                    BasicBlock *InsertBB,
-                                    Instruction *InsertBefore) {
+DbgInstPtr DIBuilder::insertLabel(DILabel *LabelInfo, const DILocation *DL,
+                                  BasicBlock *InsertBB,
+                                  Instruction *InsertBefore) {
   assert(LabelInfo && "empty or invalid DILabel* passed to dbg.label");
   assert(DL && "Expected debug loc");
   assert(DL->getScope()->getSubprogram() ==
              LabelInfo->getScope()->getSubprogram() &&
          "Expected matching subprograms");
+
+  trackIfUnresolved(LabelInfo);
+  if (M.IsNewDbgInfoFormat) {
+    DPLabel *DPL = new DPLabel(LabelInfo, DL);
+    if (InsertBB && InsertBefore)
+      InsertBB->insertDPValueBefore(DPL, InsertBefore->getIterator());
+    else if (InsertBB)
+      InsertBB->insertDPValueBefore(DPL, InsertBB->end());
+    return DPL;
+  }
+
   if (!LabelFn)
     LabelFn = Intrinsic::getDeclaration(&M, Intrinsic::dbg_label);
 
-  trackIfUnresolved(LabelInfo);
   Value *Args[] = {MetadataAsValue::get(VMContext, LabelInfo)};
 
   IRBuilder<> B(DL->getContext());
diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp
index 1f3ff2246a445..68fd244e25697 100644
--- a/llvm/lib/IR/DebugInfo.cpp
+++ b/llvm/lib/IR/DebugInfo.cpp
@@ -1663,43 +1663,47 @@ LLVMValueRef
 LLVMDIBuilderInsertDeclareBefore(LLVMDIBuilderRef Builder, LLVMValueRef Storage,
                                  LLVMMetadataRef VarInfo, LLVMMetadataRef Expr,
                                  LLVMMetadataRef DL, LLVMValueRef Instr) {
-  return wrap(unwrap(Builder)->insertDeclare(
-                  unwrap(Storage), unwrap<DILocalVariable>(VarInfo),
-                  unwrap<DIExpression>(Expr), unwrap<DILocation>(DL),
-                  unwrap<Instruction>(Instr)));
-}
-
-LLVMValueRef LLVMDIBuilderInsertDeclareAtEnd(
-    LLVMDIBuilderRef Builder, LLVMValueRef Storage, LLVMMetadataRef VarInfo,
-    LLVMMetadataRef Expr, LLVMMetadataRef DL, LLVMBasicBlockRef Block) {
-  return wrap(unwrap(Builder)->insertDeclare(
-                  unwrap(Storage), unwrap<DILocalVariable>(VarInfo),
-                  unwrap<DIExpression>(Expr), unwrap<DILocation>(DL),
-                  unwrap(Block)));
-}
-
-LLVMValueRef LLVMDIBuilderInsertDbgValueBefore(LLVMDIBuilderRef Builder,
-                                               LLVMValueRef Val,
-                                               LLVMMetadataRef VarInfo,
-                                               LLVMMetadataRef Expr,
-                                               LLVMMetadataRef DebugLoc,
-                                               LLVMValueRef Instr) {
-  return wrap(unwrap(Builder)->insertDbgValueIntrinsic(
-                  unwrap(Val), unwrap<DILocalVariable>(VarInfo),
-                  unwrap<DIExpression>(Expr), unwrap<DILocation>(DebugLoc),
-                  unwrap<Instruction>(Instr)));
-}
-
-LLVMValueRef LLVMDIBuilderInsertDbgValueAtEnd(LLVMDIBuilderRef Builder,
-                                              LLVMValueRef Val,
-                                              LLVMMetadataRef VarInfo,
-                                              LLVMMetadataRef Expr,
-                                              LLVMMetadataRef DebugLoc,
-                                              LLVMBasicBlockRef Block) {
-  return wrap(unwrap(Builder)->insertDbgValueIntrinsic(
-                  unwrap(Val), unwrap<DILocalVariable>(VarInfo),
-                  unwrap<DIExpression>(Expr), unwrap<DILocation>(DebugLoc),
-                  unwrap(Block)));
+  DbgInstPtr DbgInst = unwrap(Builder)->insertDeclare(
+      unwrap(Storage), unwrap<DILocalVariable>(VarInfo),
+      unwrap<DIExpression>(Expr), unwrap<DILocation>(DL),
+      unwrap<Instruction>(Instr));
+  assert(isa<Instruction *>(DbgInst) &&
+         "Inserted a DbgRecord into function using old debug info mode");
+  return wrap(cast<Instruction *>(DbgInst));
+}
+
+LLVMValueRef
+LLVMDIBuilderInsertDeclareAtEnd(LLVMDIBuilderRef Builder, LLVMValueRef Storage,
+                                LLVMMetadataRef VarInfo, LLVMMetadataRef Expr,
+                                LLVMMetadataRef DL, LLVMBasicBlockRef Block) {
+  DbgInstPtr DbgInst = unwrap(Builder)->insertDeclare(
+      unwrap(Storage), unwrap<DILocalVariable>(VarInfo),
+      unwrap<DIExpression>(Expr), unwrap<DILocation>(DL), unwrap(Block));
+  assert(isa<Instruction *>(DbgInst) &&
+         "Inserted a DbgRecord into function using old debug info mode");
+  return wrap(cast<Instruction *>(DbgInst));
+}
+
+LLVMValueRef LLVMDIBuilderInsertDbgValueBefore(
+    LLVMDIBuilderRef Builder, LLVMValueRef Val, LLVMMetadataRef VarInfo,
+    LLVMMetadataRef Expr, LLVMMetadataRef DebugLoc, LLVMValueRef Instr) {
+  DbgInstPtr DbgInst = unwrap(Builder)->insertDbgValueIntrinsic(
+      unwrap(Val), unwrap<DILocalVariable>(VarInfo), unwrap<DIExpression>(Expr),
+      unwrap<DILocation>(DebugLoc), unwrap<Instruction>(Instr));
+  assert(isa<Instruction *>(DbgInst) &&
+         "Inserted a DbgRecord into function using old debug info mode");
+  return wrap(cast<Instruction *>(DbgInst));
+}
+
+LLVMValueRef LLVMDIBuilderInsertDbgValueAtEnd(
+    LLVMDIBuilderRef Builder, LLVMValueRef Val, LLVMMetadataRef VarInfo,
+    LLVMMetadataRef Expr, LLVMMetadataRef DebugLoc, LLVMBasicBlockRef Block) {
+  DbgInstPtr DbgInst = unwrap(Builder)->insertDbgValueIntrinsic(
+      unwrap(Val), unwrap<DILocalVariable>(VarInfo), unwrap<DIExpression>(Expr),
+      unwrap<DILocation>(DebugLoc), unwrap(Block));
+  assert(isa<Instruction *>(DbgInst) &&
+         "Inserted a DbgRecord into function using old debug info mode");
+  return wrap(cast<Instruction *>(DbgInst));
 }
 
 LLVMMetadataRef LLVMDIBuilderCreateAutoVariable(
@@ -2115,10 +2119,15 @@ static void emitDbgAssign(AssignmentInfo Info, Value *Val, Value *Dest,
     LLVM_DEBUG(if (Assign) errs() << " > INSERT: " << *Assign << "\n");
     return;
   }
-  auto *Assign = DIB.insertDbgAssign(&StoreLikeInst, Val, VarRec.Var, Expr,
-                                     Dest, AddrExpr, VarRec.DL);
+  auto Assign = DIB.insertDbgAssign(&StoreLikeInst, Val, VarRec.Var, Expr, Dest,
+                                    AddrExpr, VarRec.DL);
   (void)Assign;
-  LLVM_DEBUG(if (Assign) errs() << " > INSERT: " << *Assign << "\n");
+  LLVM_DEBUG(if (!Assign.isNull()) {
+    if (Assign.is<DbgRecord *>())
+      errs() << " > INSERT: " << *Assign.get<DbgRecord *>() << "\n";
+    else
+      errs() << " > INSERT: " << *Assign.get<Instruction *>() << "\n";
+  });
 }
 
 #undef DEBUG_TYPE // Silence redefinition warning (from ConstantsContext.h).
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index 056e4f31981a7..d22e1c1231118 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -700,6 +700,10 @@ Attribute Function::getFnAttribute(StringRef Kind) const {
   return AttributeSets.getFnAttr(Kind);
 }
 
+Attribute Function::getRetAttribute(Attribute::AttrKind Kind) const {
+  return AttributeSets.getRetAttr(Kind);
+}
+
 uint64_t Function::getFnAttributeAsParsedInteger(StringRef Name,
                                                  uint64_t Default) const {
   Attribute A = getFnAttribute(Name);
diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
index e863ef3eb8d6d..6b8c6e0c85ed9 100644
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -166,7 +166,8 @@ void Instruction::insertBefore(BasicBlock &BB,
   }
 
   // If we're inserting a terminator, check if we need to flush out
-  // TrailingDPValues.
+  // TrailingDPValues. Inserting instructions at the end of an incomplete
+  // block is handled by the code block above.
   if (isTerminator())
     getParent()->flushTerminatorDbgValues();
 }
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index cbbbec0ccc8c4..cb892e30c4a0b 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -299,9 +299,7 @@ static cl::opt<bool> UseLoopVersioningLICM(
     cl::desc("Enable the experimental Loop Versioning LICM pass"));
 
 namespace llvm {
-cl::opt<bool> EnableMemProfContextDisambiguation(
-    "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,
-    cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));
+extern cl::opt<bool> EnableMemProfContextDisambiguation;
 
 extern cl::opt<bool> EnableInferAlignmentPass;
 } // namespace llvm
diff --git a/llvm/lib/Support/KnownBits.cpp b/llvm/lib/Support/KnownBits.cpp
index 74d857457aec1..c33c3680825a1 100644
--- a/llvm/lib/Support/KnownBits.cpp
+++ b/llvm/lib/Support/KnownBits.cpp
@@ -343,7 +343,7 @@ KnownBits KnownBits::shl(const KnownBits &LHS, const KnownBits &RHS, bool NUW,
 }
 
 KnownBits KnownBits::lshr(const KnownBits &LHS, const KnownBits &RHS,
-                          bool ShAmtNonZero) {
+                          bool ShAmtNonZero, bool Exact) {
   unsigned BitWidth = LHS.getBitWidth();
   auto ShiftByConst = [&](const KnownBits &LHS, unsigned ShiftAmt) {
     KnownBits Known = LHS;
@@ -367,6 +367,18 @@ KnownBits KnownBits::lshr(const KnownBits &LHS, const KnownBits &RHS,
   // Find the common bits from all possible shifts.
   APInt MaxValue = RHS.getMaxValue();
   unsigned MaxShiftAmount = getMaxShiftAmount(MaxValue, BitWidth);
+
+  // If exact, bound MaxShiftAmount to first known 1 in LHS.
+  if (Exact) {
+    unsigned FirstOne = LHS.countMaxTrailingZeros();
+    if (FirstOne < MinShiftAmount) {
+      // Always poison. Return zero because we don't like returning conflict.
+      Known.setAllZero();
+      return Known;
+    }
+    MaxShiftAmount = std::min(MaxShiftAmount, FirstOne);
+  }
+
   unsigned ShiftAmtZeroMask = RHS.Zero.zextOrTrunc(32).getZExtValue();
   unsigned ShiftAmtOneMask = RHS.One.zextOrTrunc(32).getZExtValue();
   Known.Zero.setAllBits();
@@ -389,7 +401,7 @@ KnownBits KnownBits::lshr(const KnownBits &LHS, const KnownBits &RHS,
 }
 
 KnownBits KnownBits::ashr(const KnownBits &LHS, const KnownBits &RHS,
-                          bool ShAmtNonZero) {
+                          bool ShAmtNonZero, bool Exact) {
   unsigned BitWidth = LHS.getBitWidth();
   auto ShiftByConst = [&](const KnownBits &LHS, unsigned ShiftAmt) {
     KnownBits Known = LHS;
@@ -415,6 +427,18 @@ KnownBits KnownBits::ashr(const KnownBits &LHS, const KnownBits &RHS,
   // Find the common bits from all possible shifts.
   APInt MaxValue = RHS.getMaxValue();
   unsigned MaxShiftAmount = getMaxShiftAmount(MaxValue, BitWidth);
+
+  // If exact, bound MaxShiftAmount to first known 1 in LHS.
+  if (Exact) {
+    unsigned FirstOne = LHS.countMaxTrailingZeros();
+    if (FirstOne < MinShiftAmount) {
+      // Always poison. Return zero because we don't like returning conflict.
+      Known.setAllZero();
+      return Known;
+    }
+    MaxShiftAmount = std::min(MaxShiftAmount, FirstOne);
+  }
+
   unsigned ShiftAmtZeroMask = RHS.Zero.zextOrTrunc(32).getZExtValue();
   unsigned ShiftAmtOneMask = RHS.One.zextOrTrunc(32).getZExtValue();
   Known.Zero.setAllBits();
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 23b1deb3697f6..bb268b2ba926c 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -398,7 +398,7 @@ AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
   if (GV->isTagged())
     return AArch64II::MO_GOT;
 
-  if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) {
+  if (!TM.shouldAssumeDSOLocal(GV)) {
     if (GV->hasDLLImportStorageClass()) {
       return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT;
     }
@@ -435,8 +435,7 @@ unsigned AArch64Subtarget::classifyGlobalFunctionReference(
   // NonLazyBind goes via GOT unless we know it's available locally.
   auto *F = dyn_cast<Function>(GV);
   if ((!isTargetMachO() || MachOUseNonLazyBind) && F &&
-      F->hasFnAttribute(Attribute::NonLazyBind) &&
-      !TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
+      F->hasFnAttribute(Attribute::NonLazyBind) && !TM.shouldAssumeDSOLocal(GV))
     return AArch64II::MO_GOT;
 
   if (getTargetTriple().isOSWindows()) {
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index 0f3c3cb96e6ce..7a49422c064b7 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -5934,13 +5934,16 @@ bool AArch64InstructionSelector::selectBuildVector(MachineInstr &I,
 
   // Keep track of the last MI we inserted. Later on, we might be able to save
   // a copy using it.
-  MachineInstr *PrevMI = nullptr;
+  MachineInstr *PrevMI = ScalarToVec;
   for (unsigned i = 2, e = DstSize / EltSize + 1; i < e; ++i) {
     // Note that if we don't do a subregister copy, we can end up making an
     // extra register.
-    PrevMI = &*emitLaneInsert(std::nullopt, DstVec, I.getOperand(i).getReg(),
-                              i - 1, RB, MIB);
-    DstVec = PrevMI->getOperand(0).getReg();
+    Register OpReg = I.getOperand(i).getReg();
+    // Do not emit inserts for undefs
+    if (!getOpcodeDef<GImplicitDef>(OpReg, MRI)) {
+      PrevMI = &*emitLaneInsert(std::nullopt, DstVec, OpReg, i - 1, RB, MIB);
+      DstVec = PrevMI->getOperand(0).getReg();
+    }
   }
 
   // If DstTy's size in bits is less than 128, then emit a subregister copy
@@ -5973,11 +5976,27 @@ bool AArch64InstructionSelector::selectBuildVector(MachineInstr &I,
     RegOp.setReg(Reg);
     RBI.constrainGenericRegister(DstReg, *RC, MRI);
   } else {
-    // We don't need a subregister copy. Save a copy by re-using the
-    // destination register on the final insert.
-    assert(PrevMI && "PrevMI was null?");
+    // We either have a vector with all elements (except the first one) undef or
+    // at least one non-undef non-first element. In the first case, we need to
+    // constrain the output register ourselves as we may have generated an
+    // INSERT_SUBREG operation which is a generic operation for which the
+    // output regclass cannot be automatically chosen.
+    //
+    // In the second case, there is no need to do this as it may generate an
+    // instruction like INSvi32gpr where the regclass can be automatically
+    // chosen.
+    //
+    // Also, we save a copy by re-using the destination register on the final
+    // insert.
     PrevMI->getOperand(0).setReg(I.getOperand(0).getReg());
     constrainSelectedInstRegOperands(*PrevMI, TII, TRI, RBI);
+
+    Register DstReg = PrevMI->getOperand(0).getReg();
+    if (PrevMI == ScalarToVec && DstReg.isVirtual()) {
+      const TargetRegisterClass *RC =
+          getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(DstVec, MRI, TRI));
+      RBI.constrainGenericRegister(DstReg, *RC, MRI);
+    }
   }
 
   I.eraseFromParent();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 7183148e13103..c877658cd38e2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1903,7 +1903,7 @@ def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">,
 
 def HasTrue16BitInsts : Predicate<"Subtarget->hasTrue16BitInsts()">,
   AssemblerPredicate<(all_of FeatureTrue16BitInsts)>;
-def NotHasTrue16BitInsts : Predicate<"!Subtarget->hasTrue16BitInsts()">;
+def NotHasTrue16BitInsts : True16PredicateClass<"!Subtarget->hasTrue16BitInsts()">;
 
 // Control use of True16 instructions. The real True16 instructions are
 // True16 instructions as they are defined in the ISA. Fake True16
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index a1bbe170ee29a..c7091028b3b5e 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -2691,9 +2691,8 @@ defm BUFFER_LOAD_SBYTE_D16        : MUBUF_Real_AllAddr_gfx10<0x022>;
 defm BUFFER_LOAD_SBYTE_D16_HI     : MUBUF_Real_AllAddr_gfx10<0x023>;
 defm BUFFER_LOAD_SHORT_D16        : MUBUF_Real_AllAddr_gfx10<0x024>;
 defm BUFFER_LOAD_SHORT_D16_HI     : MUBUF_Real_AllAddr_gfx10<0x025>;
-// FIXME-GFX10: Add following instructions:
-//defm BUFFER_LOAD_FORMAT_D16_HI_X  : MUBUF_Real_AllAddr_gfx10<0x026>;
-//defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Real_AllAddr_gfx10<0x027>;
+defm BUFFER_LOAD_FORMAT_D16_HI_X  : MUBUF_Real_AllAddr_gfx10<0x026>;
+defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Real_AllAddr_gfx10<0x027>;
 defm BUFFER_LOAD_FORMAT_D16_X     : MUBUF_Real_AllAddr_gfx10<0x080>;
 defm BUFFER_LOAD_FORMAT_D16_XY    : MUBUF_Real_AllAddr_gfx10<0x081>;
 defm BUFFER_LOAD_FORMAT_D16_XYZ   : MUBUF_Real_AllAddr_gfx10<0x082>;
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index a84227ebf506f..cc763df5a4760 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -1210,13 +1210,13 @@ class Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<bits<8> op, DS_Pseudo ps, int ef,
 // GFX12.
 //===----------------------------------------------------------------------===//
 
-multiclass DS_Real_gfx12<bits<8> op, string name = !tolower(NAME)> {
+multiclass DS_Real_gfx12<bits<8> op, string name = !tolower(NAME), bit needAlias = true> {
   defvar ps = !cast<DS_Pseudo>(NAME);
   let AssemblerPredicate = isGFX12Plus, DecoderNamespace = "GFX12" in
     def _gfx12 :
       Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<op, ps, SIEncodingFamily.GFX12,
                                                name, /*hasGDS=*/false>;
-  if !ne(ps.Mnemonic, name) then
+  if !and(needAlias, !ne(ps.Mnemonic, name)) then
     def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX12Plus]>;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 1889ab0072880..9bc1b8eb598f3 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6219,8 +6219,7 @@ bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
   // address space for functions to avoid the explicit check.
   return (GV->getValueType()->isFunctionTy() ||
           !isNonGlobalAddrSpace(GV->getAddressSpace())) &&
-         !shouldEmitFixup(GV) &&
-         !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
+         !shouldEmitFixup(GV) && !getTargetMachine().shouldAssumeDSOLocal(GV);
 }
 
 bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 8a92aa8228f12..f136a434971c8 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -199,7 +199,7 @@ multiclass VOP2Inst_t16<string opName,
                         SDPatternOperator node = null_frag,
                         string revOp = opName,
                         bit GFX9Renamed = 0> {
-  let SubtargetPredicate = NotHasTrue16BitInsts, OtherPredicates = [Has16BitInsts]  in {
+  let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in {
     defm NAME : VOP2Inst<opName, P, node, revOp, GFX9Renamed>;
   }
   let SubtargetPredicate = UseRealTrue16Insts in {
@@ -219,7 +219,7 @@ multiclass VOP2Inst_e64_t16<string opName,
                         SDPatternOperator node = null_frag,
                         string revOp = opName,
                         bit GFX9Renamed = 0> {
-  let SubtargetPredicate = NotHasTrue16BitInsts, OtherPredicates = [Has16BitInsts]  in {
+  let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in {
     defm NAME : VOP2Inst<opName, P, node, revOp, GFX9Renamed>;
   }
   let SubtargetPredicate = HasTrue16BitInsts in {
@@ -900,7 +900,7 @@ def LDEXP_F16_VOPProfile_True16 : VOPProfile_Fake16<VOP_F16_F16_F16> {
 
 let isReMaterializable = 1 in {
 let FPDPRounding = 1 in {
-  let SubtargetPredicate = NotHasTrue16BitInsts, OtherPredicates = [Has16BitInsts]  in
+  let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in
     defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", LDEXP_F16_VOPProfile>;
   let SubtargetPredicate = HasTrue16BitInsts in
     defm V_LDEXP_F16_t16 : VOP2Inst <"v_ldexp_f16_t16", LDEXP_F16_VOPProfile_True16>;
@@ -950,7 +950,7 @@ let SubtargetPredicate = isGFX11Plus in {
 } // End SubtargetPredicate = isGFX11Plus
 
 let FPDPRounding = 1, isReMaterializable = 1, FixedSize = 1 in {
-let SubtargetPredicate = isGFX10Plus, OtherPredicates = [NotHasTrue16BitInsts] in {
+let SubtargetPredicate = isGFX10Plus, True16Predicate = NotHasTrue16BitInsts in {
 def V_FMAMK_F16 : VOP2_Pseudo <"v_fmamk_f16", VOP_MADMK_F16, [], "">;
 }
 let SubtargetPredicate = HasTrue16BitInsts in {
@@ -958,7 +958,7 @@ def V_FMAMK_F16_t16 : VOP2_Pseudo <"v_fmamk_f16_t16", VOP_MADMK_F16_t16, [], "">
 }
 
 let isCommutable = 1 in {
-let SubtargetPredicate = isGFX10Plus, OtherPredicates = [NotHasTrue16BitInsts] in {
+let SubtargetPredicate = isGFX10Plus, True16Predicate = NotHasTrue16BitInsts in {
 def V_FMAAK_F16 : VOP2_Pseudo <"v_fmaak_f16", VOP_MADAK_F16, [], "">;
 }
 let SubtargetPredicate = HasTrue16BitInsts in {
@@ -971,7 +971,7 @@ let Constraints = "$vdst = $src2",
     DisableEncoding="$src2",
     isConvertibleToThreeAddress = 1,
     isCommutable = 1 in {
-let SubtargetPredicate = isGFX10Plus, OtherPredicates = [NotHasTrue16BitInsts] in {
+let SubtargetPredicate = isGFX10Plus, True16Predicate = NotHasTrue16BitInsts in {
 defm V_FMAC_F16 : VOP2Inst <"v_fmac_f16", VOP_MAC_F16>;
 }
 let SubtargetPredicate = HasTrue16BitInsts in {
diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index e5e82447d55fb..022fb7cb67754 100644
--- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -408,7 +408,7 @@ def VOPC_I64_I64 : VOPC_NoSdst_Profile<[Write64Bit], i64>;
 
 multiclass VOPC_F16 <string opName, SDPatternOperator cond = COND_NULL,
                      string revOp = opName> {
-  let OtherPredicates = [NotHasTrue16BitInsts, Has16BitInsts]  in {
+  let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in {
     defm NAME : VOPC_Pseudos <opName, VOPC_I1_F16_F16, cond, revOp, 0>;
   }
   let OtherPredicates = [HasTrue16BitInsts] in {
@@ -424,7 +424,7 @@ multiclass VOPC_F64 <string opName, SDPatternOperator cond = COND_NULL, string r
 
 multiclass VOPC_I16 <string opName, SDPatternOperator cond = COND_NULL,
                      string revOp = opName> {
-  let OtherPredicates = [NotHasTrue16BitInsts, Has16BitInsts]  in {
+  let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in {
     defm NAME : VOPC_Pseudos <opName, VOPC_I1_I16_I16, cond, revOp, 0>;
   }
   let OtherPredicates = [HasTrue16BitInsts] in {
@@ -439,7 +439,7 @@ multiclass VOPC_I64 <string opName, SDPatternOperator cond = COND_NULL, string r
   VOPC_Pseudos <opName, VOPC_I1_I64_I64, cond, revOp, 0>;
 
 multiclass VOPCX_F16<string opName, string revOp = opName> {
-  let OtherPredicates = [NotHasTrue16BitInsts, Has16BitInsts]  in {
+  let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in {
     defm NAME : VOPCX_Pseudos <opName, VOPC_I1_F16_F16, VOPC_F16_F16, COND_NULL, revOp>;
   }
   let OtherPredicates = [HasTrue16BitInsts] in {
@@ -454,7 +454,7 @@ multiclass VOPCX_F64 <string opName, string revOp = opName> :
   VOPCX_Pseudos <opName, VOPC_I1_F64_F64, VOPC_F64_F64, COND_NULL, revOp>;
 
 multiclass VOPCX_I16<string opName, string revOp = opName> {
-  let OtherPredicates = [NotHasTrue16BitInsts, Has16BitInsts]  in {
+  let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in {
     defm NAME : VOPCX_Pseudos <opName, VOPC_I1_I16_I16, VOPC_I16_I16, COND_NULL, revOp>;
   }
   let OtherPredicates = [HasTrue16BitInsts] in {
@@ -940,7 +940,7 @@ def VOPC_F32_I32 : VOPC_Class_NoSdst_Profile<[Write32Bit], f32>;
 def VOPC_F64_I32 : VOPC_Class_NoSdst_Profile<[Write64Bit], f64>;
 
 multiclass VOPC_CLASS_F16 <string opName> {
-  let OtherPredicates = [NotHasTrue16BitInsts, Has16BitInsts]  in {
+  let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in {
     defm NAME : VOPC_Class_Pseudos <opName, VOPC_I1_F16_I16, 0>;
   }
   let OtherPredicates = [HasTrue16BitInsts] in {
@@ -949,7 +949,7 @@ multiclass VOPC_CLASS_F16 <string opName> {
 }
 
 multiclass VOPCX_CLASS_F16 <string opName> {
-  let OtherPredicates = [NotHasTrue16BitInsts, Has16BitInsts]  in {
+  let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in {
     defm NAME : VOPCX_Class_Pseudos <opName, VOPC_I1_F16_I16, VOPC_F16_I16>;
   }
   let OtherPredicates = [HasTrue16BitInsts] in {
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index dc81178311b6d..7ac49782ea846 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -2655,12 +2655,10 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   bool isDirect = false;
 
   const TargetMachine &TM = getTargetMachine();
-  const Module *Mod = MF.getFunction().getParent();
   const GlobalValue *GVal = nullptr;
   if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
     GVal = G->getGlobal();
-  bool isStub =
-      !TM.shouldAssumeDSOLocal(*Mod, GVal) && Subtarget->isTargetMachO();
+  bool isStub = !TM.shouldAssumeDSOLocal(GVal) && Subtarget->isTargetMachO();
 
   bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
   bool isLocalARMFunc = false;
@@ -2737,7 +2735,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
         unsigned TargetFlags = ARMII::MO_NO_FLAG;
         if (GVal->hasDLLImportStorageClass())
           TargetFlags = ARMII::MO_DLLIMPORT;
-        else if (!TM.shouldAssumeDSOLocal(*GVal->getParent(), GVal))
+        else if (!TM.shouldAssumeDSOLocal(GVal))
           TargetFlags = ARMII::MO_COFFSTUB;
         Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, /*offset=*/0,
                                             TargetFlags);
@@ -4021,7 +4019,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
   ARMII::TOF TargetFlags = ARMII::MO_NO_FLAG;
   if (GV->hasDLLImportStorageClass())
     TargetFlags = ARMII::MO_DLLIMPORT;
-  else if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
+  else if (!TM.shouldAssumeDSOLocal(GV))
     TargetFlags = ARMII::MO_COFFSTUB;
   EVT PtrVT = getPointerTy(DAG.getDataLayout());
   SDValue Result;
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.cpp b/llvm/lib/Target/ARM/ARMSubtarget.cpp
index 717e61518c6ee..04ba20a17187b 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.cpp
+++ b/llvm/lib/Target/ARM/ARMSubtarget.cpp
@@ -353,7 +353,7 @@ bool ARMSubtarget::isRWPI() const {
 }
 
 bool ARMSubtarget::isGVIndirectSymbol(const GlobalValue *GV) const {
-  if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
+  if (!TM.shouldAssumeDSOLocal(GV))
     return true;
 
   // 32 bit macho has no relocation for a-b if a is undefined, even if b is in
diff --git a/llvm/lib/Target/CSKY/CSKYISelLowering.cpp b/llvm/lib/Target/CSKY/CSKYISelLowering.cpp
index 90f70b83a02d3..869277a391a56 100644
--- a/llvm/lib/Target/CSKY/CSKYISelLowering.cpp
+++ b/llvm/lib/Target/CSKY/CSKYISelLowering.cpp
@@ -649,8 +649,7 @@ SDValue CSKYTargetLowering::LowerCall(CallLoweringInfo &CLI,
 
   if (GlobalAddressSDNode *S = dyn_cast<GlobalAddressSDNode>(Callee)) {
     const GlobalValue *GV = S->getGlobal();
-    bool IsLocal =
-        getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
+    bool IsLocal = getTargetMachine().shouldAssumeDSOLocal(GV);
 
     if (isPositionIndependent() || !Subtarget.has2E3()) {
       IsRegCall = true;
@@ -662,8 +661,7 @@ SDValue CSKYTargetLowering::LowerCall(CallLoweringInfo &CLI,
           cast<GlobalAddressSDNode>(Callee), Ty, DAG, CSKYII::MO_None));
     }
   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
-    bool IsLocal = getTargetMachine().shouldAssumeDSOLocal(
-        *MF.getFunction().getParent(), nullptr);
+    bool IsLocal = getTargetMachine().shouldAssumeDSOLocal(nullptr);
 
     if (isPositionIndependent() || !Subtarget.has2E3()) {
       IsRegCall = true;
@@ -1153,7 +1151,7 @@ SDValue CSKYTargetLowering::LowerGlobalAddress(SDValue Op,
   int64_t Offset = N->getOffset();
 
   const GlobalValue *GV = N->getGlobal();
-  bool IsLocal = getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
+  bool IsLocal = getTargetMachine().shouldAssumeDSOLocal(GV);
   SDValue Addr = getAddr<GlobalAddressSDNode, false>(N, DAG, IsLocal);
 
   // In order to maximise the opportunity for common subexpression elimination,
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
index eda1150835a1f..41462cceef51d 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -1238,7 +1238,7 @@ HexagonTargetLowering::LowerGLOBALADDRESS(SDValue Op, SelectionDAG &DAG) const {
     return DAG.getNode(HexagonISD::CONST32, dl, PtrVT, GA);
   }
 
-  bool UsePCRel = getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
+  bool UsePCRel = getTargetMachine().shouldAssumeDSOLocal(GV);
   if (UsePCRel) {
     SDValue GA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset,
                                             HexagonII::MO_PCREL);
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 2d71423d6dd59..c87f5341d7fea 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -4251,14 +4251,12 @@ LoongArchTargetLowering::LowerCall(CallLoweringInfo &CLI,
   // split it and then direct call can be matched by PseudoCALL.
   if (GlobalAddressSDNode *S = dyn_cast<GlobalAddressSDNode>(Callee)) {
     const GlobalValue *GV = S->getGlobal();
-    unsigned OpFlags =
-        getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV)
-            ? LoongArchII::MO_CALL
-            : LoongArchII::MO_CALL_PLT;
+    unsigned OpFlags = getTargetMachine().shouldAssumeDSOLocal(GV)
+                           ? LoongArchII::MO_CALL
+                           : LoongArchII::MO_CALL_PLT;
     Callee = DAG.getTargetGlobalAddress(S->getGlobal(), DL, PtrVT, 0, OpFlags);
   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
-    unsigned OpFlags = getTargetMachine().shouldAssumeDSOLocal(
-                           *MF.getFunction().getParent(), nullptr)
+    unsigned OpFlags = getTargetMachine().shouldAssumeDSOLocal(nullptr)
                            ? LoongArchII::MO_CALL
                            : LoongArchII::MO_CALL_PLT;
     Callee = DAG.getTargetExternalSymbol(S->getSymbol(), PtrVT, OpFlags);
diff --git a/llvm/lib/Target/M68k/M68kSubtarget.cpp b/llvm/lib/Target/M68k/M68kSubtarget.cpp
index 86e81cd08ea26..3af1e994c01cd 100644
--- a/llvm/lib/Target/M68k/M68kSubtarget.cpp
+++ b/llvm/lib/Target/M68k/M68kSubtarget.cpp
@@ -175,7 +175,7 @@ M68kSubtarget::classifyLocalReference(const GlobalValue *GV) const {
 }
 
 unsigned char M68kSubtarget::classifyExternalReference(const Module &M) const {
-  if (TM.shouldAssumeDSOLocal(M, nullptr))
+  if (TM.shouldAssumeDSOLocal(nullptr))
     return classifyLocalReference(nullptr);
 
   if (isPositionIndependent())
@@ -191,7 +191,7 @@ M68kSubtarget::classifyGlobalReference(const GlobalValue *GV) const {
 
 unsigned char M68kSubtarget::classifyGlobalReference(const GlobalValue *GV,
                                                      const Module &M) const {
-  if (TM.shouldAssumeDSOLocal(M, GV))
+  if (TM.shouldAssumeDSOLocal(GV))
     return classifyLocalReference(GV);
 
   switch (TM.getCodeModel()) {
@@ -240,7 +240,7 @@ unsigned char
 M68kSubtarget::classifyGlobalFunctionReference(const GlobalValue *GV,
                                                const Module &M) const {
   // local always use pc-rel referencing
-  if (TM.shouldAssumeDSOLocal(M, GV))
+  if (TM.shouldAssumeDSOLocal(GV))
     return M68kII::MO_NO_FLAG;
 
   // If the function is marked as non-lazy, generate an indirect call
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index c979c03dc1b83..c411c8ef9528d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -6100,6 +6100,9 @@ NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
 
   if (AI->isFloatingPointOperation()) {
     if (AI->getOperation() == AtomicRMWInst::BinOp::FAdd) {
+      if (Ty->isHalfTy() && STI.getSmVersion() >= 70 &&
+          STI.getPTXVersion() >= 63)
+        return AtomicExpansionKind::None;
       if (Ty->isFloatTy())
         return AtomicExpansionKind::None;
       if (Ty->isDoubleTy() && STI.hasAtomAddF64())
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 477789a164ead..869b13369e87e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1630,6 +1630,13 @@ defm INT_PTX_ATOM_ADD_GEN_64 : F_ATOMIC_2<i64, Int64Regs, "", ".u64", ".add",
 defm INT_PTX_ATOM_ADD_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global", ".u64",
   ".add", atomic_load_add_64_gen, i64imm, imm>;
 
+defm INT_PTX_ATOM_ADD_G_F16 : F_ATOMIC_2<f16, Int16Regs, ".global", ".f16", ".add.noftz",
+  atomic_load_add_g, f16imm, fpimm, [hasSM<70>, hasPTX<63>]>;
+defm INT_PTX_ATOM_ADD_S_F16 : F_ATOMIC_2<f16, Int16Regs, ".shared", ".f16", ".add.noftz",
+  atomic_load_add_s, f16imm, fpimm, [hasSM<70>, hasPTX<63>]>;
+defm INT_PTX_ATOM_ADD_GEN_F16 : F_ATOMIC_2<f16, Int16Regs, "", ".f16", ".add.noftz",
+  atomic_load_add_gen, f16imm, fpimm, [hasSM<70>, hasPTX<63>]>;
+
 defm INT_PTX_ATOM_ADD_G_F32 : F_ATOMIC_2<f32, Float32Regs, ".global", ".f32", ".add",
   atomic_load_add_g, f32imm, fpimm>;
 defm INT_PTX_ATOM_ADD_S_F32 : F_ATOMIC_2<f32, Float32Regs, ".shared", ".f32", ".add",
@@ -2007,6 +2014,9 @@ multiclass ATOM2P_impl<string AsmStr,  Intrinsic Intr,
                        SDNode Imm, ValueType ImmTy,
                        list<Predicate> Preds> {
   let AddedComplexity = 1 in {
+    def : ATOM23_impl<AsmStr, regT, regclass, Preds,
+                      (ins Int16Regs:$src, regclass:$b),
+                      (Intr (i16 Int16Regs:$src), (regT regclass:$b))>;
     def : ATOM23_impl<AsmStr, regT, regclass, Preds,
                       (ins Int32Regs:$src, regclass:$b),
                       (Intr (i32 Int32Regs:$src), (regT regclass:$b))>;
@@ -2017,6 +2027,9 @@ multiclass ATOM2P_impl<string AsmStr,  Intrinsic Intr,
   // tablegen can't infer argument types from Intrinsic (though it can
   // from Instruction) so we have to enforce specific type on
   // immediates via explicit cast to ImmTy.
+  def : ATOM23_impl<AsmStr, regT, regclass, Preds,
+                    (ins Int16Regs:$src, ImmType:$b),
+                    (Intr (i16 Int16Regs:$src), (ImmTy Imm:$b))>;
   def : ATOM23_impl<AsmStr, regT, regclass, Preds,
                     (ins Int32Regs:$src, ImmType:$b),
                     (Intr (i32 Int32Regs:$src), (ImmTy Imm:$b))>;
@@ -2136,6 +2149,8 @@ multiclass ATOM2_add_impl<string OpStr> {
    defm _s32  : ATOM2S_impl<OpStr, "i", "s32", i32, Int32Regs, i32imm, imm, i32, []>;
    defm _u32  : ATOM2S_impl<OpStr, "i", "u32", i32, Int32Regs, i32imm, imm, i32, []>;
    defm _u64  : ATOM2S_impl<OpStr, "i", "u64", i64, Int64Regs, i64imm, imm, i64, []>;
+   defm _f16  : ATOM2S_impl<OpStr, "f", "f16", f16, Int16Regs, f16imm, fpimm, f16,
+                            [hasSM<70>, hasPTX<63>]>;
    defm _f32  : ATOM2S_impl<OpStr, "f", "f32", f32, Float32Regs, f32imm, fpimm, f32,
                             []>;
    defm _f64  : ATOM2S_impl<OpStr, "f", "f64", f64, Float64Regs, f64imm, fpimm, f64,
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 68c80dd9aa5c7..aef2d483c6df1 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -4818,7 +4818,7 @@ static bool callsShareTOCBase(const Function *Caller,
   // If the callee is preemptable, then the static linker will use a plt-stub
   // which saves the toc to the stack, and needs a nop after the call
   // instruction to convert to a toc-restore.
-  if (!TM.shouldAssumeDSOLocal(*Caller->getParent(), CalleeGV))
+  if (!TM.shouldAssumeDSOLocal(CalleeGV))
     return false;
 
   // Functions with PC Relative enabled may clobber the TOC in the same DSO.
@@ -5420,10 +5420,9 @@ static SDValue transformCallee(const SDValue &Callee, SelectionDAG &DAG,
   // Returns true if the callee is local, and false otherwise.
   auto isLocalCallee = [&]() {
     const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
-    const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
     const GlobalValue *GV = G ? G->getGlobal() : nullptr;
 
-    return DAG.getTarget().shouldAssumeDSOLocal(*Mod, GV) &&
+    return DAG.getTarget().shouldAssumeDSOLocal(GV) &&
            !isa_and_nonnull<GlobalIFunc>(GV);
   };
 
@@ -18045,7 +18044,7 @@ bool PPCTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
       return false;
 
   // If the function is local then we have a good chance at tail-calling it
-  return getTargetMachine().shouldAssumeDSOLocal(*Caller->getParent(), Callee);
+  return getTargetMachine().shouldAssumeDSOLocal(Callee);
 }
 
 bool PPCTargetLowering::
diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
index 2735bdee3bcfc..5380ec1c4c0d9 100644
--- a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -189,7 +189,7 @@ bool PPCSubtarget::isGVIndirectSymbol(const GlobalValue *GV) const {
   // Large code model always uses the TOC even for local symbols.
   if (TM.getCodeModel() == CodeModel::Large)
     return true;
-  if (TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
+  if (TM.shouldAssumeDSOLocal(GV))
     return false;
   return true;
 }
diff --git a/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp b/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp
index 491bff7f3c30f..d0badd3692e40 100644
--- a/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp
@@ -122,7 +122,7 @@ bool SystemZSubtarget::isPC32DBLSymbol(const GlobalValue *GV,
 
   // For the small model, all locally-binding symbols are in range.
   if (CM == CodeModel::Small)
-    return TLInfo.getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
+    return TLInfo.getTargetMachine().shouldAssumeDSOLocal(GV);
 
   // For Medium and above, assume that the symbol is not within the 4GB range.
   // Taking the address of locally-defined text would be OK, but that
diff --git a/llvm/lib/Target/TargetMachine.cpp b/llvm/lib/Target/TargetMachine.cpp
index 4258a76b54b92..8b177a89c9192 100644
--- a/llvm/lib/Target/TargetMachine.cpp
+++ b/llvm/lib/Target/TargetMachine.cpp
@@ -160,8 +160,7 @@ static TLSModel::Model getSelectedTLSModel(const GlobalValue *GV) {
   llvm_unreachable("invalid TLS model");
 }
 
-bool TargetMachine::shouldAssumeDSOLocal(const Module &M,
-                                         const GlobalValue *GV) const {
+bool TargetMachine::shouldAssumeDSOLocal(const GlobalValue *GV) const {
   const Triple &TT = getTargetTriple();
   Reloc::Model RM = getRelocationModel();
 
@@ -225,7 +224,7 @@ TLSModel::Model TargetMachine::getTLSModel(const GlobalValue *GV) const {
   bool IsPIE = GV->getParent()->getPIELevel() != PIELevel::Default;
   Reloc::Model RM = getRelocationModel();
   bool IsSharedLibrary = RM == Reloc::PIC_ && !IsPIE;
-  bool IsLocal = shouldAssumeDSOLocal(*GV->getParent(), GV);
+  bool IsLocal = shouldAssumeDSOLocal(GV);
 
   TLSModel::Model Model;
   if (IsSharedLibrary) {
diff --git a/llvm/lib/Target/VE/VEISelLowering.cpp b/llvm/lib/Target/VE/VEISelLowering.cpp
index 0e41a2d7aa03e..96340f603a87e 100644
--- a/llvm/lib/Target/VE/VEISelLowering.cpp
+++ b/llvm/lib/Target/VE/VEISelLowering.cpp
@@ -648,12 +648,11 @@ SDValue VETargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   // PC-relative references to external symbols should go through $stub.
   // If so, we need to prepare GlobalBaseReg first.
   const TargetMachine &TM = DAG.getTarget();
-  const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
   const GlobalValue *GV = nullptr;
   auto *CalleeG = dyn_cast<GlobalAddressSDNode>(Callee);
   if (CalleeG)
     GV = CalleeG->getGlobal();
-  bool Local = TM.shouldAssumeDSOLocal(*Mod, GV);
+  bool Local = TM.shouldAssumeDSOLocal(GV);
   bool UsePlt = !Local;
   MachineFunction &MF = DAG.getMachineFunction();
 
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 7c47790d1e351..905ff3b901842 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -1683,7 +1683,7 @@ WebAssemblyTargetLowering::LowerGlobalTLSAddress(SDValue Op,
   if (model == GlobalValue::LocalExecTLSModel ||
       model == GlobalValue::LocalDynamicTLSModel ||
       (model == GlobalValue::GeneralDynamicTLSModel &&
-       getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV))) {
+       getTargetMachine().shouldAssumeDSOLocal(GV))) {
     // For DSO-local TLS variables we use offset from __tls_base
 
     MVT PtrVT = getPointerTy(DAG.getDataLayout());
@@ -1729,7 +1729,7 @@ SDValue WebAssemblyTargetLowering::LowerGlobalAddress(SDValue Op,
   // need special treatment for tables in PIC mode.
   if (isPositionIndependent() &&
       !WebAssembly::isWebAssemblyTableType(GV->getValueType())) {
-    if (getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV)) {
+    if (getTargetMachine().shouldAssumeDSOLocal(GV)) {
       MachineFunction &MF = DAG.getMachineFunction();
       MVT PtrVT = getPointerTy(MF.getDataLayout());
       const char *BaseName;
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index a2a65ce75d6b9..8367f938c0ddf 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -1237,8 +1237,6 @@ def ProcessorFeatures {
   // Gracemont
   list<SubtargetFeature> GRTTuning = [TuningMacroFusion,
                                       TuningSlow3OpsLEA,
-                                      TuningSlowDivide32,
-                                      TuningSlowDivide64,
                                       TuningFastScalarFSQRT,
                                       TuningFastVectorFSQRT,
                                       TuningFast15ByteNOP,
diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp
index 9f0b5f32df20a..48d3b68b1823a 100644
--- a/llvm/lib/Target/X86/X86FastISel.cpp
+++ b/llvm/lib/Target/X86/X86FastISel.cpp
@@ -2230,7 +2230,7 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) {
     unsigned CmpOpcode =
       (RetVT == MVT::f32) ? X86::VCMPSSrri : X86::VCMPSDrri;
     unsigned BlendOpcode =
-      (RetVT == MVT::f32) ? X86::VBLENDVPSrr : X86::VBLENDVPDrr;
+      (RetVT == MVT::f32) ? X86::VBLENDVPSrrr : X86::VBLENDVPDrrr;
 
     Register CmpReg = fastEmitInst_rri(CmpOpcode, RC, CmpLHSReg, CmpRHSReg,
                                        CC);
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 4a542b7e5a1bb..69d45366a1dbc 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -6266,27 +6266,27 @@ multiclass SS41I_quaternary_avx<bits<8> opc, string OpcodeStr, RegisterClass RC,
                                 X86MemOperand x86memop, ValueType VT,
                                 PatFrag mem_frag, SDNode OpNode,
                                 X86FoldableSchedWrite sched> {
-  def rr : Ii8Reg<opc, MRMSrcReg, (outs RC:$dst),
-                  (ins RC:$src1, RC:$src2, RC:$src3),
-                  !strconcat(OpcodeStr,
-                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
-                  [(set RC:$dst, (VT (OpNode RC:$src3, RC:$src2, RC:$src1)))],
-                  SSEPackedInt>, TA, PD, VEX, VVVV,
-                Sched<[sched]>;
+  def rrr : Ii8Reg<opc, MRMSrcReg, (outs RC:$dst),
+                   (ins RC:$src1, RC:$src2, RC:$src3),
+                   !strconcat(OpcodeStr,
+                     "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+                   [(set RC:$dst, (VT (OpNode RC:$src3, RC:$src2, RC:$src1)))],
+                   SSEPackedInt>, TA, PD, VEX, VVVV,
+                 Sched<[sched]>;
 
-  def rm : Ii8Reg<opc, MRMSrcMem, (outs RC:$dst),
-                  (ins RC:$src1, x86memop:$src2, RC:$src3),
-                  !strconcat(OpcodeStr,
-                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
-                  [(set RC:$dst,
-                        (OpNode RC:$src3, (mem_frag addr:$src2),
-                                RC:$src1))], SSEPackedInt>, TA, PD, VEX, VVVV,
-                Sched<[sched.Folded, sched.ReadAfterFold,
-                       // x86memop:$src2
-                       ReadDefault, ReadDefault, ReadDefault, ReadDefault,
-                       ReadDefault,
-                       // RC::$src3
-                       sched.ReadAfterFold]>;
+  def rmr : Ii8Reg<opc, MRMSrcMem, (outs RC:$dst),
+                   (ins RC:$src1, x86memop:$src2, RC:$src3),
+                   !strconcat(OpcodeStr,
+                     "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+                   [(set RC:$dst,
+                         (OpNode RC:$src3, (mem_frag addr:$src2),
+                                 RC:$src1))], SSEPackedInt>, TA, PD, VEX, VVVV,
+                 Sched<[sched.Folded, sched.ReadAfterFold,
+                        // x86memop:$src2
+                        ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+                        ReadDefault,
+                        // RC::$src3
+                        sched.ReadAfterFold]>;
 }
 
 let Predicates = [HasAVX] in {
@@ -6320,16 +6320,16 @@ defm VPBLENDVBY : SS41I_quaternary_avx<0x4C, "vpblendvb", VR256, i256mem,
 let Predicates = [HasAVX] in {
   def : Pat<(v4i32 (X86Blendv (v4i32 VR128:$mask), (v4i32 VR128:$src1),
                               (v4i32 VR128:$src2))),
-            (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
+            (VBLENDVPSrrr VR128:$src2, VR128:$src1, VR128:$mask)>;
   def : Pat<(v2i64 (X86Blendv (v2i64 VR128:$mask), (v2i64 VR128:$src1),
                               (v2i64 VR128:$src2))),
-            (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
+            (VBLENDVPDrrr VR128:$src2, VR128:$src1, VR128:$mask)>;
   def : Pat<(v8i32 (X86Blendv (v8i32 VR256:$mask), (v8i32 VR256:$src1),
                               (v8i32 VR256:$src2))),
-            (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
+            (VBLENDVPSYrrr VR256:$src2, VR256:$src1, VR256:$mask)>;
   def : Pat<(v4i64 (X86Blendv (v4i64 VR256:$mask), (v4i64 VR256:$src1),
                               (v4i64 VR256:$src2))),
-            (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
+            (VBLENDVPDYrrr VR256:$src2, VR256:$src1, VR256:$mask)>;
 }
 
 // Prefer a movss or movsd over a blendps when optimizing for size. these were
diff --git a/llvm/lib/Target/X86/X86SchedAlderlakeP.td b/llvm/lib/Target/X86/X86SchedAlderlakeP.td
index 4dc5ea3c86112..6f9d2cf7ffdf4 100644
--- a/llvm/lib/Target/X86/X86SchedAlderlakeP.td
+++ b/llvm/lib/Target/X86/X86SchedAlderlakeP.td
@@ -2158,16 +2158,16 @@ def ADLPWriteResGroup244 : SchedWriteRes<[ADLPPort00_01_05, ADLPPort02_03_11]> {
   let Latency = 9;
   let NumMicroOps = 4;
 }
-def : InstRW<[ADLPWriteResGroup244, ReadAfterVecXLd, ReadAfterVecXLd, ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault], (instregex "^VBLENDVP(D|S)rm$")>;
-def : InstRW<[ADLPWriteResGroup244, ReadAfterVecXLd, ReadAfterVecXLd, ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault], (instrs VPBLENDVBrm)>;
+def : InstRW<[ADLPWriteResGroup244, ReadAfterVecXLd, ReadAfterVecXLd, ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault], (instregex "^VBLENDVP(D|S)rmr$")>;
+def : InstRW<[ADLPWriteResGroup244, ReadAfterVecXLd, ReadAfterVecXLd, ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault], (instrs VPBLENDVBrmr)>;
 
 def ADLPWriteResGroup245 : SchedWriteRes<[ADLPPort00_01_05]> {
   let ReleaseAtCycles = [3];
   let Latency = 3;
   let NumMicroOps = 3;
 }
-def : InstRW<[ADLPWriteResGroup245], (instregex "^VBLENDVP(D|S)rr$")>;
-def : InstRW<[ADLPWriteResGroup245], (instrs VPBLENDVBrr)>;
+def : InstRW<[ADLPWriteResGroup245], (instregex "^VBLENDVP(D|S)rrr$")>;
+def : InstRW<[ADLPWriteResGroup245], (instrs VPBLENDVBrrr)>;
 
 def ADLPWriteResGroup246 : SchedWriteRes<[ADLPPort00, ADLPPort01, ADLPPort02_03_11]> {
   let ReleaseAtCycles = [6, 7, 18];
diff --git a/llvm/lib/Target/X86/X86SchedIceLake.td b/llvm/lib/Target/X86/X86SchedIceLake.td
index c9ae9901ed5b8..3981279abc363 100644
--- a/llvm/lib/Target/X86/X86SchedIceLake.td
+++ b/llvm/lib/Target/X86/X86SchedIceLake.td
@@ -402,9 +402,9 @@ defm : ICXWriteResPair<WriteBlendZ,[ICXPort15], 1, [1], 1, 7>;
 defm : ICXWriteResPair<WriteVarBlend, [ICXPort015], 2, [2], 2, 6>; // Vector variable blends.
 defm : ICXWriteResPair<WriteVarBlendY,[ICXPort015], 2, [2], 2, 6>;
 defm : ICXWriteResPair<WriteVarBlendZ,[ICXPort05],  2, [1], 1, 6>;
-defm : ICXWriteResPair<WriteMPSAD,   [ICXPort5], 4, [2], 2, 6>; // Vector MPSAD.
-defm : ICXWriteResPair<WriteMPSADY,  [ICXPort5], 4, [2], 2, 7>;
-defm : ICXWriteResPair<WriteMPSADZ,  [ICXPort5], 4, [2], 2, 7>;
+defm : ICXWriteResPair<WriteMPSAD,   [ICXPort15,ICXPort5], 4, [1,1], 2, 6>; // Vector MPSAD.
+defm : ICXWriteResPair<WriteMPSADY,  [ICXPort15,ICXPort5], 4, [1,1], 2, 7>;
+defm : ICXWriteResPair<WriteMPSADZ,  [ICXPort15,ICXPort5], 4, [1,1], 2, 7>;
 defm : ICXWriteResPair<WritePSADBW,  [ICXPort5], 3, [1], 1, 5>; // Vector PSADBW.
 defm : ICXWriteResPair<WritePSADBWX, [ICXPort5], 3, [1], 1, 6>;
 defm : ICXWriteResPair<WritePSADBWY, [ICXPort5], 3, [1], 1, 7>;
diff --git a/llvm/lib/Target/X86/X86SchedSapphireRapids.td b/llvm/lib/Target/X86/X86SchedSapphireRapids.td
index 3c698d2c9f7a0..88bb9ad8f1d74 100644
--- a/llvm/lib/Target/X86/X86SchedSapphireRapids.td
+++ b/llvm/lib/Target/X86/X86SchedSapphireRapids.td
@@ -2673,25 +2673,25 @@ def SPRWriteResGroup259 : SchedWriteRes<[SPRPort00_01_05, SPRPort02_03_11]> {
   let Latency = 10;
   let NumMicroOps = 4;
 }
-def : InstRW<[SPRWriteResGroup259, ReadAfterVecYLd, ReadAfterVecYLd, ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault], (instregex "^VBLENDVP(D|S)Yrm$")>;
-def : InstRW<[SPRWriteResGroup259, ReadAfterVecYLd, ReadAfterVecYLd, ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault], (instrs VPBLENDVBYrm)>;
+def : InstRW<[SPRWriteResGroup259, ReadAfterVecYLd, ReadAfterVecYLd, ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault], (instregex "^VBLENDVP(D|S)Yrmr$")>;
+def : InstRW<[SPRWriteResGroup259, ReadAfterVecYLd, ReadAfterVecYLd, ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault], (instrs VPBLENDVBYrmr)>;
 
 def SPRWriteResGroup260 : SchedWriteRes<[SPRPort00_01_05]> {
   let ReleaseAtCycles = [3];
   let Latency = 3;
   let NumMicroOps = 3;
 }
-def : InstRW<[SPRWriteResGroup260], (instregex "^VBLENDVP(S|DY)rr$",
-                                               "^VBLENDVP(D|SY)rr$",
-                                               "^VPBLENDVB(Y?)rr$")>;
+def : InstRW<[SPRWriteResGroup260], (instregex "^VBLENDVP(S|DY)rrr$",
+                                               "^VBLENDVP(D|SY)rrr$",
+                                               "^VPBLENDVB(Y?)rrr$")>;
 
 def SPRWriteResGroup261 : SchedWriteRes<[SPRPort00_01_05, SPRPort02_03_11]> {
   let ReleaseAtCycles = [3, 1];
   let Latency = 9;
   let NumMicroOps = 4;
 }
-def : InstRW<[SPRWriteResGroup261, ReadAfterVecXLd, ReadAfterVecXLd, ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault], (instregex "^VBLENDVP(D|S)rm$")>;
-def : InstRW<[SPRWriteResGroup261, ReadAfterVecXLd, ReadAfterVecXLd, ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault], (instrs VPBLENDVBrm)>;
+def : InstRW<[SPRWriteResGroup261, ReadAfterVecXLd, ReadAfterVecXLd, ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault], (instregex "^VBLENDVP(D|S)rmr$")>;
+def : InstRW<[SPRWriteResGroup261, ReadAfterVecXLd, ReadAfterVecXLd, ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault], (instrs VPBLENDVBrmr)>;
 
 def SPRWriteResGroup262 : SchedWriteRes<[SPRPort00_01_05, SPRPort02_03_11]> {
   let Latency = 9;
diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp
index 07f535685e8f9..c2e6ddd7e7fa2 100644
--- a/llvm/lib/Target/X86/X86Subtarget.cpp
+++ b/llvm/lib/Target/X86/X86Subtarget.cpp
@@ -140,7 +140,7 @@ unsigned char X86Subtarget::classifyGlobalReference(const GlobalValue *GV,
     }
   }
 
-  if (TM.shouldAssumeDSOLocal(M, GV))
+  if (TM.shouldAssumeDSOLocal(GV))
     return classifyLocalReference(GV);
 
   if (isTargetCOFF()) {
@@ -190,7 +190,7 @@ X86Subtarget::classifyGlobalFunctionReference(const GlobalValue *GV) const {
 unsigned char
 X86Subtarget::classifyGlobalFunctionReference(const GlobalValue *GV,
                                               const Module &M) const {
-  if (TM.shouldAssumeDSOLocal(M, GV))
+  if (TM.shouldAssumeDSOLocal(GV))
     return X86II::MO_NO_FLAG;
 
   // Functions on COFF can be non-DSO local for three reasons:
diff --git a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
index e89ec353487ee..3aa8ea3f51471 100644
--- a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
+++ b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -653,10 +653,6 @@ static bool findArgParts(Argument *Arg, const DataLayout &DL, AAResults &AAR,
   // check to see if the pointer is guaranteed to not be modified from entry of
   // the function to each of the load instructions.
 
-  // Because there could be several/many load instructions, remember which
-  // blocks we know to be transparent to the load.
-  df_iterator_default_set<BasicBlock *, 16> TranspBlocks;
-
   for (LoadInst *Load : Loads) {
     // Check to see if the load is invalidated from the start of the block to
     // the load itself.
@@ -670,7 +666,7 @@ static bool findArgParts(Argument *Arg, const DataLayout &DL, AAResults &AAR,
     // To do this, we perform a depth first search on the inverse CFG from the
     // loading block.
     for (BasicBlock *P : predecessors(BB)) {
-      for (BasicBlock *TranspBB : inverse_depth_first_ext(P, TranspBlocks))
+      for (BasicBlock *TranspBB : inverse_depth_first(P))
         if (AAR.canBasicBlockModify(*TranspBB, Loc))
           return false;
     }
diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp
index 5c7a74dadb46a..68f9799616ae6 100644
--- a/llvm/lib/Transforms/IPO/FunctionImport.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp
@@ -163,6 +163,10 @@ static cl::opt<std::string> WorkloadDefinitions(
              "}"),
     cl::Hidden);
 
+namespace llvm {
+extern cl::opt<bool> EnableMemProfContextDisambiguation;
+}
+
 // Load lazily a module from \p FileName in \p Context.
 static std::unique_ptr<Module> loadFile(const std::string &FileName,
                                         LLVMContext &Context) {
@@ -1643,7 +1647,9 @@ Expected<bool> FunctionImporter::importFunctions(
       if (Import) {
         if (Error Err = F.materialize())
           return std::move(Err);
-        if (EnableImportMetadata) {
+        // MemProf should match function's definition and summary,
+        // 'thinlto_src_module' is needed.
+        if (EnableImportMetadata || EnableMemProfContextDisambiguation) {
           // Add 'thinlto_src_module' and 'thinlto_src_file' metadata for
           // statistics and debugging.
           F.setMetadata(
@@ -1693,7 +1699,7 @@ Expected<bool> FunctionImporter::importFunctions(
         LLVM_DEBUG(dbgs() << "Is importing aliasee fn " << GO->getGUID() << " "
                           << GO->getName() << " from "
                           << SrcModule->getSourceFileName() << "\n");
-        if (EnableImportMetadata) {
+        if (EnableImportMetadata || EnableMemProfContextDisambiguation) {
           // Add 'thinlto_src_module' and 'thinlto_src_file' metadata for
           // statistics and debugging.
           Fn->setMetadata(
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index 271d3ed40030b..ba5e3b637db75 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -122,6 +122,10 @@ static cl::opt<unsigned>
                                  "frames through tail calls."));
 
 namespace llvm {
+cl::opt<bool> EnableMemProfContextDisambiguation(
+    "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,
+    cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));
+
 // Indicate we are linking with an allocator that supports hot/cold operator
 // new interfaces.
 cl::opt<bool> SupportsHotColdNew(
@@ -3375,10 +3379,22 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
 
     auto *GVSummary =
         ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
-    if (!GVSummary)
-      // Must have been imported, use the first summary (might be multiple if
-      // this was a linkonce_odr).
-      GVSummary = TheFnVI.getSummaryList().front().get();
+    if (!GVSummary) {
+      // Must have been imported, use the summary which matches the definition。
+      // (might be multiple if this was a linkonce_odr).
+      auto SrcModuleMD = F.getMetadata("thinlto_src_module");
+      assert(SrcModuleMD &&
+             "enable-import-metadata is needed to emit thinlto_src_module");
+      StringRef SrcModule =
+          dyn_cast<MDString>(SrcModuleMD->getOperand(0))->getString();
+      for (auto &GVS : TheFnVI.getSummaryList()) {
+        if (GVS->modulePath() == SrcModule) {
+          GVSummary = GVS.get();
+          break;
+        }
+      }
+      assert(GVSummary && GVSummary->modulePath() == SrcModule);
+    }
 
     // If this was an imported alias skip it as we won't have the function
     // summary, and it should be cloned in the original module.
diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index 422406e46bdb0..11a5c29c35f70 100644
--- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -317,6 +317,8 @@ class HWAddressSanitizer {
     Value *MemTag = nullptr;
   };
 
+  bool selectiveInstrumentationShouldSkip(Function &F,
+                                          FunctionAnalysisManager &FAM);
   void initializeModule();
   void createHwasanCtorComdat();
 
@@ -1523,6 +1525,31 @@ bool HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo,
   return true;
 }
 
+bool HWAddressSanitizer::selectiveInstrumentationShouldSkip(
+    Function &F, FunctionAnalysisManager &FAM) {
+  if (ClRandomSkipRate.getNumOccurrences()) {
+    std::bernoulli_distribution D(ClRandomSkipRate);
+    if (D(*Rng))
+      return true;
+  } else {
+    auto &MAMProxy = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
+    ProfileSummaryInfo *PSI =
+        MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
+    if (PSI && PSI->hasProfileSummary()) {
+      auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
+      if ((ClHotPercentileCutoff.getNumOccurrences() &&
+           ClHotPercentileCutoff >= 0)
+              ? PSI->isFunctionHotInCallGraphNthPercentile(
+                    ClHotPercentileCutoff, &F, BFI)
+              : PSI->isFunctionHotInCallGraph(&F, BFI))
+        return true;
+    } else {
+      ++NumNoProfileSummaryFuncs;
+    }
+  }
+  return false;
+}
+
 void HWAddressSanitizer::sanitizeFunction(Function &F,
                                           FunctionAnalysisManager &FAM) {
   if (&F == HwasanCtorFunction)
@@ -1535,28 +1562,10 @@ void HWAddressSanitizer::sanitizeFunction(Function &F,
     return;
 
   NumTotalFuncs++;
-  if (CSelectiveInstrumentation) {
-    if (ClRandomSkipRate.getNumOccurrences()) {
-      std::bernoulli_distribution D(ClRandomSkipRate);
-      if (D(*Rng))
-        return;
-    } else {
-      auto &MAMProxy = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
-      ProfileSummaryInfo *PSI =
-          MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
-      if (PSI && PSI->hasProfileSummary()) {
-        auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
-        if ((ClHotPercentileCutoff.getNumOccurrences() &&
-             ClHotPercentileCutoff >= 0)
-                ? PSI->isFunctionHotInCallGraphNthPercentile(
-                      ClHotPercentileCutoff, &F, BFI)
-                : PSI->isFunctionHotInCallGraph(&F, BFI))
-          return;
-      } else {
-        ++NumNoProfileSummaryFuncs;
-      }
-    }
-  }
+
+  if (CSelectiveInstrumentation && selectiveInstrumentationShouldSkip(F, FAM))
+    return;
+
   NumInstrumentedFuncs++;
 
   LLVM_DEBUG(dbgs() << "Function: " << F.getName() << "\n");
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index e11b984f13bbc..190fee11618bf 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -324,23 +324,16 @@ static DebugVariable getAggregateVariable(DPValue *DPV) {
                        DPV->getDebugLoc().getInlinedAt());
 }
 
-static DPValue *createLinkedAssign(DPValue *, DIBuilder &DIB,
-                                   Instruction *LinkedInstr, Value *NewValue,
-                                   DILocalVariable *Variable,
-                                   DIExpression *Expression, Value *Address,
-                                   DIExpression *AddressExpression,
-                                   const DILocation *DI) {
-  (void)DIB;
-  return DPValue::createLinkedDPVAssign(LinkedInstr, NewValue, Variable,
-                                        Expression, Address, AddressExpression,
-                                        DI);
+/// Helpers for handling new and old debug info modes in migrateDebugInfo.
+/// These overloads unwrap a DbgInstPtr {Instruction* | DbgRecord*} union based
+/// on the \p Unused parameter type.
+DPValue *UnwrapDbgInstPtr(DbgInstPtr P, DPValue *Unused) {
+  (void)Unused;
+  return static_cast<DPValue *>(cast<DbgRecord *>(P));
 }
-static DbgAssignIntrinsic *createLinkedAssign(
-    DbgAssignIntrinsic *, DIBuilder &DIB, Instruction *LinkedInstr,
-    Value *NewValue, DILocalVariable *Variable, DIExpression *Expression,
-    Value *Address, DIExpression *AddressExpression, const DILocation *DI) {
-  return DIB.insertDbgAssign(LinkedInstr, NewValue, Variable, Expression,
-                             Address, AddressExpression, DI);
+DbgAssignIntrinsic *UnwrapDbgInstPtr(DbgInstPtr P, DbgAssignIntrinsic *Unused) {
+  (void)Unused;
+  return static_cast<DbgAssignIntrinsic *>(cast<Instruction *>(P));
 }
 
 /// Find linked dbg.assign and generate a new one with the correct
@@ -398,7 +391,7 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit,
   DIBuilder DIB(*OldInst->getModule(), /*AllowUnresolved*/ false);
   assert(OldAlloca->isStaticAlloca());
 
-  auto MigrateDbgAssign = [&](auto DbgAssign) {
+  auto MigrateDbgAssign = [&](auto *DbgAssign) {
     LLVM_DEBUG(dbgs() << "      existing dbg.assign is: " << *DbgAssign
                       << "\n");
     auto *Expr = DbgAssign->getExpression();
@@ -452,10 +445,12 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit,
     }
 
     ::Value *NewValue = Value ? Value : DbgAssign->getValue();
-    auto *NewAssign = createLinkedAssign(
-        DbgAssign, DIB, Inst, NewValue, DbgAssign->getVariable(), Expr, Dest,
-        DIExpression::get(Expr->getContext(), std::nullopt),
-        DbgAssign->getDebugLoc());
+    auto *NewAssign = UnwrapDbgInstPtr(
+        DIB.insertDbgAssign(Inst, NewValue, DbgAssign->getVariable(), Expr,
+                            Dest,
+                            DIExpression::get(Expr->getContext(), std::nullopt),
+                            DbgAssign->getDebugLoc()),
+        DbgAssign);
 
     // If we've updated the value but the original dbg.assign has an arglist
     // then kill it now - we can't use the requested new value.
@@ -5031,9 +5026,11 @@ static void insertNewDbgInst(DIBuilder &DIB, DbgAssignIntrinsic *Orig,
     NewAddr->setMetadata(LLVMContext::MD_DIAssignID,
                          DIAssignID::getDistinct(NewAddr->getContext()));
   }
-  auto *NewAssign = DIB.insertDbgAssign(
-      NewAddr, Orig->getValue(), Orig->getVariable(), NewFragmentExpr, NewAddr,
-      Orig->getAddressExpression(), Orig->getDebugLoc());
+  Instruction *NewAssign =
+      DIB.insertDbgAssign(NewAddr, Orig->getValue(), Orig->getVariable(),
+                          NewFragmentExpr, NewAddr,
+                          Orig->getAddressExpression(), Orig->getDebugLoc())
+          .get<Instruction *>();
   LLVM_DEBUG(dbgs() << "Created new assign intrinsic: " << *NewAssign << "\n");
   (void)NewAssign;
 }
@@ -5052,7 +5049,7 @@ static void insertNewDbgInst(DIBuilder &DIB, DPValue *Orig, AllocaInst *NewAddr,
     NewAddr->setMetadata(LLVMContext::MD_DIAssignID,
                          DIAssignID::getDistinct(NewAddr->getContext()));
   }
-  auto *NewAssign = DPValue::createLinkedDPVAssign(
+  DPValue *NewAssign = DPValue::createLinkedDPVAssign(
       NewAddr, Orig->getValue(), Orig->getVariable(), NewFragmentExpr, NewAddr,
       Orig->getAddressExpression(), Orig->getDebugLoc());
   LLVM_DEBUG(dbgs() << "Created new DPVAssign: " << *NewAssign << "\n");
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index d3bb89075015e..a44536e34c922 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -1649,9 +1649,9 @@ static void insertDbgValueOrDPValue(DIBuilder &Builder, Value *DV,
                                     const DebugLoc &NewLoc,
                                     BasicBlock::iterator Instr) {
   if (!UseNewDbgInfoFormat) {
-    auto *DbgVal = Builder.insertDbgValueIntrinsic(DV, DIVar, DIExpr, NewLoc,
-                                                   (Instruction *)nullptr);
-    DbgVal->insertBefore(Instr);
+    auto DbgVal = Builder.insertDbgValueIntrinsic(DV, DIVar, DIExpr, NewLoc,
+                                                  (Instruction *)nullptr);
+    DbgVal.get<Instruction *>()->insertBefore(Instr);
   } else {
     // RemoveDIs: if we're using the new debug-info format, allocate a
     // DPValue directly instead of a dbg.value intrinsic.
@@ -1667,9 +1667,9 @@ static void insertDbgValueOrDPValueAfter(DIBuilder &Builder, Value *DV,
                                          const DebugLoc &NewLoc,
                                          BasicBlock::iterator Instr) {
   if (!UseNewDbgInfoFormat) {
-    auto *DbgVal = Builder.insertDbgValueIntrinsic(DV, DIVar, DIExpr, NewLoc,
-                                                   (Instruction *)nullptr);
-    DbgVal->insertAfter(&*Instr);
+    auto DbgVal = Builder.insertDbgValueIntrinsic(DV, DIVar, DIExpr, NewLoc,
+                                                  (Instruction *)nullptr);
+    DbgVal.get<Instruction *>()->insertAfter(&*Instr);
   } else {
     // RemoveDIs: if we're using the new debug-info format, allocate a
     // DPValue directly instead of a dbg.value intrinsic.
diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
index 88b05aab8db4d..b462803bad38c 100644
--- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -101,21 +101,20 @@ bool llvm::isAllocaPromotable(const AllocaInst *AI) {
 
 namespace {
 
-static DPValue *createDebugValue(DIBuilder &DIB, Value *NewValue,
-                                 DILocalVariable *Variable,
-                                 DIExpression *Expression, const DILocation *DI,
-                                 DPValue *InsertBefore) {
+static void createDebugValue(DIBuilder &DIB, Value *NewValue,
+                             DILocalVariable *Variable,
+                             DIExpression *Expression, const DILocation *DI,
+                             DPValue *InsertBefore) {
+  // FIXME: Merge these two functions now that DIBuilder supports DPValues.
+  // We neeed the API to accept DPValues as an insert point for that to work.
   (void)DIB;
-  return DPValue::createDPValue(NewValue, Variable, Expression, DI,
-                                *InsertBefore);
+  DPValue::createDPValue(NewValue, Variable, Expression, DI, *InsertBefore);
 }
-static DbgValueInst *createDebugValue(DIBuilder &DIB, Value *NewValue,
-                                      DILocalVariable *Variable,
-                                      DIExpression *Expression,
-                                      const DILocation *DI,
-                                      Instruction *InsertBefore) {
-  return static_cast<DbgValueInst *>(DIB.insertDbgValueIntrinsic(
-      NewValue, Variable, Expression, DI, InsertBefore));
+static void createDebugValue(DIBuilder &DIB, Value *NewValue,
+                             DILocalVariable *Variable,
+                             DIExpression *Expression, const DILocation *DI,
+                             Instruction *InsertBefore) {
+  DIB.insertDbgValueIntrinsic(NewValue, Variable, Expression, DI, InsertBefore);
 }
 
 /// Helper for updating assignment tracking debug info when promoting allocas.
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/underlying-object-loop-varying-phi.ll b/llvm/test/Analysis/LoopAccessAnalysis/underlying-object-loop-varying-phi.ll
index 1a5a6ac08d404..106dc8c13a49f 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/underlying-object-loop-varying-phi.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/underlying-object-loop-varying-phi.ll
@@ -7,8 +7,13 @@ target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 define void @indirect_ptr_recurrences_read_write(ptr %A, ptr %B) {
 ; CHECK-LABEL: 'indirect_ptr_recurrences_read_write'
 ; CHECK-NEXT:    loop:
-; CHECK-NEXT:      Memory dependences are safe
+; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT:  Unsafe indirect dependence.
 ; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:        IndidrectUnsafe:
+; CHECK-NEXT:            %l = load i32, ptr %ptr.recur, align 4, !tbaa !4 ->
+; CHECK-NEXT:            store i32 %xor, ptr %ptr.recur, align 4, !tbaa !4
+; CHECK-EMPTY:
 ; CHECK-NEXT:      Run-time memory checks:
 ; CHECK-NEXT:      Grouped accesses:
 ; CHECK-EMPTY:
diff --git a/llvm/test/Analysis/ValueTracking/knownbits-shift.ll b/llvm/test/Analysis/ValueTracking/knownbits-shift.ll
new file mode 100644
index 0000000000000..5cb355eff5a69
--- /dev/null
+++ b/llvm/test/Analysis/ValueTracking/knownbits-shift.ll
@@ -0,0 +1,20 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=instcombine -S < %s | FileCheck %s
+
+define i8 @simplify_lshr_with_exact(i8 %x) {
+; CHECK-LABEL: @simplify_lshr_with_exact(
+; CHECK-NEXT:    ret i8 2
+;
+  %shr = lshr exact i8 6, %x
+  %r = and i8 %shr, 2
+  ret i8 %r
+}
+
+define i8 @simplify_ashr_with_exact(i8 %x) {
+; CHECK-LABEL: @simplify_ashr_with_exact(
+; CHECK-NEXT:    ret i8 2
+;
+  %shr = ashr exact i8 -122, %x
+  %r = and i8 %shr, 2
+  ret i8 %r
+}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
index ecad3f1151348..ac330918b430a 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
@@ -616,6 +616,12 @@
 # DEBUG-NEXT: G_BRJT (opcode {{[0-9]+}}): 2 type indices
 # DEBUG-NEXT: .. the first uncovered type index: 2, OK
 # DEBUG-NEXT: .. the first uncovered imm index: 0, OK
+# DEBUG-NEXT: G_INSERT_SUBVECTOR (opcode {{[0-9]+}}): 2 type indices, 1 imm index
+# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
+# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
+# DEBUG-NEXT: G_EXTRACT_SUBVECTOR (opcode {{[0-9]+}}): 1 type index, 1 imm index
+# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
+# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
 # DEBUG-NEXT: G_INSERT_VECTOR_ELT (opcode {{[0-9]+}}): 3 type indices, 0 imm indices
 # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-build-vector.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-build-vector.mir
index 5de97256fc85a..71a2bd2ddcc6e 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/select-build-vector.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-build-vector.mir
@@ -266,12 +266,8 @@ body:             |
     ; CHECK-LABEL: name: undef_elts_different_regbanks
     ; CHECK: liveins: $w0
     ; CHECK: %val:gpr32all = COPY $w0
-    ; CHECK: %undef:gpr32 = IMPLICIT_DEF
     ; CHECK: [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF
-    ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF]], %val, %subreg.ssub
-    ; CHECK: [[INSvi32gpr:%[0-9]+]]:fpr128 = INSvi32gpr [[INSERT_SUBREG]], 1, %undef
-    ; CHECK: [[INSvi32gpr1:%[0-9]+]]:fpr128 = INSvi32gpr [[INSvi32gpr]], 2, %undef
-    ; CHECK: %bv:fpr128 = INSvi32gpr [[INSvi32gpr1]], 3, %undef
+    ; CHECK: %bv:fpr128 = INSERT_SUBREG [[DEF]], %val, %subreg.ssub
     ; CHECK: $q0 = COPY %bv
     ; CHECK: RET_ReallyLR implicit $q0
     %val:gpr(s32) = COPY $w0
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-shufflevec-undef-mask-elt.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-shufflevec-undef-mask-elt.mir
index 6e01723f49935..5f280ae2e3024 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/select-shufflevec-undef-mask-elt.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-shufflevec-undef-mask-elt.mir
@@ -19,20 +19,18 @@ body:             |
     ; CHECK: liveins: $d0
     ; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d0
     ; CHECK: [[DEF:%[0-9]+]]:gpr32 = IMPLICIT_DEF
-    ; CHECK: [[DEF1:%[0-9]+]]:gpr32 = IMPLICIT_DEF
-    ; CHECK: [[DEF2:%[0-9]+]]:fpr128 = IMPLICIT_DEF
-    ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF2]], [[DEF]], %subreg.ssub
-    ; CHECK: [[INSvi32gpr:%[0-9]+]]:fpr128 = INSvi32gpr [[INSERT_SUBREG]], 1, [[DEF1]]
-    ; CHECK: [[COPY1:%[0-9]+]]:fpr64 = COPY [[INSvi32gpr]].dsub
+    ; CHECK: [[DEF1:%[0-9]+]]:fpr128 = IMPLICIT_DEF
+    ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF1]], [[DEF]], %subreg.ssub
+    ; CHECK: [[COPY1:%[0-9]+]]:fpr64 = COPY [[INSERT_SUBREG]].dsub
     ; CHECK: [[ADRP:%[0-9]+]]:gpr64common = ADRP target-flags(aarch64-page) %const.0
     ; CHECK: [[LDRDui:%[0-9]+]]:fpr64 = LDRDui [[ADRP]], target-flags(aarch64-pageoff, aarch64-nc) %const.0
+    ; CHECK: [[DEF2:%[0-9]+]]:fpr128 = IMPLICIT_DEF
+    ; CHECK: [[INSERT_SUBREG1:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF2]], [[COPY]], %subreg.dsub
     ; CHECK: [[DEF3:%[0-9]+]]:fpr128 = IMPLICIT_DEF
-    ; CHECK: [[INSERT_SUBREG1:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF3]], [[COPY]], %subreg.dsub
-    ; CHECK: [[DEF4:%[0-9]+]]:fpr128 = IMPLICIT_DEF
-    ; CHECK: [[INSERT_SUBREG2:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF4]], [[COPY1]], %subreg.dsub
+    ; CHECK: [[INSERT_SUBREG2:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF3]], [[COPY1]], %subreg.dsub
     ; CHECK: [[INSvi64lane:%[0-9]+]]:fpr128 = INSvi64lane [[INSERT_SUBREG1]], 1, [[INSERT_SUBREG2]], 0
-    ; CHECK: [[DEF5:%[0-9]+]]:fpr128 = IMPLICIT_DEF
-    ; CHECK: [[INSERT_SUBREG3:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF5]], [[LDRDui]], %subreg.dsub
+    ; CHECK: [[DEF4:%[0-9]+]]:fpr128 = IMPLICIT_DEF
+    ; CHECK: [[INSERT_SUBREG3:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF4]], [[LDRDui]], %subreg.dsub
     ; CHECK: [[TBLv16i8One:%[0-9]+]]:fpr128 = TBLv16i8One [[INSvi64lane]], [[INSERT_SUBREG3]]
     ; CHECK: [[COPY2:%[0-9]+]]:fpr64 = COPY [[TBLv16i8One]].dsub
     ; CHECK: $d0 = COPY [[COPY2]]
diff --git a/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll b/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll
index 273bf559554c9..f47da47002fbc 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll
@@ -77,7 +77,6 @@ define <1 x i32> @test_bitf_v1i32(<1 x i32> %A, <1 x i32> %B, <1 x i32> %C) {
 ; CHECK-GI-NEXT:    and w8, w8, w10
 ; CHECK-GI-NEXT:    orr w8, w9, w8
 ; CHECK-GI-NEXT:    fmov s0, w8
-; CHECK-GI-NEXT:    mov v0.s[1], w8
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
   %neg = xor <1 x i32> %C, <i32 -1>
diff --git a/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll b/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
index a92ae39c69724..5c006508d284f 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
@@ -79,7 +79,6 @@ define <1 x i32> @test_bit_v1i32(<1 x i32> %A, <1 x i32> %B, <1 x i32> %C) {
 ; CHECK-GI-NEXT:    bic w8, w10, w8
 ; CHECK-GI-NEXT:    orr w8, w9, w8
 ; CHECK-GI-NEXT:    fmov s0, w8
-; CHECK-GI-NEXT:    mov v0.s[1], w8
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
   %and = and <1 x i32> %C, %B
diff --git a/llvm/test/CodeGen/AArch64/abs.ll b/llvm/test/CodeGen/AArch64/abs.ll
index f2cad6631dc26..e00f70b94e3b4 100644
--- a/llvm/test/CodeGen/AArch64/abs.ll
+++ b/llvm/test/CodeGen/AArch64/abs.ll
@@ -252,7 +252,6 @@ define <1 x i32> @abs_v1i32(<1 x i32> %a){
 ; CHECK-GI-NEXT:    add w8, w8, w9
 ; CHECK-GI-NEXT:    eor w8, w8, w9
 ; CHECK-GI-NEXT:    fmov s0, w8
-; CHECK-GI-NEXT:    mov v0.s[1], w8
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -308,11 +307,6 @@ define <3 x i8> @abs_v3i8(<3 x i8> %a){
 ; CHECK-GI-NEXT:    mov v0.b[1], v1.b[0]
 ; CHECK-GI-NEXT:    fmov s1, w2
 ; CHECK-GI-NEXT:    mov v0.b[2], v1.b[0]
-; CHECK-GI-NEXT:    mov v0.b[3], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[4], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[5], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[6], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[7], v0.b[0]
 ; CHECK-GI-NEXT:    abs v0.8b, v0.8b
 ; CHECK-GI-NEXT:    umov w0, v0.b[0]
 ; CHECK-GI-NEXT:    umov w1, v0.b[1]
diff --git a/llvm/test/CodeGen/AArch64/and-mask-removal.ll b/llvm/test/CodeGen/AArch64/and-mask-removal.ll
index 17ff015970168..a8a59f1591268 100644
--- a/llvm/test/CodeGen/AArch64/and-mask-removal.ll
+++ b/llvm/test/CodeGen/AArch64/and-mask-removal.ll
@@ -65,9 +65,8 @@ if.end:                                           ; preds = %if.then, %entry
 define zeroext i1 @test8_0(i8 zeroext %x)  align 2 {
 ; CHECK-LABEL: test8_0:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    add w8, w0, #74
-; CHECK-NEXT:    and w8, w8, #0xff
-; CHECK-NEXT:    cmp w8, #236
+; CHECK-NEXT:    sub w8, w0, #182
+; CHECK-NEXT:    cmn w8, #20
 ; CHECK-NEXT:    cset w0, lo
 ; CHECK-NEXT:    ret
 entry:
@@ -508,16 +507,17 @@ define i64 @pr58109(i8 signext %0) {
 define i64 @pr58109b(i8 signext %0, i64 %a, i64 %b) {
 ; CHECK-SD-LABEL: pr58109b:
 ; CHECK-SD:       ; %bb.0:
-; CHECK-SD-NEXT:    add w8, w0, #1
-; CHECK-SD-NEXT:    tst w8, #0xfe
-; CHECK-SD-NEXT:    csel x0, x1, x2, eq
+; CHECK-SD-NEXT:    and w8, w0, #0xff
+; CHECK-SD-NEXT:    sub w8, w8, #255
+; CHECK-SD-NEXT:    cmn w8, #254
+; CHECK-SD-NEXT:    csel x0, x1, x2, lo
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: pr58109b:
 ; CHECK-GI:       ; %bb.0:
-; CHECK-GI-NEXT:    add w8, w0, #1
-; CHECK-GI-NEXT:    and w8, w8, #0xff
-; CHECK-GI-NEXT:    cmp w8, #2
+; CHECK-GI-NEXT:    mov w8, #-255 ; =0xffffff01
+; CHECK-GI-NEXT:    add w8, w8, w0, uxtb
+; CHECK-GI-NEXT:    cmn w8, #254
 ; CHECK-GI-NEXT:    csel x0, x1, x2, lo
 ; CHECK-GI-NEXT:    ret
   %2 = add i8 %0, 1
diff --git a/llvm/test/CodeGen/AArch64/arm64-dup.ll b/llvm/test/CodeGen/AArch64/arm64-dup.ll
index 2112944cc8479..2bf5419e54830 100644
--- a/llvm/test/CodeGen/AArch64/arm64-dup.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-dup.ll
@@ -373,11 +373,9 @@ define <4 x i16> @test_build_illegal(<4 x i32> %in) {
 ;
 ; CHECK-GI-LABEL: test_build_illegal:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    mov.h v1[1], v0[0]
 ; CHECK-GI-NEXT:    mov s0, v0[3]
-; CHECK-GI-NEXT:    mov.h v1[2], v0[0]
-; CHECK-GI-NEXT:    mov.h v1[3], v0[0]
-; CHECK-GI-NEXT:    fmov d0, d1
+; CHECK-GI-NEXT:    mov.h v0[3], v0[0]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
   %val = extractelement <4 x i32> %in, i32 3
   %smallval = trunc i32 %val to i16
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
index cc3d80008143c..d282bee81827f 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -1346,7 +1346,6 @@ define <2 x i32> @scalar_to_vector.v2i32(i32 %a) {
 ; CHECK-GI-LABEL: scalar_to_vector.v2i32:
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    fmov s0, w0
-; CHECK-GI-NEXT:    mov v0.s[1], w8
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
   %b = insertelement <2 x i32> undef, i32 %a, i32 0
@@ -1354,33 +1353,19 @@ define <2 x i32> @scalar_to_vector.v2i32(i32 %a) {
 }
 
 define <4 x i32> @scalar_to_vector.v4i32(i32 %a) {
-; CHECK-SD-LABEL: scalar_to_vector.v4i32:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    fmov s0, w0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: scalar_to_vector.v4i32:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    fmov s0, w0
-; CHECK-GI-NEXT:    mov v0.s[1], w8
-; CHECK-GI-NEXT:    mov v0.s[2], w8
-; CHECK-GI-NEXT:    mov v0.s[3], w8
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: scalar_to_vector.v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov s0, w0
+; CHECK-NEXT:    ret
   %b = insertelement <4 x i32> undef, i32 %a, i32 0
   ret <4 x i32> %b
 }
 
 define <2 x i64> @scalar_to_vector.v2i64(i64 %a) {
-; CHECK-SD-LABEL: scalar_to_vector.v2i64:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    fmov d0, x0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: scalar_to_vector.v2i64:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    fmov d0, x0
-; CHECK-GI-NEXT:    mov v0.d[1], x8
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: scalar_to_vector.v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov d0, x0
+; CHECK-NEXT:    ret
   %b = insertelement <2 x i64> undef, i64 %a, i32 0
   ret <2 x i64> %b
 }
@@ -1900,14 +1885,6 @@ define <16 x i8> @test_concat_v16i8_v8i8_v16i8(<8 x i8> %x, <16 x i8> %y) #0 {
 ; CHECK-GI-NEXT:    mov v0.b[5], v6.b[0]
 ; CHECK-GI-NEXT:    mov v0.b[6], v7.b[0]
 ; CHECK-GI-NEXT:    mov v0.b[7], v16.b[0]
-; CHECK-GI-NEXT:    mov v0.b[8], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[9], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[10], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[11], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[12], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[13], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[14], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[15], v0.b[0]
 ; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -2123,10 +2100,6 @@ define <8 x i16> @test_concat_v8i16_v4i16_v8i16(<4 x i16> %x, <8 x i16> %y) #0 {
 ; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI131_0]
 ; CHECK-GI-NEXT:    mov v0.h[2], v3.h[0]
 ; CHECK-GI-NEXT:    mov v0.h[3], v4.h[0]
-; CHECK-GI-NEXT:    mov v0.h[4], v0.h[0]
-; CHECK-GI-NEXT:    mov v0.h[5], v0.h[0]
-; CHECK-GI-NEXT:    mov v0.h[6], v0.h[0]
-; CHECK-GI-NEXT:    mov v0.h[7], v0.h[0]
 ; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -2266,8 +2239,6 @@ define <4 x i32> @test_concat_v4i32_v2i32_v4i32(<2 x i32> %x, <4 x i32> %y) #0 {
 ; CHECK-GI-NEXT:    mov s2, v0.s[1]
 ; CHECK-GI-NEXT:    mov v0.s[1], v2.s[0]
 ; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI135_0]
-; CHECK-GI-NEXT:    mov v0.s[2], v0.s[0]
-; CHECK-GI-NEXT:    mov v0.s[3], v0.s[0]
 ; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
 ; CHECK-GI-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/bitcast.ll b/llvm/test/CodeGen/AArch64/bitcast.ll
index a5551285f2788..bccfdb93d786f 100644
--- a/llvm/test/CodeGen/AArch64/bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/bitcast.ll
@@ -21,7 +21,6 @@ define <4 x i16> @foo1(<2 x i32> %a) {
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    mov w8, #58712 // =0xe558
 ; CHECK-GI-NEXT:    fmov s1, w8
-; CHECK-GI-NEXT:    mov v1.s[1], w8
 ; CHECK-GI-NEXT:    zip1 v0.2s, v1.2s, v0.2s
 ; CHECK-GI-NEXT:    rev32 v0.4h, v0.4h
 ; CHECK-GI-NEXT:    ret
@@ -42,7 +41,6 @@ define <4 x i16> @foo2(<2 x i32> %a) {
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    mov w8, #712 // =0x2c8
 ; CHECK-GI-NEXT:    fmov s1, w8
-; CHECK-GI-NEXT:    mov v1.s[1], w8
 ; CHECK-GI-NEXT:    zip1 v0.2s, v1.2s, v0.2s
 ; CHECK-GI-NEXT:    rev32 v0.4h, v0.4h
 ; CHECK-GI-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/bswap.ll b/llvm/test/CodeGen/AArch64/bswap.ll
index 9b065accce914..f4221accfcbc5 100644
--- a/llvm/test/CodeGen/AArch64/bswap.ll
+++ b/llvm/test/CodeGen/AArch64/bswap.ll
@@ -137,7 +137,6 @@ define <1 x i32> @bswap_v1i32(<1 x i32> %a){
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    rev w8, w8
 ; CHECK-GI-NEXT:    fmov s0, w8
-; CHECK-GI-NEXT:    mov v0.s[1], w8
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/fabs.ll b/llvm/test/CodeGen/AArch64/fabs.ll
index 7c13b49246d23..de108b0bc2b7a 100644
--- a/llvm/test/CodeGen/AArch64/fabs.ll
+++ b/llvm/test/CodeGen/AArch64/fabs.ll
@@ -160,21 +160,20 @@ define <7 x half> @fabs_v7f16(<7 x half> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: fabs_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    fabs v2.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    fabs v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v4.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
 ; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    fabs v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    fabs v1.4s, v2.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v4.h[0]
@@ -183,7 +182,6 @@ define <7 x half> @fabs_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[7], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: fabs_v7f16:
diff --git a/llvm/test/CodeGen/AArch64/faddsub.ll b/llvm/test/CodeGen/AArch64/faddsub.ll
index f8970dc9e8d5d..6913a62fb266c 100644
--- a/llvm/test/CodeGen/AArch64/faddsub.ll
+++ b/llvm/test/CodeGen/AArch64/faddsub.ll
@@ -186,26 +186,24 @@ define <7 x half> @fadd_v7f16(<7 x half> %a, <7 x half> %b) {
 ;
 ; CHECK-GI-NOFP16-LABEL: fadd_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h4, v1.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h5, v1.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v6.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v7.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov h6, v1.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h7, v1.h[5]
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    fadd v2.4s, v2.4s, v3.4s
+; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[6]
 ; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v5.h[0]
-; CHECK-GI-NOFP16-NEXT:    fadd v3.4s, v6.4s, v7.4s
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[3], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v6.h[1], v7.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v6.h[2], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v6.4h
 ; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fadd v1.4s, v2.4s, v3.4s
@@ -217,7 +215,6 @@ define <7 x half> @fadd_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[7], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: fadd_v7f16:
@@ -538,26 +535,24 @@ define <7 x half> @fsub_v7f16(<7 x half> %a, <7 x half> %b) {
 ;
 ; CHECK-GI-NOFP16-LABEL: fsub_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h4, v1.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h5, v1.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v6.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v7.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov h6, v1.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h7, v1.h[5]
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    fsub v2.4s, v2.4s, v3.4s
+; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[6]
 ; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v5.h[0]
-; CHECK-GI-NOFP16-NEXT:    fsub v3.4s, v6.4s, v7.4s
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[3], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v6.h[1], v7.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v6.h[2], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v6.4h
 ; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fsub v1.4s, v2.4s, v3.4s
@@ -569,7 +564,6 @@ define <7 x half> @fsub_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[7], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: fsub_v7f16:
diff --git a/llvm/test/CodeGen/AArch64/fcmp.ll b/llvm/test/CodeGen/AArch64/fcmp.ll
index 0f02784aaf32a..2d0b5574cdd7b 100644
--- a/llvm/test/CodeGen/AArch64/fcmp.ll
+++ b/llvm/test/CodeGen/AArch64/fcmp.ll
@@ -262,31 +262,28 @@ define <3 x i32> @v3f64_i32(<3 x double> %a, <3 x double> %b, <3 x i32> %d, <3 x
 ;
 ; CHECK-GI-LABEL: v3f64_i32:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    mov w8, #31 // =0x1f
-; CHECK-GI-NEXT:    fcmp d2, d5
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-GI-NEXT:    // kill: def $d3 killed $d3 def $q3
+; CHECK-GI-NEXT:    mov w8, #31 // =0x1f
 ; CHECK-GI-NEXT:    // kill: def $d4 killed $d4 def $q4
-; CHECK-GI-NEXT:    fmov s16, w8
+; CHECK-GI-NEXT:    fcmp d2, d5
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-GI-NEXT:    mov v3.d[1], v4.d[0]
+; CHECK-GI-NEXT:    fmov s1, w8
 ; CHECK-GI-NEXT:    cset w9, mi
-; CHECK-GI-NEXT:    mov v16.s[1], w8
-; CHECK-GI-NEXT:    fmov d1, x9
+; CHECK-GI-NEXT:    mov v1.s[1], w8
+; CHECK-GI-NEXT:    fmov d2, x9
 ; CHECK-GI-NEXT:    fcmgt v0.2d, v3.2d, v0.2d
-; CHECK-GI-NEXT:    mov v1.d[1], x8
-; CHECK-GI-NEXT:    mov v16.s[2], w8
+; CHECK-GI-NEXT:    mov v1.s[2], w8
 ; CHECK-GI-NEXT:    mov w8, #-1 // =0xffffffff
+; CHECK-GI-NEXT:    uzp1 v0.4s, v0.4s, v2.4s
 ; CHECK-GI-NEXT:    fmov s2, w8
-; CHECK-GI-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
 ; CHECK-GI-NEXT:    mov v2.s[1], w8
-; CHECK-GI-NEXT:    mov v16.s[3], w8
+; CHECK-GI-NEXT:    neg v3.4s, v1.4s
+; CHECK-GI-NEXT:    ushl v0.4s, v0.4s, v1.4s
 ; CHECK-GI-NEXT:    mov v2.s[2], w8
-; CHECK-GI-NEXT:    neg v1.4s, v16.4s
-; CHECK-GI-NEXT:    ushl v0.4s, v0.4s, v16.4s
-; CHECK-GI-NEXT:    mov v2.s[3], w8
-; CHECK-GI-NEXT:    sshl v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    sshl v0.4s, v0.4s, v3.4s
 ; CHECK-GI-NEXT:    eor v1.16b, v0.16b, v2.16b
 ; CHECK-GI-NEXT:    and v0.16b, v6.16b, v0.16b
 ; CHECK-GI-NEXT:    and v1.16b, v7.16b, v1.16b
@@ -349,15 +346,13 @@ define <3 x float> @v3f32_float(<3 x float> %a, <3 x float> %b, <3 x float> %d,
 ; CHECK-GI-NEXT:    mov v4.s[1], w8
 ; CHECK-GI-NEXT:    mov v4.s[2], w8
 ; CHECK-GI-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-GI-NEXT:    fmov s5, w8
-; CHECK-GI-NEXT:    mov v5.s[1], w8
-; CHECK-GI-NEXT:    mov v4.s[3], w8
-; CHECK-GI-NEXT:    mov v5.s[2], w8
-; CHECK-GI-NEXT:    neg v1.4s, v4.4s
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    mov v1.s[1], w8
+; CHECK-GI-NEXT:    neg v5.4s, v4.4s
 ; CHECK-GI-NEXT:    ushl v0.4s, v0.4s, v4.4s
-; CHECK-GI-NEXT:    mov v5.s[3], w8
-; CHECK-GI-NEXT:    sshl v0.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT:    eor v1.16b, v0.16b, v5.16b
+; CHECK-GI-NEXT:    mov v1.s[2], w8
+; CHECK-GI-NEXT:    sshl v0.4s, v0.4s, v5.4s
+; CHECK-GI-NEXT:    eor v1.16b, v0.16b, v1.16b
 ; CHECK-GI-NEXT:    and v0.16b, v2.16b, v0.16b
 ; CHECK-GI-NEXT:    and v1.16b, v3.16b, v1.16b
 ; CHECK-GI-NEXT:    orr v0.16b, v0.16b, v1.16b
@@ -429,15 +424,13 @@ define <3 x i32> @v3f32_i32(<3 x float> %a, <3 x float> %b, <3 x i32> %d, <3 x i
 ; CHECK-GI-NEXT:    mov v4.s[1], w8
 ; CHECK-GI-NEXT:    mov v4.s[2], w8
 ; CHECK-GI-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-GI-NEXT:    fmov s5, w8
-; CHECK-GI-NEXT:    mov v5.s[1], w8
-; CHECK-GI-NEXT:    mov v4.s[3], w8
-; CHECK-GI-NEXT:    mov v5.s[2], w8
-; CHECK-GI-NEXT:    neg v1.4s, v4.4s
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    mov v1.s[1], w8
+; CHECK-GI-NEXT:    neg v5.4s, v4.4s
 ; CHECK-GI-NEXT:    ushl v0.4s, v0.4s, v4.4s
-; CHECK-GI-NEXT:    mov v5.s[3], w8
-; CHECK-GI-NEXT:    sshl v0.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT:    eor v1.16b, v0.16b, v5.16b
+; CHECK-GI-NEXT:    mov v1.s[2], w8
+; CHECK-GI-NEXT:    sshl v0.4s, v0.4s, v5.4s
+; CHECK-GI-NEXT:    eor v1.16b, v0.16b, v1.16b
 ; CHECK-GI-NEXT:    and v0.16b, v2.16b, v0.16b
 ; CHECK-GI-NEXT:    and v1.16b, v3.16b, v1.16b
 ; CHECK-GI-NEXT:    orr v0.16b, v0.16b, v1.16b
@@ -554,44 +547,40 @@ define <7 x half> @v7f16_half(<7 x half> %a, <7 x half> %b, <7 x half> %d, <7 x
 ; CHECK-GI-NOFP16-NEXT:    mov w8, #15 // =0xf
 ; CHECK-GI-NOFP16-NEXT:    mov h6, v0.h[4]
 ; CHECK-GI-NOFP16-NEXT:    mov h7, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    fmov s5, w8
+; CHECK-GI-NOFP16-NEXT:    fmov s4, w8
 ; CHECK-GI-NOFP16-NEXT:    mov h16, v1.h[4]
 ; CHECK-GI-NOFP16-NEXT:    mov w8, #65535 // =0xffff
 ; CHECK-GI-NOFP16-NEXT:    mov h17, v1.h[5]
 ; CHECK-GI-NOFP16-NEXT:    mov h18, v0.h[6]
 ; CHECK-GI-NOFP16-NEXT:    mov h19, v1.h[6]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    mov v4.16b, v5.16b
+; CHECK-GI-NOFP16-NEXT:    mov v5.16b, v4.16b
 ; CHECK-GI-NOFP16-NEXT:    mov v6.h[1], v7.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fmov s7, w8
 ; CHECK-GI-NOFP16-NEXT:    mov v16.h[1], v17.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v5.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[1], v4.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v17.16b, v7.16b
+; CHECK-GI-NOFP16-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v6.h[2], v18.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v17.h[1], v7.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v16.h[2], v19.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v5.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v6.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v17.h[2], v7.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v16.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[3], v5.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[2], v4.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v6.4s, v6.4h
-; CHECK-GI-NOFP16-NEXT:    mov v17.h[3], v7.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v17.h[2], v7.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v16.4s, v16.4h
-; CHECK-GI-NOFP16-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[4], v5.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v17.h[4], v7.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[3], v4.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v17.h[3], v7.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcmgt v1.4s, v16.4s, v6.4s
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[5], v5.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[4], v4.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v17.h[4], v7.h[0]
+; CHECK-GI-NOFP16-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[5], v4.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v17.h[5], v7.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[6], v5.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[6], v4.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v17.h[6], v7.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[7], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
-; CHECK-GI-NOFP16-NEXT:    neg v1.8h, v4.8h
-; CHECK-GI-NOFP16-NEXT:    ushl v0.8h, v0.8h, v4.8h
-; CHECK-GI-NOFP16-NEXT:    mov v17.h[7], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    neg v1.8h, v5.8h
+; CHECK-GI-NOFP16-NEXT:    ushl v0.8h, v0.8h, v5.8h
 ; CHECK-GI-NOFP16-NEXT:    sshl v0.8h, v0.8h, v1.8h
 ; CHECK-GI-NOFP16-NEXT:    eor v1.16b, v0.16b, v17.16b
 ; CHECK-GI-NOFP16-NEXT:    and v0.16b, v2.16b, v0.16b
@@ -602,6 +591,7 @@ define <7 x half> @v7f16_half(<7 x half> %a, <7 x half> %b, <7 x half> %d, <7 x
 ; CHECK-GI-FP16-LABEL: v7f16_half:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
 ; CHECK-GI-FP16-NEXT:    mov w8, #15 // =0xf
+; CHECK-GI-FP16-NEXT:    fcmgt v0.8h, v1.8h, v0.8h
 ; CHECK-GI-FP16-NEXT:    fmov s4, w8
 ; CHECK-GI-FP16-NEXT:    mov w8, #65535 // =0xffff
 ; CHECK-GI-FP16-NEXT:    fmov s6, w8
@@ -619,11 +609,8 @@ define <7 x half> @v7f16_half(<7 x half> %a, <7 x half> %b, <7 x half> %d, <7 x
 ; CHECK-GI-FP16-NEXT:    mov v7.h[5], v6.h[0]
 ; CHECK-GI-FP16-NEXT:    mov v5.h[6], v4.h[0]
 ; CHECK-GI-FP16-NEXT:    mov v7.h[6], v6.h[0]
-; CHECK-GI-FP16-NEXT:    mov v5.h[7], v0.h[0]
-; CHECK-GI-FP16-NEXT:    fcmgt v0.8h, v1.8h, v0.8h
 ; CHECK-GI-FP16-NEXT:    neg v1.8h, v5.8h
 ; CHECK-GI-FP16-NEXT:    ushl v0.8h, v0.8h, v5.8h
-; CHECK-GI-FP16-NEXT:    mov v7.h[7], v0.h[0]
 ; CHECK-GI-FP16-NEXT:    sshl v0.8h, v0.8h, v1.8h
 ; CHECK-GI-FP16-NEXT:    eor v1.16b, v0.16b, v7.16b
 ; CHECK-GI-FP16-NEXT:    and v0.16b, v2.16b, v0.16b
@@ -1054,69 +1041,63 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32
 ; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[4]
 ; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[5]
 ; CHECK-GI-NOFP16-NEXT:    mov w8, #31 // =0x1f
-; CHECK-GI-NOFP16-NEXT:    mov h5, v1.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h4, v1.h[5]
-; CHECK-GI-NOFP16-NEXT:    ldr s16, [sp, #32]
+; CHECK-GI-NOFP16-NEXT:    mov h4, v1.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h5, v1.h[5]
+; CHECK-GI-NOFP16-NEXT:    ldr s17, [sp, #32]
 ; CHECK-GI-NOFP16-NEXT:    mov h6, v0.h[6]
 ; CHECK-GI-NOFP16-NEXT:    mov h7, v1.h[6]
-; CHECK-GI-NOFP16-NEXT:    ldr s18, [sp, #40]
-; CHECK-GI-NOFP16-NEXT:    fmov s17, w4
+; CHECK-GI-NOFP16-NEXT:    fmov s16, w0
+; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v3.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[1], v4.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmov s4, w8
-; CHECK-GI-NOFP16-NEXT:    mov v17.s[1], w5
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v6.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v4.s[1], w8
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[2], v7.h[0]
-; CHECK-GI-NOFP16-NEXT:    ldr s7, [sp]
-; CHECK-GI-NOFP16-NEXT:    mov v17.s[2], w6
-; CHECK-GI-NOFP16-NEXT:    fmov w9, s7
-; CHECK-GI-NOFP16-NEXT:    fmov s7, w7
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v4.s[2], w8
-; CHECK-GI-NOFP16-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[3], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fmov s3, w8
-; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    mov v7.s[1], w9
-; CHECK-GI-NOFP16-NEXT:    fcvtl v6.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v5.h[0]
+; CHECK-GI-NOFP16-NEXT:    ldr s5, [sp]
+; CHECK-GI-NOFP16-NEXT:    mov v16.s[1], w1
 ; CHECK-GI-NOFP16-NEXT:    mov v3.s[1], w8
-; CHECK-GI-NOFP16-NEXT:    ldr s2, [sp, #24]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v5.4s, v5.4h
-; CHECK-GI-NOFP16-NEXT:    mov v4.s[3], w8
+; CHECK-GI-NOFP16-NEXT:    fmov w9, s5
+; CHECK-GI-NOFP16-NEXT:    fmov s5, w7
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v6.h[0]
+; CHECK-GI-NOFP16-NEXT:    ldr s6, [sp, #8]
 ; CHECK-GI-NOFP16-NEXT:    fcmgt v0.4s, v1.4s, v0.4s
-; CHECK-GI-NOFP16-NEXT:    mov v2.s[1], v16.s[0]
-; CHECK-GI-NOFP16-NEXT:    ldr s16, [sp, #8]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v7.h[0]
+; CHECK-GI-NOFP16-NEXT:    ldr s7, [sp, #24]
+; CHECK-GI-NOFP16-NEXT:    mov v16.s[2], w2
+; CHECK-GI-NOFP16-NEXT:    mov v5.s[1], w9
+; CHECK-GI-NOFP16-NEXT:    fmov w9, s6
+; CHECK-GI-NOFP16-NEXT:    ldr s6, [sp, #16]
 ; CHECK-GI-NOFP16-NEXT:    mov v3.s[2], w8
-; CHECK-GI-NOFP16-NEXT:    fmov w8, s16
-; CHECK-GI-NOFP16-NEXT:    fcmgt v5.4s, v5.4s, v6.4s
-; CHECK-GI-NOFP16-NEXT:    fmov s6, w0
-; CHECK-GI-NOFP16-NEXT:    neg v19.4s, v4.4s
-; CHECK-GI-NOFP16-NEXT:    mov v2.s[2], v18.s[0]
-; CHECK-GI-NOFP16-NEXT:    mov v7.s[2], w8
-; CHECK-GI-NOFP16-NEXT:    mov v17.s[3], w8
-; CHECK-GI-NOFP16-NEXT:    mov v6.s[1], w1
-; CHECK-GI-NOFP16-NEXT:    mov v3.s[3], w8
-; CHECK-GI-NOFP16-NEXT:    ushl v4.4s, v5.4s, v4.4s
-; CHECK-GI-NOFP16-NEXT:    ldr s5, [sp, #16]
-; CHECK-GI-NOFP16-NEXT:    mov v2.s[3], v0.s[0]
-; CHECK-GI-NOFP16-NEXT:    fmov w8, s5
-; CHECK-GI-NOFP16-NEXT:    mov v6.s[2], w2
-; CHECK-GI-NOFP16-NEXT:    sshl v4.4s, v4.4s, v19.4s
-; CHECK-GI-NOFP16-NEXT:    mov v7.s[3], w8
-; CHECK-GI-NOFP16-NEXT:    eor v1.16b, v4.16b, v3.16b
-; CHECK-GI-NOFP16-NEXT:    and v3.16b, v17.16b, v4.16b
-; CHECK-GI-NOFP16-NEXT:    mov v6.s[3], w3
-; CHECK-GI-NOFP16-NEXT:    and v1.16b, v2.16b, v1.16b
-; CHECK-GI-NOFP16-NEXT:    bsl v0.16b, v6.16b, v7.16b
-; CHECK-GI-NOFP16-NEXT:    orr v1.16b, v3.16b, v1.16b
+; CHECK-GI-NOFP16-NEXT:    mov w8, #-1 // =0xffffffff
+; CHECK-GI-NOFP16-NEXT:    mov v7.s[1], v17.s[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    ldr s17, [sp, #40]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v4.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT:    mov v16.s[3], w3
+; CHECK-GI-NOFP16-NEXT:    mov v5.s[2], w9
+; CHECK-GI-NOFP16-NEXT:    neg v18.4s, v3.4s
+; CHECK-GI-NOFP16-NEXT:    mov v7.s[2], v17.s[0]
+; CHECK-GI-NOFP16-NEXT:    fcmgt v2.4s, v4.4s, v2.4s
+; CHECK-GI-NOFP16-NEXT:    fmov s4, w8
+; CHECK-GI-NOFP16-NEXT:    mov v4.s[1], w8
+; CHECK-GI-NOFP16-NEXT:    ushl v2.4s, v2.4s, v3.4s
+; CHECK-GI-NOFP16-NEXT:    fmov s3, w4
+; CHECK-GI-NOFP16-NEXT:    mov v3.s[1], w5
+; CHECK-GI-NOFP16-NEXT:    mov v4.s[2], w8
+; CHECK-GI-NOFP16-NEXT:    sshl v2.4s, v2.4s, v18.4s
+; CHECK-GI-NOFP16-NEXT:    fmov w8, s6
+; CHECK-GI-NOFP16-NEXT:    mov v3.s[2], w6
+; CHECK-GI-NOFP16-NEXT:    eor v1.16b, v2.16b, v4.16b
+; CHECK-GI-NOFP16-NEXT:    mov v5.s[3], w8
+; CHECK-GI-NOFP16-NEXT:    and v1.16b, v7.16b, v1.16b
+; CHECK-GI-NOFP16-NEXT:    and v2.16b, v3.16b, v2.16b
+; CHECK-GI-NOFP16-NEXT:    bsl v0.16b, v16.16b, v5.16b
+; CHECK-GI-NOFP16-NEXT:    orr v1.16b, v2.16b, v1.16b
 ; CHECK-GI-NOFP16-NEXT:    mov s2, v0.s[1]
 ; CHECK-GI-NOFP16-NEXT:    mov s3, v0.s[2]
 ; CHECK-GI-NOFP16-NEXT:    mov s4, v0.s[3]
+; CHECK-GI-NOFP16-NEXT:    fmov w0, s0
 ; CHECK-GI-NOFP16-NEXT:    mov s5, v1.s[1]
 ; CHECK-GI-NOFP16-NEXT:    mov s6, v1.s[2]
-; CHECK-GI-NOFP16-NEXT:    fmov w0, s0
 ; CHECK-GI-NOFP16-NEXT:    fmov w4, s1
 ; CHECK-GI-NOFP16-NEXT:    fmov w1, s2
 ; CHECK-GI-NOFP16-NEXT:    fmov w2, s3
@@ -1127,65 +1108,60 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32
 ;
 ; CHECK-GI-FP16-LABEL: v7f16_i32:
 ; CHECK-GI-FP16:       // %bb.0: // %entry
-; CHECK-GI-FP16-NEXT:    fcmgt v5.8h, v1.8h, v0.8h
-; CHECK-GI-FP16-NEXT:    mov w10, #31 // =0x1f
-; CHECK-GI-FP16-NEXT:    ldr s6, [sp]
-; CHECK-GI-FP16-NEXT:    fmov s2, w10
-; CHECK-GI-FP16-NEXT:    ldr s1, [sp, #24]
-; CHECK-GI-FP16-NEXT:    ldr s7, [sp, #32]
-; CHECK-GI-FP16-NEXT:    fmov s16, w0
-; CHECK-GI-FP16-NEXT:    ldr s17, [sp, #40]
-; CHECK-GI-FP16-NEXT:    mov v1.s[1], v7.s[0]
-; CHECK-GI-FP16-NEXT:    ldr s7, [sp, #8]
-; CHECK-GI-FP16-NEXT:    umov w8, v5.h[4]
-; CHECK-GI-FP16-NEXT:    umov w9, v5.h[5]
-; CHECK-GI-FP16-NEXT:    umov w11, v5.h[0]
-; CHECK-GI-FP16-NEXT:    umov w12, v5.h[1]
-; CHECK-GI-FP16-NEXT:    mov v2.s[1], w10
-; CHECK-GI-FP16-NEXT:    mov v16.s[1], w1
-; CHECK-GI-FP16-NEXT:    mov v1.s[2], v17.s[0]
-; CHECK-GI-FP16-NEXT:    fmov s3, w8
-; CHECK-GI-FP16-NEXT:    umov w8, v5.h[6]
-; CHECK-GI-FP16-NEXT:    fmov s0, w11
-; CHECK-GI-FP16-NEXT:    mov v2.s[2], w10
-; CHECK-GI-FP16-NEXT:    umov w10, v5.h[3]
-; CHECK-GI-FP16-NEXT:    mov v16.s[2], w2
-; CHECK-GI-FP16-NEXT:    mov v3.s[1], w9
-; CHECK-GI-FP16-NEXT:    umov w9, v5.h[2]
-; CHECK-GI-FP16-NEXT:    mov v0.s[1], w12
-; CHECK-GI-FP16-NEXT:    fmov s5, w4
-; CHECK-GI-FP16-NEXT:    mov v16.s[3], w3
+; CHECK-GI-FP16-NEXT:    fcmgt v1.8h, v1.8h, v0.8h
+; CHECK-GI-FP16-NEXT:    mov w12, #31 // =0x1f
+; CHECK-GI-FP16-NEXT:    ldr s4, [sp]
+; CHECK-GI-FP16-NEXT:    fmov s2, w12
+; CHECK-GI-FP16-NEXT:    fmov s6, w0
+; CHECK-GI-FP16-NEXT:    ldr s5, [sp, #8]
+; CHECK-GI-FP16-NEXT:    ldr s7, [sp, #24]
+; CHECK-GI-FP16-NEXT:    ldr s16, [sp, #32]
+; CHECK-GI-FP16-NEXT:    umov w9, v1.h[4]
+; CHECK-GI-FP16-NEXT:    umov w8, v1.h[0]
+; CHECK-GI-FP16-NEXT:    umov w11, v1.h[5]
+; CHECK-GI-FP16-NEXT:    umov w10, v1.h[1]
+; CHECK-GI-FP16-NEXT:    mov v2.s[1], w12
+; CHECK-GI-FP16-NEXT:    umov w13, v1.h[2]
+; CHECK-GI-FP16-NEXT:    mov v6.s[1], w1
+; CHECK-GI-FP16-NEXT:    mov v7.s[1], v16.s[0]
+; CHECK-GI-FP16-NEXT:    ldr s16, [sp, #40]
+; CHECK-GI-FP16-NEXT:    fmov s3, w9
+; CHECK-GI-FP16-NEXT:    fmov s0, w8
+; CHECK-GI-FP16-NEXT:    umov w8, v1.h[6]
+; CHECK-GI-FP16-NEXT:    mov v2.s[2], w12
+; CHECK-GI-FP16-NEXT:    umov w9, v1.h[3]
+; CHECK-GI-FP16-NEXT:    mov v6.s[2], w2
+; CHECK-GI-FP16-NEXT:    mov v7.s[2], v16.s[0]
+; CHECK-GI-FP16-NEXT:    mov v3.s[1], w11
+; CHECK-GI-FP16-NEXT:    mov v0.s[1], w10
+; CHECK-GI-FP16-NEXT:    mov w10, #-1 // =0xffffffff
+; CHECK-GI-FP16-NEXT:    fmov s1, w10
+; CHECK-GI-FP16-NEXT:    neg v17.4s, v2.4s
+; CHECK-GI-FP16-NEXT:    mov v6.s[3], w3
 ; CHECK-GI-FP16-NEXT:    mov v3.s[2], w8
-; CHECK-GI-FP16-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-GI-FP16-NEXT:    mov v0.s[2], w9
-; CHECK-GI-FP16-NEXT:    fmov s4, w8
-; CHECK-GI-FP16-NEXT:    mov v2.s[3], w8
-; CHECK-GI-FP16-NEXT:    mov v5.s[1], w5
-; CHECK-GI-FP16-NEXT:    fmov w9, s6
-; CHECK-GI-FP16-NEXT:    fmov s6, w7
+; CHECK-GI-FP16-NEXT:    fmov w8, s4
+; CHECK-GI-FP16-NEXT:    fmov s4, w7
+; CHECK-GI-FP16-NEXT:    mov v0.s[2], w13
+; CHECK-GI-FP16-NEXT:    mov v1.s[1], w10
 ; CHECK-GI-FP16-NEXT:    mov v4.s[1], w8
-; CHECK-GI-FP16-NEXT:    mov v3.s[3], w8
-; CHECK-GI-FP16-NEXT:    mov v0.s[3], w10
-; CHECK-GI-FP16-NEXT:    mov v6.s[1], w9
-; CHECK-GI-FP16-NEXT:    neg v18.4s, v2.4s
-; CHECK-GI-FP16-NEXT:    mov v5.s[2], w6
-; CHECK-GI-FP16-NEXT:    mov v4.s[2], w8
-; CHECK-GI-FP16-NEXT:    fmov w8, s7
+; CHECK-GI-FP16-NEXT:    fmov w8, s5
+; CHECK-GI-FP16-NEXT:    ldr s5, [sp, #16]
 ; CHECK-GI-FP16-NEXT:    ushl v2.4s, v3.4s, v2.4s
-; CHECK-GI-FP16-NEXT:    ldr s3, [sp, #16]
+; CHECK-GI-FP16-NEXT:    fmov s3, w4
+; CHECK-GI-FP16-NEXT:    mov v0.s[3], w9
+; CHECK-GI-FP16-NEXT:    mov v1.s[2], w10
+; CHECK-GI-FP16-NEXT:    mov v3.s[1], w5
+; CHECK-GI-FP16-NEXT:    mov v4.s[2], w8
+; CHECK-GI-FP16-NEXT:    sshl v2.4s, v2.4s, v17.4s
+; CHECK-GI-FP16-NEXT:    fmov w8, s5
 ; CHECK-GI-FP16-NEXT:    shl v0.4s, v0.4s, #31
-; CHECK-GI-FP16-NEXT:    mov v6.s[2], w8
-; CHECK-GI-FP16-NEXT:    sshl v2.4s, v2.4s, v18.4s
-; CHECK-GI-FP16-NEXT:    mov v5.s[3], w8
+; CHECK-GI-FP16-NEXT:    eor v1.16b, v2.16b, v1.16b
+; CHECK-GI-FP16-NEXT:    mov v3.s[2], w6
 ; CHECK-GI-FP16-NEXT:    mov v4.s[3], w8
-; CHECK-GI-FP16-NEXT:    fmov w8, s3
-; CHECK-GI-FP16-NEXT:    mov v1.s[3], v0.s[0]
 ; CHECK-GI-FP16-NEXT:    sshr v0.4s, v0.4s, #31
-; CHECK-GI-FP16-NEXT:    mov v6.s[3], w8
-; CHECK-GI-FP16-NEXT:    eor v3.16b, v2.16b, v4.16b
-; CHECK-GI-FP16-NEXT:    and v2.16b, v5.16b, v2.16b
-; CHECK-GI-FP16-NEXT:    and v1.16b, v1.16b, v3.16b
-; CHECK-GI-FP16-NEXT:    bsl v0.16b, v16.16b, v6.16b
+; CHECK-GI-FP16-NEXT:    and v1.16b, v7.16b, v1.16b
+; CHECK-GI-FP16-NEXT:    and v2.16b, v3.16b, v2.16b
+; CHECK-GI-FP16-NEXT:    bsl v0.16b, v6.16b, v4.16b
 ; CHECK-GI-FP16-NEXT:    orr v1.16b, v2.16b, v1.16b
 ; CHECK-GI-FP16-NEXT:    mov s2, v0.s[1]
 ; CHECK-GI-FP16-NEXT:    mov s3, v0.s[2]
diff --git a/llvm/test/CodeGen/AArch64/fcopysign.ll b/llvm/test/CodeGen/AArch64/fcopysign.ll
index 78fd38ca9f268..84376107679d8 100644
--- a/llvm/test/CodeGen/AArch64/fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/fcopysign.ll
@@ -162,8 +162,6 @@ define <3 x float> @copysign_v3f32(<3 x float> %a, <3 x float> %b) {
 ; CHECK-GI-NEXT:    mov v3.s[1], w8
 ; CHECK-GI-NEXT:    mov v2.s[2], w9
 ; CHECK-GI-NEXT:    mov v3.s[2], w8
-; CHECK-GI-NEXT:    mov v2.s[3], w8
-; CHECK-GI-NEXT:    mov v3.s[3], w8
 ; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-GI-NEXT:    and v1.16b, v1.16b, v3.16b
 ; CHECK-GI-NEXT:    orr v0.16b, v0.16b, v1.16b
@@ -223,8 +221,6 @@ define <7 x half> @copysign_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-GI-NEXT:    mov v5.h[5], v3.h[0]
 ; CHECK-GI-NEXT:    mov v4.h[6], v2.h[0]
 ; CHECK-GI-NEXT:    mov v5.h[6], v3.h[0]
-; CHECK-GI-NEXT:    mov v4.h[7], v0.h[0]
-; CHECK-GI-NEXT:    mov v5.h[7], v0.h[0]
 ; CHECK-GI-NEXT:    and v0.16b, v0.16b, v4.16b
 ; CHECK-GI-NEXT:    and v1.16b, v1.16b, v5.16b
 ; CHECK-GI-NEXT:    orr v0.16b, v0.16b, v1.16b
diff --git a/llvm/test/CodeGen/AArch64/fcvt.ll b/llvm/test/CodeGen/AArch64/fcvt.ll
index 3b8a22a052b83..1c761ea083028 100644
--- a/llvm/test/CodeGen/AArch64/fcvt.ll
+++ b/llvm/test/CodeGen/AArch64/fcvt.ll
@@ -163,21 +163,20 @@ define <7 x half> @ceil_v7f16(<7 x half> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: ceil_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    frintp v2.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    frintp v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v4.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
 ; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    frintp v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    frintp v1.4s, v2.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v4.h[0]
@@ -186,7 +185,6 @@ define <7 x half> @ceil_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[7], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: ceil_v7f16:
@@ -470,21 +468,20 @@ define <7 x half> @floor_v7f16(<7 x half> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: floor_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    frintm v2.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    frintm v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v4.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
 ; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    frintm v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    frintm v1.4s, v2.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v4.h[0]
@@ -493,7 +490,6 @@ define <7 x half> @floor_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[7], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: floor_v7f16:
@@ -777,21 +773,20 @@ define <7 x half> @nearbyint_v7f16(<7 x half> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: nearbyint_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    frinti v2.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    frinti v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v4.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
 ; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    frinti v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    frinti v1.4s, v2.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v4.h[0]
@@ -800,7 +795,6 @@ define <7 x half> @nearbyint_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[7], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: nearbyint_v7f16:
@@ -1084,21 +1078,20 @@ define <7 x half> @roundeven_v7f16(<7 x half> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: roundeven_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    frintn v2.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    frintn v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v4.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
 ; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    frintn v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    frintn v1.4s, v2.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v4.h[0]
@@ -1107,7 +1100,6 @@ define <7 x half> @roundeven_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[7], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: roundeven_v7f16:
@@ -1391,21 +1383,20 @@ define <7 x half> @rint_v7f16(<7 x half> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: rint_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    frintx v2.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    frintx v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v4.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
 ; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    frintx v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    frintx v1.4s, v2.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v4.h[0]
@@ -1414,7 +1405,6 @@ define <7 x half> @rint_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[7], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: rint_v7f16:
@@ -1698,21 +1688,20 @@ define <7 x half> @round_v7f16(<7 x half> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: round_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    frinta v2.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    frinta v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v4.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
 ; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    frinta v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    frinta v1.4s, v2.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v4.h[0]
@@ -1721,7 +1710,6 @@ define <7 x half> @round_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[7], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: round_v7f16:
@@ -2005,21 +1993,20 @@ define <7 x half> @trunc_v7f16(<7 x half> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: trunc_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    frintz v2.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    frintz v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v4.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
 ; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    frintz v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    frintz v1.4s, v2.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v4.h[0]
@@ -2028,7 +2015,6 @@ define <7 x half> @trunc_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[7], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: trunc_v7f16:
diff --git a/llvm/test/CodeGen/AArch64/fdiv.ll b/llvm/test/CodeGen/AArch64/fdiv.ll
index e73124fbb595b..d73a5dc73eefc 100644
--- a/llvm/test/CodeGen/AArch64/fdiv.ll
+++ b/llvm/test/CodeGen/AArch64/fdiv.ll
@@ -186,25 +186,23 @@ define <7 x half> @fdiv_v7f16(<7 x half> %a, <7 x half> %b) {
 ;
 ; CHECK-GI-NOFP16-LABEL: fdiv_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h4, v1.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h5, v1.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v6.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v7.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov h6, v1.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h7, v1.h[5]
 ; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[6]
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    fdiv v2.4s, v2.4s, v3.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v5.h[0]
-; CHECK-GI-NOFP16-NEXT:    fdiv v3.4s, v6.4s, v7.4s
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v4.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v3.4s
-; CHECK-GI-NOFP16-NEXT:    fdiv v1.4s, v1.4s, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v6.h[1], v7.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v6.h[2], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v6.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT:    fdiv v1.4s, v1.4s, v3.4s
 ; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[3]
@@ -217,7 +215,6 @@ define <7 x half> @fdiv_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[7], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: fdiv_v7f16:
diff --git a/llvm/test/CodeGen/AArch64/fexplog.ll b/llvm/test/CodeGen/AArch64/fexplog.ll
index e3c0ced79f07a..519a2978d8604 100644
--- a/llvm/test/CodeGen/AArch64/fexplog.ll
+++ b/llvm/test/CodeGen/AArch64/fexplog.ll
@@ -332,7 +332,6 @@ define <3 x float> @exp_v3f32(<3 x float> %a) {
 ; CHECK-GI-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.s[1], v2.s[0]
 ; CHECK-GI-NEXT:    mov v1.s[2], v0.s[0]
-; CHECK-GI-NEXT:    mov v1.s[3], v0.s[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    add sp, sp, #64
 ; CHECK-GI-NEXT:    ret
@@ -703,7 +702,6 @@ define <7 x half> @exp_v7f16(<7 x half> %a) {
 ; CHECK-GI-NEXT:    mov v1.h[4], v3.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT:    mov v1.h[7], v0.h[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    add sp, sp, #160
 ; CHECK-GI-NEXT:    ret
@@ -1591,7 +1589,6 @@ define <3 x float> @exp2_v3f32(<3 x float> %a) {
 ; CHECK-GI-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.s[1], v2.s[0]
 ; CHECK-GI-NEXT:    mov v1.s[2], v0.s[0]
-; CHECK-GI-NEXT:    mov v1.s[3], v0.s[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    add sp, sp, #64
 ; CHECK-GI-NEXT:    ret
@@ -1962,7 +1959,6 @@ define <7 x half> @exp2_v7f16(<7 x half> %a) {
 ; CHECK-GI-NEXT:    mov v1.h[4], v3.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT:    mov v1.h[7], v0.h[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    add sp, sp, #160
 ; CHECK-GI-NEXT:    ret
@@ -2850,7 +2846,6 @@ define <3 x float> @log_v3f32(<3 x float> %a) {
 ; CHECK-GI-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.s[1], v2.s[0]
 ; CHECK-GI-NEXT:    mov v1.s[2], v0.s[0]
-; CHECK-GI-NEXT:    mov v1.s[3], v0.s[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    add sp, sp, #64
 ; CHECK-GI-NEXT:    ret
@@ -3221,7 +3216,6 @@ define <7 x half> @log_v7f16(<7 x half> %a) {
 ; CHECK-GI-NEXT:    mov v1.h[4], v3.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT:    mov v1.h[7], v0.h[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    add sp, sp, #160
 ; CHECK-GI-NEXT:    ret
@@ -4109,7 +4103,6 @@ define <3 x float> @log2_v3f32(<3 x float> %a) {
 ; CHECK-GI-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.s[1], v2.s[0]
 ; CHECK-GI-NEXT:    mov v1.s[2], v0.s[0]
-; CHECK-GI-NEXT:    mov v1.s[3], v0.s[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    add sp, sp, #64
 ; CHECK-GI-NEXT:    ret
@@ -4480,7 +4473,6 @@ define <7 x half> @log2_v7f16(<7 x half> %a) {
 ; CHECK-GI-NEXT:    mov v1.h[4], v3.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT:    mov v1.h[7], v0.h[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    add sp, sp, #160
 ; CHECK-GI-NEXT:    ret
@@ -5368,7 +5360,6 @@ define <3 x float> @log10_v3f32(<3 x float> %a) {
 ; CHECK-GI-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.s[1], v2.s[0]
 ; CHECK-GI-NEXT:    mov v1.s[2], v0.s[0]
-; CHECK-GI-NEXT:    mov v1.s[3], v0.s[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    add sp, sp, #64
 ; CHECK-GI-NEXT:    ret
@@ -5739,7 +5730,6 @@ define <7 x half> @log10_v7f16(<7 x half> %a) {
 ; CHECK-GI-NEXT:    mov v1.h[4], v3.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT:    mov v1.h[7], v0.h[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    add sp, sp, #160
 ; CHECK-GI-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/fminimummaximum.ll b/llvm/test/CodeGen/AArch64/fminimummaximum.ll
index f0e946c139987..357d91960624b 100644
--- a/llvm/test/CodeGen/AArch64/fminimummaximum.ll
+++ b/llvm/test/CodeGen/AArch64/fminimummaximum.ll
@@ -334,41 +334,39 @@ define <7 x float> @min_v7f32(<7 x float> %a, <7 x float> %b) {
 ;
 ; CHECK-GI-LABEL: min_v7f32:
 ; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ldr s16, [sp]
 ; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-GI-NEXT:    // kill: def $s7 killed $s7 def $q7
 ; CHECK-GI-NEXT:    // kill: def $s1 killed $s1 def $q1
-; CHECK-GI-NEXT:    ldr s16, [sp]
-; CHECK-GI-NEXT:    ldr s17, [sp, #24]
+; CHECK-GI-NEXT:    ldr s17, [sp, #32]
 ; CHECK-GI-NEXT:    // kill: def $s4 killed $s4 def $q4
-; CHECK-GI-NEXT:    // kill: def $s7 killed $s7 def $q7
 ; CHECK-GI-NEXT:    // kill: def $s2 killed $s2 def $q2
 ; CHECK-GI-NEXT:    // kill: def $s5 killed $s5 def $q5
-; CHECK-GI-NEXT:    // kill: def $s6 killed $s6 def $q6
 ; CHECK-GI-NEXT:    // kill: def $s3 killed $s3 def $q3
-; CHECK-GI-NEXT:    ldr s18, [sp, #32]
+; CHECK-GI-NEXT:    // kill: def $s6 killed $s6 def $q6
 ; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT:    mov v4.s[1], v5.s[0]
 ; CHECK-GI-NEXT:    ldr s1, [sp, #8]
+; CHECK-GI-NEXT:    mov v4.s[1], v5.s[0]
 ; CHECK-GI-NEXT:    mov v7.s[1], v16.s[0]
-; CHECK-GI-NEXT:    mov v17.s[1], v18.s[0]
-; CHECK-GI-NEXT:    ldr s5, [sp, #40]
+; CHECK-GI-NEXT:    ldr s16, [sp, #24]
+; CHECK-GI-NEXT:    mov v16.s[1], v17.s[0]
 ; CHECK-GI-NEXT:    mov v0.s[2], v2.s[0]
+; CHECK-GI-NEXT:    ldr s2, [sp, #40]
 ; CHECK-GI-NEXT:    mov v4.s[2], v6.s[0]
 ; CHECK-GI-NEXT:    mov v7.s[2], v1.s[0]
-; CHECK-GI-NEXT:    mov v17.s[2], v5.s[0]
 ; CHECK-GI-NEXT:    ldr s1, [sp, #16]
+; CHECK-GI-NEXT:    mov v16.s[2], v2.s[0]
 ; CHECK-GI-NEXT:    mov v0.s[3], v3.s[0]
 ; CHECK-GI-NEXT:    mov v7.s[3], v1.s[0]
-; CHECK-GI-NEXT:    mov v4.s[3], v0.s[0]
-; CHECK-GI-NEXT:    mov v17.s[3], v0.s[0]
+; CHECK-GI-NEXT:    fmin v4.4s, v4.4s, v16.4s
 ; CHECK-GI-NEXT:    fmin v0.4s, v0.4s, v7.4s
-; CHECK-GI-NEXT:    fmin v4.4s, v4.4s, v17.4s
+; CHECK-GI-NEXT:    mov s5, v4.s[1]
+; CHECK-GI-NEXT:    mov s6, v4.s[2]
+; CHECK-GI-NEXT:    // kill: def $s4 killed $s4 killed $q4
 ; CHECK-GI-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NEXT:    mov s2, v0.s[2]
 ; CHECK-GI-NEXT:    mov s3, v0.s[3]
 ; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-GI-NEXT:    mov s5, v4.s[1]
-; CHECK-GI-NEXT:    mov s6, v4.s[2]
-; CHECK-GI-NEXT:    // kill: def $s4 killed $s4 killed $q4
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = call <7 x float> @llvm.minimum.v7f32(<7 x float> %a, <7 x float> %b)
@@ -415,41 +413,39 @@ define <7 x float> @max_v7f32(<7 x float> %a, <7 x float> %b) {
 ;
 ; CHECK-GI-LABEL: max_v7f32:
 ; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ldr s16, [sp]
 ; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-GI-NEXT:    // kill: def $s7 killed $s7 def $q7
 ; CHECK-GI-NEXT:    // kill: def $s1 killed $s1 def $q1
-; CHECK-GI-NEXT:    ldr s16, [sp]
-; CHECK-GI-NEXT:    ldr s17, [sp, #24]
+; CHECK-GI-NEXT:    ldr s17, [sp, #32]
 ; CHECK-GI-NEXT:    // kill: def $s4 killed $s4 def $q4
-; CHECK-GI-NEXT:    // kill: def $s7 killed $s7 def $q7
 ; CHECK-GI-NEXT:    // kill: def $s2 killed $s2 def $q2
 ; CHECK-GI-NEXT:    // kill: def $s5 killed $s5 def $q5
-; CHECK-GI-NEXT:    // kill: def $s6 killed $s6 def $q6
 ; CHECK-GI-NEXT:    // kill: def $s3 killed $s3 def $q3
-; CHECK-GI-NEXT:    ldr s18, [sp, #32]
+; CHECK-GI-NEXT:    // kill: def $s6 killed $s6 def $q6
 ; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT:    mov v4.s[1], v5.s[0]
 ; CHECK-GI-NEXT:    ldr s1, [sp, #8]
+; CHECK-GI-NEXT:    mov v4.s[1], v5.s[0]
 ; CHECK-GI-NEXT:    mov v7.s[1], v16.s[0]
-; CHECK-GI-NEXT:    mov v17.s[1], v18.s[0]
-; CHECK-GI-NEXT:    ldr s5, [sp, #40]
+; CHECK-GI-NEXT:    ldr s16, [sp, #24]
+; CHECK-GI-NEXT:    mov v16.s[1], v17.s[0]
 ; CHECK-GI-NEXT:    mov v0.s[2], v2.s[0]
+; CHECK-GI-NEXT:    ldr s2, [sp, #40]
 ; CHECK-GI-NEXT:    mov v4.s[2], v6.s[0]
 ; CHECK-GI-NEXT:    mov v7.s[2], v1.s[0]
-; CHECK-GI-NEXT:    mov v17.s[2], v5.s[0]
 ; CHECK-GI-NEXT:    ldr s1, [sp, #16]
+; CHECK-GI-NEXT:    mov v16.s[2], v2.s[0]
 ; CHECK-GI-NEXT:    mov v0.s[3], v3.s[0]
 ; CHECK-GI-NEXT:    mov v7.s[3], v1.s[0]
-; CHECK-GI-NEXT:    mov v4.s[3], v0.s[0]
-; CHECK-GI-NEXT:    mov v17.s[3], v0.s[0]
+; CHECK-GI-NEXT:    fmax v4.4s, v4.4s, v16.4s
 ; CHECK-GI-NEXT:    fmax v0.4s, v0.4s, v7.4s
-; CHECK-GI-NEXT:    fmax v4.4s, v4.4s, v17.4s
+; CHECK-GI-NEXT:    mov s5, v4.s[1]
+; CHECK-GI-NEXT:    mov s6, v4.s[2]
+; CHECK-GI-NEXT:    // kill: def $s4 killed $s4 killed $q4
 ; CHECK-GI-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NEXT:    mov s2, v0.s[2]
 ; CHECK-GI-NEXT:    mov s3, v0.s[3]
 ; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-GI-NEXT:    mov s5, v4.s[1]
-; CHECK-GI-NEXT:    mov s6, v4.s[2]
-; CHECK-GI-NEXT:    // kill: def $s4 killed $s4 killed $q4
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = call <7 x float> @llvm.maximum.v7f32(<7 x float> %a, <7 x float> %b)
@@ -666,26 +662,24 @@ define <7 x half> @min_v7f16(<7 x half> %a, <7 x half> %b) {
 ;
 ; CHECK-NOFP16-GI-LABEL: min_v7f16:
 ; CHECK-NOFP16-GI:       // %bb.0: // %entry
-; CHECK-NOFP16-GI-NEXT:    mov h2, v0.h[4]
-; CHECK-NOFP16-GI-NEXT:    mov h3, v0.h[5]
-; CHECK-NOFP16-GI-NEXT:    mov h4, v1.h[4]
-; CHECK-NOFP16-GI-NEXT:    mov h5, v1.h[5]
-; CHECK-NOFP16-GI-NEXT:    fcvtl v6.4s, v0.4h
-; CHECK-NOFP16-GI-NEXT:    fcvtl v7.4s, v1.4h
-; CHECK-NOFP16-GI-NEXT:    mov h0, v0.h[6]
+; CHECK-NOFP16-GI-NEXT:    fcvtl v2.4s, v0.4h
+; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v1.4h
+; CHECK-NOFP16-GI-NEXT:    mov h4, v0.h[4]
+; CHECK-NOFP16-GI-NEXT:    mov h5, v0.h[5]
+; CHECK-NOFP16-GI-NEXT:    mov h6, v1.h[4]
+; CHECK-NOFP16-GI-NEXT:    mov h7, v1.h[5]
 ; CHECK-NOFP16-GI-NEXT:    mov h1, v1.h[6]
-; CHECK-NOFP16-GI-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-NOFP16-GI-NEXT:    fmin v2.4s, v2.4s, v3.4s
+; CHECK-NOFP16-GI-NEXT:    mov h3, v0.h[6]
 ; CHECK-NOFP16-GI-NEXT:    mov v4.h[1], v5.h[0]
-; CHECK-NOFP16-GI-NEXT:    fmin v3.4s, v6.4s, v7.4s
-; CHECK-NOFP16-GI-NEXT:    mov v2.h[2], v0.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v4.h[2], v1.h[0]
-; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v3.4s
-; CHECK-NOFP16-GI-NEXT:    mov v2.h[3], v0.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v4.h[3], v0.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v6.h[1], v7.h[0]
+; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v2.4s
+; CHECK-NOFP16-GI-NEXT:    mov v4.h[2], v3.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v6.h[2], v1.h[0]
 ; CHECK-NOFP16-GI-NEXT:    mov h1, v0.h[1]
 ; CHECK-NOFP16-GI-NEXT:    mov h5, v0.h[3]
-; CHECK-NOFP16-GI-NEXT:    fcvtl v2.4s, v2.4h
-; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v4.4h
+; CHECK-NOFP16-GI-NEXT:    fcvtl v2.4s, v4.4h
+; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v6.4h
 ; CHECK-NOFP16-GI-NEXT:    mov h4, v0.h[2]
 ; CHECK-NOFP16-GI-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-NOFP16-GI-NEXT:    fmin v1.4s, v2.4s, v3.4s
@@ -697,7 +691,6 @@ define <7 x half> @min_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-NOFP16-GI-NEXT:    mov h1, v1.h[2]
 ; CHECK-NOFP16-GI-NEXT:    mov v0.h[5], v2.h[0]
 ; CHECK-NOFP16-GI-NEXT:    mov v0.h[6], v1.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[7], v0.h[0]
 ; CHECK-NOFP16-GI-NEXT:    ret
 ;
 ; CHECK-FP16-GI-LABEL: min_v7f16:
@@ -775,26 +768,24 @@ define <7 x half> @max_v7f16(<7 x half> %a, <7 x half> %b) {
 ;
 ; CHECK-NOFP16-GI-LABEL: max_v7f16:
 ; CHECK-NOFP16-GI:       // %bb.0: // %entry
-; CHECK-NOFP16-GI-NEXT:    mov h2, v0.h[4]
-; CHECK-NOFP16-GI-NEXT:    mov h3, v0.h[5]
-; CHECK-NOFP16-GI-NEXT:    mov h4, v1.h[4]
-; CHECK-NOFP16-GI-NEXT:    mov h5, v1.h[5]
-; CHECK-NOFP16-GI-NEXT:    fcvtl v6.4s, v0.4h
-; CHECK-NOFP16-GI-NEXT:    fcvtl v7.4s, v1.4h
-; CHECK-NOFP16-GI-NEXT:    mov h0, v0.h[6]
+; CHECK-NOFP16-GI-NEXT:    fcvtl v2.4s, v0.4h
+; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v1.4h
+; CHECK-NOFP16-GI-NEXT:    mov h4, v0.h[4]
+; CHECK-NOFP16-GI-NEXT:    mov h5, v0.h[5]
+; CHECK-NOFP16-GI-NEXT:    mov h6, v1.h[4]
+; CHECK-NOFP16-GI-NEXT:    mov h7, v1.h[5]
 ; CHECK-NOFP16-GI-NEXT:    mov h1, v1.h[6]
-; CHECK-NOFP16-GI-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-NOFP16-GI-NEXT:    fmax v2.4s, v2.4s, v3.4s
+; CHECK-NOFP16-GI-NEXT:    mov h3, v0.h[6]
 ; CHECK-NOFP16-GI-NEXT:    mov v4.h[1], v5.h[0]
-; CHECK-NOFP16-GI-NEXT:    fmax v3.4s, v6.4s, v7.4s
-; CHECK-NOFP16-GI-NEXT:    mov v2.h[2], v0.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v4.h[2], v1.h[0]
-; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v3.4s
-; CHECK-NOFP16-GI-NEXT:    mov v2.h[3], v0.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v4.h[3], v0.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v6.h[1], v7.h[0]
+; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v2.4s
+; CHECK-NOFP16-GI-NEXT:    mov v4.h[2], v3.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v6.h[2], v1.h[0]
 ; CHECK-NOFP16-GI-NEXT:    mov h1, v0.h[1]
 ; CHECK-NOFP16-GI-NEXT:    mov h5, v0.h[3]
-; CHECK-NOFP16-GI-NEXT:    fcvtl v2.4s, v2.4h
-; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v4.4h
+; CHECK-NOFP16-GI-NEXT:    fcvtl v2.4s, v4.4h
+; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v6.4h
 ; CHECK-NOFP16-GI-NEXT:    mov h4, v0.h[2]
 ; CHECK-NOFP16-GI-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-NOFP16-GI-NEXT:    fmax v1.4s, v2.4s, v3.4s
@@ -806,7 +797,6 @@ define <7 x half> @max_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-NOFP16-GI-NEXT:    mov h1, v1.h[2]
 ; CHECK-NOFP16-GI-NEXT:    mov v0.h[5], v2.h[0]
 ; CHECK-NOFP16-GI-NEXT:    mov v0.h[6], v1.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[7], v0.h[0]
 ; CHECK-NOFP16-GI-NEXT:    ret
 ;
 ; CHECK-FP16-GI-LABEL: max_v7f16:
diff --git a/llvm/test/CodeGen/AArch64/fminmax.ll b/llvm/test/CodeGen/AArch64/fminmax.ll
index cdf9973b49f46..61199f82615bb 100644
--- a/llvm/test/CodeGen/AArch64/fminmax.ll
+++ b/llvm/test/CodeGen/AArch64/fminmax.ll
@@ -334,41 +334,39 @@ define <7 x float> @min_v7f32(<7 x float> %a, <7 x float> %b) {
 ;
 ; CHECK-GI-LABEL: min_v7f32:
 ; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ldr s16, [sp]
 ; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-GI-NEXT:    // kill: def $s7 killed $s7 def $q7
 ; CHECK-GI-NEXT:    // kill: def $s1 killed $s1 def $q1
-; CHECK-GI-NEXT:    ldr s16, [sp]
-; CHECK-GI-NEXT:    ldr s17, [sp, #24]
+; CHECK-GI-NEXT:    ldr s17, [sp, #32]
 ; CHECK-GI-NEXT:    // kill: def $s4 killed $s4 def $q4
-; CHECK-GI-NEXT:    // kill: def $s7 killed $s7 def $q7
 ; CHECK-GI-NEXT:    // kill: def $s2 killed $s2 def $q2
 ; CHECK-GI-NEXT:    // kill: def $s5 killed $s5 def $q5
-; CHECK-GI-NEXT:    // kill: def $s6 killed $s6 def $q6
 ; CHECK-GI-NEXT:    // kill: def $s3 killed $s3 def $q3
-; CHECK-GI-NEXT:    ldr s18, [sp, #32]
+; CHECK-GI-NEXT:    // kill: def $s6 killed $s6 def $q6
 ; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT:    mov v4.s[1], v5.s[0]
 ; CHECK-GI-NEXT:    ldr s1, [sp, #8]
+; CHECK-GI-NEXT:    mov v4.s[1], v5.s[0]
 ; CHECK-GI-NEXT:    mov v7.s[1], v16.s[0]
-; CHECK-GI-NEXT:    mov v17.s[1], v18.s[0]
-; CHECK-GI-NEXT:    ldr s5, [sp, #40]
+; CHECK-GI-NEXT:    ldr s16, [sp, #24]
+; CHECK-GI-NEXT:    mov v16.s[1], v17.s[0]
 ; CHECK-GI-NEXT:    mov v0.s[2], v2.s[0]
+; CHECK-GI-NEXT:    ldr s2, [sp, #40]
 ; CHECK-GI-NEXT:    mov v4.s[2], v6.s[0]
 ; CHECK-GI-NEXT:    mov v7.s[2], v1.s[0]
-; CHECK-GI-NEXT:    mov v17.s[2], v5.s[0]
 ; CHECK-GI-NEXT:    ldr s1, [sp, #16]
+; CHECK-GI-NEXT:    mov v16.s[2], v2.s[0]
 ; CHECK-GI-NEXT:    mov v0.s[3], v3.s[0]
 ; CHECK-GI-NEXT:    mov v7.s[3], v1.s[0]
-; CHECK-GI-NEXT:    mov v4.s[3], v0.s[0]
-; CHECK-GI-NEXT:    mov v17.s[3], v0.s[0]
+; CHECK-GI-NEXT:    fminnm v4.4s, v4.4s, v16.4s
 ; CHECK-GI-NEXT:    fminnm v0.4s, v0.4s, v7.4s
-; CHECK-GI-NEXT:    fminnm v4.4s, v4.4s, v17.4s
+; CHECK-GI-NEXT:    mov s5, v4.s[1]
+; CHECK-GI-NEXT:    mov s6, v4.s[2]
+; CHECK-GI-NEXT:    // kill: def $s4 killed $s4 killed $q4
 ; CHECK-GI-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NEXT:    mov s2, v0.s[2]
 ; CHECK-GI-NEXT:    mov s3, v0.s[3]
 ; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-GI-NEXT:    mov s5, v4.s[1]
-; CHECK-GI-NEXT:    mov s6, v4.s[2]
-; CHECK-GI-NEXT:    // kill: def $s4 killed $s4 killed $q4
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = call <7 x float> @llvm.minnum.v7f32(<7 x float> %a, <7 x float> %b)
@@ -415,41 +413,39 @@ define <7 x float> @max_v7f32(<7 x float> %a, <7 x float> %b) {
 ;
 ; CHECK-GI-LABEL: max_v7f32:
 ; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ldr s16, [sp]
 ; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-GI-NEXT:    // kill: def $s7 killed $s7 def $q7
 ; CHECK-GI-NEXT:    // kill: def $s1 killed $s1 def $q1
-; CHECK-GI-NEXT:    ldr s16, [sp]
-; CHECK-GI-NEXT:    ldr s17, [sp, #24]
+; CHECK-GI-NEXT:    ldr s17, [sp, #32]
 ; CHECK-GI-NEXT:    // kill: def $s4 killed $s4 def $q4
-; CHECK-GI-NEXT:    // kill: def $s7 killed $s7 def $q7
 ; CHECK-GI-NEXT:    // kill: def $s2 killed $s2 def $q2
 ; CHECK-GI-NEXT:    // kill: def $s5 killed $s5 def $q5
-; CHECK-GI-NEXT:    // kill: def $s6 killed $s6 def $q6
 ; CHECK-GI-NEXT:    // kill: def $s3 killed $s3 def $q3
-; CHECK-GI-NEXT:    ldr s18, [sp, #32]
+; CHECK-GI-NEXT:    // kill: def $s6 killed $s6 def $q6
 ; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT:    mov v4.s[1], v5.s[0]
 ; CHECK-GI-NEXT:    ldr s1, [sp, #8]
+; CHECK-GI-NEXT:    mov v4.s[1], v5.s[0]
 ; CHECK-GI-NEXT:    mov v7.s[1], v16.s[0]
-; CHECK-GI-NEXT:    mov v17.s[1], v18.s[0]
-; CHECK-GI-NEXT:    ldr s5, [sp, #40]
+; CHECK-GI-NEXT:    ldr s16, [sp, #24]
+; CHECK-GI-NEXT:    mov v16.s[1], v17.s[0]
 ; CHECK-GI-NEXT:    mov v0.s[2], v2.s[0]
+; CHECK-GI-NEXT:    ldr s2, [sp, #40]
 ; CHECK-GI-NEXT:    mov v4.s[2], v6.s[0]
 ; CHECK-GI-NEXT:    mov v7.s[2], v1.s[0]
-; CHECK-GI-NEXT:    mov v17.s[2], v5.s[0]
 ; CHECK-GI-NEXT:    ldr s1, [sp, #16]
+; CHECK-GI-NEXT:    mov v16.s[2], v2.s[0]
 ; CHECK-GI-NEXT:    mov v0.s[3], v3.s[0]
 ; CHECK-GI-NEXT:    mov v7.s[3], v1.s[0]
-; CHECK-GI-NEXT:    mov v4.s[3], v0.s[0]
-; CHECK-GI-NEXT:    mov v17.s[3], v0.s[0]
+; CHECK-GI-NEXT:    fmaxnm v4.4s, v4.4s, v16.4s
 ; CHECK-GI-NEXT:    fmaxnm v0.4s, v0.4s, v7.4s
-; CHECK-GI-NEXT:    fmaxnm v4.4s, v4.4s, v17.4s
+; CHECK-GI-NEXT:    mov s5, v4.s[1]
+; CHECK-GI-NEXT:    mov s6, v4.s[2]
+; CHECK-GI-NEXT:    // kill: def $s4 killed $s4 killed $q4
 ; CHECK-GI-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NEXT:    mov s2, v0.s[2]
 ; CHECK-GI-NEXT:    mov s3, v0.s[3]
 ; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-GI-NEXT:    mov s5, v4.s[1]
-; CHECK-GI-NEXT:    mov s6, v4.s[2]
-; CHECK-GI-NEXT:    // kill: def $s4 killed $s4 killed $q4
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = call <7 x float> @llvm.maxnum.v7f32(<7 x float> %a, <7 x float> %b)
@@ -666,26 +662,24 @@ define <7 x half> @min_v7f16(<7 x half> %a, <7 x half> %b) {
 ;
 ; CHECK-NOFP16-GI-LABEL: min_v7f16:
 ; CHECK-NOFP16-GI:       // %bb.0: // %entry
-; CHECK-NOFP16-GI-NEXT:    mov h2, v0.h[4]
-; CHECK-NOFP16-GI-NEXT:    mov h3, v0.h[5]
-; CHECK-NOFP16-GI-NEXT:    mov h4, v1.h[4]
-; CHECK-NOFP16-GI-NEXT:    mov h5, v1.h[5]
-; CHECK-NOFP16-GI-NEXT:    fcvtl v6.4s, v0.4h
-; CHECK-NOFP16-GI-NEXT:    fcvtl v7.4s, v1.4h
-; CHECK-NOFP16-GI-NEXT:    mov h0, v0.h[6]
+; CHECK-NOFP16-GI-NEXT:    fcvtl v2.4s, v0.4h
+; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v1.4h
+; CHECK-NOFP16-GI-NEXT:    mov h4, v0.h[4]
+; CHECK-NOFP16-GI-NEXT:    mov h5, v0.h[5]
+; CHECK-NOFP16-GI-NEXT:    mov h6, v1.h[4]
+; CHECK-NOFP16-GI-NEXT:    mov h7, v1.h[5]
 ; CHECK-NOFP16-GI-NEXT:    mov h1, v1.h[6]
-; CHECK-NOFP16-GI-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-NOFP16-GI-NEXT:    fminnm v2.4s, v2.4s, v3.4s
+; CHECK-NOFP16-GI-NEXT:    mov h3, v0.h[6]
 ; CHECK-NOFP16-GI-NEXT:    mov v4.h[1], v5.h[0]
-; CHECK-NOFP16-GI-NEXT:    fminnm v3.4s, v6.4s, v7.4s
-; CHECK-NOFP16-GI-NEXT:    mov v2.h[2], v0.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v4.h[2], v1.h[0]
-; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v3.4s
-; CHECK-NOFP16-GI-NEXT:    mov v2.h[3], v0.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v4.h[3], v0.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v6.h[1], v7.h[0]
+; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v2.4s
+; CHECK-NOFP16-GI-NEXT:    mov v4.h[2], v3.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v6.h[2], v1.h[0]
 ; CHECK-NOFP16-GI-NEXT:    mov h1, v0.h[1]
 ; CHECK-NOFP16-GI-NEXT:    mov h5, v0.h[3]
-; CHECK-NOFP16-GI-NEXT:    fcvtl v2.4s, v2.4h
-; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v4.4h
+; CHECK-NOFP16-GI-NEXT:    fcvtl v2.4s, v4.4h
+; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v6.4h
 ; CHECK-NOFP16-GI-NEXT:    mov h4, v0.h[2]
 ; CHECK-NOFP16-GI-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-NOFP16-GI-NEXT:    fminnm v1.4s, v2.4s, v3.4s
@@ -697,7 +691,6 @@ define <7 x half> @min_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-NOFP16-GI-NEXT:    mov h1, v1.h[2]
 ; CHECK-NOFP16-GI-NEXT:    mov v0.h[5], v2.h[0]
 ; CHECK-NOFP16-GI-NEXT:    mov v0.h[6], v1.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[7], v0.h[0]
 ; CHECK-NOFP16-GI-NEXT:    ret
 ;
 ; CHECK-FP16-GI-LABEL: min_v7f16:
@@ -775,26 +768,24 @@ define <7 x half> @max_v7f16(<7 x half> %a, <7 x half> %b) {
 ;
 ; CHECK-NOFP16-GI-LABEL: max_v7f16:
 ; CHECK-NOFP16-GI:       // %bb.0: // %entry
-; CHECK-NOFP16-GI-NEXT:    mov h2, v0.h[4]
-; CHECK-NOFP16-GI-NEXT:    mov h3, v0.h[5]
-; CHECK-NOFP16-GI-NEXT:    mov h4, v1.h[4]
-; CHECK-NOFP16-GI-NEXT:    mov h5, v1.h[5]
-; CHECK-NOFP16-GI-NEXT:    fcvtl v6.4s, v0.4h
-; CHECK-NOFP16-GI-NEXT:    fcvtl v7.4s, v1.4h
-; CHECK-NOFP16-GI-NEXT:    mov h0, v0.h[6]
+; CHECK-NOFP16-GI-NEXT:    fcvtl v2.4s, v0.4h
+; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v1.4h
+; CHECK-NOFP16-GI-NEXT:    mov h4, v0.h[4]
+; CHECK-NOFP16-GI-NEXT:    mov h5, v0.h[5]
+; CHECK-NOFP16-GI-NEXT:    mov h6, v1.h[4]
+; CHECK-NOFP16-GI-NEXT:    mov h7, v1.h[5]
 ; CHECK-NOFP16-GI-NEXT:    mov h1, v1.h[6]
-; CHECK-NOFP16-GI-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-NOFP16-GI-NEXT:    fmaxnm v2.4s, v2.4s, v3.4s
+; CHECK-NOFP16-GI-NEXT:    mov h3, v0.h[6]
 ; CHECK-NOFP16-GI-NEXT:    mov v4.h[1], v5.h[0]
-; CHECK-NOFP16-GI-NEXT:    fmaxnm v3.4s, v6.4s, v7.4s
-; CHECK-NOFP16-GI-NEXT:    mov v2.h[2], v0.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v4.h[2], v1.h[0]
-; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v3.4s
-; CHECK-NOFP16-GI-NEXT:    mov v2.h[3], v0.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v4.h[3], v0.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v6.h[1], v7.h[0]
+; CHECK-NOFP16-GI-NEXT:    fcvtn v0.4h, v2.4s
+; CHECK-NOFP16-GI-NEXT:    mov v4.h[2], v3.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v6.h[2], v1.h[0]
 ; CHECK-NOFP16-GI-NEXT:    mov h1, v0.h[1]
 ; CHECK-NOFP16-GI-NEXT:    mov h5, v0.h[3]
-; CHECK-NOFP16-GI-NEXT:    fcvtl v2.4s, v2.4h
-; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v4.4h
+; CHECK-NOFP16-GI-NEXT:    fcvtl v2.4s, v4.4h
+; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v6.4h
 ; CHECK-NOFP16-GI-NEXT:    mov h4, v0.h[2]
 ; CHECK-NOFP16-GI-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-NOFP16-GI-NEXT:    fmaxnm v1.4s, v2.4s, v3.4s
@@ -806,7 +797,6 @@ define <7 x half> @max_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-NOFP16-GI-NEXT:    mov h1, v1.h[2]
 ; CHECK-NOFP16-GI-NEXT:    mov v0.h[5], v2.h[0]
 ; CHECK-NOFP16-GI-NEXT:    mov v0.h[6], v1.h[0]
-; CHECK-NOFP16-GI-NEXT:    mov v0.h[7], v0.h[0]
 ; CHECK-NOFP16-GI-NEXT:    ret
 ;
 ; CHECK-FP16-GI-LABEL: max_v7f16:
diff --git a/llvm/test/CodeGen/AArch64/fmla.ll b/llvm/test/CodeGen/AArch64/fmla.ll
index 336c9705f399d..4b019b57d968d 100644
--- a/llvm/test/CodeGen/AArch64/fmla.ll
+++ b/llvm/test/CodeGen/AArch64/fmla.ll
@@ -254,35 +254,32 @@ define <7 x half> @fma_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) {
 ;
 ; CHECK-GI-NOFP16-LABEL: fma_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h6, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h4, v1.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h7, v1.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h5, v2.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h16, v2.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v17.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v18.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v19.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v4.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v5.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    mov h6, v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h7, v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov h16, v1.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h17, v1.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov h18, v2.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h19, v2.h[5]
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[6]
 ; CHECK-GI-NOFP16-NEXT:    mov h2, v2.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v6.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v7.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[1], v16.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmla v19.4s, v18.4s, v17.4s
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[2], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v19.4s
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[3], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    fmla v5.4s, v4.4s, v3.4s
+; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v6.h[1], v7.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v16.h[1], v17.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v18.h[1], v19.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v5.4s
+; CHECK-GI-NOFP16-NEXT:    mov v6.h[2], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v16.h[2], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v18.h[2], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
-; CHECK-GI-NOFP16-NEXT:    mov h6, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v3.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v4.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v4.4s, v5.4h
 ; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[2]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v6.4h
+; CHECK-GI-NOFP16-NEXT:    mov h6, v0.h[3]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v16.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v4.4s, v18.4h
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fmla v4.4s, v3.4s, v2.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v5.h[0]
@@ -293,7 +290,6 @@ define <7 x half> @fma_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) {
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[7], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: fma_v7f16:
@@ -866,43 +862,40 @@ define <7 x half> @fmuladd_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) {
 ;
 ; CHECK-GI-NOFP16-LABEL: fmuladd_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h5, v1.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h6, v1.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v7.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v16.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v4.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h6, v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov h7, v1.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h16, v1.h[5]
 ; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[6]
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v4.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[1], v6.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmul v4.4s, v7.4s, v16.4s
-; CHECK-GI-NOFP16-NEXT:    fcvtl v6.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[2], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v4.4s
-; CHECK-GI-NOFP16-NEXT:    mov h1, v2.h[4]
+; CHECK-GI-NOFP16-NEXT:    fmul v3.4s, v3.4s, v4.4s
 ; CHECK-GI-NOFP16-NEXT:    mov h4, v2.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v2.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v4.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v3.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v4.4s, v5.4h
-; CHECK-GI-NOFP16-NEXT:    fadd v0.4s, v0.4s, v6.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmul v2.4s, v3.4s, v4.4s
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[1], v6.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v7.h[1], v16.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v3.4h, v3.4s
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[2], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v7.h[2], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v3.4h
+; CHECK-GI-NOFP16-NEXT:    mov h3, v2.h[4]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v5.4s, v5.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v6.4s, v7.4h
+; CHECK-GI-NOFP16-NEXT:    fadd v0.4s, v0.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov h1, v2.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v4.h[0]
+; CHECK-GI-NOFP16-NEXT:    fmul v2.4s, v5.4s, v6.4s
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v3.4h
 ; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v3.h[0]
-; CHECK-GI-NOFP16-NEXT:    fadd v1.4s, v2.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    fadd v1.4s, v2.4s, v3.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v4.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v5.h[0]
@@ -911,7 +904,6 @@ define <7 x half> @fmuladd_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) {
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[7], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: fmuladd_v7f16:
@@ -1368,43 +1360,40 @@ define <7 x half> @fmul_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) {
 ;
 ; CHECK-GI-NOFP16-LABEL: fmul_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h5, v1.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h6, v1.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v7.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v16.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v4.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h6, v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov h7, v1.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h16, v1.h[5]
 ; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[6]
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v4.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[1], v6.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmul v4.4s, v7.4s, v16.4s
-; CHECK-GI-NOFP16-NEXT:    fcvtl v6.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[2], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v4.4s
-; CHECK-GI-NOFP16-NEXT:    mov h1, v2.h[4]
+; CHECK-GI-NOFP16-NEXT:    fmul v3.4s, v3.4s, v4.4s
 ; CHECK-GI-NOFP16-NEXT:    mov h4, v2.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v2.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v3.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v5.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v4.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v3.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v4.4s, v5.4h
-; CHECK-GI-NOFP16-NEXT:    fadd v0.4s, v0.4s, v6.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmul v2.4s, v3.4s, v4.4s
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[1], v6.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v7.h[1], v16.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v3.4h, v3.4s
+; CHECK-GI-NOFP16-NEXT:    mov v5.h[2], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v7.h[2], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v3.4h
+; CHECK-GI-NOFP16-NEXT:    mov h3, v2.h[4]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v5.4s, v5.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v6.4s, v7.4h
+; CHECK-GI-NOFP16-NEXT:    fadd v0.4s, v0.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov h1, v2.h[6]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[1], v4.h[0]
+; CHECK-GI-NOFP16-NEXT:    fmul v2.4s, v5.4s, v6.4s
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v3.4h
 ; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[2]
-; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v3.h[0]
-; CHECK-GI-NOFP16-NEXT:    fadd v1.4s, v2.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[3]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    fadd v1.4s, v2.4s, v3.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v4.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v5.h[0]
@@ -1413,7 +1402,6 @@ define <7 x half> @fmul_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) {
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[7], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: fmul_v7f16:
diff --git a/llvm/test/CodeGen/AArch64/fmul.ll b/llvm/test/CodeGen/AArch64/fmul.ll
index 1f49601a18272..1f41f2385c335 100644
--- a/llvm/test/CodeGen/AArch64/fmul.ll
+++ b/llvm/test/CodeGen/AArch64/fmul.ll
@@ -186,26 +186,24 @@ define <7 x half> @fmul_v7f16(<7 x half> %a, <7 x half> %b) {
 ;
 ; CHECK-GI-NOFP16-LABEL: fmul_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    mov h4, v1.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h5, v1.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v6.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v7.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov h6, v1.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h7, v1.h[5]
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    fmul v2.4s, v2.4s, v3.4s
+; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[6]
 ; CHECK-GI-NOFP16-NEXT:    mov v4.h[1], v5.h[0]
-; CHECK-GI-NOFP16-NEXT:    fmul v3.4s, v6.4s, v7.4s
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov v2.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v4.h[3], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v6.h[1], v7.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v2.4s
+; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v6.h[2], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov h5, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v4.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v6.4h
 ; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fmul v1.4s, v2.4s, v3.4s
@@ -217,7 +215,6 @@ define <7 x half> @fmul_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[7], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: fmul_v7f16:
diff --git a/llvm/test/CodeGen/AArch64/fneg.ll b/llvm/test/CodeGen/AArch64/fneg.ll
index d5010cf360841..cc0f7d2fd6075 100644
--- a/llvm/test/CodeGen/AArch64/fneg.ll
+++ b/llvm/test/CodeGen/AArch64/fneg.ll
@@ -161,21 +161,20 @@ define <7 x half> @fabs_v7f16(<7 x half> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: fabs_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    fneg v2.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v2.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[5]
+; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[6]
+; CHECK-GI-NOFP16-NEXT:    fneg v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v4.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
 ; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[3]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    fneg v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    fneg v1.4s, v2.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v3.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v4.h[0]
@@ -184,7 +183,6 @@ define <7 x half> @fabs_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[7], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: fabs_v7f16:
diff --git a/llvm/test/CodeGen/AArch64/fpext.ll b/llvm/test/CodeGen/AArch64/fpext.ll
index 86f7322f7c4ee..24a2451df4842 100644
--- a/llvm/test/CodeGen/AArch64/fpext.ll
+++ b/llvm/test/CodeGen/AArch64/fpext.ll
@@ -168,8 +168,6 @@ define <2 x float> @fpext_v2f16_v2f32(<2 x half> %a) {
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NEXT:    fcvtl v0.4s, v0.4h
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/fpow.ll b/llvm/test/CodeGen/AArch64/fpow.ll
index 1dd5450c271cb..c2ad1aafd65fc 100644
--- a/llvm/test/CodeGen/AArch64/fpow.ll
+++ b/llvm/test/CodeGen/AArch64/fpow.ll
@@ -395,7 +395,6 @@ define <3 x float> @pow_v3f32(<3 x float> %a, <3 x float> %b) {
 ; CHECK-GI-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.s[1], v2.s[0]
 ; CHECK-GI-NEXT:    mov v1.s[2], v0.s[0]
-; CHECK-GI-NEXT:    mov v1.s[3], v0.s[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    add sp, sp, #80
 ; CHECK-GI-NEXT:    ret
@@ -856,7 +855,6 @@ define <7 x half> @pow_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-GI-NEXT:    ldr q2, [sp, #80] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT:    mov v1.h[7], v0.h[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    add sp, sp, #176
 ; CHECK-GI-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/fpowi.ll b/llvm/test/CodeGen/AArch64/fpowi.ll
index b496c7d15eef3..5dbcaa4a5fda1 100644
--- a/llvm/test/CodeGen/AArch64/fpowi.ll
+++ b/llvm/test/CodeGen/AArch64/fpowi.ll
@@ -370,7 +370,6 @@ define <3 x float> @powi_v3f32(<3 x float> %a, i32 %b) {
 ; CHECK-GI-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.s[1], v2.s[0]
 ; CHECK-GI-NEXT:    mov v1.s[2], v0.s[0]
-; CHECK-GI-NEXT:    mov v1.s[3], v0.s[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    add sp, sp, #64
 ; CHECK-GI-NEXT:    ret
@@ -787,7 +786,6 @@ define <7 x half> @powi_v7f16(<7 x half> %a, i32 %b) {
 ; CHECK-GI-NEXT:    mov v1.h[4], v3.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT:    mov v1.h[7], v0.h[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    add sp, sp, #160
 ; CHECK-GI-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/fptoi.ll b/llvm/test/CodeGen/AArch64/fptoi.ll
index facb89671056f..67190e8596c46 100644
--- a/llvm/test/CodeGen/AArch64/fptoi.ll
+++ b/llvm/test/CodeGen/AArch64/fptoi.ll
@@ -2708,7 +2708,6 @@ define <3 x i16> @fptos_v3f32_v3i16(<3 x float> %a) {
 ; CHECK-GI-NEXT:    mov s2, v0.s[2]
 ; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-NEXT:    mov v0.h[2], v2.h[0]
-; CHECK-GI-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -2730,7 +2729,6 @@ define <3 x i16> @fptou_v3f32_v3i16(<3 x float> %a) {
 ; CHECK-GI-NEXT:    mov s2, v0.s[2]
 ; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-NEXT:    mov v0.h[2], v2.h[0]
-; CHECK-GI-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -3243,8 +3241,6 @@ define <2 x i64> @fptos_v2f16_v2i64(<2 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v0.2d, v0.2s
 ; CHECK-GI-NOFP16-NEXT:    fcvtzs v0.2d, v0.2d
@@ -3292,8 +3288,6 @@ define <2 x i64> @fptou_v2f16_v2i64(<2 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v0.2d, v0.2s
 ; CHECK-GI-NOFP16-NEXT:    fcvtzu v0.2d, v0.2d
@@ -4996,8 +4990,6 @@ define <2 x i32> @fptos_v2f16_v2i32(<2 x half> %a) {
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NEXT:    fcvtl v0.4s, v0.4h
 ; CHECK-GI-NEXT:    fcvtzs v0.2s, v0.2s
 ; CHECK-GI-NEXT:    ret
@@ -5019,8 +5011,6 @@ define <2 x i32> @fptou_v2f16_v2i32(<2 x half> %a) {
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NEXT:    fcvtl v0.4s, v0.4h
 ; CHECK-GI-NEXT:    fcvtzu v0.2s, v0.2s
 ; CHECK-GI-NEXT:    ret
@@ -5276,8 +5266,6 @@ define <2 x i16> @fptos_v2f16_v2i16(<2 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtzs v0.2s, v0.2s
 ; CHECK-GI-NOFP16-NEXT:    ret
@@ -5306,8 +5294,6 @@ define <2 x i16> @fptou_v2f16_v2i16(<2 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtzu v0.2s, v0.2s
 ; CHECK-GI-NOFP16-NEXT:    ret
@@ -5344,7 +5330,6 @@ define <3 x i16> @fptos_v3f16_v3i16(<3 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    mov s2, v0.s[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
@@ -5378,7 +5363,6 @@ define <3 x i16> @fptou_v3f16_v3i16(<3 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    mov s2, v0.s[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
@@ -5756,8 +5740,6 @@ define <2 x i8> @fptos_v2f16_v2i8(<2 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtzs v0.2s, v0.2s
 ; CHECK-GI-NOFP16-NEXT:    ret
@@ -5786,8 +5768,6 @@ define <2 x i8> @fptou_v2f16_v2i8(<2 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtzu v0.2s, v0.2s
 ; CHECK-GI-NOFP16-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/fptrunc.ll b/llvm/test/CodeGen/AArch64/fptrunc.ll
index 3efc98ab5fd53..9d0672d1c95ea 100644
--- a/llvm/test/CodeGen/AArch64/fptrunc.ll
+++ b/llvm/test/CodeGen/AArch64/fptrunc.ll
@@ -63,7 +63,6 @@ define <3 x float> @fptrunc_v3f64_v3f32(<3 x double> %a) {
 ; CHECK-GI-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
 ; CHECK-GI-NEXT:    mov v0.s[2], v2.s[0]
-; CHECK-GI-NEXT:    mov v0.s[3], v0.s[0]
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = fptrunc <3 x double> %a to <3 x float>
@@ -94,8 +93,6 @@ define <2 x half> @fptrunc_v2f64_v2f16(<2 x double> %a) {
 ; CHECK-GI-NEXT:    fcvt h0, d0
 ; CHECK-GI-NEXT:    fcvt h1, d1
 ; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -121,7 +118,6 @@ define <3 x half> @fptrunc_v3f64_v3f16(<3 x double> %a) {
 ; CHECK-GI-NEXT:    fcvt h2, d2
 ; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-NEXT:    mov v0.h[2], v2.h[0]
-; CHECK-GI-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -167,13 +163,9 @@ define <2 x half> @fptrunc_v2f32_v2f16(<2 x float> %a) {
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT:    mov v0.s[2], v0.s[0]
-; CHECK-GI-NEXT:    mov v0.s[3], v0.s[0]
 ; CHECK-GI-NEXT:    fcvtn v0.4h, v0.4s
 ; CHECK-GI-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/frem.ll b/llvm/test/CodeGen/AArch64/frem.ll
index 03caf0a33eb45..1a10fd2f1cdc3 100644
--- a/llvm/test/CodeGen/AArch64/frem.ll
+++ b/llvm/test/CodeGen/AArch64/frem.ll
@@ -397,7 +397,6 @@ define <3 x float> @frem_v3f32(<3 x float> %a, <3 x float> %b) {
 ; CHECK-GI-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.s[1], v2.s[0]
 ; CHECK-GI-NEXT:    mov v1.s[2], v0.s[0]
-; CHECK-GI-NEXT:    mov v1.s[3], v0.s[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    add sp, sp, #80
 ; CHECK-GI-NEXT:    ret
@@ -858,7 +857,6 @@ define <7 x half> @frem_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-GI-NEXT:    ldr q2, [sp, #80] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT:    mov v1.h[7], v0.h[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    add sp, sp, #176
 ; CHECK-GI-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/fsincos.ll b/llvm/test/CodeGen/AArch64/fsincos.ll
index 2c76d969d6efe..2ab1610edcc7f 100644
--- a/llvm/test/CodeGen/AArch64/fsincos.ll
+++ b/llvm/test/CodeGen/AArch64/fsincos.ll
@@ -332,7 +332,6 @@ define <3 x float> @sin_v3f32(<3 x float> %a) {
 ; CHECK-GI-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.s[1], v2.s[0]
 ; CHECK-GI-NEXT:    mov v1.s[2], v0.s[0]
-; CHECK-GI-NEXT:    mov v1.s[3], v0.s[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    add sp, sp, #64
 ; CHECK-GI-NEXT:    ret
@@ -703,7 +702,6 @@ define <7 x half> @sin_v7f16(<7 x half> %a) {
 ; CHECK-GI-NEXT:    mov v1.h[4], v3.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT:    mov v1.h[7], v0.h[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    add sp, sp, #160
 ; CHECK-GI-NEXT:    ret
@@ -1591,7 +1589,6 @@ define <3 x float> @cos_v3f32(<3 x float> %a) {
 ; CHECK-GI-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v1.s[1], v2.s[0]
 ; CHECK-GI-NEXT:    mov v1.s[2], v0.s[0]
-; CHECK-GI-NEXT:    mov v1.s[3], v0.s[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    add sp, sp, #64
 ; CHECK-GI-NEXT:    ret
@@ -1962,7 +1959,6 @@ define <7 x half> @cos_v7f16(<7 x half> %a) {
 ; CHECK-GI-NEXT:    mov v1.h[4], v3.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[5], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[6], v0.h[0]
-; CHECK-GI-NEXT:    mov v1.h[7], v0.h[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    add sp, sp, #160
 ; CHECK-GI-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/fsqrt.ll b/llvm/test/CodeGen/AArch64/fsqrt.ll
index 683544a69ebe1..4b48bcc5508db 100644
--- a/llvm/test/CodeGen/AArch64/fsqrt.ll
+++ b/llvm/test/CodeGen/AArch64/fsqrt.ll
@@ -195,17 +195,16 @@ define <7 x half> @sqrt_v7f16(<7 x half> %a) {
 ;
 ; CHECK-GI-NOFP16-LABEL: sqrt_v7f16:
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[4]
-; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[5]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[4]
+; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[5]
 ; CHECK-GI-NOFP16-NEXT:    mov h0, v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NOFP16-NEXT:    fsqrt v2.4s, v3.4s
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v1.h[3], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v2.4s
 ; CHECK-GI-NOFP16-NEXT:    fsqrt v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[0]
+; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
+; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
+; CHECK-GI-NOFP16-NEXT:    fsqrt v1.4s, v2.4s
 ; CHECK-GI-NOFP16-NEXT:    mov h2, v0.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov h3, v0.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov h4, v0.h[3]
@@ -218,7 +217,6 @@ define <7 x half> @sqrt_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v1.h[2]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[5], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[6], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[7], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: sqrt_v7f16:
diff --git a/llvm/test/CodeGen/AArch64/icmp.ll b/llvm/test/CodeGen/AArch64/icmp.ll
index 2e8c93a00a0d8..e7352fe03d01a 100644
--- a/llvm/test/CodeGen/AArch64/icmp.ll
+++ b/llvm/test/CodeGen/AArch64/icmp.ll
@@ -177,15 +177,13 @@ define <3 x i32> @v3i32_i32(<3 x i32> %a, <3 x i32> %b, <3 x i32> %d, <3 x i32>
 ; CHECK-GI-NEXT:    mov v4.s[1], w8
 ; CHECK-GI-NEXT:    mov v4.s[2], w8
 ; CHECK-GI-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-GI-NEXT:    fmov s5, w8
-; CHECK-GI-NEXT:    mov v5.s[1], w8
-; CHECK-GI-NEXT:    mov v4.s[3], w8
-; CHECK-GI-NEXT:    mov v5.s[2], w8
-; CHECK-GI-NEXT:    neg v1.4s, v4.4s
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    mov v1.s[1], w8
+; CHECK-GI-NEXT:    neg v5.4s, v4.4s
 ; CHECK-GI-NEXT:    ushl v0.4s, v0.4s, v4.4s
-; CHECK-GI-NEXT:    mov v5.s[3], w8
-; CHECK-GI-NEXT:    sshl v0.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT:    eor v1.16b, v0.16b, v5.16b
+; CHECK-GI-NEXT:    mov v1.s[2], w8
+; CHECK-GI-NEXT:    sshl v0.4s, v0.4s, v5.4s
+; CHECK-GI-NEXT:    eor v1.16b, v0.16b, v1.16b
 ; CHECK-GI-NEXT:    and v0.16b, v2.16b, v0.16b
 ; CHECK-GI-NEXT:    and v1.16b, v3.16b, v1.16b
 ; CHECK-GI-NEXT:    orr v0.16b, v0.16b, v1.16b
diff --git a/llvm/test/CodeGen/AArch64/insertextract.ll b/llvm/test/CodeGen/AArch64/insertextract.ll
index b0df5cb3d8371..5c2dd761bdc0d 100644
--- a/llvm/test/CodeGen/AArch64/insertextract.ll
+++ b/llvm/test/CodeGen/AArch64/insertextract.ll
@@ -233,7 +233,6 @@ define <3 x float> @insert_v3f32_0(<3 x float> %a, float %b, i32 %c) {
 ; CHECK-GI-NEXT:    mov s0, v0.s[2]
 ; CHECK-GI-NEXT:    mov v1.s[1], v2.s[0]
 ; CHECK-GI-NEXT:    mov v1.s[2], v0.s[0]
-; CHECK-GI-NEXT:    mov v1.s[3], v0.s[0]
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -254,7 +253,6 @@ define <3 x float> @insert_v3f32_2(<3 x float> %a, float %b, i32 %c) {
 ; CHECK-GI-NEXT:    // kill: def $s1 killed $s1 def $q1
 ; CHECK-GI-NEXT:    mov v0.s[1], v2.s[0]
 ; CHECK-GI-NEXT:    mov v0.s[2], v1.s[0]
-; CHECK-GI-NEXT:    mov v0.s[3], v0.s[0]
 ; CHECK-GI-NEXT:    ret
 entry:
   %d = insertelement <3 x float> %a, float %b, i32 2
@@ -766,7 +764,6 @@ define <3 x i32> @insert_v3i32_0(<3 x i32> %a, i32 %b, i32 %c) {
 ; CHECK-GI-NEXT:    mov v0.s[1], w8
 ; CHECK-GI-NEXT:    fmov w8, s2
 ; CHECK-GI-NEXT:    mov v0.s[2], w8
-; CHECK-GI-NEXT:    mov v0.s[3], w8
 ; CHECK-GI-NEXT:    ret
 entry:
   %d = insertelement <3 x i32> %a, i32 %b, i32 0
@@ -785,7 +782,6 @@ define <3 x i32> @insert_v3i32_2(<3 x i32> %a, i32 %b, i32 %c) {
 ; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
 ; CHECK-GI-NEXT:    fmov s1, w0
 ; CHECK-GI-NEXT:    mov v0.s[2], v1.s[0]
-; CHECK-GI-NEXT:    mov v0.s[3], v0.s[0]
 ; CHECK-GI-NEXT:    ret
 entry:
   %d = insertelement <3 x i32> %a, i32 %b, i32 2
diff --git a/llvm/test/CodeGen/AArch64/itofp.ll b/llvm/test/CodeGen/AArch64/itofp.ll
index 708bb43887f86..2164c2aad2011 100644
--- a/llvm/test/CodeGen/AArch64/itofp.ll
+++ b/llvm/test/CodeGen/AArch64/itofp.ll
@@ -2605,7 +2605,6 @@ define <3 x float> @stofp_v3i64_v3f32(<3 x i64> %a) {
 ; CHECK-GI-NEXT:    mov s2, v0.s[1]
 ; CHECK-GI-NEXT:    mov v0.s[1], v2.s[0]
 ; CHECK-GI-NEXT:    mov v0.s[2], v1.s[0]
-; CHECK-GI-NEXT:    mov v0.s[3], v0.s[0]
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = sitofp <3 x i64> %a to <3 x float>
@@ -2638,7 +2637,6 @@ define <3 x float> @utofp_v3i64_v3f32(<3 x i64> %a) {
 ; CHECK-GI-NEXT:    mov s2, v0.s[1]
 ; CHECK-GI-NEXT:    mov v0.s[1], v2.s[0]
 ; CHECK-GI-NEXT:    mov v0.s[2], v1.s[0]
-; CHECK-GI-NEXT:    mov v0.s[3], v0.s[0]
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = uitofp <3 x i64> %a to <3 x float>
@@ -3754,13 +3752,9 @@ define <2 x half> @stofp_v2i64_v2f16(<2 x i64> %a) {
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.2s, v0.2d
 ; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.s[2], v0.s[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.s[3], v0.s[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
@@ -3771,8 +3765,6 @@ define <2 x half> @stofp_v2i64_v2f16(<2 x i64> %a) {
 ; CHECK-GI-FP16-NEXT:    fcvt h0, d0
 ; CHECK-GI-FP16-NEXT:    fcvt h1, d1
 ; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
@@ -3809,13 +3801,9 @@ define <2 x half> @utofp_v2i64_v2f16(<2 x i64> %a) {
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.2s, v0.2d
 ; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.s[2], v0.s[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.s[3], v0.s[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
@@ -3826,8 +3814,6 @@ define <2 x half> @utofp_v2i64_v2f16(<2 x i64> %a) {
 ; CHECK-GI-FP16-NEXT:    fcvt h0, d0
 ; CHECK-GI-FP16-NEXT:    fcvt h1, d1
 ; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
@@ -3876,7 +3862,6 @@ define <3 x half> @stofp_v3i64_v3f16(<3 x i64> %a) {
 ; CHECK-GI-FP16-NEXT:    fcvt h1, d1
 ; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-FP16-NEXT:    mov v0.h[2], v2.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
@@ -3925,7 +3910,6 @@ define <3 x half> @utofp_v3i64_v3f16(<3 x i64> %a) {
 ; CHECK-GI-FP16-NEXT:    fcvt h1, d1
 ; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-FP16-NEXT:    mov v0.h[2], v2.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
@@ -4756,13 +4740,9 @@ define <2 x half> @stofp_v2i32_v2f16(<2 x i32> %a) {
 ; CHECK-GI-NEXT:    scvtf v0.2s, v0.2s
 ; CHECK-GI-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT:    mov v0.s[2], v0.s[0]
-; CHECK-GI-NEXT:    mov v0.s[3], v0.s[0]
 ; CHECK-GI-NEXT:    fcvtn v0.4h, v0.4s
 ; CHECK-GI-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -4783,13 +4763,9 @@ define <2 x half> @utofp_v2i32_v2f16(<2 x i32> %a) {
 ; CHECK-GI-NEXT:    ucvtf v0.2s, v0.2s
 ; CHECK-GI-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NEXT:    mov v0.s[2], v0.s[0]
-; CHECK-GI-NEXT:    mov v0.s[3], v0.s[0]
 ; CHECK-GI-NEXT:    fcvtn v0.4h, v0.4s
 ; CHECK-GI-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -4997,13 +4973,9 @@ define <2 x half> @stofp_v2i16_v2f16(<2 x i16> %a) {
 ; CHECK-GI-NOFP16-NEXT:    scvtf v0.2s, v0.2s
 ; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.s[2], v0.s[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.s[3], v0.s[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
@@ -5012,13 +4984,9 @@ define <2 x half> @stofp_v2i16_v2f16(<2 x i16> %a) {
 ; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-FP16-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-FP16-NEXT:    scvtf v0.4h, v0.4h
 ; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
@@ -5048,13 +5016,9 @@ define <2 x half> @utofp_v2i16_v2f16(<2 x i16> %a) {
 ; CHECK-GI-NOFP16-NEXT:    ucvtf v0.2s, v0.2s
 ; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.s[2], v0.s[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.s[3], v0.s[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
@@ -5063,13 +5027,9 @@ define <2 x half> @utofp_v2i16_v2f16(<2 x i16> %a) {
 ; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-FP16-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-FP16-NEXT:    ucvtf v0.4h, v0.4h
 ; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
@@ -5551,13 +5511,9 @@ define <2 x half> @stofp_v2i8_v2f16(<2 x i8> %a) {
 ; CHECK-GI-NOFP16-NEXT:    scvtf v0.2s, v0.2s
 ; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.s[2], v0.s[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.s[3], v0.s[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
@@ -5566,19 +5522,13 @@ define <2 x half> @stofp_v2i8_v2f16(<2 x i8> %a) {
 ; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-FP16-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-FP16-NEXT:    shl v0.4h, v0.4h, #8
 ; CHECK-GI-FP16-NEXT:    sshr v0.4h, v0.4h, #8
 ; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-FP16-NEXT:    scvtf v0.4h, v0.4h
 ; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
@@ -5622,13 +5572,9 @@ define <2 x half> @utofp_v2i8_v2f16(<2 x i8> %a) {
 ; CHECK-GI-NOFP16-NEXT:    ucvtf v0.2s, v0.2s
 ; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.s[1], v1.s[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.s[2], v0.s[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.s[3], v0.s[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-NOFP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
@@ -5638,13 +5584,9 @@ define <2 x half> @utofp_v2i8_v2f16(<2 x i8> %a) {
 ; CHECK-GI-FP16-NEXT:    and v0.8b, v0.8b, v1.8b
 ; CHECK-GI-FP16-NEXT:    mov s1, v0.s[1]
 ; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-FP16-NEXT:    ucvtf v0.4h, v0.4h
 ; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
 ; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-FP16-NEXT:    ret
 entry:
@@ -5694,7 +5636,6 @@ define <3 x half> @stofp_v3i8_v3f16(<3 x i8> %a) {
 ; CHECK-GI-FP16-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-FP16-NEXT:    fmov s1, w2
 ; CHECK-GI-FP16-NEXT:    mov v0.h[2], v1.h[0]
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-FP16-NEXT:    shl v0.4h, v0.4h, #8
 ; CHECK-GI-FP16-NEXT:    sshr v0.4h, v0.4h, #8
 ; CHECK-GI-FP16-NEXT:    scvtf v0.4h, v0.4h
@@ -5744,7 +5685,6 @@ define <3 x half> @utofp_v3i8_v3f16(<3 x i8> %a) {
 ; CHECK-GI-FP16-NEXT:    fmov s1, w2
 ; CHECK-GI-FP16-NEXT:    mov v0.h[2], v1.h[0]
 ; CHECK-GI-FP16-NEXT:    movi d1, #0xff00ff00ff00ff
-; CHECK-GI-FP16-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-FP16-NEXT:    and v0.8b, v0.8b, v1.8b
 ; CHECK-GI-FP16-NEXT:    ucvtf v0.4h, v0.4h
 ; CHECK-GI-FP16-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/llvm.exp10.ll b/llvm/test/CodeGen/AArch64/llvm.exp10.ll
index 70df88ba9f898..56f4272c4363c 100644
--- a/llvm/test/CodeGen/AArch64/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AArch64/llvm.exp10.ll
@@ -109,14 +109,11 @@ define <2 x half> @exp10_v2f16(<2 x half> %x) {
 ; GISEL-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; GISEL-NEXT:    fmov s0, s1
 ; GISEL-NEXT:    bl exp10f
-; GISEL-NEXT:    fcvt h0, s0
-; GISEL-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; GISEL-NEXT:    fcvt h1, s0
+; GISEL-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; GISEL-NEXT:    ldr x30, [sp, #24] // 8-byte Folded Reload
 ; GISEL-NEXT:    ldr d8, [sp, #16] // 8-byte Folded Reload
-; GISEL-NEXT:    mov v1.h[1], v0.h[0]
-; GISEL-NEXT:    mov v1.h[2], v0.h[0]
-; GISEL-NEXT:    mov v1.h[3], v0.h[0]
-; GISEL-NEXT:    mov v0.16b, v1.16b
+; GISEL-NEXT:    mov v0.h[1], v1.h[0]
 ; GISEL-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; GISEL-NEXT:    add sp, sp, #32
 ; GISEL-NEXT:    ret
@@ -196,7 +193,6 @@ define <3 x half> @exp10_v3f16(<3 x half> %x) {
 ; GISEL-NEXT:    ldr x30, [sp, #48] // 8-byte Folded Reload
 ; GISEL-NEXT:    mov v1.h[1], v2.h[0]
 ; GISEL-NEXT:    mov v1.h[2], v0.h[0]
-; GISEL-NEXT:    mov v1.h[3], v0.h[0]
 ; GISEL-NEXT:    mov v0.16b, v1.16b
 ; GISEL-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; GISEL-NEXT:    add sp, sp, #64
@@ -440,7 +436,6 @@ define <3 x float> @exp10_v3f32(<3 x float> %x) {
 ; GISEL-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
 ; GISEL-NEXT:    mov v1.s[1], v2.s[0]
 ; GISEL-NEXT:    mov v1.s[2], v0.s[0]
-; GISEL-NEXT:    mov v1.s[3], v0.s[0]
 ; GISEL-NEXT:    mov v0.16b, v1.16b
 ; GISEL-NEXT:    add sp, sp, #64
 ; GISEL-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/load.ll b/llvm/test/CodeGen/AArch64/load.ll
index 7f4540d915ab3..39143e5c53ffd 100644
--- a/llvm/test/CodeGen/AArch64/load.ll
+++ b/llvm/test/CodeGen/AArch64/load.ll
@@ -245,7 +245,6 @@ define <7 x i8> @load_v7i8(ptr %ptr){
 ; CHECK-GI-NEXT:    mov v0.b[5], v1.b[0]
 ; CHECK-GI-NEXT:    ldr b1, [x0, #6]
 ; CHECK-GI-NEXT:    mov v0.b[6], v1.b[0]
-; CHECK-GI-NEXT:    mov v0.b[7], v0.b[0]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
     %a = load <7 x i8>, ptr %ptr
@@ -265,7 +264,6 @@ define <3 x i16> @load_v3i16(ptr %ptr){
 ; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-NEXT:    ldr h1, [x0, #4]
 ; CHECK-GI-NEXT:    mov v0.h[2], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
     %a = load <3 x i16>, ptr %ptr
@@ -293,7 +291,6 @@ define <7 x i16> @load_v7i16(ptr %ptr){
 ; CHECK-GI-NEXT:    mov v0.h[5], v1.h[0]
 ; CHECK-GI-NEXT:    ldr h1, [x0, #12]
 ; CHECK-GI-NEXT:    mov v0.h[6], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[7], v0.h[0]
 ; CHECK-GI-NEXT:    ret
     %a = load <7 x i16>, ptr %ptr
     ret <7 x i16> %a
@@ -311,7 +308,6 @@ define <3 x i32> @load_v3i32(ptr %ptr){
 ; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
 ; CHECK-GI-NEXT:    ldr s1, [x0, #8]
 ; CHECK-GI-NEXT:    mov v0.s[2], v1.s[0]
-; CHECK-GI-NEXT:    mov v0.s[3], v0.s[0]
 ; CHECK-GI-NEXT:    ret
     %a = load <3 x i32>, ptr %ptr
     ret <3 x i32> %a
diff --git a/llvm/test/CodeGen/AArch64/sext.ll b/llvm/test/CodeGen/AArch64/sext.ll
index f319721e0f2f0..61f04fbf0484f 100644
--- a/llvm/test/CodeGen/AArch64/sext.ll
+++ b/llvm/test/CodeGen/AArch64/sext.ll
@@ -222,7 +222,6 @@ define <3 x i16> @sext_v3i8_v3i16(<3 x i8> %a) {
 ; CHECK-GI-NEXT:    fmov s0, w0
 ; CHECK-GI-NEXT:    mov v0.s[1], w1
 ; CHECK-GI-NEXT:    mov v0.s[2], w2
-; CHECK-GI-NEXT:    mov v0.s[3], w8
 ; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-GI-NEXT:    shl v0.4h, v0.4h, #8
 ; CHECK-GI-NEXT:    sshr v0.4h, v0.4h, #8
@@ -252,8 +251,6 @@ define <3 x i32> @sext_v3i8_v3i32(<3 x i8> %a) {
 ; CHECK-GI-NEXT:    mov v0.s[1], w8
 ; CHECK-GI-NEXT:    mov v1.s[2], w2
 ; CHECK-GI-NEXT:    mov v0.s[2], w8
-; CHECK-GI-NEXT:    mov v1.s[3], w8
-; CHECK-GI-NEXT:    mov v0.s[3], w8
 ; CHECK-GI-NEXT:    neg v2.4s, v0.4s
 ; CHECK-GI-NEXT:    ushl v0.4s, v1.4s, v0.4s
 ; CHECK-GI-NEXT:    sshl v0.4s, v0.4s, v2.4s
@@ -315,7 +312,6 @@ define <3 x i32> @sext_v3i16_v3i32(<3 x i16> %a) {
 ; CHECK-GI-NEXT:    smov w8, v0.h[2]
 ; CHECK-GI-NEXT:    mov v1.s[1], w9
 ; CHECK-GI-NEXT:    mov v1.s[2], w8
-; CHECK-GI-NEXT:    mov v1.s[3], w8
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -390,7 +386,6 @@ define <3 x i16> @sext_v3i10_v3i16(<3 x i10> %a) {
 ; CHECK-GI-NEXT:    fmov s0, w0
 ; CHECK-GI-NEXT:    mov v0.s[1], w1
 ; CHECK-GI-NEXT:    mov v0.s[2], w2
-; CHECK-GI-NEXT:    mov v0.s[3], w8
 ; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-GI-NEXT:    shl v0.4h, v0.4h, #6
 ; CHECK-GI-NEXT:    sshr v0.4h, v0.4h, #6
@@ -420,8 +415,6 @@ define <3 x i32> @sext_v3i10_v3i32(<3 x i10> %a) {
 ; CHECK-GI-NEXT:    mov v0.s[1], w8
 ; CHECK-GI-NEXT:    mov v1.s[2], w2
 ; CHECK-GI-NEXT:    mov v0.s[2], w8
-; CHECK-GI-NEXT:    mov v1.s[3], w8
-; CHECK-GI-NEXT:    mov v0.s[3], w8
 ; CHECK-GI-NEXT:    neg v2.4s, v0.4s
 ; CHECK-GI-NEXT:    ushl v0.4s, v1.4s, v0.4s
 ; CHECK-GI-NEXT:    sshl v0.4s, v0.4s, v2.4s
diff --git a/llvm/test/CodeGen/AArch64/shift.ll b/llvm/test/CodeGen/AArch64/shift.ll
index ccc06f2e1058d..5287839ee7b70 100644
--- a/llvm/test/CodeGen/AArch64/shift.ll
+++ b/llvm/test/CodeGen/AArch64/shift.ll
@@ -594,7 +594,6 @@ define <1 x i32> @shl_v1i32(<1 x i32> %0, <1 x i32> %1){
 ; CHECK-GI-NEXT:    fmov w9, s1
 ; CHECK-GI-NEXT:    lsl w8, w8, w9
 ; CHECK-GI-NEXT:    fmov s0, w8
-; CHECK-GI-NEXT:    mov v0.s[1], w8
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
     %3 = shl <1 x i32> %0, %1
@@ -697,7 +696,6 @@ define <1 x i32> @ashr_v1i32(<1 x i32> %0, <1 x i32> %1){
 ; CHECK-GI-NEXT:    fmov w9, s1
 ; CHECK-GI-NEXT:    asr w8, w8, w9
 ; CHECK-GI-NEXT:    fmov s0, w8
-; CHECK-GI-NEXT:    mov v0.s[1], w8
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
     %3 = ashr <1 x i32> %0, %1
@@ -790,7 +788,6 @@ define <1 x i32> @lshr_v1i32(<1 x i32> %0, <1 x i32> %1){
 ; CHECK-GI-NEXT:    fmov w9, s1
 ; CHECK-GI-NEXT:    lsr w8, w8, w9
 ; CHECK-GI-NEXT:    fmov s0, w8
-; CHECK-GI-NEXT:    mov v0.s[1], w8
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
     %3 = lshr <1 x i32> %0, %1
@@ -851,16 +848,6 @@ define <3 x i8> @shl_v3i8(<3 x i8> %0, <3 x i8> %1){
 ; CHECK-GI-NEXT:    fmov s3, w5
 ; CHECK-GI-NEXT:    mov v0.b[2], v1.b[0]
 ; CHECK-GI-NEXT:    mov v2.b[2], v3.b[0]
-; CHECK-GI-NEXT:    mov v0.b[3], v0.b[0]
-; CHECK-GI-NEXT:    mov v2.b[3], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[4], v0.b[0]
-; CHECK-GI-NEXT:    mov v2.b[4], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[5], v0.b[0]
-; CHECK-GI-NEXT:    mov v2.b[5], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[6], v0.b[0]
-; CHECK-GI-NEXT:    mov v2.b[6], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[7], v0.b[0]
-; CHECK-GI-NEXT:    mov v2.b[7], v0.b[0]
 ; CHECK-GI-NEXT:    ushl v0.8b, v0.8b, v2.8b
 ; CHECK-GI-NEXT:    umov w0, v0.b[0]
 ; CHECK-GI-NEXT:    umov w1, v0.b[1]
@@ -937,16 +924,6 @@ define <3 x i8> @ashr_v3i8(<3 x i8> %0, <3 x i8> %1){
 ; CHECK-GI-NEXT:    mov v0.b[2], v2.b[0]
 ; CHECK-GI-NEXT:    fmov s2, w2
 ; CHECK-GI-NEXT:    mov v1.b[2], v2.b[0]
-; CHECK-GI-NEXT:    mov v0.b[3], v0.b[0]
-; CHECK-GI-NEXT:    mov v1.b[3], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[4], v0.b[0]
-; CHECK-GI-NEXT:    mov v1.b[4], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[5], v0.b[0]
-; CHECK-GI-NEXT:    mov v1.b[5], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[6], v0.b[0]
-; CHECK-GI-NEXT:    mov v1.b[6], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[7], v0.b[0]
-; CHECK-GI-NEXT:    mov v1.b[7], v0.b[0]
 ; CHECK-GI-NEXT:    neg v0.8b, v0.8b
 ; CHECK-GI-NEXT:    sshl v0.8b, v1.8b, v0.8b
 ; CHECK-GI-NEXT:    umov w0, v0.b[0]
@@ -1027,16 +1004,6 @@ define <3 x i8> @lshr_v3i8(<3 x i8> %0, <3 x i8> %1){
 ; CHECK-GI-NEXT:    mov v0.b[2], v2.b[0]
 ; CHECK-GI-NEXT:    fmov s2, w2
 ; CHECK-GI-NEXT:    mov v1.b[2], v2.b[0]
-; CHECK-GI-NEXT:    mov v0.b[3], v0.b[0]
-; CHECK-GI-NEXT:    mov v1.b[3], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[4], v0.b[0]
-; CHECK-GI-NEXT:    mov v1.b[4], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[5], v0.b[0]
-; CHECK-GI-NEXT:    mov v1.b[5], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[6], v0.b[0]
-; CHECK-GI-NEXT:    mov v1.b[6], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[7], v0.b[0]
-; CHECK-GI-NEXT:    mov v1.b[7], v0.b[0]
 ; CHECK-GI-NEXT:    neg v0.8b, v0.8b
 ; CHECK-GI-NEXT:    ushl v0.8b, v1.8b, v0.8b
 ; CHECK-GI-NEXT:    umov w0, v0.b[0]
diff --git a/llvm/test/CodeGen/AArch64/shufflevector.ll b/llvm/test/CodeGen/AArch64/shufflevector.ll
index b408bc1c38976..d79f3ae11167f 100644
--- a/llvm/test/CodeGen/AArch64/shufflevector.ll
+++ b/llvm/test/CodeGen/AArch64/shufflevector.ll
@@ -210,8 +210,8 @@ define i32 @shufflevector_v4i8(<4 x i8> %a, <4 x i8> %b){
 ; CHECK-GI-LABEL: shufflevector_v4i8:
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    mov h2, v0.h[1]
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    mov h2, v0.h[1]
 ; CHECK-GI-NEXT:    mov h3, v1.h[1]
 ; CHECK-GI-NEXT:    adrp x8, .LCPI15_0
 ; CHECK-GI-NEXT:    mov h4, v0.h[2]
@@ -224,14 +224,6 @@ define i32 @shufflevector_v4i8(<4 x i8> %a, <4 x i8> %b){
 ; CHECK-GI-NEXT:    mov v1.b[2], v2.b[0]
 ; CHECK-GI-NEXT:    mov v0.b[3], v5.b[0]
 ; CHECK-GI-NEXT:    mov v1.b[3], v6.b[0]
-; CHECK-GI-NEXT:    mov v0.b[4], v0.b[0]
-; CHECK-GI-NEXT:    mov v1.b[4], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[5], v0.b[0]
-; CHECK-GI-NEXT:    mov v1.b[5], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[6], v0.b[0]
-; CHECK-GI-NEXT:    mov v1.b[6], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[7], v0.b[0]
-; CHECK-GI-NEXT:    mov v1.b[7], v0.b[0]
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI15_0]
 ; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b }, v1.16b
@@ -287,16 +279,12 @@ define i32 @shufflevector_v2i16(<2 x i16> %a, <2 x i16> %b){
 ; CHECK-GI-LABEL: shufflevector_v2i16:
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    mov s2, v0.s[1]
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    mov s2, v0.s[1]
 ; CHECK-GI-NEXT:    mov s3, v1.s[1]
 ; CHECK-GI-NEXT:    adrp x8, .LCPI17_0
 ; CHECK-GI-NEXT:    mov v0.h[1], v2.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[1], v3.h[0]
-; CHECK-GI-NEXT:    mov v0.h[2], v0.h[0]
-; CHECK-GI-NEXT:    mov v1.h[2], v0.h[0]
-; CHECK-GI-NEXT:    mov v0.h[3], v0.h[0]
-; CHECK-GI-NEXT:    mov v1.h[3], v0.h[0]
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI17_0]
 ; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b }, v1.16b
@@ -516,16 +504,6 @@ define <3 x i8> @shufflevector_v3i8(<3 x i8> %a, <3 x i8> %b) {
 ; CHECK-GI-NEXT:    mov v0.b[2], v1.b[0]
 ; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI30_0]
 ; CHECK-GI-NEXT:    mov v2.b[2], v3.b[0]
-; CHECK-GI-NEXT:    mov v0.b[3], v0.b[0]
-; CHECK-GI-NEXT:    mov v2.b[3], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[4], v0.b[0]
-; CHECK-GI-NEXT:    mov v2.b[4], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[5], v0.b[0]
-; CHECK-GI-NEXT:    mov v2.b[5], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[6], v0.b[0]
-; CHECK-GI-NEXT:    mov v2.b[6], v0.b[0]
-; CHECK-GI-NEXT:    mov v0.b[7], v0.b[0]
-; CHECK-GI-NEXT:    mov v2.b[7], v0.b[0]
 ; CHECK-GI-NEXT:    mov v0.d[1], v2.d[0]
 ; CHECK-GI-NEXT:    tbl v0.16b, { v0.16b }, v1.16b
 ; CHECK-GI-NEXT:    mov b1, v0.b[1]
diff --git a/llvm/test/CodeGen/AArch64/signed-truncation-check.ll b/llvm/test/CodeGen/AArch64/signed-truncation-check.ll
index ab42e6463feee..bb4df6d8935b1 100644
--- a/llvm/test/CodeGen/AArch64/signed-truncation-check.ll
+++ b/llvm/test/CodeGen/AArch64/signed-truncation-check.ll
@@ -396,7 +396,7 @@ define i1 @add_ultcmp_bad_i24_i8(i24 %x) nounwind {
 define i1 @add_ulecmp_bad_i16_i8(i16 %x) nounwind {
 ; CHECK-LABEL: add_ulecmp_bad_i16_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w0, #1
+; CHECK-NEXT:    mov w0, #1 // =0x1
 ; CHECK-NEXT:    ret
   %tmp0 = add i16 %x, 128 ; 1U << (8-1)
   %tmp1 = icmp ule i16 %tmp0, -1 ; when we +1 it, it will wrap to 0
diff --git a/llvm/test/CodeGen/AArch64/typepromotion-overflow.ll b/llvm/test/CodeGen/AArch64/typepromotion-overflow.ll
index ccfbf456693d7..39edc03ced442 100644
--- a/llvm/test/CodeGen/AArch64/typepromotion-overflow.ll
+++ b/llvm/test/CodeGen/AArch64/typepromotion-overflow.ll
@@ -246,9 +246,8 @@ define i32 @safe_sub_var_imm(ptr nocapture readonly %b) local_unnamed_addr #1 {
 ; CHECK-LABEL: safe_sub_var_imm:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    ldrb w8, [x0]
-; CHECK-NEXT:    add w8, w8, #8
-; CHECK-NEXT:    and w8, w8, #0xff
-; CHECK-NEXT:    cmp w8, #252
+; CHECK-NEXT:    sub w8, w8, #248
+; CHECK-NEXT:    cmn w8, #4
 ; CHECK-NEXT:    cset w0, hi
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/xtn.ll b/llvm/test/CodeGen/AArch64/xtn.ll
index 21982fadbe803..3c86f4bf9eb21 100644
--- a/llvm/test/CodeGen/AArch64/xtn.ll
+++ b/llvm/test/CodeGen/AArch64/xtn.ll
@@ -298,7 +298,6 @@ define <3 x i16> @xtn_v3i32_v3i16(<3 x i32> %a) {
 ; CHECK-GI-NEXT:    mov s2, v0.s[2]
 ; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-NEXT:    mov v0.h[2], v2.h[0]
-; CHECK-GI-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -327,7 +326,6 @@ define <3 x i16> @xtn_v3i64_v3i16(<3 x i64> %a) {
 ; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
 ; CHECK-GI-NEXT:    fmov s1, w8
 ; CHECK-GI-NEXT:    mov v0.h[2], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[3], v0.h[0]
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -353,7 +351,6 @@ define <3 x i32> @xtn_v3i64_v3i32(<3 x i64> %a) {
 ; CHECK-GI-NEXT:    fmov x8, d2
 ; CHECK-GI-NEXT:    mov v0.s[1], w9
 ; CHECK-GI-NEXT:    mov v0.s[2], w8
-; CHECK-GI-NEXT:    mov v0.s[3], w8
 ; CHECK-GI-NEXT:    ret
 entry:
   %arg1 = trunc <3 x i64> %a to <3 x i32>
diff --git a/llvm/test/CodeGen/AArch64/zext.ll b/llvm/test/CodeGen/AArch64/zext.ll
index e513340f5b18a..54b29be2132cd 100644
--- a/llvm/test/CodeGen/AArch64/zext.ll
+++ b/llvm/test/CodeGen/AArch64/zext.ll
@@ -249,10 +249,8 @@ define <3 x i16> @zext_v3i8_v3i16(<3 x i8> %a) {
 ; CHECK-GI-NEXT:    mov v2.16b, v1.16b
 ; CHECK-GI-NEXT:    mov v0.s[2], w2
 ; CHECK-GI-NEXT:    mov v2.h[1], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.s[3], w8
-; CHECK-GI-NEXT:    mov v2.h[2], v1.h[0]
 ; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
-; CHECK-GI-NEXT:    mov v2.h[3], v0.h[0]
+; CHECK-GI-NEXT:    mov v2.h[2], v1.h[0]
 ; CHECK-GI-NEXT:    and v0.8b, v0.8b, v2.8b
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -280,8 +278,6 @@ define <3 x i32> @zext_v3i8_v3i32(<3 x i8> %a) {
 ; CHECK-GI-NEXT:    mov v1.s[1], w8
 ; CHECK-GI-NEXT:    mov v0.s[2], w2
 ; CHECK-GI-NEXT:    mov v1.s[2], w8
-; CHECK-GI-NEXT:    mov v0.s[3], w8
-; CHECK-GI-NEXT:    mov v1.s[3], w8
 ; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -341,7 +337,6 @@ define <3 x i32> @zext_v3i16_v3i32(<3 x i16> %a) {
 ; CHECK-GI-NEXT:    umov w8, v0.h[2]
 ; CHECK-GI-NEXT:    mov v1.s[1], w9
 ; CHECK-GI-NEXT:    mov v1.s[2], w8
-; CHECK-GI-NEXT:    mov v1.s[3], w8
 ; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -420,10 +415,8 @@ define <3 x i16> @zext_v3i10_v3i16(<3 x i10> %a) {
 ; CHECK-GI-NEXT:    mov v2.16b, v1.16b
 ; CHECK-GI-NEXT:    mov v0.s[2], w2
 ; CHECK-GI-NEXT:    mov v2.h[1], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.s[3], w8
-; CHECK-GI-NEXT:    mov v2.h[2], v1.h[0]
 ; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
-; CHECK-GI-NEXT:    mov v2.h[3], v0.h[0]
+; CHECK-GI-NEXT:    mov v2.h[2], v1.h[0]
 ; CHECK-GI-NEXT:    and v0.8b, v0.8b, v2.8b
 ; CHECK-GI-NEXT:    ret
 entry:
@@ -451,8 +444,6 @@ define <3 x i32> @zext_v3i10_v3i32(<3 x i10> %a) {
 ; CHECK-GI-NEXT:    mov v1.s[1], w8
 ; CHECK-GI-NEXT:    mov v0.s[2], w2
 ; CHECK-GI-NEXT:    mov v1.s[2], w8
-; CHECK-GI-NEXT:    mov v0.s[3], w8
-; CHECK-GI-NEXT:    mov v1.s[3], w8
 ; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-GI-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm70.ll b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll
new file mode 100644
index 0000000000000..f68bb2049d006
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll
@@ -0,0 +1,121 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx63 | FileCheck %s --check-prefixes=CHECK
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | FileCheck %s --check-prefixes=CHECK64
+; RUN: llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx62 | FileCheck %s --check-prefixes=CHECKPTX62
+; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %}
+; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx62 | %ptxas-verify -arch=sm_70 %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half %val) {
+; CHECK-LABEL: test(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [test_param_0];
+; CHECK-NEXT:    ld.param.b16 %rs1, [test_param_3];
+; CHECK-NEXT:    atom.add.noftz.f16 %rs2, [%r1], %rs1;
+; CHECK-NEXT:    ld.param.u32 %r2, [test_param_1];
+; CHECK-NEXT:    atom.global.add.noftz.f16 %rs3, [%r2], %rs1;
+; CHECK-NEXT:    ld.param.u32 %r3, [test_param_2];
+; CHECK-NEXT:    atom.shared.add.noftz.f16 %rs4, [%r3], %rs1;
+; CHECK-NEXT:    ret;
+;
+; CHECK64-LABEL: test(
+; CHECK64:       {
+; CHECK64-NEXT:    .reg .b16 %rs<5>;
+; CHECK64-NEXT:    .reg .b64 %rd<4>;
+; CHECK64-EMPTY:
+; CHECK64-NEXT:  // %bb.0:
+; CHECK64-NEXT:    ld.param.u64 %rd1, [test_param_0];
+; CHECK64-NEXT:    ld.param.b16 %rs1, [test_param_3];
+; CHECK64-NEXT:    atom.add.noftz.f16 %rs2, [%rd1], %rs1;
+; CHECK64-NEXT:    ld.param.u64 %rd2, [test_param_1];
+; CHECK64-NEXT:    atom.global.add.noftz.f16 %rs3, [%rd2], %rs1;
+; CHECK64-NEXT:    ld.param.u64 %rd3, [test_param_2];
+; CHECK64-NEXT:    atom.shared.add.noftz.f16 %rs4, [%rd3], %rs1;
+; CHECK64-NEXT:    ret;
+;
+; CHECKPTX62-LABEL: test(
+; CHECKPTX62:       {
+; CHECKPTX62-NEXT:    .reg .pred %p<4>;
+; CHECKPTX62-NEXT:    .reg .b16 %rs<14>;
+; CHECKPTX62-NEXT:    .reg .b32 %r<49>;
+; CHECKPTX62-EMPTY:
+; CHECKPTX62-NEXT:  // %bb.0:
+; CHECKPTX62-NEXT:    ld.param.b16 %rs1, [test_param_3];
+; CHECKPTX62-NEXT:    ld.param.u32 %r20, [test_param_2];
+; CHECKPTX62-NEXT:    ld.param.u32 %r19, [test_param_1];
+; CHECKPTX62-NEXT:    ld.param.u32 %r21, [test_param_0];
+; CHECKPTX62-NEXT:    and.b32 %r1, %r21, -4;
+; CHECKPTX62-NEXT:    and.b32 %r22, %r21, 3;
+; CHECKPTX62-NEXT:    shl.b32 %r2, %r22, 3;
+; CHECKPTX62-NEXT:    mov.b32 %r23, 65535;
+; CHECKPTX62-NEXT:    shl.b32 %r24, %r23, %r2;
+; CHECKPTX62-NEXT:    not.b32 %r3, %r24;
+; CHECKPTX62-NEXT:    ld.u32 %r46, [%r1];
+; CHECKPTX62-NEXT:  $L__BB0_1: // %atomicrmw.start
+; CHECKPTX62-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECKPTX62-NEXT:    shr.u32 %r25, %r46, %r2;
+; CHECKPTX62-NEXT:    cvt.u16.u32 %rs2, %r25;
+; CHECKPTX62-NEXT:    add.rn.f16 %rs4, %rs2, %rs1;
+; CHECKPTX62-NEXT:    cvt.u32.u16 %r26, %rs4;
+; CHECKPTX62-NEXT:    shl.b32 %r27, %r26, %r2;
+; CHECKPTX62-NEXT:    and.b32 %r28, %r46, %r3;
+; CHECKPTX62-NEXT:    or.b32 %r29, %r28, %r27;
+; CHECKPTX62-NEXT:    atom.cas.b32 %r6, [%r1], %r46, %r29;
+; CHECKPTX62-NEXT:    setp.ne.s32 %p1, %r6, %r46;
+; CHECKPTX62-NEXT:    mov.u32 %r46, %r6;
+; CHECKPTX62-NEXT:    @%p1 bra $L__BB0_1;
+; CHECKPTX62-NEXT:  // %bb.2: // %atomicrmw.end
+; CHECKPTX62-NEXT:    and.b32 %r7, %r19, -4;
+; CHECKPTX62-NEXT:    shl.b32 %r30, %r19, 3;
+; CHECKPTX62-NEXT:    and.b32 %r8, %r30, 24;
+; CHECKPTX62-NEXT:    shl.b32 %r32, %r23, %r8;
+; CHECKPTX62-NEXT:    not.b32 %r9, %r32;
+; CHECKPTX62-NEXT:    ld.global.u32 %r47, [%r7];
+; CHECKPTX62-NEXT:  $L__BB0_3: // %atomicrmw.start9
+; CHECKPTX62-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECKPTX62-NEXT:    shr.u32 %r33, %r47, %r8;
+; CHECKPTX62-NEXT:    cvt.u16.u32 %rs6, %r33;
+; CHECKPTX62-NEXT:    add.rn.f16 %rs8, %rs6, %rs1;
+; CHECKPTX62-NEXT:    cvt.u32.u16 %r34, %rs8;
+; CHECKPTX62-NEXT:    shl.b32 %r35, %r34, %r8;
+; CHECKPTX62-NEXT:    and.b32 %r36, %r47, %r9;
+; CHECKPTX62-NEXT:    or.b32 %r37, %r36, %r35;
+; CHECKPTX62-NEXT:    atom.global.cas.b32 %r12, [%r7], %r47, %r37;
+; CHECKPTX62-NEXT:    setp.ne.s32 %p2, %r12, %r47;
+; CHECKPTX62-NEXT:    mov.u32 %r47, %r12;
+; CHECKPTX62-NEXT:    @%p2 bra $L__BB0_3;
+; CHECKPTX62-NEXT:  // %bb.4: // %atomicrmw.end8
+; CHECKPTX62-NEXT:    and.b32 %r13, %r20, -4;
+; CHECKPTX62-NEXT:    shl.b32 %r38, %r20, 3;
+; CHECKPTX62-NEXT:    and.b32 %r14, %r38, 24;
+; CHECKPTX62-NEXT:    shl.b32 %r40, %r23, %r14;
+; CHECKPTX62-NEXT:    not.b32 %r15, %r40;
+; CHECKPTX62-NEXT:    ld.shared.u32 %r48, [%r13];
+; CHECKPTX62-NEXT:  $L__BB0_5: // %atomicrmw.start27
+; CHECKPTX62-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECKPTX62-NEXT:    shr.u32 %r41, %r48, %r14;
+; CHECKPTX62-NEXT:    cvt.u16.u32 %rs10, %r41;
+; CHECKPTX62-NEXT:    add.rn.f16 %rs12, %rs10, %rs1;
+; CHECKPTX62-NEXT:    cvt.u32.u16 %r42, %rs12;
+; CHECKPTX62-NEXT:    shl.b32 %r43, %r42, %r14;
+; CHECKPTX62-NEXT:    and.b32 %r44, %r48, %r15;
+; CHECKPTX62-NEXT:    or.b32 %r45, %r44, %r43;
+; CHECKPTX62-NEXT:    atom.shared.cas.b32 %r18, [%r13], %r48, %r45;
+; CHECKPTX62-NEXT:    setp.ne.s32 %p3, %r18, %r48;
+; CHECKPTX62-NEXT:    mov.u32 %r48, %r18;
+; CHECKPTX62-NEXT:    @%p3 bra $L__BB0_5;
+; CHECKPTX62-NEXT:  // %bb.6: // %atomicrmw.end26
+; CHECKPTX62-NEXT:    ret;
+  %r1 = atomicrmw fadd ptr %dp0, half %val seq_cst
+  %r2 = atomicrmw fadd ptr addrspace(1) %dp1, half %val seq_cst
+  %ret = atomicrmw fadd ptr addrspace(3) %dp3, half %val seq_cst
+  ret void
+}
+
+attributes #1 = { argmemonly nounwind }
diff --git a/llvm/test/CodeGen/NVPTX/atomics.ll b/llvm/test/CodeGen/NVPTX/atomics.ll
index e99d0fd05e346..6f2b5dcf47f13 100644
--- a/llvm/test/CodeGen/NVPTX/atomics.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics.ll
@@ -175,6 +175,13 @@ define float @atomicrmw_add_f32_generic(ptr %addr, float %val) {
   ret float %ret
 }
 
+; CHECK-LABEL: atomicrmw_add_f16_generic
+define half @atomicrmw_add_f16_generic(ptr %addr, half %val) {
+; CHECK: atom.cas
+  %ret = atomicrmw fadd ptr %addr, half %val seq_cst
+  ret half %ret
+}
+
 ; CHECK-LABEL: atomicrmw_add_f32_addrspace1
 define float @atomicrmw_add_f32_addrspace1(ptr addrspace(1) %addr, float %val) {
 ; CHECK: atom.global.add.f32
diff --git a/llvm/test/CodeGen/RISCV/typepromotion-overflow.ll b/llvm/test/CodeGen/RISCV/typepromotion-overflow.ll
index 3740dc675949f..ec7e0ecce80ca 100644
--- a/llvm/test/CodeGen/RISCV/typepromotion-overflow.ll
+++ b/llvm/test/CodeGen/RISCV/typepromotion-overflow.ll
@@ -283,9 +283,8 @@ define i32 @safe_sub_var_imm(ptr nocapture readonly %b) local_unnamed_addr #1 {
 ; CHECK-LABEL: safe_sub_var_imm:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    lbu a0, 0(a0)
-; CHECK-NEXT:    addi a0, a0, 8
-; CHECK-NEXT:    andi a0, a0, 255
-; CHECK-NEXT:    sltiu a0, a0, 253
+; CHECK-NEXT:    addi a0, a0, -248
+; CHECK-NEXT:    sltiu a0, a0, -3
 ; CHECK-NEXT:    xori a0, a0, 1
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/X86/bypass-slow-division-tune.ll b/llvm/test/CodeGen/X86/bypass-slow-division-tune.ll
index 8369a44dcbad2..afecf00113a0a 100644
--- a/llvm/test/CodeGen/X86/bypass-slow-division-tune.ll
+++ b/llvm/test/CodeGen/X86/bypass-slow-division-tune.ll
@@ -4,6 +4,8 @@
 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=x86-64     < %s | FileCheck -check-prefixes=CHECK,REST,X64 %s
 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=silvermont < %s | FileCheck -check-prefixes=CHECK,REST,SLM %s
 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake    < %s | FileCheck -check-prefixes=CHECK,REST,SKL %s
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=goldmont   < %s | FileCheck -check-prefixes=CHECK,REST,GMT %s
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=gracemont  < %s | FileCheck -check-prefixes=CHECK,REST,GMT %s
 ; RUN: llc -profile-summary-huge-working-set-size-threshold=1 -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake    < %s | FileCheck -check-prefixes=HUGEWS %s
 
 ; Verify that div32 is bypassed only for Atoms.
@@ -117,6 +119,13 @@ define i64 @div64(i64 %a, i64 %b) {
 ; SKL-NEXT:    # kill: def $eax killed $eax def $rax
 ; SKL-NEXT:    retq
 ;
+; GMT-LABEL: div64:
+; GMT:       # %bb.0: # %entry
+; GMT-NEXT:    movq %rdi, %rax
+; GMT-NEXT:    cqto
+; GMT-NEXT:    idivq %rsi
+; GMT-NEXT:    retq
+;
 ; HUGEWS-LABEL: div64:
 ; HUGEWS:       # %bb.0: # %entry
 ; HUGEWS-NEXT:    movq %rdi, %rax
@@ -240,6 +249,13 @@ define i64 @div64_hugews(i64 %a, i64 %b) {
 ; SKL-NEXT:    # kill: def $eax killed $eax def $rax
 ; SKL-NEXT:    retq
 ;
+; GMT-LABEL: div64_hugews:
+; GMT:       # %bb.0:
+; GMT-NEXT:    movq %rdi, %rax
+; GMT-NEXT:    cqto
+; GMT-NEXT:    idivq %rsi
+; GMT-NEXT:    retq
+;
 ; HUGEWS-LABEL: div64_hugews:
 ; HUGEWS:       # %bb.0:
 ; HUGEWS-NEXT:    movq %rdi, %rax
diff --git a/llvm/test/CodeGen/X86/combine-shl.ll b/llvm/test/CodeGen/X86/combine-shl.ll
index b485a9b10f26c..5472e1e6c0833 100644
--- a/llvm/test/CodeGen/X86/combine-shl.ll
+++ b/llvm/test/CodeGen/X86/combine-shl.ll
@@ -1,9 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX-FAST-ALL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX-FAST-PERLANE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST-ALL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST-PERLANE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
 
 ; fold (shl 0, x) -> 0
 define <4 x i32> @combine_vec_shl_zero(<4 x i32> %x) {
@@ -137,32 +138,40 @@ define <4 x i32> @combine_vec_shl_trunc_and(<4 x i32> %x, <4 x i64> %y) {
 ; SSE41-NEXT:    pmulld %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX-SLOW-LABEL: combine_vec_shl_trunc_and:
-; AVX-SLOW:       # %bb.0:
-; AVX-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX-SLOW-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX-SLOW-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
-; AVX-SLOW-NEXT:    vzeroupper
-; AVX-SLOW-NEXT:    retq
-;
-; AVX-FAST-ALL-LABEL: combine_vec_shl_trunc_and:
-; AVX-FAST-ALL:       # %bb.0:
-; AVX-FAST-ALL-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,2,4,6,0,0,0,0]
-; AVX-FAST-ALL-NEXT:    vpermd %ymm1, %ymm2, %ymm1
-; AVX-FAST-ALL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX-FAST-ALL-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
-; AVX-FAST-ALL-NEXT:    vzeroupper
-; AVX-FAST-ALL-NEXT:    retq
-;
-; AVX-FAST-PERLANE-LABEL: combine_vec_shl_trunc_and:
-; AVX-FAST-PERLANE:       # %bb.0:
-; AVX-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX-FAST-PERLANE-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX-FAST-PERLANE-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
-; AVX-FAST-PERLANE-NEXT:    vzeroupper
-; AVX-FAST-PERLANE-NEXT:    retq
+; AVX2-SLOW-LABEL: combine_vec_shl_trunc_and:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-SLOW-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-ALL-LABEL: combine_vec_shl_trunc_and:
+; AVX2-FAST-ALL:       # %bb.0:
+; AVX2-FAST-ALL-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,2,4,6,0,0,0,0]
+; AVX2-FAST-ALL-NEXT:    vpermd %ymm1, %ymm2, %ymm1
+; AVX2-FAST-ALL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-FAST-ALL-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
+; AVX2-FAST-ALL-NEXT:    vzeroupper
+; AVX2-FAST-ALL-NEXT:    retq
+;
+; AVX2-FAST-PERLANE-LABEL: combine_vec_shl_trunc_and:
+; AVX2-FAST-PERLANE:       # %bb.0:
+; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-FAST-PERLANE-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-FAST-PERLANE-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
+; AVX2-FAST-PERLANE-NEXT:    vzeroupper
+; AVX2-FAST-PERLANE-NEXT:    retq
+;
+; AVX512-LABEL: combine_vec_shl_trunc_and:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovqd %ymm1, %xmm1
+; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %1 = and <4 x i64> %y, <i64 15, i64 255, i64 4095, i64 65535>
   %2 = trunc <4 x i64> %1 to <4 x i32>
   %3 = shl <4 x i32> %x, %2
@@ -353,11 +362,17 @@ define <8 x i32> @combine_vec_shl_zext_lshr0(<8 x i16> %x) {
 ; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: combine_vec_shl_zext_lshr0:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX-NEXT:    retq
+; AVX2-LABEL: combine_vec_shl_zext_lshr0:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: combine_vec_shl_zext_lshr0:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT:    retq
   %1 = lshr <8 x i16> %x, <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>
   %2 = zext <8 x i16> %1 to <8 x i32>
   %3 = shl <8 x i32> %2, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
@@ -504,12 +519,18 @@ define <4 x i32> @combine_vec_shl_gt_lshr0(<4 x i32> %x) {
 ; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: combine_vec_shl_gt_lshr0:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpslld $2, %xmm0, %xmm0
-; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4294967264,4294967264,4294967264,4294967264]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX2-LABEL: combine_vec_shl_gt_lshr0:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpslld $2, %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4294967264,4294967264,4294967264,4294967264]
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: combine_vec_shl_gt_lshr0:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpslld $2, %xmm0, %xmm0
+; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT:    retq
   %1 = lshr <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
   %2 = shl <4 x i32> %1, <i32 5, i32 5, i32 5, i32 5>
   ret <4 x i32> %2
@@ -540,12 +561,18 @@ define <4 x i32> @combine_vec_shl_le_lshr0(<4 x i32> %x) {
 ; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: combine_vec_shl_le_lshr0:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpsrld $2, %xmm0, %xmm0
-; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1073741816,1073741816,1073741816,1073741816]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX2-LABEL: combine_vec_shl_le_lshr0:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrld $2, %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1073741816,1073741816,1073741816,1073741816]
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: combine_vec_shl_le_lshr0:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrld $2, %xmm0, %xmm0
+; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT:    retq
   %1 = lshr <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
   %2 = shl <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
   ret <4 x i32> %2
@@ -587,11 +614,16 @@ define <4 x i32> @combine_vec_shl_ashr0(<4 x i32> %x) {
 ; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: combine_vec_shl_ashr0:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [4294967264,4294967264,4294967264,4294967264]
-; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX2-LABEL: combine_vec_shl_ashr0:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [4294967264,4294967264,4294967264,4294967264]
+; AVX2-NEXT:    vandps %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: combine_vec_shl_ashr0:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT:    retq
   %1 = ashr <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
   %2 = shl <4 x i32> %1, <i32 5, i32 5, i32 5, i32 5>
   ret <4 x i32> %2
@@ -620,12 +652,18 @@ define <4 x i32> @combine_vec_shl_add0(<4 x i32> %x) {
 ; SSE-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: combine_vec_shl_add0:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpslld $2, %xmm0, %xmm0
-; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20]
-; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX2-LABEL: combine_vec_shl_add0:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpslld $2, %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20]
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: combine_vec_shl_add0:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpslld $2, %xmm0, %xmm0
+; AVX512-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT:    retq
   %1 = add <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
   %2 = shl <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>
   ret <4 x i32> %2
@@ -667,12 +705,18 @@ define <4 x i32> @combine_vec_shl_or0(<4 x i32> %x) {
 ; SSE-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: combine_vec_shl_or0:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpslld $2, %xmm0, %xmm0
-; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20]
-; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX2-LABEL: combine_vec_shl_or0:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpslld $2, %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20]
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: combine_vec_shl_or0:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpslld $2, %xmm0, %xmm0
+; AVX512-NEXT:    vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT:    retq
   %1 = or  <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
   %2 = shl <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>
   ret <4 x i32> %2
@@ -724,11 +768,16 @@ define <4 x i32> @combine_vec_shl_mul0(<4 x i32> %x) {
 ; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: combine_vec_shl_mul0:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20]
-; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX2-LABEL: combine_vec_shl_mul0:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20]
+; AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: combine_vec_shl_mul0:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT:    retq
   %1 = mul <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
   %2 = shl <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>
   ret <4 x i32> %2
@@ -778,12 +827,18 @@ define <4 x i32> @combine_vec_add_shl_nonsplat(<4 x i32> %a0)  {
 ; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: combine_vec_add_shl_nonsplat:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3]
-; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX2-LABEL: combine_vec_add_shl_nonsplat:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3]
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: combine_vec_add_shl_nonsplat:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT:    retq
   %1 = shl <4 x i32> %a0, <i32 2, i32 3, i32 4, i32 5>
   %2 = add <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
   ret <4 x i32> %2
@@ -812,14 +867,22 @@ define <4 x i32> @combine_vec_add_shl_and_nonsplat(<4 x i32> %a0)  {
 ; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: combine_vec_add_shl_and_nonsplat:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
-; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15]
-; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX2-LABEL: combine_vec_add_shl_and_nonsplat:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
+; AVX2-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15]
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: combine_vec_add_shl_and_nonsplat:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
+; AVX512-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT:    retq
   %1 = and <4 x i32> %a0, <i32 4294901760, i32 4294901760, i32 4294901760, i32 4294901760>
   %2 = shl <4 x i32> %1, <i32 2, i32 3, i32 4, i32 5>
   %3 = add <4 x i32> %2, <i32 15, i32 15, i32 15, i32 15>
@@ -847,13 +910,20 @@ define <4 x i32> @combine_vec_add_shuffle_shl(<4 x i32> %a0)  {
 ; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: combine_vec_add_shuffle_shl:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
-; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3]
-; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX2-LABEL: combine_vec_add_shuffle_shl:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
+; AVX2-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3]
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: combine_vec_add_shuffle_shl:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
+; AVX512-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vpord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT:    retq
   %1 = shl <4 x i32> %a0, <i32 2, i32 3, i32 0, i32 1>
   %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
   %3 = add <4 x i32> %2, <i32 3, i32 3, i32 3, i32 3>
diff --git a/llvm/test/CodeGen/X86/combine-sra.ll b/llvm/test/CodeGen/X86/combine-sra.ll
index cc0ed2b8268c6..0675ced68d7a7 100644
--- a/llvm/test/CodeGen/X86/combine-sra.ll
+++ b/llvm/test/CodeGen/X86/combine-sra.ll
@@ -1,8 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2-FAST-ALL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2-FAST-PERLANE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST-ALL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST-PERLANE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
 
 ; fold (sra 0, x) -> 0
 define <4 x i32> @combine_vec_ashr_zero(<4 x i32> %x) {
@@ -85,19 +87,33 @@ define <4 x i32> @combine_vec_ashr_ashr0(<4 x i32> %x) {
 }
 
 define <4 x i32> @combine_vec_ashr_ashr1(<4 x i32> %x) {
-; SSE-LABEL: combine_vec_ashr_ashr1:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    psrad $10, %xmm1
-; SSE-NEXT:    movdqa %xmm0, %xmm2
-; SSE-NEXT:    psrad $6, %xmm2
-; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    psrad $8, %xmm1
-; SSE-NEXT:    psrad $4, %xmm0
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; SSE-NEXT:    retq
+; SSE2-LABEL: combine_vec_ashr_ashr1:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrad $10, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psrad $8, %xmm2
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrad $6, %xmm1
+; SSE2-NEXT:    psrad $4, %xmm0
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: combine_vec_ashr_ashr1:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrad $10, %xmm1
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psrad $6, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrad $8, %xmm1
+; SSE41-NEXT:    psrad $4, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_ashr_ashr1:
 ; AVX:       # %bb.0:
@@ -124,16 +140,30 @@ define <4 x i32> @combine_vec_ashr_ashr2(<4 x i32> %x) {
 }
 
 define <4 x i32> @combine_vec_ashr_ashr3(<4 x i32> %x) {
-; SSE-LABEL: combine_vec_ashr_ashr3:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    psrad $27, %xmm1
-; SSE-NEXT:    movdqa %xmm0, %xmm2
-; SSE-NEXT:    psrad $15, %xmm2
-; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; SSE-NEXT:    psrad $31, %xmm0
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; SSE-NEXT:    retq
+; SSE2-LABEL: combine_vec_ashr_ashr3:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psrad $27, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
+; SSE2-NEXT:    psrad $15, %xmm0
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: combine_vec_ashr_ashr3:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrad $27, %xmm1
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psrad $15, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    psrad $31, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_ashr_ashr3:
 ; AVX:       # %bb.0:
@@ -146,26 +176,48 @@ define <4 x i32> @combine_vec_ashr_ashr3(<4 x i32> %x) {
 
 ; fold (sra x, (trunc (and y, c))) -> (sra x, (and (trunc y), (trunc c))).
 define <4 x i32> @combine_vec_ashr_trunc_and(<4 x i32> %x, <4 x i64> %y) {
-; SSE-LABEL: combine_vec_ashr_trunc_and:
-; SSE:       # %bb.0:
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
-; SSE-NEXT:    movdqa %xmm0, %xmm3
-; SSE-NEXT:    psrad %xmm2, %xmm3
-; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
-; SSE-NEXT:    movdqa %xmm0, %xmm5
-; SSE-NEXT:    psrad %xmm4, %xmm5
-; SSE-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
-; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
-; SSE-NEXT:    movdqa %xmm0, %xmm3
-; SSE-NEXT:    psrad %xmm1, %xmm3
-; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
-; SSE-NEXT:    psrad %xmm1, %xmm0
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
-; SSE-NEXT:    retq
+; SSE2-LABEL: combine_vec_ashr_trunc_and:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; SSE2-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    psrad %xmm2, %xmm3
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psrad %xmm4, %xmm2
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    psrad %xmm3, %xmm4
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; SSE2-NEXT:    psrad %xmm1, %xmm0
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: combine_vec_ashr_trunc_and:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; SSE41-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    psrad %xmm2, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm5
+; SSE41-NEXT:    psrad %xmm4, %xmm5
+; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    psrad %xmm1, %xmm3
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
+; SSE41-NEXT:    psrad %xmm1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
+; SSE41-NEXT:    retq
 ;
 ; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_and:
 ; AVX2-SLOW:       # %bb.0:
@@ -193,6 +245,14 @@ define <4 x i32> @combine_vec_ashr_trunc_and(<4 x i32> %x, <4 x i64> %y) {
 ; AVX2-FAST-PERLANE-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
 ; AVX2-FAST-PERLANE-NEXT:    vzeroupper
 ; AVX2-FAST-PERLANE-NEXT:    retq
+;
+; AVX512-LABEL: combine_vec_ashr_trunc_and:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovqd %ymm1, %xmm1
+; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %1 = and <4 x i64> %y, <i64 15, i64 255, i64 4095, i64 65535>
   %2 = trunc <4 x i64> %1 to <4 x i32>
   %3 = ashr <4 x i32> %x, %2
@@ -202,17 +262,31 @@ define <4 x i32> @combine_vec_ashr_trunc_and(<4 x i32> %x, <4 x i64> %y) {
 ; fold (sra (trunc (srl x, c1)), c2) -> (trunc (sra x, c1 + c2))
 ;      if c1 is equal to the number of bits the trunc removes
 define <4 x i32> @combine_vec_ashr_trunc_lshr(<4 x i64> %x) {
-; SSE-LABEL: combine_vec_ashr_trunc_lshr:
-; SSE:       # %bb.0:
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; SSE-NEXT:    movaps %xmm0, %xmm2
-; SSE-NEXT:    psrad $2, %xmm2
-; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7]
-; SSE-NEXT:    psrad $1, %xmm0
-; SSE-NEXT:    psrad $3, %xmm1
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
-; SSE-NEXT:    retq
+; SSE2-LABEL: combine_vec_ashr_trunc_lshr:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    psrad $3, %xmm1
+; SSE2-NEXT:    movaps %xmm0, %xmm2
+; SSE2-NEXT:    psrad $2, %xmm2
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    psrad $1, %xmm1
+; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: combine_vec_ashr_trunc_lshr:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; SSE41-NEXT:    movaps %xmm0, %xmm2
+; SSE41-NEXT:    psrad $2, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT:    psrad $1, %xmm0
+; SSE41-NEXT:    psrad $3, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
+; SSE41-NEXT:    retq
 ;
 ; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_lshr:
 ; AVX2-SLOW:       # %bb.0:
@@ -237,6 +311,14 @@ define <4 x i32> @combine_vec_ashr_trunc_lshr(<4 x i64> %x) {
 ; AVX2-FAST-PERLANE-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX2-FAST-PERLANE-NEXT:    vzeroupper
 ; AVX2-FAST-PERLANE-NEXT:    retq
+;
+; AVX512-LABEL: combine_vec_ashr_trunc_lshr:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlq $32, %ymm0, %ymm0
+; AVX512-NEXT:    vpmovqd %ymm0, %xmm0
+; AVX512-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %1 = lshr <4 x i64> %x, <i64 32, i64 32, i64 32, i64 32>
   %2 = trunc <4 x i64> %1 to <4 x i32>
   %3 = ashr <4 x i32> %2, <i32 0, i32 1, i32 2, i32 3>
@@ -255,16 +337,23 @@ define <16 x i8> @combine_vec_ashr_trunc_lshr_splat(<16 x i32> %x) {
 ; SSE-NEXT:    packsswb %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: combine_vec_ashr_trunc_lshr_splat:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpsrad $26, %ymm1, %ymm1
-; AVX-NEXT:    vpsrad $26, %ymm0, %ymm0
-; AVX-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retq
+; AVX2-LABEL: combine_vec_ashr_trunc_lshr_splat:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrad $26, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrad $26, %ymm0, %ymm0
+; AVX2-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: combine_vec_ashr_trunc_lshr_splat:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrad $26, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %1 = lshr <16 x i32> %x, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
   %2 = trunc <16 x i32> %1 to <16 x i8>
   %3 = ashr <16 x i8> %2, <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
@@ -274,17 +363,31 @@ define <16 x i8> @combine_vec_ashr_trunc_lshr_splat(<16 x i32> %x) {
 ; fold (sra (trunc (sra x, c1)), c2) -> (trunc (sra x, c1 + c2))
 ;      if c1 is equal to the number of bits the trunc removes
 define <4 x i32> @combine_vec_ashr_trunc_ashr(<4 x i64> %x) {
-; SSE-LABEL: combine_vec_ashr_trunc_ashr:
-; SSE:       # %bb.0:
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; SSE-NEXT:    movaps %xmm0, %xmm2
-; SSE-NEXT:    psrad $2, %xmm2
-; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7]
-; SSE-NEXT:    psrad $1, %xmm0
-; SSE-NEXT:    psrad $3, %xmm1
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
-; SSE-NEXT:    retq
+; SSE2-LABEL: combine_vec_ashr_trunc_ashr:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    psrad $3, %xmm1
+; SSE2-NEXT:    movaps %xmm0, %xmm2
+; SSE2-NEXT:    psrad $2, %xmm2
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    psrad $1, %xmm1
+; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: combine_vec_ashr_trunc_ashr:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; SSE41-NEXT:    movaps %xmm0, %xmm2
+; SSE41-NEXT:    psrad $2, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT:    psrad $1, %xmm0
+; SSE41-NEXT:    psrad $3, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
+; SSE41-NEXT:    retq
 ;
 ; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_ashr:
 ; AVX2-SLOW:       # %bb.0:
@@ -309,6 +412,14 @@ define <4 x i32> @combine_vec_ashr_trunc_ashr(<4 x i64> %x) {
 ; AVX2-FAST-PERLANE-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX2-FAST-PERLANE-NEXT:    vzeroupper
 ; AVX2-FAST-PERLANE-NEXT:    retq
+;
+; AVX512-LABEL: combine_vec_ashr_trunc_ashr:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlq $32, %ymm0, %ymm0
+; AVX512-NEXT:    vpmovqd %ymm0, %xmm0
+; AVX512-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %1 = ashr <4 x i64> %x, <i64 32, i64 32, i64 32, i64 32>
   %2 = trunc <4 x i64> %1 to <4 x i32>
   %3 = ashr <4 x i32> %2, <i32 0, i32 1, i32 2, i32 3>
@@ -323,13 +434,20 @@ define <8 x i16> @combine_vec_ashr_trunc_ashr_splat(<8 x i32> %x) {
 ; SSE-NEXT:    packssdw %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: combine_vec_ashr_trunc_ashr_splat:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpsrad $19, %ymm0, %ymm0
-; AVX-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retq
+; AVX2-LABEL: combine_vec_ashr_trunc_ashr_splat:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrad $19, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: combine_vec_ashr_trunc_ashr_splat:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrad $19, %ymm0, %ymm0
+; AVX512-NEXT:    vpmovdw %ymm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %1 = ashr <8 x i32> %x, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
   %2 = trunc <8 x i32> %1 to <8 x i16>
   %3 = ashr <8 x i16> %2, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
@@ -338,25 +456,46 @@ define <8 x i16> @combine_vec_ashr_trunc_ashr_splat(<8 x i32> %x) {
 
 ; If the sign bit is known to be zero, switch this to a SRL.
 define <4 x i32> @combine_vec_ashr_positive(<4 x i32> %x, <4 x i32> %y) {
-; SSE-LABEL: combine_vec_ashr_positive:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
-; SSE-NEXT:    movdqa %xmm0, %xmm3
-; SSE-NEXT:    psrld %xmm2, %xmm3
-; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
-; SSE-NEXT:    movdqa %xmm0, %xmm5
-; SSE-NEXT:    psrld %xmm4, %xmm5
-; SSE-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
-; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
-; SSE-NEXT:    movdqa %xmm0, %xmm3
-; SSE-NEXT:    psrld %xmm1, %xmm3
-; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
-; SSE-NEXT:    psrld %xmm1, %xmm0
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
-; SSE-NEXT:    retq
+; SSE2-LABEL: combine_vec_ashr_positive:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    psrld %xmm2, %xmm3
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psrld %xmm4, %xmm2
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    psrld %xmm3, %xmm4
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; SSE2-NEXT:    psrld %xmm1, %xmm0
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: combine_vec_ashr_positive:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    psrld %xmm2, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm5
+; SSE41-NEXT:    psrld %xmm4, %xmm5
+; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    psrld %xmm1, %xmm3
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
+; SSE41-NEXT:    psrld %xmm1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_ashr_positive:
 ; AVX:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/combine-srl.ll b/llvm/test/CodeGen/X86/combine-srl.ll
index b38ab5d262814..33649e6d87b91 100644
--- a/llvm/test/CodeGen/X86/combine-srl.ll
+++ b/llvm/test/CodeGen/X86/combine-srl.ll
@@ -1,8 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2-FAST-ALL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2-FAST-PERLANE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST-ALL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST-PERLANE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
 
 ; fold (srl 0, x) -> 0
 define <4 x i32> @combine_vec_lshr_zero(<4 x i32> %x) {
@@ -101,19 +103,33 @@ define <4 x i32> @combine_vec_lshr_lshr0(<4 x i32> %x) {
 }
 
 define <4 x i32> @combine_vec_lshr_lshr1(<4 x i32> %x) {
-; SSE-LABEL: combine_vec_lshr_lshr1:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    psrld $10, %xmm1
-; SSE-NEXT:    movdqa %xmm0, %xmm2
-; SSE-NEXT:    psrld $6, %xmm2
-; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    psrld $8, %xmm1
-; SSE-NEXT:    psrld $4, %xmm0
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; SSE-NEXT:    retq
+; SSE2-LABEL: combine_vec_lshr_lshr1:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $10, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psrld $8, %xmm2
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $6, %xmm1
+; SSE2-NEXT:    psrld $4, %xmm0
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: combine_vec_lshr_lshr1:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrld $10, %xmm1
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psrld $6, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    psrld $8, %xmm1
+; SSE41-NEXT:    psrld $4, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_lshr_lshr1:
 ; AVX:       # %bb.0:
@@ -157,12 +173,19 @@ define <4 x i32> @combine_vec_lshr_lshr_zero1(<4 x i32> %x) {
 
 ; fold (srl (trunc (srl x, c1)), c2) -> (trunc (srl x, (add c1, c2)))
 define <4 x i32> @combine_vec_lshr_trunc_lshr0(<4 x i64> %x) {
-; SSE-LABEL: combine_vec_lshr_trunc_lshr0:
-; SSE:       # %bb.0:
-; SSE-NEXT:    psrlq $48, %xmm1
-; SSE-NEXT:    psrlq $48, %xmm0
-; SSE-NEXT:    packusdw %xmm1, %xmm0
-; SSE-NEXT:    retq
+; SSE2-LABEL: combine_vec_lshr_trunc_lshr0:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    psrlq $48, %xmm1
+; SSE2-NEXT:    psrlq $48, %xmm0
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: combine_vec_lshr_trunc_lshr0:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    psrlq $48, %xmm1
+; SSE41-NEXT:    psrlq $48, %xmm0
+; SSE41-NEXT:    packusdw %xmm1, %xmm0
+; SSE41-NEXT:    retq
 ;
 ; AVX2-SLOW-LABEL: combine_vec_lshr_trunc_lshr0:
 ; AVX2-SLOW:       # %bb.0:
@@ -188,6 +211,13 @@ define <4 x i32> @combine_vec_lshr_trunc_lshr0(<4 x i64> %x) {
 ; AVX2-FAST-PERLANE-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
 ; AVX2-FAST-PERLANE-NEXT:    vzeroupper
 ; AVX2-FAST-PERLANE-NEXT:    retq
+;
+; AVX512-LABEL: combine_vec_lshr_trunc_lshr0:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlq $48, %ymm0, %ymm0
+; AVX512-NEXT:    vpmovqd %ymm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %1 = lshr <4 x i64> %x, <i64 32, i64 32, i64 32, i64 32>
   %2 = trunc <4 x i64> %1 to <4 x i32>
   %3 = lshr <4 x i32> %2, <i32 16, i32 16, i32 16, i32 16>
@@ -195,27 +225,50 @@ define <4 x i32> @combine_vec_lshr_trunc_lshr0(<4 x i64> %x) {
 }
 
 define <4 x i32> @combine_vec_lshr_trunc_lshr1(<4 x i64> %x) {
-; SSE-LABEL: combine_vec_lshr_trunc_lshr1:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa %xmm1, %xmm2
-; SSE-NEXT:    psrlq $35, %xmm2
-; SSE-NEXT:    psrlq $34, %xmm1
-; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
-; SSE-NEXT:    movdqa %xmm0, %xmm2
-; SSE-NEXT:    psrlq $33, %xmm2
-; SSE-NEXT:    psrlq $32, %xmm0
-; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7]
-; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
-; SSE-NEXT:    movaps %xmm2, %xmm1
-; SSE-NEXT:    psrld $19, %xmm1
-; SSE-NEXT:    movaps %xmm2, %xmm3
-; SSE-NEXT:    psrld $17, %xmm3
-; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
-; SSE-NEXT:    psrld $18, %xmm2
-; SSE-NEXT:    psrld $16, %xmm0
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
-; SSE-NEXT:    retq
+; SSE2-LABEL: combine_vec_lshr_trunc_lshr1:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psrlq $34, %xmm2
+; SSE2-NEXT:    psrlq $35, %xmm1
+; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psrlq $32, %xmm2
+; SSE2-NEXT:    psrlq $33, %xmm0
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    psrld $19, %xmm1
+; SSE2-NEXT:    movaps %xmm0, %xmm3
+; SSE2-NEXT:    psrld $18, %xmm3
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm1[1]
+; SSE2-NEXT:    psrld $17, %xmm0
+; SSE2-NEXT:    psrld $16, %xmm2
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm3[0,3]
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: combine_vec_lshr_trunc_lshr1:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    psrlq $35, %xmm2
+; SSE41-NEXT:    psrlq $34, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    psrlq $33, %xmm2
+; SSE41-NEXT:    psrlq $32, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
+; SSE41-NEXT:    movaps %xmm2, %xmm1
+; SSE41-NEXT:    psrld $19, %xmm1
+; SSE41-NEXT:    movaps %xmm2, %xmm3
+; SSE41-NEXT:    psrld $17, %xmm3
+; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT:    psrld $18, %xmm2
+; SSE41-NEXT:    psrld $16, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
+; SSE41-NEXT:    retq
 ;
 ; AVX2-SLOW-LABEL: combine_vec_lshr_trunc_lshr1:
 ; AVX2-SLOW:       # %bb.0:
@@ -243,6 +296,14 @@ define <4 x i32> @combine_vec_lshr_trunc_lshr1(<4 x i64> %x) {
 ; AVX2-FAST-PERLANE-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX2-FAST-PERLANE-NEXT:    vzeroupper
 ; AVX2-FAST-PERLANE-NEXT:    retq
+;
+; AVX512-LABEL: combine_vec_lshr_trunc_lshr1:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT:    vpmovqd %ymm0, %xmm0
+; AVX512-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %1 = lshr <4 x i64> %x, <i64 32, i64 33, i64 34, i64 35>
   %2 = trunc <4 x i64> %1 to <4 x i32>
   %3 = lshr <4 x i32> %2, <i32 16, i32 17, i32 18, i32 19>
@@ -289,11 +350,16 @@ define <4 x i32> @combine_vec_lshr_shl_mask0(<4 x i32> %x) {
 ; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: combine_vec_lshr_shl_mask0:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1073741823,1073741823,1073741823,1073741823]
-; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX2-LABEL: combine_vec_lshr_shl_mask0:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1073741823,1073741823,1073741823,1073741823]
+; AVX2-NEXT:    vandps %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: combine_vec_lshr_shl_mask0:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT:    retq
   %1 =  shl <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
   %2 = lshr <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>
   ret <4 x i32> %2
@@ -338,12 +404,18 @@ define <4 x i32> @combine_vec_lshr_lzcnt_bit0(<4 x i32> %x) {
 ; SSE-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: combine_vec_lshr_lzcnt_bit0:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpsrld $4, %xmm0, %xmm0
-; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
-; AVX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX2-LABEL: combine_vec_lshr_lzcnt_bit0:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrld $4, %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
+; AVX2-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: combine_vec_lshr_lzcnt_bit0:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrld $4, %xmm0, %xmm0
+; AVX512-NEXT:    vpandnd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT:    retq
   %1 = and <4 x i32> %x, <i32 16, i32 16, i32 16, i32 16>
   %2 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %1, i1 0)
   %3 = lshr <4 x i32> %2, <i32 5, i32 5, i32 5, i32 5>
@@ -351,47 +423,98 @@ define <4 x i32> @combine_vec_lshr_lzcnt_bit0(<4 x i32> %x) {
 }
 
 define <4 x i32> @combine_vec_lshr_lzcnt_bit1(<4 x i32> %x) {
-; SSE-LABEL: combine_vec_lshr_lzcnt_bit1:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT:    movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; SSE-NEXT:    movdqa %xmm1, %xmm2
-; SSE-NEXT:    pshufb %xmm0, %xmm2
-; SSE-NEXT:    psrlw $4, %xmm0
-; SSE-NEXT:    pxor %xmm3, %xmm3
-; SSE-NEXT:    pshufb %xmm0, %xmm1
-; SSE-NEXT:    pcmpeqb %xmm3, %xmm0
-; SSE-NEXT:    pand %xmm2, %xmm0
-; SSE-NEXT:    paddb %xmm1, %xmm0
-; SSE-NEXT:    pmovzxbw {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    psrlw $8, %xmm0
-; SSE-NEXT:    paddw %xmm1, %xmm0
-; SSE-NEXT:    pblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
-; SSE-NEXT:    psrld $16, %xmm0
-; SSE-NEXT:    paddd %xmm3, %xmm0
-; SSE-NEXT:    psrld $5, %xmm0
-; SSE-NEXT:    retq
+; SSE2-LABEL: combine_vec_lshr_lzcnt_bit1:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $1, %xmm1
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $2, %xmm1
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $4, %xmm1
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $8, %xmm1
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $1, %xmm1
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT:    psubb %xmm1, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    psrlw $2, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    paddb %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $4, %xmm1
+; SSE2-NEXT:    paddb %xmm1, %xmm0
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    psadbw %xmm1, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    psrld $5, %xmm0
+; SSE2-NEXT:    retq
 ;
-; AVX-LABEL: combine_vec_lshr_lzcnt_bit1:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT:    vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
-; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX-NEXT:    vpcmpeqb %xmm3, %xmm0, %xmm4
-; AVX-NEXT:    vpand %xmm4, %xmm2, %xmm2
-; AVX-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
-; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
-; AVX-NEXT:    vpsrld $16, %xmm0, %xmm0
-; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpsrld $5, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSE41-LABEL: combine_vec_lshr_lzcnt_bit1:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE41-NEXT:    movdqa %xmm1, %xmm2
+; SSE41-NEXT:    pshufb %xmm0, %xmm2
+; SSE41-NEXT:    psrlw $4, %xmm0
+; SSE41-NEXT:    pxor %xmm3, %xmm3
+; SSE41-NEXT:    pshufb %xmm0, %xmm1
+; SSE41-NEXT:    pcmpeqb %xmm3, %xmm0
+; SSE41-NEXT:    pand %xmm2, %xmm0
+; SSE41-NEXT:    paddb %xmm1, %xmm0
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT:    pand %xmm0, %xmm1
+; SSE41-NEXT:    psrlw $8, %xmm0
+; SSE41-NEXT:    paddw %xmm1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
+; SSE41-NEXT:    psrld $16, %xmm0
+; SSE41-NEXT:    paddd %xmm3, %xmm0
+; SSE41-NEXT:    psrld $5, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX2-LABEL: combine_vec_lshr_lzcnt_bit1:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX2-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
+; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpcmpeqb %xmm3, %xmm0, %xmm4
+; AVX2-NEXT:    vpand %xmm4, %xmm2, %xmm2
+; AVX2-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
+; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrld $5, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: combine_vec_lshr_lzcnt_bit1:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vplzcntd %xmm0, %xmm0
+; AVX512-NEXT:    vpsrld $5, %xmm0, %xmm0
+; AVX512-NEXT:    retq
   %1 = and <4 x i32> %x, <i32 4, i32 32, i32 64, i32 128>
   %2 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %1, i1 0)
   %3 = lshr <4 x i32> %2, <i32 5, i32 5, i32 5, i32 5>
@@ -401,26 +524,48 @@ declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1)
 
 ; fold (srl x, (trunc (and y, c))) -> (srl x, (and (trunc y), (trunc c))).
 define <4 x i32> @combine_vec_lshr_trunc_and(<4 x i32> %x, <4 x i64> %y) {
-; SSE-LABEL: combine_vec_lshr_trunc_and:
-; SSE:       # %bb.0:
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
-; SSE-NEXT:    movdqa %xmm0, %xmm3
-; SSE-NEXT:    psrld %xmm2, %xmm3
-; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
-; SSE-NEXT:    movdqa %xmm0, %xmm5
-; SSE-NEXT:    psrld %xmm4, %xmm5
-; SSE-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
-; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
-; SSE-NEXT:    movdqa %xmm0, %xmm3
-; SSE-NEXT:    psrld %xmm1, %xmm3
-; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
-; SSE-NEXT:    psrld %xmm1, %xmm0
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
-; SSE-NEXT:    retq
+; SSE2-LABEL: combine_vec_lshr_trunc_and:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; SSE2-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    psrld %xmm2, %xmm3
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psrld %xmm4, %xmm2
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    psrld %xmm3, %xmm4
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; SSE2-NEXT:    psrld %xmm1, %xmm0
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
+; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: combine_vec_lshr_trunc_and:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; SSE41-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    psrld %xmm2, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm5
+; SSE41-NEXT:    psrld %xmm4, %xmm5
+; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    psrld %xmm1, %xmm3
+; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
+; SSE41-NEXT:    psrld %xmm1, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
+; SSE41-NEXT:    retq
 ;
 ; AVX2-SLOW-LABEL: combine_vec_lshr_trunc_and:
 ; AVX2-SLOW:       # %bb.0:
@@ -448,6 +593,14 @@ define <4 x i32> @combine_vec_lshr_trunc_and(<4 x i32> %x, <4 x i64> %y) {
 ; AVX2-FAST-PERLANE-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
 ; AVX2-FAST-PERLANE-NEXT:    vzeroupper
 ; AVX2-FAST-PERLANE-NEXT:    retq
+;
+; AVX512-LABEL: combine_vec_lshr_trunc_and:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovqd %ymm1, %xmm1
+; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %1 = and <4 x i64> %y, <i64 15, i64 255, i64 4095, i64 65535>
   %2 = trunc <4 x i64> %1 to <4 x i32>
   %3 = lshr <4 x i32> %x, %2
diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_mubuf.s b/llvm/test/MC/AMDGPU/gfx10_asm_mubuf.s
index 99c9c4aee4a76..b77f8e0a31927 100644
--- a/llvm/test/MC/AMDGPU/gfx10_asm_mubuf.s
+++ b/llvm/test/MC/AMDGPU/gfx10_asm_mubuf.s
@@ -5,6 +5,21 @@
 // ENC_MUBUF.
 //===----------------------------------------------------------------------===//
 
+buffer_load_format_d16_x v1, off, s[4:7], s1
+// GFX10: encoding: [0x00,0x00,0x00,0xe2,0x00,0x01,0x01,0x01]
+
+buffer_load_format_d16_xy v1, off, s[4:7], s1
+// GFX10: encoding: [0x00,0x00,0x04,0xe2,0x00,0x01,0x01,0x01]
+
+buffer_load_format_d16_xyz v[1:2], off, s[4:7], s1
+// GFX10: encoding: [0x00,0x00,0x08,0xe2,0x00,0x01,0x01,0x01]
+
+buffer_load_format_d16_xyzw v[1:2], off, s[4:7], s1
+// GFX10: encoding: [0x00,0x00,0x0c,0xe2,0x00,0x01,0x01,0x01]
+
+buffer_load_format_d16_hi_x v1, off, s[4:7], s1
+// GFX10: encoding: [0x00,0x00,0x98,0xe0,0x00,0x01,0x01,0x01]
+
 buffer_load_format_x v5, off, s[8:11], s3 offset:4095
 // GFX10: encoding: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x02,0x03]
 
@@ -221,6 +236,21 @@ buffer_load_format_xyzw v[5:8], off, s[8:11], s3 offset:4095 dlc
 buffer_load_format_xyzw v[5:8], off, s[8:11], s3 offset:4095 glc slc dlc
 // GFX10: encoding: [0xff,0xcf,0x0c,0xe0,0x00,0x05,0x42,0x03]
 
+buffer_store_format_d16_x v1, off, s[4:7], s1
+// GFX10: encoding: [0x00,0x00,0x10,0xe2,0x00,0x01,0x01,0x01]
+
+buffer_store_format_d16_xy v1, off, s[4:7], s1
+// GFX10: encoding: [0x00,0x00,0x14,0xe2,0x00,0x01,0x01,0x01]
+
+buffer_store_format_d16_xyz v[1:2], off, s[4:7], s1
+// GFX10: encoding: [0x00,0x00,0x18,0xe2,0x00,0x01,0x01,0x01]
+
+buffer_store_format_d16_xyzw v[1:2], off, s[4:7], s1
+// GFX10: encoding: [0x00,0x00,0x1c,0xe2,0x00,0x01,0x01,0x01]
+
+buffer_store_format_d16_hi_x v1, off, s[4:7], s1
+// GFX10: encoding: [0x00,0x00,0x9c,0xe0,0x00,0x01,0x01,0x01]
+
 buffer_store_format_x v1, off, s[12:15], s4 offset:4095
 // GFX10: encoding: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x03,0x04]
 
diff --git a/llvm/test/MC/AMDGPU/mtbuf-gfx10.s b/llvm/test/MC/AMDGPU/mtbuf-gfx10.s
index f235280874c4a..56add346bd21f 100644
--- a/llvm/test/MC/AMDGPU/mtbuf-gfx10.s
+++ b/llvm/test/MC/AMDGPU/mtbuf-gfx10.s
@@ -11,6 +11,9 @@ tbuffer_load_format_d16_x v0, off, s[0:3], format:22, 0
 // GFX10: tbuffer_load_format_d16_xy v0, off, s[0:3], 0 format:[BUF_FMT_32_FLOAT] ; encoding: [0x00,0x00,0xb1,0xe8,0x00,0x00,0x20,0x80]
 tbuffer_load_format_d16_xy v0, off, s[0:3], format:22, 0
 
+// GFX10: tbuffer_load_format_d16_xyz v[0:1], off, s[0:3], 0 format:[BUF_FMT_32_FLOAT] ; encoding: [0x00,0x00,0xb2,0xe8,0x00,0x00,0x20,0x80]
+tbuffer_load_format_d16_xyz v[0:1], off, s[0:3], format:22, 0
+
 // GFX10: tbuffer_load_format_d16_xyzw v[0:1], off, s[0:3], 0 format:[BUF_FMT_32_FLOAT] ; encoding: [0x00,0x00,0xb3,0xe8,0x00,0x00,0x20,0x80]
 tbuffer_load_format_d16_xyzw v[0:1], off, s[0:3], format:22, 0
 
@@ -62,6 +65,9 @@ tbuffer_store_format_d16_x v0, v1, s[4:7], format:33, 0 idxen
 // GFX10: tbuffer_store_format_d16_xy v0, v1, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen ; encoding: [0x00,0x20,0x0d,0xe9,0x01,0x00,0x21,0x80]
 tbuffer_store_format_d16_xy v0, v1, s[4:7], format:33, 0 idxen
 
+// GFX10: tbuffer_store_format_d16_xyz v[0:1], v2, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen ; encoding: [0x00,0x20,0x0e,0xe9,0x02,0x00,0x21,0x80]
+tbuffer_store_format_d16_xyz v[0:1], v2, s[4:7], format:33, 0 idxen
+
 // GFX10: tbuffer_store_format_d16_xyzw v[0:1], v2, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen ; encoding: [0x00,0x20,0x0f,0xe9,0x02,0x00,0x21,0x80]
 tbuffer_store_format_d16_xyzw v[0:1], v2, s[4:7], format:33, 0 idxen
 
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_mtbuf.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_mtbuf.txt
index 950ce783baba2..b6232e84549ba 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_mtbuf.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_mtbuf.txt
@@ -6,6 +6,9 @@
 # GFX10: tbuffer_load_format_d16_xy v0, off, s[0:3], 0 format:[BUF_FMT_32_FLOAT]
 0x00,0x00,0xb1,0xe8,0x00,0x00,0x20,0x80
 
+# GFX10: tbuffer_load_format_d16_xyz v[0:1], off, s[0:3], 0 format:[BUF_FMT_32_FLOAT]
+0x00,0x00,0xb2,0xe8,0x00,0x00,0x20,0x80
+
 # GFX10: tbuffer_load_format_d16_xyzw v[0:1], off, s[0:3], 0 format:[BUF_FMT_32_FLOAT]
 0x00,0x00,0xb3,0xe8,0x00,0x00,0x20,0x80
 
@@ -57,6 +60,9 @@
 # GFX10: tbuffer_store_format_d16_xy v0, v1, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen
 0x00,0x20,0x0d,0xe9,0x01,0x00,0x21,0x80
 
+# GFX10: tbuffer_store_format_d16_xyz v[0:1], v2, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen
+0x00,0x20,0x0e,0xe9,0x02,0x00,0x21,0x80
+
 # GFX10: tbuffer_store_format_d16_xyzw v[0:1], v2, s[4:7], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen
 0x00,0x20,0x0f,0xe9,0x02,0x00,0x21,0x80
 
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_mubuf.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_mubuf.txt
index 6fbe77e43ad42..849c89e37011f 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_mubuf.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_mubuf.txt
@@ -1316,6 +1316,21 @@
 # GFX10: buffer_load_dwordx4 v[5:8], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x38,0xe0,0x00,0x05,0x02,0x03]
 0xff,0x1f,0x38,0xe0,0x00,0x05,0x02,0x03
 
+# GFX10: buffer_load_format_d16_x v1, off, s[4:7], s1 ; encoding: [0x00,0x00,0x00,0xe2,0x00,0x01,0x01,0x01]
+0x00,0x00,0x00,0xe2,0x00,0x01,0x01,0x01
+
+# GFX10: buffer_load_format_d16_xy v1, off, s[4:7], s1 ; encoding: [0x00,0x00,0x04,0xe2,0x00,0x01,0x01,0x01]
+0x00,0x00,0x04,0xe2,0x00,0x01,0x01,0x01
+
+# GFX10: buffer_load_format_d16_xyz v[1:2], off, s[4:7], s1 ; encoding: [0x00,0x00,0x08,0xe2,0x00,0x01,0x01,0x01]
+0x00,0x00,0x08,0xe2,0x00,0x01,0x01,0x01
+
+# GFX10: buffer_load_format_d16_xyzw v[1:2], off, s[4:7], s1 ; encoding: [0x00,0x00,0x0c,0xe2,0x00,0x01,0x01,0x01]
+0x00,0x00,0x0c,0xe2,0x00,0x01,0x01,0x01
+
+# GFX10: buffer_load_format_d16_hi_x v1, off, s[4:7], s1 ; encoding: [0x00,0x00,0x98,0xe0,0x00,0x01,0x01,0x01]
+0x00,0x00,0x98,0xe0,0x00,0x01,0x01,0x01
+
 # GFX10: buffer_load_format_x v255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe0,0x00,0xff,0x02,0x03]
 0xff,0x0f,0x00,0xe0,0x00,0xff,0x02,0x03
 
@@ -2015,6 +2030,21 @@
 # GFX10: buffer_store_dwordx4 v[252:255], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x78,0xe0,0x00,0xfc,0x03,0x04]
 0xff,0x0f,0x78,0xe0,0x00,0xfc,0x03,0x04
 
+# GFX10: buffer_store_format_d16_x v1, off, s[4:7], s1 ; encoding: [0x00,0x00,0x10,0xe2,0x00,0x01,0x01,0x01]
+0x00,0x00,0x10,0xe2,0x00,0x01,0x01,0x01
+
+# GFX10: buffer_store_format_d16_xy v1, off, s[4:7], s1 ; encoding: [0x00,0x00,0x14,0xe2,0x00,0x01,0x01,0x01]
+0x00,0x00,0x14,0xe2,0x00,0x01,0x01,0x01
+
+# GFX10: buffer_store_format_d16_xyz v[1:2], off, s[4:7], s1 ; encoding: [0x00,0x00,0x18,0xe2,0x00,0x01,0x01,0x01]
+0x00,0x00,0x18,0xe2,0x00,0x01,0x01,0x01
+
+# GFX10: buffer_store_format_d16_xyzw v[1:2], off, s[4:7], s1 ; encoding: [0x00,0x00,0x1c,0xe2,0x00,0x01,0x01,0x01]
+0x00,0x00,0x1c,0xe2,0x00,0x01,0x01,0x01
+
+# GFX10: buffer_store_format_d16_hi_x v1, off, s[4:7], s1 ; encoding: [0x00,0x00,0x9c,0xe0,0x00,0x01,0x01,0x01]
+0x00,0x00,0x9c,0xe0,0x00,0x01,0x01,0x01
+
 # GFX10: buffer_store_format_x v1, off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x03,0xc1]
 0xff,0x0f,0x10,0xe0,0x00,0x01,0x03,0xc1
 
diff --git a/llvm/test/MC/Disassembler/X86/apx/ctest.txt b/llvm/test/MC/Disassembler/X86/apx/ctest.txt
new file mode 100644
index 0000000000000..9a29a98b5d788
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/apx/ctest.txt
@@ -0,0 +1,1026 @@
+# RUN: llvm-mc -triple x86_64 -disassemble %s | FileCheck %s --check-prefix=ATT
+# RUN: llvm-mc -triple x86_64 -disassemble -output-asm-variant=1 %s | FileCheck %s --check-prefix=INTEL
+
+# ATT:   ctestbb {dfv=of} $123, 123(%r8,%rax,4)
+# INTEL: ctestb {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+0x62,0xd4,0x44,0x02,0xf6,0x44,0x80,0x7b,0x7b
+
+# ATT:   ctestbw {dfv=of} $1234, 123(%r8,%rax,4)
+# INTEL: ctestb {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+0x62,0xd4,0x45,0x02,0xf7,0x44,0x80,0x7b,0xd2,0x04
+
+# ATT:   ctestbl {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctestb {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0x44,0x02,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestbq {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctestb {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0xc4,0x02,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestbb {dfv=of} %bl, 123(%r8,%rax,4)
+# INTEL: ctestb {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+0x62,0xd4,0x44,0x02,0x84,0x5c,0x80,0x7b
+
+# ATT:   ctestbw {dfv=of} %dx, 123(%r8,%rax,4)
+# INTEL: ctestb {dfv=of} word ptr [r8 + 4*rax + 123], dx
+0x62,0xd4,0x45,0x02,0x85,0x54,0x80,0x7b
+
+# ATT:   ctestbl {dfv=of} %ecx, 123(%r8,%rax,4)
+# INTEL: ctestb {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+0x62,0xd4,0x44,0x02,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestbq {dfv=of} %r9, 123(%r8,%rax,4)
+# INTEL: ctestb {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+0x62,0x54,0xc4,0x02,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestbb {dfv=of} $123, %bl
+# INTEL: ctestb {dfv=of} bl, 123
+0x62,0xf4,0x44,0x02,0xf6,0xc3,0x7b
+
+# ATT:   ctestbw {dfv=of} $1234, %dx
+# INTEL: ctestb {dfv=of} dx, 1234
+0x62,0xf4,0x45,0x02,0xf7,0xc2,0xd2,0x04
+
+# ATT:   ctestbl {dfv=of} $123456, %ecx
+# INTEL: ctestb {dfv=of} ecx, 123456
+0x62,0xf4,0x44,0x02,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestbq {dfv=of} $123456, %r9
+# INTEL: ctestb {dfv=of} r9, 123456
+0x62,0xd4,0xc4,0x02,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestbb {dfv=of} %bl, %dl
+# INTEL: ctestb {dfv=of} dl, bl
+0x62,0xf4,0x44,0x02,0x84,0xda
+
+# ATT:   ctestbw {dfv=of} %dx, %ax
+# INTEL: ctestb {dfv=of} ax, dx
+0x62,0xf4,0x45,0x02,0x85,0xd0
+
+# ATT:   ctestbl {dfv=of} %ecx, %edx
+# INTEL: ctestb {dfv=of} edx, ecx
+0x62,0xf4,0x44,0x02,0x85,0xca
+
+# ATT:   ctestbq {dfv=of} %r9, %r15
+# INTEL: ctestb {dfv=of} r15, r9
+0x62,0x54,0xc4,0x02,0x85,0xcf
+
+# ATT:   ctestbeb {dfv=of} $123, 123(%r8,%rax,4)
+# INTEL: ctestbe {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+0x62,0xd4,0x44,0x06,0xf6,0x44,0x80,0x7b,0x7b
+
+# ATT:   ctestbew {dfv=of} $1234, 123(%r8,%rax,4)
+# INTEL: ctestbe {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+0x62,0xd4,0x45,0x06,0xf7,0x44,0x80,0x7b,0xd2,0x04
+
+# ATT:   ctestbel {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctestbe {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0x44,0x06,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestbeq {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctestbe {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0xc4,0x06,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestbeb {dfv=of} %bl, 123(%r8,%rax,4)
+# INTEL: ctestbe {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+0x62,0xd4,0x44,0x06,0x84,0x5c,0x80,0x7b
+
+# ATT:   ctestbew {dfv=of} %dx, 123(%r8,%rax,4)
+# INTEL: ctestbe {dfv=of} word ptr [r8 + 4*rax + 123], dx
+0x62,0xd4,0x45,0x06,0x85,0x54,0x80,0x7b
+
+# ATT:   ctestbel {dfv=of} %ecx, 123(%r8,%rax,4)
+# INTEL: ctestbe {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+0x62,0xd4,0x44,0x06,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestbeq {dfv=of} %r9, 123(%r8,%rax,4)
+# INTEL: ctestbe {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+0x62,0x54,0xc4,0x06,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestbeb {dfv=of} $123, %bl
+# INTEL: ctestbe {dfv=of} bl, 123
+0x62,0xf4,0x44,0x06,0xf6,0xc3,0x7b
+
+# ATT:   ctestbew {dfv=of} $1234, %dx
+# INTEL: ctestbe {dfv=of} dx, 1234
+0x62,0xf4,0x45,0x06,0xf7,0xc2,0xd2,0x04
+
+# ATT:   ctestbel {dfv=of} $123456, %ecx
+# INTEL: ctestbe {dfv=of} ecx, 123456
+0x62,0xf4,0x44,0x06,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestbeq {dfv=of} $123456, %r9
+# INTEL: ctestbe {dfv=of} r9, 123456
+0x62,0xd4,0xc4,0x06,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestbeb {dfv=of} %bl, %dl
+# INTEL: ctestbe {dfv=of} dl, bl
+0x62,0xf4,0x44,0x06,0x84,0xda
+
+# ATT:   ctestbew {dfv=of} %dx, %ax
+# INTEL: ctestbe {dfv=of} ax, dx
+0x62,0xf4,0x45,0x06,0x85,0xd0
+
+# ATT:   ctestbel {dfv=of} %ecx, %edx
+# INTEL: ctestbe {dfv=of} edx, ecx
+0x62,0xf4,0x44,0x06,0x85,0xca
+
+# ATT:   ctestbeq {dfv=of} %r9, %r15
+# INTEL: ctestbe {dfv=of} r15, r9
+0x62,0x54,0xc4,0x06,0x85,0xcf
+
+# ATT:   ctestfb {dfv=of} $123, 123(%r8,%rax,4)
+# INTEL: ctestf {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+0x62,0xd4,0x44,0x0b,0xf6,0x44,0x80,0x7b,0x7b
+
+# ATT:   ctestfw {dfv=of} $1234, 123(%r8,%rax,4)
+# INTEL: ctestf {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+0x62,0xd4,0x45,0x0b,0xf7,0x44,0x80,0x7b,0xd2,0x04
+
+# ATT:   ctestfl {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctestf {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0x44,0x0b,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestfq {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctestf {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0xc4,0x0b,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestfb {dfv=of} %bl, 123(%r8,%rax,4)
+# INTEL: ctestf {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+0x62,0xd4,0x44,0x0b,0x84,0x5c,0x80,0x7b
+
+# ATT:   ctestfw {dfv=of} %dx, 123(%r8,%rax,4)
+# INTEL: ctestf {dfv=of} word ptr [r8 + 4*rax + 123], dx
+0x62,0xd4,0x45,0x0b,0x85,0x54,0x80,0x7b
+
+# ATT:   ctestfl {dfv=of} %ecx, 123(%r8,%rax,4)
+# INTEL: ctestf {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+0x62,0xd4,0x44,0x0b,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestfq {dfv=of} %r9, 123(%r8,%rax,4)
+# INTEL: ctestf {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+0x62,0x54,0xc4,0x0b,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestfb {dfv=of} $123, %bl
+# INTEL: ctestf {dfv=of} bl, 123
+0x62,0xf4,0x44,0x0b,0xf6,0xc3,0x7b
+
+# ATT:   ctestfw {dfv=of} $1234, %dx
+# INTEL: ctestf {dfv=of} dx, 1234
+0x62,0xf4,0x45,0x0b,0xf7,0xc2,0xd2,0x04
+
+# ATT:   ctestfl {dfv=of} $123456, %ecx
+# INTEL: ctestf {dfv=of} ecx, 123456
+0x62,0xf4,0x44,0x0b,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestfq {dfv=of} $123456, %r9
+# INTEL: ctestf {dfv=of} r9, 123456
+0x62,0xd4,0xc4,0x0b,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestfb {dfv=of} %bl, %dl
+# INTEL: ctestf {dfv=of} dl, bl
+0x62,0xf4,0x44,0x0b,0x84,0xda
+
+# ATT:   ctestfw {dfv=of} %dx, %ax
+# INTEL: ctestf {dfv=of} ax, dx
+0x62,0xf4,0x45,0x0b,0x85,0xd0
+
+# ATT:   ctestfl {dfv=of} %ecx, %edx
+# INTEL: ctestf {dfv=of} edx, ecx
+0x62,0xf4,0x44,0x0b,0x85,0xca
+
+# ATT:   ctestfq {dfv=of} %r9, %r15
+# INTEL: ctestf {dfv=of} r15, r9
+0x62,0x54,0xc4,0x0b,0x85,0xcf
+
+# ATT:   ctestlb {dfv=of} $123, 123(%r8,%rax,4)
+# INTEL: ctestl {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+0x62,0xd4,0x44,0x0c,0xf6,0x44,0x80,0x7b,0x7b
+
+# ATT:   ctestlw {dfv=of} $1234, 123(%r8,%rax,4)
+# INTEL: ctestl {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+0x62,0xd4,0x45,0x0c,0xf7,0x44,0x80,0x7b,0xd2,0x04
+
+# ATT:   ctestll {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctestl {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0x44,0x0c,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestlq {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctestl {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0xc4,0x0c,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestlb {dfv=of} %bl, 123(%r8,%rax,4)
+# INTEL: ctestl {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+0x62,0xd4,0x44,0x0c,0x84,0x5c,0x80,0x7b
+
+# ATT:   ctestlw {dfv=of} %dx, 123(%r8,%rax,4)
+# INTEL: ctestl {dfv=of} word ptr [r8 + 4*rax + 123], dx
+0x62,0xd4,0x45,0x0c,0x85,0x54,0x80,0x7b
+
+# ATT:   ctestll {dfv=of} %ecx, 123(%r8,%rax,4)
+# INTEL: ctestl {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+0x62,0xd4,0x44,0x0c,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestlq {dfv=of} %r9, 123(%r8,%rax,4)
+# INTEL: ctestl {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+0x62,0x54,0xc4,0x0c,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestlb {dfv=of} $123, %bl
+# INTEL: ctestl {dfv=of} bl, 123
+0x62,0xf4,0x44,0x0c,0xf6,0xc3,0x7b
+
+# ATT:   ctestlw {dfv=of} $1234, %dx
+# INTEL: ctestl {dfv=of} dx, 1234
+0x62,0xf4,0x45,0x0c,0xf7,0xc2,0xd2,0x04
+
+# ATT:   ctestll {dfv=of} $123456, %ecx
+# INTEL: ctestl {dfv=of} ecx, 123456
+0x62,0xf4,0x44,0x0c,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestlq {dfv=of} $123456, %r9
+# INTEL: ctestl {dfv=of} r9, 123456
+0x62,0xd4,0xc4,0x0c,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestlb {dfv=of} %bl, %dl
+# INTEL: ctestl {dfv=of} dl, bl
+0x62,0xf4,0x44,0x0c,0x84,0xda
+
+# ATT:   ctestlw {dfv=of} %dx, %ax
+# INTEL: ctestl {dfv=of} ax, dx
+0x62,0xf4,0x45,0x0c,0x85,0xd0
+
+# ATT:   ctestll {dfv=of} %ecx, %edx
+# INTEL: ctestl {dfv=of} edx, ecx
+0x62,0xf4,0x44,0x0c,0x85,0xca
+
+# ATT:   ctestlq {dfv=of} %r9, %r15
+# INTEL: ctestl {dfv=of} r15, r9
+0x62,0x54,0xc4,0x0c,0x85,0xcf
+
+# ATT:   ctestleb {dfv=of} $123, 123(%r8,%rax,4)
+# INTEL: ctestle {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+0x62,0xd4,0x44,0x0e,0xf6,0x44,0x80,0x7b,0x7b
+
+# ATT:   ctestlew {dfv=of} $1234, 123(%r8,%rax,4)
+# INTEL: ctestle {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+0x62,0xd4,0x45,0x0e,0xf7,0x44,0x80,0x7b,0xd2,0x04
+
+# ATT:   ctestlel {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctestle {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0x44,0x0e,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestleq {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctestle {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0xc4,0x0e,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestleb {dfv=of} %bl, 123(%r8,%rax,4)
+# INTEL: ctestle {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+0x62,0xd4,0x44,0x0e,0x84,0x5c,0x80,0x7b
+
+# ATT:   ctestlew {dfv=of} %dx, 123(%r8,%rax,4)
+# INTEL: ctestle {dfv=of} word ptr [r8 + 4*rax + 123], dx
+0x62,0xd4,0x45,0x0e,0x85,0x54,0x80,0x7b
+
+# ATT:   ctestlel {dfv=of} %ecx, 123(%r8,%rax,4)
+# INTEL: ctestle {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+0x62,0xd4,0x44,0x0e,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestleq {dfv=of} %r9, 123(%r8,%rax,4)
+# INTEL: ctestle {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+0x62,0x54,0xc4,0x0e,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestleb {dfv=of} $123, %bl
+# INTEL: ctestle {dfv=of} bl, 123
+0x62,0xf4,0x44,0x0e,0xf6,0xc3,0x7b
+
+# ATT:   ctestlew {dfv=of} $1234, %dx
+# INTEL: ctestle {dfv=of} dx, 1234
+0x62,0xf4,0x45,0x0e,0xf7,0xc2,0xd2,0x04
+
+# ATT:   ctestlel {dfv=of} $123456, %ecx
+# INTEL: ctestle {dfv=of} ecx, 123456
+0x62,0xf4,0x44,0x0e,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestleq {dfv=of} $123456, %r9
+# INTEL: ctestle {dfv=of} r9, 123456
+0x62,0xd4,0xc4,0x0e,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestleb {dfv=of} %bl, %dl
+# INTEL: ctestle {dfv=of} dl, bl
+0x62,0xf4,0x44,0x0e,0x84,0xda
+
+# ATT:   ctestlew {dfv=of} %dx, %ax
+# INTEL: ctestle {dfv=of} ax, dx
+0x62,0xf4,0x45,0x0e,0x85,0xd0
+
+# ATT:   ctestlel {dfv=of} %ecx, %edx
+# INTEL: ctestle {dfv=of} edx, ecx
+0x62,0xf4,0x44,0x0e,0x85,0xca
+
+# ATT:   ctestleq {dfv=of} %r9, %r15
+# INTEL: ctestle {dfv=of} r15, r9
+0x62,0x54,0xc4,0x0e,0x85,0xcf
+
+# ATT:   ctestaeb {dfv=of} $123, 123(%r8,%rax,4)
+# INTEL: ctestae {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+0x62,0xd4,0x44,0x03,0xf6,0x44,0x80,0x7b,0x7b
+
+# ATT:   ctestaew {dfv=of} $1234, 123(%r8,%rax,4)
+# INTEL: ctestae {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+0x62,0xd4,0x45,0x03,0xf7,0x44,0x80,0x7b,0xd2,0x04
+
+# ATT:   ctestael {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctestae {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0x44,0x03,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestaeq {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctestae {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0xc4,0x03,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestaeb {dfv=of} %bl, 123(%r8,%rax,4)
+# INTEL: ctestae {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+0x62,0xd4,0x44,0x03,0x84,0x5c,0x80,0x7b
+
+# ATT:   ctestaew {dfv=of} %dx, 123(%r8,%rax,4)
+# INTEL: ctestae {dfv=of} word ptr [r8 + 4*rax + 123], dx
+0x62,0xd4,0x45,0x03,0x85,0x54,0x80,0x7b
+
+# ATT:   ctestael {dfv=of} %ecx, 123(%r8,%rax,4)
+# INTEL: ctestae {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+0x62,0xd4,0x44,0x03,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestaeq {dfv=of} %r9, 123(%r8,%rax,4)
+# INTEL: ctestae {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+0x62,0x54,0xc4,0x03,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestaeb {dfv=of} $123, %bl
+# INTEL: ctestae {dfv=of} bl, 123
+0x62,0xf4,0x44,0x03,0xf6,0xc3,0x7b
+
+# ATT:   ctestaew {dfv=of} $1234, %dx
+# INTEL: ctestae {dfv=of} dx, 1234
+0x62,0xf4,0x45,0x03,0xf7,0xc2,0xd2,0x04
+
+# ATT:   ctestael {dfv=of} $123456, %ecx
+# INTEL: ctestae {dfv=of} ecx, 123456
+0x62,0xf4,0x44,0x03,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestaeq {dfv=of} $123456, %r9
+# INTEL: ctestae {dfv=of} r9, 123456
+0x62,0xd4,0xc4,0x03,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestaeb {dfv=of} %bl, %dl
+# INTEL: ctestae {dfv=of} dl, bl
+0x62,0xf4,0x44,0x03,0x84,0xda
+
+# ATT:   ctestaew {dfv=of} %dx, %ax
+# INTEL: ctestae {dfv=of} ax, dx
+0x62,0xf4,0x45,0x03,0x85,0xd0
+
+# ATT:   ctestael {dfv=of} %ecx, %edx
+# INTEL: ctestae {dfv=of} edx, ecx
+0x62,0xf4,0x44,0x03,0x85,0xca
+
+# ATT:   ctestaeq {dfv=of} %r9, %r15
+# INTEL: ctestae {dfv=of} r15, r9
+0x62,0x54,0xc4,0x03,0x85,0xcf
+
+# ATT:   ctestab {dfv=of} $123, 123(%r8,%rax,4)
+# INTEL: ctesta {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+0x62,0xd4,0x44,0x07,0xf6,0x44,0x80,0x7b,0x7b
+
+# ATT:   ctestaw {dfv=of} $1234, 123(%r8,%rax,4)
+# INTEL: ctesta {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+0x62,0xd4,0x45,0x07,0xf7,0x44,0x80,0x7b,0xd2,0x04
+
+# ATT:   ctestal {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctesta {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0x44,0x07,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestaq {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctesta {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0xc4,0x07,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestab {dfv=of} %bl, 123(%r8,%rax,4)
+# INTEL: ctesta {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+0x62,0xd4,0x44,0x07,0x84,0x5c,0x80,0x7b
+
+# ATT:   ctestaw {dfv=of} %dx, 123(%r8,%rax,4)
+# INTEL: ctesta {dfv=of} word ptr [r8 + 4*rax + 123], dx
+0x62,0xd4,0x45,0x07,0x85,0x54,0x80,0x7b
+
+# ATT:   ctestal {dfv=of} %ecx, 123(%r8,%rax,4)
+# INTEL: ctesta {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+0x62,0xd4,0x44,0x07,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestaq {dfv=of} %r9, 123(%r8,%rax,4)
+# INTEL: ctesta {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+0x62,0x54,0xc4,0x07,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestab {dfv=of} $123, %bl
+# INTEL: ctesta {dfv=of} bl, 123
+0x62,0xf4,0x44,0x07,0xf6,0xc3,0x7b
+
+# ATT:   ctestaw {dfv=of} $1234, %dx
+# INTEL: ctesta {dfv=of} dx, 1234
+0x62,0xf4,0x45,0x07,0xf7,0xc2,0xd2,0x04
+
+# ATT:   ctestal {dfv=of} $123456, %ecx
+# INTEL: ctesta {dfv=of} ecx, 123456
+0x62,0xf4,0x44,0x07,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestaq {dfv=of} $123456, %r9
+# INTEL: ctesta {dfv=of} r9, 123456
+0x62,0xd4,0xc4,0x07,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestab {dfv=of} %bl, %dl
+# INTEL: ctesta {dfv=of} dl, bl
+0x62,0xf4,0x44,0x07,0x84,0xda
+
+# ATT:   ctestaw {dfv=of} %dx, %ax
+# INTEL: ctesta {dfv=of} ax, dx
+0x62,0xf4,0x45,0x07,0x85,0xd0
+
+# ATT:   ctestal {dfv=of} %ecx, %edx
+# INTEL: ctesta {dfv=of} edx, ecx
+0x62,0xf4,0x44,0x07,0x85,0xca
+
+# ATT:   ctestaq {dfv=of} %r9, %r15
+# INTEL: ctesta {dfv=of} r15, r9
+0x62,0x54,0xc4,0x07,0x85,0xcf
+
+# ATT:   ctestgeb {dfv=of} $123, 123(%r8,%rax,4)
+# INTEL: ctestge {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+0x62,0xd4,0x44,0x0d,0xf6,0x44,0x80,0x7b,0x7b
+
+# ATT:   ctestgew {dfv=of} $1234, 123(%r8,%rax,4)
+# INTEL: ctestge {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+0x62,0xd4,0x45,0x0d,0xf7,0x44,0x80,0x7b,0xd2,0x04
+
+# ATT:   ctestgel {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctestge {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0x44,0x0d,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestgeq {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctestge {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0xc4,0x0d,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestgeb {dfv=of} %bl, 123(%r8,%rax,4)
+# INTEL: ctestge {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+0x62,0xd4,0x44,0x0d,0x84,0x5c,0x80,0x7b
+
+# ATT:   ctestgew {dfv=of} %dx, 123(%r8,%rax,4)
+# INTEL: ctestge {dfv=of} word ptr [r8 + 4*rax + 123], dx
+0x62,0xd4,0x45,0x0d,0x85,0x54,0x80,0x7b
+
+# ATT:   ctestgel {dfv=of} %ecx, 123(%r8,%rax,4)
+# INTEL: ctestge {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+0x62,0xd4,0x44,0x0d,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestgeq {dfv=of} %r9, 123(%r8,%rax,4)
+# INTEL: ctestge {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+0x62,0x54,0xc4,0x0d,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestgeb {dfv=of} $123, %bl
+# INTEL: ctestge {dfv=of} bl, 123
+0x62,0xf4,0x44,0x0d,0xf6,0xc3,0x7b
+
+# ATT:   ctestgew {dfv=of} $1234, %dx
+# INTEL: ctestge {dfv=of} dx, 1234
+0x62,0xf4,0x45,0x0d,0xf7,0xc2,0xd2,0x04
+
+# ATT:   ctestgel {dfv=of} $123456, %ecx
+# INTEL: ctestge {dfv=of} ecx, 123456
+0x62,0xf4,0x44,0x0d,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestgeq {dfv=of} $123456, %r9
+# INTEL: ctestge {dfv=of} r9, 123456
+0x62,0xd4,0xc4,0x0d,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestgeb {dfv=of} %bl, %dl
+# INTEL: ctestge {dfv=of} dl, bl
+0x62,0xf4,0x44,0x0d,0x84,0xda
+
+# ATT:   ctestgew {dfv=of} %dx, %ax
+# INTEL: ctestge {dfv=of} ax, dx
+0x62,0xf4,0x45,0x0d,0x85,0xd0
+
+# ATT:   ctestgel {dfv=of} %ecx, %edx
+# INTEL: ctestge {dfv=of} edx, ecx
+0x62,0xf4,0x44,0x0d,0x85,0xca
+
+# ATT:   ctestgeq {dfv=of} %r9, %r15
+# INTEL: ctestge {dfv=of} r15, r9
+0x62,0x54,0xc4,0x0d,0x85,0xcf
+
+# ATT:   ctestgb {dfv=of} $123, 123(%r8,%rax,4)
+# INTEL: ctestg {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+0x62,0xd4,0x44,0x0f,0xf6,0x44,0x80,0x7b,0x7b
+
+# ATT:   ctestgw {dfv=of} $1234, 123(%r8,%rax,4)
+# INTEL: ctestg {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+0x62,0xd4,0x45,0x0f,0xf7,0x44,0x80,0x7b,0xd2,0x04
+
+# ATT:   ctestgl {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctestg {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0x44,0x0f,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestgq {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctestg {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0xc4,0x0f,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestgb {dfv=of} %bl, 123(%r8,%rax,4)
+# INTEL: ctestg {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+0x62,0xd4,0x44,0x0f,0x84,0x5c,0x80,0x7b
+
+# ATT:   ctestgw {dfv=of} %dx, 123(%r8,%rax,4)
+# INTEL: ctestg {dfv=of} word ptr [r8 + 4*rax + 123], dx
+0x62,0xd4,0x45,0x0f,0x85,0x54,0x80,0x7b
+
+# ATT:   ctestgl {dfv=of} %ecx, 123(%r8,%rax,4)
+# INTEL: ctestg {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+0x62,0xd4,0x44,0x0f,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestgq {dfv=of} %r9, 123(%r8,%rax,4)
+# INTEL: ctestg {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+0x62,0x54,0xc4,0x0f,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestgb {dfv=of} $123, %bl
+# INTEL: ctestg {dfv=of} bl, 123
+0x62,0xf4,0x44,0x0f,0xf6,0xc3,0x7b
+
+# ATT:   ctestgw {dfv=of} $1234, %dx
+# INTEL: ctestg {dfv=of} dx, 1234
+0x62,0xf4,0x45,0x0f,0xf7,0xc2,0xd2,0x04
+
+# ATT:   ctestgl {dfv=of} $123456, %ecx
+# INTEL: ctestg {dfv=of} ecx, 123456
+0x62,0xf4,0x44,0x0f,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestgq {dfv=of} $123456, %r9
+# INTEL: ctestg {dfv=of} r9, 123456
+0x62,0xd4,0xc4,0x0f,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestgb {dfv=of} %bl, %dl
+# INTEL: ctestg {dfv=of} dl, bl
+0x62,0xf4,0x44,0x0f,0x84,0xda
+
+# ATT:   ctestgw {dfv=of} %dx, %ax
+# INTEL: ctestg {dfv=of} ax, dx
+0x62,0xf4,0x45,0x0f,0x85,0xd0
+
+# ATT:   ctestgl {dfv=of} %ecx, %edx
+# INTEL: ctestg {dfv=of} edx, ecx
+0x62,0xf4,0x44,0x0f,0x85,0xca
+
+# ATT:   ctestgq {dfv=of} %r9, %r15
+# INTEL: ctestg {dfv=of} r15, r9
+0x62,0x54,0xc4,0x0f,0x85,0xcf
+
+# ATT:   ctestnob {dfv=of} $123, 123(%r8,%rax,4)
+# INTEL: ctestno {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+0x62,0xd4,0x44,0x01,0xf6,0x44,0x80,0x7b,0x7b
+
+# ATT:   ctestnow {dfv=of} $1234, 123(%r8,%rax,4)
+# INTEL: ctestno {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+0x62,0xd4,0x45,0x01,0xf7,0x44,0x80,0x7b,0xd2,0x04
+
+# ATT:   ctestnol {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctestno {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0x44,0x01,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestnoq {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctestno {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0xc4,0x01,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestnob {dfv=of} %bl, 123(%r8,%rax,4)
+# INTEL: ctestno {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+0x62,0xd4,0x44,0x01,0x84,0x5c,0x80,0x7b
+
+# ATT:   ctestnow {dfv=of} %dx, 123(%r8,%rax,4)
+# INTEL: ctestno {dfv=of} word ptr [r8 + 4*rax + 123], dx
+0x62,0xd4,0x45,0x01,0x85,0x54,0x80,0x7b
+
+# ATT:   ctestnol {dfv=of} %ecx, 123(%r8,%rax,4)
+# INTEL: ctestno {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+0x62,0xd4,0x44,0x01,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestnoq {dfv=of} %r9, 123(%r8,%rax,4)
+# INTEL: ctestno {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+0x62,0x54,0xc4,0x01,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestnob {dfv=of} $123, %bl
+# INTEL: ctestno {dfv=of} bl, 123
+0x62,0xf4,0x44,0x01,0xf6,0xc3,0x7b
+
+# ATT:   ctestnow {dfv=of} $1234, %dx
+# INTEL: ctestno {dfv=of} dx, 1234
+0x62,0xf4,0x45,0x01,0xf7,0xc2,0xd2,0x04
+
+# ATT:   ctestnol {dfv=of} $123456, %ecx
+# INTEL: ctestno {dfv=of} ecx, 123456
+0x62,0xf4,0x44,0x01,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestnoq {dfv=of} $123456, %r9
+# INTEL: ctestno {dfv=of} r9, 123456
+0x62,0xd4,0xc4,0x01,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestnob {dfv=of} %bl, %dl
+# INTEL: ctestno {dfv=of} dl, bl
+0x62,0xf4,0x44,0x01,0x84,0xda
+
+# ATT:   ctestnow {dfv=of} %dx, %ax
+# INTEL: ctestno {dfv=of} ax, dx
+0x62,0xf4,0x45,0x01,0x85,0xd0
+
+# ATT:   ctestnol {dfv=of} %ecx, %edx
+# INTEL: ctestno {dfv=of} edx, ecx
+0x62,0xf4,0x44,0x01,0x85,0xca
+
+# ATT:   ctestnoq {dfv=of} %r9, %r15
+# INTEL: ctestno {dfv=of} r15, r9
+0x62,0x54,0xc4,0x01,0x85,0xcf
+
+# ATT:   ctestnsb {dfv=of} $123, 123(%r8,%rax,4)
+# INTEL: ctestns {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+0x62,0xd4,0x44,0x09,0xf6,0x44,0x80,0x7b,0x7b
+
+# ATT:   ctestnsw {dfv=of} $1234, 123(%r8,%rax,4)
+# INTEL: ctestns {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+0x62,0xd4,0x45,0x09,0xf7,0x44,0x80,0x7b,0xd2,0x04
+
+# ATT:   ctestnsl {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctestns {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0x44,0x09,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestnsq {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctestns {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0xc4,0x09,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestnsb {dfv=of} %bl, 123(%r8,%rax,4)
+# INTEL: ctestns {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+0x62,0xd4,0x44,0x09,0x84,0x5c,0x80,0x7b
+
+# ATT:   ctestnsw {dfv=of} %dx, 123(%r8,%rax,4)
+# INTEL: ctestns {dfv=of} word ptr [r8 + 4*rax + 123], dx
+0x62,0xd4,0x45,0x09,0x85,0x54,0x80,0x7b
+
+# ATT:   ctestnsl {dfv=of} %ecx, 123(%r8,%rax,4)
+# INTEL: ctestns {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+0x62,0xd4,0x44,0x09,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestnsq {dfv=of} %r9, 123(%r8,%rax,4)
+# INTEL: ctestns {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+0x62,0x54,0xc4,0x09,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestnsb {dfv=of} $123, %bl
+# INTEL: ctestns {dfv=of} bl, 123
+0x62,0xf4,0x44,0x09,0xf6,0xc3,0x7b
+
+# ATT:   ctestnsw {dfv=of} $1234, %dx
+# INTEL: ctestns {dfv=of} dx, 1234
+0x62,0xf4,0x45,0x09,0xf7,0xc2,0xd2,0x04
+
+# ATT:   ctestnsl {dfv=of} $123456, %ecx
+# INTEL: ctestns {dfv=of} ecx, 123456
+0x62,0xf4,0x44,0x09,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestnsq {dfv=of} $123456, %r9
+# INTEL: ctestns {dfv=of} r9, 123456
+0x62,0xd4,0xc4,0x09,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestnsb {dfv=of} %bl, %dl
+# INTEL: ctestns {dfv=of} dl, bl
+0x62,0xf4,0x44,0x09,0x84,0xda
+
+# ATT:   ctestnsw {dfv=of} %dx, %ax
+# INTEL: ctestns {dfv=of} ax, dx
+0x62,0xf4,0x45,0x09,0x85,0xd0
+
+# ATT:   ctestnsl {dfv=of} %ecx, %edx
+# INTEL: ctestns {dfv=of} edx, ecx
+0x62,0xf4,0x44,0x09,0x85,0xca
+
+# ATT:   ctestnsq {dfv=of} %r9, %r15
+# INTEL: ctestns {dfv=of} r15, r9
+0x62,0x54,0xc4,0x09,0x85,0xcf
+
+# ATT:   ctestneb {dfv=of} $123, 123(%r8,%rax,4)
+# INTEL: ctestne {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+0x62,0xd4,0x44,0x05,0xf6,0x44,0x80,0x7b,0x7b
+
+# ATT:   ctestnew {dfv=of} $1234, 123(%r8,%rax,4)
+# INTEL: ctestne {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+0x62,0xd4,0x45,0x05,0xf7,0x44,0x80,0x7b,0xd2,0x04
+
+# ATT:   ctestnel {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctestne {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0x44,0x05,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestneq {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctestne {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0xc4,0x05,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestneb {dfv=of} %bl, 123(%r8,%rax,4)
+# INTEL: ctestne {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+0x62,0xd4,0x44,0x05,0x84,0x5c,0x80,0x7b
+
+# ATT:   ctestnew {dfv=of} %dx, 123(%r8,%rax,4)
+# INTEL: ctestne {dfv=of} word ptr [r8 + 4*rax + 123], dx
+0x62,0xd4,0x45,0x05,0x85,0x54,0x80,0x7b
+
+# ATT:   ctestnel {dfv=of} %ecx, 123(%r8,%rax,4)
+# INTEL: ctestne {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+0x62,0xd4,0x44,0x05,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestneq {dfv=of} %r9, 123(%r8,%rax,4)
+# INTEL: ctestne {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+0x62,0x54,0xc4,0x05,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestneb {dfv=of} $123, %bl
+# INTEL: ctestne {dfv=of} bl, 123
+0x62,0xf4,0x44,0x05,0xf6,0xc3,0x7b
+
+# ATT:   ctestnew {dfv=of} $1234, %dx
+# INTEL: ctestne {dfv=of} dx, 1234
+0x62,0xf4,0x45,0x05,0xf7,0xc2,0xd2,0x04
+
+# ATT:   ctestnel {dfv=of} $123456, %ecx
+# INTEL: ctestne {dfv=of} ecx, 123456
+0x62,0xf4,0x44,0x05,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestneq {dfv=of} $123456, %r9
+# INTEL: ctestne {dfv=of} r9, 123456
+0x62,0xd4,0xc4,0x05,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestneb {dfv=of} %bl, %dl
+# INTEL: ctestne {dfv=of} dl, bl
+0x62,0xf4,0x44,0x05,0x84,0xda
+
+# ATT:   ctestnew {dfv=of} %dx, %ax
+# INTEL: ctestne {dfv=of} ax, dx
+0x62,0xf4,0x45,0x05,0x85,0xd0
+
+# ATT:   ctestnel {dfv=of} %ecx, %edx
+# INTEL: ctestne {dfv=of} edx, ecx
+0x62,0xf4,0x44,0x05,0x85,0xca
+
+# ATT:   ctestneq {dfv=of} %r9, %r15
+# INTEL: ctestne {dfv=of} r15, r9
+0x62,0x54,0xc4,0x05,0x85,0xcf
+
+# ATT:   ctestob {dfv=of} $123, 123(%r8,%rax,4)
+# INTEL: ctesto {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+0x62,0xd4,0x44,0x00,0xf6,0x44,0x80,0x7b,0x7b
+
+# ATT:   ctestow {dfv=of} $1234, 123(%r8,%rax,4)
+# INTEL: ctesto {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+0x62,0xd4,0x45,0x00,0xf7,0x44,0x80,0x7b,0xd2,0x04
+
+# ATT:   ctestol {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctesto {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0x44,0x00,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestoq {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctesto {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0xc4,0x00,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestob {dfv=of} %bl, 123(%r8,%rax,4)
+# INTEL: ctesto {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+0x62,0xd4,0x44,0x00,0x84,0x5c,0x80,0x7b
+
+# ATT:   ctestow {dfv=of} %dx, 123(%r8,%rax,4)
+# INTEL: ctesto {dfv=of} word ptr [r8 + 4*rax + 123], dx
+0x62,0xd4,0x45,0x00,0x85,0x54,0x80,0x7b
+
+# ATT:   ctestol {dfv=of} %ecx, 123(%r8,%rax,4)
+# INTEL: ctesto {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+0x62,0xd4,0x44,0x00,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestoq {dfv=of} %r9, 123(%r8,%rax,4)
+# INTEL: ctesto {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+0x62,0x54,0xc4,0x00,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestob {dfv=of} $123, %bl
+# INTEL: ctesto {dfv=of} bl, 123
+0x62,0xf4,0x44,0x00,0xf6,0xc3,0x7b
+
+# ATT:   ctestow {dfv=of} $1234, %dx
+# INTEL: ctesto {dfv=of} dx, 1234
+0x62,0xf4,0x45,0x00,0xf7,0xc2,0xd2,0x04
+
+# ATT:   ctestol {dfv=of} $123456, %ecx
+# INTEL: ctesto {dfv=of} ecx, 123456
+0x62,0xf4,0x44,0x00,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestoq {dfv=of} $123456, %r9
+# INTEL: ctesto {dfv=of} r9, 123456
+0x62,0xd4,0xc4,0x00,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestob {dfv=of} %bl, %dl
+# INTEL: ctesto {dfv=of} dl, bl
+0x62,0xf4,0x44,0x00,0x84,0xda
+
+# ATT:   ctestow {dfv=of} %dx, %ax
+# INTEL: ctesto {dfv=of} ax, dx
+0x62,0xf4,0x45,0x00,0x85,0xd0
+
+# ATT:   ctestol {dfv=of} %ecx, %edx
+# INTEL: ctesto {dfv=of} edx, ecx
+0x62,0xf4,0x44,0x00,0x85,0xca
+
+# ATT:   ctestoq {dfv=of} %r9, %r15
+# INTEL: ctesto {dfv=of} r15, r9
+0x62,0x54,0xc4,0x00,0x85,0xcf
+
+# ATT:   ctestsb {dfv=of} $123, 123(%r8,%rax,4)
+# INTEL: ctests {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+0x62,0xd4,0x44,0x08,0xf6,0x44,0x80,0x7b,0x7b
+
+# ATT:   ctestsw {dfv=of} $1234, 123(%r8,%rax,4)
+# INTEL: ctests {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+0x62,0xd4,0x45,0x08,0xf7,0x44,0x80,0x7b,0xd2,0x04
+
+# ATT:   ctestsl {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctests {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0x44,0x08,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestsq {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctests {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0xc4,0x08,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestsb {dfv=of} %bl, 123(%r8,%rax,4)
+# INTEL: ctests {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+0x62,0xd4,0x44,0x08,0x84,0x5c,0x80,0x7b
+
+# ATT:   ctestsw {dfv=of} %dx, 123(%r8,%rax,4)
+# INTEL: ctests {dfv=of} word ptr [r8 + 4*rax + 123], dx
+0x62,0xd4,0x45,0x08,0x85,0x54,0x80,0x7b
+
+# ATT:   ctestsl {dfv=of} %ecx, 123(%r8,%rax,4)
+# INTEL: ctests {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+0x62,0xd4,0x44,0x08,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestsq {dfv=of} %r9, 123(%r8,%rax,4)
+# INTEL: ctests {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+0x62,0x54,0xc4,0x08,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctestsb {dfv=of} $123, %bl
+# INTEL: ctests {dfv=of} bl, 123
+0x62,0xf4,0x44,0x08,0xf6,0xc3,0x7b
+
+# ATT:   ctestsw {dfv=of} $1234, %dx
+# INTEL: ctests {dfv=of} dx, 1234
+0x62,0xf4,0x45,0x08,0xf7,0xc2,0xd2,0x04
+
+# ATT:   ctestsl {dfv=of} $123456, %ecx
+# INTEL: ctests {dfv=of} ecx, 123456
+0x62,0xf4,0x44,0x08,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestsq {dfv=of} $123456, %r9
+# INTEL: ctests {dfv=of} r9, 123456
+0x62,0xd4,0xc4,0x08,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctestsb {dfv=of} %bl, %dl
+# INTEL: ctests {dfv=of} dl, bl
+0x62,0xf4,0x44,0x08,0x84,0xda
+
+# ATT:   ctestsw {dfv=of} %dx, %ax
+# INTEL: ctests {dfv=of} ax, dx
+0x62,0xf4,0x45,0x08,0x85,0xd0
+
+# ATT:   ctestsl {dfv=of} %ecx, %edx
+# INTEL: ctests {dfv=of} edx, ecx
+0x62,0xf4,0x44,0x08,0x85,0xca
+
+# ATT:   ctestsq {dfv=of} %r9, %r15
+# INTEL: ctests {dfv=of} r15, r9
+0x62,0x54,0xc4,0x08,0x85,0xcf
+
+# ATT:   ctesttb {dfv=of} $123, 123(%r8,%rax,4)
+# INTEL: ctestt {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+0x62,0xd4,0x44,0x0a,0xf6,0x44,0x80,0x7b,0x7b
+
+# ATT:   ctesttw {dfv=of} $1234, 123(%r8,%rax,4)
+# INTEL: ctestt {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+0x62,0xd4,0x45,0x0a,0xf7,0x44,0x80,0x7b,0xd2,0x04
+
+# ATT:   ctesttl {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctestt {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0x44,0x0a,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctesttq {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: ctestt {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0xc4,0x0a,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctesttb {dfv=of} %bl, 123(%r8,%rax,4)
+# INTEL: ctestt {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+0x62,0xd4,0x44,0x0a,0x84,0x5c,0x80,0x7b
+
+# ATT:   ctesttw {dfv=of} %dx, 123(%r8,%rax,4)
+# INTEL: ctestt {dfv=of} word ptr [r8 + 4*rax + 123], dx
+0x62,0xd4,0x45,0x0a,0x85,0x54,0x80,0x7b
+
+# ATT:   ctesttl {dfv=of} %ecx, 123(%r8,%rax,4)
+# INTEL: ctestt {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+0x62,0xd4,0x44,0x0a,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctesttq {dfv=of} %r9, 123(%r8,%rax,4)
+# INTEL: ctestt {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+0x62,0x54,0xc4,0x0a,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctesttb {dfv=of} $123, %bl
+# INTEL: ctestt {dfv=of} bl, 123
+0x62,0xf4,0x44,0x0a,0xf6,0xc3,0x7b
+
+# ATT:   ctesttw {dfv=of} $1234, %dx
+# INTEL: ctestt {dfv=of} dx, 1234
+0x62,0xf4,0x45,0x0a,0xf7,0xc2,0xd2,0x04
+
+# ATT:   ctesttl {dfv=of} $123456, %ecx
+# INTEL: ctestt {dfv=of} ecx, 123456
+0x62,0xf4,0x44,0x0a,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctesttq {dfv=of} $123456, %r9
+# INTEL: ctestt {dfv=of} r9, 123456
+0x62,0xd4,0xc4,0x0a,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctesttb {dfv=of} %bl, %dl
+# INTEL: ctestt {dfv=of} dl, bl
+0x62,0xf4,0x44,0x0a,0x84,0xda
+
+# ATT:   ctesttw {dfv=of} %dx, %ax
+# INTEL: ctestt {dfv=of} ax, dx
+0x62,0xf4,0x45,0x0a,0x85,0xd0
+
+# ATT:   ctesttl {dfv=of} %ecx, %edx
+# INTEL: ctestt {dfv=of} edx, ecx
+0x62,0xf4,0x44,0x0a,0x85,0xca
+
+# ATT:   ctesttq {dfv=of} %r9, %r15
+# INTEL: ctestt {dfv=of} r15, r9
+0x62,0x54,0xc4,0x0a,0x85,0xcf
+
+# ATT:   ctesteb {dfv=of} $123, 123(%r8,%rax,4)
+# INTEL: cteste {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+0x62,0xd4,0x44,0x04,0xf6,0x44,0x80,0x7b,0x7b
+
+# ATT:   ctestew {dfv=of} $1234, 123(%r8,%rax,4)
+# INTEL: cteste {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+0x62,0xd4,0x45,0x04,0xf7,0x44,0x80,0x7b,0xd2,0x04
+
+# ATT:   ctestel {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: cteste {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0x44,0x04,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctesteq {dfv=of} $123456, 123(%r8,%rax,4)
+# INTEL: cteste {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+0x62,0xd4,0xc4,0x04,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00
+
+# ATT:   ctesteb {dfv=of} %bl, 123(%r8,%rax,4)
+# INTEL: cteste {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+0x62,0xd4,0x44,0x04,0x84,0x5c,0x80,0x7b
+
+# ATT:   ctestew {dfv=of} %dx, 123(%r8,%rax,4)
+# INTEL: cteste {dfv=of} word ptr [r8 + 4*rax + 123], dx
+0x62,0xd4,0x45,0x04,0x85,0x54,0x80,0x7b
+
+# ATT:   ctestel {dfv=of} %ecx, 123(%r8,%rax,4)
+# INTEL: cteste {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+0x62,0xd4,0x44,0x04,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctesteq {dfv=of} %r9, 123(%r8,%rax,4)
+# INTEL: cteste {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+0x62,0x54,0xc4,0x04,0x85,0x4c,0x80,0x7b
+
+# ATT:   ctesteb {dfv=of} $123, %bl
+# INTEL: cteste {dfv=of} bl, 123
+0x62,0xf4,0x44,0x04,0xf6,0xc3,0x7b
+
+# ATT:   ctestew {dfv=of} $1234, %dx
+# INTEL: cteste {dfv=of} dx, 1234
+0x62,0xf4,0x45,0x04,0xf7,0xc2,0xd2,0x04
+
+# ATT:   ctestel {dfv=of} $123456, %ecx
+# INTEL: cteste {dfv=of} ecx, 123456
+0x62,0xf4,0x44,0x04,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctesteq {dfv=of} $123456, %r9
+# INTEL: cteste {dfv=of} r9, 123456
+0x62,0xd4,0xc4,0x04,0xf7,0xc1,0x40,0xe2,0x01,0x00
+
+# ATT:   ctesteb {dfv=of} %bl, %dl
+# INTEL: cteste {dfv=of} dl, bl
+0x62,0xf4,0x44,0x04,0x84,0xda
+
+# ATT:   ctestew {dfv=of} %dx, %ax
+# INTEL: cteste {dfv=of} ax, dx
+0x62,0xf4,0x45,0x04,0x85,0xd0
+
+# ATT:   ctestel {dfv=of} %ecx, %edx
+# INTEL: cteste {dfv=of} edx, ecx
+0x62,0xf4,0x44,0x04,0x85,0xca
+
+# ATT:   ctesteq {dfv=of} %r9, %r15
+# INTEL: cteste {dfv=of} r15, r9
+0x62,0x54,0xc4,0x04,0x85,0xcf
diff --git a/llvm/test/MC/X86/apx/ctest-att.s b/llvm/test/MC/X86/apx/ctest-att.s
new file mode 100644
index 0000000000000..b9e98adc9841b
--- /dev/null
+++ b/llvm/test/MC/X86/apx/ctest-att.s
@@ -0,0 +1,773 @@
+# RUN: llvm-mc -triple x86_64 -show-encoding %s | FileCheck %s
+# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
+
+# ERROR-COUNT-256: error:
+# ERROR-NOT: error:
+# CHECK: ctestbb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x02,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestbb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: ctestbw {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x02,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestbw {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: ctestbl {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x02,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestbl {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestbq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xc4,0x02,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestbq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestbb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x02,0x84,0x5c,0x80,0x7b]
+         ctestbb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: ctestbw {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x02,0x85,0x54,0x80,0x7b]
+         ctestbw {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: ctestbl {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x02,0x85,0x4c,0x80,0x7b]
+         ctestbl {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: ctestbq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0xc4,0x02,0x85,0x4c,0x80,0x7b]
+         ctestbq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: ctestbb {dfv=of} $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x02,0xf6,0xc3,0x7b]
+         ctestbb {dfv=of} $123, %bl
+# CHECK: ctestbw {dfv=of} $1234, %dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x02,0xf7,0xc2,0xd2,0x04]
+         ctestbw {dfv=of} $1234, %dx
+# CHECK: ctestbl {dfv=of} $123456, %ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x02,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestbl {dfv=of} $123456, %ecx
+# CHECK: ctestbq {dfv=of} $123456, %r9
+# CHECK: encoding: [0x62,0xd4,0xc4,0x02,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestbq {dfv=of} $123456, %r9
+# CHECK: ctestbb {dfv=of} %bl, %dl
+# CHECK: encoding: [0x62,0xf4,0x44,0x02,0x84,0xda]
+         ctestbb {dfv=of} %bl, %dl
+# CHECK: ctestbw {dfv=of} %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x45,0x02,0x85,0xd0]
+         ctestbw {dfv=of} %dx, %ax
+# CHECK: ctestbl {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x02,0x85,0xca]
+         ctestbl {dfv=of} %ecx, %edx
+# CHECK: ctestbq {dfv=of} %r9, %r15
+# CHECK: encoding: [0x62,0x54,0xc4,0x02,0x85,0xcf]
+         ctestbq {dfv=of} %r9, %r15
+# CHECK: ctestbeb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x06,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestbeb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: ctestbew {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x06,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestbew {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: ctestbel {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x06,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestbel {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestbeq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xc4,0x06,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestbeq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestbeb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x06,0x84,0x5c,0x80,0x7b]
+         ctestbeb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: ctestbew {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x06,0x85,0x54,0x80,0x7b]
+         ctestbew {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: ctestbel {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x06,0x85,0x4c,0x80,0x7b]
+         ctestbel {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: ctestbeq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0xc4,0x06,0x85,0x4c,0x80,0x7b]
+         ctestbeq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: ctestbeb {dfv=of} $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x06,0xf6,0xc3,0x7b]
+         ctestbeb {dfv=of} $123, %bl
+# CHECK: ctestbew {dfv=of} $1234, %dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x06,0xf7,0xc2,0xd2,0x04]
+         ctestbew {dfv=of} $1234, %dx
+# CHECK: ctestbel {dfv=of} $123456, %ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x06,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestbel {dfv=of} $123456, %ecx
+# CHECK: ctestbeq {dfv=of} $123456, %r9
+# CHECK: encoding: [0x62,0xd4,0xc4,0x06,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestbeq {dfv=of} $123456, %r9
+# CHECK: ctestbeb {dfv=of} %bl, %dl
+# CHECK: encoding: [0x62,0xf4,0x44,0x06,0x84,0xda]
+         ctestbeb {dfv=of} %bl, %dl
+# CHECK: ctestbew {dfv=of} %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x45,0x06,0x85,0xd0]
+         ctestbew {dfv=of} %dx, %ax
+# CHECK: ctestbel {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x06,0x85,0xca]
+         ctestbel {dfv=of} %ecx, %edx
+# CHECK: ctestbeq {dfv=of} %r9, %r15
+# CHECK: encoding: [0x62,0x54,0xc4,0x06,0x85,0xcf]
+         ctestbeq {dfv=of} %r9, %r15
+# CHECK: ctestfb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x0b,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestfb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: ctestfw {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x0b,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestfw {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: ctestfl {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x0b,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestfl {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestfq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xc4,0x0b,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestfq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestfb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x0b,0x84,0x5c,0x80,0x7b]
+         ctestfb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: ctestfw {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x0b,0x85,0x54,0x80,0x7b]
+         ctestfw {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: ctestfl {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x0b,0x85,0x4c,0x80,0x7b]
+         ctestfl {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: ctestfq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0xc4,0x0b,0x85,0x4c,0x80,0x7b]
+         ctestfq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: ctestfb {dfv=of} $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x0b,0xf6,0xc3,0x7b]
+         ctestfb {dfv=of} $123, %bl
+# CHECK: ctestfw {dfv=of} $1234, %dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x0b,0xf7,0xc2,0xd2,0x04]
+         ctestfw {dfv=of} $1234, %dx
+# CHECK: ctestfl {dfv=of} $123456, %ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0b,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestfl {dfv=of} $123456, %ecx
+# CHECK: ctestfq {dfv=of} $123456, %r9
+# CHECK: encoding: [0x62,0xd4,0xc4,0x0b,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestfq {dfv=of} $123456, %r9
+# CHECK: ctestfb {dfv=of} %bl, %dl
+# CHECK: encoding: [0x62,0xf4,0x44,0x0b,0x84,0xda]
+         ctestfb {dfv=of} %bl, %dl
+# CHECK: ctestfw {dfv=of} %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x45,0x0b,0x85,0xd0]
+         ctestfw {dfv=of} %dx, %ax
+# CHECK: ctestfl {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0b,0x85,0xca]
+         ctestfl {dfv=of} %ecx, %edx
+# CHECK: ctestfq {dfv=of} %r9, %r15
+# CHECK: encoding: [0x62,0x54,0xc4,0x0b,0x85,0xcf]
+         ctestfq {dfv=of} %r9, %r15
+# CHECK: ctestlb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x0c,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestlb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: ctestlw {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x0c,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestlw {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: ctestll {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x0c,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestll {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestlq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xc4,0x0c,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestlq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestlb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x0c,0x84,0x5c,0x80,0x7b]
+         ctestlb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: ctestlw {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x0c,0x85,0x54,0x80,0x7b]
+         ctestlw {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: ctestll {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x0c,0x85,0x4c,0x80,0x7b]
+         ctestll {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: ctestlq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0xc4,0x0c,0x85,0x4c,0x80,0x7b]
+         ctestlq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: ctestlb {dfv=of} $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x0c,0xf6,0xc3,0x7b]
+         ctestlb {dfv=of} $123, %bl
+# CHECK: ctestlw {dfv=of} $1234, %dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x0c,0xf7,0xc2,0xd2,0x04]
+         ctestlw {dfv=of} $1234, %dx
+# CHECK: ctestll {dfv=of} $123456, %ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0c,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestll {dfv=of} $123456, %ecx
+# CHECK: ctestlq {dfv=of} $123456, %r9
+# CHECK: encoding: [0x62,0xd4,0xc4,0x0c,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestlq {dfv=of} $123456, %r9
+# CHECK: ctestlb {dfv=of} %bl, %dl
+# CHECK: encoding: [0x62,0xf4,0x44,0x0c,0x84,0xda]
+         ctestlb {dfv=of} %bl, %dl
+# CHECK: ctestlw {dfv=of} %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x45,0x0c,0x85,0xd0]
+         ctestlw {dfv=of} %dx, %ax
+# CHECK: ctestll {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0c,0x85,0xca]
+         ctestll {dfv=of} %ecx, %edx
+# CHECK: ctestlq {dfv=of} %r9, %r15
+# CHECK: encoding: [0x62,0x54,0xc4,0x0c,0x85,0xcf]
+         ctestlq {dfv=of} %r9, %r15
+# CHECK: ctestleb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x0e,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestleb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: ctestlew {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x0e,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestlew {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: ctestlel {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x0e,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestlel {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestleq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xc4,0x0e,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestleq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestleb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x0e,0x84,0x5c,0x80,0x7b]
+         ctestleb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: ctestlew {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x0e,0x85,0x54,0x80,0x7b]
+         ctestlew {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: ctestlel {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x0e,0x85,0x4c,0x80,0x7b]
+         ctestlel {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: ctestleq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0xc4,0x0e,0x85,0x4c,0x80,0x7b]
+         ctestleq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: ctestleb {dfv=of} $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x0e,0xf6,0xc3,0x7b]
+         ctestleb {dfv=of} $123, %bl
+# CHECK: ctestlew {dfv=of} $1234, %dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x0e,0xf7,0xc2,0xd2,0x04]
+         ctestlew {dfv=of} $1234, %dx
+# CHECK: ctestlel {dfv=of} $123456, %ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0e,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestlel {dfv=of} $123456, %ecx
+# CHECK: ctestleq {dfv=of} $123456, %r9
+# CHECK: encoding: [0x62,0xd4,0xc4,0x0e,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestleq {dfv=of} $123456, %r9
+# CHECK: ctestleb {dfv=of} %bl, %dl
+# CHECK: encoding: [0x62,0xf4,0x44,0x0e,0x84,0xda]
+         ctestleb {dfv=of} %bl, %dl
+# CHECK: ctestlew {dfv=of} %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x45,0x0e,0x85,0xd0]
+         ctestlew {dfv=of} %dx, %ax
+# CHECK: ctestlel {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0e,0x85,0xca]
+         ctestlel {dfv=of} %ecx, %edx
+# CHECK: ctestleq {dfv=of} %r9, %r15
+# CHECK: encoding: [0x62,0x54,0xc4,0x0e,0x85,0xcf]
+         ctestleq {dfv=of} %r9, %r15
+# CHECK: ctestaeb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x03,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestaeb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: ctestaew {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x03,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestaew {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: ctestael {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x03,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestael {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestaeq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xc4,0x03,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestaeq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestaeb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x03,0x84,0x5c,0x80,0x7b]
+         ctestaeb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: ctestaew {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x03,0x85,0x54,0x80,0x7b]
+         ctestaew {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: ctestael {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x03,0x85,0x4c,0x80,0x7b]
+         ctestael {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: ctestaeq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0xc4,0x03,0x85,0x4c,0x80,0x7b]
+         ctestaeq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: ctestaeb {dfv=of} $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x03,0xf6,0xc3,0x7b]
+         ctestaeb {dfv=of} $123, %bl
+# CHECK: ctestaew {dfv=of} $1234, %dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x03,0xf7,0xc2,0xd2,0x04]
+         ctestaew {dfv=of} $1234, %dx
+# CHECK: ctestael {dfv=of} $123456, %ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x03,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestael {dfv=of} $123456, %ecx
+# CHECK: ctestaeq {dfv=of} $123456, %r9
+# CHECK: encoding: [0x62,0xd4,0xc4,0x03,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestaeq {dfv=of} $123456, %r9
+# CHECK: ctestaeb {dfv=of} %bl, %dl
+# CHECK: encoding: [0x62,0xf4,0x44,0x03,0x84,0xda]
+         ctestaeb {dfv=of} %bl, %dl
+# CHECK: ctestaew {dfv=of} %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x45,0x03,0x85,0xd0]
+         ctestaew {dfv=of} %dx, %ax
+# CHECK: ctestael {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x03,0x85,0xca]
+         ctestael {dfv=of} %ecx, %edx
+# CHECK: ctestaeq {dfv=of} %r9, %r15
+# CHECK: encoding: [0x62,0x54,0xc4,0x03,0x85,0xcf]
+         ctestaeq {dfv=of} %r9, %r15
+# CHECK: ctestab {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x07,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestab {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: ctestaw {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x07,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestaw {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: ctestal {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x07,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestal {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestaq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xc4,0x07,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestaq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestab {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x07,0x84,0x5c,0x80,0x7b]
+         ctestab {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: ctestaw {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x07,0x85,0x54,0x80,0x7b]
+         ctestaw {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: ctestal {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x07,0x85,0x4c,0x80,0x7b]
+         ctestal {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: ctestaq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0xc4,0x07,0x85,0x4c,0x80,0x7b]
+         ctestaq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: ctestab {dfv=of} $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x07,0xf6,0xc3,0x7b]
+         ctestab {dfv=of} $123, %bl
+# CHECK: ctestaw {dfv=of} $1234, %dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x07,0xf7,0xc2,0xd2,0x04]
+         ctestaw {dfv=of} $1234, %dx
+# CHECK: ctestal {dfv=of} $123456, %ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x07,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestal {dfv=of} $123456, %ecx
+# CHECK: ctestaq {dfv=of} $123456, %r9
+# CHECK: encoding: [0x62,0xd4,0xc4,0x07,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestaq {dfv=of} $123456, %r9
+# CHECK: ctestab {dfv=of} %bl, %dl
+# CHECK: encoding: [0x62,0xf4,0x44,0x07,0x84,0xda]
+         ctestab {dfv=of} %bl, %dl
+# CHECK: ctestaw {dfv=of} %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x45,0x07,0x85,0xd0]
+         ctestaw {dfv=of} %dx, %ax
+# CHECK: ctestal {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x07,0x85,0xca]
+         ctestal {dfv=of} %ecx, %edx
+# CHECK: ctestaq {dfv=of} %r9, %r15
+# CHECK: encoding: [0x62,0x54,0xc4,0x07,0x85,0xcf]
+         ctestaq {dfv=of} %r9, %r15
+# CHECK: ctestgeb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x0d,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestgeb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: ctestgew {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x0d,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestgew {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: ctestgel {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x0d,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestgel {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestgeq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xc4,0x0d,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestgeq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestgeb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x0d,0x84,0x5c,0x80,0x7b]
+         ctestgeb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: ctestgew {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x0d,0x85,0x54,0x80,0x7b]
+         ctestgew {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: ctestgel {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x0d,0x85,0x4c,0x80,0x7b]
+         ctestgel {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: ctestgeq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0xc4,0x0d,0x85,0x4c,0x80,0x7b]
+         ctestgeq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: ctestgeb {dfv=of} $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x0d,0xf6,0xc3,0x7b]
+         ctestgeb {dfv=of} $123, %bl
+# CHECK: ctestgew {dfv=of} $1234, %dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x0d,0xf7,0xc2,0xd2,0x04]
+         ctestgew {dfv=of} $1234, %dx
+# CHECK: ctestgel {dfv=of} $123456, %ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0d,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestgel {dfv=of} $123456, %ecx
+# CHECK: ctestgeq {dfv=of} $123456, %r9
+# CHECK: encoding: [0x62,0xd4,0xc4,0x0d,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestgeq {dfv=of} $123456, %r9
+# CHECK: ctestgeb {dfv=of} %bl, %dl
+# CHECK: encoding: [0x62,0xf4,0x44,0x0d,0x84,0xda]
+         ctestgeb {dfv=of} %bl, %dl
+# CHECK: ctestgew {dfv=of} %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x45,0x0d,0x85,0xd0]
+         ctestgew {dfv=of} %dx, %ax
+# CHECK: ctestgel {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0d,0x85,0xca]
+         ctestgel {dfv=of} %ecx, %edx
+# CHECK: ctestgeq {dfv=of} %r9, %r15
+# CHECK: encoding: [0x62,0x54,0xc4,0x0d,0x85,0xcf]
+         ctestgeq {dfv=of} %r9, %r15
+# CHECK: ctestgb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x0f,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestgb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: ctestgw {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x0f,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestgw {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: ctestgl {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x0f,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestgl {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestgq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xc4,0x0f,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestgq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestgb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x0f,0x84,0x5c,0x80,0x7b]
+         ctestgb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: ctestgw {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x0f,0x85,0x54,0x80,0x7b]
+         ctestgw {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: ctestgl {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x0f,0x85,0x4c,0x80,0x7b]
+         ctestgl {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: ctestgq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0xc4,0x0f,0x85,0x4c,0x80,0x7b]
+         ctestgq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: ctestgb {dfv=of} $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x0f,0xf6,0xc3,0x7b]
+         ctestgb {dfv=of} $123, %bl
+# CHECK: ctestgw {dfv=of} $1234, %dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x0f,0xf7,0xc2,0xd2,0x04]
+         ctestgw {dfv=of} $1234, %dx
+# CHECK: ctestgl {dfv=of} $123456, %ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0f,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestgl {dfv=of} $123456, %ecx
+# CHECK: ctestgq {dfv=of} $123456, %r9
+# CHECK: encoding: [0x62,0xd4,0xc4,0x0f,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestgq {dfv=of} $123456, %r9
+# CHECK: ctestgb {dfv=of} %bl, %dl
+# CHECK: encoding: [0x62,0xf4,0x44,0x0f,0x84,0xda]
+         ctestgb {dfv=of} %bl, %dl
+# CHECK: ctestgw {dfv=of} %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x45,0x0f,0x85,0xd0]
+         ctestgw {dfv=of} %dx, %ax
+# CHECK: ctestgl {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0f,0x85,0xca]
+         ctestgl {dfv=of} %ecx, %edx
+# CHECK: ctestgq {dfv=of} %r9, %r15
+# CHECK: encoding: [0x62,0x54,0xc4,0x0f,0x85,0xcf]
+         ctestgq {dfv=of} %r9, %r15
+# CHECK: ctestnob {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x01,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestnob {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: ctestnow {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x01,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestnow {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: ctestnol {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x01,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestnol {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestnoq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xc4,0x01,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestnoq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestnob {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x01,0x84,0x5c,0x80,0x7b]
+         ctestnob {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: ctestnow {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x01,0x85,0x54,0x80,0x7b]
+         ctestnow {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: ctestnol {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x01,0x85,0x4c,0x80,0x7b]
+         ctestnol {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: ctestnoq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0xc4,0x01,0x85,0x4c,0x80,0x7b]
+         ctestnoq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: ctestnob {dfv=of} $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x01,0xf6,0xc3,0x7b]
+         ctestnob {dfv=of} $123, %bl
+# CHECK: ctestnow {dfv=of} $1234, %dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x01,0xf7,0xc2,0xd2,0x04]
+         ctestnow {dfv=of} $1234, %dx
+# CHECK: ctestnol {dfv=of} $123456, %ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x01,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestnol {dfv=of} $123456, %ecx
+# CHECK: ctestnoq {dfv=of} $123456, %r9
+# CHECK: encoding: [0x62,0xd4,0xc4,0x01,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestnoq {dfv=of} $123456, %r9
+# CHECK: ctestnob {dfv=of} %bl, %dl
+# CHECK: encoding: [0x62,0xf4,0x44,0x01,0x84,0xda]
+         ctestnob {dfv=of} %bl, %dl
+# CHECK: ctestnow {dfv=of} %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x45,0x01,0x85,0xd0]
+         ctestnow {dfv=of} %dx, %ax
+# CHECK: ctestnol {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x01,0x85,0xca]
+         ctestnol {dfv=of} %ecx, %edx
+# CHECK: ctestnoq {dfv=of} %r9, %r15
+# CHECK: encoding: [0x62,0x54,0xc4,0x01,0x85,0xcf]
+         ctestnoq {dfv=of} %r9, %r15
+# CHECK: ctestnsb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x09,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestnsb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: ctestnsw {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x09,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestnsw {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: ctestnsl {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x09,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestnsl {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestnsq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xc4,0x09,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestnsq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestnsb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x09,0x84,0x5c,0x80,0x7b]
+         ctestnsb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: ctestnsw {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x09,0x85,0x54,0x80,0x7b]
+         ctestnsw {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: ctestnsl {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x09,0x85,0x4c,0x80,0x7b]
+         ctestnsl {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: ctestnsq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0xc4,0x09,0x85,0x4c,0x80,0x7b]
+         ctestnsq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: ctestnsb {dfv=of} $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x09,0xf6,0xc3,0x7b]
+         ctestnsb {dfv=of} $123, %bl
+# CHECK: ctestnsw {dfv=of} $1234, %dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x09,0xf7,0xc2,0xd2,0x04]
+         ctestnsw {dfv=of} $1234, %dx
+# CHECK: ctestnsl {dfv=of} $123456, %ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x09,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestnsl {dfv=of} $123456, %ecx
+# CHECK: ctestnsq {dfv=of} $123456, %r9
+# CHECK: encoding: [0x62,0xd4,0xc4,0x09,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestnsq {dfv=of} $123456, %r9
+# CHECK: ctestnsb {dfv=of} %bl, %dl
+# CHECK: encoding: [0x62,0xf4,0x44,0x09,0x84,0xda]
+         ctestnsb {dfv=of} %bl, %dl
+# CHECK: ctestnsw {dfv=of} %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x45,0x09,0x85,0xd0]
+         ctestnsw {dfv=of} %dx, %ax
+# CHECK: ctestnsl {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x09,0x85,0xca]
+         ctestnsl {dfv=of} %ecx, %edx
+# CHECK: ctestnsq {dfv=of} %r9, %r15
+# CHECK: encoding: [0x62,0x54,0xc4,0x09,0x85,0xcf]
+         ctestnsq {dfv=of} %r9, %r15
+# CHECK: ctestneb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x05,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestneb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: ctestnew {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x05,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestnew {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: ctestnel {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x05,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestnel {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestneq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xc4,0x05,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestneq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestneb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x05,0x84,0x5c,0x80,0x7b]
+         ctestneb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: ctestnew {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x05,0x85,0x54,0x80,0x7b]
+         ctestnew {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: ctestnel {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x05,0x85,0x4c,0x80,0x7b]
+         ctestnel {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: ctestneq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0xc4,0x05,0x85,0x4c,0x80,0x7b]
+         ctestneq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: ctestneb {dfv=of} $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x05,0xf6,0xc3,0x7b]
+         ctestneb {dfv=of} $123, %bl
+# CHECK: ctestnew {dfv=of} $1234, %dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x05,0xf7,0xc2,0xd2,0x04]
+         ctestnew {dfv=of} $1234, %dx
+# CHECK: ctestnel {dfv=of} $123456, %ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x05,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestnel {dfv=of} $123456, %ecx
+# CHECK: ctestneq {dfv=of} $123456, %r9
+# CHECK: encoding: [0x62,0xd4,0xc4,0x05,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestneq {dfv=of} $123456, %r9
+# CHECK: ctestneb {dfv=of} %bl, %dl
+# CHECK: encoding: [0x62,0xf4,0x44,0x05,0x84,0xda]
+         ctestneb {dfv=of} %bl, %dl
+# CHECK: ctestnew {dfv=of} %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x45,0x05,0x85,0xd0]
+         ctestnew {dfv=of} %dx, %ax
+# CHECK: ctestnel {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x05,0x85,0xca]
+         ctestnel {dfv=of} %ecx, %edx
+# CHECK: ctestneq {dfv=of} %r9, %r15
+# CHECK: encoding: [0x62,0x54,0xc4,0x05,0x85,0xcf]
+         ctestneq {dfv=of} %r9, %r15
+# CHECK: ctestob {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x00,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestob {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: ctestow {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x00,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestow {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: ctestol {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x00,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestol {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestoq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xc4,0x00,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestoq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestob {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x00,0x84,0x5c,0x80,0x7b]
+         ctestob {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: ctestow {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x00,0x85,0x54,0x80,0x7b]
+         ctestow {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: ctestol {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x00,0x85,0x4c,0x80,0x7b]
+         ctestol {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: ctestoq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0xc4,0x00,0x85,0x4c,0x80,0x7b]
+         ctestoq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: ctestob {dfv=of} $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x00,0xf6,0xc3,0x7b]
+         ctestob {dfv=of} $123, %bl
+# CHECK: ctestow {dfv=of} $1234, %dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x00,0xf7,0xc2,0xd2,0x04]
+         ctestow {dfv=of} $1234, %dx
+# CHECK: ctestol {dfv=of} $123456, %ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x00,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestol {dfv=of} $123456, %ecx
+# CHECK: ctestoq {dfv=of} $123456, %r9
+# CHECK: encoding: [0x62,0xd4,0xc4,0x00,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestoq {dfv=of} $123456, %r9
+# CHECK: ctestob {dfv=of} %bl, %dl
+# CHECK: encoding: [0x62,0xf4,0x44,0x00,0x84,0xda]
+         ctestob {dfv=of} %bl, %dl
+# CHECK: ctestow {dfv=of} %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x45,0x00,0x85,0xd0]
+         ctestow {dfv=of} %dx, %ax
+# CHECK: ctestol {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x00,0x85,0xca]
+         ctestol {dfv=of} %ecx, %edx
+# CHECK: ctestoq {dfv=of} %r9, %r15
+# CHECK: encoding: [0x62,0x54,0xc4,0x00,0x85,0xcf]
+         ctestoq {dfv=of} %r9, %r15
+# CHECK: ctestsb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x08,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestsb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: ctestsw {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x08,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestsw {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: ctestsl {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x08,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestsl {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestsq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xc4,0x08,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestsq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctestsb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x08,0x84,0x5c,0x80,0x7b]
+         ctestsb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: ctestsw {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x08,0x85,0x54,0x80,0x7b]
+         ctestsw {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: ctestsl {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x08,0x85,0x4c,0x80,0x7b]
+         ctestsl {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: ctestsq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0xc4,0x08,0x85,0x4c,0x80,0x7b]
+         ctestsq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: ctestsb {dfv=of} $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x08,0xf6,0xc3,0x7b]
+         ctestsb {dfv=of} $123, %bl
+# CHECK: ctestsw {dfv=of} $1234, %dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x08,0xf7,0xc2,0xd2,0x04]
+         ctestsw {dfv=of} $1234, %dx
+# CHECK: ctestsl {dfv=of} $123456, %ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x08,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestsl {dfv=of} $123456, %ecx
+# CHECK: ctestsq {dfv=of} $123456, %r9
+# CHECK: encoding: [0x62,0xd4,0xc4,0x08,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestsq {dfv=of} $123456, %r9
+# CHECK: ctestsb {dfv=of} %bl, %dl
+# CHECK: encoding: [0x62,0xf4,0x44,0x08,0x84,0xda]
+         ctestsb {dfv=of} %bl, %dl
+# CHECK: ctestsw {dfv=of} %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x45,0x08,0x85,0xd0]
+         ctestsw {dfv=of} %dx, %ax
+# CHECK: ctestsl {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x08,0x85,0xca]
+         ctestsl {dfv=of} %ecx, %edx
+# CHECK: ctestsq {dfv=of} %r9, %r15
+# CHECK: encoding: [0x62,0x54,0xc4,0x08,0x85,0xcf]
+         ctestsq {dfv=of} %r9, %r15
+# CHECK: ctesttb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x0a,0xf6,0x44,0x80,0x7b,0x7b]
+         ctesttb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: ctesttw {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x0a,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctesttw {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: ctesttl {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x0a,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctesttl {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctesttq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xc4,0x0a,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctesttq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctesttb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x0a,0x84,0x5c,0x80,0x7b]
+         ctesttb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: ctesttw {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x0a,0x85,0x54,0x80,0x7b]
+         ctesttw {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: ctesttl {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x0a,0x85,0x4c,0x80,0x7b]
+         ctesttl {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: ctesttq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0xc4,0x0a,0x85,0x4c,0x80,0x7b]
+         ctesttq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: ctesttb {dfv=of} $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x0a,0xf6,0xc3,0x7b]
+         ctesttb {dfv=of} $123, %bl
+# CHECK: ctesttw {dfv=of} $1234, %dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x0a,0xf7,0xc2,0xd2,0x04]
+         ctesttw {dfv=of} $1234, %dx
+# CHECK: ctesttl {dfv=of} $123456, %ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0a,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctesttl {dfv=of} $123456, %ecx
+# CHECK: ctesttq {dfv=of} $123456, %r9
+# CHECK: encoding: [0x62,0xd4,0xc4,0x0a,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctesttq {dfv=of} $123456, %r9
+# CHECK: ctesttb {dfv=of} %bl, %dl
+# CHECK: encoding: [0x62,0xf4,0x44,0x0a,0x84,0xda]
+         ctesttb {dfv=of} %bl, %dl
+# CHECK: ctesttw {dfv=of} %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x45,0x0a,0x85,0xd0]
+         ctesttw {dfv=of} %dx, %ax
+# CHECK: ctesttl {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0a,0x85,0xca]
+         ctesttl {dfv=of} %ecx, %edx
+# CHECK: ctesttq {dfv=of} %r9, %r15
+# CHECK: encoding: [0x62,0x54,0xc4,0x0a,0x85,0xcf]
+         ctesttq {dfv=of} %r9, %r15
+# CHECK: ctesteb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x04,0xf6,0x44,0x80,0x7b,0x7b]
+         ctesteb {dfv=of} $123, 123(%r8,%rax,4)
+# CHECK: ctestew {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x04,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestew {dfv=of} $1234, 123(%r8,%rax,4)
+# CHECK: ctestel {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x04,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestel {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctesteq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xc4,0x04,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctesteq {dfv=of} $123456, 123(%r8,%rax,4)
+# CHECK: ctesteb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x04,0x84,0x5c,0x80,0x7b]
+         ctesteb {dfv=of} %bl, 123(%r8,%rax,4)
+# CHECK: ctestew {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x45,0x04,0x85,0x54,0x80,0x7b]
+         ctestew {dfv=of} %dx, 123(%r8,%rax,4)
+# CHECK: ctestel {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x44,0x04,0x85,0x4c,0x80,0x7b]
+         ctestel {dfv=of} %ecx, 123(%r8,%rax,4)
+# CHECK: ctesteq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0xc4,0x04,0x85,0x4c,0x80,0x7b]
+         ctesteq {dfv=of} %r9, 123(%r8,%rax,4)
+# CHECK: ctesteb {dfv=of} $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x04,0xf6,0xc3,0x7b]
+         ctesteb {dfv=of} $123, %bl
+# CHECK: ctestew {dfv=of} $1234, %dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x04,0xf7,0xc2,0xd2,0x04]
+         ctestew {dfv=of} $1234, %dx
+# CHECK: ctestel {dfv=of} $123456, %ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x04,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestel {dfv=of} $123456, %ecx
+# CHECK: ctesteq {dfv=of} $123456, %r9
+# CHECK: encoding: [0x62,0xd4,0xc4,0x04,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctesteq {dfv=of} $123456, %r9
+# CHECK: ctesteb {dfv=of} %bl, %dl
+# CHECK: encoding: [0x62,0xf4,0x44,0x04,0x84,0xda]
+         ctesteb {dfv=of} %bl, %dl
+# CHECK: ctestew {dfv=of} %dx, %ax
+# CHECK: encoding: [0x62,0xf4,0x45,0x04,0x85,0xd0]
+         ctestew {dfv=of} %dx, %ax
+# CHECK: ctestel {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x04,0x85,0xca]
+         ctestel {dfv=of} %ecx, %edx
+# CHECK: ctesteq {dfv=of} %r9, %r15
+# CHECK: encoding: [0x62,0x54,0xc4,0x04,0x85,0xcf]
+         ctesteq {dfv=of} %r9, %r15
diff --git a/llvm/test/MC/X86/apx/ctest-intel.s b/llvm/test/MC/X86/apx/ctest-intel.s
new file mode 100644
index 0000000000000..17cea489b4765
--- /dev/null
+++ b/llvm/test/MC/X86/apx/ctest-intel.s
@@ -0,0 +1,770 @@
+# RUN: llvm-mc -triple x86_64 -show-encoding -x86-asm-syntax=intel -output-asm-variant=1 %s | FileCheck %s
+
+# CHECK: ctestb {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: encoding: [0x62,0xd4,0x44,0x02,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestb {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: ctestb {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: encoding: [0x62,0xd4,0x45,0x02,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestb {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: ctestb {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x02,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestb {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestb {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0x44,0x02,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestb {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestb {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: encoding: [0x62,0xd4,0x44,0x02,0x84,0x5c,0x80,0x7b]
+         ctestb {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: ctestb {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: encoding: [0x62,0xd4,0x45,0x02,0x85,0x54,0x80,0x7b]
+         ctestb {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: ctestb {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: encoding: [0x62,0xd4,0x44,0x02,0x85,0x4c,0x80,0x7b]
+         ctestb {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: ctestb {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x02,0x85,0x4c,0x80,0x7b]
+         ctestb {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: ctestb {dfv=of} bl, 123
+# CHECK: encoding: [0x62,0xf4,0x44,0x02,0xf6,0xc3,0x7b]
+         ctestb {dfv=of} bl, 123
+# CHECK: ctestb {dfv=of} dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x45,0x02,0xf7,0xc2,0xd2,0x04]
+         ctestb {dfv=of} dx, 1234
+# CHECK: ctestb {dfv=of} ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x44,0x02,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestb {dfv=of} ecx, 123456
+# CHECK: ctestb {dfv=of} r9, 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x02,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestb {dfv=of} r9, 123456
+# CHECK: ctestb {dfv=of} dl, bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x02,0x84,0xda]
+         ctestb {dfv=of} dl, bl
+# CHECK: ctestb {dfv=of} ax, dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x02,0x85,0xd0]
+         ctestb {dfv=of} ax, dx
+# CHECK: ctestb {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x02,0x85,0xca]
+         ctestb {dfv=of} edx, ecx
+# CHECK: ctestb {dfv=of} r15, r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x02,0x85,0xcf]
+         ctestb {dfv=of} r15, r9
+# CHECK: ctestbe {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: encoding: [0x62,0xd4,0x44,0x06,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestbe {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: ctestbe {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: encoding: [0x62,0xd4,0x45,0x06,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestbe {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: ctestbe {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x06,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestbe {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestbe {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0x44,0x06,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestbe {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestbe {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: encoding: [0x62,0xd4,0x44,0x06,0x84,0x5c,0x80,0x7b]
+         ctestbe {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: ctestbe {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: encoding: [0x62,0xd4,0x45,0x06,0x85,0x54,0x80,0x7b]
+         ctestbe {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: ctestbe {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: encoding: [0x62,0xd4,0x44,0x06,0x85,0x4c,0x80,0x7b]
+         ctestbe {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: ctestbe {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x06,0x85,0x4c,0x80,0x7b]
+         ctestbe {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: ctestbe {dfv=of} bl, 123
+# CHECK: encoding: [0x62,0xf4,0x44,0x06,0xf6,0xc3,0x7b]
+         ctestbe {dfv=of} bl, 123
+# CHECK: ctestbe {dfv=of} dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x45,0x06,0xf7,0xc2,0xd2,0x04]
+         ctestbe {dfv=of} dx, 1234
+# CHECK: ctestbe {dfv=of} ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x44,0x06,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestbe {dfv=of} ecx, 123456
+# CHECK: ctestbe {dfv=of} r9, 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x06,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestbe {dfv=of} r9, 123456
+# CHECK: ctestbe {dfv=of} dl, bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x06,0x84,0xda]
+         ctestbe {dfv=of} dl, bl
+# CHECK: ctestbe {dfv=of} ax, dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x06,0x85,0xd0]
+         ctestbe {dfv=of} ax, dx
+# CHECK: ctestbe {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x06,0x85,0xca]
+         ctestbe {dfv=of} edx, ecx
+# CHECK: ctestbe {dfv=of} r15, r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x06,0x85,0xcf]
+         ctestbe {dfv=of} r15, r9
+# CHECK: ctestf {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: encoding: [0x62,0xd4,0x44,0x0b,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestf {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: ctestf {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: encoding: [0x62,0xd4,0x45,0x0b,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestf {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: ctestf {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x0b,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestf {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestf {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0x44,0x0b,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestf {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestf {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: encoding: [0x62,0xd4,0x44,0x0b,0x84,0x5c,0x80,0x7b]
+         ctestf {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: ctestf {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: encoding: [0x62,0xd4,0x45,0x0b,0x85,0x54,0x80,0x7b]
+         ctestf {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: ctestf {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: encoding: [0x62,0xd4,0x44,0x0b,0x85,0x4c,0x80,0x7b]
+         ctestf {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: ctestf {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x0b,0x85,0x4c,0x80,0x7b]
+         ctestf {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: ctestf {dfv=of} bl, 123
+# CHECK: encoding: [0x62,0xf4,0x44,0x0b,0xf6,0xc3,0x7b]
+         ctestf {dfv=of} bl, 123
+# CHECK: ctestf {dfv=of} dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x45,0x0b,0xf7,0xc2,0xd2,0x04]
+         ctestf {dfv=of} dx, 1234
+# CHECK: ctestf {dfv=of} ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x44,0x0b,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestf {dfv=of} ecx, 123456
+# CHECK: ctestf {dfv=of} r9, 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x0b,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestf {dfv=of} r9, 123456
+# CHECK: ctestf {dfv=of} dl, bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x0b,0x84,0xda]
+         ctestf {dfv=of} dl, bl
+# CHECK: ctestf {dfv=of} ax, dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x0b,0x85,0xd0]
+         ctestf {dfv=of} ax, dx
+# CHECK: ctestf {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0b,0x85,0xca]
+         ctestf {dfv=of} edx, ecx
+# CHECK: ctestf {dfv=of} r15, r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x0b,0x85,0xcf]
+         ctestf {dfv=of} r15, r9
+# CHECK: ctestl {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: encoding: [0x62,0xd4,0x44,0x0c,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestl {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: ctestl {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: encoding: [0x62,0xd4,0x45,0x0c,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestl {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: ctestl {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x0c,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestl {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestl {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0x44,0x0c,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestl {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestl {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: encoding: [0x62,0xd4,0x44,0x0c,0x84,0x5c,0x80,0x7b]
+         ctestl {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: ctestl {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: encoding: [0x62,0xd4,0x45,0x0c,0x85,0x54,0x80,0x7b]
+         ctestl {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: ctestl {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: encoding: [0x62,0xd4,0x44,0x0c,0x85,0x4c,0x80,0x7b]
+         ctestl {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: ctestl {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x0c,0x85,0x4c,0x80,0x7b]
+         ctestl {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: ctestl {dfv=of} bl, 123
+# CHECK: encoding: [0x62,0xf4,0x44,0x0c,0xf6,0xc3,0x7b]
+         ctestl {dfv=of} bl, 123
+# CHECK: ctestl {dfv=of} dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x45,0x0c,0xf7,0xc2,0xd2,0x04]
+         ctestl {dfv=of} dx, 1234
+# CHECK: ctestl {dfv=of} ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x44,0x0c,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestl {dfv=of} ecx, 123456
+# CHECK: ctestl {dfv=of} r9, 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x0c,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestl {dfv=of} r9, 123456
+# CHECK: ctestl {dfv=of} dl, bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x0c,0x84,0xda]
+         ctestl {dfv=of} dl, bl
+# CHECK: ctestl {dfv=of} ax, dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x0c,0x85,0xd0]
+         ctestl {dfv=of} ax, dx
+# CHECK: ctestl {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0c,0x85,0xca]
+         ctestl {dfv=of} edx, ecx
+# CHECK: ctestl {dfv=of} r15, r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x0c,0x85,0xcf]
+         ctestl {dfv=of} r15, r9
+# CHECK: ctestle {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: encoding: [0x62,0xd4,0x44,0x0e,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestle {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: ctestle {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: encoding: [0x62,0xd4,0x45,0x0e,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestle {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: ctestle {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x0e,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestle {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestle {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0x44,0x0e,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestle {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestle {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: encoding: [0x62,0xd4,0x44,0x0e,0x84,0x5c,0x80,0x7b]
+         ctestle {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: ctestle {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: encoding: [0x62,0xd4,0x45,0x0e,0x85,0x54,0x80,0x7b]
+         ctestle {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: ctestle {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: encoding: [0x62,0xd4,0x44,0x0e,0x85,0x4c,0x80,0x7b]
+         ctestle {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: ctestle {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x0e,0x85,0x4c,0x80,0x7b]
+         ctestle {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: ctestle {dfv=of} bl, 123
+# CHECK: encoding: [0x62,0xf4,0x44,0x0e,0xf6,0xc3,0x7b]
+         ctestle {dfv=of} bl, 123
+# CHECK: ctestle {dfv=of} dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x45,0x0e,0xf7,0xc2,0xd2,0x04]
+         ctestle {dfv=of} dx, 1234
+# CHECK: ctestle {dfv=of} ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x44,0x0e,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestle {dfv=of} ecx, 123456
+# CHECK: ctestle {dfv=of} r9, 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x0e,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestle {dfv=of} r9, 123456
+# CHECK: ctestle {dfv=of} dl, bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x0e,0x84,0xda]
+         ctestle {dfv=of} dl, bl
+# CHECK: ctestle {dfv=of} ax, dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x0e,0x85,0xd0]
+         ctestle {dfv=of} ax, dx
+# CHECK: ctestle {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0e,0x85,0xca]
+         ctestle {dfv=of} edx, ecx
+# CHECK: ctestle {dfv=of} r15, r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x0e,0x85,0xcf]
+         ctestle {dfv=of} r15, r9
+# CHECK: ctestae {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: encoding: [0x62,0xd4,0x44,0x03,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestae {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: ctestae {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: encoding: [0x62,0xd4,0x45,0x03,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestae {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: ctestae {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x03,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestae {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestae {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0x44,0x03,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestae {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestae {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: encoding: [0x62,0xd4,0x44,0x03,0x84,0x5c,0x80,0x7b]
+         ctestae {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: ctestae {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: encoding: [0x62,0xd4,0x45,0x03,0x85,0x54,0x80,0x7b]
+         ctestae {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: ctestae {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: encoding: [0x62,0xd4,0x44,0x03,0x85,0x4c,0x80,0x7b]
+         ctestae {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: ctestae {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x03,0x85,0x4c,0x80,0x7b]
+         ctestae {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: ctestae {dfv=of} bl, 123
+# CHECK: encoding: [0x62,0xf4,0x44,0x03,0xf6,0xc3,0x7b]
+         ctestae {dfv=of} bl, 123
+# CHECK: ctestae {dfv=of} dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x45,0x03,0xf7,0xc2,0xd2,0x04]
+         ctestae {dfv=of} dx, 1234
+# CHECK: ctestae {dfv=of} ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x44,0x03,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestae {dfv=of} ecx, 123456
+# CHECK: ctestae {dfv=of} r9, 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x03,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestae {dfv=of} r9, 123456
+# CHECK: ctestae {dfv=of} dl, bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x03,0x84,0xda]
+         ctestae {dfv=of} dl, bl
+# CHECK: ctestae {dfv=of} ax, dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x03,0x85,0xd0]
+         ctestae {dfv=of} ax, dx
+# CHECK: ctestae {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x03,0x85,0xca]
+         ctestae {dfv=of} edx, ecx
+# CHECK: ctestae {dfv=of} r15, r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x03,0x85,0xcf]
+         ctestae {dfv=of} r15, r9
+# CHECK: ctesta {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: encoding: [0x62,0xd4,0x44,0x07,0xf6,0x44,0x80,0x7b,0x7b]
+         ctesta {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: ctesta {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: encoding: [0x62,0xd4,0x45,0x07,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctesta {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: ctesta {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x07,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctesta {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctesta {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0x44,0x07,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctesta {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctesta {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: encoding: [0x62,0xd4,0x44,0x07,0x84,0x5c,0x80,0x7b]
+         ctesta {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: ctesta {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: encoding: [0x62,0xd4,0x45,0x07,0x85,0x54,0x80,0x7b]
+         ctesta {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: ctesta {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: encoding: [0x62,0xd4,0x44,0x07,0x85,0x4c,0x80,0x7b]
+         ctesta {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: ctesta {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x07,0x85,0x4c,0x80,0x7b]
+         ctesta {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: ctesta {dfv=of} bl, 123
+# CHECK: encoding: [0x62,0xf4,0x44,0x07,0xf6,0xc3,0x7b]
+         ctesta {dfv=of} bl, 123
+# CHECK: ctesta {dfv=of} dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x45,0x07,0xf7,0xc2,0xd2,0x04]
+         ctesta {dfv=of} dx, 1234
+# CHECK: ctesta {dfv=of} ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x44,0x07,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctesta {dfv=of} ecx, 123456
+# CHECK: ctesta {dfv=of} r9, 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x07,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctesta {dfv=of} r9, 123456
+# CHECK: ctesta {dfv=of} dl, bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x07,0x84,0xda]
+         ctesta {dfv=of} dl, bl
+# CHECK: ctesta {dfv=of} ax, dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x07,0x85,0xd0]
+         ctesta {dfv=of} ax, dx
+# CHECK: ctesta {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x07,0x85,0xca]
+         ctesta {dfv=of} edx, ecx
+# CHECK: ctesta {dfv=of} r15, r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x07,0x85,0xcf]
+         ctesta {dfv=of} r15, r9
+# CHECK: ctestge {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: encoding: [0x62,0xd4,0x44,0x0d,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestge {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: ctestge {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: encoding: [0x62,0xd4,0x45,0x0d,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestge {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: ctestge {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x0d,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestge {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestge {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0x44,0x0d,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestge {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestge {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: encoding: [0x62,0xd4,0x44,0x0d,0x84,0x5c,0x80,0x7b]
+         ctestge {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: ctestge {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: encoding: [0x62,0xd4,0x45,0x0d,0x85,0x54,0x80,0x7b]
+         ctestge {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: ctestge {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: encoding: [0x62,0xd4,0x44,0x0d,0x85,0x4c,0x80,0x7b]
+         ctestge {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: ctestge {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x0d,0x85,0x4c,0x80,0x7b]
+         ctestge {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: ctestge {dfv=of} bl, 123
+# CHECK: encoding: [0x62,0xf4,0x44,0x0d,0xf6,0xc3,0x7b]
+         ctestge {dfv=of} bl, 123
+# CHECK: ctestge {dfv=of} dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x45,0x0d,0xf7,0xc2,0xd2,0x04]
+         ctestge {dfv=of} dx, 1234
+# CHECK: ctestge {dfv=of} ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x44,0x0d,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestge {dfv=of} ecx, 123456
+# CHECK: ctestge {dfv=of} r9, 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x0d,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestge {dfv=of} r9, 123456
+# CHECK: ctestge {dfv=of} dl, bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x0d,0x84,0xda]
+         ctestge {dfv=of} dl, bl
+# CHECK: ctestge {dfv=of} ax, dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x0d,0x85,0xd0]
+         ctestge {dfv=of} ax, dx
+# CHECK: ctestge {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0d,0x85,0xca]
+         ctestge {dfv=of} edx, ecx
+# CHECK: ctestge {dfv=of} r15, r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x0d,0x85,0xcf]
+         ctestge {dfv=of} r15, r9
+# CHECK: ctestg {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: encoding: [0x62,0xd4,0x44,0x0f,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestg {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: ctestg {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: encoding: [0x62,0xd4,0x45,0x0f,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestg {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: ctestg {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x0f,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestg {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestg {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0x44,0x0f,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestg {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestg {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: encoding: [0x62,0xd4,0x44,0x0f,0x84,0x5c,0x80,0x7b]
+         ctestg {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: ctestg {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: encoding: [0x62,0xd4,0x45,0x0f,0x85,0x54,0x80,0x7b]
+         ctestg {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: ctestg {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: encoding: [0x62,0xd4,0x44,0x0f,0x85,0x4c,0x80,0x7b]
+         ctestg {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: ctestg {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x0f,0x85,0x4c,0x80,0x7b]
+         ctestg {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: ctestg {dfv=of} bl, 123
+# CHECK: encoding: [0x62,0xf4,0x44,0x0f,0xf6,0xc3,0x7b]
+         ctestg {dfv=of} bl, 123
+# CHECK: ctestg {dfv=of} dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x45,0x0f,0xf7,0xc2,0xd2,0x04]
+         ctestg {dfv=of} dx, 1234
+# CHECK: ctestg {dfv=of} ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x44,0x0f,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestg {dfv=of} ecx, 123456
+# CHECK: ctestg {dfv=of} r9, 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x0f,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestg {dfv=of} r9, 123456
+# CHECK: ctestg {dfv=of} dl, bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x0f,0x84,0xda]
+         ctestg {dfv=of} dl, bl
+# CHECK: ctestg {dfv=of} ax, dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x0f,0x85,0xd0]
+         ctestg {dfv=of} ax, dx
+# CHECK: ctestg {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0f,0x85,0xca]
+         ctestg {dfv=of} edx, ecx
+# CHECK: ctestg {dfv=of} r15, r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x0f,0x85,0xcf]
+         ctestg {dfv=of} r15, r9
+# CHECK: ctestno {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: encoding: [0x62,0xd4,0x44,0x01,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestno {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: ctestno {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: encoding: [0x62,0xd4,0x45,0x01,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestno {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: ctestno {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x01,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestno {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestno {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0x44,0x01,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestno {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestno {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: encoding: [0x62,0xd4,0x44,0x01,0x84,0x5c,0x80,0x7b]
+         ctestno {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: ctestno {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: encoding: [0x62,0xd4,0x45,0x01,0x85,0x54,0x80,0x7b]
+         ctestno {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: ctestno {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: encoding: [0x62,0xd4,0x44,0x01,0x85,0x4c,0x80,0x7b]
+         ctestno {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: ctestno {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x01,0x85,0x4c,0x80,0x7b]
+         ctestno {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: ctestno {dfv=of} bl, 123
+# CHECK: encoding: [0x62,0xf4,0x44,0x01,0xf6,0xc3,0x7b]
+         ctestno {dfv=of} bl, 123
+# CHECK: ctestno {dfv=of} dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x45,0x01,0xf7,0xc2,0xd2,0x04]
+         ctestno {dfv=of} dx, 1234
+# CHECK: ctestno {dfv=of} ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x44,0x01,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestno {dfv=of} ecx, 123456
+# CHECK: ctestno {dfv=of} r9, 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x01,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestno {dfv=of} r9, 123456
+# CHECK: ctestno {dfv=of} dl, bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x01,0x84,0xda]
+         ctestno {dfv=of} dl, bl
+# CHECK: ctestno {dfv=of} ax, dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x01,0x85,0xd0]
+         ctestno {dfv=of} ax, dx
+# CHECK: ctestno {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x01,0x85,0xca]
+         ctestno {dfv=of} edx, ecx
+# CHECK: ctestno {dfv=of} r15, r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x01,0x85,0xcf]
+         ctestno {dfv=of} r15, r9
+# CHECK: ctestns {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: encoding: [0x62,0xd4,0x44,0x09,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestns {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: ctestns {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: encoding: [0x62,0xd4,0x45,0x09,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestns {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: ctestns {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x09,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestns {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestns {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0x44,0x09,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestns {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestns {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: encoding: [0x62,0xd4,0x44,0x09,0x84,0x5c,0x80,0x7b]
+         ctestns {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: ctestns {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: encoding: [0x62,0xd4,0x45,0x09,0x85,0x54,0x80,0x7b]
+         ctestns {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: ctestns {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: encoding: [0x62,0xd4,0x44,0x09,0x85,0x4c,0x80,0x7b]
+         ctestns {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: ctestns {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x09,0x85,0x4c,0x80,0x7b]
+         ctestns {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: ctestns {dfv=of} bl, 123
+# CHECK: encoding: [0x62,0xf4,0x44,0x09,0xf6,0xc3,0x7b]
+         ctestns {dfv=of} bl, 123
+# CHECK: ctestns {dfv=of} dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x45,0x09,0xf7,0xc2,0xd2,0x04]
+         ctestns {dfv=of} dx, 1234
+# CHECK: ctestns {dfv=of} ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x44,0x09,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestns {dfv=of} ecx, 123456
+# CHECK: ctestns {dfv=of} r9, 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x09,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestns {dfv=of} r9, 123456
+# CHECK: ctestns {dfv=of} dl, bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x09,0x84,0xda]
+         ctestns {dfv=of} dl, bl
+# CHECK: ctestns {dfv=of} ax, dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x09,0x85,0xd0]
+         ctestns {dfv=of} ax, dx
+# CHECK: ctestns {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x09,0x85,0xca]
+         ctestns {dfv=of} edx, ecx
+# CHECK: ctestns {dfv=of} r15, r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x09,0x85,0xcf]
+         ctestns {dfv=of} r15, r9
+# CHECK: ctestne {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: encoding: [0x62,0xd4,0x44,0x05,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestne {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: ctestne {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: encoding: [0x62,0xd4,0x45,0x05,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestne {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: ctestne {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x05,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestne {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestne {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0x44,0x05,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestne {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestne {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: encoding: [0x62,0xd4,0x44,0x05,0x84,0x5c,0x80,0x7b]
+         ctestne {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: ctestne {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: encoding: [0x62,0xd4,0x45,0x05,0x85,0x54,0x80,0x7b]
+         ctestne {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: ctestne {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: encoding: [0x62,0xd4,0x44,0x05,0x85,0x4c,0x80,0x7b]
+         ctestne {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: ctestne {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x05,0x85,0x4c,0x80,0x7b]
+         ctestne {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: ctestne {dfv=of} bl, 123
+# CHECK: encoding: [0x62,0xf4,0x44,0x05,0xf6,0xc3,0x7b]
+         ctestne {dfv=of} bl, 123
+# CHECK: ctestne {dfv=of} dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x45,0x05,0xf7,0xc2,0xd2,0x04]
+         ctestne {dfv=of} dx, 1234
+# CHECK: ctestne {dfv=of} ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x44,0x05,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestne {dfv=of} ecx, 123456
+# CHECK: ctestne {dfv=of} r9, 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x05,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestne {dfv=of} r9, 123456
+# CHECK: ctestne {dfv=of} dl, bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x05,0x84,0xda]
+         ctestne {dfv=of} dl, bl
+# CHECK: ctestne {dfv=of} ax, dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x05,0x85,0xd0]
+         ctestne {dfv=of} ax, dx
+# CHECK: ctestne {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x05,0x85,0xca]
+         ctestne {dfv=of} edx, ecx
+# CHECK: ctestne {dfv=of} r15, r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x05,0x85,0xcf]
+         ctestne {dfv=of} r15, r9
+# CHECK: ctesto {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: encoding: [0x62,0xd4,0x44,0x00,0xf6,0x44,0x80,0x7b,0x7b]
+         ctesto {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: ctesto {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: encoding: [0x62,0xd4,0x45,0x00,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctesto {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: ctesto {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x00,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctesto {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctesto {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0x44,0x00,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctesto {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctesto {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: encoding: [0x62,0xd4,0x44,0x00,0x84,0x5c,0x80,0x7b]
+         ctesto {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: ctesto {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: encoding: [0x62,0xd4,0x45,0x00,0x85,0x54,0x80,0x7b]
+         ctesto {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: ctesto {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: encoding: [0x62,0xd4,0x44,0x00,0x85,0x4c,0x80,0x7b]
+         ctesto {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: ctesto {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x00,0x85,0x4c,0x80,0x7b]
+         ctesto {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: ctesto {dfv=of} bl, 123
+# CHECK: encoding: [0x62,0xf4,0x44,0x00,0xf6,0xc3,0x7b]
+         ctesto {dfv=of} bl, 123
+# CHECK: ctesto {dfv=of} dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x45,0x00,0xf7,0xc2,0xd2,0x04]
+         ctesto {dfv=of} dx, 1234
+# CHECK: ctesto {dfv=of} ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x44,0x00,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctesto {dfv=of} ecx, 123456
+# CHECK: ctesto {dfv=of} r9, 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x00,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctesto {dfv=of} r9, 123456
+# CHECK: ctesto {dfv=of} dl, bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x00,0x84,0xda]
+         ctesto {dfv=of} dl, bl
+# CHECK: ctesto {dfv=of} ax, dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x00,0x85,0xd0]
+         ctesto {dfv=of} ax, dx
+# CHECK: ctesto {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x00,0x85,0xca]
+         ctesto {dfv=of} edx, ecx
+# CHECK: ctesto {dfv=of} r15, r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x00,0x85,0xcf]
+         ctesto {dfv=of} r15, r9
+# CHECK: ctests {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: encoding: [0x62,0xd4,0x44,0x08,0xf6,0x44,0x80,0x7b,0x7b]
+         ctests {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: ctests {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: encoding: [0x62,0xd4,0x45,0x08,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctests {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: ctests {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x08,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctests {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctests {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0x44,0x08,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctests {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctests {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: encoding: [0x62,0xd4,0x44,0x08,0x84,0x5c,0x80,0x7b]
+         ctests {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: ctests {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: encoding: [0x62,0xd4,0x45,0x08,0x85,0x54,0x80,0x7b]
+         ctests {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: ctests {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: encoding: [0x62,0xd4,0x44,0x08,0x85,0x4c,0x80,0x7b]
+         ctests {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: ctests {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x08,0x85,0x4c,0x80,0x7b]
+         ctests {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: ctests {dfv=of} bl, 123
+# CHECK: encoding: [0x62,0xf4,0x44,0x08,0xf6,0xc3,0x7b]
+         ctests {dfv=of} bl, 123
+# CHECK: ctests {dfv=of} dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x45,0x08,0xf7,0xc2,0xd2,0x04]
+         ctests {dfv=of} dx, 1234
+# CHECK: ctests {dfv=of} ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x44,0x08,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctests {dfv=of} ecx, 123456
+# CHECK: ctests {dfv=of} r9, 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x08,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctests {dfv=of} r9, 123456
+# CHECK: ctests {dfv=of} dl, bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x08,0x84,0xda]
+         ctests {dfv=of} dl, bl
+# CHECK: ctests {dfv=of} ax, dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x08,0x85,0xd0]
+         ctests {dfv=of} ax, dx
+# CHECK: ctests {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x08,0x85,0xca]
+         ctests {dfv=of} edx, ecx
+# CHECK: ctests {dfv=of} r15, r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x08,0x85,0xcf]
+         ctests {dfv=of} r15, r9
+# CHECK: ctestt {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: encoding: [0x62,0xd4,0x44,0x0a,0xf6,0x44,0x80,0x7b,0x7b]
+         ctestt {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: ctestt {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: encoding: [0x62,0xd4,0x45,0x0a,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         ctestt {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: ctestt {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x0a,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestt {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestt {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0x44,0x0a,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         ctestt {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: ctestt {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: encoding: [0x62,0xd4,0x44,0x0a,0x84,0x5c,0x80,0x7b]
+         ctestt {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: ctestt {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: encoding: [0x62,0xd4,0x45,0x0a,0x85,0x54,0x80,0x7b]
+         ctestt {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: ctestt {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: encoding: [0x62,0xd4,0x44,0x0a,0x85,0x4c,0x80,0x7b]
+         ctestt {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: ctestt {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x0a,0x85,0x4c,0x80,0x7b]
+         ctestt {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: ctestt {dfv=of} bl, 123
+# CHECK: encoding: [0x62,0xf4,0x44,0x0a,0xf6,0xc3,0x7b]
+         ctestt {dfv=of} bl, 123
+# CHECK: ctestt {dfv=of} dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x45,0x0a,0xf7,0xc2,0xd2,0x04]
+         ctestt {dfv=of} dx, 1234
+# CHECK: ctestt {dfv=of} ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x44,0x0a,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestt {dfv=of} ecx, 123456
+# CHECK: ctestt {dfv=of} r9, 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x0a,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         ctestt {dfv=of} r9, 123456
+# CHECK: ctestt {dfv=of} dl, bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x0a,0x84,0xda]
+         ctestt {dfv=of} dl, bl
+# CHECK: ctestt {dfv=of} ax, dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x0a,0x85,0xd0]
+         ctestt {dfv=of} ax, dx
+# CHECK: ctestt {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0a,0x85,0xca]
+         ctestt {dfv=of} edx, ecx
+# CHECK: ctestt {dfv=of} r15, r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x0a,0x85,0xcf]
+         ctestt {dfv=of} r15, r9
+# CHECK: cteste {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: encoding: [0x62,0xd4,0x44,0x04,0xf6,0x44,0x80,0x7b,0x7b]
+         cteste {dfv=of} byte ptr [r8 + 4*rax + 123], 123
+# CHECK: cteste {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: encoding: [0x62,0xd4,0x45,0x04,0xf7,0x44,0x80,0x7b,0xd2,0x04]
+         cteste {dfv=of} word ptr [r8 + 4*rax + 123], 1234
+# CHECK: cteste {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x04,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         cteste {dfv=of} qword ptr [r8 + 4*rax + 123], 123456
+# CHECK: cteste {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: encoding: [0x62,0xd4,0x44,0x04,0xf7,0x44,0x80,0x7b,0x40,0xe2,0x01,0x00]
+         cteste {dfv=of} dword ptr [r8 + 4*rax + 123], 123456
+# CHECK: cteste {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: encoding: [0x62,0xd4,0x44,0x04,0x84,0x5c,0x80,0x7b]
+         cteste {dfv=of} byte ptr [r8 + 4*rax + 123], bl
+# CHECK: cteste {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: encoding: [0x62,0xd4,0x45,0x04,0x85,0x54,0x80,0x7b]
+         cteste {dfv=of} word ptr [r8 + 4*rax + 123], dx
+# CHECK: cteste {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: encoding: [0x62,0xd4,0x44,0x04,0x85,0x4c,0x80,0x7b]
+         cteste {dfv=of} dword ptr [r8 + 4*rax + 123], ecx
+# CHECK: cteste {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x04,0x85,0x4c,0x80,0x7b]
+         cteste {dfv=of} qword ptr [r8 + 4*rax + 123], r9
+# CHECK: cteste {dfv=of} bl, 123
+# CHECK: encoding: [0x62,0xf4,0x44,0x04,0xf6,0xc3,0x7b]
+         cteste {dfv=of} bl, 123
+# CHECK: cteste {dfv=of} dx, 1234
+# CHECK: encoding: [0x62,0xf4,0x45,0x04,0xf7,0xc2,0xd2,0x04]
+         cteste {dfv=of} dx, 1234
+# CHECK: cteste {dfv=of} ecx, 123456
+# CHECK: encoding: [0x62,0xf4,0x44,0x04,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         cteste {dfv=of} ecx, 123456
+# CHECK: cteste {dfv=of} r9, 123456
+# CHECK: encoding: [0x62,0xd4,0xc4,0x04,0xf7,0xc1,0x40,0xe2,0x01,0x00]
+         cteste {dfv=of} r9, 123456
+# CHECK: cteste {dfv=of} dl, bl
+# CHECK: encoding: [0x62,0xf4,0x44,0x04,0x84,0xda]
+         cteste {dfv=of} dl, bl
+# CHECK: cteste {dfv=of} ax, dx
+# CHECK: encoding: [0x62,0xf4,0x45,0x04,0x85,0xd0]
+         cteste {dfv=of} ax, dx
+# CHECK: cteste {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x04,0x85,0xca]
+         cteste {dfv=of} edx, ecx
+# CHECK: cteste {dfv=of} r15, r9
+# CHECK: encoding: [0x62,0x54,0xc4,0x04,0x85,0xcf]
+         cteste {dfv=of} r15, r9
diff --git a/llvm/test/MachineVerifier/test_g_extract_subvector.mir b/llvm/test/MachineVerifier/test_g_extract_subvector.mir
new file mode 100644
index 0000000000000..bc167d2eb7bcd
--- /dev/null
+++ b/llvm/test/MachineVerifier/test_g_extract_subvector.mir
@@ -0,0 +1,31 @@
+# RUN: not --crash llc -o - -run-pass=none -verify-machineinstrs %s 2>&1 | FileCheck %s
+---
+name:            g_extract_subvector
+tracksRegLiveness: true
+liveins:
+body:             |
+  bb.0:
+    %0:_(s32) = G_CONSTANT i32 0
+    %1:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+    %2:_(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+
+    ; CHECK: G_EXTRACT_SUBVECTOR first source must be a register
+    %3:_(<vscale x 2 x s32>) = G_EXTRACT_SUBVECTOR 1, 0
+
+    ; CHECK: G_EXTRACT_SUBVECTOR index must be an immediate
+    %4:_(<vscale x 1 x s32>) = G_EXTRACT_SUBVECTOR %2, %0
+
+    ; CHECK: Destination type must be a vector
+    %5:_(s32) = G_EXTRACT_SUBVECTOR %2, 0
+
+    ; CHECK: First source must be a vector
+    %6:_(<vscale x 2 x s32>) = G_EXTRACT_SUBVECTOR %0, 0
+
+    %7:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+
+    ; CHECK: Element type of vectors must be the same
+    %8:_(<vscale x 2 x s32>) = G_EXTRACT_SUBVECTOR %7, 0
+
+    ; CHECK: Index must be a multiple of the source vector's minimum vector length
+    %9:_(<vscale x 4 x s32>) = G_EXTRACT_SUBVECTOR  %1, 3
+...
diff --git a/llvm/test/MachineVerifier/test_g_insert_subvector.mir b/llvm/test/MachineVerifier/test_g_insert_subvector.mir
new file mode 100644
index 0000000000000..dce30cdb6b1e5
--- /dev/null
+++ b/llvm/test/MachineVerifier/test_g_insert_subvector.mir
@@ -0,0 +1,43 @@
+# RUN: not --crash llc -o - -run-pass=none -verify-machineinstrs %s 2>&1 | FileCheck %s
+
+---
+name:            g_splat_vector
+tracksRegLiveness: true
+liveins:
+body:             |
+  bb.0:
+    %0:_(s32) = G_CONSTANT i32 0
+    %1:_(<vscale x 2 x s32>) = G_IMPLICIT_DEF
+    %2:_(<vscale x 1 x s32>) = G_IMPLICIT_DEF
+
+    ; CHECK: G_INSERT_SUBVECTOR first source must be a register
+    %3:_(<vscale x 2 x s32>) = G_INSERT_SUBVECTOR 1, %2, 0
+
+    ; CHECK: G_INSERT_SUBVECTOR second source must be a register
+    %4:_(<vscale x 2 x s32>) = G_INSERT_SUBVECTOR %1, 1, 0
+
+    ; CHECK: G_INSERT_SUBVECTOR index must be an immediate
+    %5:_(<vscale x 2 x s32>) = G_INSERT_SUBVECTOR %1, %2, %0
+
+    ; CHECK: Destination type must be a vector
+    %6:_(s32) = G_INSERT_SUBVECTOR %1, %2, 0
+
+    ; CHECK: First source must be a vector
+    %7:_(<vscale x 2 x s32>) = G_INSERT_SUBVECTOR %0, %2, 0
+
+    ; CHECK: Second source must be a vector
+    %8:_(<vscale x 2 x s32>) = G_INSERT_SUBVECTOR %1, %0, 0
+
+    ; CHECK: Destination type must match the first source vector type
+    %9:_(<vscale x 2 x s32>) = G_INSERT_SUBVECTOR %2, %1, 0
+
+    %10:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+
+    ; CHECK: Element type of source vectors must be the same
+    %11:_(<vscale x 2 x s32>) = G_INSERT_SUBVECTOR %1, %10, 0
+
+    %12:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF
+
+    ; CHECK: Index must be a multiple of the second source vector's minimum vector length
+    %13:_(<vscale x 4 x s32>) = G_INSERT_SUBVECTOR %12, %1, 3
+...
diff --git a/llvm/test/TableGen/x86-fold-tables.inc b/llvm/test/TableGen/x86-fold-tables.inc
index e0fccd42e47f7..eea4f87cae9ce 100644
--- a/llvm/test/TableGen/x86-fold-tables.inc
+++ b/llvm/test/TableGen/x86-fold-tables.inc
@@ -2363,10 +2363,10 @@ static const X86FoldTableEntry Table2[] = {
   {X86::VBLENDPDrri, X86::VBLENDPDrmi, 0},
   {X86::VBLENDPSYrri, X86::VBLENDPSYrmi, 0},
   {X86::VBLENDPSrri, X86::VBLENDPSrmi, 0},
-  {X86::VBLENDVPDYrr, X86::VBLENDVPDYrm, 0},
-  {X86::VBLENDVPDrr, X86::VBLENDVPDrm, 0},
-  {X86::VBLENDVPSYrr, X86::VBLENDVPSYrm, 0},
-  {X86::VBLENDVPSrr, X86::VBLENDVPSrm, 0},
+  {X86::VBLENDVPDYrrr, X86::VBLENDVPDYrmr, 0},
+  {X86::VBLENDVPDrrr, X86::VBLENDVPDrmr, 0},
+  {X86::VBLENDVPSYrrr, X86::VBLENDVPSYrmr, 0},
+  {X86::VBLENDVPSrrr, X86::VBLENDVPSrmr, 0},
   {X86::VBROADCASTF32X2Z256rrkz, X86::VBROADCASTF32X2Z256rmkz, TB_NO_REVERSE},
   {X86::VBROADCASTF32X2Zrrkz, X86::VBROADCASTF32X2Zrmkz, TB_NO_REVERSE},
   {X86::VBROADCASTI32X2Z128rrkz, X86::VBROADCASTI32X2Z128rmkz, TB_NO_REVERSE},
@@ -3042,8 +3042,8 @@ static const X86FoldTableEntry Table2[] = {
   {X86::VPBLENDMWZ128rr, X86::VPBLENDMWZ128rm, 0},
   {X86::VPBLENDMWZ256rr, X86::VPBLENDMWZ256rm, 0},
   {X86::VPBLENDMWZrr, X86::VPBLENDMWZrm, 0},
-  {X86::VPBLENDVBYrr, X86::VPBLENDVBYrm, 0},
-  {X86::VPBLENDVBrr, X86::VPBLENDVBrm, 0},
+  {X86::VPBLENDVBYrrr, X86::VPBLENDVBYrmr, 0},
+  {X86::VPBLENDVBrrr, X86::VPBLENDVBrmr, 0},
   {X86::VPBLENDWYrri, X86::VPBLENDWYrmi, 0},
   {X86::VPBLENDWrri, X86::VPBLENDWrmi, 0},
   {X86::VPBROADCASTBZ128rrkz, X86::VPBROADCASTBZ128rmkz, TB_NO_REVERSE},
diff --git a/llvm/test/ThinLTO/X86/summary-matching.ll b/llvm/test/ThinLTO/X86/summary-matching.ll
new file mode 100644
index 0000000000000..60dc51b965d5a
--- /dev/null
+++ b/llvm/test/ThinLTO/X86/summary-matching.ll
@@ -0,0 +1,387 @@
+;; Test to make sure that function's definiton and summary matches.
+; RUN: split-file %s %t
+; RUN: opt -thinlto-bc %t/main.ll >%t/main.o
+; RUN: opt -thinlto-bc %t/b.ll >%t/b.o
+; RUN: opt -thinlto-bc %t/c.ll >%t/c.o
+
+; RUN: llvm-lto2 run %t/b.o %t/c.o %t/main.o -enable-memprof-context-disambiguation \
+; RUN: -supports-hot-cold-new -o %t/a.out \
+; RUN: -r=%t/main.o,main,plx \
+; RUN: -r=%t/b.o,_Z1bv,plx \
+; RUN: -r=%t/b.o,_Z3fooIiET_S0_S0_,plx \
+; RUN: -r=%t/b.o,_Znwm \
+; RUN: -r=%t/c.o,_Z1cv,plx \
+; RUN: -r=%t/c.o,_Z3fooIiET_S0_S0_ \
+; RUN: -r=%t/c.o,_Z3barIiET_S0_S0_,plx \
+; RUN: -r=%t/c.o,_Znwm \
+; RUN: -r=%t/main.o,_Z1bv \
+; RUN: -r=%t/main.o,_Z1cv \
+; RUN: -r=%t/main.o,_Z3fooIiET_S0_S0_ 
+
+;; foo has two copys:
+;; foo in b.ll is prevailing and inlines bar.
+;; foo in c.ll isn't prevailing and doesn't inline bar.
+;; main will import foo in c.ll and foo's summary in b.ll default.
+
+;--- main.ll
+; ModuleID = 'main.cc'
+source_filename = "main.cc"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: mustprogress norecurse uwtable
+define dso_local noundef i32 @main() #0 {
+entry:
+  %retval = alloca i32, align 4
+  store i32 0, ptr %retval, align 4
+  %call = call noundef i32 @_Z1bv(), !callsite !6
+  %call1 = call noundef i32 @_Z1cv(), !callsite !7
+  %add = add nsw i32 %call, %call1
+  %call2 = call noundef i32 @_Z3fooIiET_S0_S0_(i32 noundef 1, i32 noundef 2), !callsite !8
+  %add3 = add nsw i32 %add, %call2
+  ret i32 %add3
+}
+
+declare noundef i32 @_Z1bv() #1
+
+declare noundef i32 @_Z1cv() #1
+
+declare noundef i32 @_Z3fooIiET_S0_S0_(i32 noundef, i32 noundef) #1
+
+attributes #0 = { mustprogress norecurse uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+
+!llvm.module.flags = !{!0, !1, !2, !3, !4}
+!llvm.ident = !{!5}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 8, !"PIC Level", i32 2}
+!2 = !{i32 7, !"PIE Level", i32 2}
+!3 = !{i32 7, !"uwtable", i32 2}
+!4 = !{i32 7, !"frame-pointer", i32 2}
+!5 = !{!"clang version 19.0.0"}
+!6 = !{i64 1}
+!7 = !{i64 5}
+!8 = !{i64 7}
+
+;--- c.ll
+; ModuleID = 'c.cc'
+source_filename = "c.cc"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+$_Z3fooIiET_S0_S0_ = comdat any
+
+$_Z3barIiET_S0_S0_ = comdat any
+
+; Function Attrs: mustprogress noinline optnone uwtable
+define dso_local noundef i32 @_Z1cv() #0 {
+entry:
+  %num1 = alloca i32, align 4
+  %num2 = alloca i32, align 4
+  store i32 1, ptr %num1, align 4
+  store i32 1, ptr %num2, align 4
+  %0 = load i32, ptr %num1, align 4
+  %1 = load i32, ptr %num2, align 4
+  %call = call noundef i32 @_Z3fooIiET_S0_S0_(i32 noundef %0, i32 noundef %1), !callsite !6
+  ret i32 %call
+}
+
+; Function Attrs: mustprogress uwtable
+define linkonce_odr dso_local noundef i32 @_Z3fooIiET_S0_S0_(i32 noundef %a, i32 noundef %b) #3 comdat {
+entry:
+  %a.addr = alloca i32, align 4
+  %b.addr = alloca i32, align 4
+  %rtn = alloca i32, align 4
+  store i32 %a, ptr %a.addr, align 4
+  store i32 %b, ptr %b.addr, align 4
+  %0 = load i32, ptr %a.addr, align 4
+  %1 = load i32, ptr %b.addr, align 4
+  %call = call noundef i32 @_Z3barIiET_S0_S0_(i32 noundef %0, i32 noundef %1), !callsite !7
+  store i32 %call, ptr %rtn, align 4
+  %2 = load i32, ptr %rtn, align 4
+  ret i32 %2
+}
+
+; Function Attrs: mustprogress noinline optnone uwtable
+define linkonce_odr dso_local noundef i32 @_Z3barIiET_S0_S0_(i32 noundef %a, i32 noundef %b) #0 comdat {
+entry:
+  %a.addr = alloca i32, align 4
+  %b.addr = alloca i32, align 4
+  %c = alloca ptr, align 8
+  %d = alloca ptr, align 8
+  store i32 %a, ptr %a.addr, align 4
+  store i32 %b, ptr %b.addr, align 4
+  %0 = load i32, ptr %a.addr, align 4
+  %add = add nsw i32 %0, 1
+  store i32 %add, ptr %a.addr, align 4
+  %1 = load i32, ptr %b.addr, align 4
+  %add1 = add nsw i32 %1, 1
+  store i32 %add1, ptr %b.addr, align 4
+  %2 = load i32, ptr %a.addr, align 4
+  %add2 = add nsw i32 %2, 1
+  store i32 %add2, ptr %a.addr, align 4
+  %3 = load i32, ptr %b.addr, align 4
+  %add3 = add nsw i32 %3, 1
+  store i32 %add3, ptr %b.addr, align 4
+  %4 = load i32, ptr %a.addr, align 4
+  %add4 = add nsw i32 %4, 1
+  store i32 %add4, ptr %a.addr, align 4
+  %5 = load i32, ptr %b.addr, align 4
+  %add5 = add nsw i32 %5, 1
+  store i32 %add5, ptr %b.addr, align 4
+  %6 = load i32, ptr %a.addr, align 4
+  %add6 = add nsw i32 %6, 1
+  store i32 %add6, ptr %a.addr, align 4
+  %7 = load i32, ptr %b.addr, align 4
+  %add7 = add nsw i32 %7, 1
+  store i32 %add7, ptr %b.addr, align 4
+  %8 = load i32, ptr %a.addr, align 4
+  %add8 = add nsw i32 %8, 1
+  store i32 %add8, ptr %a.addr, align 4
+  %9 = load i32, ptr %b.addr, align 4
+  %add9 = add nsw i32 %9, 1
+  store i32 %add9, ptr %b.addr, align 4
+  %10 = load i32, ptr %a.addr, align 4
+  %add10 = add nsw i32 %10, 1
+  store i32 %add10, ptr %a.addr, align 4
+  %11 = load i32, ptr %b.addr, align 4
+  %add11 = add nsw i32 %11, 1
+  store i32 %add11, ptr %b.addr, align 4
+  %12 = load i32, ptr %a.addr, align 4
+  %add12 = add nsw i32 %12, 1
+  store i32 %add12, ptr %a.addr, align 4
+  %13 = load i32, ptr %b.addr, align 4
+  %add13 = add nsw i32 %13, 1
+  store i32 %add13, ptr %b.addr, align 4
+  %14 = load i32, ptr %a.addr, align 4
+  %add14 = add nsw i32 %14, 1
+  store i32 %add14, ptr %a.addr, align 4
+  %15 = load i32, ptr %b.addr, align 4
+  %add15 = add nsw i32 %15, 1
+  store i32 %add15, ptr %b.addr, align 4
+  %16 = load i32, ptr %a.addr, align 4
+  %add16 = add nsw i32 %16, 1
+  store i32 %add16, ptr %a.addr, align 4
+  %17 = load i32, ptr %b.addr, align 4
+  %add17 = add nsw i32 %17, 1
+  store i32 %add17, ptr %b.addr, align 4
+  %18 = load i32, ptr %a.addr, align 4
+  %add18 = add nsw i32 %18, 1
+  store i32 %add18, ptr %a.addr, align 4
+  %19 = load i32, ptr %b.addr, align 4
+  %add19 = add nsw i32 %19, 1
+  store i32 %add19, ptr %b.addr, align 4
+  %20 = load i32, ptr %a.addr, align 4
+  %add20 = add nsw i32 %20, 1
+  store i32 %add20, ptr %a.addr, align 4
+  %21 = load i32, ptr %b.addr, align 4
+  %add21 = add nsw i32 %21, 1
+  store i32 %add21, ptr %b.addr, align 4
+  %22 = load i32, ptr %a.addr, align 4
+  %add22 = add nsw i32 %22, 1
+  store i32 %add22, ptr %a.addr, align 4
+  %23 = load i32, ptr %b.addr, align 4
+  %add23 = add nsw i32 %23, 1
+  store i32 %add23, ptr %b.addr, align 4
+  %call = call noalias noundef nonnull ptr @_Znwm(i64 noundef 4) #2, !callsite !8
+  store i32 1, ptr %call, align 4
+  store ptr %call, ptr %c, align 8
+  %call24 = call noalias noundef nonnull ptr @_Znwm(i64 noundef 4) #2, !callsite !9
+  store i32 1, ptr %call24, align 4
+  store ptr %call24, ptr %d, align 8
+  %24 = load i32, ptr %a.addr, align 4
+  %25 = load i32, ptr %b.addr, align 4
+  %cmp = icmp sgt i32 %24, %25
+  br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:                                        ; preds = %entry
+  %26 = load i32, ptr %a.addr, align 4
+  br label %cond.end
+
+cond.false:                                       ; preds = %entry
+  %27 = load i32, ptr %b.addr, align 4
+  br label %cond.end
+
+cond.end:                                         ; preds = %cond.false, %cond.true
+  %cond = phi i32 [ %26, %cond.true ], [ %27, %cond.false ]
+  ret i32 %cond
+}
+
+; Function Attrs: nobuiltin allocsize(0)
+declare noundef nonnull ptr @_Znwm(i64 noundef) #1
+
+attributes #0 = { mustprogress noinline optnone uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { nobuiltin allocsize(0) "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #2 = { builtin allocsize(0) }
+attributes #3 = { mustprogress uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+
+!llvm.module.flags = !{!0, !1, !2, !3, !4}
+!llvm.ident = !{!5}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 8, !"PIC Level", i32 2}
+!2 = !{i32 7, !"PIE Level", i32 2}
+!3 = !{i32 7, !"uwtable", i32 2}
+!4 = !{i32 7, !"frame-pointer", i32 2}
+!5 = !{!"clang version 19.0.0"}
+!6 = !{i64 6}
+!7 = !{i64 3}
+!8 = !{i64 4}
+!9 = !{i64 9}
+
+;--- b.ll
+; ModuleID = 'b.cc'
+source_filename = "b.cc"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+$_Z3fooIiET_S0_S0_ = comdat any
+
+; Function Attrs: mustprogress noinline optnone uwtable
+define dso_local noundef i32 @_Z1bv() #0 {
+entry:
+  %num1 = alloca i32, align 4
+  %num2 = alloca i32, align 4
+  store i32 0, ptr %num1, align 4
+  store i32 0, ptr %num2, align 4
+  %0 = load i32, ptr %num1, align 4
+  %1 = load i32, ptr %num2, align 4
+  %call = call noundef i32 @_Z3fooIiET_S0_S0_(i32 noundef %0, i32 noundef %1), !callsite !6
+  ret i32 %call
+}
+
+; Function Attrs: mustprogress uwtable
+define linkonce_odr dso_local noundef i32 @_Z3fooIiET_S0_S0_(i32 noundef %a, i32 noundef %b) #3 comdat {
+entry:
+  %a.addr.i = alloca i32, align 4
+  %b.addr.i = alloca i32, align 4
+  %c.i = alloca ptr, align 8
+  %d.i = alloca ptr, align 8
+  %a.addr = alloca i32, align 4
+  %b.addr = alloca i32, align 4
+  %rtn = alloca i32, align 4
+  store i32 %a, ptr %a.addr, align 4
+  store i32 %b, ptr %b.addr, align 4
+  %0 = load i32, ptr %a.addr, align 4
+  %1 = load i32, ptr %b.addr, align 4
+  store i32 %0, ptr %a.addr.i, align 4
+  store i32 %1, ptr %b.addr.i, align 4
+  %2 = load i32, ptr %a.addr.i, align 4
+  %add.i = add nsw i32 %2, 1
+  store i32 %add.i, ptr %a.addr.i, align 4
+  %3 = load i32, ptr %b.addr.i, align 4
+  %add1.i = add nsw i32 %3, 1
+  store i32 %add1.i, ptr %b.addr.i, align 4
+  %4 = load i32, ptr %a.addr.i, align 4
+  %add2.i = add nsw i32 %4, 1
+  store i32 %add2.i, ptr %a.addr.i, align 4
+  %5 = load i32, ptr %b.addr.i, align 4
+  %add3.i = add nsw i32 %5, 1
+  store i32 %add3.i, ptr %b.addr.i, align 4
+  %6 = load i32, ptr %a.addr.i, align 4
+  %add4.i = add nsw i32 %6, 1
+  store i32 %add4.i, ptr %a.addr.i, align 4
+  %7 = load i32, ptr %b.addr.i, align 4
+  %add5.i = add nsw i32 %7, 1
+  store i32 %add5.i, ptr %b.addr.i, align 4
+  %8 = load i32, ptr %a.addr.i, align 4
+  %add6.i = add nsw i32 %8, 1
+  store i32 %add6.i, ptr %a.addr.i, align 4
+  %9 = load i32, ptr %b.addr.i, align 4
+  %add7.i = add nsw i32 %9, 1
+  store i32 %add7.i, ptr %b.addr.i, align 4
+  %10 = load i32, ptr %a.addr.i, align 4
+  %add8.i = add nsw i32 %10, 1
+  store i32 %add8.i, ptr %a.addr.i, align 4
+  %11 = load i32, ptr %b.addr.i, align 4
+  %add9.i = add nsw i32 %11, 1
+  store i32 %add9.i, ptr %b.addr.i, align 4
+  %12 = load i32, ptr %a.addr.i, align 4
+  %add10.i = add nsw i32 %12, 1
+  store i32 %add10.i, ptr %a.addr.i, align 4
+  %13 = load i32, ptr %b.addr.i, align 4
+  %add11.i = add nsw i32 %13, 1
+  store i32 %add11.i, ptr %b.addr.i, align 4
+  %14 = load i32, ptr %a.addr.i, align 4
+  %add12.i = add nsw i32 %14, 1
+  store i32 %add12.i, ptr %a.addr.i, align 4
+  %15 = load i32, ptr %b.addr.i, align 4
+  %add13.i = add nsw i32 %15, 1
+  store i32 %add13.i, ptr %b.addr.i, align 4
+  %16 = load i32, ptr %a.addr.i, align 4
+  %add14.i = add nsw i32 %16, 1
+  store i32 %add14.i, ptr %a.addr.i, align 4
+  %17 = load i32, ptr %b.addr.i, align 4
+  %add15.i = add nsw i32 %17, 1
+  store i32 %add15.i, ptr %b.addr.i, align 4
+  %18 = load i32, ptr %a.addr.i, align 4
+  %add16.i = add nsw i32 %18, 1
+  store i32 %add16.i, ptr %a.addr.i, align 4
+  %19 = load i32, ptr %b.addr.i, align 4
+  %add17.i = add nsw i32 %19, 1
+  store i32 %add17.i, ptr %b.addr.i, align 4
+  %20 = load i32, ptr %a.addr.i, align 4
+  %add18.i = add nsw i32 %20, 1
+  store i32 %add18.i, ptr %a.addr.i, align 4
+  %21 = load i32, ptr %b.addr.i, align 4
+  %add19.i = add nsw i32 %21, 1
+  store i32 %add19.i, ptr %b.addr.i, align 4
+  %22 = load i32, ptr %a.addr.i, align 4
+  %add20.i = add nsw i32 %22, 1
+  store i32 %add20.i, ptr %a.addr.i, align 4
+  %23 = load i32, ptr %b.addr.i, align 4
+  %add21.i = add nsw i32 %23, 1
+  store i32 %add21.i, ptr %b.addr.i, align 4
+  %24 = load i32, ptr %a.addr.i, align 4
+  %add22.i = add nsw i32 %24, 1
+  store i32 %add22.i, ptr %a.addr.i, align 4
+  %25 = load i32, ptr %b.addr.i, align 4
+  %add23.i = add nsw i32 %25, 1
+  store i32 %add23.i, ptr %b.addr.i, align 4
+  %call.i = call noalias noundef nonnull ptr @_Znwm(i64 noundef 4) #2, !callsite !7
+  store i32 1, ptr %call.i, align 4
+  store ptr %call.i, ptr %c.i, align 8
+  %call24.i = call noalias noundef nonnull ptr @_Znwm(i64 noundef 4) #2, !callsite !8
+  store i32 1, ptr %call24.i, align 4
+  store ptr %call24.i, ptr %d.i, align 8
+  %26 = load i32, ptr %a.addr.i, align 4
+  %27 = load i32, ptr %b.addr.i, align 4
+  %cmp.i = icmp sgt i32 %26, %27
+  br i1 %cmp.i, label %cond.true.i, label %cond.false.i
+
+cond.true.i:                                      ; preds = %entry
+  %28 = load i32, ptr %a.addr.i, align 4
+  br label %_Z3barIiET_S0_S0_.exit
+
+cond.false.i:                                     ; preds = %entry
+  %29 = load i32, ptr %b.addr.i, align 4
+  br label %_Z3barIiET_S0_S0_.exit
+
+_Z3barIiET_S0_S0_.exit:                           ; preds = %cond.true.i, %cond.false.i
+  %cond.i = phi i32 [ %28, %cond.true.i ], [ %29, %cond.false.i ]
+  store i32 %cond.i, ptr %rtn, align 4
+  %30 = load i32, ptr %rtn, align 4
+  ret i32 %30
+}
+
+; Function Attrs: nobuiltin allocsize(0)
+declare noundef nonnull ptr @_Znwm(i64 noundef) #1
+
+attributes #0 = { mustprogress noinline optnone uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { nobuiltin allocsize(0) "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #2 = { builtin allocsize(0) }
+attributes #3 = { mustprogress uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+
+!llvm.module.flags = !{!0, !1, !2, !3, !4}
+!llvm.ident = !{!5}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 8, !"PIC Level", i32 2}
+!2 = !{i32 7, !"PIE Level", i32 2}
+!3 = !{i32 7, !"uwtable", i32 2}
+!4 = !{i32 7, !"frame-pointer", i32 2}
+!5 = !{!"clang version 19.0.0"}
+!6 = !{i64 2}
+!7 = !{i64 4, i64 3}
+!8 = !{i64 9, i64 3}
diff --git a/llvm/test/Transforms/ArgumentPromotion/aliasing-and-non-aliasing-loads-with-clobber.ll b/llvm/test/Transforms/ArgumentPromotion/aliasing-and-non-aliasing-loads-with-clobber.ll
new file mode 100644
index 0000000000000..1e1669b29b0db
--- /dev/null
+++ b/llvm/test/Transforms/ArgumentPromotion/aliasing-and-non-aliasing-loads-with-clobber.ll
@@ -0,0 +1,100 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -p argpromotion -S %s | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+
+@f = dso_local global { i16, i64 } { i16 1, i64 0 }, align 8
+
+; Test case for https://github.com/llvm/llvm-project/issues/84807.
+
+; Make sure the loads from @callee are not moved to @caller, as the store
+; in %then may aliases to load from %q.
+
+define i32 @caller1(i1 %c) {
+; CHECK-LABEL: define i32 @caller1(
+; CHECK-SAME: i1 [[C:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @callee1(ptr noundef nonnull @f, i1 [[C]])
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  call void @callee1(ptr noundef nonnull @f, i1 %c)
+  ret i32 0
+}
+
+define internal void @callee1(ptr nocapture noundef readonly %q, i1 %c) {
+; CHECK-LABEL: define internal void @callee1(
+; CHECK-SAME: ptr nocapture noundef readonly [[Q:%.*]], i1 [[C:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[C]], label [[THEN:%.*]], label [[EXIT:%.*]]
+; CHECK:       then:
+; CHECK-NEXT:    store i16 123, ptr @f, align 8
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[Q_0_VAL:%.*]] = load i16, ptr [[Q]], align 8
+; CHECK-NEXT:    [[GEP_8:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 8
+; CHECK-NEXT:    [[Q_8_VAL:%.*]] = load i64, ptr [[GEP_8]], align 8
+; CHECK-NEXT:    call void @use(i16 [[Q_0_VAL]], i64 [[Q_8_VAL]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  br i1 %c, label %then, label %exit
+
+then:
+  store i16 123, ptr @f, align 8
+  br label %exit
+
+exit:
+  %l.0 = load i16, ptr %q, align 8
+  %gep.8  = getelementptr inbounds i8, ptr %q, i64 8
+  %l.1 = load i64, ptr %gep.8, align 8
+  call void @use(i16 %l.0, i64 %l.1)
+  ret void
+
+  uselistorder ptr %q, { 1, 0 }
+}
+
+; Same as @caller1/callee2, but with default uselist order.
+define i32 @caller2(i1 %c) {
+; CHECK-LABEL: define i32 @caller2(
+; CHECK-SAME: i1 [[C:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @callee2(ptr noundef nonnull @f, i1 [[C]])
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  call void @callee2(ptr noundef nonnull @f, i1 %c)
+  ret i32 0
+}
+
+define internal void @callee2(ptr nocapture noundef readonly %q, i1 %c) {
+; CHECK-LABEL: define internal void @callee2(
+; CHECK-SAME: ptr nocapture noundef readonly [[Q:%.*]], i1 [[C:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[C]], label [[THEN:%.*]], label [[EXIT:%.*]]
+; CHECK:       then:
+; CHECK-NEXT:    store i16 123, ptr @f, align 8
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[Q_0_VAL:%.*]] = load i16, ptr [[Q]], align 8
+; CHECK-NEXT:    [[GEP_8:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 8
+; CHECK-NEXT:    [[Q_8_VAL:%.*]] = load i64, ptr [[GEP_8]], align 8
+; CHECK-NEXT:    call void @use(i16 [[Q_0_VAL]], i64 [[Q_8_VAL]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  br i1 %c, label %then, label %exit
+
+then:
+  store i16 123, ptr @f, align 8
+  br label %exit
+
+exit:
+  %l.0 = load i16, ptr %q, align 8
+  %gep.8  = getelementptr inbounds i8, ptr %q, i64 8
+  %l.1 = load i64, ptr %gep.8, align 8
+  call void @use(i16 %l.0, i64 %l.1)
+  ret void
+}
+
+declare void @use(i16, i64)
diff --git a/llvm/test/Transforms/InstCombine/icmp-range.ll b/llvm/test/Transforms/InstCombine/icmp-range.ll
index 77bb5fdb6bfd4..9ed2f2a4860c6 100644
--- a/llvm/test/Transforms/InstCombine/icmp-range.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-range.ll
@@ -149,6 +149,16 @@ define i1 @test_two_ranges(ptr nocapture readonly %arg1, ptr nocapture readonly
   ret i1 %rval
 }
 
+; Values' ranges overlap each other, so it can not be simplified.
+define i1 @test_two_attribute_ranges(i32 range(i32 5, 10) %arg1, i32 range(i32 8, 16) %arg2) {
+; CHECK-LABEL: @test_two_attribute_ranges(
+; CHECK-NEXT:    [[RVAL:%.*]] = icmp ult i32 [[ARG1:%.*]], [[ARG2:%.*]]
+; CHECK-NEXT:    ret i1 [[RVAL]]
+;
+  %rval = icmp ult i32 %arg2, %arg1
+  ret i1 %rval
+}
+
 ; Values' ranges do not overlap each other, so it can simplified to false.
 define i1 @test_two_ranges2(ptr nocapture readonly %arg1, ptr nocapture readonly %arg2) {
 ; CHECK-LABEL: @test_two_ranges2(
@@ -160,6 +170,35 @@ define i1 @test_two_ranges2(ptr nocapture readonly %arg1, ptr nocapture readonly
   ret i1 %rval
 }
 
+; Values' ranges do not overlap each other, so it can simplified to false.
+define i1 @test_two_argument_ranges(i32 range(i32 1, 6) %arg1, i32 range(i32 8, 16) %arg2) {
+; CHECK-LABEL: @test_two_argument_ranges(
+; CHECK-NEXT:    ret i1 false
+;
+  %rval = icmp ult i32 %arg2, %arg1
+  ret i1 %rval
+}
+
+; Values' ranges do not overlap each other, so it can simplified to false.
+define i1 @test_one_range_and_one_argument_range(ptr nocapture readonly %arg1, i32 range(i32 8, 16) %arg2) {
+; CHECK-LABEL: @test_one_range_and_one_argument_range(
+; CHECK-NEXT:    ret i1 false
+;
+  %val1 = load i32, ptr %arg1, !range !0
+  %rval = icmp ult i32 %arg2, %val1
+  ret i1 %rval
+}
+
+; Values' ranges do not overlap each other, so it can simplified to false.
+define i1 @test_one_argument_range_and_one_range(i32 range(i32 1, 6) %arg1, ptr nocapture readonly %arg2) {
+; CHECK-LABEL: @test_one_argument_range_and_one_range(
+; CHECK-NEXT:    ret i1 false
+;
+  %val1 = load i32, ptr %arg2, !range !6
+  %rval = icmp ult i32 %val1, %arg1
+  ret i1 %rval
+}
+
 ; Values' ranges do not overlap each other, so it can simplified to true.
 define i1 @test_two_ranges3(ptr nocapture readonly %arg1, ptr nocapture readonly %arg2) {
 ; CHECK-LABEL: @test_two_ranges3(
@@ -186,8 +225,8 @@ define <2 x i1> @test_two_ranges_vec(ptr nocapture readonly %arg1, ptr nocapture
 }
 
 ; Values' ranges do not overlap each other, so it can simplified to false.
-define <2 x i1> @test_two_ranges_vec_true(ptr nocapture readonly %arg1, ptr nocapture readonly %arg2) {
-; CHECK-LABEL: @test_two_ranges_vec_true(
+define <2 x i1> @test_two_ranges_vec_false(ptr nocapture readonly %arg1, ptr nocapture readonly %arg2) {
+; CHECK-LABEL: @test_two_ranges_vec_false(
 ; CHECK-NEXT:    ret <2 x i1> zeroinitializer
 ;
   %val1 = load <2 x i32>, ptr %arg1, !range !0
@@ -196,9 +235,9 @@ define <2 x i1> @test_two_ranges_vec_true(ptr nocapture readonly %arg1, ptr noca
   ret <2 x i1> %rval
 }
 
-; Values' ranges do not overlap each other, so it can simplified to false.
-define <2 x i1> @test_two_ranges_vec_false(ptr nocapture readonly %arg1, ptr nocapture readonly %arg2) {
-; CHECK-LABEL: @test_two_ranges_vec_false(
+; Values' ranges do not overlap each other, so it can simplified to true.
+define <2 x i1> @test_two_ranges_vec_true(ptr nocapture readonly %arg1, ptr nocapture readonly %arg2) {
+; CHECK-LABEL: @test_two_ranges_vec_true(
 ; CHECK-NEXT:    ret <2 x i1> <i1 true, i1 true>
 ;
   %val1 = load <2 x i32>, ptr %arg1, !range !0
@@ -207,6 +246,101 @@ define <2 x i1> @test_two_ranges_vec_false(ptr nocapture readonly %arg1, ptr noc
   ret <2 x i1> %rval
 }
 
+; Values' ranges overlap each other, so it can not be simplified.
+define <2 x i1> @test_two_argument_ranges_vec(<2 x i32> range(i32 5, 10) %arg1, <2 x i32> range(i32 8, 16) %arg2) {
+; CHECK-LABEL: @test_two_argument_ranges_vec(
+; CHECK-NEXT:    [[RVAL:%.*]] = icmp ult <2 x i32> [[VAL2:%.*]], [[VAL1:%.*]]
+; CHECK-NEXT:    ret <2 x i1> [[RVAL]]
+;
+  %rval = icmp ult <2 x i32> %arg2, %arg1
+  ret <2 x i1> %rval
+}
+
+; Values' ranges do not overlap each other, so it can simplified to false.
+define <2 x i1> @test_two_argument_ranges_vec_false(<2 x i32> range(i32 1, 6) %arg1, <2 x i32> range(i32 8, 16) %arg2) {
+; CHECK-LABEL: @test_two_argument_ranges_vec_false(
+; CHECK-NEXT:    ret <2 x i1> zeroinitializer
+;
+  %rval = icmp ult <2 x i32> %arg2, %arg1
+  ret <2 x i1> %rval
+}
+
+; Values' ranges do not overlap each other, so it can simplified to true.
+define <2 x i1> @test_two_argument_ranges_vec_true(<2 x i32> range(i32 1, 6) %arg1, <2 x i32> range(i32 8, 16) %arg2) {
+; CHECK-LABEL: @test_two_argument_ranges_vec_true(
+; CHECK-NEXT:    ret <2 x i1> <i1 true, i1 true>
+;
+  %rval = icmp ugt <2 x i32> %arg2, %arg1
+  ret <2 x i1> %rval
+}
+
+declare i32 @create_range1()
+declare range(i32 8, 16) i32 @create_range2()
+declare range(i32 1, 6) i32 @create_range3()
+
+; Values' ranges overlap each other, so it can not be simplified.
+define i1 @test_two_return_attribute_ranges_not_simplified() {
+; CHECK-LABEL: @test_two_return_attribute_ranges_not_simplified(
+; CHECK-NEXT:    [[ARG2:%.*]] = call range(i32 5, 10) i32 @create_range1()
+; CHECK-NEXT:    [[ARG1:%.*]] = call i32 @create_range2()
+; CHECK-NEXT:    [[RVAL:%.*]] = icmp ult i32 [[ARG1]], [[ARG2]]
+; CHECK-NEXT:    ret i1 [[RVAL]]
+;
+  %val1 = call range(i32 5, 10) i32 @create_range1()
+  %val2 = call i32 @create_range2()
+  %rval = icmp ult i32 %val2, %val1
+  ret i1 %rval
+}
+
+; Values' ranges do not overlap each other, so it can simplified to false.
+define i1 @test_two_return_attribute_ranges_one_in_call() {
+; CHECK-LABEL: @test_two_return_attribute_ranges_one_in_call(
+; CHECK-NEXT:    [[VAL1:%.*]] = call range(i32 1, 6) i32 @create_range1()
+; CHECK-NEXT:    [[ARG1:%.*]] = call i32 @create_range2()
+; CHECK-NEXT:    ret i1 false
+;
+  %val1 = call range(i32 1, 6) i32 @create_range1()
+  %val2 = call i32 @create_range2()
+  %rval = icmp ult i32 %val2, %val1
+  ret i1 %rval
+}
+
+; Values' ranges do not overlap each other, so it can simplified to false.
+define i1 @test_two_return_attribute_ranges() {
+; CHECK-LABEL: @test_two_return_attribute_ranges(
+; CHECK-NEXT:    [[VAL1:%.*]] = call i32 @create_range3()
+; CHECK-NEXT:    [[ARG1:%.*]] = call i32 @create_range2()
+; CHECK-NEXT:    ret i1 false
+;
+  %val1 = call i32 @create_range3()
+  %val2 = call i32 @create_range2()
+  %rval = icmp ult i32 %val2, %val1
+  ret i1 %rval
+}
+
+; Values' ranges do not overlap each other, so it can simplified to false.
+define i1 @test_one_return_argument_and_one_argument_range(i32 range(i32 8, 16) %arg1) {
+; CHECK-LABEL: @test_one_return_argument_and_one_argument_range(
+; CHECK-NEXT:    [[VAL1:%.*]] = call i32 @create_range3()
+; CHECK-NEXT:    ret i1 false
+;
+  %val1 = call i32 @create_range3()
+  %rval = icmp ult i32 %arg1, %val1
+  ret i1 %rval
+}
+
+; Values' ranges do not overlap each other, so it can simplified to false.
+define i1 @test_one_range_and_one_return_argument(ptr nocapture readonly %arg1) {
+; CHECK-LABEL: @test_one_range_and_one_return_argument(
+; CHECK-NEXT:    [[VAL1:%.*]] = call i32 @create_range3()
+; CHECK-NEXT:    ret i1 false
+;
+  %val1 = call i32 @create_range3()
+  %val2 = load i32, ptr %arg1, !range !6
+  %rval = icmp ult i32 %val2, %val1
+  ret i1 %rval
+}
+
 define i1 @ugt_zext(i1 %b, i8 %x) {
 ; CHECK-LABEL: @ugt_zext(
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i8 [[X:%.*]], 0
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll b/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll
new file mode 100644
index 0000000000000..14acb6f57aa0c
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll
@@ -0,0 +1,107 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -passes=loop-vectorize < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; PR 81872 explains the issue.
+
+; If we vectorize, we have a miscompile where array IV and thereby value stored in (arr[99],
+; arr[98]) is calculated incorrectly since disjoint or was only disjoint because
+; of dominating conditions. Dropping the disjoint to avoid poison still changes
+; the behaviour since now the or is no longer equivalent to the add.
+;
+define void @test(ptr noundef align 8 dereferenceable_or_null(16) %arr) #0 {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: ptr noundef align 8 dereferenceable_or_null(16) [[ARR:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  bb5:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF0:![0-9]+]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 99, i64 98, i64 97, i64 96>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 99, [[INDEX]]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule <4 x i64> [[VEC_IV]], <i64 8, i64 8, i64 8, i64 8>
+; CHECK-NEXT:    [[TMP2:%.*]] = and <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <4 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = select <4 x i1> [[TMP1]], <4 x i1> [[TMP3]], <4 x i1> zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = or i64 [[TMP0]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i64, ptr [[ARR]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i64, ptr [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i64, ptr [[TMP7]], i32 -3
+; CHECK-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i1> [[TMP4]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    call void @llvm.masked.store.v4i64.p0(<4 x i64> <i64 1, i64 1, i64 1, i64 1>, ptr [[TMP8]], i32 8, <4 x i1> [[REVERSE]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 -4, i64 -4, i64 -4, i64 -4>
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 12
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[BB6:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 87, [[MIDDLE_BLOCK]] ], [ 99, [[BB5:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
+; CHECK:       loop.header:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; CHECK-NEXT:    [[AND:%.*]] = and i64 [[IV]], 1
+; CHECK-NEXT:    [[ICMP17:%.*]] = icmp eq i64 [[AND]], 0
+; CHECK-NEXT:    br i1 [[ICMP17]], label [[BB18:%.*]], label [[LOOP_LATCH]], !prof [[PROF5:![0-9]+]]
+; CHECK:       bb18:
+; CHECK-NEXT:    [[OR:%.*]] = or disjoint i64 [[IV]], 1
+; CHECK-NEXT:    [[GETELEMENTPTR19:%.*]] = getelementptr inbounds i64, ptr [[ARR]], i64 [[OR]]
+; CHECK-NEXT:    store i64 1, ptr [[GETELEMENTPTR19]], align 8
+; CHECK-NEXT:    br label [[LOOP_LATCH]]
+; CHECK:       loop.latch:
+; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; CHECK-NEXT:    [[ICMP22:%.*]] = icmp eq i64 [[IV_NEXT]], 90
+; CHECK-NEXT:    br i1 [[ICMP22]], label [[BB6]], label [[LOOP_HEADER]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK:       bb6:
+; CHECK-NEXT:    ret void
+;
+bb5:
+  br label %loop.header
+
+loop.header:                                             ; preds = %loop.latch, %bb8
+  %iv = phi i64 [ 99, %bb5 ], [ %iv.next, %loop.latch ]
+  %and = and i64 %iv, 1
+  %icmp17 = icmp eq i64 %and, 0
+  br i1 %icmp17, label %bb18, label %loop.latch, !prof !21
+
+bb18:                                             ; preds = %loop.header
+  %or = or disjoint i64 %iv, 1
+  %getelementptr19 = getelementptr inbounds i64, ptr %arr, i64 %or
+  store i64 1, ptr %getelementptr19, align 8
+  br label %loop.latch
+
+loop.latch:                                             ; preds = %bb18, %loop.header
+  %iv.next = add nsw i64 %iv, -1
+  %icmp22 = icmp eq i64 %iv.next, 90
+  br i1 %icmp22, label %bb6, label %loop.header, !prof !22
+
+bb6:
+  ret void
+}
+
+attributes #0 = {"target-cpu"="haswell" "target-features"="+avx2" }
+
+!4 = !{}
+!10 = !{i32 1}
+!16 = !{i64 864}
+!17 = !{i64 8}
+!21 = !{!"branch_weights", i32 1, i32 1}
+!22 = !{!"branch_weights", i32 1, i32 95}
+
+
+;.
+; CHECK: [[PROF0]] = !{!"branch_weights", i32 1, i32 127}
+; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 23}
+; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]], [[META4:![0-9]+]]}
+; CHECK: [[META3]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META4]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[PROF5]] = !{!"branch_weights", i32 1, i32 1}
+; CHECK: [[PROF6]] = !{!"branch_weights", i32 0, i32 0}
+; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META4]], [[META3]]}
+;.
diff --git a/llvm/test/Transforms/TypePromotion/ARM/icmps.ll b/llvm/test/Transforms/TypePromotion/ARM/icmps.ll
index 842aab121b96f..7e03d689fdc9e 100644
--- a/llvm/test/Transforms/TypePromotion/ARM/icmps.ll
+++ b/llvm/test/Transforms/TypePromotion/ARM/icmps.ll
@@ -4,8 +4,9 @@
 define i32 @test_ult_254_inc_imm(i8 zeroext %x) {
 ; CHECK-LABEL: @test_ult_254_inc_imm(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[X:%.*]], 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i8 [[ADD]], -2
+; CHECK-NEXT:    [[TMP0:%.*]] = zext i8 [[X:%.*]] to i32
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[TMP0]], -255
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[ADD]], -2
 ; CHECK-NEXT:    [[RES:%.*]] = select i1 [[CMP]], i32 35, i32 47
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
diff --git a/llvm/test/Transforms/TypePromotion/ARM/wrapping.ll b/llvm/test/Transforms/TypePromotion/ARM/wrapping.ll
index 377708cf71134..78c5e7323ceab 100644
--- a/llvm/test/Transforms/TypePromotion/ARM/wrapping.ll
+++ b/llvm/test/Transforms/TypePromotion/ARM/wrapping.ll
@@ -89,8 +89,9 @@ define i32 @overflow_add_const_limit(i8 zeroext %a, i8 zeroext %b) {
 
 define i32 @overflow_add_positive_const_limit(i8 zeroext %a) {
 ; CHECK-LABEL: @overflow_add_positive_const_limit(
-; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[A:%.*]], 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i8 [[ADD]], -128
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[TMP1]], -255
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i32 [[ADD]], -128
 ; CHECK-NEXT:    [[RES:%.*]] = select i1 [[CMP]], i32 8, i32 16
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
@@ -144,8 +145,9 @@ define i32 @safe_add_underflow_neg(i8 zeroext %a) {
 
 define i32 @overflow_sub_negative_const_limit(i8 zeroext %a) {
 ; CHECK-LABEL: @overflow_sub_negative_const_limit(
-; CHECK-NEXT:    [[SUB:%.*]] = sub i8 [[A:%.*]], -1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i8 [[SUB]], -128
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
+; CHECK-NEXT:    [[SUB:%.*]] = sub i32 [[TMP1]], 255
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i32 [[SUB]], -128
 ; CHECK-NEXT:    [[RES:%.*]] = select i1 [[CMP]], i32 8, i32 16
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
diff --git a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx1.s b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx1.s
index e467c4e48ebd2..f184d5579d06e 100644
--- a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx1.s
+++ b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx1.s
@@ -1337,8 +1337,8 @@ vzeroupper
 # CHECK-NEXT:  1      1     0.33                        vmovups	%ymm0, %ymm2
 # CHECK-NEXT:  2      1     0.50           *            vmovups	%ymm0, (%rax)
 # CHECK-NEXT:  1      7     0.50    *                   vmovups	(%rax), %ymm2
-# CHECK-NEXT:  2      4     2.00                        vmpsadbw	$1, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  3      10    2.00    *                   vmpsadbw	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      4     1.00                        vmpsadbw	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  3      10    1.00    *                   vmpsadbw	$1, (%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      4     0.50                        vmulpd	%xmm0, %xmm1, %xmm2
 # CHECK-NEXT:  2      10    0.50    *                   vmulpd	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      4     0.50                        vmulpd	%ymm0, %ymm1, %ymm2
@@ -1738,7 +1738,7 @@ vzeroupper
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]
-# CHECK-NEXT:  -     126.00 322.92 232.92 160.50 160.50 19.00  296.92 6.25   19.00  19.00  19.00
+# CHECK-NEXT:  -     126.00 322.92 233.92 160.50 160.50 19.00  295.92 6.25   19.00  19.00  19.00
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   Instructions:
@@ -2049,8 +2049,8 @@ vzeroupper
 # CHECK-NEXT:  -      -     0.33   0.33    -      -      -     0.33    -      -      -      -     vmovups	%ymm0, %ymm2
 # CHECK-NEXT:  -      -      -      -      -      -     0.50    -      -     0.50   0.50   0.50   vmovups	%ymm0, (%rax)
 # CHECK-NEXT:  -      -      -      -     0.50   0.50    -      -      -      -      -      -     vmovups	(%rax), %ymm2
-# CHECK-NEXT:  -      -      -      -      -      -      -     2.00    -      -      -      -     vmpsadbw	$1, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  -      -      -      -     0.50   0.50    -     2.00    -      -      -      -     vmpsadbw	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -      -      -     1.50    -      -      -      -     vmpsadbw	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -     0.50   0.50   0.50    -     1.50    -      -      -      -     vmpsadbw	$1, (%rax), %xmm1, %xmm2
 # CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -      -      -      -      -     vmulpd	%xmm0, %xmm1, %xmm2
 # CHECK-NEXT:  -      -     0.50   0.50   0.50   0.50    -      -      -      -      -      -     vmulpd	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -      -      -      -      -     vmulpd	%ymm0, %ymm1, %ymm2
diff --git a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx2.s b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx2.s
index 97f0d052f4552..dcf883445ba4e 100644
--- a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx2.s
+++ b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx2.s
@@ -476,8 +476,8 @@ vpxor           (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      3     1.00                        vinserti128	$1, %xmm0, %ymm1, %ymm2
 # CHECK-NEXT:  2      7     0.50    *                   vinserti128	$1, (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      7     0.50    *                   vmovntdqa	(%rax), %ymm0
-# CHECK-NEXT:  2      4     2.00                        vmpsadbw	$1, %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  3      11    2.00    *                   vmpsadbw	$1, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  2      4     1.00                        vmpsadbw	$1, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  3      11    1.00    *                   vmpsadbw	$1, (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      1     0.50                        vpabsb	%ymm0, %ymm2
 # CHECK-NEXT:  2      8     0.50    *                   vpabsb	(%rax), %ymm2
 # CHECK-NEXT:  1      1     0.50                        vpabsd	%ymm0, %ymm2
@@ -778,7 +778,7 @@ vpxor           (%rax), %ymm1, %ymm2
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]
-# CHECK-NEXT:  -      -     110.33 103.33 98.00  98.00  2.50   150.33  -     2.50   2.50   2.50
+# CHECK-NEXT:  -      -     110.33 104.33 98.00  98.00  2.50   149.33  -     2.50   2.50   2.50
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   Instructions:
@@ -798,8 +798,8 @@ vpxor           (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -      -      -      -     vinserti128	$1, %xmm0, %ymm1, %ymm2
 # CHECK-NEXT:  -      -     0.33   0.33   0.50   0.50    -     0.33    -      -      -      -     vinserti128	$1, (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  -      -      -      -     0.50   0.50    -      -      -      -      -      -     vmovntdqa	(%rax), %ymm0
-# CHECK-NEXT:  -      -      -      -      -      -      -     2.00    -      -      -      -     vmpsadbw	$1, %ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  -      -      -      -     0.50   0.50    -     2.00    -      -      -      -     vmpsadbw	$1, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -     0.50    -      -      -     1.50    -      -      -      -     vmpsadbw	$1, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -     0.50   0.50   0.50    -     1.50    -      -      -      -     vmpsadbw	$1, (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -      -      -      -      -     vpabsb	%ymm0, %ymm2
 # CHECK-NEXT:  -      -     0.50   0.50   0.50   0.50    -      -      -      -      -      -     vpabsb	(%rax), %ymm2
 # CHECK-NEXT:  -      -     0.50   0.50    -      -      -      -      -      -      -      -     vpabsd	%ymm0, %ymm2
diff --git a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-sse41.s b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-sse41.s
index 554d7aad54bad..05c208b1c622b 100644
--- a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-sse41.s
+++ b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-sse41.s
@@ -172,8 +172,8 @@ roundss     $1, (%rax), %xmm2
 # CHECK-NEXT:  1      1     1.00                        insertps	$1, %xmm0, %xmm2
 # CHECK-NEXT:  2      7     1.00    *                   insertps	$1, (%rax), %xmm2
 # CHECK-NEXT:  1      6     0.50    *                   movntdqa	(%rax), %xmm2
-# CHECK-NEXT:  2      4     2.00                        mpsadbw	$1, %xmm0, %xmm2
-# CHECK-NEXT:  3      10    2.00    *                   mpsadbw	$1, (%rax), %xmm2
+# CHECK-NEXT:  2      4     1.00                        mpsadbw	$1, %xmm0, %xmm2
+# CHECK-NEXT:  3      10    1.00    *                   mpsadbw	$1, (%rax), %xmm2
 # CHECK-NEXT:  1      3     1.00                        packusdw	%xmm0, %xmm2
 # CHECK-NEXT:  2      10    1.00    *                   packusdw	(%rax), %xmm2
 # CHECK-NEXT:  2      2     0.67                        pblendvb	%xmm0, %xmm0, %xmm2
@@ -268,7 +268,7 @@ roundss     $1, (%rax), %xmm2
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]
-# CHECK-NEXT:  -      -     36.67  41.67  22.00  22.00  2.50   53.67   -     2.50   2.50   2.50
+# CHECK-NEXT:  -      -     36.67  42.67  22.00  22.00  2.50   52.67   -     2.50   2.50   2.50
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   Instructions:
@@ -289,8 +289,8 @@ roundss     $1, (%rax), %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -      -      -      -     insertps	$1, %xmm0, %xmm2
 # CHECK-NEXT:  -      -      -      -     0.50   0.50    -     1.00    -      -      -      -     insertps	$1, (%rax), %xmm2
 # CHECK-NEXT:  -      -      -      -     0.50   0.50    -      -      -      -      -      -     movntdqa	(%rax), %xmm2
-# CHECK-NEXT:  -      -      -      -      -      -      -     2.00    -      -      -      -     mpsadbw	$1, %xmm0, %xmm2
-# CHECK-NEXT:  -      -      -      -     0.50   0.50    -     2.00    -      -      -      -     mpsadbw	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -     0.50    -      -      -     1.50    -      -      -      -     mpsadbw	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -     0.50   0.50   0.50    -     1.50    -      -      -      -     mpsadbw	$1, (%rax), %xmm2
 # CHECK-NEXT:  -      -      -      -      -      -      -     1.00    -      -      -      -     packusdw	%xmm0, %xmm2
 # CHECK-NEXT:  -      -      -      -     0.50   0.50    -     1.00    -      -      -      -     packusdw	(%rax), %xmm2
 # CHECK-NEXT:  -      -     0.67   0.67    -      -      -     0.67    -      -      -      -     pblendvb	%xmm0, %xmm0, %xmm2
diff --git a/llvm/unittests/IR/IRBuilderTest.cpp b/llvm/unittests/IR/IRBuilderTest.cpp
index d15ff9dd51a4c..cece65974c013 100644
--- a/llvm/unittests/IR/IRBuilderTest.cpp
+++ b/llvm/unittests/IR/IRBuilderTest.cpp
@@ -871,25 +871,139 @@ TEST_F(IRBuilderTest, createFunction) {
 }
 
 TEST_F(IRBuilderTest, DIBuilder) {
-  IRBuilder<> Builder(BB);
-  DIBuilder DIB(*M);
-  auto File = DIB.createFile("F.CBL", "/");
-  auto CU = DIB.createCompileUnit(dwarf::DW_LANG_Cobol74,
-                                  DIB.createFile("F.CBL", "/"), "llvm-cobol74",
-                                  true, "", 0);
-  auto Type = DIB.createSubroutineType(DIB.getOrCreateTypeArray(std::nullopt));
-  auto SP = DIB.createFunction(
-      CU, "foo", "", File, 1, Type, 1, DINode::FlagZero,
-      DISubprogram::SPFlagDefinition | DISubprogram::SPFlagOptimized);
-  F->setSubprogram(SP);
-  AllocaInst *I = Builder.CreateAlloca(Builder.getInt8Ty());
-  auto BarSP = DIB.createFunction(
-      CU, "bar", "", File, 1, Type, 1, DINode::FlagZero,
-      DISubprogram::SPFlagDefinition | DISubprogram::SPFlagOptimized);
-  auto BadScope = DIB.createLexicalBlockFile(BarSP, File, 0);
-  I->setDebugLoc(DILocation::get(Ctx, 2, 0, BadScope));
-  DIB.finalize();
-  EXPECT_TRUE(verifyModule(*M));
+  auto GetLastDbgRecord = [](const Instruction *I) -> DbgRecord * {
+    if (I->getDbgValueRange().empty())
+      return nullptr;
+    return &*std::prev(I->getDbgValueRange().end());
+  };
+
+  auto ExpectOrder = [&](DbgInstPtr First, BasicBlock::iterator Second) {
+    if (M->IsNewDbgInfoFormat) {
+      EXPECT_TRUE(First.is<DbgRecord *>());
+      EXPECT_FALSE(Second->getDbgValueRange().empty());
+      EXPECT_EQ(GetLastDbgRecord(&*Second), First.get<DbgRecord *>());
+    } else {
+      EXPECT_TRUE(First.is<Instruction *>());
+      EXPECT_EQ(&*std::prev(Second), First.get<Instruction *>());
+    }
+  };
+
+  auto RunTest = [&]() {
+    IRBuilder<> Builder(BB);
+    DIBuilder DIB(*M);
+    auto File = DIB.createFile("F.CBL", "/");
+    auto CU = DIB.createCompileUnit(dwarf::DW_LANG_Cobol74,
+                                    DIB.createFile("F.CBL", "/"),
+                                    "llvm-cobol74", true, "", 0);
+    auto Type =
+        DIB.createSubroutineType(DIB.getOrCreateTypeArray(std::nullopt));
+    auto SP = DIB.createFunction(
+        CU, "foo", "", File, 1, Type, 1, DINode::FlagZero,
+        DISubprogram::SPFlagDefinition | DISubprogram::SPFlagOptimized);
+    F->setSubprogram(SP);
+    AllocaInst *I = Builder.CreateAlloca(Builder.getInt8Ty());
+    auto BarSP = DIB.createFunction(
+        CU, "bar", "", File, 1, Type, 1, DINode::FlagZero,
+        DISubprogram::SPFlagDefinition | DISubprogram::SPFlagOptimized);
+    auto BarScope = DIB.createLexicalBlockFile(BarSP, File, 0);
+    I->setDebugLoc(DILocation::get(Ctx, 2, 0, BarScope));
+
+    // Create another instruction so that there's one before the alloca we're
+    // inserting debug intrinsics before, to make end-checking easier.
+    I = Builder.CreateAlloca(Builder.getInt1Ty());
+
+    // Label metadata and records
+    // --------------------------
+    DILocation *LabelLoc = DILocation::get(Ctx, 1, 0, BarScope);
+    DILabel *AlwaysPreserveLabel = DIB.createLabel(
+        BarScope, "meles_meles", File, 1, /*AlwaysPreserve*/ true);
+    DILabel *Label =
+        DIB.createLabel(BarScope, "badger", File, 1, /*AlwaysPreserve*/ false);
+
+    { /* dbg.label | DPLabel */
+      // Insert before I and check order.
+      ExpectOrder(DIB.insertLabel(Label, LabelLoc, I), I->getIterator());
+
+      // We should be able to insert at the end of the block, even if there's
+      // no terminator yet. Note that in RemoveDIs mode this record won't get
+      // inserted into the block untill another instruction is added.
+      DbgInstPtr LabelRecord = DIB.insertLabel(Label, LabelLoc, BB);
+      // Specifically do not insert a terminator, to check this works. `I`
+      // should have absorbed the DPLabel in the new debug info mode.
+      I = Builder.CreateAlloca(Builder.getInt32Ty());
+      ExpectOrder(LabelRecord, I->getIterator());
+    }
+
+    // Variable metadata and records
+    // -----------------------------
+    DILocation *VarLoc = DILocation::get(Ctx, 2, 0, BarScope);
+    auto *IntType = DIB.createBasicType("int", 32, dwarf::DW_ATE_signed);
+    DILocalVariable *VarX =
+        DIB.createAutoVariable(BarSP, "X", File, 2, IntType, true);
+    DILocalVariable *VarY =
+        DIB.createAutoVariable(BarSP, "Y", File, 2, IntType, true);
+    { /* dbg.value | DPValue::Value */
+      ExpectOrder(DIB.insertDbgValueIntrinsic(I, VarX, DIB.createExpression(),
+                                              VarLoc, I),
+                  I->getIterator());
+      // Check inserting at end of the block works as with labels.
+      DbgInstPtr VarXValue = DIB.insertDbgValueIntrinsic(
+          I, VarX, DIB.createExpression(), VarLoc, BB);
+      I = Builder.CreateAlloca(Builder.getInt32Ty());
+      ExpectOrder(VarXValue, I->getIterator());
+      EXPECT_EQ(BB->getTrailingDPValues(), nullptr);
+    }
+    { /* dbg.declare | DPValue::Declare */
+      ExpectOrder(DIB.insertDeclare(I, VarY, DIB.createExpression(), VarLoc, I),
+                  I->getIterator());
+      // Check inserting at end of the block works as with labels.
+      DbgInstPtr VarYDeclare =
+          DIB.insertDeclare(I, VarY, DIB.createExpression(), VarLoc, BB);
+      I = Builder.CreateAlloca(Builder.getInt32Ty());
+      ExpectOrder(VarYDeclare, I->getIterator());
+      EXPECT_EQ(BB->getTrailingDPValues(), nullptr);
+    }
+    { /* dbg.assign | DPValue::Assign */
+      I = Builder.CreateAlloca(Builder.getInt32Ty());
+      I->setMetadata(LLVMContext::MD_DIAssignID, DIAssignID::getDistinct(Ctx));
+      // DbgAssign interface is slightly different - it always inserts after the
+      // linked instr. Check we can do this with no instruction to insert
+      // before.
+      DbgInstPtr VarXAssign =
+          DIB.insertDbgAssign(I, I, VarX, DIB.createExpression(), I,
+                              DIB.createExpression(), VarLoc);
+      I = Builder.CreateAlloca(Builder.getInt32Ty());
+      ExpectOrder(VarXAssign, I->getIterator());
+      EXPECT_EQ(BB->getTrailingDPValues(), nullptr);
+    }
+
+    Builder.CreateRet(nullptr);
+    DIB.finalize();
+    // Check the labels are not/are added to Bar's retainedNodes array
+    // (AlwaysPreserve).
+    EXPECT_EQ(find(BarSP->getRetainedNodes(), Label),
+              BarSP->getRetainedNodes().end());
+    EXPECT_NE(find(BarSP->getRetainedNodes(), AlwaysPreserveLabel),
+              BarSP->getRetainedNodes().end());
+    EXPECT_NE(find(BarSP->getRetainedNodes(), VarX),
+              BarSP->getRetainedNodes().end());
+    EXPECT_NE(find(BarSP->getRetainedNodes(), VarY),
+              BarSP->getRetainedNodes().end());
+    EXPECT_TRUE(verifyModule(*M));
+  };
+
+  // Test in old-debug mode.
+  EXPECT_FALSE(M->IsNewDbgInfoFormat);
+  RunTest();
+
+  // Test in new-debug mode.
+  // Reset the test then call convertToNewDbgValues to flip the flag
+  // on the test's Module, Function and BasicBlock.
+  TearDown();
+  SetUp();
+  M->convertToNewDbgValues();
+  EXPECT_TRUE(M->IsNewDbgInfoFormat);
+  RunTest();
 }
 
 TEST_F(IRBuilderTest, createArtificialSubprogram) {
diff --git a/llvm/unittests/Support/KnownBitsTest.cpp b/llvm/unittests/Support/KnownBitsTest.cpp
index 658f3796721c4..7c183e9626f98 100644
--- a/llvm/unittests/Support/KnownBitsTest.cpp
+++ b/llvm/unittests/Support/KnownBitsTest.cpp
@@ -516,6 +516,19 @@ TEST(KnownBitsTest, BinaryExhaustive) {
         return N1.lshr(N2);
       },
       checkOptimalityBinary, /* RefinePoisonToZero */ true);
+  testBinaryOpExhaustive(
+      [](const KnownBits &Known1, const KnownBits &Known2) {
+        return KnownBits::lshr(Known1, Known2, /*ShAmtNonZero=*/false,
+                               /*Exact=*/true);
+      },
+      [](const APInt &N1, const APInt &N2) -> std::optional<APInt> {
+        if (N2.uge(N2.getBitWidth()))
+          return std::nullopt;
+        if (!N1.extractBits(N2.getZExtValue(), 0).isZero())
+          return std::nullopt;
+        return N1.lshr(N2);
+      },
+      checkOptimalityBinary, /* RefinePoisonToZero */ true);
   testBinaryOpExhaustive(
       [](const KnownBits &Known1, const KnownBits &Known2) {
         return KnownBits::ashr(Known1, Known2);
@@ -526,6 +539,19 @@ TEST(KnownBitsTest, BinaryExhaustive) {
         return N1.ashr(N2);
       },
       checkOptimalityBinary, /* RefinePoisonToZero */ true);
+  testBinaryOpExhaustive(
+      [](const KnownBits &Known1, const KnownBits &Known2) {
+        return KnownBits::ashr(Known1, Known2, /*ShAmtNonZero=*/false,
+                               /*Exact=*/true);
+      },
+      [](const APInt &N1, const APInt &N2) -> std::optional<APInt> {
+        if (N2.uge(N2.getBitWidth()))
+          return std::nullopt;
+        if (!N1.extractBits(N2.getZExtValue(), 0).isZero())
+          return std::nullopt;
+        return N1.ashr(N2);
+      },
+      checkOptimalityBinary, /* RefinePoisonToZero */ true);
 
   testBinaryOpExhaustive(
       [](const KnownBits &Known1, const KnownBits &Known2) {
diff --git a/llvm/utils/git/github-automation.py b/llvm/utils/git/github-automation.py
index b2e6843eb9af1..b21f14eca4450 100755
--- a/llvm/utils/git/github-automation.py
+++ b/llvm/utils/git/github-automation.py
@@ -586,7 +586,7 @@ def create_pull_request(
                 body=body,
                 base=release_branch_for_issue,
                 head=head,
-                maintainer_can_modify=False,
+                maintainer_can_modify=True,
             )
 
             pull.as_issue().edit(milestone=self.issue.milestone)
diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
index 65cf19e7a4fcd..c64ecb79c5ca5 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -481,6 +481,10 @@ struct ControlDropUnitDims {
     if (auto genericOp = dyn_cast_or_null<GenericOp>(op)) {
       return llvm::to_vector(llvm::seq<unsigned>(0, genericOp.getNumLoops()));
     }
+    if (auto padOp = dyn_cast_or_null<tensor::PadOp>(op)) {
+      return llvm::to_vector(
+          llvm::seq<unsigned>(0, padOp.getSourceType().getRank()));
+    }
     return SmallVector<unsigned>{};
   };
 };
diff --git a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td
index c71517666b609..39e66cd9e6e5a 100644
--- a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td
+++ b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td
@@ -641,7 +641,7 @@ def MemRef_DmaStartOp : MemRef_Op<"dma_start"> {
   let summary = "non-blocking DMA operation that starts a transfer";
   let description = [{
     Syntax:
-    
+
     ```
     operation ::= `memref.dma_start` ssa-use`[`ssa-use-list`]` `,`
                    ssa-use`[`ssa-use-list`]` `,` ssa-use `,`
diff --git a/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td b/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td
index 1c61ece2676a9..670202fe4372e 100644
--- a/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td
+++ b/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td
@@ -1098,21 +1098,18 @@ class Tensor_ReassociativeReshapeOp<string mnemonic, list<Trait> traits = []> :
 def Tensor_ExpandShapeOp : Tensor_ReassociativeReshapeOp<"expand_shape"> {
   let summary = "operation to produce a tensor with a higher rank";
   let description = [{
-    The `tensor.expand_shape` op produces a new tensor with a higher
-    rank whose sizes are a reassociation of the original `src`.
+    The `tensor.expand_shape` op produces a tensor of higher (or equal)
+    rank than the operand `src` whose dimension sizes are a reassociation of
+    `src`.
 
-    A reassociation is defined as a continuous grouping of dimensions and is
-    represented with an array of DenseI64ArrayAttr attribute.
-
-    The verification rule is that the reassociation maps are applied to the
-    result tensor with the higher rank to obtain the operand tensor with the
-    smaller rank.
+    A reassociation is defined as a continuous grouping of dimensions. It is
+    represented with an array of DenseI64ArrayAttr attribute. Entries in the
+    array are referred to as reassociation maps.
 
-    The operand tensor type of a reshape can be zero-ranked if the result
-    tensor type is statically shaped with all dimensions being unit extent. In
-    such cases the reassociation map is empty.
+    The reassociation maps are applied to the result shape to obtain the operand
+    shape.
 
-    Examples:
+    Example:
 
     ```mlir
     // Dimension expansion i -> (i', j') and (k) -> (k')
@@ -1150,21 +1147,15 @@ def Tensor_ExpandShapeOp : Tensor_ReassociativeReshapeOp<"expand_shape"> {
 def Tensor_CollapseShapeOp : Tensor_ReassociativeReshapeOp<"collapse_shape"> {
   let summary = "operation to produce a tensor with a smaller rank";
   let description = [{
-    The `tensor.collapse_shape` op produces a new tensor with a smaller
-    rank whose sizes are a reassociation of the original `src`.
+    The `tensor.collapse_shape` op produces a new tensor of lower (or equal)
+    rank whose dimension sizes are a reassociation of the original `src` dimensions.
 
     A reassociation is defined as a continuous grouping of dimensions and is
-    represented with an array of DenseI64ArrayAttr attribute.
+    represented by an array of DenseI64ArrayAttr attribute. The reassociation
+    maps are applied to the operand shape to obtain the result shape.
 
-    The verification rule is that the reassociation maps are applied to the
-    operand tensor with the higher rank to obtain the result tensor with the
-    smaller rank.
 
-    The result tensor type of a reshape can be zero-ranked if the operand
-    tensor type is statically shaped with all dimensions being unit extent. In
-    such case the reassociation map is empty.
-
-    Examples:
+    Example:
 
     ```mlir
     // Dimension collapse (i, j) -> i' and k -> k'
@@ -1841,7 +1832,7 @@ def Tensor_PackOp : Tensor_RelayoutOp<"pack", [
     and optionally transposes the tiled source tensor dimensions.
 
     `inner_dims_pos` (mandatory) specifies `k` source tensor dimensions that are
-    being tiled, where `0 < k <= n`. The order of the dimensions matters: 
+    being tiled, where `0 < k <= n`. The order of the dimensions matters:
      - The tiled dimensions (of size `inner_tiles`) are added to the end of the result
     tensor in the order in which they appear in `inner_dims_pos`.
      - `inner_dims_pos[i]` specifies the source tensor dimension tiled by
diff --git a/mlir/include/mlir/Dialect/Utils/ReshapeOpsUtils.h b/mlir/include/mlir/Dialect/Utils/ReshapeOpsUtils.h
index 61c929dee0f27..ae9824f728da4 100644
--- a/mlir/include/mlir/Dialect/Utils/ReshapeOpsUtils.h
+++ b/mlir/include/mlir/Dialect/Utils/ReshapeOpsUtils.h
@@ -85,16 +85,21 @@ bool isReassociationValid(ArrayRef<AffineMap> reassociation,
 template <typename ReshapeOpTy, typename InverseReshapeOpTy>
 static OpFoldResult foldReshapeOp(ReshapeOpTy reshapeOp,
                                   ArrayRef<Attribute> operands) {
-  // Fold producer-consumer reshape ops that where the operand type of the
+
+  if (reshapeOp.getSrcType() == reshapeOp.getType())
+    return reshapeOp.getSrc();
+
+  // Fold producer-consumer reshape ops where the operand type of the
   // producer is same as the return type of the consumer.
   auto reshapeSrcOp =
       reshapeOp.getSrc().template getDefiningOp<InverseReshapeOpTy>();
   if (reshapeSrcOp && reshapeSrcOp.getSrcType() == reshapeOp.getResultType())
     return reshapeSrcOp.getSrc();
+
   // Reshape of a constant can be replaced with a new constant.
-  if (auto elements = dyn_cast_or_null<DenseElementsAttr>(operands.front())) {
+  if (auto elements = dyn_cast_or_null<DenseElementsAttr>(operands.front()))
     return elements.reshape(cast<ShapedType>(reshapeOp.getResult().getType()));
-  }
+
   return nullptr;
 }
 
@@ -103,41 +108,36 @@ static OpFoldResult foldReshapeOp(ReshapeOpTy reshapeOp,
 template <typename Op, typename T>
 static LogicalResult verifyReshapeLikeTypes(Op op, T expandedType,
                                             T collapsedType, bool isExpansion) {
+
   unsigned expandedRank = expandedType.getRank();
   unsigned collapsedRank = collapsedType.getRank();
   if (expandedRank < collapsedRank)
-    return op.emitOpError("expected the type ")
-           << expandedType
-           << " to have higher rank than the type = " << collapsedType;
-  if (expandedRank == 0)
-    return op.emitOpError("expected non-zero memref ranks");
-  if (expandedRank == collapsedRank)
-    return op.emitOpError("expected to collapse or expand dims");
-
-  if (collapsedRank == 0) {
-    // If collapsed rank is 0, then expanded type must be static shaped and of
-    // sizes 1.
-    if (llvm::any_of(expandedType.getShape(),
-                     [](int64_t dim) -> bool { return dim != 1; }))
-      return op.emitOpError("invalid to reshape tensor/memref with non-unit "
-                            "extent dimensions to zero-rank tensor/memref");
-    return success();
-  }
+    return op.emitOpError("expected the expanded type, ")
+           << expandedType << " to have a higher (or same) rank "
+           << "than the collapsed type, " << collapsedType << '.';
+
   if (collapsedRank != op.getReassociation().size())
-    return op.emitOpError("expected rank of the collapsed type(")
-           << collapsedRank << ") to be the number of reassociation maps("
-           << op.getReassociation().size() << ")";
+    return op.emitOpError("expected collapsed rank (")
+           << collapsedRank << ") to equal the number of reassociation maps ("
+           << op.getReassociation().size() << ").";
+
   auto maps = op.getReassociationMaps();
   for (auto it : llvm::enumerate(maps))
     if (it.value().getNumDims() != expandedRank)
       return op.emitOpError("expected reassociation map #")
-             << it.index() << " of same rank as expanded memref("
-             << expandedRank << "), but got " << it.value().getNumDims();
+             << it.index() << " to have size equal to the expanded rank ("
+             << expandedRank << "), but it is  " << it.value().getNumDims()
+             << '.';
+
   int invalidIdx = 0;
   if (!isReassociationValid(maps, &invalidIdx))
     return op.emitOpError("expected reassociation map #")
-           << invalidIdx << " to be valid and contiguous";
-  return verifyReshapeLikeShapes(op, collapsedType, expandedType, isExpansion);
+           << invalidIdx << " to be valid and contiguous.";
+
+  return reshapeLikeShapesAreCompatible(
+      [&](const Twine &msg) { return op->emitOpError(msg); },
+      collapsedType.getShape(), expandedType.getShape(),
+      op.getReassociationIndices(), isExpansion);
 }
 
 /// Verify that shapes of the reshaped types using following rules
@@ -153,16 +153,6 @@ LogicalResult reshapeLikeShapesAreCompatible(
     ArrayRef<int64_t> collapsedShape, ArrayRef<int64_t> expandedShape,
     ArrayRef<ReassociationIndices> reassociationMaps, bool isExpandingReshape);
 
-template <typename OpTy>
-static LogicalResult verifyReshapeLikeShapes(OpTy op, ShapedType collapsedType,
-                                             ShapedType expandedType,
-                                             bool isExpandingReshape) {
-  return reshapeLikeShapesAreCompatible(
-      [&](const Twine &msg) { return op->emitOpError(msg); },
-      collapsedType.getShape(), expandedType.getShape(),
-      op.getReassociationIndices(), isExpandingReshape);
-}
-
 /// Returns true iff the type is a MemRefType and has a non-identity layout.
 bool hasNonIdentityLayout(Type type);
 
diff --git a/mlir/include/mlir/IR/Types.h b/mlir/include/mlir/IR/Types.h
index 46bb733101c12..a89e13b625bf4 100644
--- a/mlir/include/mlir/IR/Types.h
+++ b/mlir/include/mlir/IR/Types.h
@@ -133,7 +133,8 @@ class Type {
   bool isF80() const;
   bool isF128() const;
 
-  /// Return true if this is an integer type with the specified width.
+  /// Return true if this is an integer type (with the specified width).
+  bool isInteger() const;
   bool isInteger(unsigned width) const;
   /// Return true if this is a signless integer type (with the specified width).
   bool isSignlessInteger() const;
diff --git a/mlir/include/mlir/Tools/lsp-server-support/SourceMgrUtils.h b/mlir/include/mlir/Tools/lsp-server-support/SourceMgrUtils.h
index 969058b022889..9ed8326a602e6 100644
--- a/mlir/include/mlir/Tools/lsp-server-support/SourceMgrUtils.h
+++ b/mlir/include/mlir/Tools/lsp-server-support/SourceMgrUtils.h
@@ -11,8 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef MLIR_TOOLS_LSPSERVERSUPPORT_TRANSPORT_H
-#define MLIR_TOOLS_LSPSERVERSUPPORT_TRANSPORT_H
+#ifndef MLIR_TOOLS_LSPSERVERSUPPORT_SOURCEMGRUTILS_H
+#define MLIR_TOOLS_LSPSERVERSUPPORT_SOURCEMGRUTILS_H
 
 #include "mlir/Tools/lsp-server-support/Protocol.h"
 #include "llvm/Support/SourceMgr.h"
diff --git a/mlir/lib/Conversion/NVVMToLLVM/CMakeLists.txt b/mlir/lib/Conversion/NVVMToLLVM/CMakeLists.txt
index 2afff1a4e5f16..23174d1128719 100644
--- a/mlir/lib/Conversion/NVVMToLLVM/CMakeLists.txt
+++ b/mlir/lib/Conversion/NVVMToLLVM/CMakeLists.txt
@@ -11,6 +11,7 @@ add_mlir_conversion_library(MLIRNVVMToLLVM
   Core
 
   LINK_LIBS PUBLIC
+  MLIRFuncDialect
   MLIRGPUDialect
   MLIRLLVMCommonConversion
   MLIRLLVMDialect
diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt b/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt
index 2274656e84a5c..a955d585b9a1d 100644
--- a/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt
@@ -14,6 +14,7 @@ add_mlir_dialect_library(MLIRAMDGPUTransforms
   MLIRAMDGPUUtils
   MLIRArithDialect
   MLIRControlFlowDialect
+  MLIRFuncDialect
   MLIRIR
   MLIRPass
   MLIRTransforms
diff --git a/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp b/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp
index 45cab81be4f5f..023ea277bcf49 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp
@@ -561,6 +561,126 @@ struct DropUnitDims : public OpRewritePattern<GenericOp> {
 };
 } // namespace
 
+//===---------------------------------------------------------------------===//
+// Drop dimensions that are unit-extents within tensor operations.
+//===---------------------------------------------------------------------===//
+
+namespace {
+struct DropPadUnitDims : public OpRewritePattern<tensor::PadOp> {
+  DropPadUnitDims(MLIRContext *context, ControlDropUnitDims options = {},
+                  PatternBenefit benefit = 1)
+      : OpRewritePattern(context, benefit), options(std::move(options)) {}
+
+  LogicalResult matchAndRewrite(tensor::PadOp padOp,
+                                PatternRewriter &rewriter) const override {
+    // 1a. Get the allowed list of dimensions to drop from the `options`.
+    SmallVector<unsigned> allowedUnitDims = options.controlFn(padOp);
+    if (allowedUnitDims.empty()) {
+      return rewriter.notifyMatchFailure(
+          padOp, "control function returns no allowed unit dims to prune");
+    }
+
+    if (padOp.getSourceType().getEncoding()) {
+      return rewriter.notifyMatchFailure(
+          padOp, "cannot collapse dims of tensor with encoding");
+    }
+
+    // Fail for non-constant padding values. The body of the pad could
+    // depend on the padding indices and/or properties of the padded
+    // tensor so for now we fail.
+    // TODO: Support non-constant padding values.
+    Value paddingVal = padOp.getConstantPaddingValue();
+    if (!paddingVal) {
+      return rewriter.notifyMatchFailure(
+          padOp, "unimplemented: non-constant padding value");
+    }
+
+    ArrayRef<int64_t> sourceShape = padOp.getSourceType().getShape();
+    int64_t padRank = sourceShape.size();
+
+    auto isStaticZero = [](OpFoldResult f) {
+      std::optional<int64_t> maybeInt = getConstantIntValue(f);
+      return maybeInt && *maybeInt == 0;
+    };
+
+    llvm::SmallDenseSet<unsigned> unitDimsFilter(allowedUnitDims.begin(),
+                                                 allowedUnitDims.end());
+    llvm::SmallDenseSet<unsigned> unitDims;
+    SmallVector<int64_t> newShape;
+    SmallVector<OpFoldResult> newLowPad;
+    SmallVector<OpFoldResult> newHighPad;
+    for (const auto [dim, size, low, high] :
+         zip_equal(llvm::seq(static_cast<int64_t>(0), padRank), sourceShape,
+                   padOp.getMixedLowPad(), padOp.getMixedHighPad())) {
+      if (unitDimsFilter.contains(dim) && size == 1 && isStaticZero(low) &&
+          isStaticZero(high)) {
+        unitDims.insert(dim);
+      } else {
+        newShape.push_back(size);
+        newLowPad.push_back(low);
+        newHighPad.push_back(high);
+      }
+    }
+
+    if (unitDims.empty()) {
+      return rewriter.notifyMatchFailure(padOp, "no unit dims to collapse");
+    }
+
+    ReassociationIndices reassociationGroup;
+    SmallVector<ReassociationIndices> reassociationMap;
+    int64_t dim = 0;
+    while (dim < padRank && unitDims.contains(dim))
+      reassociationGroup.push_back(dim++);
+    while (dim < padRank) {
+      assert(!unitDims.contains(dim) && "expected non unit-extent");
+      reassociationGroup.push_back(dim);
+      dim++;
+      // Fold all following dimensions that are unit-extent.
+      while (dim < padRank && unitDims.contains(dim))
+        reassociationGroup.push_back(dim++);
+      reassociationMap.push_back(reassociationGroup);
+      reassociationGroup.clear();
+    }
+
+    Value collapsedSource =
+        collapseValue(rewriter, padOp.getLoc(), padOp.getSource(), newShape,
+                      reassociationMap, options.rankReductionStrategy);
+
+    auto newPadOp = rewriter.create<tensor::PadOp>(
+        padOp.getLoc(), /*result=*/Type(), collapsedSource, newLowPad,
+        newHighPad, paddingVal, padOp.getNofold());
+
+    Value dest = padOp.getResult();
+    if (options.rankReductionStrategy ==
+        ControlDropUnitDims::RankReductionStrategy::ExtractInsertSlice) {
+      SmallVector<OpFoldResult> expandedSizes;
+      int64_t numUnitDims = 0;
+      for (auto dim : llvm::seq(static_cast<int64_t>(0), padRank)) {
+        if (unitDims.contains(dim)) {
+          expandedSizes.push_back(rewriter.getIndexAttr(1));
+          numUnitDims++;
+          continue;
+        }
+        expandedSizes.push_back(tensor::getMixedSize(
+            rewriter, padOp.getLoc(), newPadOp, dim - numUnitDims));
+      }
+      dest = rewriter.create<tensor::EmptyOp>(
+          padOp.getLoc(), expandedSizes,
+          padOp.getResultType().getElementType());
+    }
+
+    Value expandedValue =
+        expandValue(rewriter, padOp.getLoc(), newPadOp.getResult(), dest,
+                    reassociationMap, options.rankReductionStrategy);
+    rewriter.replaceOp(padOp, expandedValue);
+    return success();
+  }
+
+private:
+  ControlDropUnitDims options;
+};
+} // namespace
+
 namespace {
 /// Convert `extract_slice` operations to rank-reduced versions.
 struct RankReducedExtractSliceOp
@@ -640,6 +760,7 @@ populateFoldUnitExtentDimsViaReshapesPatterns(RewritePatternSet &patterns,
                                               ControlDropUnitDims &options) {
   auto *context = patterns.getContext();
   patterns.add<DropUnitDims>(context, options);
+  patterns.add<DropPadUnitDims>(context, options);
   // TODO: Patterns unrelated to unit dim folding should be factored out.
   patterns.add<RankReducedExtractSliceOp,
                RankReducedInsertSliceOp<tensor::InsertSliceOp>,
@@ -661,6 +782,7 @@ populateFoldUnitExtentDimsViaSlicesPatterns(RewritePatternSet &patterns,
   options.rankReductionStrategy =
       ControlDropUnitDims::RankReductionStrategy::ExtractInsertSlice;
   patterns.add<DropUnitDims>(context, options);
+  patterns.add<DropPadUnitDims>(context, options);
   // TODO: Patterns unrelated to unit dim folding should be factored out.
   linalg::FillOp::getCanonicalizationPatterns(patterns, context);
   tensor::EmptyOp::getCanonicalizationPatterns(patterns, context);
diff --git a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
index 248193481acfc..836dcb8f329e7 100644
--- a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
+++ b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
@@ -1080,7 +1080,37 @@ struct DimOfMemRefReshape : public OpRewritePattern<DimOp> {
     auto reshape = dim.getSource().getDefiningOp<ReshapeOp>();
 
     if (!reshape)
-      return failure();
+      return rewriter.notifyMatchFailure(
+          dim, "Dim op is not defined by a reshape op.");
+
+    // dim of a memref reshape can be folded if dim.getIndex() dominates the
+    // reshape. Instead of using `DominanceInfo` (which is usually costly) we
+    // cheaply check that either of the following conditions hold:
+    //      1. dim.getIndex() is defined in the same block as reshape but before
+    //      reshape.
+    //      2. dim.getIndex() is defined in a parent block of
+    //      reshape.
+
+    // Check condition 1
+    if (dim.getIndex().getParentBlock() == reshape->getBlock()) {
+      if (auto *definingOp = dim.getIndex().getDefiningOp()) {
+        if (reshape->isBeforeInBlock(definingOp)) {
+          return rewriter.notifyMatchFailure(
+              dim,
+              "dim.getIndex is not defined before reshape in the same block.");
+        }
+      } // else dim.getIndex is a block argument to reshape->getBlock and
+        // dominates reshape
+    }   // Check condition 2
+    else if (dim->getBlock() != reshape->getBlock() &&
+             !dim.getIndex().getParentRegion()->isProperAncestor(
+                 reshape->getParentRegion())) {
+      // If dim and reshape are in the same block but dim.getIndex() isn't, we
+      // already know dim.getIndex() dominates reshape without calling
+      // `isProperAncestor`
+      return rewriter.notifyMatchFailure(
+          dim, "dim.getIndex does not dominate reshape.");
+    }
 
     // Place the load directly after the reshape to ensure that the shape memref
     // was not mutated.
@@ -2224,9 +2254,13 @@ LogicalResult ExpandShapeOp::verify() {
   MemRefType srcType = getSrcType();
   MemRefType resultType = getResultType();
 
-  if (srcType.getRank() >= resultType.getRank())
-    return emitOpError("expected rank expansion, but found source rank ")
-           << srcType.getRank() << " >= result rank " << resultType.getRank();
+  if (srcType.getRank() > resultType.getRank()) {
+    auto r0 = srcType.getRank();
+    auto r1 = resultType.getRank();
+    return emitOpError("has source rank ")
+           << r0 << " and result rank " << r1 << ". This is not an expansion ("
+           << r0 << " > " << r1 << ").";
+  }
 
   // Verify result shape.
   if (failed(verifyCollapsedShape(getOperation(), srcType.getShape(),
@@ -2378,9 +2412,13 @@ LogicalResult CollapseShapeOp::verify() {
   MemRefType srcType = getSrcType();
   MemRefType resultType = getResultType();
 
-  if (srcType.getRank() <= resultType.getRank())
-    return emitOpError("expected rank reduction, but found source rank ")
-           << srcType.getRank() << " <= result rank " << resultType.getRank();
+  if (srcType.getRank() < resultType.getRank()) {
+    auto r0 = srcType.getRank();
+    auto r1 = resultType.getRank();
+    return emitOpError("has source rank ")
+           << r0 << " and result rank " << r1 << ". This is not a collapse ("
+           << r0 << " < " << r1 << ").";
+  }
 
   // Verify result shape.
   if (failed(verifyCollapsedShape(getOperation(), resultType.getShape(),
diff --git a/mlir/lib/Dialect/OpenMP/CMakeLists.txt b/mlir/lib/Dialect/OpenMP/CMakeLists.txt
index 40b4837484a13..57a6d3445c151 100644
--- a/mlir/lib/Dialect/OpenMP/CMakeLists.txt
+++ b/mlir/lib/Dialect/OpenMP/CMakeLists.txt
@@ -5,6 +5,7 @@ add_mlir_dialect_library(MLIROpenMPDialect
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/OpenMP
 
   DEPENDS
+  omp_gen
   MLIROpenMPOpsIncGen
   MLIROpenMPOpsInterfacesIncGen
   MLIROpenMPTypeInterfacesIncGen
diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
index fe2f250e6b929..dc8843aa4e1e1 100644
--- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
+++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
@@ -824,11 +824,37 @@ struct DimOfDestStyleOp : public OpRewritePattern<DimOp> {
     return success();
   }
 };
+
+/// Fold dim of a tensor reshape operation to a extract into the reshape's shape
+/// operand.
+struct DimOfReshapeOp : public OpRewritePattern<DimOp> {
+  using OpRewritePattern<DimOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(DimOp dim,
+                                PatternRewriter &rewriter) const override {
+    auto reshape = dim.getSource().getDefiningOp<ReshapeOp>();
+
+    if (!reshape)
+      return failure();
+
+    // Since tensors are immutable we don't need to worry about where to place
+    // the extract call
+    rewriter.setInsertionPointAfter(dim);
+    Location loc = dim.getLoc();
+    Value extract =
+        rewriter.create<ExtractOp>(loc, reshape.getShape(), dim.getIndex());
+    if (extract.getType() != dim.getType())
+      extract =
+          rewriter.create<arith::IndexCastOp>(loc, dim.getType(), extract);
+    rewriter.replaceOp(dim, extract);
+    return success();
+  }
+};
 } // namespace
 
 void DimOp::getCanonicalizationPatterns(RewritePatternSet &results,
                                         MLIRContext *context) {
-  results.add<DimOfCastOp, DimOfDestStyleOp>(context);
+  results.add<DimOfCastOp, DimOfDestStyleOp, DimOfReshapeOp>(context);
 }
 
 //===----------------------------------------------------------------------===//
@@ -1656,22 +1682,10 @@ static LogicalResult verifyTensorReshapeOp(TensorReshapeOp op,
 }
 
 LogicalResult ExpandShapeOp::verify() {
-  auto srcType = getSrcType();
-  auto resultType = getResultType();
-  if (srcType.getRank() >= resultType.getRank())
-    return emitOpError("expected rank expansion, but found source rank ")
-           << srcType.getRank() << " >= result rank " << resultType.getRank();
-
   return verifyTensorReshapeOp(*this, getResultType(), getSrcType());
 }
 
 LogicalResult CollapseShapeOp::verify() {
-  auto srcType = getSrcType();
-  auto resultType = getResultType();
-  if (srcType.getRank() <= resultType.getRank())
-    return emitOpError("expected rank reduction, but found source rank ")
-           << srcType.getRank() << " <= result rank " << resultType.getRank();
-
   return verifyTensorReshapeOp(*this, getSrcType(), getResultType());
 }
 
diff --git a/mlir/lib/IR/Types.cpp b/mlir/lib/IR/Types.cpp
index 32dfef9e81049..1d1ba6df4db2f 100644
--- a/mlir/lib/IR/Types.cpp
+++ b/mlir/lib/IR/Types.cpp
@@ -55,6 +55,8 @@ bool Type::isF128() const { return llvm::isa<Float128Type>(*this); }
 
 bool Type::isIndex() const { return llvm::isa<IndexType>(*this); }
 
+bool Type::isInteger() const { return llvm::isa<IntegerType>(*this); }
+
 /// Return true if this is an integer type with the specified width.
 bool Type::isInteger(unsigned width) const {
   if (auto intTy = llvm::dyn_cast<IntegerType>(*this))
diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp
index cd49bd121a62e..2ec0b964b304f 100644
--- a/mlir/lib/Transforms/Utils/DialectConversion.cpp
+++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp
@@ -1020,8 +1020,8 @@ void BlockTypeConversionRewrite::commit(RewriterBase &rewriter) {
   // Inform the listener about all IR modifications that have already taken
   // place: References to the original block have been replaced with the new
   // block.
-  if (auto *listener = dyn_cast_or_null<RewriterBase::ForwardingListener>(
-          rewriter.getListener()))
+  if (auto *listener =
+          dyn_cast_or_null<RewriterBase::Listener>(rewriter.getListener()))
     for (Operation *op : block->getUsers())
       listener->notifyOperationModified(op);
 
@@ -1123,8 +1123,8 @@ void ReplaceBlockArgRewrite::commit(RewriterBase &rewriter) {
 void ReplaceBlockArgRewrite::rollback() { rewriterImpl.mapping.erase(arg); }
 
 void ReplaceOperationRewrite::commit(RewriterBase &rewriter) {
-  auto *listener = dyn_cast_or_null<RewriterBase::ForwardingListener>(
-      rewriter.getListener());
+  auto *listener =
+      dyn_cast_or_null<RewriterBase::Listener>(rewriter.getListener());
 
   // Compute replacement values.
   SmallVector<Value> replacements =
diff --git a/mlir/lib/Transforms/Utils/Inliner.cpp b/mlir/lib/Transforms/Utils/Inliner.cpp
index 74776a73db9aa..f227cedb269d8 100644
--- a/mlir/lib/Transforms/Utils/Inliner.cpp
+++ b/mlir/lib/Transforms/Utils/Inliner.cpp
@@ -21,6 +21,7 @@
 #include "mlir/Support/DebugStringHelper.h"
 #include "mlir/Transforms/InliningUtils.h"
 #include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/Support/Debug.h"
 
@@ -711,6 +712,13 @@ bool Inliner::Impl::shouldInline(ResolvedCall &resolvedCall) {
   if (resolvedCall.call->hasTrait<OpTrait::IsTerminator>())
     return false;
 
+  // Don't allow inlining if the target is a self-recursive function.
+  if (llvm::count_if(*resolvedCall.targetNode,
+                     [&](CallGraphNode::Edge const &edge) -> bool {
+                       return edge.getTarget() == resolvedCall.targetNode;
+                     }) > 0)
+    return false;
+
   // Don't allow inlining if the target is an ancestor of the call. This
   // prevents inlining recursively.
   Region *callableRegion = resolvedCall.targetNode->getCallableRegion();
diff --git a/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir b/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir
index 0c51a032df901..f2c490b832076 100644
--- a/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir
+++ b/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir
@@ -946,3 +946,90 @@ func.func @drop_all_loops(%arg0 : memref<1x1xf32, 3>) -> memref<1x1xf32, 3>
 // CHECK-SLICES-LABEL: func @drop_all_loops
 //       CHECK-SLICES:   memref.subview %{{.*}}[0, 0] [1, 1] [1, 1] : memref<1x1xf32, 3> to memref<f32, strided<[]>, 3>
 //       CHECK-SLICES:   linalg.generic{{.*}}memref<f32, strided<[]>, 3>
+
+// -----
+
+func.func @drop_unit_pad_dims(%arg0: tensor<1x1x3x1x1xf32>) -> tensor<1x2x3x1x3xf32>
+{
+  %c0 = arith.constant 0 : index
+  %cst0 = arith.constant 0.0 : f32
+  %0 = tensor.pad %arg0 low[0, 1, 0, %c0, 0] high[0, 0, 0, %c0, 2] {
+    ^bb0(%arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index):
+      tensor.yield %cst0 : f32
+  } : tensor<1x1x3x1x1xf32> to tensor<1x2x3x1x3xf32>
+  return %0 : tensor<1x2x3x1x3xf32>
+}
+
+// CHECK-LABEL: func @drop_unit_pad_dims
+//       CHECK:   %[[COLLAPSE:.+]] = tensor.collapse_shape
+//  CHECK-SAME:     {{\[}}[0, 1], [2, 3], [4]{{\]}} : tensor<1x1x3x1x1xf32> into tensor<1x3x1xf32>
+//       CHECK:   %[[PADDED:.+]] = tensor.pad %[[COLLAPSE]] low[1, 0, 0] high[0, 0, 2]
+//       CHECK:   } : tensor<1x3x1xf32> to tensor<2x3x3xf32>
+//       CHECK:   tensor.expand_shape %[[PADDED]]
+//  CHECK-SAME:     {{\[}}[0, 1], [2, 3], [4]{{\]}} : tensor<2x3x3xf32> into tensor<1x2x3x1x3xf32>
+
+// CHECK-SLICES-LABEL: func @drop_unit_pad_dims
+//       CHECK-SLICES:   %[[EXTRACT:.+]] = tensor.extract_slice
+//  CHECK-SLICES-SAME:     [0, 0, 0, 0, 0] [1, 1, 3, 1, 1] [1, 1, 1, 1, 1] : tensor<1x1x3x1x1xf32> to tensor<1x3x1xf32>
+//       CHECK-SLICES:   %[[PADDED:.+]] = tensor.pad %[[EXTRACT]] low[1, 0, 0] high[0, 0, 2]
+//       CHECK-SLICES:   } : tensor<1x3x1xf32> to tensor<2x3x3xf32>
+//       CHECK-SLICES:   tensor.insert_slice %[[PADDED]]
+//  CHECK-SLICES-SAME:     [0, 0, 0, 0, 0] [1, 2, 3, 1, 3] [1, 1, 1, 1, 1] : tensor<2x3x3xf32> into tensor<1x2x3x1x3xf32>
+
+// -----
+
+func.func @drop_unit_pad_dynamic_dims(%arg0: tensor<1x?xf32>) -> tensor<1x?xf32>
+{
+  %c0 = arith.constant 0 : index
+  %cst0 = arith.constant 0.0 : f32
+  %0 = tensor.pad %arg0 low[0, 5] high[0, 6] {
+    ^bb0(%arg1: index, %arg2: index):
+      tensor.yield %cst0 : f32
+  } : tensor<1x?xf32> to tensor<1x?xf32>
+  return %0 : tensor<1x?xf32>
+}
+
+// CHECK-LABEL: func @drop_unit_pad_dynamic_dims
+//       CHECK:   %[[COLLAPSE:.+]] = tensor.collapse_shape
+//  CHECK-SAME:     {{\[}}[0, 1]{{\]}} : tensor<1x?xf32> into tensor<?xf32>
+//       CHECK:   %[[PADDED:.+]] = tensor.pad %[[COLLAPSE]] low[5] high[6]
+//       CHECK:   } : tensor<?xf32> to tensor<?xf32>
+//       CHECK:   tensor.expand_shape %[[PADDED]]
+//  CHECK-SAME:     {{\[}}[0, 1]{{\]}} : tensor<?xf32> into tensor<1x?xf32>
+
+// CHECK-SLICES: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 + 11)>
+
+// CHECK-SLICES-LABEL: func @drop_unit_pad_dynamic_dims
+//  CHECK-SLICES-SAME:   %[[ARG0:[A-Za-z0-9]+]]: tensor<1x?xf32>
+//       CHECK-SLICES:   %[[DIM:.+]] = tensor.dim %[[ARG0]], %c1
+//       CHECK-SLICES:   %[[EXTRACT:.+]] = tensor.extract_slice
+//  CHECK-SLICES-SAME:     [0, 0] [1, %[[DIM]]] [1, 1] : tensor<1x?xf32> to tensor<?xf32>
+//       CHECK-SLICES:   %[[PADDED:.+]] = tensor.pad %[[EXTRACT]] low[5] high[6]
+//       CHECK-SLICES:   } : tensor<?xf32> to tensor<?xf32>
+//       CHECK-SLICES:   %[[PADDED_DIM:.+]] = affine.apply #[[$MAP]]()[%[[DIM]]]
+//       CHECK-SLICES:   %[[EMPTY:.+]] = tensor.empty(%[[PADDED_DIM]]) : tensor<1x?xf32>
+//       CHECK-SLICES:   tensor.insert_slice %[[PADDED]] into %[[EMPTY]]
+//  CHECK-SLICES-SAME:     [0, 0] [1, %[[PADDED_DIM]]] [1, 1] : tensor<?xf32> into tensor<1x?xf32>
+
+// -----
+
+func.func @do_not_drop_non_constant_padding(%arg0: tensor<1x1x3x1x1xf32>, %pad: f32) -> tensor<1x2x3x1x3xf32>
+{
+  %c0 = arith.constant 0 : index
+  %0 = tensor.pad %arg0 low[0, 1, 0, %c0, 0] high[0, 0, 0, %c0, 2] {
+    ^bb0(%arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index):
+      %0 = arith.index_cast %arg3 : index to i64
+      %1 = arith.sitofp %0 : i64 to f32
+      %add = arith.addf %pad, %1 : f32
+      tensor.yield %add : f32
+  } : tensor<1x1x3x1x1xf32> to tensor<1x2x3x1x3xf32>
+  return %0 : tensor<1x2x3x1x3xf32>
+}
+
+// CHECK-LABEL: func @do_not_drop_non_constant_padding
+//       CHECK:   tensor.pad %{{.*}} low[0, 1, 0, %c0, 0] high[0, 0, 0, %c0, 2]
+//       CHECK:   } : tensor<1x1x3x1x1xf32> to tensor<1x2x3x1x3xf32>
+
+// CHECK-SLICES-LABEL: func @do_not_drop_non_constant_padding
+//       CHECK-SLICES:   tensor.pad %{{.*}} low[0, 1, 0, %c0, 0] high[0, 0, 0, %c0, 2]
+//       CHECK-SLICES:   } : tensor<1x1x3x1x1xf32> to tensor<1x2x3x1x3xf32>
diff --git a/mlir/test/Dialect/MemRef/canonicalize.mlir b/mlir/test/Dialect/MemRef/canonicalize.mlir
index a772a25da5738..506ed1f1c10b1 100644
--- a/mlir/test/Dialect/MemRef/canonicalize.mlir
+++ b/mlir/test/Dialect/MemRef/canonicalize.mlir
@@ -1,5 +1,34 @@
 // RUN: mlir-opt %s -canonicalize="test-convergence" --split-input-file -allow-unregistered-dialect | FileCheck %s
 
+
+// CHECK-LABEL: collapse_shape_identity_fold
+// CHECK-NEXT: return
+func.func @collapse_shape_identity_fold(%arg0 : memref<5xi8>) -> memref<5xi8> {
+  %0 = memref.collapse_shape %arg0 [[0]] : memref<5xi8> into memref<5xi8>
+  return %0 : memref<5xi8>
+}
+
+// -----
+
+// CHECK-LABEL: expand_shape_identity_fold
+// CHECK-NEXT: return
+func.func @expand_shape_identity_fold(%arg0 : memref<5x4xi8>) -> memref<5x4xi8> {
+  %0 = memref.expand_shape %arg0 [[0], [1]] : memref<5x4xi8> into memref<5x4xi8>
+  return %0 : memref<5x4xi8>
+}
+
+// -----
+
+// CHECK-LABEL: collapse_expand_rank0_cancel
+// CHECK-NEXT: return
+func.func @collapse_expand_rank0_cancel(%arg0 : memref<1x1xi8>) -> memref<1x1xi8> {
+  %0 = memref.collapse_shape %arg0 [] : memref<1x1xi8> into memref<i8>
+  %1 = memref.expand_shape %0 [] : memref<i8> into memref<1x1xi8>
+  return %1 : memref<1x1xi8>
+}
+
+// -----
+
 // CHECK-LABEL: func @subview_of_size_memcast
 //  CHECK-SAME:   %[[ARG0:.[a-z0-9A-Z_]+]]: memref<4x6x16x32xi8>
 //       CHECK:   %[[S:.+]] = memref.subview %[[ARG0]][0, 1, 0, 0] [1, 1, 16, 32] [1, 1, 1, 1] : memref<4x6x16x32xi8> to memref<16x32xi8, strided{{.*}}>
@@ -284,6 +313,59 @@ func.func @dim_of_memref_reshape_i32(%arg0: memref<*xf32>, %arg1: memref<?xi32>)
 
 // -----
 
+// Test case: memref.dim(memref.reshape %v %shp, %idx) -> memref.load %shp[%idx]
+// CHECK-LABEL: func @dim_of_memref_reshape_block_arg_index(
+//  CHECK-SAME:   %[[MEM:[0-9a-z]+]]: memref<*xf32>,
+//  CHECK-SAME:   %[[SHP:[0-9a-z]+]]: memref<?xindex>,
+//  CHECK-SAME:   %[[IDX:[0-9a-z]+]]: index
+//  CHECK-NEXT:   %[[DIM:.*]] = memref.load %[[SHP]][%[[IDX]]]
+//   CHECK-NOT:   memref.dim
+//       CHECK:   return %[[DIM]] : index
+func.func @dim_of_memref_reshape_block_arg_index(%arg0: memref<*xf32>, %arg1: memref<?xindex>, %arg2: index) -> index {
+  %reshape = memref.reshape %arg0(%arg1) : (memref<*xf32>, memref<?xindex>) -> memref<*xf32>
+  %dim = memref.dim %reshape, %arg2 : memref<*xf32>
+  return %dim : index
+}
+
+// -----
+
+// Test case: memref.dim(memref.reshape %v %shp, %idx) is not folded into memref.load %shp[%idx]
+// CHECK-LABEL: func @dim_of_memref_reshape_for(
+//       CHECK: memref.reshape
+//       CHECK: memref.dim
+//   CHECK-NOT: memref.load
+func.func @dim_of_memref_reshape_for( %arg0: memref<*xf32>, %arg1: memref<?xindex>) -> index {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c4 = arith.constant 4 : index
+
+    %0 = memref.reshape %arg0(%arg1) : (memref<*xf32>, memref<?xindex>) -> memref<*xf32>
+
+    %1 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %c1) -> (index) {
+      %2 = memref.dim %0, %arg2 : memref<*xf32>
+      %3 = arith.muli %arg3, %2 : index
+      scf.yield %3 : index
+    }
+    return %1 : index
+}
+
+// -----
+
+// Test case: memref.dim(memref.reshape %v %shp, %idx) is not folded into memref.load %shp[%idx]
+// CHECK-LABEL: func @dim_of_memref_reshape_undominated(
+//       CHECK: memref.reshape
+//       CHECK: memref.dim
+//   CHECK-NOT: memref.load
+func.func @dim_of_memref_reshape_undominated(%arg0: memref<*xf32>, %arg1: memref<?xindex>, %arg2: index) -> index {
+    %c4 = arith.constant 4 : index
+    %reshape = memref.reshape %arg0(%arg1) : (memref<*xf32>, memref<?xindex>) -> memref<*xf32>
+    %0 = arith.muli %arg2, %c4 : index
+    %dim = memref.dim %reshape, %0 : memref<*xf32>
+    return %dim : index
+  }
+
+// -----
+
 // CHECK-LABEL: func @alloc_const_fold
 func.func @alloc_const_fold() -> memref<?xf32> {
   // CHECK-NEXT: memref.alloc() : memref<4xf32>
diff --git a/mlir/test/Dialect/MemRef/invalid.mlir b/mlir/test/Dialect/MemRef/invalid.mlir
index 8f5ba5ea8fc78..1aef417549d9a 100644
--- a/mlir/test/Dialect/MemRef/invalid.mlir
+++ b/mlir/test/Dialect/MemRef/invalid.mlir
@@ -415,20 +415,6 @@ func.func @collapse_shape_out_of_bounds(%arg0: memref<?x?xf32>) {
 
 // -----
 
-func.func @expand_shape_invalid_ranks(%arg0: memref<?x?xf32>) {
-  // expected-error @+1 {{op expected rank expansion, but found source rank 2 >= result rank 2}}
-  %0 = memref.expand_shape %arg0 [[0], [1]] : memref<?x?xf32> into memref<?x?xf32>
-}
-
-// -----
-
-func.func @collapse_shape_invalid_ranks(%arg0: memref<?x?xf32>) {
-  // expected-error @+1 {{op expected rank reduction, but found source rank 2 <= result rank 2}}
-  %0 = memref.collapse_shape %arg0 [[0], [1]] : memref<?x?xf32> into memref<?x?xf32>
-}
-
-// -----
-
 func.func @expand_shape_out_of_bounds(%arg0: memref<?xf32>) {
   // expected-error @+1 {{op reassociation index 2 is out of bounds}}
   %0 = memref.expand_shape %arg0 [[0, 1, 2]] : memref<?xf32> into memref<4x?xf32>
@@ -462,6 +448,34 @@ func.func @collapse_shape_invalid_reassociation(%arg0: memref<?x?x?xf32>) {
 
 // -----
 
+// An (invalid) attempt at using collapse_shape to increase the rank might look
+// like this. Verify that a sensible error is emitted in this case.
+func.func @collapse_shape_invalid_reassociation_expansion(%arg0: memref<?xf32>) {
+  // expected-error @+1 {{'memref.collapse_shape' op has source rank 1 and result rank 2. This is not a collapse (1 < 2)}}
+  %0 = memref.collapse_shape %arg0 [[0], [0]] :
+    memref<?xf32> into memref<?x?xf32>
+}
+
+// -----
+
+// An (invalid) attempt at using expand_shape to reduce the rank might look
+// like this. Verify that a sensible error is emitted in this case.
+func.func @expand_shape_invalid_reassociation(%arg0: memref<2x3x1xf32>) {
+  // expected-error @+1 {{'memref.expand_shape' op has source rank 3 and result rank 2. This is not an expansion (3 > 2)}}
+  %0 = memref.expand_shape %arg0 [[0], [1], [1]] :
+    memref<2x3x1xf32> into memref<2x3xf32>
+}
+
+// -----
+
+func.func @collapse_shape_invalid_reassociation_expansion(%arg0: memref<?x?xf32>) {
+  // expected-error @+1 {{reassociation indices must be contiguous}}
+  %0 = memref.collapse_shape %arg0 [[1], [0]] :
+    memref<?x?xf32> into memref<?x?xf32>
+}
+
+// -----
+
 func.func @collapse_shape_reshaping_non_contiguous(
     %arg0: memref<3x4x5xf32, strided<[270, 50, 10], offset: 0>>) {
   // expected-error @+1 {{invalid source layout map or collapsing non-contiguous dims}}
diff --git a/mlir/test/Dialect/Tensor/canonicalize.mlir b/mlir/test/Dialect/Tensor/canonicalize.mlir
index d17c23adfb14d..e5374f031be55 100644
--- a/mlir/test/Dialect/Tensor/canonicalize.mlir
+++ b/mlir/test/Dialect/Tensor/canonicalize.mlir
@@ -1,5 +1,42 @@
 // RUN: mlir-opt %s -split-input-file -canonicalize="test-convergence" | FileCheck %s
 
+
+// CHECK-LABEL: expand_shape_identity_fold
+// CHECK-NEXT: return
+func.func @expand_shape_identity_fold(%arg0 : tensor<5xf32>) -> tensor<5xf32> {
+  %0 = tensor.expand_shape %arg0 [[0]] : tensor<5xf32> into tensor<5xf32>
+  return %0 : tensor<5xf32>
+}
+
+// -----
+
+// CHECK-LABEL: expand_shape_rank0_identity_fold
+// CHECK-NEXT: return
+func.func @expand_shape_rank0_identity_fold(%arg0 : tensor<f32>) -> tensor<f32> {
+  %0 = tensor.expand_shape %arg0 [] : tensor<f32> into tensor<f32>
+  return %0 : tensor<f32>
+}
+
+// -----
+
+// CHECK-LABEL: collapse_shape_identity_fold
+// CHECK-NEXT: return
+func.func @collapse_shape_identity_fold(%arg0 : tensor<5x4xf32>) -> tensor<5x4xf32> {
+  %0 = tensor.collapse_shape %arg0 [[0], [1]] : tensor<5x4xf32> into tensor<5x4xf32>
+  return %0 : tensor<5x4xf32>
+}
+
+// -----
+
+// CHECK-LABEL: collapse_shape_rank0_identity_fold
+// CHECK-NEXT: return
+func.func @collapse_shape_rank0_identity_fold(%arg0 : tensor<f32>) -> tensor<f32> {
+  %0 = tensor.collapse_shape %arg0 [] : tensor<f32> into tensor<f32>
+  return %0 : tensor<f32>
+}
+
+// -----
+
 // CHECK-LABEL: @tensor_bitcast_chain_ok
 // CHECK-SAME: %[[IN:.*]]: tensor<2xi32>
 func.func @tensor_bitcast_chain_ok(%input: tensor<2xi32>) -> tensor<2xf32> {
@@ -2092,7 +2129,7 @@ func.func @unpack_pack(%t: tensor<128x128xf32>) -> tensor<128x128xf32> {
 
 // Chain: NC -> NCnc -> NCnc -> NC
 // CHECK: func.func @unpack_pack(
-// CHECK-SAME: %[[T:.+]]: tensor<128x128xf32>, 
+// CHECK-SAME: %[[T:.+]]: tensor<128x128xf32>,
 // CHECK: return %[[T]] : tensor<128x128xf32>
 func.func @unpack_pack(%t: tensor<128x128xf32>, %tile1: index, %tile2: index) -> tensor<128x128xf32> {
   %tensor_empty = tensor.empty(%tile1, %tile2) : tensor<16x16x?x?xf32>
@@ -2250,3 +2287,83 @@ func.func @infer_and_fold_pack_unpack_same_tiles(%t: tensor<10x20x4x4xf32>) -> t
 // CHECK-LABEL: func.func @infer_and_fold_pack_unpack_same_tiles
 // CHECK-SAME:    %[[SRC:[0-9a-zA-Z]+]]
 // CHECK:         return %[[SRC]]
+
+// -----
+
+// Test case: Folding of tensor.dim(tensor.reshape %v %shp, %idx) -> tensor.extract %shp[%idx]
+// CHECK-LABEL: func @dim_of_reshape(
+//  CHECK-SAME:     %[[MEM:[0-9a-z]+]]: tensor<*xf32>,
+//  CHECK-SAME:     %[[SHP:[0-9a-z]+]]: tensor<?xindex>
+//  CHECK-NEXT:   %[[IDX:.*]] = arith.constant 3
+//  CHECK-NEXT:   %[[DIM:.*]] = tensor.extract %[[SHP]][%[[IDX]]]
+//   CHECK-NOT:   tensor.store
+//   CHECK-NOT:   tensor.dim
+//   CHECK-NOT: tensor.reshape
+//       CHECK:   return %[[DIM]] : index
+func.func @dim_of_reshape(%arg0: tensor<*xf32>, %arg1: tensor<?xindex>)
+    -> index {
+  %c3 = arith.constant 3 : index
+  %0 = tensor.reshape %arg0(%arg1)
+      : (tensor<*xf32>, tensor<?xindex>) -> tensor<*xf32>
+  // Update the shape to test that the load ends up in the right place.
+  tensor.insert %c3 into %arg1[%c3] : tensor<?xindex>
+  %1 = tensor.dim %0, %c3 : tensor<*xf32>
+  return %1 : index
+}
+
+// -----
+
+// Test case: Folding of tensor.dim(tensor.reshape %v %shp, %idx) -> tensor.extract %shp[%idx]
+// CHECK-LABEL: func @dim_of_reshape_i32(
+//       CHECK:  tensor.extract
+//  CHECK-NEXT:  %[[CAST:.*]] = arith.index_cast
+//   CHECK-NOT:  tensor.dim
+//   CHECK-NOT:  tensor.reshape
+//       CHECK:  return %[[CAST]] : index
+func.func @dim_of_reshape_i32(%arg0: tensor<*xf32>, %arg1: tensor<?xi32>)
+    -> index {
+    %c3 = arith.constant 3 : index
+    %0 = tensor.reshape %arg0(%arg1)
+        : (tensor<*xf32>, tensor<?xi32>) -> tensor<*xf32>
+    %1 = tensor.dim %0, %c3 : tensor<*xf32>
+    return %1 : index
+}
+
+// -----
+
+// Test case: tensor.dim(tensor.reshape %v %shp, %idx) is folded into tensor.extract %shp[%idx]
+// CHECK-LABEL: func @dim_of_reshape_for(
+//       CHECK: scf.for
+//  CHECK-NEXT: tensor.extract
+//   CHECK-NOT: tensor.dim
+//   CHECK-NOT: tensor.reshape
+func.func @dim_of_reshape_for( %arg0: tensor<*xf32>, %arg1: tensor<?xindex>) -> index {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c4 = arith.constant 4 : index
+
+    %0 = tensor.reshape %arg0(%arg1) : (tensor<*xf32>, tensor<?xindex>) -> tensor<*xf32>
+
+    %1 = scf.for %arg2 = %c0 to %c4 step %c1 iter_args(%arg3 = %c1) -> (index) {
+      %2 = tensor.dim %0, %arg2 : tensor<*xf32>
+      %3 = arith.muli %arg3, %2 : index
+      scf.yield %3 : index
+    }
+    return %1 : index
+}
+
+// -----
+
+// Test case: tensor.dim(tensor.reshape %v %shp, %idx) is folded into tensor.extract %shp[%idx]
+// CHECK-LABEL: func @dim_of_reshape_undominated(
+//       CHECK: arith.muli
+//  CHECK-NEXT: tensor.extract
+//   CHECK-NOT: tensor.dim
+//   CHECK-NOT: tensor.reshape
+func.func @dim_of_reshape_undominated(%arg0: tensor<*xf32>, %arg1: tensor<?xindex>, %arg2: index) -> index {
+    %c4 = arith.constant 4 : index
+    %reshape = tensor.reshape %arg0(%arg1) : (tensor<*xf32>, tensor<?xindex>) -> tensor<*xf32>
+    %0 = arith.muli %arg2, %c4 : index
+    %dim = tensor.dim %reshape, %0 : tensor<*xf32>
+    return %dim : index
+  }
diff --git a/mlir/test/Dialect/Tensor/invalid.mlir b/mlir/test/Dialect/Tensor/invalid.mlir
index 4c534fe936e3d..79ca0de68a1e9 100644
--- a/mlir/test/Dialect/Tensor/invalid.mlir
+++ b/mlir/test/Dialect/Tensor/invalid.mlir
@@ -343,20 +343,6 @@ func.func @illegal_collapsing_reshape_mixed_tensor_2(%arg0 : tensor<?x4x5xf32>)
 
 // -----
 
-func.func @expand_shape_invalid_ranks(%arg0: tensor<?x?xf32>) {
-  // expected-error @+1 {{op expected rank expansion, but found source rank 2 >= result rank 2}}
-  %0 = tensor.expand_shape %arg0 [[0], [1]] : tensor<?x?xf32> into tensor<?x?xf32>
-}
-
-// -----
-
-func.func @collapse_shape_invalid_ranks(%arg0: tensor<?x?xf32>) {
-  // expected-error @+1 {{op expected rank reduction, but found source rank 2 <= result rank 2}}
-  %0 = tensor.collapse_shape %arg0 [[0], [1]] : tensor<?x?xf32> into tensor<?x?xf32>
-}
-
-// -----
-
 func.func @rank(%0: f32) {
   // expected-error@+1 {{'tensor.rank' op operand #0 must be tensor of any type values}}
   "tensor.rank"(%0): (f32)->index
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_0_permute.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_0_permute.mlir
index 11edd854ec08a..9c9b0e3330c9c 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_0_permute.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_0_permute.mlir
@@ -10,7 +10,7 @@
 // DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
 // DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
 // DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
-// DEFINE: %{run_opts} = -e entry -entry-point-result=void
+// DEFINE: %{run_opts} = -e main -entry-point-result=void
 // DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
 // DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
 
@@ -99,20 +99,6 @@ module {
     return
   }
 
-  func.func @dump_mat_perm_9x4(%A: tensor<9x4xf64, #MAT_C_C_P>) {
-    %c = sparse_tensor.convert %A : tensor<9x4xf64, #MAT_C_C_P> to tensor<9x4xf64>
-    %cu = tensor.cast %c : tensor<9x4xf64> to tensor<*xf64>
-    call @printMemrefF64(%cu) : (tensor<*xf64>) -> ()
-
-    %n = sparse_tensor.number_of_entries %A : tensor<9x4xf64, #MAT_C_C_P>
-    vector.print %n : index
-
-    %1 = sparse_tensor.values %A : tensor<9x4xf64, #MAT_C_C_P> to memref<?xf64>
-    call @printMemref1dF64(%1) : (memref<?xf64>) -> ()
-
-    return
-  }
-
   func.func @dump_mat_dense_9x4(%A: tensor<9x4xf64>) {
     %u = tensor.cast %A : tensor<9x4xf64> to tensor<*xf64>
     call @printMemrefF64(%u) : (tensor<*xf64>) -> ()
@@ -120,18 +106,8 @@ module {
     return
   }
 
-  func.func @dump_mat_annotated_dense_9x4(%A: tensor<9x4xf64, #MAT_D_D>) {
-    %n = sparse_tensor.number_of_entries %A : tensor<9x4xf64, #MAT_D_D>
-    vector.print %n : index
-
-    %1 = sparse_tensor.values %A : tensor<9x4xf64, #MAT_D_D> to memref<?xf64>
-    call @printMemref1dF64(%1) : (memref<?xf64>) -> ()
-
-    return
-  }
-
   // Driver method to call and verify kernels.
-  func.func @entry() {
+  func.func @main() {
     %m42 = arith.constant dense<
       [ [ 1.0, 0.0 ],
         [ 3.1, 0.0 ],
@@ -163,20 +139,21 @@ module {
     %sm34cdp = sparse_tensor.convert %m34 : tensor<3x4xf64> to tensor<3x4xf64, #MAT_C_D_P>
     %sm44dcp = sparse_tensor.convert %m44 : tensor<4x4xf64> to tensor<4x4xf64, #MAT_D_C_P>
 
-    // CHECK:      {{\[}}[1,   0,   3,   0],
-    // CHECK-NEXT:  [0,   2,   0,   0],
-    // CHECK-NEXT:  [1,   0,   1,   1],
-    // CHECK-NEXT:  [0,   0.5,   0,   0],
-    // CHECK-NEXT:  [1,   5,   2,   0],
-    // CHECK-NEXT:  [0,   0,   1.5,   1],
-    // CHECK-NEXT:  [0,   3.5,   0,   0],
-    // CHECK-NEXT:  [1,   5,   2,   0],
-    // CHECK-NEXT:  [1,   0.5,   0,   0]]
-    // CHECK-NEXT: 18
-    // CHECK:      [1,  1,  1,  1,  1,  2,  0.5,  5,  3.5,  5,  0.5,  3,  1,  2,  1.5,  2,  1,  1
+    //
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 18
+    // CHECK-NEXT: dim = ( 9, 4 )
+    // CHECK-NEXT: lvl = ( 4, 9 )
+    // CHECK-NEXT: pos[0] : ( 0, 4
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3
+    // CHECK-NEXT: pos[1] : ( 0, 5, 11, 16, 18
+    // CHECK-NEXT: crd[1] : ( 0, 2, 4, 7, 8, 1, 3, 4, 6, 7, 8, 0, 2, 4, 5, 7, 2, 5
+    // CHECK-NEXT: values : ( 1, 1, 1, 1, 1, 2, 0.5, 5, 3.5, 5, 0.5, 3, 1, 2, 1.5, 2, 1, 1
+    // CHECK-NEXT: ----
+    //
     %4 = call @concat_sparse_sparse_perm(%sm24ccp, %sm34cd, %sm44dc)
                : (tensor<2x4xf64, #MAT_C_C_P>, tensor<3x4xf64, #MAT_C_D>, tensor<4x4xf64, #MAT_D_C>) -> tensor<9x4xf64, #MAT_C_C_P>
-    call @dump_mat_perm_9x4(%4) : (tensor<9x4xf64, #MAT_C_C_P>) -> ()
+    sparse_tensor.print %4 : tensor<9x4xf64, #MAT_C_C_P>
 
     // CHECK:      {{\[}}[1,   0,   3,   0],
     // CHECK-NEXT:  [0,   2,   0,   0],
@@ -191,20 +168,21 @@ module {
                : (tensor<2x4xf64, #MAT_C_C_P>, tensor<3x4xf64, #MAT_C_D_P>, tensor<4x4xf64, #MAT_D_C>) -> tensor<9x4xf64>
     call @dump_mat_dense_9x4(%5) : (tensor<9x4xf64>) -> ()
 
-    // CHECK:      {{\[}}[1,   0,   3,   0],
-    // CHECK-NEXT:  [0,   2,   0,   0],
-    // CHECK-NEXT:  [1,   0,   1,   1],
-    // CHECK-NEXT:  [0,   0.5,   0,   0],
-    // CHECK-NEXT:  [1,   5,   2,   0],
-    // CHECK-NEXT:  [0,   0,   1.5,   1],
-    // CHECK-NEXT:  [0,   3.5,   0,   0],
-    // CHECK-NEXT:  [1,   5,   2,   0],
-    // CHECK-NEXT:  [1,   0.5,   0,   0]]
-    // CHECK-NEXT: 18
-    // CHECK:      [1,  3,  2,  1,  1,  1,  0.5,  1,  5,  2,  1.5,  1,  3.5,  1,  5,  2,  1,  0.5
+    //
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 18
+    // CHECK-NEXT: dim = ( 9, 4 )
+    // CHECK-NEXT: lvl = ( 9, 4 )
+    // CHECK-NEXT: pos[0] : ( 0, 9
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5, 6, 7, 8
+    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 6, 7, 10, 12, 13, 16, 18
+    // CHECK-NEXT: crd[1] : ( 0, 2, 1, 0, 2, 3, 1, 0, 1, 2, 2, 3, 1, 0, 1, 2, 0, 1
+    // CHECK-NEXT: values : ( 1, 3, 2, 1, 1, 1, 0.5, 1, 5, 2, 1.5, 1, 3.5, 1, 5, 2, 1, 0.5
+    // CHECK-NEXT: ----
+    //
     %6 = call @concat_mix_sparse_perm(%m24, %sm34cdp, %sm44dc)
                : (tensor<2x4xf64>, tensor<3x4xf64, #MAT_C_D_P>, tensor<4x4xf64, #MAT_D_C>) -> tensor<9x4xf64, #MAT_C_C>
-    call @dump_mat_9x4(%6) : (tensor<9x4xf64, #MAT_C_C>) -> ()
+    sparse_tensor.print %6 : tensor<9x4xf64, #MAT_C_C>
 
     // CHECK:      {{\[}}[1,   0,   3,   0],
     // CHECK-NEXT:  [0,   2,   0,   0],
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_1.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_1.mlir
index 48d3825700920..ae067bf18527b 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_1.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_1.mlir
@@ -10,7 +10,7 @@
 // DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
 // DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
 // DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
-// DEFINE: %{run_opts} = -e entry -entry-point-result=void
+// DEFINE: %{run_opts} = -e main -entry-point-result=void
 // DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
 // DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
 
@@ -82,20 +82,6 @@ module {
     return %0 : tensor<4x9xf64>
   }
 
-  func.func @dump_mat_4x9(%A: tensor<4x9xf64, #MAT_C_C>) {
-    %c = sparse_tensor.convert %A : tensor<4x9xf64, #MAT_C_C> to tensor<4x9xf64>
-    %cu = tensor.cast %c : tensor<4x9xf64> to tensor<*xf64>
-    call @printMemrefF64(%cu) : (tensor<*xf64>) -> ()
-
-    %n = sparse_tensor.number_of_entries %A : tensor<4x9xf64, #MAT_C_C>
-    vector.print %n : index
-
-    %1 = sparse_tensor.values %A : tensor<4x9xf64, #MAT_C_C> to memref<?xf64>
-    call @printMemref1dF64(%1) : (memref<?xf64>) -> ()
-
-    return
-  }
-
   func.func @dump_mat_dense_4x9(%A: tensor<4x9xf64>) {
     %1 = tensor.cast %A : tensor<4x9xf64> to tensor<*xf64>
     call @printMemrefF64(%1) : (tensor<*xf64>) -> ()
@@ -104,7 +90,7 @@ module {
   }
 
   // Driver method to call and verify kernels.
-  func.func @entry() {
+  func.func @main() {
     %m42 = arith.constant dense<
       [ [ 1.0, 0.0 ],
         [ 3.1, 0.0 ],
@@ -125,15 +111,21 @@ module {
     %sm43cd = sparse_tensor.convert %m43 : tensor<4x3xf64> to tensor<4x3xf64, #MAT_C_D>
     %sm44dc = sparse_tensor.convert %m44 : tensor<4x4xf64> to tensor<4x4xf64, #MAT_D_C>
 
-    // CHECK:      {{\[}}[1,   0,   1,   0,   1,   0,   0,   1.5,   1],
-    // CHECK-NEXT:  [3.1,   0,   1,   0,   0.5,   0,   3.5,   0,   0],
-    // CHECK-NEXT:  [0,   2,   0,   0,   1,   1,   5,   2,   0],
-    // CHECK-NEXT:  [0,   0,   5,   2,   0,   1,   0.5,   0,   0]]
-    // CHECK-NEXT: 18
-    // CHECK:      [1,  1,  1,  1.5,  1,  3.1,  1,  0.5,  3.5,  2,  1,  1,  5,  2,  5,  2,  1,  0.5
+    //
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 18
+    // CHECK-NEXT: dim = ( 4, 9 )
+    // CHECK-NEXT: lvl = ( 4, 9 )
+    // CHECK-NEXT: pos[0] : ( 0, 4
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3
+    // CHECK-NEXT: pos[1] : ( 0, 5, 9, 14, 18
+    // CHECK-NEXT: crd[1] : ( 0, 2, 4, 7, 8, 0, 2, 4, 6, 1, 4, 5, 6, 7, 2, 3, 5, 6
+    // CHECK-NEXT: values : ( 1, 1, 1, 1.5, 1, 3.1, 1, 0.5, 3.5, 2, 1, 1, 5, 2, 5, 2, 1, 0.5
+    // CHECK-NEXT: ----
+    //
     %8 = call @concat_sparse_sparse_dim1(%sm42cc, %sm43cd, %sm44dc)
                : (tensor<4x2xf64, #MAT_C_C>, tensor<4x3xf64, #MAT_C_D>, tensor<4x4xf64, #MAT_D_C>) -> tensor<4x9xf64, #MAT_C_C>
-    call @dump_mat_4x9(%8) : (tensor<4x9xf64, #MAT_C_C>) -> ()
+    sparse_tensor.print %8 : tensor<4x9xf64, #MAT_C_C>
 
     // CHECK:      {{\[}}[1,   0,   1,   0,   1,   0,   0,   1.5,   1],
     // CHECK-NEXT:  [3.1,   0,   1,   0,   0.5,   0,   3.5,   0,   0],
@@ -143,15 +135,21 @@ module {
                : (tensor<4x2xf64, #MAT_C_C>, tensor<4x3xf64, #MAT_C_D>, tensor<4x4xf64, #MAT_D_C>) -> tensor<4x9xf64>
     call @dump_mat_dense_4x9(%9) : (tensor<4x9xf64>) -> ()
 
-    // CHECK:      {{\[}}[1,   0,   1,   0,   1,   0,   0,   1.5,   1],
-    // CHECK-NEXT:  [3.1,   0,   1,   0,   0.5,   0,   3.5,   0,   0],
-    // CHECK-NEXT:  [0,   2,   0,   0,   1,   1,   5,   2,   0],
-    // CHECK-NEXT:  [0,   0,   5,   2,   0,   1,   0.5,   0,   0]]
-    // CHECK-NEXT: 18
-    // CHECK:      [1,  1,  1,  1.5,  1,  3.1,  1,  0.5,  3.5,  2,  1,  1,  5,  2,  5,  2,  1,  0.5
+    //
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 18
+    // CHECK-NEXT: dim = ( 4, 9 )
+    // CHECK-NEXT: lvl = ( 4, 9 )
+    // CHECK-NEXT: pos[0] : ( 0, 4
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3
+    // CHECK-NEXT: pos[1] : ( 0, 5, 9, 14, 18
+    // CHECK-NEXT: crd[1] : ( 0, 2, 4, 7, 8, 0, 2, 4, 6, 1, 4, 5, 6, 7, 2, 3, 5, 6
+    // CHECK-NEXT: values : ( 1, 1, 1, 1.5, 1, 3.1, 1, 0.5, 3.5, 2, 1, 1, 5, 2, 5, 2, 1, 0.5
+    // CHECK-NEXT: ----
+    //
     %10 = call @concat_mix_sparse_dim1(%m42, %sm43cd, %sm44dc)
                : (tensor<4x2xf64>, tensor<4x3xf64, #MAT_C_D>, tensor<4x4xf64, #MAT_D_C>) -> tensor<4x9xf64, #MAT_C_C>
-    call @dump_mat_4x9(%10) : (tensor<4x9xf64, #MAT_C_C>) -> ()
+    sparse_tensor.print %10 : tensor<4x9xf64, #MAT_C_C>
 
     // CHECK:      {{\[}}[1,   0,   1,   0,   1,   0,   0,   1.5,   1],
     // CHECK-NEXT:  [3.1,   0,   1,   0,   0.5,   0,   3.5,   0,   0],
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_1_permute.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_1_permute.mlir
index dcdaa072c02fd..ce746f27c4d88 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_1_permute.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/concatenate_dim_1_permute.mlir
@@ -10,7 +10,7 @@
 // DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
 // DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
 // DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
-// DEFINE: %{run_opts} = -e entry -entry-point-result=void
+// DEFINE: %{run_opts} = -e main -entry-point-result=void
 // DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
 // DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
 
@@ -85,34 +85,6 @@ module {
     return %0 : tensor<4x9xf64>
   }
 
-  func.func @dump_mat_4x9(%A: tensor<4x9xf64, #MAT_C_C>) {
-    %c = sparse_tensor.convert %A : tensor<4x9xf64, #MAT_C_C> to tensor<4x9xf64>
-    %cu = tensor.cast %c : tensor<4x9xf64> to tensor<*xf64>
-    call @printMemrefF64(%cu) : (tensor<*xf64>) -> ()
-
-    %n = sparse_tensor.number_of_entries %A : tensor<4x9xf64, #MAT_C_C>
-    vector.print %n : index
-
-    %1 = sparse_tensor.values %A : tensor<4x9xf64, #MAT_C_C> to memref<?xf64>
-    call @printMemref1dF64(%1) : (memref<?xf64>) -> ()
-
-    return
-  }
-
-  func.func @dump_mat_perm_4x9(%A: tensor<4x9xf64, #MAT_C_C_P>) {
-    %c = sparse_tensor.convert %A : tensor<4x9xf64, #MAT_C_C_P> to tensor<4x9xf64>
-    %cu = tensor.cast %c : tensor<4x9xf64> to tensor<*xf64>
-    call @printMemrefF64(%cu) : (tensor<*xf64>) -> ()
-
-    %n = sparse_tensor.number_of_entries %A : tensor<4x9xf64, #MAT_C_C_P>
-    vector.print %n : index
-
-    %1 = sparse_tensor.values %A : tensor<4x9xf64, #MAT_C_C_P> to memref<?xf64>
-    call @printMemref1dF64(%1) : (memref<?xf64>) -> ()
-
-    return
-  }
-
   func.func @dump_mat_dense_4x9(%A: tensor<4x9xf64>) {
     %1 = tensor.cast %A : tensor<4x9xf64> to tensor<*xf64>
     call @printMemrefF64(%1) : (tensor<*xf64>) -> ()
@@ -121,7 +93,7 @@ module {
   }
 
   // Driver method to call and verify kernels.
-  func.func @entry() {
+  func.func @main() {
     %m42 = arith.constant dense<
       [ [ 1.0, 0.0 ],
         [ 3.1, 0.0 ],
@@ -153,15 +125,21 @@ module {
     %sm43cdp = sparse_tensor.convert %m43 : tensor<4x3xf64> to tensor<4x3xf64, #MAT_C_D_P>
     %sm44dcp = sparse_tensor.convert %m44 : tensor<4x4xf64> to tensor<4x4xf64, #MAT_D_C_P>
 
-    // CHECK:      {{\[}}[1,   0,   1,   0,   1,   0,   0,   1.5,   1],
-    // CHECK-NEXT:  [3.1,   0,   1,   0,   0.5,   0,   3.5,   0,   0],
-    // CHECK-NEXT:  [0,   2,   0,   0,   1,   1,   5,   2,   0],
-    // CHECK-NEXT:  [0,   0,   5,   2,   0,   1,   0.5,   0,   0]]
-    // CHECK-NEXT: 18
-    // CHECK:      [1,  3.1,  2,  1,  1,  5,  2,  1,  0.5,  1,  1,  1,  3.5,  5,  0.5,  1.5,  2,  1
+    //
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 18
+    // CHECK-NEXT: dim = ( 4, 9 )
+    // CHECK-NEXT: lvl = ( 9, 4 )
+    // CHECK-NEXT: pos[0] : ( 0, 9
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5, 6, 7, 8
+    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 6, 7, 10, 12, 15, 17, 18
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 0, 1, 3, 3, 0, 1, 2, 2, 3, 1, 2, 3, 0, 2, 0
+    // CHECK-NEXT: values : ( 1, 3.1, 2, 1, 1, 5, 2, 1, 0.5, 1, 1, 1, 3.5, 5, 0.5, 1.5, 2, 1
+    // CHECK-NEXT: ----
+    //
     %12 = call @concat_sparse_sparse_perm_dim1(%sm42ccp, %sm43cd, %sm44dc)
                : (tensor<4x2xf64, #MAT_C_C_P>, tensor<4x3xf64, #MAT_C_D>, tensor<4x4xf64, #MAT_D_C>) -> tensor<4x9xf64, #MAT_C_C_P>
-    call @dump_mat_perm_4x9(%12) : (tensor<4x9xf64, #MAT_C_C_P>) -> ()
+    sparse_tensor.print %12 : tensor<4x9xf64, #MAT_C_C_P>
 
     // CHECK:      {{\[}}[1,   0,   1,   0,   1,   0,   0,   1.5,   1],
     // CHECK-NEXT:  [3.1,   0,   1,   0,   0.5,   0,   3.5,   0,   0],
@@ -171,15 +149,21 @@ module {
                : (tensor<4x2xf64, #MAT_C_C_P>, tensor<4x3xf64, #MAT_C_D_P>, tensor<4x4xf64, #MAT_D_C>) -> tensor<4x9xf64>
     call @dump_mat_dense_4x9(%13) : (tensor<4x9xf64>) -> ()
 
-    // CHECK:      {{\[}}[1,   0,   1,   0,   1,   0,   0,   1.5,   1],
-    // CHECK-NEXT:  [3.1,   0,   1,   0,   0.5,   0,   3.5,   0,   0],
-    // CHECK-NEXT:  [0,   2,   0,   0,   1,   1,   5,   2,   0],
-    // CHECK-NEXT:  [0,   0,   5,   2,   0,   1,   0.5,   0,   0]]
-    // CHECK-NEXT: 18
-    // CHECK:      [1,  1,  1,  1.5,  1,  3.1,  1,  0.5,  3.5,  2,  1,  1,  5,  2,  5,  2,  1,  0.5
+    //
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 18
+    // CHECK-NEXT: dim = ( 4, 9 )
+    // CHECK-NEXT: lvl = ( 4, 9 )
+    // CHECK-NEXT: pos[0] : ( 0, 4
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3
+    // CHECK-NEXT: pos[1] : ( 0, 5, 9, 14, 18
+    // CHECK-NEXT: crd[1] : ( 0, 2, 4, 7, 8, 0, 2, 4, 6, 1, 4, 5, 6, 7, 2, 3, 5, 6
+    // CHECK-NEXT: values : ( 1, 1, 1, 1.5, 1, 3.1, 1, 0.5, 3.5, 2, 1, 1, 5, 2, 5, 2, 1, 0.5
+    // CHECK-NEXT: ----
+    //
     %14 = call @concat_mix_sparse_perm_dim1(%m42, %sm43cdp, %sm44dc)
                : (tensor<4x2xf64>, tensor<4x3xf64, #MAT_C_D_P>, tensor<4x4xf64, #MAT_D_C>) -> tensor<4x9xf64, #MAT_C_C>
-    call @dump_mat_4x9(%14) : (tensor<4x9xf64, #MAT_C_C>) -> ()
+    sparse_tensor.print %14 : tensor<4x9xf64, #MAT_C_C>
 
     // CHECK:      {{\[}}[1,   0,   1,   0,   1,   0,   0,   1.5,   1],
     // CHECK-NEXT:  [3.1,   0,   1,   0,   0.5,   0,   3.5,   0,   0],
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/dual_sparse_conv_2d.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/dual_sparse_conv_2d.mlir
index 6c35e2b51ed8f..350b5b41dafc0 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/dual_sparse_conv_2d.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/dual_sparse_conv_2d.mlir
@@ -10,7 +10,7 @@
 // DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
 // DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
 // DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
-// DEFINE: %{run_opts} = -e entry -entry-point-result=void
+// DEFINE: %{run_opts} = -e main -entry-point-result=void
 // DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
 // DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
 //
@@ -85,7 +85,7 @@ module {
     return %0 : tensor<6x6xi32, #CSC>
   }
 
-  func.func @entry() {
+  func.func @main() {
     %c0 = arith.constant 0 : index
     %i0 = arith.constant 0 : i32
 
@@ -141,7 +141,6 @@ module {
        : (tensor<8x8xi32, #CSC>,
           tensor<3x3xi32, #CSC>) -> tensor<6x6xi32, #CSC>
 
-
     // Verify the output.
     //
     // CHECK:    ( ( 0, 0, -1, -6, -1, 6 ),
@@ -156,64 +155,62 @@ module {
     vector.print %v : vector<6x6xi32>
 
     //
-    // Should be the same as dense output
-    // CHECK:    ( ( 0, 0, -1, -6, -1, 6 ),
-    // CHECK-SAME: ( -1, 0, 1, 0, 1, 0 ),
-    // CHECK-SAME: ( 0, -1, 1, 0, 0, 0 ),
-    // CHECK-SAME: ( -1, 0, 0, 0, 0, 0 ),
-    // CHECK-SAME: ( 0, 0, 3, 6, -3, -6 ),
-    // CHECK-SAME: ( 2, -1, 3, 0, -3, 0 ) )
+    // Should be the same as dense output.
     //
-    %all_sparse_DCSR = sparse_tensor.convert %2
-      : tensor<6x6xi32, #DCSR> to tensor<6x6xi32>
-    %v2 = vector.transfer_read %all_sparse_DCSR[%c0, %c0], %i0
-      : tensor<6x6xi32>, vector<6x6xi32>
-    vector.print %v2 : vector<6x6xi32>
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 36
+    // CHECK-NEXT: dim = ( 6, 6 )
+    // CHECK-NEXT: lvl = ( 6, 6 )
+    // CHECK-NEXT: pos[0] : ( 0, 6
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5
+    // CHECK-NEXT: pos[1] : ( 0, 6, 12, 18, 24, 30, 36
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
+    // CHECK-NEXT: values : ( 0, 0, -1, -6, -1, 6, -1, 0, 1, 0, 1, 0, 0, -1, 1, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 3, 6, -3, -6, 2, -1, 3, 0, -3, 0
+    // CHECK-NEXT: ----
+    //
+    sparse_tensor.print %2 : tensor<6x6xi32, #DCSR>
 
     //
-    // Should be the same as dense output
-    // CHECK:    ( ( 0, 0, -1, -6, -1, 6 ),
-    // CHECK-SAME: ( -1, 0, 1, 0, 1, 0 ),
-    // CHECK-SAME: ( 0, -1, 1, 0, 0, 0 ),
-    // CHECK-SAME: ( -1, 0, 0, 0, 0, 0 ),
-    // CHECK-SAME: ( 0, 0, 3, 6, -3, -6 ),
-    // CHECK-SAME: ( 2, -1, 3, 0, -3, 0 ) )
+    // Should be the same as dense output.
     //
-    %all_sparse_CD = sparse_tensor.convert %4
-      : tensor<6x6xi32, #CDR> to tensor<6x6xi32>
-    %v4 = vector.transfer_read %all_sparse_CD[%c0, %c0], %i0
-      : tensor<6x6xi32>, vector<6x6xi32>
-    vector.print %v4 : vector<6x6xi32>
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 36
+    // CHECK-NEXT: dim = ( 6, 6 )
+    // CHECK-NEXT: lvl = ( 6, 6 )
+    // CHECK-NEXT: pos[1] : ( 0, 6, 12, 18, 24, 30, 36
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
+    // CHECK-NEXT: values : ( 0, 0, -1, -6, -1, 6, -1, 0, 1, 0, 1, 0, 0, -1, 1, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 3, 6, -3, -6, 2, -1, 3, 0, -3, 0
+    // CHECK-NEXT: ----
+    //
+    sparse_tensor.print %3 : tensor<6x6xi32, #CSR>
 
     //
-    // Should be the same as dense output
-    // CHECK:    ( ( 0, 0, -1, -6, -1, 6 ),
-    // CHECK-SAME: ( -1, 0, 1, 0, 1, 0 ),
-    // CHECK-SAME: ( 0, -1, 1, 0, 0, 0 ),
-    // CHECK-SAME: ( -1, 0, 0, 0, 0, 0 ),
-    // CHECK-SAME: ( 0, 0, 3, 6, -3, -6 ),
-    // CHECK-SAME: ( 2, -1, 3, 0, -3, 0 ) )
+    // Should be the same as dense output.
     //
-    %all_sparse_CSR = sparse_tensor.convert %3
-      : tensor<6x6xi32, #CSR> to tensor<6x6xi32>
-    %v3 = vector.transfer_read %all_sparse_CSR[%c0, %c0], %i0
-      : tensor<6x6xi32>, vector<6x6xi32>
-    vector.print %v3 : vector<6x6xi32>
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 36
+    // CHECK-NEXT: dim = ( 6, 6 )
+    // CHECK-NEXT: lvl = ( 6, 6 )
+    // CHECK-NEXT: pos[0] : ( 0, 6
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5
+    // CHECK-NEXT: values : ( 0, 0, -1, -6, -1, 6, -1, 0, 1, 0, 1, 0, 0, -1, 1, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 3, 6, -3, -6, 2, -1, 3, 0, -3, 0
+    // CHECK-NEXT: ----
+    //
+    sparse_tensor.print %4 : tensor<6x6xi32, #CDR>
 
     //
-    // Should be the same as dense output
-    // CHECK:    ( ( 0, 0, -1, -6, -1, 6 ),
-    // CHECK-SAME: ( -1, 0, 1, 0, 1, 0 ),
-    // CHECK-SAME: ( 0, -1, 1, 0, 0, 0 ),
-    // CHECK-SAME: ( -1, 0, 0, 0, 0, 0 ),
-    // CHECK-SAME: ( 0, 0, 3, 6, -3, -6 ),
-    // CHECK-SAME: ( 2, -1, 3, 0, -3, 0 ) )
+    // Should be the same as dense output.
     //
-    %all_sparse_CSC = sparse_tensor.convert %5
-      : tensor<6x6xi32, #CSC> to tensor<6x6xi32>
-    %v5 = vector.transfer_read %all_sparse_CSC[%c0, %c0], %i0
-      : tensor<6x6xi32>, vector<6x6xi32>
-    vector.print %v5 : vector<6x6xi32>
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 36
+    // CHECK-NEXT: dim = ( 6, 6 )
+    // CHECK-NEXT: lvl = ( 6, 6 )
+    // CHECK-NEXT: pos[1] : ( 0, 6, 12, 18, 24, 30, 36
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
+    // CHECK-NEXT: values : ( 0, -1, 0, -1, 0, 2, 0, 0, -1, 0, 0, -1, -1, 1, 1, 0, 3, 3, -6, 0, 0, 0, 6, 0, -1, 1, 0, 0, -3, -3, 6, 0, 0, 0, -6, 0
+    // CHECK-NEXT: ----
+    //
+    sparse_tensor.print %5 : tensor<6x6xi32, #CSC>
 
     // Release the resources.
     bufferization.dealloc_tensor %sparse_input_DCSR : tensor<8x8xi32, #DCSR>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/reshape_dot.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/reshape_dot.mlir
index 689428c23f7d7..ebf9f4392d859 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/reshape_dot.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/reshape_dot.mlir
@@ -10,7 +10,7 @@
 // DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
 // DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
 // DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
-// DEFINE: %{run_opts} = -e entry -entry-point-result=void
+// DEFINE: %{run_opts} = -e main -entry-point-result=void
 // DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
 // DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
 //
@@ -84,7 +84,7 @@ module {
   }
 
 
-  func.func @entry() {
+  func.func @main() {
     // Setup two sparse vectors.
     %d1 = arith.constant sparse<
         [ [0, 0], [1, 1], [2, 2], [2, 3], [4, 5] ],
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block3d.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block3d.mlir
index 024e86b4f165b..2ff73923c8327 100755
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block3d.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block3d.mlir
@@ -90,28 +90,38 @@ module {
     // ending at index (3,3,2)) with a “DCSR-flavored” along (j,k) with
     // dense “fibers” in the i-dim, we end up with 8 stored entries.
     //
-    // CHECK: 8
-    // CHECK-NEXT: ( 1, 2, 3, 4, 5, 6, 7, 8 )
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 8
+    // CHECK-NEXT: dim = ( 4, 4, 4 )
+    // CHECK-NEXT: lvl = ( 4, 4, 4 )
+    // CHECK-NEXT: pos[0] : ( 0, 2
+    // CHECK-NEXT: crd[0] : ( 0, 3
+    // CHECK-NEXT: pos[1] : ( 0, 1, 2
+    // CHECK-NEXT: crd[1] : ( 0, 2
+    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8
+    // CHECK-NEXT: ----
     //
-    %na = sparse_tensor.number_of_entries %a : tensor<4x4x4xi32, #Sparse1>
-    vector.print %na : index
-    %ma = sparse_tensor.values %a: tensor<4x4x4xi32, #Sparse1> to memref<?xi32>
-    %va = vector.transfer_read %ma[%c0], %i0: memref<?xi32>, vector<8xi32>
-    vector.print %va : vector<8xi32>
+    sparse_tensor.print %a : tensor<4x4x4xi32, #Sparse1>
 
     //
     // If we store full 2x2x2 3-D blocks in the original index order
     // in a compressed fashion, we end up with 4 blocks to incorporate
     // all the nonzeros, and thus 32 stored entries.
     //
-    // CHECK: 32
-    // CHECK-NEXT: ( 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 5, 0, 0, 0, 6, 0, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 7, 0, 0, 0, 8, 0 )
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 32
+    // CHECK-NEXT: dim = ( 4, 4, 4 )
+    // CHECK-NEXT: lvl = ( 2, 2, 2, 2, 2, 2 )
+    // CHECK-NEXT: pos[0] : ( 0, 2
+    // CHECK-NEXT: crd[0] : ( 0, 1
+    // CHECK-NEXT: pos[1] : ( 0, 2, 4
+    // CHECK-NEXT: crd[1] : ( 0, 1, 0, 1
+    // CHECK-NEXT: pos[2] : ( 0, 1, 2, 3, 4
+    // CHECK-NEXT: crd[2] : ( 0, 1, 0, 1
+    // CHECK-NEXT: values : ( 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 5, 0, 0, 0, 6, 0, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 7, 0, 0, 0, 8, 0
+    // CHECK-NEXT: ----
     //
-    %nb = sparse_tensor.number_of_entries %b : tensor<4x4x4xi32, #Sparse2>
-    vector.print %nb : index
-    %mb = sparse_tensor.values %b: tensor<4x4x4xi32, #Sparse2> to memref<?xi32>
-    %vb = vector.transfer_read %mb[%c0], %i0: memref<?xi32>, vector<32xi32>
-    vector.print %vb : vector<32xi32>
+    sparse_tensor.print %b : tensor<4x4x4xi32, #Sparse2>
 
     // Release the resources.
     bufferization.dealloc_tensor %a : tensor<4x4x4xi32, #Sparse1>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_cast.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_cast.mlir
index 6efe7b334b984..3b5168db23c58 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_cast.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_cast.mlir
@@ -10,7 +10,7 @@
 // DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
 // DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
 // DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
-// DEFINE: %{run_opts} = -e entry -entry-point-result=void
+// DEFINE: %{run_opts} = -e main -entry-point-result=void
 // DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
 // DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
 //
@@ -178,7 +178,7 @@ module {
   // Main driver that converts a dense tensor into a sparse tensor
   // and then calls the sparse casting kernel.
   //
-  func.func @entry() {
+  func.func @main() {
     %z = arith.constant 0 : index
     %b = arith.constant 0 : i8
     %i = arith.constant 0 : i32
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_cmp.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_cmp.mlir
index 035db33fb4b31..732bde55be91f 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_cmp.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_cmp.mlir
@@ -10,7 +10,7 @@
 // DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
 // DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
 // DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
-// DEFINE: %{run_opts} = -e entry -entry-point-result=void
+// DEFINE: %{run_opts} = -e main -entry-point-result=void
 // DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
 // DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
 //
@@ -96,7 +96,7 @@ module {
   // Main driver that constructs matrix and calls the sparse kernel to perform
   // element-wise comparison.
   //
-  func.func @entry() {
+  func.func @main() {
     %d0 = arith.constant 0 : i8
     %c0 = arith.constant 0 : index
 
@@ -124,33 +124,44 @@ module {
             : (tensor<4x4xf64, #DCSR>, tensor<4x4xf64, #DCSR>) -> tensor<4x4xi8, #DCSR>
 
     //
-    // All should have the same result.
+    // All should have the same boolean values.
+    //
+    // CHECK: ( ( 0, 1, 0, 1 ), ( 1, 0, 0, 0 ), ( 1, 0, 0, 1 ), ( 0, 0, 0, 0 ) )
+    //
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 16
+    // CHECK-NEXT: dim = ( 4, 4 )
+    // CHECK-NEXT: lvl = ( 4, 4 )
+    // CHECK-NEXT: pos[0] : ( 0, 4
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3
+    // CHECK-NEXT: pos[1] : ( 0, 4, 8, 12, 16
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+    // CHECK-NEXT: values : ( 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0
+    // CHECK-NEXT: ----
+    //
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 11
+    // CHECK-NEXT: dim = ( 4, 4 )
+    // CHECK-NEXT: lvl = ( 4, 4 )
+    // CHECK-NEXT: pos[0] : ( 0, 4
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3
+    // CHECK-NEXT: pos[1] : ( 0, 3, 5, 9, 11
+    // CHECK-NEXT: crd[1] : ( 1, 2, 3, 0, 1, 0, 1, 2, 3, 0, 1
+    // CHECK-NEXT: values : ( 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0
+    // CHECK-NEXT: ----
     //
-    // CHECK-COUNT-3: ( ( 0, 1, 0, 1 ), ( 1, 0, 0, 0 ), ( 1, 0, 0, 1 ), ( 0, 0, 0, 0 ) )
     %v = vector.transfer_read %all_dn_out[%c0, %c0], %d0
        : tensor<4x4xi8>, vector<4x4xi8>
     vector.print %v : vector<4x4xi8>
-
-    %lhs_sp_ret = sparse_tensor.convert %lhs_sp_out
-      : tensor<4x4xi8, #DCSR> to tensor<4x4xi8>
-    %v1 = vector.transfer_read %lhs_sp_ret[%c0, %c0], %d0
-      : tensor<4x4xi8>, vector<4x4xi8>
-    vector.print %v1 : vector<4x4xi8>
-
-    %rhs_sp_ret = sparse_tensor.convert %all_sp_out
-      : tensor<4x4xi8, #DCSR> to tensor<4x4xi8>
-    %v2 = vector.transfer_read %rhs_sp_ret[%c0, %c0], %d0
-      : tensor<4x4xi8>, vector<4x4xi8>
-    vector.print %v2 : vector<4x4xi8>
-
+    sparse_tensor.print %lhs_sp_out : tensor<4x4xi8, #DCSR>
+    sparse_tensor.print %all_sp_out : tensor<4x4xi8, #DCSR>
 
     bufferization.dealloc_tensor %lhs_sp : tensor<4x4xf64, #DCSR>
     bufferization.dealloc_tensor %rhs_sp : tensor<4x4xf64, #DCSR>
     bufferization.dealloc_tensor %all_dn_out : tensor<4x4xi8>
     bufferization.dealloc_tensor %lhs_sp_out : tensor<4x4xi8, #DCSR>
     bufferization.dealloc_tensor %all_sp_out : tensor<4x4xi8, #DCSR>
-    bufferization.dealloc_tensor %lhs_sp_ret : tensor<4x4xi8>
-    bufferization.dealloc_tensor %rhs_sp_ret : tensor<4x4xi8>
+
     return
   }
 }
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_codegen_dim.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_codegen_dim.mlir
index 7925759714edd..c5d002aa16391 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_codegen_dim.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_codegen_dim.mlir
@@ -10,7 +10,7 @@
 // DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
 // DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
 // DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
-// DEFINE: %{run_opts} = -e entry -entry-point-result=void
+// DEFINE: %{run_opts} = -e main -entry-point-result=void
 // DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
 // DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
 //
@@ -38,7 +38,7 @@ module {
   //
   // Main driver.
   //
-  func.func @entry() {
+  func.func @main() {
     %c0 = arith.constant 0 : index
     %c1 = arith.constant 1 : index
     %c2 = arith.constant 2 : index
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_codegen_foreach.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_codegen_foreach.mlir
index 002a79055ce55..9deb5cd05fa3b 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_codegen_foreach.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_codegen_foreach.mlir
@@ -10,7 +10,7 @@
 // DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
 // DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
 // DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
-// DEFINE: %{run_opts} = -e entry -entry-point-result=void
+// DEFINE: %{run_opts} = -e main -entry-point-result=void
 // DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
 // DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
 //
@@ -144,7 +144,7 @@ module {
   //
   // Main driver.
   //
-  func.func @entry() {
+  func.func @main() {
     //
     // Initialize a 3-dim dense tensor.
     //
@@ -166,6 +166,7 @@ module {
     %s4 = sparse_tensor.convert %src : tensor<2x2xf64> to tensor<2x2xf64, #SortedCOO>
     %s5 = sparse_tensor.convert %src : tensor<2x2xf64> to tensor<2x2xf64, #SortedCOOPerm>
     %s6 = sparse_tensor.convert %src3d : tensor<7x8x9xf64>  to tensor<7x8x9xf64, #CCCPerm>
+
     // CHECK: 0
     // CHECK-NEXT: 0
     // CHECK-NEXT: 1
@@ -173,6 +174,7 @@ module {
     // CHECK-NEXT: 6
     // CHECK-NEXT: 5
     call @foreach_print_const() : () -> ()
+
     // CHECK-NEXT: 0
     // CHECK-NEXT: 0
     // CHECK-NEXT: 1
@@ -186,6 +188,7 @@ module {
     // CHECK-NEXT: 1
     // CHECK-NEXT: 6
     call @foreach_print_dense(%src) : (tensor<2x2xf64>) -> ()
+
     // CHECK-NEXT: 0
     // CHECK-NEXT: 0
     // CHECK-NEXT: 1
@@ -199,6 +202,7 @@ module {
     // CHECK-NEXT: 1
     // CHECK-NEXT: 6
     call @foreach_print_1(%s1) : (tensor<2x2xf64, #Row>) -> ()
+
     // CHECK-NEXT: 0
     // CHECK-NEXT: 0
     // CHECK-NEXT: 1
@@ -212,6 +216,7 @@ module {
     // CHECK-NEXT: 1
     // CHECK-NEXT: 6
     call @foreach_print_2(%s2) : (tensor<2x2xf64, #CSR>) -> ()
+
     // CHECK-NEXT: 0
     // CHECK-NEXT: 0
     // CHECK-NEXT: 1
@@ -225,6 +230,7 @@ module {
     // CHECK-NEXT: 1
     // CHECK-NEXT: 6
     call @foreach_print_3(%s3) : (tensor<2x2xf64, #DCSC>) -> ()
+
     // CHECK-NEXT: 0
     // CHECK-NEXT: 0
     // CHECK-NEXT: 1
@@ -238,6 +244,7 @@ module {
     // CHECK-NEXT: 1
     // CHECK-NEXT: 6
     call @foreach_print_4(%s4) : (tensor<2x2xf64, #SortedCOO>) -> ()
+
     // CHECK-NEXT: 0
     // CHECK-NEXT: 0
     // CHECK-NEXT: 1
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_collapse_shape.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_collapse_shape.mlir
index 2b5155464f0ee..cae599fa30ae2 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_collapse_shape.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_collapse_shape.mlir
@@ -10,7 +10,7 @@
 // DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
 // DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
 // DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
-// DEFINE: %{run_opts} = -e entry -entry-point-result=void
+// DEFINE: %{run_opts} = -e main -entry-point-result=void
 // DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
 // DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
 //
@@ -115,7 +115,7 @@ module {
   //
   // Main driver.
   //
-  func.func @entry() {
+  func.func @main() {
     %c0 = arith.constant 0 : index
     %df = arith.constant -1.0 : f64
 
@@ -157,69 +157,95 @@ module {
     //
     // CHECK:      ( 1.1, 0, 1.3, 0, 2.1, 0, 2.3, 0, 3.1, 0, 3.3, 0 )
     // CHECK-NEXT: ( 1.1, 0, 1.3, 0, 2.1, 0, 2.3, 0, 3.1, 0, 3.3, 0 )
-    // CHECK-NEXT: ( 1.1, 1.3, 2.1, 2.3, 3.1, 3.3
-    // CHECK-NEXT: ( 1.1, 1.3, 2.1, 2.3, 3.1, 3.3
-    // CHECK-NEXT: ( ( 1, 0, 3, 0, 5, 0, 7, 0, 9, 0 ),
-    // CHECK-SAME:   ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ),
-    // CHECK-SAME:   ( 21, 0, 23, 0, 25, 0, 27, 0, 29, 0 ),
-    // CHECK-SAME:   ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ),
-    // CHECK-SAME:   ( 41, 0, 43, 0, 45, 0, 47, 0, 49, 0 ),
-    // CHECK-SAME:   ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ) )
-    // CHECK-NEXT: ( ( 1, 0, 3, 0, 5, 0, 7, 0, 9, 0 ),
-    // CHECK-SAME:   ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ),
-    // CHECK-SAME:   ( 21, 0, 23, 0, 25, 0, 27, 0, 29, 0 ),
-    // CHECK-SAME:   ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ),
-    // CHECK-SAME:   ( 41, 0, 43, 0, 45, 0, 47, 0, 49, 0 ),
-    // CHECK-SAME:   ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ) )
-    // CHECK-NEXT: ( 1, 3, 5, 7, 9, 21, 23, 25, 27, 29, 41, 43, 45, 47
-    // CHECK-NEXT: ( 1, 3, 5, 7, 9, 21, 23, 25, 27, 29, 41, 43, 45, 47
-    // CHECK-NEXT: ( ( 1, 0, 3, 0, 5, 0, 7, 0, 9, 0 ),
-    // CHECK-SAME:   ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ),
-    // CHECK-SAME:   ( 21, 0, 23, 0, 25, 0, 27, 0, 29, 0 ),
-    // CHECK-SAME:   ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ),
-    // CHECK-SAME:   ( 41, 0, 43, 0, 45, 0, 47, 0, 49, 0 ),
-    // CHECK-SAME:   ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ) )
-    // CHECK-NEXT: ( ( 1, 0, 3, 0, 5, 0, 7, 0, 9, 0 ),
-    // CHECK-SAME:   ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ),
-    // CHECK-SAME:   ( 21, 0, 23, 0, 25, 0, 27, 0, 29, 0 ),
-    // CHECK-SAME:   ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ),
-    // CHECK-SAME:   ( 41, 0, 43, 0, 45, 0, 47, 0, 49, 0 ),
-    // CHECK-SAME:   ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ) )
-    // CHECK-NEXT: ( 1, 3, 5, 7, 9, 21, 23, 25, 27, 29, 41, 43, 45, 47, 49
-    // CHECK-NEXT: ( 1, 3, 5, 7, 9, 21, 23, 25, 27, 29, 41, 43, 45, 47, 49
-
+    //
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 6
+    // CHECK-NEXT: dim = ( 12 )
+    // CHECK-NEXT: lvl = ( 12 )
+    // CHECK-NEXT: pos[0] : ( 0, 6
+    // CHECK-NEXT: crd[0] : ( 0, 2, 4, 6, 8, 10
+    // CHECK-NEXT: values : ( 1.1, 1.3, 2.1, 2.3, 3.1, 3.3
+    // CHECK-NEXT: ----
+    //
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 6
+    // CHECK-NEXT: dim = ( 12 )
+    // CHECK-NEXT: lvl = ( 12 )
+    // CHECK-NEXT: pos[0] : ( 0, 6
+    // CHECK-NEXT: crd[0] : ( 0, 2, 4, 6, 8, 10
+    // CHECK-NEXT: values : ( 1.1, 1.3, 2.1, 2.3, 3.1, 3.3
+    // CHECK-NEXT: ----
+    //
+    // CHECK:      ( ( 1, 0, 3, 0, 5, 0, 7, 0, 9, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ), ( 21, 0, 23, 0, 25, 0, 27, 0, 29, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ), ( 41, 0, 43, 0, 45, 0, 47, 0, 49, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ) )
+    // CHECK-NEXT: ( ( 1, 0, 3, 0, 5, 0, 7, 0, 9, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ), ( 21, 0, 23, 0, 25, 0, 27, 0, 29, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ), ( 41, 0, 43, 0, 45, 0, 47, 0, 49, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ) )
+    //
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 15
+    // CHECK-NEXT: dim = ( 6, 10 )
+    // CHECK-NEXT: lvl = ( 6, 10 )
+    // CHECK-NEXT: pos[0] : ( 0, 3
+    // CHECK-NEXT: crd[0] : ( 0, 2, 4
+    // CHECK-NEXT: pos[1] : ( 0, 5, 10, 15
+    // CHECK-NEXT: crd[1] : ( 0, 2, 4, 6, 8, 0, 2, 4, 6, 8, 0, 2, 4, 6, 8
+    // CHECK-NEXT: values : ( 1, 3, 5, 7, 9, 21, 23, 25, 27, 29, 41, 43, 45, 47, 49
+    // CHECK-NEXT: ----
+    //
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 15
+    // CHECK-NEXT: dim = ( 6, 10 )
+    // CHECK-NEXT: lvl = ( 6, 10 )
+    // CHECK-NEXT: pos[0] : ( 0, 3
+    // CHECK-NEXT: crd[0] : ( 0, 2, 4
+    // CHECK-NEXT: pos[1] : ( 0, 5, 10, 15
+    // CHECK-NEXT: crd[1] : ( 0, 2, 4, 6, 8, 0, 2, 4, 6, 8, 0, 2, 4, 6, 8
+    // CHECK-NEXT: values : ( 1, 3, 5, 7, 9, 21, 23, 25, 27, 29, 41, 43, 45, 47, 49
+    // CHECK-NEXT: ----
+    //
+    // CHECK:      ( ( 1, 0, 3, 0, 5, 0, 7, 0, 9, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ), ( 21, 0, 23, 0, 25, 0, 27, 0, 29, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ), ( 41, 0, 43, 0, 45, 0, 47, 0, 49, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ) )
+    // CHECK-NEXT: ( ( 1, 0, 3, 0, 5, 0, 7, 0, 9, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ), ( 21, 0, 23, 0, 25, 0, 27, 0, 29, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ), ( 41, 0, 43, 0, 45, 0, 47, 0, 49, 0 ), ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ) )
+    //
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 15
+    // CHECK-NEXT: dim = ( 6, 10 )
+    // CHECK-NEXT: lvl = ( 6, 10 )
+    // CHECK-NEXT: pos[0] : ( 0, 3
+    // CHECK-NEXT: crd[0] : ( 0, 2, 4
+    // CHECK-NEXT: pos[1] : ( 0, 5, 10, 15
+    // CHECK-NEXT: crd[1] : ( 0, 2, 4, 6, 8, 0, 2, 4, 6, 8, 0, 2, 4, 6, 8
+    // CHECK-NEXT: values : ( 1, 3, 5, 7, 9, 21, 23, 25, 27, 29, 41, 43, 45, 47, 49
+    // CHECK-NEXT: ----
+    //
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 15
+    // CHECK-NEXT: dim = ( 6, 10 )
+    // CHECK-NEXT: lvl = ( 6, 10 )
+    // CHECK-NEXT: pos[0] : ( 0, 3
+    // CHECK-NEXT: crd[0] : ( 0, 2, 4
+    // CHECK-NEXT: pos[1] : ( 0, 5, 10, 15
+    // CHECK-NEXT: crd[1] : ( 0, 2, 4, 6, 8, 0, 2, 4, 6, 8, 0, 2, 4, 6, 8
+    // CHECK-NEXT: values : ( 1, 3, 5, 7, 9, 21, 23, 25, 27, 29, 41, 43, 45, 47, 49
+    // CHECK-NEXT: ----
+    //
     %v0 = vector.transfer_read %collapse0[%c0], %df: tensor<12xf64>, vector<12xf64>
     vector.print %v0 : vector<12xf64>
     %v1 = vector.transfer_read %collapse1[%c0], %df: tensor<12xf64>, vector<12xf64>
     vector.print %v1 : vector<12xf64>
-    %b2 = sparse_tensor.values %collapse2 : tensor<12xf64, #SparseVector> to memref<?xf64>
-    %v2 = vector.transfer_read %b2[%c0], %df: memref<?xf64>, vector<12xf64>
-    vector.print %v2 : vector<12xf64>
-    %b3 = sparse_tensor.values %collapse3 : tensor<12xf64, #SparseVector> to memref<?xf64>
-    %v3 = vector.transfer_read %b3[%c0], %df: memref<?xf64>, vector<12xf64>
-    vector.print %v3 : vector<12xf64>
+    sparse_tensor.print %collapse2 : tensor<12xf64, #SparseVector>
+    sparse_tensor.print %collapse3 : tensor<12xf64, #SparseVector>
 
     %v4 = vector.transfer_read %collapse4[%c0, %c0], %df: tensor<6x10xf64>, vector<6x10xf64>
     vector.print %v4 : vector<6x10xf64>
     %v5 = vector.transfer_read %collapse5[%c0, %c0], %df: tensor<6x10xf64>, vector<6x10xf64>
     vector.print %v5 : vector<6x10xf64>
-    %b6 = sparse_tensor.values %collapse6 : tensor<6x10xf64, #SparseMatrix> to memref<?xf64>
-    %v6 = vector.transfer_read %b6[%c0], %df: memref<?xf64>, vector<60xf64>
-    vector.print %v6 : vector<60xf64>
-    %b7 = sparse_tensor.values %collapse7 : tensor<6x10xf64, #SparseMatrix> to memref<?xf64>
-    %v7 = vector.transfer_read %b7[%c0], %df: memref<?xf64>, vector<60xf64>
-    vector.print %v7 : vector<60xf64>
+    sparse_tensor.print %collapse6 : tensor<6x10xf64, #SparseMatrix>
+    sparse_tensor.print %collapse7 : tensor<6x10xf64, #SparseMatrix>
 
     %v8 = vector.transfer_read %collapse8[%c0, %c0], %df: tensor<?x?xf64>, vector<6x10xf64>
     vector.print %v8 : vector<6x10xf64>
     %v9 = vector.transfer_read %collapse9[%c0, %c0], %df: tensor<?x?xf64>, vector<6x10xf64>
     vector.print %v9 : vector<6x10xf64>
-    %b10 = sparse_tensor.values %collapse10 : tensor<?x?xf64, #SparseMatrix> to memref<?xf64>
-    %v10 = vector.transfer_read %b10[%c0], %df: memref<?xf64>, vector<60xf64>
-    vector.print %v10 : vector<60xf64>
-    %b11 = sparse_tensor.values %collapse11 : tensor<?x?xf64, #SparseMatrix> to memref<?xf64>
-    %v11 = vector.transfer_read %b11[%c0], %df: memref<?xf64>, vector<60xf64>
-    vector.print %v11 : vector<60xf64>
+    sparse_tensor.print %collapse10 : tensor<?x?xf64, #SparseMatrix>
+    sparse_tensor.print %collapse11 : tensor<?x?xf64, #SparseMatrix>
 
     // Release sparse resources.
     bufferization.dealloc_tensor %sm : tensor<3x4xf64, #SparseMatrix>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_constant_to_sparse_tensor.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_constant_to_sparse_tensor.mlir
index b5efdcc09a390..abdbf80d0bc41 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_constant_to_sparse_tensor.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_constant_to_sparse_tensor.mlir
@@ -10,7 +10,7 @@
 // DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
 // DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
 // DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
-// DEFINE: %{run_opts} = -e entry -entry-point-result=void
+// DEFINE: %{run_opts} = -e main -entry-point-result=void
 // DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
 // DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
 //
@@ -38,7 +38,7 @@
 // Integration tests for conversions from sparse constants to sparse tensors.
 //
 module {
-  func.func @entry() {
+  func.func @main() {
     %c0 = arith.constant 0 : index
     %c1 = arith.constant 1 : index
     %c2 = arith.constant 2 : index
@@ -51,20 +51,19 @@ module {
     // Convert the tensor in COO format to a sparse tensor with annotation #Tensor1.
     %ts = sparse_tensor.convert %ti : tensor<10x8xf64> to tensor<10x8xf64, #Tensor1>
 
-    // CHECK: ( 0, 1, 4, 5, 6, 9 )
-    %i0 = sparse_tensor.coordinates %ts { level = 0 : index } : tensor<10x8xf64, #Tensor1> to memref<?xindex>
-    %i0r = vector.transfer_read %i0[%c0], %c0: memref<?xindex>, vector<6xindex>
-    vector.print %i0r : vector<6xindex>
-
-    // CHECK: ( 0, 7, 2, 2, 3, 4, 6, 7 )
-    %i1 = sparse_tensor.coordinates %ts { level = 1 : index } : tensor<10x8xf64, #Tensor1> to memref<?xindex>
-    %i1r = vector.transfer_read %i1[%c0], %c0: memref<?xindex>, vector<8xindex>
-    vector.print %i1r : vector<8xindex>
-
-    // CHECK: ( 1, 2, 3, 4, 5, 6, 7, 8 )
-    %v = sparse_tensor.values %ts : tensor<10x8xf64, #Tensor1> to memref<?xf64>
-    %vr = vector.transfer_read %v[%c0], %d0: memref<?xf64>, vector<8xf64>
-    vector.print %vr : vector<8xf64>
+    //
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 8
+    // CHECK-NEXT: dim = ( 10, 8 )
+    // CHECK-NEXT: lvl = ( 10, 8 )
+    // CHECK-NEXT: pos[0] : ( 0, 6
+    // CHECK-NEXT: crd[0] : ( 0, 1, 4, 5, 6, 9
+    // CHECK-NEXT: pos[1] : ( 0, 2, 3, 4, 5, 7, 8
+    // CHECK-NEXT: crd[1] : ( 0, 7, 2, 2, 3, 4, 6, 7
+    // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, 6, 7, 8
+    // CHECK-NEXT: ----
+    //
+    sparse_tensor.print %ts : tensor<10x8xf64, #Tensor1>
 
     // Release the resources.
     bufferization.dealloc_tensor %ts : tensor<10x8xf64, #Tensor1>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_1d_nwc_wcf.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_1d_nwc_wcf.mlir
index 16a67a1458369..612e62bd34d28 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_1d_nwc_wcf.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_1d_nwc_wcf.mlir
@@ -10,7 +10,7 @@
 // DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
 // DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
 // DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
-// DEFINE: %{run_opts} = -e entry -entry-point-result=void
+// DEFINE: %{run_opts} = -e main -entry-point-result=void
 // DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
 // DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
 //
@@ -79,7 +79,7 @@ func.func @conv_1d_nwc_wcf_CDC(%arg0: tensor<?x?x?xf32, #CDC>, %arg1: tensor<?x?
   return %ret : tensor<?x?x?xf32, #CDC>
 }
 
-func.func @entry() {
+func.func @main() {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c3 = arith.constant 3 : index
@@ -111,23 +111,35 @@ func.func @entry() {
       : tensor<?x?x?xf32>, vector<3x6x1xf32>
   vector.print %dense_v : vector<3x6x1xf32>
 
-  //      CHECK: ( ( ( 12 ), ( 28 ), ( 28 ), ( 28 ), ( 12 ), ( 12 ) ),
-  // CHECK-SAME:   ( ( 12 ), ( 12 ), ( 12 ), ( 12 ), ( 12 ), ( 12 ) ),
-  // CHECK-SAME:   ( ( 12 ), ( 12 ), ( 12 ), ( 12 ), ( 12 ), ( 12 ) ) )
-  %1 = sparse_tensor.convert %CCC_ret
-    : tensor<?x?x?xf32, #CCC> to tensor<?x?x?xf32>
-  %v1 = vector.transfer_read %1[%c0, %c0, %c0], %zero
-      : tensor<?x?x?xf32>, vector<3x6x1xf32>
-  vector.print %v1 : vector<3x6x1xf32>
-
-  //      CHECK: ( ( ( 12 ), ( 28 ), ( 28 ), ( 28 ), ( 12 ), ( 12 ) ),
-  // CHECK-SAME:   ( ( 12 ), ( 12 ), ( 12 ), ( 12 ), ( 12 ), ( 12 ) ),
-  // CHECK-SAME:   ( ( 12 ), ( 12 ), ( 12 ), ( 12 ), ( 12 ), ( 12 ) ) )
-  %2 = sparse_tensor.convert %CDC_ret
-    : tensor<?x?x?xf32, #CDC> to tensor<?x?x?xf32>
-  %v2 = vector.transfer_read %2[%c0, %c0, %c0], %zero
-      : tensor<?x?x?xf32>, vector<3x6x1xf32>
-  vector.print %v2 : vector<3x6x1xf32>
+  //
+  // CHECK:      ---- Sparse Tensor ----
+  // CHECK-NEXT: nse = 18
+  // CHECK-NEXT: dim = ( 3, 6, 1 )
+  // CHECK-NEXT: lvl = ( 3, 6, 1 )
+  // CHECK-NEXT: pos[0] : ( 0, 3
+  // CHECK-NEXT: crd[0] : ( 0, 1, 2
+  // CHECK-NEXT: pos[1] : ( 0, 6, 12, 18
+  // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
+  // CHECK-NEXT: pos[2] : ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18
+  // CHECK-NEXT: crd[2] : ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+  // CHECK-NEXT: values : ( 12, 28, 28, 28, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12
+  // CHECK-NEXT: ----
+  //
+  sparse_tensor.print %CCC_ret : tensor<?x?x?xf32, #CCC>
+
+  //
+  // CHECK:      ---- Sparse Tensor ----
+  // CHECK-NEXT: nse = 18
+  // CHECK-NEXT: dim = ( 3, 6, 1 )
+  // CHECK-NEXT: lvl = ( 3, 6, 1 )
+  // CHECK-NEXT: pos[0] : ( 0, 3
+  // CHECK-NEXT: crd[0] : ( 0, 1, 2
+  // CHECK-NEXT: pos[2] : ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18
+  // CHECK-NEXT: crd[2] : ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+  // CHECK-NEXT: values : ( 12, 28, 28, 28, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12
+  // CHECK-NEXT: ----
+  //
+  sparse_tensor.print %CDC_ret : tensor<?x?x?xf32, #CDC>
 
   // Free the resources
   bufferization.dealloc_tensor %in1D_nwc : tensor<?x?x?xf32>
@@ -140,8 +152,5 @@ func.func @entry() {
   bufferization.dealloc_tensor %CCC_ret : tensor<?x?x?xf32, #CCC>
   bufferization.dealloc_tensor %CDC_ret : tensor<?x?x?xf32, #CDC>
 
-  bufferization.dealloc_tensor %1 : tensor<?x?x?xf32>
-  bufferization.dealloc_tensor %2 : tensor<?x?x?xf32>
-
   return
 }
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d.mlir
index 41071ea700fb6..f8fb8fdf53e35 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d.mlir
@@ -10,7 +10,7 @@
 // DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
 // DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
 // DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
-// DEFINE: %{run_opts} = -e entry -entry-point-result=void
+// DEFINE: %{run_opts} = -e main -entry-point-result=void
 // DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
 // DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
 //
@@ -113,7 +113,7 @@ module {
     return %0 : tensor<6x6xi32, #CSC>
   }
 
-  func.func @entry() {
+  func.func @main() {
     %c0 = arith.constant 0 : index
     %i0 = arith.constant 0 : i32
 
@@ -181,82 +181,81 @@ module {
     vector.print %v : vector<6x6xi32>
 
     //
-    // Should be the same as dense output
-    // CHECK:    ( ( 0, 0, -1, -6, -1, 6 ),
-    // CHECK-SAME: ( -1, 0, 1, 0, 1, 0 ),
-    // CHECK-SAME: ( 0, -1, 1, 0, 0, 0 ),
-    // CHECK-SAME: ( -1, 0, 0, 0, 0, 0 ),
-    // CHECK-SAME: ( 0, 0, 3, 6, -3, -6 ),
-    // CHECK-SAME: ( 2, -1, 3, 0, -3, 0 ) )
+    // Should be the same as dense output.
     //
-    %sparse_ret = sparse_tensor.convert %1
-      : tensor<6x6xi32, #DCSR> to tensor<6x6xi32>
-    %v1 = vector.transfer_read %sparse_ret[%c0, %c0], %i0
-      : tensor<6x6xi32>, vector<6x6xi32>
-    vector.print %v1 : vector<6x6xi32>
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 36
+    // CHECK-NEXT: dim = ( 6, 6 )
+    // CHECK-NEXT: lvl = ( 6, 6 )
+    // CHECK-NEXT: pos[0] : ( 0, 6
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5
+    // CHECK-NEXT: pos[1] : ( 0, 6, 12, 18, 24, 30, 36
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
+    // CHECK-NEXT: values : ( 0, 0, -1, -6, -1, 6, -1, 0, 1, 0, 1, 0, 0, -1, 1, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 3, 6, -3, -6, 2, -1, 3, 0, -3, 0
+    // CHECK-NEXT: ----
+    //
+    sparse_tensor.print %1 : tensor<6x6xi32, #DCSR>
 
     //
-    // Should be the same as dense output
-    // CHECK:    ( ( 0, 0, -1, -6, -1, 6 ),
-    // CHECK-SAME: ( -1, 0, 1, 0, 1, 0 ),
-    // CHECK-SAME: ( 0, -1, 1, 0, 0, 0 ),
-    // CHECK-SAME: ( -1, 0, 0, 0, 0, 0 ),
-    // CHECK-SAME: ( 0, 0, 3, 6, -3, -6 ),
-    // CHECK-SAME: ( 2, -1, 3, 0, -3, 0 ) )
+    // Should be the same as dense output.
     //
-    %all_sparse_DCSR = sparse_tensor.convert %2
-      : tensor<6x6xi32, #DCSR> to tensor<6x6xi32>
-    %v2 = vector.transfer_read %all_sparse_DCSR[%c0, %c0], %i0
-      : tensor<6x6xi32>, vector<6x6xi32>
-    vector.print %v2 : vector<6x6xi32>
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 36
+    // CHECK-NEXT: dim = ( 6, 6 )
+    // CHECK-NEXT: lvl = ( 6, 6 )
+    // CHECK-NEXT: pos[0] : ( 0, 6
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5
+    // CHECK-NEXT: pos[1] : ( 0, 6, 12, 18, 24, 30, 36
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
+    // CHECK-NEXT: values : ( 0, 0, -1, -6, -1, 6, -1, 0, 1, 0, 1, 0, 0, -1, 1, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 3, 6, -3, -6, 2, -1, 3, 0, -3, 0
+    // CHECK-NEXT: ----
+    //
+    sparse_tensor.print %2 : tensor<6x6xi32, #DCSR>
 
     //
-    // Should be the same as dense output
-    // CHECK:    ( ( 0, 0, -1, -6, -1, 6 ),
-    // CHECK-SAME: ( -1, 0, 1, 0, 1, 0 ),
-    // CHECK-SAME: ( 0, -1, 1, 0, 0, 0 ),
-    // CHECK-SAME: ( -1, 0, 0, 0, 0, 0 ),
-    // CHECK-SAME: ( 0, 0, 3, 6, -3, -6 ),
-    // CHECK-SAME: ( 2, -1, 3, 0, -3, 0 ) )
+    // Should be the same as dense output.
     //
-    %all_sparse_CD = sparse_tensor.convert %4
-      : tensor<6x6xi32, #CDR> to tensor<6x6xi32>
-    %v4 = vector.transfer_read %all_sparse_CD[%c0, %c0], %i0
-      : tensor<6x6xi32>, vector<6x6xi32>
-    vector.print %v4 : vector<6x6xi32>
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 36
+    // CHECK-NEXT: dim = ( 6, 6 )
+    // CHECK-NEXT: lvl = ( 6, 6 )
+    // CHECK-NEXT: pos[1] : ( 0, 6, 12, 18, 24, 30, 36
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
+    // CHECK-NEXT: values : ( 0, 0, -1, -6, -1, 6, -1, 0, 1, 0, 1, 0, 0, -1, 1, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 3, 6, -3, -6, 2, -1, 3, 0, -3, 0
+    // CHECK-NEXT: ----
+    //
+    sparse_tensor.print %3 : tensor<6x6xi32, #CSR>
 
     //
-    // Should be the same as dense output
-    // CHECK:    ( ( 0, 0, -1, -6, -1, 6 ),
-    // CHECK-SAME: ( -1, 0, 1, 0, 1, 0 ),
-    // CHECK-SAME: ( 0, -1, 1, 0, 0, 0 ),
-    // CHECK-SAME: ( -1, 0, 0, 0, 0, 0 ),
-    // CHECK-SAME: ( 0, 0, 3, 6, -3, -6 ),
-    // CHECK-SAME: ( 2, -1, 3, 0, -3, 0 ) )
+    // Should be the same as dense output.
     //
-    %all_sparse_CSR = sparse_tensor.convert %3
-      : tensor<6x6xi32, #CSR> to tensor<6x6xi32>
-    %v3 = vector.transfer_read %all_sparse_CSR[%c0, %c0], %i0
-      : tensor<6x6xi32>, vector<6x6xi32>
-    vector.print %v3 : vector<6x6xi32>
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 36
+    // CHECK-NEXT: dim = ( 6, 6 )
+    // CHECK-NEXT: lvl = ( 6, 6 )
+    // CHECK-NEXT: pos[0] : ( 0, 6
+    // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5
+    // CHECK-NEXT: values : ( 0, 0, -1, -6, -1, 6, -1, 0, 1, 0, 1, 0, 0, -1, 1, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 3, 6, -3, -6, 2, -1, 3, 0, -3, 0
+    // CHECK-NEXT: ----
+    //
+    sparse_tensor.print %4 : tensor<6x6xi32, #CDR>
 
     //
-    // Should be the same as dense output
-    // CHECK:    ( ( 0, 0, -1, -6, -1, 6 ),
-    // CHECK-SAME: ( -1, 0, 1, 0, 1, 0 ),
-    // CHECK-SAME: ( 0, -1, 1, 0, 0, 0 ),
-    // CHECK-SAME: ( -1, 0, 0, 0, 0, 0 ),
-    // CHECK-SAME: ( 0, 0, 3, 6, -3, -6 ),
-    // CHECK-SAME: ( 2, -1, 3, 0, -3, 0 ) )
+    // Should be the same as dense output.
     //
-    %all_sparse_CSC = sparse_tensor.convert %5
-      : tensor<6x6xi32, #CSC> to tensor<6x6xi32>
-    %v5 = vector.transfer_read %all_sparse_CSC[%c0, %c0], %i0
-      : tensor<6x6xi32>, vector<6x6xi32>
-    vector.print %v5 : vector<6x6xi32>
+    // CHECK:      ---- Sparse Tensor ----
+    // CHECK-NEXT: nse = 36
+    // CHECK-NEXT: dim = ( 6, 6 )
+    // CHECK-NEXT: lvl = ( 6, 6 )
+    // CHECK-NEXT: pos[1] : ( 0, 6, 12, 18, 24, 30, 36
+    // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
+    // CHECK-NEXT: values : ( 0, -1, 0, -1, 0, 2, 0, 0, -1, 0, 0, -1, -1, 1, 1, 0, 3, 3, -6, 0, 0, 0, 6, 0, -1, 1, 0, 0, -3, -3, 6, 0, 0, 0, -6, 0
+    // CHECK-NEXT: ----
+    //
+    sparse_tensor.print %5 : tensor<6x6xi32, #CSC>
 
     //
-    // Should be the same as dense output
+    // Should be the same as dense output.
     // CHECK:    ( ( 0, 0, -1, -6, -1, 6 ),
     // CHECK-SAME: ( -1, 0, 1, 0, 1, 0 ),
     // CHECK-SAME: ( 0, -1, 1, 0, 0, 0 ),
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_55.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_55.mlir
index a7d7d1c5ed3c3..00805d198013d 100755
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_55.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_55.mlir
@@ -10,7 +10,7 @@
 // DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
 // DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
 // DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
-// DEFINE: %{run_opts} = -e entry -entry-point-result=void
+// DEFINE: %{run_opts} = -e main -entry-point-result=void
 // DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
 // DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
 //
@@ -68,7 +68,7 @@ module {
     return %0 : tensor<6x6xi32>
   }
 
-  func.func @entry() {
+  func.func @main() {
     %c0 = arith.constant 0 : index
     %i0 = arith.constant 0 : i32
 
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nchw_fchw.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nchw_fchw.mlir
index 95ce4f1bf48d5..9150e97e72481 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nchw_fchw.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nchw_fchw.mlir
@@ -10,7 +10,7 @@
 // DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
 // DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
 // DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
-// DEFINE: %{run_opts} = -e entry -entry-point-result=void
+// DEFINE: %{run_opts} = -e main -entry-point-result=void
 // DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
 // DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
 //
@@ -82,7 +82,7 @@ func.func @conv_2d_nchw_fchw_CCCC_CCCC(%arg0: tensor<?x?x?x?xf32, #CCCC>, %arg1:
   return %ret : tensor<?x?x?x?xf32>
 }
 
-func.func @entry() {
+func.func @main() {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c3 = arith.constant 3 : index
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nhwc_hwcf.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nhwc_hwcf.mlir
index d0fbce7146fe5..d04311e59bafa 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nhwc_hwcf.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nhwc_hwcf.mlir
@@ -10,7 +10,7 @@
 // DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
 // DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
 // DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
-// DEFINE: %{run_opts} = -e entry -entry-point-result=void
+// DEFINE: %{run_opts} = -e main -entry-point-result=void
 // DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
 // DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
 //
@@ -93,7 +93,7 @@ func.func @conv_2d_nhwc_hwcf_DCCD(%arg0: tensor<?x?x?x?xf32, #DCCD>, %arg1: tens
   return %ret : tensor<?x?x?x?xf32, #DCCD>
 }
 
-func.func @entry() {
+func.func @main() {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c3 = arith.constant 3 : index
@@ -142,77 +142,93 @@ func.func @entry() {
       : tensor<?x?x?x?xf32>, vector<3x6x6x1xf32>
   vector.print %dense_v : vector<3x6x6x1xf32>
 
-  // CHECK:     ( ( ( ( 108 ), ( 124 ), ( 124 ), ( 124 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ),
-  // CHECK-SAME:  ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ),
-  // CHECK-SAME:  ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ) )
-  %1 = sparse_tensor.convert %CCCC_ret
-    : tensor<?x?x?x?xf32, #CCCC> to tensor<?x?x?x?xf32>
-  %v1 = vector.transfer_read %1[%c0, %c0, %c0, %c0], %zero
-      : tensor<?x?x?x?xf32>, vector<3x6x6x1xf32>
-  vector.print %v1 : vector<3x6x6x1xf32>
-
-  // CHECK:     ( ( ( ( 108 ), ( 124 ), ( 124 ), ( 124 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ),
-  // CHECK-SAME:  ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ),
-  // CHECK-SAME:  ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ) )
-  %2 = sparse_tensor.convert %CDCD_ret
-    : tensor<?x?x?x?xf32, #CDCD> to tensor<?x?x?x?xf32>
-  %v2 = vector.transfer_read %2[%c0, %c0, %c0, %c0], %zero
-      : tensor<?x?x?x?xf32>, vector<3x6x6x1xf32>
-  vector.print %v2 : vector<3x6x6x1xf32>
-
-  // CHECK:     ( ( ( ( 108 ), ( 124 ), ( 124 ), ( 124 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ),
-  // CHECK-SAME:  ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ),
-  // CHECK-SAME:  ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ) )
-  %3 = sparse_tensor.convert %DCCD_ret
-    : tensor<?x?x?x?xf32, #DCCD> to tensor<?x?x?x?xf32>
-  %v3 = vector.transfer_read %3[%c0, %c0, %c0, %c0], %zero
-      : tensor<?x?x?x?xf32>, vector<3x6x6x1xf32>
-  vector.print %v3 : vector<3x6x6x1xf32>
+  //
+  // CHECK:      ---- Sparse Tensor ----
+  // CHECK-NEXT: nse = 108
+  // CHECK-NEXT: dim = ( 3, 6, 6, 1 )
+  // CHECK-NEXT: lvl = ( 3, 6, 6, 1 )
+  // CHECK-NEXT: pos[0] : ( 0, 3
+  // CHECK-NEXT: crd[0] : ( 0, 1, 2
+  // CHECK-NEXT: pos[1] : ( 0, 6, 12, 18
+  // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
+  // CHECK-NEXT: pos[2] : ( 0, 6, 12, 18, 24, 30, 36, 42, 48, 54, 60, 66, 72, 78, 84, 90, 96, 102, 108
+  // CHECK-NEXT: crd[2] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0,
+  // CHECK-SAME:            1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
+  // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2,
+  // CHECK-SAME:            3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
+  // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5
+  // CHECK-NEXT: pos[3] : ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
+  // CHECK-SAME:            21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
+  // CHECK-SAME:            40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58,
+  // CHECK-SAME:            59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
+  // CHECK-SAME:            78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96,
+  // CHECK-SAME:            97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108
+  // CHECK-NEXT: crd[3] : ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0
+  // CHECK-NEXT: values : ( 108, 124, 124, 124, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108
+  // CHECK-NEXT: ----
+  //
+  sparse_tensor.print %CCCC_ret : tensor<?x?x?x?xf32, #CCCC>
+
+  //
+  // CHECK:      ---- Sparse Tensor ----
+  // CHECK-NEXT: nse = 108
+  // CHECK-NEXT: dim = ( 3, 6, 6, 1 )
+  // CHECK-NEXT: lvl = ( 3, 6, 6, 1 )
+  // CHECK-NEXT: pos[0] : ( 0, 3
+  // CHECK-NEXT: crd[0] : ( 0, 1, 2
+  // CHECK-NEXT: pos[2] : ( 0, 6, 12, 18, 24, 30, 36, 42, 48, 54, 60, 66, 72, 78, 84, 90, 96, 102, 108
+  // CHECK-NEXT: crd[2] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0,
+  // CHECK-SAME:            1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
+  // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2,
+  // CHECK-SAME:            3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
+  // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5
+  // CHECK-NEXT: values : ( 108, 124, 124, 124, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108
+  // CHECK-NEXT: ----
+  //
+  sparse_tensor.print %CDCD_ret : tensor<?x?x?x?xf32, #CDCD>
+
+  //
+  // CHECK:      ---- Sparse Tensor ----
+  // CHECK-NEXT: nse = 108
+  // CHECK-NEXT: dim = ( 3, 6, 6, 1 )
+  // CHECK-NEXT: lvl = ( 3, 6, 6, 1 )
+  // CHECK-NEXT: pos[1] : ( 0, 6, 12, 18
+  // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
+  // CHECK-NEXT: pos[2] : ( 0, 6, 12, 18, 24, 30, 36, 42, 48, 54, 60, 66, 72, 78, 84, 90, 96, 102, 108
+  // CHECK-NEXT: crd[2] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0,
+  // CHECK-SAME:            1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
+  // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2,
+  // CHECK-SAME:            3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
+  // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5
+  // CHECK-NEXT: values : ( 108, 124, 124, 124, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108
+  // CHECK-NEXT: ----
+  //
+  sparse_tensor.print %DCCD_ret : tensor<?x?x?x?xf32, #DCCD>
 
   // Free the resources
   bufferization.dealloc_tensor %in2D_nhwc : tensor<?x?x?x?xf32>
@@ -227,9 +243,5 @@ func.func @entry() {
   bufferization.dealloc_tensor %CDCD_ret : tensor<?x?x?x?xf32, #CDCD>
   bufferization.dealloc_tensor %DCCD_ret : tensor<?x?x?x?xf32, #DCCD>
 
-  bufferization.dealloc_tensor %1 : tensor<?x?x?x?xf32>
-  bufferization.dealloc_tensor %2 : tensor<?x?x?x?xf32>
-  bufferization.dealloc_tensor %3 : tensor<?x?x?x?xf32>
-
   return
 }
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d.mlir
index f0a26dc46b056..5e2d1707a2495 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d.mlir
@@ -10,7 +10,7 @@
 // DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
 // DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
 // DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
-// DEFINE: %{run_opts} = -e entry -entry-point-result=void
+// DEFINE: %{run_opts} = -e main -entry-point-result=void
 // DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
 // DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
 //
@@ -96,7 +96,7 @@ func.func @conv_3d_DDC(%arg0: tensor<?x?x?xf32, #DDC>, %arg1: tensor<?x?x?xf32>)
   return %ret : tensor<?x?x?xf32, #DDC>
 }
 
-func.func @entry() {
+func.func @main() {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c3 = arith.constant 3 : index
@@ -166,173 +166,180 @@ func.func @entry() {
       : tensor<?x?x?xf32>, vector<6x6x6xf32>
   vector.print %dense_v : vector<6x6x6xf32>
 
-  // CHECK-NEXT:( ( ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 124, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 124, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 124, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ),
-  // CHECK-SAME:  ( ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ),
-  // CHECK-SAME:  ( ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ),
-  // CHECK-SAME:  ( ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ),
-  // CHECK-SAME:  ( ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ),
-  // CHECK-SAME:  ( ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ) )
-  %1 = sparse_tensor.convert %CCC_ret
-    : tensor<?x?x?xf32, #CCC> to tensor<?x?x?xf32>
-  %v1 = vector.transfer_read %1[%c0, %c0, %c0], %zero
-      : tensor<?x?x?xf32>, vector<6x6x6xf32>
-  vector.print %v1 : vector<6x6x6xf32>
+  //
+  // CHECK:      ---- Sparse Tensor ----
+  // CHECK-NEXT: nse = 216
+  // CHECK-NEXT: dim = ( 6, 6, 6 )
+  // CHECK-NEXT: lvl = ( 6, 6, 6 )
+  // CHECK-NEXT: pos[0] : ( 0, 6
+  // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5
+  // CHECK-NEXT: pos[1] : ( 0, 6, 12, 18, 24, 30, 36
+  // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
+  // CHECK-SAME:            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
+  // CHECK-NEXT: pos[2] : ( 0, 6, 12, 18, 24, 30, 36, 42, 48, 54, 60, 66, 72, 78,
+  // CHECK-SAME:            84, 90, 96, 102, 108, 114, 120, 126, 132, 138, 144, 150,
+  // CHECK-SAME:            156, 162, 168, 174, 180, 186, 192, 198, 204, 210, 216
+  // CHECK-NEXT: crd[2] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0,
+  // CHECK-SAME:            1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
+  // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2,
+  // CHECK-SAME:            3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
+  // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4,
+  // CHECK-SAME:            5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
+  // CHECK-SAME:            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0,
+  // CHECK-SAME:            1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
+  // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2,
+  // CHECK-SAME:            3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
+  // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4,
+  // CHECK-SAME:            5, 0, 1, 2, 3, 4, 5
+  // CHECK-NEXT: values : ( 108, 108, 108, 108, 108, 108, 124, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            124, 108, 108, 108, 108, 108, 124, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108
+  // CHECK-NEXT: ----
+  //
+  sparse_tensor.print %CCC_ret : tensor<?x?x?xf32, #CCC>
 
-  // CHECK-NEXT:( ( ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 124, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 124, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 124, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ),
-  // CHECK-SAME:  ( ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ),
-  // CHECK-SAME:  ( ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ),
-  // CHECK-SAME:  ( ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ),
-  // CHECK-SAME:  ( ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ),
-  // CHECK-SAME:  ( ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ) )
-  %2 = sparse_tensor.convert %CCC_ret
-    : tensor<?x?x?xf32, #CCC> to tensor<?x?x?xf32>
-  %v2 = vector.transfer_read %2[%c0, %c0, %c0], %zero
-      : tensor<?x?x?xf32>, vector<6x6x6xf32>
-  vector.print %v2 : vector<6x6x6xf32>
+  //
+  // CHECK:      ---- Sparse Tensor ----
+  // CHECK-NEXT: nse = 216
+  // CHECK-NEXT: dim = ( 6, 6, 6 )
+  // CHECK-NEXT: lvl = ( 6, 6, 6 )
+  // CHECK-NEXT: pos[0] : ( 0, 6
+  // CHECK-NEXT: crd[0] : ( 0, 1, 2, 3, 4, 5
+  // CHECK-NEXT: pos[2] : ( 0, 6, 12, 18, 24, 30, 36, 42, 48, 54, 60, 66, 72, 78, 84,
+  // CHECK-SAME:            90, 96, 102, 108, 114, 120, 126, 132, 138, 144, 150, 156,
+  // CHECK-SAME:            162, 168, 174, 180, 186, 192, 198, 204, 210, 216
+  // CHECK-NEXT: crd[2] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
+  // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
+  // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
+  // CHECK-SAME:            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
+  // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
+  // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
+  // CHECK-SAME:            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
+  // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
+  // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
+  // CHECK-SAME:            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
+  // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
+  // CHECK-NEXT: values : ( 108, 108, 108, 108, 108, 108, 124, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            124, 108, 108, 108, 108, 108, 124, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108
+  // CHECK-NEXT: ----
+  //
+  sparse_tensor.print %CDC_ret : tensor<?x?x?xf32, #CDC>
 
-  // CHECK-NEXT:( ( ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 124, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 124, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 124, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ),
-  // CHECK-SAME:  ( ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ),
-  // CHECK-SAME:  ( ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ),
-  // CHECK-SAME:  ( ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ),
-  // CHECK-SAME:  ( ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ),
-  // CHECK-SAME:  ( ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ) )
-  %3 = sparse_tensor.convert %DDC_ret
-    : tensor<?x?x?xf32, #DDC> to tensor<?x?x?xf32>
-  %v3 = vector.transfer_read %3[%c0, %c0, %c0], %zero
-      : tensor<?x?x?xf32>, vector<6x6x6xf32>
-  vector.print %v2 : vector<6x6x6xf32>
+  //
+  // CHECK:      ---- Sparse Tensor ----
+  // CHECK-NEXT: nse = 216
+  // CHECK-NEXT: dim = ( 6, 6, 6 )
+  // CHECK-NEXT: lvl = ( 6, 6, 6 )
+  // CHECK-NEXT: pos[2] : ( 0, 6, 12, 18, 24, 30, 36, 42, 48, 54, 60, 66, 72, 78, 84, 90,
+  // CHECK-SAME:            96, 102, 108, 114, 120, 126, 132, 138, 144, 150, 156, 162,
+  // CHECK-SAME:            168, 174, 180, 186, 192, 198, 204, 210, 216
+  // CHECK-NEXT: crd[2] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
+  // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
+  // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
+  // CHECK-SAME:            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
+  // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
+  // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
+  // CHECK-SAME:            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
+  // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
+  // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
+  // CHECK-SAME:            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
+  // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
+  // CHECK-NEXT: values : ( 108, 108, 108, 108, 108, 108, 124, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            124, 108, 108, 108, 108, 108, 124, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108
+  // CHECK-NEXT: ----
+  //
+  sparse_tensor.print %DDC_ret : tensor<?x?x?xf32, #DDC>
 
-  // CHECK-NEXT:( ( ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 124, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 124, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 124, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ),
-  // CHECK-SAME:  ( ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ),
-  // CHECK-SAME:  ( ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ),
-  // CHECK-SAME:  ( ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ),
-  // CHECK-SAME:  ( ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ),
-  // CHECK-SAME:  ( ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ),
-  // CHECK-SAME:    ( 108, 108, 108, 108, 108, 108 ) ) )
-  %4 = sparse_tensor.convert %DCC_ret
-    : tensor<?x?x?xf32, #DCC> to tensor<?x?x?xf32>
-  %v4 = vector.transfer_read %3[%c0, %c0, %c0], %zero
-      : tensor<?x?x?xf32>, vector<6x6x6xf32>
-  vector.print %v2 : vector<6x6x6xf32>
+  //
+  // CHECK:      ---- Sparse Tensor ----
+  // CHECK-NEXT: nse = 216
+  // CHECK-NEXT: dim = ( 6, 6, 6 )
+  // CHECK-NEXT: lvl = ( 6, 6, 6 )
+  // CHECK-NEXT: pos[1] : ( 0, 6, 12, 18, 24, 30, 36
+  // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
+  // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
+  // CHECK-NEXT: pos[2] : ( 0, 6, 12, 18, 24, 30, 36, 42, 48, 54, 60, 66, 72, 78, 84, 90,
+  // CHECK-SAME:            96, 102, 108, 114, 120, 126, 132, 138, 144, 150, 156, 162,
+  // CHECK-SAME:            168, 174, 180, 186, 192, 198, 204, 210, 216
+  // CHECK-NEXT: crd[2] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
+  // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
+  // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
+  // CHECK-SAME:            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
+  // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
+  // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
+  // CHECK-SAME:            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
+  // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
+  // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
+  // CHECK-SAME:            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
+  // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
+  // CHECK-NEXT: values : ( 108, 108, 108, 108, 108, 108, 124, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            124, 108, 108, 108, 108, 108, 124, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108
+  // CHECK-NEXT: ----
+  //
+  sparse_tensor.print %DCC_ret : tensor<?x?x?xf32, #DCC>
 
   // Free the resources
   bufferization.dealloc_tensor %in3D : tensor<?x?x?xf32>
@@ -349,10 +356,5 @@ func.func @entry() {
   bufferization.dealloc_tensor %DDC_ret : tensor<?x?x?xf32, #DDC>
   bufferization.dealloc_tensor %DCC_ret : tensor<?x?x?xf32, #DCC>
 
-  bufferization.dealloc_tensor %1 : tensor<?x?x?xf32>
-  bufferization.dealloc_tensor %2 : tensor<?x?x?xf32>
-  bufferization.dealloc_tensor %3 : tensor<?x?x?xf32>
-  bufferization.dealloc_tensor %4 : tensor<?x?x?xf32>
-
   return
 }
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d_ndhwc_dhwcf.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d_ndhwc_dhwcf.mlir
index 346a143692897..f68e429a3c821 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d_ndhwc_dhwcf.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d_ndhwc_dhwcf.mlir
@@ -10,7 +10,7 @@
 // DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
 // DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
 // DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
-// DEFINE: %{run_opts} = -e entry -entry-point-result=void
+// DEFINE: %{run_opts} = -e main -entry-point-result=void
 // DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
 // DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
 //
@@ -83,7 +83,7 @@ func.func @conv_3d_ndhwc_dhwcf_CDCDC(%arg0: tensor<?x?x?x?x?xf32, #CDCDC>,
   return %ret : tensor<?x?x?x?x?xf32, #CDCDC>
 }
 
-func.func @entry() {
+func.func @main() {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %c3 = arith.constant 3 : index
@@ -150,93 +150,134 @@ func.func @entry() {
       : (tensor<?x?x?x?x?xf32, #CCCCC>,
          tensor<?x?x?x?x?xf32>) -> (tensor<?x?x?x?x?xf32, #CCCCC>)
 
-  // CHECK-NEXT:( ( ( ( ( 108 ), ( 124 ), ( 124 ), ( 124 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ),
-  // CHECK-SAME:    ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ),
-  // CHECK-SAME:    ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ),
-  // CHECK-SAME:    ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ),
-  // CHECK-SAME:    ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ),
-  // CHECK-SAME:    ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ) ) )
-  %1 = sparse_tensor.convert %CCCCC_ret
-    : tensor<?x?x?x?x?xf32, #CCCCC> to tensor<?x?x?x?x?xf32>
-  %v1 = vector.transfer_read %1[%c0, %c0, %c0, %c0, %c0], %zero
-      : tensor<?x?x?x?x?xf32>, vector<1x6x6x6x1xf32>
-  vector.print %v1 : vector<1x6x6x6x1xf32>
+  //
+  // CHECK:      ---- Sparse Tensor ----
+  // CHECK-NEXT: nse = 216
+  // CHECK-NEXT: dim = ( 1, 6, 6, 6, 1 )
+  // CHECK-NEXT: lvl = ( 1, 6, 6, 6, 1 )
+  // CHECK-NEXT: pos[0] : ( 0, 1
+  // CHECK-NEXT: crd[0] : ( 0
+  // CHECK-NEXT: pos[1] : ( 0, 6
+  // CHECK-NEXT: crd[1] : ( 0, 1, 2, 3, 4, 5
+  // CHECK-NEXT: pos[2] : ( 0, 6, 12, 18, 24, 30, 36
+  // CHECK-NEXT: crd[2] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
+  // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
+  // CHECK-NEXT: pos[3] : ( 0, 6, 12, 18, 24, 30, 36, 42, 48, 54, 60, 66, 72, 78, 84, 90, 96,
+  // CHECK-SAME:            102, 108, 114, 120, 126, 132, 138, 144, 150, 156, 162, 168, 174,
+  // CHECK-SAME:            180, 186, 192, 198, 204, 210, 216
+  // CHECK-NEXT: crd[3] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
+  // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
+  // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
+  // CHECK-SAME:            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
+  // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
+  // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
+  // CHECK-SAME:            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
+  // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1,
+  // CHECK-SAME:            2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
+  // CHECK-SAME:            0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
+  // CHECK-NEXT: pos[4] : ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
+  // CHECK-SAME:            19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
+  // CHECK-SAME:            36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,
+  // CHECK-SAME:            53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
+  // CHECK-SAME:            70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86,
+  // CHECK-SAME:            87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102,
+  // CHECK-SAME:            103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
+  // CHECK-SAME:            117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
+  // CHECK-SAME:            131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144,
+  // CHECK-SAME:            145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158,
+  // CHECK-SAME:            159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172,
+  // CHECK-SAME:            173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186,
+  // CHECK-SAME:            187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200,
+  // CHECK-SAME:            201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214,
+  // CHECK-SAME:            215, 216
+  // CHECK-NEXT: crd[4] : ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0
+  // CHECK-NEXT: values : ( 108, 124, 124, 124, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108
+  // CHECK-NEXT: ----
+  //
+  sparse_tensor.print %CCCCC_ret : tensor<?x?x?x?x?xf32, #CCCCC>
 
   %CDCDC_ret = call @conv_3d_ndhwc_dhwcf_CDCDC(%in3D_ndhwc_CDCDC, %filter3D_ndhwc)
       : (tensor<?x?x?x?x?xf32, #CDCDC>,
          tensor<?x?x?x?x?xf32>) -> (tensor<?x?x?x?x?xf32, #CDCDC>)
 
-  // CHECK-NEXT:( ( ( ( ( 108 ), ( 124 ), ( 124 ), ( 124 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ),
-  // CHECK-SAME:    ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ),
-  // CHECK-SAME:    ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ),
-  // CHECK-SAME:    ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ),
-  // CHECK-SAME:    ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ),
-  // CHECK-SAME:    ( ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
-  // CHECK-SAME:      ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ) ) ) )
-  %2 = sparse_tensor.convert %CDCDC_ret
-    : tensor<?x?x?x?x?xf32, #CDCDC> to tensor<?x?x?x?x?xf32>
-  %v2 = vector.transfer_read %dense_ret[%c0, %c0, %c0, %c0, %c0], %zero
-      : tensor<?x?x?x?x?xf32>, vector<1x6x6x6x1xf32>
-  vector.print %v2 : vector<1x6x6x6x1xf32>
+  //
+  // CHECK:      ---- Sparse Tensor ----
+  // CHECK-NEXT: nse = 216
+  // CHECK-NEXT: dim = ( 1, 6, 6, 6, 1 )
+  // CHECK-NEXT: lvl = ( 1, 6, 6, 6, 1 )
+  // CHECK-NEXT: pos[0] : ( 0, 1
+  // CHECK-NEXT: crd[0] : ( 0
+  // CHECK-NEXT: pos[2] : ( 0, 6, 12, 18, 24, 30, 36
+  // CHECK-NEXT: crd[2] : ( 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
+  // CHECK-SAME:            4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5
+  // CHECK-NEXT: pos[4] : ( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
+  // CHECK-SAME:            19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
+  // CHECK-SAME:            36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,
+  // CHECK-SAME:            53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
+  // CHECK-SAME:            70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86,
+  // CHECK-SAME:            87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102,
+  // CHECK-SAME:            103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
+  // CHECK-SAME:            117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
+  // CHECK-SAME:            131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144,
+  // CHECK-SAME:            145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158,
+  // CHECK-SAME:            159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172,
+  // CHECK-SAME:            173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186,
+  // CHECK-SAME:            187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200,
+  // CHECK-SAME:            201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214,
+  // CHECK-SAME:            215, 216
+  // CHECK-NEXT: crd[4] : ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  // CHECK-SAME:            0, 0, 0, 0, 0, 0, 0, 0, 0
+  // CHECK-NEXT: values : ( 108, 124, 124, 124, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108, 108,
+  // CHECK-SAME:            108, 108, 108, 108, 108, 108
+  // CHECK-NEXT: ----
+  //
+  sparse_tensor.print %CDCDC_ret : tensor<?x?x?x?x?xf32, #CDCDC>
 
   // Free the resources
   bufferization.dealloc_tensor %in3D_ndhwc : tensor<?x?x?x?x?xf32>
@@ -249,8 +290,5 @@ func.func @entry() {
   bufferization.dealloc_tensor %CCCCC_ret : tensor<?x?x?x?x?xf32, #CCCCC>
   bufferization.dealloc_tensor %CDCDC_ret : tensor<?x?x?x?x?xf32, #CDCDC>
 
-  bufferization.dealloc_tensor %1 : tensor<?x?x?x?x?xf32>
-  bufferization.dealloc_tensor %2 : tensor<?x?x?x?x?xf32>
-
   return
 }
diff --git a/mlir/test/Transforms/inlining-recursive-self.mlir b/mlir/test/Transforms/inlining-recursive-self.mlir
new file mode 100644
index 0000000000000..5cc922db8e978
--- /dev/null
+++ b/mlir/test/Transforms/inlining-recursive-self.mlir
@@ -0,0 +1,22 @@
+// RUN: mlir-opt %s -inline='default-pipeline=''' | FileCheck %s
+// RUN: mlir-opt %s --mlir-disable-threading -inline='default-pipeline=''' | FileCheck %s
+
+// CHECK-LABEL: func.func @b0
+func.func @b0() {
+  // CHECK:         call @b0
+  // CHECK-NEXT:    call @b1
+  // CHECK-NEXT:    call @b0
+  // CHECK-NEXT:    call @b1
+  // CHECK-NEXT:    call @b0
+  func.call @b0() : () -> ()
+  func.call @b1() : () -> ()
+  func.call @b0() : () -> ()
+  func.call @b1() : () -> ()
+  func.call @b0() : () -> ()
+  return
+}
+func.func @b1() {
+  func.call @b1() : () -> ()
+  func.call @b1() : () -> ()
+  return
+}
diff --git a/runtimes/CMakeLists.txt b/runtimes/CMakeLists.txt
index 29b47b862c219..6f24fbcccec95 100644
--- a/runtimes/CMakeLists.txt
+++ b/runtimes/CMakeLists.txt
@@ -6,6 +6,8 @@ set(LLVM_COMMON_CMAKE_UTILS "${CMAKE_CURRENT_SOURCE_DIR}/../cmake")
 include(${LLVM_COMMON_CMAKE_UTILS}/Modules/CMakePolicy.cmake
   NO_POLICY_SCOPE)
 
+include(${LLVM_COMMON_CMAKE_UTILS}/Modules/LLVMVersion.cmake)
+
 project(Runtimes C CXX ASM)
 
 list(INSERT CMAKE_MODULE_PATH 0
diff --git a/utils/bazel/configure.bzl b/utils/bazel/configure.bzl
index 88a576548e169..d6cd6aa0813e4 100644
--- a/utils/bazel/configure.bzl
+++ b/utils/bazel/configure.bzl
@@ -149,6 +149,14 @@ def _llvm_configure_impl(repository_ctx):
         llvm_cmake,
     )
 
+    # Grab version info and merge it with the other vars
+    version = _extract_cmake_settings(
+        repository_ctx,
+        "cmake/Modules/LLVMVersion.cmake",
+    )
+    version = {k: v for k, v in version.items() if v != None}
+    vars.update(version)
+
     _write_dict_to_file(
         repository_ctx,
         filepath = "vars.bzl",